001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.fail;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.Collection;
026import java.util.EnumSet;
027import java.util.HashMap;
028import java.util.List;
029import java.util.Map;
030import java.util.Optional;
031import java.util.concurrent.CountDownLatch;
032import java.util.concurrent.ExecutorService;
033import java.util.concurrent.ScheduledThreadPoolExecutor;
034import org.apache.hadoop.conf.Configuration;
035import org.apache.hadoop.fs.FileStatus;
036import org.apache.hadoop.fs.FileSystem;
037import org.apache.hadoop.fs.Path;
038import org.apache.hadoop.hbase.ClusterMetrics;
039import org.apache.hadoop.hbase.ClusterMetrics.Option;
040import org.apache.hadoop.hbase.HBaseTestingUtility;
041import org.apache.hadoop.hbase.HColumnDescriptor;
042import org.apache.hadoop.hbase.HConstants;
043import org.apache.hadoop.hbase.HRegionLocation;
044import org.apache.hadoop.hbase.HTableDescriptor;
045import org.apache.hadoop.hbase.ServerName;
046import org.apache.hadoop.hbase.TableName;
047import org.apache.hadoop.hbase.client.Admin;
048import org.apache.hadoop.hbase.client.ClusterConnection;
049import org.apache.hadoop.hbase.client.Connection;
050import org.apache.hadoop.hbase.client.ConnectionFactory;
051import org.apache.hadoop.hbase.client.Delete;
052import org.apache.hadoop.hbase.client.Put;
053import org.apache.hadoop.hbase.client.RegionInfo;
054import org.apache.hadoop.hbase.client.RegionLocator;
055import org.apache.hadoop.hbase.client.Scan;
056import org.apache.hadoop.hbase.client.Table;
057import org.apache.hadoop.hbase.client.TableDescriptor;
058import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
060import org.apache.hadoop.hbase.coprocessor.MasterObserver;
061import org.apache.hadoop.hbase.coprocessor.ObserverContext;
062import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
063import org.apache.hadoop.hbase.master.assignment.RegionStates;
064import org.apache.hadoop.hbase.mob.MobFileName;
065import org.apache.hadoop.hbase.mob.MobUtils;
066import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
067import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
068import org.junit.rules.TestName;
069import org.slf4j.Logger;
070import org.slf4j.LoggerFactory;
071
072import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
073import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
074
/**
 * Base class for the tests that exercise HBaseFsck's ability to detect reasons for inconsistent
 * tables. The actual tests are in: TestHBaseFsckTwoRS, TestHBaseFsckOneRS, TestHBaseFsckMOB and
 * TestHBaseFsckReplicas.
 */
079public class BaseTestHBaseFsck {
080  static final int POOL_SIZE = 7;
081  protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class);
082  protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
083  protected final static Configuration conf = TEST_UTIL.getConfiguration();
084  protected final static String FAM_STR = "fam";
085  protected final static byte[] FAM = Bytes.toBytes(FAM_STR);
086  protected final static int REGION_ONLINE_TIMEOUT = 800;
087  protected static AssignmentManager assignmentManager;
088  protected static RegionStates regionStates;
089  protected static ExecutorService tableExecutorService;
090  protected static ScheduledThreadPoolExecutor hbfsckExecutorService;
091  protected static ClusterConnection connection;
092  protected static Admin admin;
093
094  // for the instance, reset every test run
095  protected Table tbl;
096  protected final static byte[][] SPLITS =
097    new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") };
098  // one row per region.
099  protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"),
100    Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"),
101    Bytes.toBytes("C0"), Bytes.toBytes("C5") };
102
103  /**
104   * Debugging method to dump the contents of meta.
105   */
106  protected void dumpMeta(TableName tableName) throws IOException {
107    List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
108    for (byte[] row : metaRows) {
109      LOG.info(Bytes.toString(row));
110    }
111  }
112
113  /**
114   * This method is used to undeploy a region -- close it and attempt to remove its state from the
115   * Master.
116   */
117  protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri)
118    throws IOException, InterruptedException {
119    try {
120      HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri);
121      if (!hri.isMetaRegion()) {
122        admin.offline(hri.getRegionName());
123      }
124    } catch (IOException ioe) {
125      LOG.warn(
126        "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()),
127        ioe);
128    }
129  }
130
131  /**
132   * Delete a region from assignments, meta, or completely from hdfs.
133   * @param unassign if true unassign region if assigned
134   * @param metaRow  if true remove region's row from META
135   * @param hdfs     if true remove region's dir in HDFS
136   */
137  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
138    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs)
139    throws IOException, InterruptedException {
140    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
141      RegionInfo.DEFAULT_REPLICA_ID);
142  }
143
144  /**
145   * Delete a region from assignments, meta, or completely from hdfs.
146   * @param unassign       if true unassign region if assigned
147   * @param metaRow        if true remove region's row from META
148   * @param hdfs           if true remove region's dir in HDFS
149   * @param regionInfoOnly if true remove a region dir's .regioninfo file
150   * @param replicaId      replica id
151   */
152  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
153    byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly,
154    int replicaId) throws IOException, InterruptedException {
155    LOG.info("** Before delete:");
156    dumpMeta(htd.getTableName());
157
158    List<HRegionLocation> locations;
159    try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
160      locations = rl.getAllRegionLocations();
161    }
162
163    for (HRegionLocation location : locations) {
164      RegionInfo hri = location.getRegionInfo();
165      ServerName hsa = location.getServerName();
166      if (
167        Bytes.compareTo(hri.getStartKey(), startKey) == 0
168          && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId
169      ) {
170
171        LOG.info("RegionName: " + hri.getRegionNameAsString());
172        byte[] deleteRow = hri.getRegionName();
173
174        if (unassign) {
175          LOG.info("Undeploying region " + hri + " from server " + hsa);
176          undeployRegion(connection, hsa, hri);
177        }
178
179        if (regionInfoOnly) {
180          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
181          Path rootDir = CommonFSUtils.getRootDir(conf);
182          FileSystem fs = rootDir.getFileSystem(conf);
183          Path p =
184            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
185          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
186          fs.delete(hriPath, true);
187        }
188
189        if (hdfs) {
190          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
191          Path rootDir = CommonFSUtils.getRootDir(conf);
192          FileSystem fs = rootDir.getFileSystem(conf);
193          Path p =
194            new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName());
195          HBaseFsck.debugLsr(conf, p);
196          boolean success = fs.delete(p, true);
197          LOG.info("Deleted " + p + " sucessfully? " + success);
198          HBaseFsck.debugLsr(conf, p);
199        }
200
201        if (metaRow) {
202          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
203            Delete delete = new Delete(deleteRow);
204            meta.delete(delete);
205          }
206        }
207      }
208      LOG.info(hri.toString() + hsa.toString());
209    }
210
211    TEST_UTIL.getMetaTableRows(htd.getTableName());
212    LOG.info("*** After delete:");
213    dumpMeta(htd.getTableName());
214  }
215
216  /**
217   * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed
218   * after test
219   */
220  void setupTable(TableName tablename) throws Exception {
221    setupTableWithRegionReplica(tablename, 1);
222  }
223
224  /**
225   * Setup a clean table with a certain region_replica count It will set tbl which needs to be
226   * closed after test
227   */
228  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
229    HTableDescriptor desc = new HTableDescriptor(tablename);
230    desc.setRegionReplication(replicaCount);
231    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
232    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
233    createTable(TEST_UTIL, desc, SPLITS);
234
235    tbl = connection.getTable(tablename, tableExecutorService);
236    List<Put> puts = new ArrayList<>(ROWKEYS.length);
237    for (byte[] row : ROWKEYS) {
238      Put p = new Put(row);
239      p.addColumn(FAM, Bytes.toBytes("val"), row);
240      puts.add(p);
241    }
242    tbl.put(puts);
243  }
244
245  /**
246   * Setup a clean table with a mob-enabled column.
247   * @param tablename The name of a table to be created.
248   */
249  void setupMobTable(TableName tablename) throws Exception {
250    HTableDescriptor desc = new HTableDescriptor(tablename);
251    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
252    hcd.setMobEnabled(true);
253    hcd.setMobThreshold(0);
254    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
255    createTable(TEST_UTIL, desc, SPLITS);
256
257    tbl = connection.getTable(tablename, tableExecutorService);
258    List<Put> puts = new ArrayList<>(ROWKEYS.length);
259    for (byte[] row : ROWKEYS) {
260      Put p = new Put(row);
261      p.addColumn(FAM, Bytes.toBytes("val"), row);
262      puts.add(p);
263    }
264    tbl.put(puts);
265  }
266
267  /**
268   * Counts the number of rows to verify data loss or non-dataloss.
269   */
270  int countRows() throws IOException {
271    return TEST_UTIL.countRows(tbl);
272  }
273
274  /**
275   * Counts the number of rows to verify data loss or non-dataloss.
276   */
277  int countRows(byte[] start, byte[] end) throws IOException {
278    return TEST_UTIL.countRows(tbl, new Scan(start, end));
279  }
280
281  /**
282   * delete table in preparation for next test
283   */
284  void cleanupTable(TableName tablename) throws Exception {
285    if (tbl != null) {
286      tbl.close();
287      tbl = null;
288    }
289
290    ((ClusterConnection) connection).clearRegionLocationCache();
291    deleteTable(TEST_UTIL, tablename);
292  }
293
294  /**
295   * Get region info from local cluster.
296   */
297  Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException {
298    ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS));
299    Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet();
300    Map<ServerName, List<String>> mm = new HashMap<>();
301    for (ServerName hsi : regionServers) {
302      AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
303
304      // list all online regions from this region server
305      List<RegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
306      List<String> regionNames = new ArrayList<>(regions.size());
307      for (RegionInfo hri : regions) {
308        regionNames.add(hri.getRegionNameAsString());
309      }
310      mm.put(hsi, regionNames);
311    }
312    return mm;
313  }
314
315  /**
316   * Returns the HSI a region info is on.
317   */
318  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) {
319    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
320      if (e.getValue().contains(hri.getRegionNameAsString())) {
321        return e.getKey();
322      }
323    }
324    return null;
325  }
326
327  public void deleteTableDir(TableName table) throws IOException {
328    Path rootDir = CommonFSUtils.getRootDir(conf);
329    FileSystem fs = rootDir.getFileSystem(conf);
330    Path p = CommonFSUtils.getTableDir(rootDir, table);
331    HBaseFsck.debugLsr(conf, p);
332    boolean success = fs.delete(p, true);
333    LOG.info("Deleted " + p + " sucessfully? " + success);
334  }
335
336  /**
337   * We don't have an easy way to verify that a flush completed, so we loop until we find a
338   * legitimate hfile and return it.
339   * @return Path of a flushed hfile.
340   */
341  Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
342    Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table);
343    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
344    Path famDir = new Path(regionDir, FAM_STR);
345
346    // keep doing this until we get a legit hfile
347    while (true) {
348      FileStatus[] hfFss = fs.listStatus(famDir);
349      if (hfFss.length == 0) {
350        continue;
351      }
352      for (FileStatus hfs : hfFss) {
353        if (!hfs.isDirectory()) {
354          return hfs.getPath();
355        }
356      }
357    }
358  }
359
360  /**
361   * Gets flushed mob files.
362   * @param fs    The current file system.
363   * @param table The current table name.
364   * @return Path of a flushed hfile.
365   */
366  Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
367    Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR);
368
369    // keep doing this until we get a legit hfile
370    while (true) {
371      FileStatus[] hfFss = fs.listStatus(famDir);
372      if (hfFss.length == 0) {
373        continue;
374      }
375      for (FileStatus hfs : hfFss) {
376        if (!hfs.isDirectory()) {
377          return hfs.getPath();
378        }
379      }
380    }
381  }
382
383  /**
384   * Creates a new mob file name by the old one.
385   * @param oldFileName The old mob file name.
386   * @return The new mob file name.
387   */
388  String createMobFileName(String oldFileName) {
389    MobFileName mobFileName = MobFileName.create(oldFileName);
390    String startKey = mobFileName.getStartKey();
391    String date = mobFileName.getDate();
392    return MobFileName
393      .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef")
394      .getFileName();
395  }
396
397  /**
398   * Test that use this should have a timeout, because this method could potentially wait forever.
399   */
400  protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail,
401    int quar, int missing) throws Exception {
402    try {
403      setupTable(table);
404      assertEquals(ROWKEYS.length, countRows());
405      admin.flush(table); // flush is async.
406
407      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
408      admin.disableTable(table);
409
410      String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
411        table.getNameAsString() };
412      HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
413
414      HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
415      assertEquals(hfcc.getHFilesChecked(), check);
416      assertEquals(hfcc.getCorrupted().size(), corrupt);
417      assertEquals(hfcc.getFailures().size(), fail);
418      assertEquals(hfcc.getQuarantined().size(), quar);
419      assertEquals(hfcc.getMissing().size(), missing);
420
421      // its been fixed, verify that we can enable
422      admin.enableTableAsync(table);
423      while (!admin.isTableEnabled(table)) {
424        try {
425          Thread.sleep(250);
426        } catch (InterruptedException e) {
427          e.printStackTrace();
428          fail("Interrupted when trying to enable table " + table);
429        }
430      }
431    } finally {
432      cleanupTable(table);
433    }
434  }
435
436  static class MockErrorReporter implements HbckErrorReporter {
437    static int calledCount = 0;
438
439    @Override
440    public void clear() {
441      calledCount++;
442    }
443
444    @Override
445    public void report(String message) {
446      calledCount++;
447    }
448
449    @Override
450    public void reportError(String message) {
451      calledCount++;
452    }
453
454    @Override
455    public void reportError(ERROR_CODE errorCode, String message) {
456      calledCount++;
457    }
458
459    @Override
460    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) {
461      calledCount++;
462    }
463
464    @Override
465    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
466      HbckRegionInfo info) {
467      calledCount++;
468    }
469
470    @Override
471    public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table,
472      HbckRegionInfo info1, HbckRegionInfo info2) {
473      calledCount++;
474    }
475
476    @Override
477    public int summarize() {
478      return ++calledCount;
479    }
480
481    @Override
482    public void detail(String details) {
483      calledCount++;
484    }
485
486    @Override
487    public ArrayList<ERROR_CODE> getErrorList() {
488      calledCount++;
489      return new ArrayList<>();
490    }
491
492    @Override
493    public void progress() {
494      calledCount++;
495    }
496
497    @Override
498    public void print(String message) {
499      calledCount++;
500    }
501
502    @Override
503    public void resetErrors() {
504      calledCount++;
505    }
506
507    @Override
508    public boolean tableHasErrors(HbckTableInfo table) {
509      calledCount++;
510      return false;
511    }
512  }
513
514  protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
515    boolean regionInfoOnly) throws IOException, InterruptedException {
516    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
517      .getRegionLocation(HConstants.EMPTY_START_ROW);
518    ServerName hsa = metaLocation.getServerName();
519    RegionInfo hri = metaLocation.getRegionInfo();
520    if (unassign) {
521      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
522      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
523        undeployRegion(unmanagedConnection, hsa, hri);
524      }
525    }
526
527    if (regionInfoOnly) {
528      LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
529      Path rootDir = CommonFSUtils.getRootDir(conf);
530      FileSystem fs = rootDir.getFileSystem(conf);
531      Path p =
532        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
533      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
534      fs.delete(hriPath, true);
535    }
536
537    if (hdfs) {
538      LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
539      Path rootDir = CommonFSUtils.getRootDir(conf);
540      FileSystem fs = rootDir.getFileSystem(conf);
541      Path p =
542        new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName());
543      HBaseFsck.debugLsr(conf, p);
544      boolean success = fs.delete(p, true);
545      LOG.info("Deleted " + p + " sucessfully? " + success);
546      HBaseFsck.debugLsr(conf, p);
547    }
548  }
549
  // JUnit rule that makes the name of the currently running test method available to the test.
  @org.junit.Rule
  public TestName name = new TestName();
552
553  public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver {
554    volatile CountDownLatch tableCreationLatch = null;
555    volatile CountDownLatch tableDeletionLatch = null;
556
557    @Override
558    public Optional<MasterObserver> getMasterObserver() {
559      return Optional.of(this);
560    }
561
562    @Override
563    public void postCompletedCreateTableAction(
564      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc,
565      final RegionInfo[] regions) throws IOException {
566      // the AccessController test, some times calls only and directly the
567      // postCompletedCreateTableAction()
568      if (tableCreationLatch != null) {
569        tableCreationLatch.countDown();
570      }
571    }
572
573    @Override
574    public void postCompletedDeleteTableAction(
575      final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName)
576      throws IOException {
577      // the AccessController test, some times calls only and directly the
578      // postCompletedDeleteTableAction()
579      if (tableDeletionLatch != null) {
580        tableDeletionLatch.countDown();
581      }
582    }
583  }
584
585  public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
586    byte[][] splitKeys) throws Exception {
587    // NOTE: We need a latch because admin is not sync,
588    // so the postOp coprocessor method may be called after the admin operation returned.
589    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
590      .findCoprocessor(MasterSyncCoprocessor.class);
591    coproc.tableCreationLatch = new CountDownLatch(1);
592    if (splitKeys != null) {
593      admin.createTable(htd, splitKeys);
594    } else {
595      admin.createTable(htd);
596    }
597    coproc.tableCreationLatch.await();
598    coproc.tableCreationLatch = null;
599    testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
600  }
601
602  public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
603    throws Exception {
604    // NOTE: We need a latch because admin is not sync,
605    // so the postOp coprocessor method may be called after the admin operation returned.
606    MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost()
607      .findCoprocessor(MasterSyncCoprocessor.class);
608    coproc.tableDeletionLatch = new CountDownLatch(1);
609    try {
610      admin.disableTable(tableName);
611    } catch (Exception e) {
612      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
613    }
614    admin.deleteTable(tableName);
615    coproc.tableDeletionLatch.await();
616    coproc.tableDeletionLatch = null;
617  }
618}