001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.util; 019 020import static org.junit.Assert.assertEquals; 021import static org.junit.Assert.fail; 022 023import java.io.IOException; 024import java.util.ArrayList; 025import java.util.Collection; 026import java.util.EnumSet; 027import java.util.HashMap; 028import java.util.List; 029import java.util.Map; 030import java.util.Optional; 031import java.util.concurrent.CountDownLatch; 032import java.util.concurrent.ExecutorService; 033import java.util.concurrent.ScheduledThreadPoolExecutor; 034import org.apache.hadoop.conf.Configuration; 035import org.apache.hadoop.fs.FileStatus; 036import org.apache.hadoop.fs.FileSystem; 037import org.apache.hadoop.fs.Path; 038import org.apache.hadoop.hbase.ClusterMetrics; 039import org.apache.hadoop.hbase.ClusterMetrics.Option; 040import org.apache.hadoop.hbase.HBaseTestingUtility; 041import org.apache.hadoop.hbase.HColumnDescriptor; 042import org.apache.hadoop.hbase.HConstants; 043import org.apache.hadoop.hbase.HRegionLocation; 044import org.apache.hadoop.hbase.HTableDescriptor; 045import org.apache.hadoop.hbase.ServerName; 046import org.apache.hadoop.hbase.TableName; 047import org.apache.hadoop.hbase.client.Admin; 048import org.apache.hadoop.hbase.client.ClusterConnection; 049import org.apache.hadoop.hbase.client.Connection; 050import org.apache.hadoop.hbase.client.ConnectionFactory; 051import org.apache.hadoop.hbase.client.Delete; 052import org.apache.hadoop.hbase.client.Put; 053import org.apache.hadoop.hbase.client.RegionInfo; 054import org.apache.hadoop.hbase.client.RegionLocator; 055import org.apache.hadoop.hbase.client.Scan; 056import org.apache.hadoop.hbase.client.Table; 057import org.apache.hadoop.hbase.client.TableDescriptor; 058import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor; 059import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment; 060import org.apache.hadoop.hbase.coprocessor.MasterObserver; 061import org.apache.hadoop.hbase.coprocessor.ObserverContext; 062import org.apache.hadoop.hbase.master.assignment.AssignmentManager; 063import org.apache.hadoop.hbase.master.assignment.RegionStates; 064import org.apache.hadoop.hbase.mob.MobFileName; 065import org.apache.hadoop.hbase.mob.MobUtils; 066import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; 067import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker; 068import org.junit.rules.TestName; 069import org.slf4j.Logger; 070import org.slf4j.LoggerFactory; 071 072import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 073import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; 074 075/** 076 * This is the base class for HBaseFsck's ability to detect reasons for inconsistent tables. Actual 077 * tests are in : TestHBaseFsckTwoRS TestHBaseFsckOneRS TestHBaseFsckMOB TestHBaseFsckReplicas 078 */ 079public class BaseTestHBaseFsck { 080 static final int POOL_SIZE = 7; 081 protected static final Logger LOG = LoggerFactory.getLogger(BaseTestHBaseFsck.class); 082 protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); 083 protected final static Configuration conf = TEST_UTIL.getConfiguration(); 084 protected final static String FAM_STR = "fam"; 085 protected final static byte[] FAM = Bytes.toBytes(FAM_STR); 086 protected final static int REGION_ONLINE_TIMEOUT = 800; 087 protected static AssignmentManager assignmentManager; 088 protected static RegionStates regionStates; 089 protected static ExecutorService tableExecutorService; 090 protected static ScheduledThreadPoolExecutor hbfsckExecutorService; 091 protected static ClusterConnection connection; 092 protected static Admin admin; 093 094 // for the instance, reset every test run 095 protected Table tbl; 096 protected final static byte[][] SPLITS = 097 new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") }; 098 // one row per region. 099 protected final static byte[][] ROWKEYS = new byte[][] { Bytes.toBytes("00"), Bytes.toBytes("50"), 100 Bytes.toBytes("A0"), Bytes.toBytes("A5"), Bytes.toBytes("B0"), Bytes.toBytes("B5"), 101 Bytes.toBytes("C0"), Bytes.toBytes("C5") }; 102 103 /** 104 * Debugging method to dump the contents of meta. 105 */ 106 protected void dumpMeta(TableName tableName) throws IOException { 107 List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName); 108 for (byte[] row : metaRows) { 109 LOG.info(Bytes.toString(row)); 110 } 111 } 112 113 /** 114 * This method is used to undeploy a region -- close it and attempt to remove its state from the 115 * Master. 116 */ 117 protected void undeployRegion(Connection conn, ServerName sn, RegionInfo hri) 118 throws IOException, InterruptedException { 119 try { 120 HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri); 121 if (!hri.isMetaRegion()) { 122 admin.offline(hri.getRegionName()); 123 } 124 } catch (IOException ioe) { 125 LOG.warn( 126 "Got exception when attempting to offline region " + Bytes.toString(hri.getRegionName()), 127 ioe); 128 } 129 } 130 131 /** 132 * Delete a region from assignments, meta, or completely from hdfs. 133 * @param unassign if true unassign region if assigned 134 * @param metaRow if true remove region's row from META 135 * @param hdfs if true remove region's dir in HDFS 136 */ 137 protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey, 138 byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs) 139 throws IOException, InterruptedException { 140 deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false, 141 RegionInfo.DEFAULT_REPLICA_ID); 142 } 143 144 /** 145 * Delete a region from assignments, meta, or completely from hdfs. 146 * @param unassign if true unassign region if assigned 147 * @param metaRow if true remove region's row from META 148 * @param hdfs if true remove region's dir in HDFS 149 * @param regionInfoOnly if true remove a region dir's .regioninfo file 150 * @param replicaId replica id 151 */ 152 protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey, 153 byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly, 154 int replicaId) throws IOException, InterruptedException { 155 LOG.info("** Before delete:"); 156 dumpMeta(htd.getTableName()); 157 158 List<HRegionLocation> locations; 159 try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) { 160 locations = rl.getAllRegionLocations(); 161 } 162 163 for (HRegionLocation location : locations) { 164 RegionInfo hri = location.getRegionInfo(); 165 ServerName hsa = location.getServerName(); 166 if ( 167 Bytes.compareTo(hri.getStartKey(), startKey) == 0 168 && Bytes.compareTo(hri.getEndKey(), endKey) == 0 && hri.getReplicaId() == replicaId 169 ) { 170 171 LOG.info("RegionName: " + hri.getRegionNameAsString()); 172 byte[] deleteRow = hri.getRegionName(); 173 174 if (unassign) { 175 LOG.info("Undeploying region " + hri + " from server " + hsa); 176 undeployRegion(connection, hsa, hri); 177 } 178 179 if (regionInfoOnly) { 180 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString()); 181 Path rootDir = CommonFSUtils.getRootDir(conf); 182 FileSystem fs = rootDir.getFileSystem(conf); 183 Path p = 184 new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName()); 185 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE); 186 fs.delete(hriPath, true); 187 } 188 189 if (hdfs) { 190 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString()); 191 Path rootDir = CommonFSUtils.getRootDir(conf); 192 FileSystem fs = rootDir.getFileSystem(conf); 193 Path p = 194 new Path(CommonFSUtils.getTableDir(rootDir, htd.getTableName()), hri.getEncodedName()); 195 HBaseFsck.debugLsr(conf, p); 196 boolean success = fs.delete(p, true); 197 LOG.info("Deleted " + p + " sucessfully? " + success); 198 HBaseFsck.debugLsr(conf, p); 199 } 200 201 if (metaRow) { 202 try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) { 203 Delete delete = new Delete(deleteRow); 204 meta.delete(delete); 205 } 206 } 207 } 208 LOG.info(hri.toString() + hsa.toString()); 209 } 210 211 TEST_UTIL.getMetaTableRows(htd.getTableName()); 212 LOG.info("*** After delete:"); 213 dumpMeta(htd.getTableName()); 214 } 215 216 /** 217 * Setup a clean table before we start mucking with it. It will set tbl which needs to be closed 218 * after test 219 */ 220 void setupTable(TableName tablename) throws Exception { 221 setupTableWithRegionReplica(tablename, 1); 222 } 223 224 /** 225 * Setup a clean table with a certain region_replica count It will set tbl which needs to be 226 * closed after test 227 */ 228 void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception { 229 HTableDescriptor desc = new HTableDescriptor(tablename); 230 desc.setRegionReplication(replicaCount); 231 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM)); 232 desc.addFamily(hcd); // If a table has no CF's it doesn't get checked 233 createTable(TEST_UTIL, desc, SPLITS); 234 235 tbl = connection.getTable(tablename, tableExecutorService); 236 List<Put> puts = new ArrayList<>(ROWKEYS.length); 237 for (byte[] row : ROWKEYS) { 238 Put p = new Put(row); 239 p.addColumn(FAM, Bytes.toBytes("val"), row); 240 puts.add(p); 241 } 242 tbl.put(puts); 243 } 244 245 /** 246 * Setup a clean table with a mob-enabled column. 247 * @param tablename The name of a table to be created. 248 */ 249 void setupMobTable(TableName tablename) throws Exception { 250 HTableDescriptor desc = new HTableDescriptor(tablename); 251 HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM)); 252 hcd.setMobEnabled(true); 253 hcd.setMobThreshold(0); 254 desc.addFamily(hcd); // If a table has no CF's it doesn't get checked 255 createTable(TEST_UTIL, desc, SPLITS); 256 257 tbl = connection.getTable(tablename, tableExecutorService); 258 List<Put> puts = new ArrayList<>(ROWKEYS.length); 259 for (byte[] row : ROWKEYS) { 260 Put p = new Put(row); 261 p.addColumn(FAM, Bytes.toBytes("val"), row); 262 puts.add(p); 263 } 264 tbl.put(puts); 265 } 266 267 /** 268 * Counts the number of rows to verify data loss or non-dataloss. 269 */ 270 int countRows() throws IOException { 271 return TEST_UTIL.countRows(tbl); 272 } 273 274 /** 275 * Counts the number of rows to verify data loss or non-dataloss. 276 */ 277 int countRows(byte[] start, byte[] end) throws IOException { 278 return TEST_UTIL.countRows(tbl, new Scan(start, end)); 279 } 280 281 /** 282 * delete table in preparation for next test 283 */ 284 void cleanupTable(TableName tablename) throws Exception { 285 if (tbl != null) { 286 tbl.close(); 287 tbl = null; 288 } 289 290 ((ClusterConnection) connection).clearRegionLocationCache(); 291 deleteTable(TEST_UTIL, tablename); 292 } 293 294 /** 295 * Get region info from local cluster. 296 */ 297 Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException { 298 ClusterMetrics status = admin.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS)); 299 Collection<ServerName> regionServers = status.getLiveServerMetrics().keySet(); 300 Map<ServerName, List<String>> mm = new HashMap<>(); 301 for (ServerName hsi : regionServers) { 302 AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi); 303 304 // list all online regions from this region server 305 List<RegionInfo> regions = ProtobufUtil.getOnlineRegions(server); 306 List<String> regionNames = new ArrayList<>(regions.size()); 307 for (RegionInfo hri : regions) { 308 regionNames.add(hri.getRegionNameAsString()); 309 } 310 mm.put(hsi, regionNames); 311 } 312 return mm; 313 } 314 315 /** 316 * Returns the HSI a region info is on. 317 */ 318 ServerName findDeployedHSI(Map<ServerName, List<String>> mm, RegionInfo hri) { 319 for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) { 320 if (e.getValue().contains(hri.getRegionNameAsString())) { 321 return e.getKey(); 322 } 323 } 324 return null; 325 } 326 327 public void deleteTableDir(TableName table) throws IOException { 328 Path rootDir = CommonFSUtils.getRootDir(conf); 329 FileSystem fs = rootDir.getFileSystem(conf); 330 Path p = CommonFSUtils.getTableDir(rootDir, table); 331 HBaseFsck.debugLsr(conf, p); 332 boolean success = fs.delete(p, true); 333 LOG.info("Deleted " + p + " sucessfully? " + success); 334 } 335 336 /** 337 * We don't have an easy way to verify that a flush completed, so we loop until we find a 338 * legitimate hfile and return it. 339 * @return Path of a flushed hfile. 340 */ 341 Path getFlushedHFile(FileSystem fs, TableName table) throws IOException { 342 Path tableDir = CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), table); 343 Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0); 344 Path famDir = new Path(regionDir, FAM_STR); 345 346 // keep doing this until we get a legit hfile 347 while (true) { 348 FileStatus[] hfFss = fs.listStatus(famDir); 349 if (hfFss.length == 0) { 350 continue; 351 } 352 for (FileStatus hfs : hfFss) { 353 if (!hfs.isDirectory()) { 354 return hfs.getPath(); 355 } 356 } 357 } 358 } 359 360 /** 361 * Gets flushed mob files. 362 * @param fs The current file system. 363 * @param table The current table name. 364 * @return Path of a flushed hfile. 365 */ 366 Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException { 367 Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR); 368 369 // keep doing this until we get a legit hfile 370 while (true) { 371 FileStatus[] hfFss = fs.listStatus(famDir); 372 if (hfFss.length == 0) { 373 continue; 374 } 375 for (FileStatus hfs : hfFss) { 376 if (!hfs.isDirectory()) { 377 return hfs.getPath(); 378 } 379 } 380 } 381 } 382 383 /** 384 * Creates a new mob file name by the old one. 385 * @param oldFileName The old mob file name. 386 * @return The new mob file name. 387 */ 388 String createMobFileName(String oldFileName) { 389 MobFileName mobFileName = MobFileName.create(oldFileName); 390 String startKey = mobFileName.getStartKey(); 391 String date = mobFileName.getDate(); 392 return MobFileName 393 .create(startKey, date, TEST_UTIL.getRandomUUID().toString().replaceAll("-", ""), "abcdef") 394 .getFileName(); 395 } 396 397 /** 398 * Test that use this should have a timeout, because this method could potentially wait forever. 399 */ 400 protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt, int fail, 401 int quar, int missing) throws Exception { 402 try { 403 setupTable(table); 404 assertEquals(ROWKEYS.length, countRows()); 405 admin.flush(table); // flush is async. 406 407 // Mess it up by leaving a hole in the assignment, meta, and hdfs data 408 admin.disableTable(table); 409 410 String[] args = { "-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission", 411 table.getNameAsString() }; 412 HBaseFsck res = hbck.exec(hbfsckExecutorService, args); 413 414 HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker(); 415 assertEquals(hfcc.getHFilesChecked(), check); 416 assertEquals(hfcc.getCorrupted().size(), corrupt); 417 assertEquals(hfcc.getFailures().size(), fail); 418 assertEquals(hfcc.getQuarantined().size(), quar); 419 assertEquals(hfcc.getMissing().size(), missing); 420 421 // its been fixed, verify that we can enable 422 admin.enableTableAsync(table); 423 while (!admin.isTableEnabled(table)) { 424 try { 425 Thread.sleep(250); 426 } catch (InterruptedException e) { 427 e.printStackTrace(); 428 fail("Interrupted when trying to enable table " + table); 429 } 430 } 431 } finally { 432 cleanupTable(table); 433 } 434 } 435 436 static class MockErrorReporter implements HbckErrorReporter { 437 static int calledCount = 0; 438 439 @Override 440 public void clear() { 441 calledCount++; 442 } 443 444 @Override 445 public void report(String message) { 446 calledCount++; 447 } 448 449 @Override 450 public void reportError(String message) { 451 calledCount++; 452 } 453 454 @Override 455 public void reportError(ERROR_CODE errorCode, String message) { 456 calledCount++; 457 } 458 459 @Override 460 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table) { 461 calledCount++; 462 } 463 464 @Override 465 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table, 466 HbckRegionInfo info) { 467 calledCount++; 468 } 469 470 @Override 471 public void reportError(ERROR_CODE errorCode, String message, HbckTableInfo table, 472 HbckRegionInfo info1, HbckRegionInfo info2) { 473 calledCount++; 474 } 475 476 @Override 477 public int summarize() { 478 return ++calledCount; 479 } 480 481 @Override 482 public void detail(String details) { 483 calledCount++; 484 } 485 486 @Override 487 public ArrayList<ERROR_CODE> getErrorList() { 488 calledCount++; 489 return new ArrayList<>(); 490 } 491 492 @Override 493 public void progress() { 494 calledCount++; 495 } 496 497 @Override 498 public void print(String message) { 499 calledCount++; 500 } 501 502 @Override 503 public void resetErrors() { 504 calledCount++; 505 } 506 507 @Override 508 public boolean tableHasErrors(HbckTableInfo table) { 509 calledCount++; 510 return false; 511 } 512 } 513 514 protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs, 515 boolean regionInfoOnly) throws IOException, InterruptedException { 516 HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME) 517 .getRegionLocation(HConstants.EMPTY_START_ROW); 518 ServerName hsa = metaLocation.getServerName(); 519 RegionInfo hri = metaLocation.getRegionInfo(); 520 if (unassign) { 521 LOG.info("Undeploying meta region " + hri + " from server " + hsa); 522 try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) { 523 undeployRegion(unmanagedConnection, hsa, hri); 524 } 525 } 526 527 if (regionInfoOnly) { 528 LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString()); 529 Path rootDir = CommonFSUtils.getRootDir(conf); 530 FileSystem fs = rootDir.getFileSystem(conf); 531 Path p = 532 new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName()); 533 Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE); 534 fs.delete(hriPath, true); 535 } 536 537 if (hdfs) { 538 LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString()); 539 Path rootDir = CommonFSUtils.getRootDir(conf); 540 FileSystem fs = rootDir.getFileSystem(conf); 541 Path p = 542 new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(), hri.getEncodedName()); 543 HBaseFsck.debugLsr(conf, p); 544 boolean success = fs.delete(p, true); 545 LOG.info("Deleted " + p + " sucessfully? " + success); 546 HBaseFsck.debugLsr(conf, p); 547 } 548 } 549 550 @org.junit.Rule 551 public TestName name = new TestName(); 552 553 public static class MasterSyncCoprocessor implements MasterCoprocessor, MasterObserver { 554 volatile CountDownLatch tableCreationLatch = null; 555 volatile CountDownLatch tableDeletionLatch = null; 556 557 @Override 558 public Optional<MasterObserver> getMasterObserver() { 559 return Optional.of(this); 560 } 561 562 @Override 563 public void postCompletedCreateTableAction( 564 final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableDescriptor desc, 565 final RegionInfo[] regions) throws IOException { 566 // the AccessController test, some times calls only and directly the 567 // postCompletedCreateTableAction() 568 if (tableCreationLatch != null) { 569 tableCreationLatch.countDown(); 570 } 571 } 572 573 @Override 574 public void postCompletedDeleteTableAction( 575 final ObserverContext<MasterCoprocessorEnvironment> ctx, final TableName tableName) 576 throws IOException { 577 // the AccessController test, some times calls only and directly the 578 // postCompletedDeleteTableAction() 579 if (tableDeletionLatch != null) { 580 tableDeletionLatch.countDown(); 581 } 582 } 583 } 584 585 public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd, 586 byte[][] splitKeys) throws Exception { 587 // NOTE: We need a latch because admin is not sync, 588 // so the postOp coprocessor method may be called after the admin operation returned. 589 MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost() 590 .findCoprocessor(MasterSyncCoprocessor.class); 591 coproc.tableCreationLatch = new CountDownLatch(1); 592 if (splitKeys != null) { 593 admin.createTable(htd, splitKeys); 594 } else { 595 admin.createTable(htd); 596 } 597 coproc.tableCreationLatch.await(); 598 coproc.tableCreationLatch = null; 599 testUtil.waitUntilAllRegionsAssigned(htd.getTableName()); 600 } 601 602 public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName) 603 throws Exception { 604 // NOTE: We need a latch because admin is not sync, 605 // so the postOp coprocessor method may be called after the admin operation returned. 606 MasterSyncCoprocessor coproc = testUtil.getHBaseCluster().getMaster().getMasterCoprocessorHost() 607 .findCoprocessor(MasterSyncCoprocessor.class); 608 coproc.tableDeletionLatch = new CountDownLatch(1); 609 try { 610 admin.disableTable(tableName); 611 } catch (Exception e) { 612 LOG.debug("Table: " + tableName + " already disabled, so just deleting it."); 613 } 614 admin.deleteTable(tableName); 615 coproc.tableDeletionLatch.await(); 616 coproc.tableDeletionLatch = null; 617 } 618}