/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.rsgroup;

import static org.apache.hadoop.hbase.util.Threads.sleep;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Function;
import org.apache.hadoop.hbase.ClusterMetrics.Option;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.constraint.ConstraintException;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
import org.apache.hadoop.hbase.net.Address;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.RSGroupTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;

/**
 * Tests for region server group (RSGroup) admin operations: moving regions, servers and tables
 * between groups, removing servers from groups, and retrying a server move after a region that
 * was forced into the SPLITTING state made the first attempt fail.
 */
@Category({ RSGroupTests.class, LargeTests.class })
public class TestRSGroupsAdmin2 extends TestRSGroupsBase {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestRSGroupsAdmin2.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestRSGroupsAdmin2.class);

  @BeforeClass
  public static void setUp() throws Exception {
    setUpTestBeforeClass();
  }

  @AfterClass
  public static void tearDown() throws Exception {
    tearDownAfterClass();
  }

  @Before
  public void beforeMethod() throws Exception {
    setUpBeforeMethod();
  }

  @After
  public void afterMethod() throws Exception {
    tearDownAfterMethod();
  }

  /**
   * Blocks until the region map reports at least {@code minRegionCount} regions for
   * {@code tableName}, or the wait times out after {@code WAIT_TIMEOUT}.
   * @param minRegionCount minimum number of regions the table must have
   * @throws Exception if the wait times out or the region map cannot be fetched
   */
  private void waitUntilTableHasAtLeastRegions(final int minRegionCount) throws Exception {
    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        // Fetch the map once per evaluation; it is rebuilt from the cluster on every call.
        List<String> regions = getTableRegionMap().get(tableName);
        return regions != null && regions.size() >= minRegionCount;
      }
    });
  }

  /**
   * Moves a server into a new group, then asks for a region of a table that is not in that group
   * to be moved onto the server, and verifies the server did not open the region.
   */
  @Test
  public void testRegionMove() throws Exception {
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);
    final byte[] familyNameBytes = Bytes.toBytes("f");
    // All the regions created below will be assigned to the default group.
    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 6);
    waitUntilTableHasAtLeastRegions(6);

    // get target region to move
    Map<ServerName, List<String>> assignMap = getTableServerRegionMap().get(tableName);
    String targetRegion = null;
    for (List<String> regionsOnServer : assignMap.values()) {
      if (!regionsOnServer.isEmpty()) {
        targetRegion = regionsOnServer.get(0);
        break;
      }
    }
    assertNotNull("No region found for table " + tableName, targetRegion);

    // get server which is not a member of new group
    ServerName tmpTargetServer = null;
    for (ServerName server : ADMIN.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
      .getLiveServerMetrics().keySet()) {
      if (!newGroup.containsServer(server.getAddress())) {
        tmpTargetServer = server;
        break;
      }
    }
    assertNotNull("No live server found outside of group " + newGroup.getName(),
      tmpTargetServer);
    final ServerName targetServer = tmpTargetServer;

    // move target server to group
    ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()), newGroup.getName());
    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return ADMIN.getRegions(targetServer).isEmpty();
      }
    });

    // Lets move this region to the new group.
    TEST_UTIL.getAdmin()
      .move(Bytes.toBytes(RegionInfo.encodeRegionName(Bytes.toBytes(targetRegion))), targetServer);
    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        List<String> regions = getTableRegionMap().get(tableName);
        return regions != null && regions.size() == 6
          && ADMIN.getClusterMetrics(EnumSet.of(Option.REGIONS_IN_TRANSITION))
            .getRegionStatesInTransition().isEmpty();
      }
    });

    // verify that targetServer didn't open it
    for (RegionInfo region : ADMIN.getRegions(targetServer)) {
      if (targetRegion.equals(region.getRegionNameAsString())) {
        fail("Target server opened region");
      }
    }
  }

  /**
   * Creates two single-server groups, checks the resulting group sizes, then moves the servers
   * back to the default group and removes both groups.
   */
  @Test
  public void testRegionServerMove() throws IOException, InterruptedException {
    int initNumGroups = ADMIN.listRSGroups().size();
    RSGroupInfo appInfo = addGroup(getGroupName(name.getMethodName()), 1);
    RSGroupInfo adminInfo = addGroup(getGroupName(name.getMethodName()), 1);
    RSGroupInfo dInfo = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP);
    assertEquals(initNumGroups + 2, ADMIN.listRSGroups().size());
    assertEquals(1, adminInfo.getServers().size());
    assertEquals(1, appInfo.getServers().size());
    assertEquals(getNumServers() - 2, dInfo.getServers().size());
    ADMIN.moveServersToRSGroup(appInfo.getServers(), RSGroupInfo.DEFAULT_GROUP);
    ADMIN.removeRSGroup(appInfo.getName());
    ADMIN.moveServersToRSGroup(adminInfo.getServers(), RSGroupInfo.DEFAULT_GROUP);
    ADMIN.removeRSGroup(adminInfo.getName());
    assertEquals(initNumGroups, ADMIN.listRSGroups().size());
  }

  /**
   * Verifies that moving a bogus server fails with the expected message, that servers can be
   * moved between groups and back to default, and that emptied groups can be removed.
   */
  @Test
  public void testMoveServers() throws Exception {
    // create groups and assign servers
    addGroup("bar", 3);
    ADMIN.addRSGroup("foo");

    RSGroupInfo barGroup = ADMIN.getRSGroup("bar");
    RSGroupInfo fooGroup = ADMIN.getRSGroup("foo");
    assertEquals(3, barGroup.getServers().size());
    assertEquals(0, fooGroup.getServers().size());

    // test fail bogus server move
    try {
      ADMIN.moveServersToRSGroup(Sets.newHashSet(Address.fromString("foo:9999")), "foo");
      fail("Bogus servers shouldn't have been successfully moved.");
    } catch (IOException ex) {
      String exp = "Server foo:9999 is either offline or it does not exist.";
      String msg = "Expected '" + exp + "' in exception message: ";
      assertTrue(msg + " " + ex.getMessage(), ex.getMessage().contains(exp));
    }

    // test success case
    LOG.info("moving servers " + barGroup.getServers() + " to group foo");
    ADMIN.moveServersToRSGroup(barGroup.getServers(), fooGroup.getName());

    barGroup = ADMIN.getRSGroup("bar");
    fooGroup = ADMIN.getRSGroup("foo");
    assertEquals(0, barGroup.getServers().size());
    assertEquals(3, fooGroup.getServers().size());

    LOG.info("moving servers " + fooGroup.getServers() + " to group default");
    ADMIN.moveServersToRSGroup(fooGroup.getServers(), RSGroupInfo.DEFAULT_GROUP);

    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return getNumServers() == ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size();
      }
    });

    fooGroup = ADMIN.getRSGroup("foo");
    assertEquals(0, fooGroup.getServers().size());

    // test group removal
    LOG.info("Remove group " + barGroup.getName());
    ADMIN.removeRSGroup(barGroup.getName());
    assertNull(ADMIN.getRSGroup(barGroup.getName()));
    LOG.info("Remove group " + fooGroup.getName());
    ADMIN.removeRSGroup(fooGroup.getName());
    assertNull(ADMIN.getRSGroup(fooGroup.getName()));
  }

  /**
   * Verifies that online and dead servers cannot be removed from a group (with the expected error
   * messages), while a decommissioned server can, and that the remove-servers observer hooks ran.
   */
  @Test
  public void testRemoveServers() throws Exception {
    LOG.info("testRemoveServers");
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 3);
    Iterator<Address> iterator = newGroup.getServers().iterator();
    ServerName targetServer = getServerName(iterator.next());

    // remove online servers
    try {
      ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress()));
      fail("Online servers shouldn't have been successfully removed.");
    } catch (IOException ex) {
      String exp =
        "Server " + targetServer.getAddress() + " is an online server, not allowed to remove.";
      String msg = "Expected '" + exp + "' in exception message: ";
      assertTrue(msg + " " + ex.getMessage(), ex.getMessage().contains(exp));
    }
    // NOTE(review): this inspects the RSGroupInfo object returned by addGroup() above, not a
    // freshly fetched one -- confirm whether the live group state should be checked instead.
    assertTrue(newGroup.getServers().contains(targetServer.getAddress()));

    // remove dead servers
    NUM_DEAD_SERVERS = CLUSTER.getClusterMetrics().getDeadServerNames().size();
    try {
      // stopping may cause an exception
      // due to the connection loss
      LOG.info("stopping server " + targetServer.getServerName());
      ADMIN.stopRegionServer(targetServer.getAddress().toString());
      // NOTE(review): the counter is only bumped when the stop call returns normally; if the
      // call throws although the server did stop, the wait below may never be satisfied.
      NUM_DEAD_SERVERS++;
    } catch (Exception e) {
      // Best effort, but do not swallow silently: keep the cause visible in the test log.
      LOG.warn("Exception while stopping server {}", targetServer.getServerName(), e);
    }

    // wait for stopped regionserver to dead server list
    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return !MASTER.getServerManager().areDeadServersInProgress()
          && CLUSTER.getClusterMetrics().getDeadServerNames().size() == NUM_DEAD_SERVERS;
      }
    });

    try {
      ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress()));
      fail("Dead servers shouldn't have been successfully removed.");
    } catch (IOException ex) {
      String exp = "Server " + targetServer.getAddress() + " is on the dead servers list,"
        + " Maybe it will come back again, not allowed to remove.";
      String msg = "Expected '" + exp + "' in exception message: ";
      assertTrue(msg + " " + ex.getMessage(), ex.getMessage().contains(exp));
    }
    assertTrue(newGroup.getServers().contains(targetServer.getAddress()));

    // remove decommissioned servers
    List<ServerName> serversToDecommission = new ArrayList<>();
    targetServer = getServerName(iterator.next());
    assertTrue(MASTER.getServerManager().getOnlineServers().containsKey(targetServer));
    serversToDecommission.add(targetServer);

    ADMIN.decommissionRegionServers(serversToDecommission, true);
    assertEquals(1, ADMIN.listDecommissionedRegionServers().size());

    assertTrue(newGroup.getServers().contains(targetServer.getAddress()));
    ADMIN.removeServersFromRSGroup(Sets.newHashSet(targetServer.getAddress()));
    Set<Address> newGroupServers = ADMIN.getRSGroup(newGroup.getName()).getServers();
    assertFalse(newGroupServers.contains(targetServer.getAddress()));
    assertEquals(2, newGroupServers.size());

    assertTrue(OBSERVER.preRemoveServersCalled);
    assertTrue(OBSERVER.postRemoveServersCalled);
  }

  /**
   * Moves a server and a table into a new group (as two separate admin calls) and verifies group
   * membership of both afterwards, plus the no-op case where source and destination are equal.
   */
  @Test
  public void testMoveServersAndTables() throws Exception {
    LOG.info("testMoveServersAndTables");
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);
    // create table
    final byte[] familyNameBytes = Bytes.toBytes("f");
    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, 5);
    waitUntilTableHasAtLeastRegions(5);

    // get server which is not a member of new group
    // (the "master" group is looked up once instead of once per candidate server)
    final RSGroupInfo masterGroup = ADMIN.getRSGroup("master");
    ServerName targetServer = null;
    for (ServerName server : ADMIN.getClusterMetrics(EnumSet.of(Option.LIVE_SERVERS))
      .getLiveServerMetrics().keySet()) {
      if (
        !newGroup.containsServer(server.getAddress())
          && !masterGroup.containsServer(server.getAddress())
      ) {
        targetServer = server;
        break;
      }
    }
    assertNotNull("No live server found outside of the new group and the master group",
      targetServer);

    LOG.debug("Print group info : " + ADMIN.listRSGroups());
    int oldDefaultGroupServerSize = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size();
    int oldDefaultGroupTableSize = ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP).size();
    assertTrue(OBSERVER.preListTablesInRSGroupCalled);
    assertTrue(OBSERVER.postListTablesInRSGroupCalled);

    // test fail bogus server move
    try {
      ADMIN.moveServersToRSGroup(Sets.newHashSet(Address.fromString("foo:9999")),
        newGroup.getName());
      ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName());
      fail("Bogus servers shouldn't have been successfully moved.");
    } catch (IOException ex) {
      String exp = "Server foo:9999 is either offline or it does not exist.";
      String msg = "Expected '" + exp + "' in exception message: ";
      assertTrue(msg + " " + ex.getMessage(), ex.getMessage().contains(exp));
    }

    // test move when src = dst
    ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()),
      RSGroupInfo.DEFAULT_GROUP);
    ADMIN.setRSGroup(Sets.newHashSet(tableName), RSGroupInfo.DEFAULT_GROUP);

    // verify default group info
    assertEquals(oldDefaultGroupServerSize,
      ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size());
    assertEquals(oldDefaultGroupTableSize,
      ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP).size());

    // verify new group info
    assertEquals(1, ADMIN.getRSGroup(newGroup.getName()).getServers().size());
    assertEquals(0,
      ADMIN.getConfiguredNamespacesAndTablesInRSGroup(newGroup.getName()).getSecond().size());
    assertTrue(OBSERVER.preGetConfiguredNamespacesAndTablesInRSGroupCalled);
    assertTrue(OBSERVER.postGetConfiguredNamespacesAndTablesInRSGroupCalled);

    // get all region to move targetServer
    List<String> regionList = getTableRegionMap().get(tableName);
    for (String region : regionList) {
      // Lets move this region to the targetServer
      TEST_UTIL.getAdmin().move(Bytes.toBytes(RegionInfo.encodeRegionName(Bytes.toBytes(region))),
        targetServer);
    }

    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        List<String> regions = getTableRegionMap().get(tableName);
        return regions != null && regions.size() == 5
          && getTableServerRegionMap().get(tableName).size() == 1
          && ADMIN.getClusterMetrics(EnumSet.of(Option.REGIONS_IN_TRANSITION))
            .getRegionStatesInTransition().isEmpty();
      }
    });

    // verify that all region move to targetServer
    assertEquals(5, getTableServerRegionMap().get(tableName).get(targetServer).size());

    // move targetServer and table to newGroup
    LOG.info("moving server and table to newGroup");
    ADMIN.moveServersToRSGroup(Sets.newHashSet(targetServer.getAddress()), newGroup.getName());
    ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName());

    // verify group change
    assertEquals(newGroup.getName(), ADMIN.getRSGroup(tableName).getName());

    // verify servers' not exist in old group
    Set<Address> defaultServers = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers();
    assertFalse(defaultServers.contains(targetServer.getAddress()));

    // verify servers' exist in new group
    Set<Address> newGroupServers = ADMIN.getRSGroup(newGroup.getName()).getServers();
    assertTrue(newGroupServers.contains(targetServer.getAddress()));

    // verify tables' not exist in old group
    Set<TableName> defaultTables =
      Sets.newHashSet(ADMIN.listTablesInRSGroup(RSGroupInfo.DEFAULT_GROUP));
    assertFalse(defaultTables.contains(tableName));

    // verify tables' exist in new group
    Set<TableName> newGroupTables = Sets
      .newHashSet(ADMIN.getConfiguredNamespacesAndTablesInRSGroup(newGroup.getName()).getSecond());
    assertTrue(newGroupTables.contains(tableName));

    // verify that all region still assign on targetServer
    // TODO: uncomment after we reimplement moveServersAndTables, now the implementation is
    // moveToRSGroup first and then moveTables, so the region will be moved to other region servers.
    // assertEquals(5, getTableServerRegionMap().get(tableName).get(targetServer).size());

    assertTrue(OBSERVER.preMoveServersCalled);
    assertTrue(OBSERVER.postMoveServersCalled);
  }

  /**
   * Verifies that the default group cannot be emptied, and that a single server can be moved out
   * of the default group and back again.
   */
  @Test
  public void testMoveServersFromDefaultGroup() throws Exception {
    // create groups and assign servers
    ADMIN.addRSGroup("foo");

    RSGroupInfo fooGroup = ADMIN.getRSGroup("foo");
    assertEquals(0, fooGroup.getServers().size());
    RSGroupInfo defaultGroup = ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP);

    // test remove all servers from default
    try {
      ADMIN.moveServersToRSGroup(defaultGroup.getServers(), fooGroup.getName());
      fail(RSGroupInfoManagerImpl.KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE);
    } catch (ConstraintException ex) {
      assertTrue(
        ex.getMessage().contains(RSGroupInfoManagerImpl.KEEP_ONE_SERVER_IN_DEFAULT_ERROR_MESSAGE));
    }

    // test success case, remove one server from default ,keep at least one server
    if (defaultGroup.getServers().size() > 1) {
      Address serverInDefaultGroup = defaultGroup.getServers().iterator().next();
      LOG.info("moving server " + serverInDefaultGroup + " from group default to group "
        + fooGroup.getName());
      ADMIN.moveServersToRSGroup(Sets.newHashSet(serverInDefaultGroup), fooGroup.getName());
    }

    fooGroup = ADMIN.getRSGroup("foo");
    LOG.info("moving servers " + fooGroup.getServers() + " to group default");
    ADMIN.moveServersToRSGroup(fooGroup.getServers(), RSGroupInfo.DEFAULT_GROUP);

    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() throws Exception {
        return getNumServers() == ADMIN.getRSGroup(RSGroupInfo.DEFAULT_GROUP).getServers().size();
      }
    });

    fooGroup = ADMIN.getRSGroup("foo");
    assertEquals(0, fooGroup.getServers().size());

    // test group removal
    LOG.info("Remove group " + fooGroup.getName());
    ADMIN.removeRSGroup(fooGroup.getName());
    assertNull(ADMIN.getRSGroup(fooGroup.getName()));
  }

  /**
   * Forces one region into SPLITTING, starts a server move in one thread while a second thread
   * restores the region to OPEN once it is the only region left on the server, and then waits
   * until the server no longer hosts any region.
   */
  @Test
  public void testFailedMoveBeforeRetryExhaustedWhenMoveServer() throws Exception {
    String groupName = getGroupName(name.getMethodName());
    ADMIN.addRSGroup(groupName);
    final RSGroupInfo newGroup = ADMIN.getRSGroup(groupName);
    Pair<ServerName, RegionStateNode> gotPair = createTableWithRegionSplitting(newGroup, 10);

    // start thread to recover region state
    final ServerName movedServer = gotPair.getFirst();
    final RegionStateNode rsn = gotPair.getSecond();
    AtomicBoolean changed = new AtomicBoolean(false);
    Thread t1 = recoverRegionStateThread(movedServer,
      server -> MASTER.getAssignmentManager().getRegionsOnServer(server), rsn, changed);
    t1.start();

    // move target server to group
    Thread t2 = new Thread(() -> {
      LOG.info("thread2 start running, to move regions");
      try {
        ADMIN.moveServersToRSGroup(Sets.newHashSet(movedServer.getAddress()), newGroup.getName());
      } catch (IOException e) {
        LOG.error("move server error", e);
      }
    });
    t2.start();

    t1.join();
    t2.join();

    TEST_UTIL.waitFor(WAIT_TIMEOUT, new Waiter.Predicate<Exception>() {
      @Override
      public boolean evaluate() {
        // equals() is invoked on movedServer so the comparison does not depend on the region
        // location being non-null.
        return changed.get()
          && MASTER.getAssignmentManager().getRegionsOnServer(movedServer).isEmpty()
          && !movedServer.equals(rsn.getRegionLocation());
      }
    });
  }

  /**
   * Builds (but does not start) a thread that polls the regions of {@code owner} and, once
   * exactly one region is left, asserts it is the region of {@code rsn}, sets that region's state
   * to OPEN and sets {@code changed} to {@code true}.
   * @param owner      the object whose regions are polled
   * @param getRegions function returning the current regions of {@code owner}
   * @param rsn        the region state node whose state is to be restored to OPEN
   * @param changed    flag set to {@code true} once the state has been restored
   * @return the not-yet-started recovery thread
   */
  private <T> Thread recoverRegionStateThread(T owner, Function<T, List<RegionInfo>> getRegions,
    RegionStateNode rsn, AtomicBoolean changed) {
    return new Thread(() -> {
      LOG.info("thread1 start running, will recover region state");
      final long startTime = EnvironmentEdgeManager.currentTime();
      // wait until there is only left the region we changed state and recover its state.
      // wait time is set according to the number of max retries, all except failed regions will be
      // moved in one retry, and will sleep 1s until next retry.
      final long maxWaitMs = RSGroupInfoManagerImpl.DEFAULT_MAX_RETRY_VALUE * 1000L;
      while (EnvironmentEdgeManager.currentTime() - startTime <= maxWaitMs) {
        List<RegionInfo> regions = getRegions.apply(owner);
        LOG.debug("server table region size is:{}", regions.size());
        assertFalse("Expected at least one region to remain", regions.isEmpty());
        // when there is exactly one region left, we can determine the move operation encountered
        // exception caused by the strange region state.
        if (regions.size() == 1) {
          assertEquals(rsn.getRegionInfo().getRegionNameAsString(),
            regions.get(0).getRegionNameAsString());
          rsn.setState(RegionState.State.OPEN);
          LOG.info("set region {} state OPEN", rsn.getRegionInfo().getRegionNameAsString());
          changed.set(true);
          break;
        }
        sleep(5000);
      }
    });
  }

  /**
   * Creates {@code tableName} with {@code tableRegionCount} regions, waits until they show up in
   * the region map, then sets one of its regions to SPLITTING.
   * @param rsGroupInfo      group whose servers are skipped when choosing the region
   * @param tableRegionCount number of regions to create
   * @return source server of the chosen region, and its region state node
   * @throws Exception if table creation, the wait, or the state change fails
   */
  private Pair<ServerName, RegionStateNode> createTableWithRegionSplitting(RSGroupInfo rsGroupInfo,
    int tableRegionCount) throws Exception {
    final byte[] familyNameBytes = Bytes.toBytes("f");
    // All the regions created below will be assigned to the default group.
    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount);
    waitUntilTableHasAtLeastRegions(tableRegionCount);

    return randomlySetOneRegionStateToSplitting(rsGroupInfo);
  }

  /**
   * Choose a region of {@code tableName} hosted outside of the given group and set its state to
   * SPLITTING, which makes a subsequent move of that region fail.
   * @param newGroup target group
   * @return source server of region, and region state
   * @throws IOException if methods called throw
   */
  private Pair<ServerName, RegionStateNode>
    randomlySetOneRegionStateToSplitting(RSGroupInfo newGroup) throws IOException {
    return randomlySetRegionState(newGroup, RegionState.State.SPLITTING, tableName);
  }

  /**
   * Picks the first region of the first table that is hosted on a server outside of
   * {@code groupInfo} and sets its state to {@code state}. When two tables are given, only servers
   * hosting regions of both tables are considered.
   * @param groupInfo  group whose servers are skipped
   * @param state      state to set on the chosen region
   * @param tableNames one or two tables; the region is taken from the first one
   * @return source server of the chosen region, and its region state node
   * @throws IOException if the region assignment cannot be fetched
   */
  private Pair<ServerName, RegionStateNode> randomlySetRegionState(RSGroupInfo groupInfo,
    RegionState.State state, TableName... tableNames) throws IOException {
    Preconditions.checkArgument(tableNames.length == 1 || tableNames.length == 2,
      "only support one or two tables");
    Map<TableName, Map<ServerName, List<String>>> tableServerRegionMap = getTableServerRegionMap();
    Map<ServerName, List<String>> assignMap = tableServerRegionMap.get(tableNames[0]);
    if (tableNames.length == 2) {
      // Keep only the servers that also host regions of the second table. The previous code
      // removed a key only when it was already absent from assignMap, which was a no-op.
      assignMap.keySet().retainAll(tableServerRegionMap.get(tableNames[1]).keySet());
    }
    String toCorrectRegionName = null;
    ServerName srcServer = null;
    for (Map.Entry<ServerName, List<String>> entry : assignMap.entrySet()) {
      ServerName server = entry.getKey();
      List<String> regionsOnServer = entry.getValue();
      if (!regionsOnServer.isEmpty() && !groupInfo.containsServer(server.getAddress())) {
        srcServer = server;
        toCorrectRegionName = regionsOnServer.get(0);
        break;
      }
    }
    assertNotNull("No server outside of group " + groupInfo.getName() + " hosts a region",
      srcServer);
    RegionInfo toCorrectRegionInfo = TEST_UTIL.getMiniHBaseCluster().getMaster()
      .getAssignmentManager().getRegionInfo(Bytes.toBytesBinary(toCorrectRegionName));
    RegionStateNode rsn = TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager()
      .getRegionStates().getRegionStateNode(toCorrectRegionInfo);
    rsn.setState(state);
    return new Pair<>(srcServer, rsn);
  }

  @Test
  public void testFailedMoveServersAndRepair() throws Exception {
    // This UT calls moveToRSGroup() twice to test the idempotency of it.
    // The first time, movement fails because a region is made in SPLITTING state
    // which will not be moved.
    // The second time, the region state is OPEN and check if all
    // regions on target group servers after the call.
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);

    // create table
    // randomly set a region state to SPLITTING to make move abort
    Pair<ServerName, RegionStateNode> gotPair =
      createTableWithRegionSplitting(newGroup, ThreadLocalRandom.current().nextInt(8) + 4);
    RegionStateNode rsn = gotPair.getSecond();
    ServerName srcServer = rsn.getRegionLocation();

    // move server to newGroup and check regions
    try {
      ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName());
      fail("should get IOException when retry exhausted but there still exists failed moved "
        + "regions");
    } catch (Exception e) {
      assertTrue(
        e.getMessage().contains(gotPair.getSecond().getRegionInfo().getRegionNameAsString()));
    }
    for (RegionInfo regionInfo : MASTER.getAssignmentManager().getAssignedRegions()) {
      if (regionInfo.getTable().equals(tableName) && regionInfo.equals(rsn.getRegionInfo())) {
        assertEquals(srcServer,
          MASTER.getAssignmentManager().getRegionStates().getRegionServerOfRegion(regionInfo));
      }
    }

    // retry move server to newGroup and check if all regions on srcServer was moved
    rsn.setState(RegionState.State.OPEN);
    ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName());
    assertEquals(0, MASTER.getAssignmentManager().getRegionsOnServer(srcServer).size());
  }

  @Test
  public void testFailedMoveServersTablesAndRepair() throws Exception {
    // This UT calls moveTablesAndServers() twice to test the idempotency of it.
    // The first time, movement fails because a region is made in SPLITTING state
    // which will not be moved.
    // The second time, the region state is OPEN and check if all
    // regions on target group servers after the call.
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 1);
    // create table
    final byte[] familyNameBytes = Bytes.toBytes("f");
    TableName table1 = TableName.valueOf(tableName.getNameAsString() + "_1");
    TableName table2 = TableName.valueOf(tableName.getNameAsString() + "_2");
    Random rand = ThreadLocalRandom.current();
    TEST_UTIL.createMultiRegionTable(table1, familyNameBytes, rand.nextInt(12) + 4);
    TEST_UTIL.createMultiRegionTable(table2, familyNameBytes, rand.nextInt(12) + 4);

    // randomly set a region state to SPLITTING to make move abort
    Pair<ServerName, RegionStateNode> gotPair =
      randomlySetRegionState(newGroup, RegionState.State.SPLITTING, table1, table2);
    RegionStateNode rsn = gotPair.getSecond();
    ServerName srcServer = rsn.getRegionLocation();

    // move server and table to newGroup and check regions
    try {
      ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName());
      ADMIN.setRSGroup(Sets.newHashSet(table2), newGroup.getName());
      fail("should get IOException when retry exhausted but there still exists failed moved "
        + "regions");
    } catch (Exception e) {
      assertTrue(
        e.getMessage().contains(gotPair.getSecond().getRegionInfo().getRegionNameAsString()));
    }
    for (RegionInfo regionInfo : MASTER.getAssignmentManager().getAssignedRegions()) {
      if (regionInfo.getTable().equals(table1) && regionInfo.equals(rsn.getRegionInfo())) {
        assertEquals(srcServer,
          MASTER.getAssignmentManager().getRegionStates().getRegionServerOfRegion(regionInfo));
      }
    }

    // retry moveServersAndTables to newGroup and check if all regions on srcServer belongs to
    // table2
    rsn.setState(RegionState.State.OPEN);
    ADMIN.moveServersToRSGroup(Sets.newHashSet(srcServer.getAddress()), newGroup.getName());
    ADMIN.setRSGroup(Sets.newHashSet(table2), newGroup.getName());
    for (RegionInfo regionsInfo : MASTER.getAssignmentManager().getRegionsOnServer(srcServer)) {
      assertEquals(table2, regionsInfo.getTable());
    }
  }

  /**
   * Measures how long it takes to move one server out of a two-server group hosting a 200-region
   * table and asserts that it stays below 15000 ms.
   */
  @Test
  public void testMoveServersToRSGroupPerformance() throws Exception {
    final RSGroupInfo newGroup = addGroup(getGroupName(name.getMethodName()), 2);
    final byte[] familyNameBytes = Bytes.toBytes("f");
    // 200 regions over the two servers of the group, i.e. roughly 100 regions per server
    final int tableRegionCount = 200;
    // All the regions created below will be assigned to the default group.
    TEST_UTIL.createMultiRegionTable(tableName, familyNameBytes, tableRegionCount);
    waitUntilTableHasAtLeastRegions(tableRegionCount);
    ADMIN.setRSGroup(Sets.newHashSet(tableName), newGroup.getName());
    TEST_UTIL.waitUntilAllRegionsAssigned(tableName);
    String rsGroup2 = "rsGroup2";
    ADMIN.addRSGroup(rsGroup2);

    long startTime = EnvironmentEdgeManager.currentTime();
    ADMIN.moveServersToRSGroup(Sets.newHashSet(newGroup.getServers().iterator().next()), rsGroup2);
    long timeTaken = EnvironmentEdgeManager.currentTime() - startTime;
    String msg =
      "Should not take mote than 15000 ms to move a table with 100 regions. Time taken ="
        + timeTaken + " ms";
    // This test case is meant to be used for verifying the performance quickly by a developer.
    // Moving 100 regions takes much less than 15000 ms. Given 15000 ms so test cases passes
    // on all environment.
    assertTrue(msg, timeTaken < 15000);
    LOG.info("Time taken to move a table with 100 region is {} ms", timeTaken);
  }
}