/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.BalanceResponse;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.Threads;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;

/**
 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide
 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result
 * acceptable.
 * <p>
 * The test is parameterized over the balancer implementation class name; each parameter value gets
 * its own mini cluster, started in {@link #before()} and shut down in {@link #after()}.
 */
@Category({ FlakeyTests.class, LargeTests.class })
@RunWith(value = Parameterized.class)
public class TestRegionRebalancing {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestRegionRebalancing.class);

  /**
   * Supplies the balancer class names the test is run against.
   * @return one single-element parameter array per balancer implementation
   */
  @Parameters
  public static Collection<Object[]> data() {
    // Declared as Object[][] (not String[][]) so the runtime array type matches the static type.
    Object[][] balancers =
      new Object[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" },
        { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } };
    return Arrays.asList(balancers);
  }

  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);

  /** Number of times {@link #assertRegionsAreBalanced()} checks the cluster before failing. */
  private static final int MAX_BALANCE_ATTEMPTS = 5;
  /** Pause between an unbalanced observation and the next balancer run, in milliseconds. */
  private static final long BALANCE_RETRY_SLEEP_MS = 10000;
  /** Polling interval used while waiting for regions to be served, in milliseconds. */
  private static final long ASSIGNMENT_POLL_SLEEP_MS = 200;

  private final HBaseTestingUtil UTIL = new HBaseTestingUtil();
  private RegionLocator regionLocator;
  private TableDescriptor tableDescriptor;
  private final String balancerName;

  /**
   * @param balancerName fully qualified class name of the load balancer under test
   */
  public TestRegionRebalancing(String balancerName) {
    this.balancerName = balancerName;
  }

  /** Shuts down the mini cluster started by {@link #before()}. */
  @After
  public void after() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  /**
   * Configures the balancer class under test, starts a mini cluster via
   * {@code startMiniCluster(1)} and prepares the descriptor of the test table.
   */
  @Before
  public void before() throws Exception {
    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
    // NOTE(review): an earlier comment here said minCostNeedBalance is set to 0 so that the
    // balancer always runs, but no such configuration is applied in this method. Confirm whether
    // the setting is still required before re-adding it.
    UTIL.startMiniCluster(1);

    this.tableDescriptor = TableDescriptorBuilder.newBuilder(TableName.valueOf("test"))
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY_NAME)).build();
  }

  /**
   * For HBASE-71. Try a few different configurations of starting and stopping region servers to see
   * if the assignment of regions is pretty balanced.
   */
  @Test
  public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException {
    try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
      Admin admin = connection.getAdmin()) {
      admin.createTable(this.tableDescriptor,
        Arrays.copyOfRange(HBaseTestingUtil.KEYS, 1, HBaseTestingUtil.KEYS.length));

      // try-with-resources so the locator is closed even when an assertion below fails.
      try (RegionLocator locator =
        connection.getRegionLocator(this.tableDescriptor.getTableName())) {
        this.regionLocator = locator;

        MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());

        assertEquals("Test table should have right number of regions", HBaseTestingUtil.KEYS.length,
          this.regionLocator.getStartKeys().length);

        verifyRebalancingAcrossServerChanges();
      }
    }
  }

  /**
   * Walks the cluster through a sequence of region server additions and removals, running the
   * balancer after each change and asserting that the regions end up balanced.
   */
  private void verifyRebalancingAcrossServerChanges() throws IOException {
    // verify that the region assignments are balanced to start out
    assertRegionsAreBalanced();

    // add a region server - total of 2
    LOG.info("Started second server={}",
      UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
    UTIL.getHBaseCluster().getMaster().balance();
    assertRegionsAreBalanced();

    // On a balanced cluster, calling balance() should still report that the balancer ran,
    // but with nothing to move.
    BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance();
    assertTrue(response.isBalancerRan());
    assertEquals(0, response.getMovesCalculated());
    assertEquals(0, response.getMovesExecuted());

    // if we add a server, then the balance() call should calculate and execute moves
    // add a region server - total of 3
    LOG.info("Started third server={}",
      UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
    waitForAllRegionsAssigned();
    assertBalancerMovesRegions();
    assertRegionsAreBalanced();

    // kill a region server - total of 2
    LOG.info("Stopped third server={}", UTIL.getHBaseCluster().stopRegionServer(2, false));
    UTIL.getHBaseCluster().waitOnRegionServer(2);
    waitOnCrashProcessing();
    UTIL.getHBaseCluster().getMaster().balance();
    assertRegionsAreBalanced();

    // start two more region servers - total of 4
    LOG.info("Readding third server={}",
      UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
    LOG.info("Added fourth server={}",
      UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
    waitOnCrashProcessing();
    waitForAllRegionsAssigned();
    assertBalancerMovesRegions();
    assertRegionsAreBalanced();

    // start six more region servers - total of 10
    for (int i = 0; i < 6; i++) {
      LOG.info("Adding {}th region server", i + 5);
      UTIL.getHBaseCluster().startRegionServer();
    }
    waitForAllRegionsAssigned();
    assertBalancerMovesRegions();
    assertRegionsAreBalanced();
  }

  /**
   * Runs the balancer once and asserts that it ran, calculated at least one move, and executed
   * every move it calculated.
   */
  private void assertBalancerMovesRegions() throws IOException {
    BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance();
    assertTrue(response.isBalancerRan());
    assertTrue(response.getMovesCalculated() > 0);
    assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
  }

  /**
   * Wait on crash processing. Balancer won't run if processing a crashed server.
   */
  private void waitOnCrashProcessing() throws IOException {
    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
      LOG.info("Waiting on processing of crashed server before proceeding...");
      Threads.sleep(1000);
    }
  }

  /**
   * Determine if regions are balanced. Figure out the total, divide by the number of online
   * servers, then test if each server is within the slop-adjusted bounds around the average.
   * <p>
   * The check is retried up to {@link #MAX_BALANCE_ATTEMPTS} times; after each unbalanced
   * observation the method sleeps and triggers another balancer run. Fails the test if the
   * cluster is still unbalanced after the last attempt.
   * @throws InterruptedIOException if the thread is interrupted while waiting between attempts
   */
  private void assertRegionsAreBalanced() throws IOException {
    // TODO: Fix this test. Old balancer used to run with 'slop'. New
    // balancer does not.
    float slop = UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
    if (slop <= 0) {
      slop = 1;
    }

    for (int attempt = 0; attempt < MAX_BALANCE_ATTEMPTS; attempt++) {
      // make sure all the regions are reassigned before we test balance
      waitForAllRegionsAssigned();

      if (allServersWithinBounds(slop, attempt)) {
        // all servers were balanced, so we are done.
        return;
      }

      // one or more servers are not balanced. sleep a little to give it a
      // chance to catch up. then, go back to the retry loop.
      sleepUninterrupted(BALANCE_RETRY_SLEEP_MS, "waiting for regions to become balanced");
      UTIL.getHBaseCluster().getMaster().balance();
    }
    // if we get here, we exhausted all attempts and never got to short circuit out of
    // the retry loop, so this is a failure.
    fail("After " + MAX_BALANCE_ATTEMPTS + " attempts, region assignments were not balanced.");
  }

  /**
   * Takes one snapshot of the per-server region counts and checks each online server against the
   * slop-adjusted bounds.
   * @param slop    fraction of the average load tolerated above and below the average
   * @param attempt zero-based attempt number, used for logging only
   * @return {@code true} if every online server is within bounds, {@code false} as soon as one is
   *         found that is not
   */
  private boolean allServersWithinBounds(float slop, int attempt) throws IOException {
    long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
    List<HRegionServer> servers = getOnlineRegionServers();
    double avg = (double) regionCount / (double) servers.size();
    int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop));
    int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1;
    // Increase the margin a little to accommodate StochasticLoadBalancer
    if (this.balancerName.contains("StochasticLoadBalancer")) {
      avgLoadPlusSlop++;
      avgLoadMinusSlop--;
    }
    LOG.debug(
      "There are {} servers and {} regions. Load Average: {} low border: {}, up border: {};"
        + " attempt: {}",
      servers.size(), regionCount, avg, avgLoadMinusSlop, avgLoadPlusSlop, attempt);

    for (HRegionServer server : servers) {
      // Fetch the online regions once so the count and the meta adjustment below are computed
      // from the same snapshot.
      List<RegionInfo> onlineRegions = ProtobufUtil.getOnlineRegions(server.getRSRpcServices());
      int serverLoad = onlineRegions.size();
      LOG.debug("{} Avg: {} actual: {}", server.getServerName(), avg, serverLoad);

      boolean withinBounds = serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop;
      if (avg > 2.0 && withinBounds) {
        continue;
      }

      // Re-check with meta regions excluded from this server's load.
      for (RegionInfo hri : onlineRegions) {
        if (hri.isMetaRegion()) {
          serverLoad--;
        }
      }
      if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
        LOG.debug("{} Isn't balanced!!! Avg: {} actual: {} slop: {}", server.getServerName(), avg,
          serverLoad, slop);
        return false;
      }
    }
    return true;
  }

  /**
   * @return the region servers of the mini cluster that currently report themselves as online
   */
  private List<HRegionServer> getOnlineRegionServers() {
    List<HRegionServer> list = new ArrayList<>();
    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) {
      if (rst.getRegionServer().isOnline()) {
        list.add(rst.getRegionServer());
      }
    }
    return list;
  }

  /**
   * Wait until all the regions are assigned: polls until the cluster serves at least
   * {@code HBaseTestingUtil.KEYS.length} regions, then waits until no regions are in transition.
   * @throws InterruptedIOException if the thread is interrupted while polling
   */
  private void waitForAllRegionsAssigned() throws IOException {
    int totalRegions = HBaseTestingUtil.KEYS.length;
    sleepUninterrupted(ASSIGNMENT_POLL_SLEEP_MS, "waiting for regions to be assigned");
    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
      LOG.debug("Waiting for there to be {} regions, but there are {} right now.", totalRegions,
        UTIL.getMiniHBaseCluster().countServedRegions());
      sleepUninterrupted(ASSIGNMENT_POLL_SLEEP_MS, "waiting for regions to be assigned");
    }
    UTIL.waitUntilNoRegionsInTransition();
  }

  /**
   * Sleeps for the given time, converting an interrupt into an {@link InterruptedIOException} that
   * keeps the original exception as its cause, so callers abort instead of silently continuing.
   * @param millis  how long to sleep, in milliseconds
   * @param purpose short description of what was being waited for, used in the exception message
   * @throws InterruptedIOException if the thread is interrupted while sleeping
   */
  private static void sleepUninterrupted(long millis, String purpose)
    throws InterruptedIOException {
    try {
      Thread.sleep(millis);
    } catch (InterruptedException e) {
      InterruptedIOException iioe = new InterruptedIOException("Interrupted while " + purpose);
      iioe.initCause(e);
      throw iioe;
    }
  }

}