001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertTrue;
022import static org.junit.Assert.fail;
023
024import java.io.IOException;
025import java.io.InterruptedIOException;
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.Collection;
029import java.util.List;
030import org.apache.hadoop.hbase.client.Admin;
031import org.apache.hadoop.hbase.client.BalanceResponse;
032import org.apache.hadoop.hbase.client.Connection;
033import org.apache.hadoop.hbase.client.ConnectionFactory;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.RegionLocator;
036import org.apache.hadoop.hbase.regionserver.HRegionServer;
037import org.apache.hadoop.hbase.testclassification.FlakeyTests;
038import org.apache.hadoop.hbase.testclassification.LargeTests;
039import org.apache.hadoop.hbase.util.Bytes;
040import org.apache.hadoop.hbase.util.JVMClusterUtil;
041import org.apache.hadoop.hbase.util.Threads;
042import org.junit.After;
043import org.junit.Before;
044import org.junit.ClassRule;
045import org.junit.Test;
046import org.junit.experimental.categories.Category;
047import org.junit.runner.RunWith;
048import org.junit.runners.Parameterized;
049import org.junit.runners.Parameterized.Parameters;
050import org.slf4j.Logger;
051import org.slf4j.LoggerFactory;
052
053import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
054
055/**
056 * Test whether region re-balancing works. (HBASE-71) The test only works for cluster wide
057 * balancing, not per table wide. Increase the margin a little to make StochasticLoadBalancer result
058 * acceptable.
059 */
060@Category({ FlakeyTests.class, LargeTests.class })
061@RunWith(value = Parameterized.class)
062public class TestRegionRebalancing {
063
064  @ClassRule
065  public static final HBaseClassTestRule CLASS_RULE =
066    HBaseClassTestRule.forClass(TestRegionRebalancing.class);
067
068  @Parameters
069  public static Collection<Object[]> data() {
070    Object[][] balancers =
071      new String[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" },
072        { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } };
073    return Arrays.asList(balancers);
074  }
075
076  private static final byte[] FAMILY_NAME = Bytes.toBytes("col");
077  private static final Logger LOG = LoggerFactory.getLogger(TestRegionRebalancing.class);
078  private final HBaseTestingUtility UTIL = new HBaseTestingUtility();
079  private RegionLocator regionLocator;
080  private HTableDescriptor desc;
081  private String balancerName;
082
083  public TestRegionRebalancing(String balancerName) {
084    this.balancerName = balancerName;
085
086  }
087
088  @After
089  public void after() throws Exception {
090    UTIL.shutdownMiniCluster();
091  }
092
093  @Before
094  public void before() throws Exception {
095    UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName);
096    // set minCostNeedBalance to 0, make sure balancer run
097    UTIL.startMiniCluster(1);
098    this.desc = new HTableDescriptor(TableName.valueOf("test"));
099    this.desc.addFamily(new HColumnDescriptor(FAMILY_NAME));
100  }
101
102  /**
103   * For HBASE-71. Try a few different configurations of starting and stopping region servers to see
104   * if the assignment or regions is pretty balanced.
105   */
106  @Test
107  public void testRebalanceOnRegionServerNumberChange() throws IOException, InterruptedException {
108    try (Connection connection = ConnectionFactory.createConnection(UTIL.getConfiguration());
109      Admin admin = connection.getAdmin()) {
110      admin.createTable(this.desc,
111        Arrays.copyOfRange(HBaseTestingUtility.KEYS, 1, HBaseTestingUtility.KEYS.length));
112      this.regionLocator = connection.getRegionLocator(this.desc.getTableName());
113
114      MetaTableAccessor.fullScanMetaAndPrint(admin.getConnection());
115
116      assertEquals("Test table should have right number of regions",
117        HBaseTestingUtility.KEYS.length, this.regionLocator.getStartKeys().length);
118
119      // verify that the region assignments are balanced to start out
120      assertRegionsAreBalanced();
121
122      // add a region server - total of 2
123      LOG.info("Started second server="
124        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
125      UTIL.getHBaseCluster().getMaster().balance();
126      assertRegionsAreBalanced();
127
128      // On a balanced cluster, calling balance() should return true
129      BalanceResponse response = UTIL.getHBaseCluster().getMaster().balance();
130      assertTrue(response.isBalancerRan());
131      assertEquals(0, response.getMovesCalculated());
132      assertEquals(0, response.getMovesExecuted());
133
134      // if we add a server, then the balance() call should return true
135      // add a region server - total of 3
136      LOG.info("Started third server="
137        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
138      waitForAllRegionsAssigned();
139
140      response = UTIL.getHBaseCluster().getMaster().balance();
141      assertTrue(response.isBalancerRan());
142      assertTrue(response.getMovesCalculated() > 0);
143      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
144      assertRegionsAreBalanced();
145
146      // kill a region server - total of 2
147      LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false));
148      UTIL.getHBaseCluster().waitOnRegionServer(2);
149      waitOnCrashProcessing();
150      UTIL.getHBaseCluster().getMaster().balance();
151      assertRegionsAreBalanced();
152
153      // start two more region servers - total of 4
154      LOG.info("Readding third server="
155        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
156      LOG.info("Added fourth server="
157        + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName());
158      waitOnCrashProcessing();
159      waitForAllRegionsAssigned();
160
161      response = UTIL.getHBaseCluster().getMaster().balance();
162      assertTrue(response.isBalancerRan());
163      assertTrue(response.getMovesCalculated() > 0);
164      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
165
166      assertRegionsAreBalanced();
167      for (int i = 0; i < 6; i++) {
168        LOG.info("Adding " + (i + 5) + "th region server");
169        UTIL.getHBaseCluster().startRegionServer();
170      }
171      waitForAllRegionsAssigned();
172
173      response = UTIL.getHBaseCluster().getMaster().balance();
174      assertTrue(response.isBalancerRan());
175      assertTrue(response.getMovesCalculated() > 0);
176      assertEquals(response.getMovesCalculated(), response.getMovesExecuted());
177
178      assertRegionsAreBalanced();
179      regionLocator.close();
180    }
181  }
182
183  /**
184   * Wait on crash processing. Balancer won't run if processing a crashed server.
185   */
186  private void waitOnCrashProcessing() {
187    while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
188      LOG.info("Waiting on processing of crashed server before proceeding...");
189      Threads.sleep(1000);
190    }
191  }
192
193  /**
194   * Determine if regions are balanced. Figure out the total, divide by the number of online
195   * servers, then test if each server is +/- 1 of average rounded up.
196   */
197  private void assertRegionsAreBalanced() throws IOException {
198    // TODO: Fix this test. Old balancer used to run with 'slop'. New
199    // balancer does not.
200    boolean success = false;
201    float slop = (float) UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f);
202    if (slop <= 0) slop = 1;
203
204    for (int i = 0; i < 5; i++) {
205      success = true;
206      // make sure all the regions are reassigned before we test balance
207      waitForAllRegionsAssigned();
208
209      long regionCount = UTIL.getMiniHBaseCluster().countServedRegions();
210      List<HRegionServer> servers = getOnlineRegionServers();
211      double avg = (double) regionCount / (double) servers.size();
212      int avgLoadPlusSlop = (int) Math.ceil(avg * (1 + slop));
213      int avgLoadMinusSlop = (int) Math.floor(avg * (1 - slop)) - 1;
214      // Increase the margin a little to accommodate StochasticLoadBalancer
215      if (this.balancerName.contains("StochasticLoadBalancer")) {
216        avgLoadPlusSlop++;
217        avgLoadMinusSlop--;
218      }
219      LOG.debug("There are " + servers.size() + " servers and " + regionCount
220        + " regions. Load Average: " + avg + " low border: " + avgLoadMinusSlop + ", up border: "
221        + avgLoadPlusSlop + "; attempt: " + i);
222
223      for (HRegionServer server : servers) {
224        int serverLoad = ProtobufUtil.getOnlineRegions(server.getRSRpcServices()).size();
225        LOG.debug(server.getServerName() + " Avg: " + avg + " actual: " + serverLoad);
226        if (!(avg > 2.0 && serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
227          for (RegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) {
228            if (hri.isMetaRegion()) serverLoad--;
229            // LOG.debug(hri.getRegionNameAsString());
230          }
231          if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) {
232            LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + " actual: "
233              + serverLoad + " slop: " + slop);
234            success = false;
235            break;
236          }
237        }
238      }
239
240      if (!success) {
241        // one or more servers are not balanced. sleep a little to give it a
242        // chance to catch up. then, go back to the retry loop.
243        try {
244          Thread.sleep(10000);
245        } catch (InterruptedException e) {
246        }
247
248        UTIL.getHBaseCluster().getMaster().balance();
249        continue;
250      }
251
252      // if we get here, all servers were balanced, so we should just return.
253      return;
254    }
255    // if we get here, we tried 5 times and never got to short circuit out of
256    // the retry loop, so this is a failure.
257    fail("After 5 attempts, region assignments were not balanced.");
258  }
259
260  private List<HRegionServer> getOnlineRegionServers() {
261    List<HRegionServer> list = new ArrayList<>();
262    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster().getRegionServerThreads()) {
263      if (rst.getRegionServer().isOnline()) {
264        list.add(rst.getRegionServer());
265      }
266    }
267    return list;
268  }
269
270  /**
271   * Wait until all the regions are assigned.
272   */
273  private void waitForAllRegionsAssigned() throws IOException {
274    int totalRegions = HBaseTestingUtility.KEYS.length;
275    try {
276      Thread.sleep(200);
277    } catch (InterruptedException e) {
278      throw new InterruptedIOException();
279    }
280    while (UTIL.getMiniHBaseCluster().countServedRegions() < totalRegions) {
281      // while (!cluster.getMaster().allRegionsAssigned()) {
282      LOG.debug("Waiting for there to be " + totalRegions + " regions, but there are "
283        + UTIL.getMiniHBaseCluster().countServedRegions() + " right now.");
284      try {
285        Thread.sleep(200);
286      } catch (InterruptedException e) {
287        throw new InterruptedIOException();
288      }
289    }
290    UTIL.waitUntilNoRegionsInTransition();
291  }
292
293}