001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertNotNull;
022import static org.junit.Assert.assertNull;
023import static org.junit.Assert.assertTrue;
024
025import java.io.IOException;
026import java.util.List;
027import java.util.Optional;
028import java.util.concurrent.CountDownLatch;
029import java.util.stream.Collectors;
030import org.apache.hadoop.conf.Configuration;
031import org.apache.hadoop.hbase.CompatibilityFactory;
032import org.apache.hadoop.hbase.HBaseClassTestRule;
033import org.apache.hadoop.hbase.ServerName;
034import org.apache.hadoop.hbase.StartMiniClusterOption;
035import org.apache.hadoop.hbase.TableName;
036import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
037import org.apache.hadoop.hbase.client.RegionInfo;
038import org.apache.hadoop.hbase.client.Table;
039import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
040import org.apache.hadoop.hbase.master.assignment.ServerState;
041import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
042import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
043import org.apache.hadoop.hbase.master.region.MasterRegion;
044import org.apache.hadoop.hbase.procedure2.Procedure;
045import org.apache.hadoop.hbase.regionserver.HRegionServer;
046import org.apache.hadoop.hbase.test.MetricsAssertHelper;
047import org.apache.hadoop.hbase.testclassification.LargeTests;
048import org.apache.hadoop.hbase.testclassification.MasterTests;
049import org.apache.hadoop.hbase.util.JVMClusterUtil;
050import org.junit.ClassRule;
051import org.junit.Test;
052import org.junit.experimental.categories.Category;
053import org.slf4j.Logger;
054import org.slf4j.LoggerFactory;
055
056@Category({ MasterTests.class, LargeTests.class })
057public class TestClusterRestartFailover extends AbstractTestRestartCluster {
058
059  @ClassRule
060  public static final HBaseClassTestRule CLASS_RULE =
061    HBaseClassTestRule.forClass(TestClusterRestartFailover.class);
062
063  private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
064  private static final MetricsAssertHelper metricsHelper =
065    CompatibilityFactory.getInstance(MetricsAssertHelper.class);
066
067  private volatile static CountDownLatch SCP_LATCH;
068  private static ServerName SERVER_FOR_TEST;
069
070  @Override
071  protected boolean splitWALCoordinatedByZk() {
072    return true;
073  }
074
075  private ServerStateNode getServerStateNode(ServerName serverName) {
076    return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
077      .getServerNode(serverName);
078  }
079
080  /**
081   * Test for HBASE-22964
082   */
083  @Test
084  public void test() throws Exception {
085    setupCluster();
086    setupTable();
087
088    // Find server that does not have hbase:namespace on it. This tests holds up SCPs. If it
089    // holds up the server w/ hbase:namespace, the Master initialization will be held up
090    // because this table is not online and test fails.
091    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster()
092      .getLiveRegionServerThreads()) {
093      HRegionServer rs = rst.getRegionServer();
094      if (rs.getRegions(TableName.NAMESPACE_TABLE_NAME).isEmpty()) {
095        SERVER_FOR_TEST = rs.getServerName();
096      }
097    }
098    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
099    ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
100    assertNotNull(serverNode);
101    assertTrue("serverNode should be ONLINE when cluster runs normally",
102      serverNode.isInState(ServerState.ONLINE));
103
104    SCP_LATCH = new CountDownLatch(1);
105
106    // Shutdown cluster and restart
107    List<Integer> ports =
108      UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
109        .map(serverName -> serverName.getPort()).collect(Collectors.toList());
110    LOG.info("Shutting down cluster");
111    UTIL.getHBaseCluster().killAll();
112    UTIL.getHBaseCluster().waitUntilShutDown();
113    LOG.info("Restarting cluster");
114    UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
115      .numMasters(1).numRegionServers(3).rsPorts(ports).build());
116    LOG.info("Started cluster");
117    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
118    LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
119    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
120    UTIL.waitFor(30000, new ExplainingPredicate<Exception>() {
121
122      @Override
123      public boolean evaluate() throws Exception {
124        return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE);
125      }
126
127      @Override
128      public String explainFailure() throws Exception {
129        return "serverNode should not be ONLINE during SCP processing";
130      }
131    });
132    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
133      .filter(p -> (p instanceof ServerCrashProcedure)
134        && ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST))
135      .findAny();
136    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
137    assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
138      Procedure.NO_PROC_ID,
139      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
140
141    // Wait the SCP to finish
142    LOG.info("Waiting on latch");
143    SCP_LATCH.countDown();
144    UTIL.waitFor(60000, () -> procedure.get().isFinished());
145    assertNull("serverNode should be deleted after SCP finished",
146      getServerStateNode(SERVER_FOR_TEST));
147
148    assertEquals(
149      "Even when the SCP is finished, the duplicate SCP should not be scheduled for "
150        + SERVER_FOR_TEST,
151      Procedure.NO_PROC_ID,
152      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
153
154    MetricsMasterSource masterSource =
155      UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource();
156    metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount",
157      3, masterSource);
158  }
159
160  private void setupCluster() throws Exception {
161    LOG.info("Setup cluster");
162    UTIL.startMiniCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
163      .numMasters(1).numRegionServers(3).build());
164    // this test has been flaky. When it is rerun by surefire, the underlying minicluster isn't
165    // completely cleaned. specifically, the metrics system isn't reset. The result is an otherwise
166    // successful re-run is failed because there's 8 or 12 SCPcounts instead of the 4 that a
167    // single run of the test would otherwise produce. Thus, explicitly reset the metrics source
168    // each time we setup the cluster.
169    UTIL.getMiniHBaseCluster().getMaster().getMasterMetrics().getMetricsSource().init();
170    LOG.info("Cluster is up");
171    UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
172    LOG.info("Master is up");
173    // wait for all SCPs finished
174    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
175      .noneMatch(p -> p instanceof ServerCrashProcedure));
176    LOG.info("No SCPs");
177  }
178
179  private void setupTable() throws Exception {
180    TableName tableName = TABLES[0];
181    UTIL.createMultiRegionTable(tableName, FAMILY);
182    UTIL.waitTableAvailable(tableName);
183    Table table = UTIL.getConnection().getTable(tableName);
184    for (int i = 0; i < 100; i++) {
185      UTIL.loadTable(table, FAMILY);
186    }
187  }
188
189  public static final class HMasterForTest extends HMaster {
190
191    public HMasterForTest(Configuration conf) throws IOException {
192      super(conf);
193    }
194
195    @Override
196    protected AssignmentManager createAssignmentManager(MasterServices master,
197      MasterRegion masterRegion) {
198      return new AssignmentManagerForTest(master, masterRegion);
199    }
200  }
201
202  private static final class AssignmentManagerForTest extends AssignmentManager {
203
204    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
205      super(master, masterRegion);
206    }
207
208    @Override
209    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
210      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
211      // ServerCrashProcedure will call this method, so wait the CountDownLatch here
212      if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
213        try {
214          LOG.info("ServerCrashProcedure wait the CountDownLatch here");
215          SCP_LATCH.await();
216          LOG.info("Continue the ServerCrashProcedure");
217          SCP_LATCH = null;
218        } catch (InterruptedException e) {
219          throw new RuntimeException(e);
220        }
221      }
222      return regions;
223    }
224  }
225}