/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CompatibilityFactory;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.ServerState;
import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.test.MetricsAssertHelper;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Category({ MasterTests.class, LargeTests.class })
public class TestClusterRestartFailover extends AbstractTestRestartCluster {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestClusterRestartFailover.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
  private static final MetricsAssertHelper metricsHelper =
    CompatibilityFactory.getInstance(MetricsAssertHelper.class);

  private static volatile CountDownLatch SCP_LATCH;
  private static ServerName SERVER_FOR_TEST;

  @Override
  protected boolean splitWALCoordinatedByZk() {
    return true;
  }

  private ServerStateNode getServerStateNode(ServerName serverName) {
    return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
      .getServerNode(serverName);
  }

  /**
   * Test for HBASE-22964
   */
  @Test
  public void test() throws Exception {
    setupCluster();
    setupTable();

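    // Pick a live region server to track; the test follows its ServerStateNode through the
    // cluster restart and the ServerCrashProcedure (SCP) that handles its crash.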
    SERVER_FOR_TEST = UTIL.getHBaseCluster().getRegionServer(0).getServerName();
    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
    assertNotNull(serverNode);
    assertTrue("serverNode should be ONLINE when cluster runs normally",
      serverNode.isInState(ServerState.ONLINE));

    SCP_LATCH = new CountDownLatch(1);

    // Shut down the cluster and restart it on the same ports
    List<Integer> ports =
      UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
        .map(serverName -> serverName.getPort()).collect(Collectors.toList());
    LOG.info("Shutting down cluster");
    UTIL.getHBaseCluster().killAll();
    UTIL.getHBaseCluster().waitUntilShutDown();
    LOG.info("Restarting cluster");
    UTIL.restartHBaseCluster(StartTestingClusterOption.builder().masterClass(HMasterForTest.class)
      .numMasters(1).numRegionServers(3).rsPorts(ports).build());
    LOG.info("Started cluster");
    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
    LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    UTIL.waitFor(30000, new ExplainingPredicate<Exception>() {

      @Override
      public boolean evaluate() throws Exception {
        return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE);
      }

      @Override
      public String explainFailure() throws Exception {
        return "serverNode should not be ONLINE during SCP processing";
      }
    });
    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .filter(p -> (p instanceof ServerCrashProcedure)
        && ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST))
      .findAny();
    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
    assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
      Procedure.NO_PROC_ID,
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));

    // Wait for the SCP to finish
    LOG.info("Waiting on latch");
    SCP_LATCH.countDown();
    UTIL.waitFor(60000, () -> procedure.get().isFinished());
    assertNull("serverNode should be deleted after SCP finished",
      getServerStateNode(SERVER_FOR_TEST));

    assertEquals(
      "Even when the SCP is finished, the duplicate SCP should not be scheduled for "
        + SERVER_FOR_TEST,
      Procedure.NO_PROC_ID,
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));

    MetricsMasterSource masterSource =
      UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource();
    metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount",
      3, masterSource);
  }

  private void setupCluster() throws Exception {
    LOG.info("Setup cluster");
    UTIL.startMiniCluster(StartTestingClusterOption.builder().masterClass(HMasterForTest.class)
      .numMasters(1).numRegionServers(3).build());
    // This test has been flaky. When it is rerun by surefire, the underlying minicluster isn't
    // completely cleaned up. Specifically, the metrics system isn't reset. The result is that an
    // otherwise successful re-run fails because there are 8 or 12 SCP counts instead of the 4
    // that a single run of the test would otherwise produce.
    // Thus, explicitly reset the metrics source each time we set up the cluster.
    UTIL.getMiniHBaseCluster().getMaster().getMasterMetrics().getMetricsSource().init();
    LOG.info("Cluster is up");
    UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
    LOG.info("Master is up");
    // wait for all SCPs to finish
    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .noneMatch(p -> p instanceof ServerCrashProcedure));
    LOG.info("No SCPs");
  }

  private void setupTable() throws Exception {
    TableName tableName = TABLES[0];
    UTIL.createMultiRegionTable(tableName, FAMILY);
    UTIL.waitTableAvailable(tableName);
    Table table = UTIL.getConnection().getTable(tableName);
    for (int i = 0; i < 100; i++) {
      UTIL.loadTable(table, FAMILY);
    }
  }

  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master,
      MasterRegion masterRegion) {
      return new AssignmentManagerForTest(master, masterRegion);
    }
  }

  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
      super(master, masterRegion);
    }

    @Override
    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
      // ServerCrashProcedure will call this method, so wait on the CountDownLatch here
      if (SCP_LATCH != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
        try {
          LOG.info("ServerCrashProcedure wait the CountDownLatch here");
          SCP_LATCH.await();
          LOG.info("Continue the ServerCrashProcedure");
          SCP_LATCH = null;
        } catch (InterruptedException e) {
          throw new RuntimeException(e);
        }
      }
      return regions;
    }
  }
}