/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CompatibilityFactory;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.StartMiniClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.ServerState;
import org.apache.hadoop.hbase.master.assignment.ServerStateNode;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.test.MetricsAssertHelper;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Restarts a mini cluster while holding the {@link ServerCrashProcedure} (SCP) of one chosen
 * region server on a latch, and checks that no duplicate SCP can be scheduled for that server
 * either while the first SCP is running or after it has finished.
 */
@Category({ MasterTests.class, LargeTests.class })
public class TestClusterRestartFailover extends AbstractTestRestartCluster {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestClusterRestartFailover.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestClusterRestartFailover.class);
  private static final MetricsAssertHelper metricsHelper =
    CompatibilityFactory.getInstance(MetricsAssertHelper.class);

  /**
   * Latch that {@link AssignmentManagerForTest#getRegionsOnServer(ServerName)} blocks on for
   * {@link #SERVER_FOR_TEST}. It is {@code null} whenever no hold-up is requested.
   */
  private static volatile CountDownLatch SCP_LATCH;

  /** The region server whose crash handling is held up by {@link #SCP_LATCH}. */
  private static ServerName SERVER_FOR_TEST;

  @Override
  protected boolean splitWALCoordinatedByZk() {
    return true;
  }

  /**
   * Looks up the master's {@link ServerStateNode} for the given server.
   * @param serverName the server to look up
   * @return whatever the active master's region states hold for {@code serverName}; the test
   *         relies on this being {@code null} once the server's SCP has finished
   */
  private ServerStateNode getServerStateNode(ServerName serverName) {
    return UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
      .getServerNode(serverName);
  }

  /**
   * Test for HBASE-22964
   */
  @Test
  public void test() throws Exception {
    setupCluster();
    setupTable();

    // Find server that does not have hbase:namespace on it. This tests holds up SCPs. If it
    // holds up the server w/ hbase:namespace, the Master initialization will be held up
    // because this table is not online and test fails.
    // Select into a local first so that a value left over in the static field can never be
    // mistaken for a server of the current cluster.
    ServerName candidate = null;
    for (JVMClusterUtil.RegionServerThread rst : UTIL.getHBaseCluster()
      .getLiveRegionServerThreads()) {
      HRegionServer rs = rst.getRegionServer();
      if (rs.getRegions(TableName.NAMESPACE_TABLE_NAME).isEmpty()) {
        candidate = rs.getServerName();
      }
    }
    assertNotNull("Should find a region server that does not host hbase:namespace", candidate);
    SERVER_FOR_TEST = candidate;

    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    ServerStateNode serverNode = getServerStateNode(SERVER_FOR_TEST);
    assertNotNull(serverNode);
    assertTrue("serverNode should be ONLINE when cluster runs normally",
      serverNode.isInState(ServerState.ONLINE));

    // From here on, getRegionsOnServer(SERVER_FOR_TEST) blocks until the latch is released.
    SCP_LATCH = new CountDownLatch(1);

    // Shutdown cluster and restart
    // Remember the ports so the restarted region servers reuse them.
    List<Integer> ports =
      UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList().stream()
        .map(ServerName::getPort).collect(Collectors.toList());
    LOG.info("Shutting down cluster");
    UTIL.getHBaseCluster().killAll();
    UTIL.getHBaseCluster().waitUntilShutDown();
    LOG.info("Restarting cluster");
    UTIL.restartHBaseCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
      .numMasters(1).numRegionServers(3).rsPorts(ports).build());
    LOG.info("Started cluster");
    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
    LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
    UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
    UTIL.waitFor(30000, new ExplainingPredicate<Exception>() {

      @Override
      public boolean evaluate() throws Exception {
        return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE);
      }

      @Override
      public String explainFailure() throws Exception {
        return "serverNode should not be ONLINE during SCP processing";
      }
    });
    Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .filter(p -> (p instanceof ServerCrashProcedure)
        && ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST))
      .findAny();
    assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
    assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
      Procedure.NO_PROC_ID,
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));

    // Wait the SCP to finish
    LOG.info("Waiting on latch");
    SCP_LATCH.countDown();
    UTIL.waitFor(60000, () -> procedure.get().isFinished());
    assertNull("serverNode should be deleted after SCP finished",
      getServerStateNode(SERVER_FOR_TEST));

    assertEquals(
      "Even when the SCP is finished, the duplicate SCP should not be scheduled for "
        + SERVER_FOR_TEST,
      Procedure.NO_PROC_ID,
      UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));

    MetricsMasterSource masterSource =
      UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource();
    metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount",
      3, masterSource);
  }

  /**
   * Starts a mini cluster with one {@link HMasterForTest} and three region servers, resets the
   * master metrics source, and waits until the master is initialized and no
   * {@link ServerCrashProcedure} is listed by the master.
   */
  private void setupCluster() throws Exception {
    LOG.info("Setup cluster");
    UTIL.startMiniCluster(StartMiniClusterOption.builder().masterClass(HMasterForTest.class)
      .numMasters(1).numRegionServers(3).build());
    // this test has been flaky. When it is rerun by surefire, the underlying minicluster isn't
    // completely cleaned. specifically, the metrics system isn't reset. The result is an otherwise
    // successful re-run is failed because there's 8 or 12 SCPcounts instead of the 4 that a
    // single run of the test would otherwise produce. Thus, explicitly reset the metrics source
    // each time we setup the cluster.
    UTIL.getMiniHBaseCluster().getMaster().getMasterMetrics().getMetricsSource().init();
    LOG.info("Cluster is up");
    UTIL.waitFor(60000, () -> UTIL.getMiniHBaseCluster().getMaster().isInitialized());
    LOG.info("Master is up");
    // wait for all SCPs finished
    UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().getProcedures().stream()
      .noneMatch(p -> p instanceof ServerCrashProcedure));
    LOG.info("No SCPs");
  }

  /**
   * Creates the first test table as a multi-region table, waits for it to become available and
   * loads it 100 times. The {@link Table} handle is closed when loading is done.
   */
  private void setupTable() throws Exception {
    TableName tableName = TABLES[0];
    UTIL.createMultiRegionTable(tableName, FAMILY);
    UTIL.waitTableAvailable(tableName);
    try (Table table = UTIL.getConnection().getTable(tableName)) {
      for (int i = 0; i < 100; i++) {
        UTIL.loadTable(table, FAMILY);
      }
    }
  }

  /** Master that installs {@link AssignmentManagerForTest} as its assignment manager. */
  public static final class HMasterForTest extends HMaster {

    public HMasterForTest(Configuration conf) throws IOException {
      super(conf);
    }

    @Override
    protected AssignmentManager createAssignmentManager(MasterServices master,
      MasterRegion masterRegion) {
      return new AssignmentManagerForTest(master, masterRegion);
    }
  }

  /**
   * Assignment manager that blocks {@link #getRegionsOnServer(ServerName)} for
   * {@link #SERVER_FOR_TEST} until {@link #SCP_LATCH} is released.
   */
  private static final class AssignmentManagerForTest extends AssignmentManager {

    public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) {
      super(master, masterRegion);
    }

    /**
     * Delegates to the parent implementation, then blocks on the test latch when the lookup is
     * for {@link #SERVER_FOR_TEST} and a latch is installed.
     * @param serverName the server whose regions are requested
     * @return the regions reported by the parent implementation
     * @throws RuntimeException wrapping the {@link InterruptedException} if the wait is
     *                          interrupted; the thread's interrupt flag is restored first
     */
    @Override
    public List<RegionInfo> getRegionsOnServer(ServerName serverName) {
      List<RegionInfo> regions = super.getRegionsOnServer(serverName);
      // ServerCrashProcedure will call this method, so wait the CountDownLatch here
      // Read the volatile field once: another caller that has finished waiting sets it to null,
      // which would otherwise allow an NPE between the null check and await().
      CountDownLatch latch = SCP_LATCH;
      if (latch != null && SERVER_FOR_TEST != null && serverName.equals(SERVER_FOR_TEST)) {
        try {
          LOG.info("ServerCrashProcedure wait the CountDownLatch here");
          latch.await();
          LOG.info("Continue the ServerCrashProcedure");
          SCP_LATCH = null;
        } catch (InterruptedException e) {
          // Preserve the interrupt status for code further up the stack.
          Thread.currentThread().interrupt();
          throw new RuntimeException(e);
        }
      }
      return regions;
    }
  }
}