/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.ClusterMetrics;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.FlakeyTests;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
043import org.slf4j.Logger; 044import org.slf4j.LoggerFactory; 045 046@Category({ FlakeyTests.class, LargeTests.class }) 047public class TestMasterFailover { 048 049 @ClassRule 050 public static final HBaseClassTestRule CLASS_RULE = 051 HBaseClassTestRule.forClass(TestMasterFailover.class); 052 053 private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class); 054 @Rule 055 public TestName name = new TestName(); 056 057 /** 058 * Simple test of master failover. 059 * <p> 060 * Starts with three masters. Kills a backup master. Then kills the active master. Ensures the 061 * final master becomes active and we can still contact the cluster. 062 */ 063 @Test 064 public void testSimpleMasterFailover() throws Exception { 065 final int NUM_MASTERS = 3; 066 final int NUM_RS = 3; 067 068 // Start the cluster 069 HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 070 try { 071 StartTestingClusterOption option = StartTestingClusterOption.builder().numMasters(NUM_MASTERS) 072 .numRegionServers(NUM_RS).numDataNodes(NUM_RS).build(); 073 TEST_UTIL.startMiniCluster(option); 074 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 075 076 // get all the master threads 077 List<MasterThread> masterThreads = cluster.getMasterThreads(); 078 079 // wait for each to come online 080 for (MasterThread mt : masterThreads) { 081 assertTrue(mt.isAlive()); 082 } 083 084 // verify only one is the active master and we have right number 085 int numActive = 0; 086 int activeIndex = -1; 087 ServerName activeName = null; 088 HMaster active = null; 089 for (int i = 0; i < masterThreads.size(); i++) { 090 if (masterThreads.get(i).getMaster().isActiveMaster()) { 091 numActive++; 092 activeIndex = i; 093 active = masterThreads.get(activeIndex).getMaster(); 094 activeName = active.getServerName(); 095 } 096 } 097 assertEquals(1, numActive); 098 assertEquals(NUM_MASTERS, masterThreads.size()); 099 LOG.info("Active master " + activeName); 100 101 // Check that 
ClusterStatus reports the correct active and backup masters 102 assertNotNull(active); 103 ClusterMetrics status = active.getClusterMetrics(); 104 assertEquals(activeName, status.getMasterName()); 105 assertEquals(2, status.getBackupMasterNames().size()); 106 107 // attempt to stop one of the inactive masters 108 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1); 109 HMaster master = cluster.getMaster(backupIndex); 110 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n"); 111 cluster.stopMaster(backupIndex, false); 112 cluster.waitOnMaster(backupIndex); 113 114 // Verify still one active master and it's the same 115 for (int i = 0; i < masterThreads.size(); i++) { 116 if (masterThreads.get(i).getMaster().isActiveMaster()) { 117 assertEquals(activeName, masterThreads.get(i).getMaster().getServerName()); 118 activeIndex = i; 119 active = masterThreads.get(activeIndex).getMaster(); 120 } 121 } 122 assertEquals(1, numActive); 123 assertEquals(2, masterThreads.size()); 124 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterMetrics() 125 .getLiveServerMetrics().size(); 126 LOG.info( 127 "Active master " + active.getServerName() + " managing " + rsCount + " regions servers"); 128 assertEquals(3, rsCount); 129 130 // wait for the active master to acknowledge loss of the backup from ZK 131 final HMaster activeFinal = active; 132 TEST_UTIL.waitFor(TimeUnit.MINUTES.toMillis(5), 133 () -> activeFinal.getBackupMasters().size() == 1); 134 135 // Check that ClusterStatus reports the correct active and backup masters 136 assertNotNull(active); 137 status = active.getClusterMetrics(); 138 assertEquals(activeName, status.getMasterName()); 139 assertEquals(1, status.getBackupMasterNames().size()); 140 141 // kill the active master 142 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n"); 143 cluster.stopMaster(activeIndex, false); 144 cluster.waitOnMaster(activeIndex); 145 146 // wait for an active master to show 
up and be ready 147 assertTrue(cluster.waitForActiveAndReadyMaster()); 148 149 LOG.debug("\n\nVerifying backup master is now active\n"); 150 // should only have one master now 151 assertEquals(1, masterThreads.size()); 152 153 // and he should be active 154 active = masterThreads.get(0).getMaster(); 155 assertNotNull(active); 156 status = active.getClusterMetrics(); 157 ServerName masterName = status.getMasterName(); 158 assertNotNull(masterName); 159 assertEquals(active.getServerName(), masterName); 160 assertTrue(active.isActiveMaster()); 161 assertEquals(0, status.getBackupMasterNames().size()); 162 int rss = status.getLiveServerMetrics().size(); 163 LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss); 164 assertEquals(3, rss); 165 } finally { 166 // Stop the cluster 167 TEST_UTIL.shutdownMiniCluster(); 168 } 169 } 170 171 /** 172 * Test meta in transition when master failover. This test used to manipulate region state up in 173 * zk. That is not allowed any more in hbase2 so I removed that messing. That makes this test 174 * anemic. 175 */ 176 @Test 177 public void testMetaInTransitionWhenMasterFailover() throws Exception { 178 // Start the cluster 179 HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 180 TEST_UTIL.startMiniCluster(); 181 try { 182 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 183 LOG.info("Cluster started"); 184 185 HMaster activeMaster = cluster.getMaster(); 186 ServerName metaServerName = cluster.getServerHoldingMeta(); 187 HRegionServer hrs = cluster.getRegionServer(metaServerName); 188 189 // Now kill master, meta should remain on rs, where we placed it before. 
190 LOG.info("Aborting master"); 191 activeMaster.abort("test-kill"); 192 cluster.waitForMasterToStop(activeMaster.getServerName(), 30000); 193 LOG.info("Master has aborted"); 194 195 // meta should remain where it was 196 RegionState metaState = MetaTableLocator.getMetaRegionState(hrs.getZooKeeper()); 197 assertEquals("hbase:meta should be online on RS", metaState.getServerName(), metaServerName); 198 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 199 200 // Start up a new master 201 LOG.info("Starting up a new master"); 202 activeMaster = cluster.startMaster().getMaster(); 203 LOG.info("Waiting for master to be ready"); 204 cluster.waitForActiveAndReadyMaster(); 205 LOG.info("Master is ready"); 206 207 // ensure meta is still deployed on RS 208 metaState = MetaTableLocator.getMetaRegionState(activeMaster.getZooKeeper()); 209 assertEquals("hbase:meta should be online on RS", metaState.getServerName(), metaServerName); 210 assertEquals("hbase:meta should be online on RS", State.OPEN, metaState.getState()); 211 212 // Done, shutdown the cluster 213 } finally { 214 TEST_UTIL.shutdownMiniCluster(); 215 } 216 } 217}