/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.procedure;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNameTestRule;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;

/**
 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of
 * Regions that were on the server-to-process rather than consult Master in-memory-state.
 * <p>
 * The test is parameterized over three things: the region replication count (returned from
 * {@link #getRegionReplication()}), the {@link HBCKSCPScheduler} used to queue the procedure, and
 * the {@link RegionSelector} used to pick which region on the victim server is tracked through the
 * test.
 */
@Category({ MasterTests.class, LargeTests.class })
@RunWith(Parameterized.class)
public class TestHBCKSCP extends TestSCPBase {
  private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class);

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestHBCKSCP.class);
  @Rule
  public TableNameTestRule tableNameTestRule = new TableNameTestRule();

  private final int replicas;
  private final HBCKSCPScheduler hbckscpScheduler;
  private final RegionSelector regionSelector;

  /**
   * @param replicas         value reported by {@link #getRegionReplication()}
   * @param hbckscpScheduler strategy used to schedule the HBCK SCP
   * @param regionSelector   strategy used to choose the region that the test tracks
   */
  public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler,
    final RegionSelector regionSelector) {
    this.replicas = replicas;
    this.hbckscpScheduler = hbckscpScheduler;
    this.regionSelector = regionSelector;
  }

  /** Returns one row per test run: {@code { replicas, scheduler, selector }}. */
  @Parameterized.Parameters(name = "replicas:{0} scheduler:{1} selector:{2}")
  public static Object[][] params() {
    return new Object[][] {
      { 1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector() },
      { 3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector() },
      { 1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector() },
      { 3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector() } };
  }

  /**
   * Manufactures an 'Unknown Server' (a server still referenced from hbase:meta that the Master no
   * longer tracks), schedules an HBCK SCP for it, and asserts that afterwards the selected region
   * is OPEN on a different server and hbase:meta no longer mentions the old server.
   */
  @Test
  public void test() throws Exception {
    MiniHBaseCluster cluster = this.util.getHBaseCluster();

    // Assert that all RS_COUNT RegionServers are live. Test depends on there being multiple.
    assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size());

    int count;
    try (Table table = createTable(tableNameTestRule.getTableName())) {
      // Load the table with a bit of data so some logs to split and some edits in each region.
      this.util.loadTable(table, HBaseTestingUtility.COLUMNS[0]);
      count = util.countRows(table);
    }
    assertTrue("expected some rows", count > 0);

    // Make the test easier by not working on server hosting meta...
    // Find another RS. Purge it from Master memory w/o running SCP (if
    // SCP runs, it will clear entries from hbase:meta which frustrates
    // our attempt at manufacturing 'Unknown Servers' condition).
    final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta();
    final ServerName rsServerName = cluster.getRegionServerThreads().stream()
      .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HRegionServer::getServerName)
      .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException(
        "Cannot locate a region server that is not hosting meta."));
    HMaster master = cluster.getMaster();

    // Get a Region that is on the server.
    final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName);
    LOG.debug("{} is holding {} regions.", rsServerName, regions.size());
    final RegionInfo rsRI =
      regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter)
        .findAny().orElseThrow(regionSelector::regionFilterFailure);

    // Assert region is OPEN and that hbase:meta places it on the chosen server.
    ServerName serverName = assertOpenInMetaAndGetServer(master, rsRI);
    assertEquals(rsServerName, serverName);

    // moveFrom adds to dead servers and adds it to processing list only we will
    // not be processing this server 'normally'. Remove it from processing by
    // calling 'finish' and then remove it from dead servers so rsServerName
    // becomes an 'Unknown Server' even though it is still around.
    LOG.info("Killing {}", rsServerName);
    cluster.killRegionServer(rsServerName);

    master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
    master.getServerManager().getDeadServers().finish(rsServerName);
    master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
    master.getAssignmentManager().getRegionStates().removeServer(rsServerName);

    // Wait (up to a minute) for the killed server to report stopped. Nothing else should
    // happen since it is an 'Unknown Server' as far as the Master is concerned; i.e. no SCP.
    HRegionServer hrs = cluster.getRegionServer(rsServerName);
    util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped);
    LOG.info("Dead {}", rsServerName);

    // Now assert still references in hbase:meta to the 'dead' server -- they haven't been
    // cleaned up by an SCP or by anything else.
    assertTrue(searchMeta(master, rsServerName));
    // Assert region is OPEN on dead server still.
    serverName = assertOpenInMetaAndGetServer(master, rsRI);
    assertNotNull(cluster.getRegionServer(serverName));
    assertEquals(rsServerName, serverName);

    // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
    // with no corresponding SCP. Queue one.
    long pid = scheduleHBCKSCP(rsServerName, master);
    assertNotEquals(Procedure.NO_PROC_ID, pid);
    ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid);

    // After SCP, assert region is OPEN on new server. The location is looked up for the
    // region's own replica id so that replica regions are checked too, not just the primary.
    serverName = assertOpenInMetaAndGetServer(master, rsRI);
    assertNotNull(cluster.getRegionServer(serverName));
    assertNotEquals(rsServerName, serverName);
    // Make sure no mention of old server post SCP.
    assertFalse(searchMeta(master, rsServerName));
  }

  /**
   * Reads the hbase:meta row for <code>regionInfo</code>, asserts that the state column for the
   * region's replica id reads OPEN, and returns the server recorded for that same replica id.
   * @param master     master whose connection is used to read hbase:meta
   * @param regionInfo region (primary or replica) to look up
   * @return the server name hbase:meta records for the region's replica id
   * @throws IOException if reading hbase:meta fails
   */
  private ServerName assertOpenInMetaAndGetServer(HMaster master, RegionInfo regionInfo)
    throws IOException {
    final int replicaId = regionInfo.getReplicaId();
    final Result row = MetaTableAccessor.getRegionResult(master.getConnection(), regionInfo);
    final byte[] state =
      row.getValue(HConstants.CATALOG_FAMILY, MetaTableAccessor.getRegionStateColumn(replicaId));
    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(state));
    return MetaTableAccessor.getServerName(row, replicaId);
  }

  /**
   * Schedules the HBCK SCP by delegating to the scheduler this test instance was built with.
   * @return the procedure id reported by the scheduler
   */
  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
    return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master);
  }

  @Override
  protected int getRegionReplication() {
    return replicas;
  }

  /** Returns True if we find reference to <code>sn</code> in meta table. */
  private boolean searchMeta(HMaster master, ServerName sn) throws IOException {
    List<Pair<RegionInfo, ServerName>> locations =
      MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null);
    // Compare from the known non-null side: a row without a server location must not NPE.
    return locations.stream().anyMatch(location -> sn.equals(location.getSecond()));
  }

  /**
   * Encapsulates the choice of which HBCK2 method to call.
   */
  private abstract static class HBCKSCPScheduler {
    abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException;

    /** Simple class name, so the parameterized test name stays readable. */
    @Override
    public String toString() {
      return this.getClass().getSimpleName();
    }
  }

  /**
   * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}.
   */
  private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler {
    @Override
    long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
      MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices()
        .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest
          .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build());
      // Exactly one server was named in the request, so exactly one pid is expected back.
      assertEquals(1, response.getPidCount());
      return response.getPid(0);
    }
  }

  /**
   * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}.
   */
  private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler {
    @Override
    long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
      // The request carries no server name; rsServerName is intentionally unused here.
      MasterProtos.ScheduleSCPsForUnknownServersResponse response =
        master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
          MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
      assertEquals(1, response.getPidCount());
      return response.getPid(0);
    }
  }

  /**
   * Encapsulates how the target region is selected.
   */
  private abstract static class RegionSelector {
    /** Returns true if <code>info</code> is acceptable as the region to track. */
    abstract boolean regionFilter(RegionInfo info);

    /** Returns the exception to throw when no region passes {@link #regionFilter}. */
    abstract Exception regionFilterFailure();

    /** Simple class name, so the parameterized test name stays readable. */
    @Override
    public String toString() {
      return this.getClass().getSimpleName();
    }
  }

  /**
   * Selects a non-meta region that is also a primary region.
   */
  private static class PrimaryNotMetaRegionSelector extends RegionSelector {
    @Override
    boolean regionFilter(final RegionInfo info) {
      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
        && info.getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID;
    }

    @Override
    Exception regionFilterFailure() {
      return new NoSuchElementException("Cannot locate a primary, non-meta region.");
    }
  }

  /**
   * Selects a non-meta region that is also a replica region.
   */
  private static class ReplicaNonMetaRegionSelector extends RegionSelector {
    @Override
    boolean regionFilter(RegionInfo info) {
      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
        && info.getReplicaId() != RegionInfo.DEFAULT_REPLICA_ID;
    }

    @Override
    Exception regionFilterFailure() {
      return new NoSuchElementException("Cannot locate a replica, non-meta region.");
    }
  }
}