001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.List; 022import java.util.concurrent.CountDownLatch; 023import java.util.concurrent.Future; 024import org.apache.hadoop.conf.Configuration; 025import org.apache.hadoop.hbase.HBaseClassTestRule; 026import org.apache.hadoop.hbase.HBaseTestingUtility; 027import org.apache.hadoop.hbase.HConstants; 028import org.apache.hadoop.hbase.ServerName; 029import org.apache.hadoop.hbase.TableName; 030import org.apache.hadoop.hbase.client.RegionInfo; 031import org.apache.hadoop.hbase.master.HMaster; 032import org.apache.hadoop.hbase.master.MasterServices; 033import org.apache.hadoop.hbase.master.RegionPlan; 034import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher; 035import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; 036import org.apache.hadoop.hbase.master.region.MasterRegion; 037import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 038import org.apache.hadoop.hbase.testclassification.LargeTests; 039import org.apache.hadoop.hbase.testclassification.MasterTests; 040import org.apache.hadoop.hbase.util.Bytes; 041import org.apache.zookeeper.KeeperException; 042import org.junit.AfterClass; 043import org.junit.BeforeClass; 044import org.junit.ClassRule; 045import org.junit.Test; 046import org.junit.experimental.categories.Category; 047 048/** 049 * Testcase for HBASE-23594. 050 */ 051@Category({ MasterTests.class, LargeTests.class }) 052public class TestRaceBetweenSCPAndTRSP { 053 054 @ClassRule 055 public static final HBaseClassTestRule CLASS_RULE = 056 HBaseClassTestRule.forClass(TestRaceBetweenSCPAndTRSP.class); 057 058 private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); 059 060 private static TableName NAME = TableName.valueOf("Race"); 061 062 private static byte[] CF = Bytes.toBytes("cf"); 063 064 private static CountDownLatch ARRIVE_REGION_OPENING; 065 066 private static CountDownLatch RESUME_REGION_OPENING; 067 068 private static CountDownLatch ARRIVE_GET_REGIONS_ON_SERVER; 069 070 private static CountDownLatch RESUME_GET_REGIONS_ON_SERVER; 071 072 private static final class AssignmentManagerForTest extends AssignmentManager { 073 074 public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) { 075 super(master, masterRegion); 076 } 077 078 @Override 079 void regionOpening(RegionStateNode regionNode) throws IOException { 080 super.regionOpening(regionNode); 081 if (regionNode.getRegionInfo().getTable().equals(NAME) && ARRIVE_REGION_OPENING != null) { 082 ARRIVE_REGION_OPENING.countDown(); 083 ARRIVE_REGION_OPENING = null; 084 try { 085 RESUME_REGION_OPENING.await(); 086 } catch (InterruptedException e) { 087 } 088 } 089 } 090 091 @Override 092 public List<RegionInfo> getRegionsOnServer(ServerName serverName) { 093 List<RegionInfo> regions = super.getRegionsOnServer(serverName); 094 if (ARRIVE_GET_REGIONS_ON_SERVER != null) { 095 ARRIVE_GET_REGIONS_ON_SERVER.countDown(); 096 ARRIVE_GET_REGIONS_ON_SERVER = null; 097 try { 098 RESUME_GET_REGIONS_ON_SERVER.await(); 099 } catch (InterruptedException e) { 100 } 101 } 102 return regions; 103 } 104 } 105 106 public static final class HMasterForTest extends HMaster { 107 108 public HMasterForTest(Configuration conf) throws IOException, KeeperException { 109 super(conf); 110 } 111 112 @Override 113 protected AssignmentManager createAssignmentManager(MasterServices master, 114 MasterRegion masterRegion) { 115 return new AssignmentManagerForTest(master, masterRegion); 116 } 117 } 118 119 @BeforeClass 120 public static void setUp() throws Exception { 121 UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); 122 UTIL.startMiniCluster(2); 123 UTIL.createTable(NAME, CF); 124 UTIL.waitTableAvailable(NAME); 125 UTIL.getAdmin().balancerSwitch(false, true); 126 } 127 128 @AfterClass 129 public static void tearDown() throws Exception { 130 UTIL.shutdownMiniCluster(); 131 } 132 133 @Test 134 public void test() throws Exception { 135 RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); 136 AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); 137 ServerName sn = am.getRegionStates().getRegionState(region).getServerName(); 138 139 // Assign the CountDownLatches that get nulled in background threads else we NPE checking 140 // the static. 141 ARRIVE_REGION_OPENING = new CountDownLatch(1); 142 CountDownLatch arriveRegionOpening = ARRIVE_REGION_OPENING; 143 RESUME_REGION_OPENING = new CountDownLatch(1); 144 ARRIVE_GET_REGIONS_ON_SERVER = new CountDownLatch(1); 145 CountDownLatch arriveGetRegionsOnServer = ARRIVE_GET_REGIONS_ON_SERVER; 146 RESUME_GET_REGIONS_ON_SERVER = new CountDownLatch(1); 147 148 Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn)); 149 arriveRegionOpening.await(); 150 151 // Kill the region server and trigger a SCP 152 UTIL.getMiniHBaseCluster().killRegionServer(sn); 153 // Wait until the SCP reaches the getRegionsOnServer call 154 arriveGetRegionsOnServer.await(); 155 RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster() 156 .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher(); 157 // this is necessary for making the UT stable, the problem here is that, in 158 // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in 159 // another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it 160 // is still possible that the expireServer call has not been finished so the remote dispatcher 161 // still think it can dispatcher the TRSP, in this way we will be in dead lock as the TRSP will 162 // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is 163 // not what we want to test in this UT so we need to wait here to prevent this from happening. 164 // See HBASE-27277 for more detailed analysis. 165 UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn)); 166 167 // Resume the TRSP, it should be able to finish 168 RESUME_REGION_OPENING.countDown(); 169 moveFuture.get(); 170 171 ProcedureExecutor<?> procExec = 172 UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); 173 long scpProcId = 174 procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) 175 .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId(); 176 // Resume the SCP and make sure it can finish too 177 RESUME_GET_REGIONS_ON_SERVER.countDown(); 178 UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId)); 179 } 180}