001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.List; 022import java.util.concurrent.CompletableFuture; 023import java.util.concurrent.CountDownLatch; 024import java.util.concurrent.Future; 025import org.apache.hadoop.conf.Configuration; 026import org.apache.hadoop.hbase.HBaseClassTestRule; 027import org.apache.hadoop.hbase.HBaseTestingUtil; 028import org.apache.hadoop.hbase.HConstants; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.TableName; 031import org.apache.hadoop.hbase.client.RegionInfo; 032import org.apache.hadoop.hbase.master.HMaster; 033import org.apache.hadoop.hbase.master.MasterServices; 034import org.apache.hadoop.hbase.master.RegionPlan; 035import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher; 036import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; 037import org.apache.hadoop.hbase.master.region.MasterRegion; 038import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 039import org.apache.hadoop.hbase.testclassification.LargeTests; 040import org.apache.hadoop.hbase.testclassification.MasterTests; 041import org.apache.hadoop.hbase.util.Bytes; 042import org.apache.hadoop.hbase.util.FutureUtils; 043import org.apache.zookeeper.KeeperException; 044import org.junit.AfterClass; 045import org.junit.BeforeClass; 046import org.junit.ClassRule; 047import org.junit.Test; 048import org.junit.experimental.categories.Category; 049 050/** 051 * Testcase for HBASE-23594. 052 */ 053@Category({ MasterTests.class, LargeTests.class }) 054public class TestRaceBetweenSCPAndTRSP { 055 056 @ClassRule 057 public static final HBaseClassTestRule CLASS_RULE = 058 HBaseClassTestRule.forClass(TestRaceBetweenSCPAndTRSP.class); 059 060 private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); 061 062 private static TableName NAME = TableName.valueOf("Race"); 063 064 private static byte[] CF = Bytes.toBytes("cf"); 065 066 private static CountDownLatch ARRIVE_REGION_OPENING; 067 068 private static CountDownLatch RESUME_REGION_OPENING; 069 070 private static CountDownLatch ARRIVE_GET_REGIONS_ON_SERVER; 071 072 private static CountDownLatch RESUME_GET_REGIONS_ON_SERVER; 073 074 private static final class AssignmentManagerForTest extends AssignmentManager { 075 076 public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) { 077 super(master, masterRegion); 078 } 079 080 @Override 081 CompletableFuture<Void> regionOpening(RegionStateNode regionNode) { 082 CompletableFuture<Void> future = super.regionOpening(regionNode); 083 try { 084 // wait until the operation done, then trigger later processing, to make the test more 085 // stable 086 FutureUtils.get(future); 087 } catch (IOException e) { 088 } 089 if (regionNode.getRegionInfo().getTable().equals(NAME) && ARRIVE_REGION_OPENING != null) { 090 ARRIVE_REGION_OPENING.countDown(); 091 ARRIVE_REGION_OPENING = null; 092 try { 093 RESUME_REGION_OPENING.await(); 094 } catch (InterruptedException e) { 095 } 096 } 097 return future; 098 } 099 100 @Override 101 public List<RegionInfo> getRegionsOnServer(ServerName serverName) { 102 List<RegionInfo> regions = super.getRegionsOnServer(serverName); 103 if (ARRIVE_GET_REGIONS_ON_SERVER != null) { 104 ARRIVE_GET_REGIONS_ON_SERVER.countDown(); 105 ARRIVE_GET_REGIONS_ON_SERVER = null; 106 try { 107 RESUME_GET_REGIONS_ON_SERVER.await(); 108 } catch (InterruptedException e) { 109 } 110 } 111 return regions; 112 } 113 } 114 115 public static final class HMasterForTest extends HMaster { 116 117 public HMasterForTest(Configuration conf) throws IOException, KeeperException { 118 super(conf); 119 } 120 121 @Override 122 protected AssignmentManager createAssignmentManager(MasterServices master, 123 MasterRegion masterRegion) { 124 return new AssignmentManagerForTest(master, masterRegion); 125 } 126 } 127 128 @BeforeClass 129 public static void setUp() throws Exception { 130 UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); 131 UTIL.startMiniCluster(2); 132 UTIL.createTable(NAME, CF); 133 UTIL.waitTableAvailable(NAME); 134 UTIL.getAdmin().balancerSwitch(false, true); 135 } 136 137 @AfterClass 138 public static void tearDown() throws Exception { 139 UTIL.shutdownMiniCluster(); 140 } 141 142 @Test 143 public void test() throws Exception { 144 RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); 145 AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); 146 ServerName sn = am.getRegionStates().getRegionState(region).getServerName(); 147 148 // Assign the CountDownLatches that get nulled in background threads else we NPE checking 149 // the static. 150 ARRIVE_REGION_OPENING = new CountDownLatch(1); 151 CountDownLatch arriveRegionOpening = ARRIVE_REGION_OPENING; 152 RESUME_REGION_OPENING = new CountDownLatch(1); 153 ARRIVE_GET_REGIONS_ON_SERVER = new CountDownLatch(1); 154 CountDownLatch arriveGetRegionsOnServer = ARRIVE_GET_REGIONS_ON_SERVER; 155 RESUME_GET_REGIONS_ON_SERVER = new CountDownLatch(1); 156 157 Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn)); 158 arriveRegionOpening.await(); 159 160 // Kill the region server and trigger a SCP 161 UTIL.getMiniHBaseCluster().killRegionServer(sn); 162 // Wait until the SCP reaches the getRegionsOnServer call 163 arriveGetRegionsOnServer.await(); 164 RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster() 165 .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher(); 166 // this is necessary for making the UT stable, the problem here is that, in 167 // ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in 168 // another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it 169 // is still possible that the expireServer call has not been finished so the remote dispatcher 170 // still think it can dispatcher the TRSP, in this way we will be in dead lock as the TRSP will 171 // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is 172 // not what we want to test in this UT so we need to wait here to prevent this from happening. 173 // See HBASE-27277 for more detailed analysis. 174 UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn)); 175 176 // Resume the TRSP, it should be able to finish 177 RESUME_REGION_OPENING.countDown(); 178 moveFuture.get(); 179 180 ProcedureExecutor<?> procExec = 181 UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); 182 long scpProcId = 183 procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) 184 .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId(); 185 // Resume the SCP and make sure it can finish too 186 RESUME_GET_REGIONS_ON_SERVER.countDown(); 187 UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId)); 188 } 189}