001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.concurrent.CountDownLatch; 022import java.util.concurrent.TimeUnit; 023import org.apache.hadoop.conf.Configuration; 024import org.apache.hadoop.hbase.HBaseClassTestRule; 025import org.apache.hadoop.hbase.HBaseTestingUtil; 026import org.apache.hadoop.hbase.HConstants; 027import org.apache.hadoop.hbase.PleaseHoldException; 028import org.apache.hadoop.hbase.StartTestingClusterOption; 029import org.apache.hadoop.hbase.TableName; 030import org.apache.hadoop.hbase.client.RegionInfo; 031import org.apache.hadoop.hbase.master.HMaster; 032import org.apache.hadoop.hbase.master.MasterServices; 033import org.apache.hadoop.hbase.master.RegionPlan; 034import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; 035import org.apache.hadoop.hbase.master.region.MasterRegion; 036import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 037import org.apache.hadoop.hbase.regionserver.HRegionServer; 038import org.apache.hadoop.hbase.testclassification.MasterTests; 039import org.apache.hadoop.hbase.testclassification.MediumTests; 040import org.apache.hadoop.hbase.util.Bytes; 041import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; 042import org.apache.zookeeper.KeeperException; 043import org.junit.AfterClass; 044import org.junit.BeforeClass; 045import org.junit.ClassRule; 046import org.junit.Test; 047import org.junit.experimental.categories.Category; 048import org.slf4j.Logger; 049import org.slf4j.LoggerFactory; 050 051import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 052import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; 053import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; 054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; 055import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; 056 057/** 058 * See HBASE-22060 and HBASE-22074 for more details. 059 */ 060@Category({ MasterTests.class, MediumTests.class }) 061public class TestOpenRegionProcedureHang { 062 063 @ClassRule 064 public static final HBaseClassTestRule CLASS_RULE = 065 HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class); 066 067 private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class); 068 069 private static CountDownLatch ARRIVE; 070 private static CountDownLatch RESUME; 071 072 private static CountDownLatch FINISH; 073 074 private static CountDownLatch ABORT; 075 076 private static final class AssignmentManagerForTest extends AssignmentManager { 077 078 public AssignmentManagerForTest(MasterServices master, MasterRegion masterRegion) { 079 super(master, masterRegion); 080 } 081 082 @Override 083 public ReportRegionStateTransitionResponse reportRegionStateTransition( 084 ReportRegionStateTransitionRequest req) throws PleaseHoldException { 085 RegionStateTransition transition = req.getTransition(0); 086 if ( 087 transition.getTransitionCode() == TransitionCode.OPENED 088 && ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME) 089 && ARRIVE != null 090 ) { 091 ARRIVE.countDown(); 092 try { 093 RESUME.await(); 094 RESUME = null; 095 } catch (InterruptedException e) { 096 throw new RuntimeException(e); 097 } 098 try { 099 return super.reportRegionStateTransition(req); 100 } finally { 101 FINISH.countDown(); 102 } 103 } else { 104 return super.reportRegionStateTransition(req); 105 } 106 } 107 } 108 109 public static final class HMasterForTest extends HMaster { 110 111 public HMasterForTest(Configuration conf) throws IOException { 112 super(conf); 113 } 114 115 @Override 116 protected AssignmentManager createAssignmentManager(MasterServices master, 117 MasterRegion masterRegion) { 118 return new AssignmentManagerForTest(master, masterRegion); 119 } 120 121 @Override 122 public void abort(String reason, Throwable cause) { 123 // hang here so we can finish the reportRegionStateTransition call, which is the most 124 // important part to reproduce the bug 125 if (ABORT != null) { 126 try { 127 ABORT.await(); 128 ABORT = null; 129 } catch (InterruptedException e) { 130 throw new RuntimeException(e); 131 } 132 } 133 super.abort(reason, cause); 134 } 135 } 136 137 private static final HBaseTestingUtil UTIL = new HBaseTestingUtil(); 138 139 private static TableName NAME = TableName.valueOf("Open"); 140 141 private static byte[] CF = Bytes.toBytes("cf"); 142 143 @BeforeClass 144 public static void setUp() throws Exception { 145 Configuration conf = UTIL.getConfiguration(); 146 conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); 147 148 // make sure we do not timeout when caling reportRegionStateTransition 149 conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000); 150 conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000); 151 UTIL.startMiniCluster( 152 StartTestingClusterOption.builder().numMasters(2).numRegionServers(3).build()); 153 UTIL.createTable(NAME, CF); 154 UTIL.waitTableAvailable(NAME); 155 UTIL.getAdmin().balancerSwitch(false, true); 156 } 157 158 @AfterClass 159 public static void tearDown() throws Exception { 160 UTIL.shutdownMiniCluster(); 161 } 162 163 @Test 164 public void test() throws InterruptedException, KeeperException, IOException { 165 RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); 166 AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); 167 168 HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME); 169 HRegionServer rs2 = UTIL.getOtherRegionServer(rs1); 170 171 ARRIVE = new CountDownLatch(1); 172 RESUME = new CountDownLatch(1); 173 FINISH = new CountDownLatch(1); 174 ABORT = new CountDownLatch(1); 175 am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName())); 176 177 ARRIVE.await(); 178 ARRIVE = null; 179 HMaster master = UTIL.getMiniHBaseCluster().getMaster(); 180 master.getZooKeeper().close(); 181 UTIL.waitFor(30000, () -> { 182 for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) { 183 if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) { 184 return mt.getMaster().isInitialized(); 185 } 186 } 187 return false; 188 }); 189 ProcedureExecutor<MasterProcedureEnv> procExec = 190 UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); 191 UTIL.waitFor(30000, 192 () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure) 193 .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME))); 194 OpenRegionProcedure proc = procExec.getProcedures().stream() 195 .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p) 196 .filter(p -> p.region.getTable().equals(NAME)).findFirst().get(); 197 // wait a bit to let the OpenRegionProcedure send out the request 198 Thread.sleep(2000); 199 RESUME.countDown(); 200 if (!FINISH.await(15, TimeUnit.SECONDS)) { 201 LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if" 202 + " we update the procedure store, as the WALProcedureStore" 203 + " will retry forever to roll the writer if it is not closed"); 204 } 205 FINISH = null; 206 // if the reportRegionTransition is finished, wait a bit to let it return the data to RS 207 Thread.sleep(2000); 208 ABORT.countDown(); 209 210 UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId())); 211 UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId())); 212 } 213}