/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.ProcedureTestUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
 * CPUs.
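 * <p>
 * The test submits a {@link DummyServerProcedure} that holds the exclusive server lock on the
 * source region server, kills that server, and then asks the master to move a region off it. The
 * resulting TransitRegionStateProcedure cannot finish until the ServerCrashProcedure has run, so
 * it should enter the WAITING_TIMEOUT state and back off with an increasing timeout instead of
 * retrying in a tight loop.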
 */
@Category({ MasterTests.class, MediumTests.class })
public class TestCloseRegionWhileRSCrash {

  private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestCloseRegionWhileRSCrash.class);

  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();

  private static TableName TABLE_NAME = TableName.valueOf("Backoff");

  private static byte[] CF = Bytes.toBytes("cf");

  private static CountDownLatch ARRIVE = new CountDownLatch(1);

  private static CountDownLatch RESUME = new CountDownLatch(1);

  public static final class DummyServerProcedure extends Procedure<MasterProcedureEnv>
    implements ServerProcedureInterface {

    private ServerName serverName;

    public DummyServerProcedure() {
    }

    public DummyServerProcedure(ServerName serverName) {
      this.serverName = serverName;
    }

    @Override
    public ServerName getServerName() {
      return serverName;
    }

    @Override
    public boolean hasMetaTableRegion() {
      return false;
    }

    @Override
    public ServerOperationType getServerOperationType() {
      return ServerOperationType.CRASH_HANDLER;
    }

    @Override
    protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
      throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
      ARRIVE.countDown();
      RESUME.await();
      return null;
    }

    @Override
    protected LockState acquireLock(final MasterProcedureEnv env) {
      if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) {
        return LockState.LOCK_EVENT_WAIT;
      }
      return LockState.LOCK_ACQUIRED;
    }

    @Override
    protected void releaseLock(final MasterProcedureEnv env) {
      env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName());
    }

    @Override
    protected boolean holdLock(MasterProcedureEnv env) {
      return true;
    }

    @Override
    protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException {
    }

    @Override
    protected boolean abort(MasterProcedureEnv env) {
      return false;
    }

    @Override
    protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
    }

    @Override
    protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
    }
  }

  @BeforeClass
  public static void setUp() throws Exception {
    UTIL.getConfiguration().setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
    UTIL.startMiniCluster(3);
    UTIL.createTable(TABLE_NAME, CF);
    UTIL.getAdmin().balancerSwitch(false, true);
    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
    if (!srcRs.getRegions(TableName.META_TABLE_NAME).isEmpty()) {
      RegionInfo metaRegion = srcRs.getRegions(TableName.META_TABLE_NAME).get(0).getRegionInfo();
      HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
      UTIL.getAdmin().move(metaRegion.getEncodedNameAsBytes(), dstRs.getServerName());
      UTIL.waitFor(30000, () -> !dstRs.getRegions(TableName.META_TABLE_NAME).isEmpty());
    }
  }

  @AfterClass
  public static void tearDown() throws Exception {
    UTIL.shutdownMiniCluster();
  }

  @Test
  public void testRetryBackoff() throws IOException, InterruptedException {
    HRegionServer srcRs = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
    RegionInfo region = srcRs.getRegions(TABLE_NAME).get(0).getRegionInfo();
    HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs);
    ProcedureExecutor<MasterProcedureEnv> procExec =
      UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
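    // The dummy procedure below holds the exclusive server lock on srcRs, so the
    // ServerCrashProcedure scheduled after the kill cannot run until RESUME is released.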
    procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName()));
    ARRIVE.await();
    UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName());
    UTIL.waitFor(30000,
      () -> procExec.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure));
    Thread t = new Thread(() -> {
      try {
        UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
      } catch (IOException e) {
        LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
      }
    });
    t.start();
    // wait until we enter the WAITING_TIMEOUT state
    ProcedureTestUtil.waitUntilProcedureWaitingTimeout(UTIL, TransitRegionStateProcedure.class,
      30000);
    // wait until the timeout value increases three times
    ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
    // close the connection to make sure that we cannot finish the TRSP
    final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
    master.getConnection().close();
    RESUME.countDown();
    UTIL.waitFor(30000, () -> !master.isAlive());
    // here we start a new master
    HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
    LOG.info("Master2 {}, joining move thread", master2.getServerName());
    t.join();
    // Make sure that the region is online, it may not be on the original target server, as we will
    // set forceNewPlan to true if there is a server crash.
    try (Table table = UTIL.getConnection().getTable(TABLE_NAME)) {
      table.put(new Put(Bytes.toBytes(1)).addColumn(CF, Bytes.toBytes("cq"), Bytes.toBytes(1)));
    }
  }
}