001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import java.io.IOException; 021import java.util.concurrent.atomic.AtomicBoolean; 022import java.util.concurrent.atomic.AtomicInteger; 023import org.apache.hadoop.conf.Configuration; 024import org.apache.hadoop.fs.FileSystem; 025import org.apache.hadoop.fs.Path; 026import org.apache.hadoop.hbase.HBaseClassTestRule; 027import org.apache.hadoop.hbase.HBaseTestingUtil; 028import org.apache.hadoop.hbase.HConstants; 029import org.apache.hadoop.hbase.RegionTooBusyException; 030import org.apache.hadoop.hbase.ServerName; 031import org.apache.hadoop.hbase.SingleProcessHBaseCluster; 032import org.apache.hadoop.hbase.StartTestingClusterOption; 033import org.apache.hadoop.hbase.TableName; 034import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; 035import org.apache.hadoop.hbase.client.Mutation; 036import org.apache.hadoop.hbase.client.RegionInfo; 037import org.apache.hadoop.hbase.client.TableDescriptor; 038import org.apache.hadoop.hbase.client.TableDescriptorBuilder; 039import org.apache.hadoop.hbase.master.hbck.HbckChore; 040import org.apache.hadoop.hbase.master.hbck.HbckReport; 041import org.apache.hadoop.hbase.master.region.MasterRegionFactory; 042import org.apache.hadoop.hbase.regionserver.HRegion; 043import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; 044import org.apache.hadoop.hbase.regionserver.HRegionServer; 045import org.apache.hadoop.hbase.regionserver.OperationStatus; 046import org.apache.hadoop.hbase.regionserver.RegionServerServices; 047import org.apache.hadoop.hbase.testclassification.LargeTests; 048import org.apache.hadoop.hbase.testclassification.MasterTests; 049import org.apache.hadoop.hbase.util.Bytes; 050import org.apache.hadoop.hbase.wal.WAL; 051import org.junit.AfterClass; 052import org.junit.Assert; 053import org.junit.Before; 054import org.junit.BeforeClass; 055import org.junit.ClassRule; 056import org.junit.Rule; 057import org.junit.Test; 058import org.junit.experimental.categories.Category; 059import org.junit.rules.TestName; 060import org.slf4j.Logger; 061import org.slf4j.LoggerFactory; 062 063import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; 064 065/** 066 * MasterRegion related test that ensures the operations continue even when Procedure state update 067 * encounters retriable IO errors. 068 */ 069@Category({ MasterTests.class, LargeTests.class }) 070public class TestMasterRegionMutation1 { 071 072 private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation1.class); 073 074 @ClassRule 075 public static final HBaseClassTestRule CLASS_RULE = 076 HBaseClassTestRule.forClass(TestMasterRegionMutation1.class); 077 078 @Rule 079 public TestName name = new TestName(); 080 081 protected static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil(); 082 protected static ServerName rs0; 083 084 protected static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false); 085 private static final AtomicInteger ERROR_COUNTER = new AtomicInteger(0); 086 private static final AtomicBoolean FIRST_TIME_ERROR = new AtomicBoolean(true); 087 088 @BeforeClass 089 public static void setUpBeforeClass() throws Exception { 090 TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class); 091 StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder(); 092 // 1 master is expected to be aborted with this test 093 builder.numMasters(2).numRegionServers(3); 094 TEST_UTIL.startMiniCluster(builder.build()); 095 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 096 rs0 = cluster.getRegionServer(0).getServerName(); 097 TEST_UTIL.getAdmin().balancerSwitch(false, true); 098 } 099 100 @AfterClass 101 public static void tearDownAfterClass() throws Exception { 102 TEST_UTIL.shutdownMiniCluster(); 103 } 104 105 @Before 106 public void setUp() throws Exception { 107 final TableName tableName = TableName.valueOf(name.getMethodName()); 108 TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName) 109 .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build(); 110 int startKey = 0; 111 int endKey = 80000; 112 TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey), Bytes.toBytes(endKey), 9); 113 } 114 115 @Test 116 public void testMasterRegionMutations() throws Exception { 117 HbckChore hbckChore = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster()); 118 SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); 119 120 HRegionServer hRegionServer0 = cluster.getRegionServer(0); 121 HRegionServer hRegionServer1 = cluster.getRegionServer(1); 122 HRegionServer hRegionServer2 = cluster.getRegionServer(2); 123 int numRegions0 = hRegionServer0.getNumberOfOnlineRegions(); 124 int numRegions1 = hRegionServer1.getNumberOfOnlineRegions(); 125 int numRegions2 = hRegionServer2.getNumberOfOnlineRegions(); 126 127 hbckChore.choreForTesting(); 128 HbckReport hbckReport = hbckChore.getLastReport(); 129 Assert.assertEquals(0, hbckReport.getInconsistentRegions().size()); 130 Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size()); 131 Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size()); 132 133 // procedure state store update encounters retriable error, master abort is not required 134 ERROR_OUT.set(true); 135 136 // move one region from server 1 to server 0 137 TEST_UTIL.getAdmin() 138 .move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); 139 140 // procedure state store update encounters retriable error, however all retries are exhausted. 141 // This leads to the trigger of active master abort and hence master failover. 142 ERROR_OUT.set(true); 143 144 // move one region from server 2 to server 0 145 TEST_UTIL.getAdmin() 146 .move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0); 147 148 HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); 149 150 // Ensure: 151 // 1. num of regions before and after master abort remain same 152 // 2. all procedures are successfully completed 153 TEST_UTIL.waitFor(5000, 1000, () -> { 154 LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}", numRegions0, numRegions1, 155 numRegions2); 156 LOG.info("Online regions - server0 : {} , server1: {} , server2: {}", 157 cluster.getRegionServer(0).getNumberOfOnlineRegions(), 158 cluster.getRegionServer(1).getNumberOfOnlineRegions(), 159 cluster.getRegionServer(2).getNumberOfOnlineRegions()); 160 LOG.info("Num of successfully completed procedures: {} , num of all procedures: {}", 161 master.getMasterProcedureExecutor().getProcedures().stream() 162 .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState() 163 == ProcedureProtos.ProcedureState.SUCCESS) 164 .count(), 165 master.getMasterProcedureExecutor().getProcedures().size()); 166 return (numRegions0 + numRegions1 + numRegions2) 167 == (cluster.getRegionServer(0).getNumberOfOnlineRegions() 168 + cluster.getRegionServer(1).getNumberOfOnlineRegions() 169 + cluster.getRegionServer(2).getNumberOfOnlineRegions()) 170 && master.getMasterProcedureExecutor().getProcedures().stream() 171 .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState() 172 == ProcedureProtos.ProcedureState.SUCCESS) 173 .count() == master.getMasterProcedureExecutor().getProcedures().size(); 174 }); 175 176 // Ensure we have no inconsistent regions 177 TEST_UTIL.waitFor(5000, 1000, () -> { 178 HbckChore hbck = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster()); 179 hbck.choreForTesting(); 180 HbckReport report = hbck.getLastReport(); 181 return report.getInconsistentRegions().isEmpty() && report.getOrphanRegionsOnFS().isEmpty() 182 && report.getOrphanRegionsOnRS().isEmpty(); 183 }); 184 185 } 186 187 public static class TestRegion extends HRegion { 188 189 public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam, 190 RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) { 191 super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices); 192 } 193 194 public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd, 195 RegionServerServices rsServices) { 196 super(fs, wal, confParam, htd, rsServices); 197 } 198 199 @Override 200 public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup, 201 long nonce) throws IOException { 202 if ( 203 MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName()) 204 && ERROR_OUT.get() 205 ) { 206 // First time errors are recovered with enough retries 207 if (FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 5) { 208 ERROR_OUT.set(false); 209 ERROR_COUNTER.set(0); 210 FIRST_TIME_ERROR.set(false); 211 return super.batchMutate(mutations, atomic, nonceGroup, nonce); 212 } 213 // Second time errors are not recovered with enough retries, leading to master abort 214 if (!FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 8) { 215 ERROR_OUT.set(false); 216 ERROR_COUNTER.set(0); 217 } 218 throw new RegionTooBusyException("test error..."); 219 } 220 return super.batchMutate(mutations, atomic, nonceGroup, nonce); 221 } 222 } 223 224}