001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.RegionTooBusyException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
import org.apache.hadoop.hbase.StartTestingClusterOption;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.hbck.HbckChore;
import org.apache.hadoop.hbase.master.hbck.HbckReport;
import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.OperationStatus;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WAL;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
064
065/**
066 * MasterRegion related test that ensures the operations continue even when Procedure state update
067 * encounters retriable IO errors.
068 */
069@Category({ MasterTests.class, LargeTests.class })
070public class TestMasterRegionMutation1 {
071
072  private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionMutation1.class);
073
074  @ClassRule
075  public static final HBaseClassTestRule CLASS_RULE =
076    HBaseClassTestRule.forClass(TestMasterRegionMutation1.class);
077
078  @Rule
079  public TestName name = new TestName();
080
081  protected static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
082  protected static ServerName rs0;
083
084  protected static final AtomicBoolean ERROR_OUT = new AtomicBoolean(false);
085  private static final AtomicInteger ERROR_COUNTER = new AtomicInteger(0);
086  private static final AtomicBoolean FIRST_TIME_ERROR = new AtomicBoolean(true);
087
088  @BeforeClass
089  public static void setUpBeforeClass() throws Exception {
090    TEST_UTIL.getConfiguration().setClass(HConstants.REGION_IMPL, TestRegion.class, HRegion.class);
091    StartTestingClusterOption.Builder builder = StartTestingClusterOption.builder();
092    // 1 master is expected to be aborted with this test
093    builder.numMasters(2).numRegionServers(3);
094    TEST_UTIL.startMiniCluster(builder.build());
095    SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
096    rs0 = cluster.getRegionServer(0).getServerName();
097    TEST_UTIL.getAdmin().balancerSwitch(false, true);
098  }
099
100  @AfterClass
101  public static void tearDownAfterClass() throws Exception {
102    TEST_UTIL.shutdownMiniCluster();
103  }
104
105  @Before
106  public void setUp() throws Exception {
107    final TableName tableName = TableName.valueOf(name.getMethodName());
108    TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
109      .setColumnFamily(ColumnFamilyDescriptorBuilder.of("fam1")).build();
110    int startKey = 0;
111    int endKey = 80000;
112    TEST_UTIL.getAdmin().createTable(tableDesc, Bytes.toBytes(startKey), Bytes.toBytes(endKey), 9);
113  }
114
115  @Test
116  public void testMasterRegionMutations() throws Exception {
117    HbckChore hbckChore = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
118    SingleProcessHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
119
120    HRegionServer hRegionServer0 = cluster.getRegionServer(0);
121    HRegionServer hRegionServer1 = cluster.getRegionServer(1);
122    HRegionServer hRegionServer2 = cluster.getRegionServer(2);
123    int numRegions0 = hRegionServer0.getNumberOfOnlineRegions();
124    int numRegions1 = hRegionServer1.getNumberOfOnlineRegions();
125    int numRegions2 = hRegionServer2.getNumberOfOnlineRegions();
126
127    hbckChore.choreForTesting();
128    HbckReport hbckReport = hbckChore.getLastReport();
129    Assert.assertEquals(0, hbckReport.getInconsistentRegions().size());
130    Assert.assertEquals(0, hbckReport.getOrphanRegionsOnFS().size());
131    Assert.assertEquals(0, hbckReport.getOrphanRegionsOnRS().size());
132
133    // procedure state store update encounters retriable error, master abort is not required
134    ERROR_OUT.set(true);
135
136    // move one region from server 1 to server 0
137    TEST_UTIL.getAdmin()
138      .move(hRegionServer1.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0);
139
140    // procedure state store update encounters retriable error, however all retries are exhausted.
141    // This leads to the trigger of active master abort and hence master failover.
142    ERROR_OUT.set(true);
143
144    // move one region from server 2 to server 0
145    TEST_UTIL.getAdmin()
146      .move(hRegionServer2.getRegions().get(0).getRegionInfo().getEncodedNameAsBytes(), rs0);
147
148    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
149
150    // Ensure:
151    // 1. num of regions before and after master abort remain same
152    // 2. all procedures are successfully completed
153    TEST_UTIL.waitFor(5000, 1000, () -> {
154      LOG.info("numRegions0: {} , numRegions1: {} , numRegions2: {}", numRegions0, numRegions1,
155        numRegions2);
156      LOG.info("Online regions - server0 : {} , server1: {} , server2: {}",
157        cluster.getRegionServer(0).getNumberOfOnlineRegions(),
158        cluster.getRegionServer(1).getNumberOfOnlineRegions(),
159        cluster.getRegionServer(2).getNumberOfOnlineRegions());
160      LOG.info("Num of successfully completed procedures: {} , num of all procedures: {}",
161        master.getMasterProcedureExecutor().getProcedures().stream()
162          .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState()
163              == ProcedureProtos.ProcedureState.SUCCESS)
164          .count(),
165        master.getMasterProcedureExecutor().getProcedures().size());
166      return (numRegions0 + numRegions1 + numRegions2)
167          == (cluster.getRegionServer(0).getNumberOfOnlineRegions()
168            + cluster.getRegionServer(1).getNumberOfOnlineRegions()
169            + cluster.getRegionServer(2).getNumberOfOnlineRegions())
170        && master.getMasterProcedureExecutor().getProcedures().stream()
171          .filter(masterProcedureEnvProcedure -> masterProcedureEnvProcedure.getState()
172              == ProcedureProtos.ProcedureState.SUCCESS)
173          .count() == master.getMasterProcedureExecutor().getProcedures().size();
174    });
175
176    // Ensure we have no inconsistent regions
177    TEST_UTIL.waitFor(5000, 1000, () -> {
178      HbckChore hbck = new HbckChore(TEST_UTIL.getHBaseCluster().getMaster());
179      hbck.choreForTesting();
180      HbckReport report = hbck.getLastReport();
181      return report.getInconsistentRegions().isEmpty() && report.getOrphanRegionsOnFS().isEmpty()
182        && report.getOrphanRegionsOnRS().isEmpty();
183    });
184
185  }
186
187  public static class TestRegion extends HRegion {
188
189    public TestRegion(Path tableDir, WAL wal, FileSystem fs, Configuration confParam,
190      RegionInfo regionInfo, TableDescriptor htd, RegionServerServices rsServices) {
191      super(tableDir, wal, fs, confParam, regionInfo, htd, rsServices);
192    }
193
194    public TestRegion(HRegionFileSystem fs, WAL wal, Configuration confParam, TableDescriptor htd,
195      RegionServerServices rsServices) {
196      super(fs, wal, confParam, htd, rsServices);
197    }
198
199    @Override
200    public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup,
201      long nonce) throws IOException {
202      if (
203        MasterRegionFactory.TABLE_NAME.equals(getTableDescriptor().getTableName())
204          && ERROR_OUT.get()
205      ) {
206        // First time errors are recovered with enough retries
207        if (FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 5) {
208          ERROR_OUT.set(false);
209          ERROR_COUNTER.set(0);
210          FIRST_TIME_ERROR.set(false);
211          return super.batchMutate(mutations, atomic, nonceGroup, nonce);
212        }
213        // Second time errors are not recovered with enough retries, leading to master abort
214        if (!FIRST_TIME_ERROR.get() && ERROR_COUNTER.getAndIncrement() == 8) {
215          ERROR_OUT.set(false);
216          ERROR_COUNTER.set(0);
217        }
218        throw new RegionTooBusyException("test error...");
219      }
220      return super.batchMutate(mutations, atomic, nonceGroup, nonce);
221    }
222  }
223
224}