001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import static junit.framework.TestCase.assertFalse;
021import static junit.framework.TestCase.assertNotNull;
022import static org.junit.Assert.assertEquals;
023import static org.junit.Assert.assertNotEquals;
024import static org.junit.Assert.assertTrue;
025
026import java.io.IOException;
027import java.util.List;
028import java.util.NoSuchElementException;
029import java.util.Objects;
030import java.util.concurrent.TimeUnit;
031import org.apache.hadoop.hbase.HBaseClassTestRule;
032import org.apache.hadoop.hbase.HBaseTestingUtility;
033import org.apache.hadoop.hbase.HConstants;
034import org.apache.hadoop.hbase.MetaTableAccessor;
035import org.apache.hadoop.hbase.MiniHBaseCluster;
036import org.apache.hadoop.hbase.ServerName;
037import org.apache.hadoop.hbase.TableName;
038import org.apache.hadoop.hbase.TableNameTestRule;
039import org.apache.hadoop.hbase.client.RegionInfo;
040import org.apache.hadoop.hbase.client.Result;
041import org.apache.hadoop.hbase.client.Table;
042import org.apache.hadoop.hbase.master.HMaster;
043import org.apache.hadoop.hbase.master.RegionState;
044import org.apache.hadoop.hbase.procedure2.Procedure;
045import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
046import org.apache.hadoop.hbase.regionserver.HRegionServer;
047import org.apache.hadoop.hbase.testclassification.LargeTests;
048import org.apache.hadoop.hbase.testclassification.MasterTests;
049import org.apache.hadoop.hbase.util.Bytes;
050import org.apache.hadoop.hbase.util.JVMClusterUtil;
051import org.apache.hadoop.hbase.util.Pair;
052import org.junit.ClassRule;
053import org.junit.Rule;
054import org.junit.Test;
055import org.junit.experimental.categories.Category;
056import org.junit.runner.RunWith;
057import org.junit.runners.Parameterized;
058import org.slf4j.Logger;
059import org.slf4j.LoggerFactory;
060
061import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
062
063import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
064import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
065
066/**
067 * Test of the HBCK-version of SCP. The HBCKSCP is an SCP only it reads hbase:meta for list of
068 * Regions that were on the server-to-process rather than consult Master in-memory-state.
069 */
070@Category({ MasterTests.class, LargeTests.class })
071@RunWith(Parameterized.class)
072public class TestHBCKSCP extends TestSCPBase {
073  private static final Logger LOG = LoggerFactory.getLogger(TestHBCKSCP.class);
074
075  @ClassRule
076  public static final HBaseClassTestRule CLASS_RULE =
077    HBaseClassTestRule.forClass(TestHBCKSCP.class);
078  @Rule
079  public TableNameTestRule tableNameTestRule = new TableNameTestRule();
080
081  private final int replicas;
082  private final HBCKSCPScheduler hbckscpScheduler;
083  private final RegionSelector regionSelector;
084
085  public TestHBCKSCP(final int replicas, final HBCKSCPScheduler hbckscpScheduler,
086    final RegionSelector regionSelector) {
087    this.replicas = replicas;
088    this.hbckscpScheduler = hbckscpScheduler;
089    this.regionSelector = regionSelector;
090  }
091
092  @Parameterized.Parameters(name = "replicas:{0} scheduler:{1} selector:{2}")
093  public static Object[][] params() {
094    return new Object[][] {
095      { 1, new ScheduleServerCrashProcedure(), new PrimaryNotMetaRegionSelector() },
096      { 3, new ScheduleServerCrashProcedure(), new ReplicaNonMetaRegionSelector() },
097      { 1, new ScheduleSCPsForUnknownServers(), new PrimaryNotMetaRegionSelector() },
098      { 3, new ScheduleSCPsForUnknownServers(), new ReplicaNonMetaRegionSelector() } };
099  }
100
101  @Test
102  public void test() throws Exception {
103    // we are about to do one for it?
104    MiniHBaseCluster cluster = this.util.getHBaseCluster();
105
106    // Assert that we have three RegionServers. Test depends on there being multiple.
107    assertEquals(RS_COUNT, cluster.getLiveRegionServerThreads().size());
108
109    int count;
110    try (Table table = createTable(tableNameTestRule.getTableName())) {
111      // Load the table with a bit of data so some logs to split and some edits in each region.
112      this.util.loadTable(table, HBaseTestingUtility.COLUMNS[0]);
113      count = util.countRows(table);
114    }
115    assertTrue("expected some rows", count > 0);
116
117    // Make the test easier by not working on server hosting meta...
118    // Find another RS. Purge it from Master memory w/o running SCP (if
119    // SCP runs, it will clear entries from hbase:meta which frustrates
120    // our attempt at manufacturing 'Unknown Servers' condition).
121    final ServerName metaServer = util.getMiniHBaseCluster().getServerHoldingMeta();
122    final ServerName rsServerName = cluster.getRegionServerThreads().stream()
123      .map(JVMClusterUtil.RegionServerThread::getRegionServer).map(HRegionServer::getServerName)
124      .filter(sn -> !sn.equals(metaServer)).findAny().orElseThrow(() -> new NoSuchElementException(
125        "Cannot locate a region server that is not hosting meta."));
126    HMaster master = cluster.getMaster();
127    // Get a Region that is on the server.
128    final List<RegionInfo> regions = master.getAssignmentManager().getRegionsOnServer(rsServerName);
129    LOG.debug("{} is holding {} regions.", rsServerName, regions.size());
130    final RegionInfo rsRI =
131      regions.stream().peek(info -> LOG.debug("{}", info)).filter(regionSelector::regionFilter)
132        .findAny().orElseThrow(regionSelector::regionFilterFailure);
133    final int replicaId = rsRI.getReplicaId();
134    Result r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
135    // Assert region is OPEN.
136    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
137      r.getValue(HConstants.CATALOG_FAMILY, MetaTableAccessor.getRegionStateColumn(replicaId))));
138    ServerName serverName = MetaTableAccessor.getServerName(r, replicaId);
139    assertEquals(rsServerName, serverName);
140    // moveFrom adds to dead servers and adds it to processing list only we will
141    // not be processing this server 'normally'. Remove it from processing by
142    // calling 'finish' and then remove it from dead servers so rsServerName
143    // becomes an 'Unknown Server' even though it is still around.
144    LOG.info("Killing {}", rsServerName);
145    cluster.killRegionServer(rsServerName);
146
147    master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
148    master.getServerManager().getDeadServers().finish(rsServerName);
149    master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
150    master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
151    // Kill the server. Nothing should happen since an 'Unknown Server' as far
152    // as the Master is concerned; i.e. no SCP.
153    HRegionServer hrs = cluster.getRegionServer(rsServerName);
154    util.waitFor(TimeUnit.MINUTES.toMillis(1), hrs::isStopped);
155    LOG.info("Dead {}", rsServerName);
156    // Now assert still references in hbase:meta to the 'dead' server -- they haven't been
157    // cleaned up by an SCP or by anything else.
158    assertTrue(searchMeta(master, rsServerName));
159    // Assert region is OPEN on dead server still.
160    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
161    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
162      r.getValue(HConstants.CATALOG_FAMILY, MetaTableAccessor.getRegionStateColumn(replicaId))));
163    serverName = MetaTableAccessor.getServerName(r, replicaId);
164    assertNotNull(cluster.getRegionServer(serverName));
165    assertEquals(rsServerName, serverName);
166
167    // I now have 'Unknown Server' references in hbase:meta; i.e. Server references
168    // with no corresponding SCP. Queue one.
169    long pid = scheduleHBCKSCP(rsServerName, master);
170    assertNotEquals(Procedure.NO_PROC_ID, pid);
171    ProcedureTestingUtility.waitProcedure(master.getMasterProcedureExecutor(), pid);
172    // After SCP, assert region is OPEN on new server.
173    r = MetaTableAccessor.getRegionResult(master.getConnection(), rsRI);
174    assertEquals(RegionState.State.OPEN.toString(), Bytes.toString(
175      r.getValue(HConstants.CATALOG_FAMILY, MetaTableAccessor.getRegionStateColumn(replicaId))));
176    serverName = MetaTableAccessor.getServerName(r, 0);
177    assertNotNull(cluster.getRegionServer(serverName));
178    assertNotEquals(rsServerName, serverName);
179    // Make sure no mention of old server post SCP.
180    assertFalse(searchMeta(master, rsServerName));
181  }
182
183  protected long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
184    return hbckscpScheduler.scheduleHBCKSCP(rsServerName, master);
185  }
186
187  @Override
188  protected int getRegionReplication() {
189    return replicas;
190  }
191
192  /** Returns True if we find reference to <code>sn</code> in meta table. */
193  private boolean searchMeta(HMaster master, ServerName sn) throws IOException {
194    List<Pair<RegionInfo, ServerName>> ps =
195      MetaTableAccessor.getTableRegionsAndLocations(master.getConnection(), null);
196    for (Pair<RegionInfo, ServerName> p : ps) {
197      if (p.getSecond().equals(sn)) {
198        return true;
199      }
200    }
201    return false;
202  }
203
204  /**
205   * Encapsulates the choice of which HBCK2 method to call.
206   */
207  private abstract static class HBCKSCPScheduler {
208    abstract long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException;
209
210    @Override
211    public String toString() {
212      return this.getClass().getSimpleName();
213    }
214  }
215
216  /**
217   * Invokes {@code MasterRpcServices#scheduleServerCrashProcedure}.
218   */
219  private static class ScheduleServerCrashProcedure extends HBCKSCPScheduler {
220    @Override
221    public long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
222      MasterProtos.ScheduleServerCrashProcedureResponse response = master.getMasterRpcServices()
223        .scheduleServerCrashProcedure(null, MasterProtos.ScheduleServerCrashProcedureRequest
224          .newBuilder().addServerName(ProtobufUtil.toServerName(rsServerName)).build());
225      assertEquals(1, response.getPidCount());
226      return response.getPid(0);
227    }
228  }
229
230  /**
231   * Invokes {@code MasterRpcServices#scheduleSCPsForUnknownServers}.
232   */
233  private static class ScheduleSCPsForUnknownServers extends HBCKSCPScheduler {
234    @Override
235    long scheduleHBCKSCP(ServerName rsServerName, HMaster master) throws ServiceException {
236      MasterProtos.ScheduleSCPsForUnknownServersResponse response =
237        master.getMasterRpcServices().scheduleSCPsForUnknownServers(null,
238          MasterProtos.ScheduleSCPsForUnknownServersRequest.newBuilder().build());
239      assertEquals(1, response.getPidCount());
240      return response.getPid(0);
241    }
242  }
243
244  /**
245   * Encapsulates how the target region is selected.
246   */
247  private static abstract class RegionSelector {
248    abstract boolean regionFilter(RegionInfo info);
249
250    abstract Exception regionFilterFailure();
251
252    @Override
253    public String toString() {
254      return this.getClass().getSimpleName();
255    }
256  }
257
258  /**
259   * Selects a non-meta region that is also a primary region.
260   */
261  private static class PrimaryNotMetaRegionSelector extends RegionSelector {
262    @Override
263    boolean regionFilter(final RegionInfo info) {
264      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
265        && Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
266    }
267
268    @Override
269    Exception regionFilterFailure() {
270      return new NoSuchElementException("Cannot locate a primary, non-meta region.");
271    }
272  }
273
274  /**
275   * Selects a non-meta region that is also a replica region.
276   */
277  private static class ReplicaNonMetaRegionSelector extends RegionSelector {
278    @Override
279    boolean regionFilter(RegionInfo info) {
280      return !Objects.equals(TableName.META_TABLE_NAME, info.getTable())
281        && !Objects.equals(RegionInfo.DEFAULT_REPLICA_ID, info.getReplicaId());
282    }
283
284    @Override
285    Exception regionFilterFailure() {
286      return new NoSuchElementException("Cannot locate a replica, non-meta region.");
287    }
288  }
289}