001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.replication;
019
020import static org.junit.Assert.assertFalse;
021import static org.junit.Assert.assertTrue;
022
023import java.io.IOException;
024import java.util.ArrayList;
025import java.util.List;
026import java.util.stream.Collectors;
027import org.apache.hadoop.fs.Path;
028import org.apache.hadoop.hbase.HBaseClassTestRule;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.master.MasterFileSystem;
031import org.apache.hadoop.hbase.master.ServerManager;
032import org.apache.hadoop.hbase.testclassification.LargeTests;
033import org.apache.hadoop.hbase.testclassification.ReplicationTests;
034import org.apache.hadoop.hbase.util.JVMClusterUtil;
035import org.junit.ClassRule;
036import org.junit.Test;
037import org.junit.experimental.categories.Category;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041@Category({ ReplicationTests.class, LargeTests.class })
042public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {
043
044  private static final Logger LOG = LoggerFactory.getLogger(TestSyncReplicationStandbyKillRS.class);
045
046  private final long SLEEP_TIME = 1000;
047
048  private final int COUNT = 1000;
049
050  @ClassRule
051  public static final HBaseClassTestRule CLASS_RULE =
052    HBaseClassTestRule.forClass(TestSyncReplicationStandbyKillRS.class);
053
054  @Test
055  public void testStandbyKillRegionServer() throws Exception {
056    MasterFileSystem mfs = UTIL2.getHBaseCluster().getMaster().getMasterFileSystem();
057    Path remoteWALDir = getRemoteWALDir(mfs, PEER_ID);
058    assertFalse(mfs.getWALFileSystem().exists(remoteWALDir));
059    UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
060      SyncReplicationState.STANDBY);
061    assertTrue(mfs.getWALFileSystem().exists(remoteWALDir));
062    UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
063      SyncReplicationState.ACTIVE);
064
065    // Disable async replication and write data, then shutdown
066    UTIL1.getAdmin().disableReplicationPeer(PEER_ID);
067    write(UTIL1, 0, COUNT);
068    UTIL1.shutdownMiniCluster();
069
070    JVMClusterUtil.MasterThread activeMaster = UTIL2.getMiniHBaseCluster().getMasterThread();
071    String threadName = "RegionServer-Restarter";
072    Thread t = new Thread(() -> {
073      try {
074        List<JVMClusterUtil.RegionServerThread> regionServers =
075          new ArrayList<>(UTIL2.getMiniHBaseCluster().getLiveRegionServerThreads());
076        LOG.debug("Going to stop {} RSes: [{}]", regionServers.size(),
077          regionServers.stream().map(rst -> rst.getRegionServer().getServerName().getServerName())
078            .collect(Collectors.joining(", ")));
079        for (JVMClusterUtil.RegionServerThread rst : regionServers) {
080          ServerName serverName = rst.getRegionServer().getServerName();
081          LOG.debug("Going to RS stop [{}]", serverName);
082          rst.getRegionServer().stop("Stop RS for test");
083          waitForRSShutdownToStartAndFinish(activeMaster, serverName);
084          LOG.debug("Going to start a new RS");
085          JVMClusterUtil.RegionServerThread restarted =
086            UTIL2.getMiniHBaseCluster().startRegionServer();
087          LOG.debug("Waiting RS [{}] to online", restarted.getRegionServer().getServerName());
088          restarted.waitForServerOnline();
089          LOG.debug("Waiting the old RS {} thread to quit", rst.getName());
090          rst.join();
091          LOG.debug("Done stop RS [{}] and restart [{}]", serverName,
092            restarted.getRegionServer().getServerName());
093        }
094        LOG.debug("All RSes restarted");
095      } catch (Exception e) {
096        LOG.error("Failed to kill RS", e);
097      }
098    }, threadName);
099    t.start();
100
101    LOG.debug("Going to transit peer {} to {} state", PEER_ID,
102      SyncReplicationState.DOWNGRADE_ACTIVE);
103    // Transit standby to DA to replay logs
104    try {
105      UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
106        SyncReplicationState.DOWNGRADE_ACTIVE);
107    } catch (Exception e) {
108      LOG.error("Failed to transit standby cluster to " + SyncReplicationState.DOWNGRADE_ACTIVE, e);
109    }
110
111    LOG.debug("Waiting for the restarter thread {} to quit", threadName);
112    t.join();
113
114    while (
115      UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID)
116          != SyncReplicationState.DOWNGRADE_ACTIVE
117    ) {
118      LOG.debug("Waiting for peer {} to be in {} state", PEER_ID,
119        SyncReplicationState.DOWNGRADE_ACTIVE);
120      Thread.sleep(SLEEP_TIME);
121    }
122    LOG.debug("Going to verify the result, {} records expected", COUNT);
123    verify(UTIL2, 0, COUNT);
124    LOG.debug("Verification successfully done");
125  }
126
127  private void waitForRSShutdownToStartAndFinish(JVMClusterUtil.MasterThread activeMaster,
128    ServerName serverName) throws InterruptedException, IOException {
129    ServerManager sm = activeMaster.getMaster().getServerManager();
130    // First wait for it to be in dead list
131    while (!sm.getDeadServers().isDeadServer(serverName)) {
132      LOG.debug("Waiting for {} to be listed as dead in master", serverName);
133      Thread.sleep(SLEEP_TIME);
134    }
135    LOG.debug("Server {} marked as dead, waiting for it to finish dead processing", serverName);
136    while (sm.areDeadServersInProgress()) {
137      LOG.debug("Server {} still being processed, waiting", serverName);
138      Thread.sleep(SLEEP_TIME);
139    }
140    LOG.debug("Server {} done with server shutdown processing", serverName);
141  }
142}