/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Transits the peer on the second cluster ({@code UTIL2}) from STANDBY to DOWNGRADE_ACTIVE while a
 * background thread stops and restarts every region server of that cluster, then verifies that
 * the records written to the first cluster ({@code UTIL1}) are readable from the second one.
 */
@Category({ ReplicationTests.class, LargeTests.class })
public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {

  private static final Logger LOG =
    LoggerFactory.getLogger(TestSyncReplicationStandbyKillRS.class);

  /** Pause, in milliseconds, between two polls in the wait loops of this test. */
  private static final long SLEEP_TIME = 1000;

  /** Number of records written to the active cluster and expected on the standby cluster. */
  private static final int COUNT = 1000;

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestSyncReplicationStandbyKillRS.class);

  /**
   * Writes {@link #COUNT} records to {@code UTIL1} with the replication peer disabled, shuts
   * {@code UTIL1} down, and then transits {@code UTIL2} to DOWNGRADE_ACTIVE while its region
   * servers are being restarted one by one. Finally checks that all records can be read from
   * {@code UTIL2}.
   */
  @Test
  public void testStandbyKillRegionServer() throws Exception {
    MasterFileSystem mfs = UTIL2.getHBaseCluster().getMaster().getMasterFileSystem();
    Path remoteWALDir = getRemoteWALDir(mfs, PEER_ID);
    // The remote WAL directory is expected to show up only after the transition to STANDBY.
    assertFalse(mfs.getWALFileSystem().exists(remoteWALDir));
    UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
      SyncReplicationState.STANDBY);
    assertTrue(mfs.getWALFileSystem().exists(remoteWALDir));
    UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
      SyncReplicationState.ACTIVE);

    // Disable async replication and write data, then shutdown
    UTIL1.getAdmin().disableReplicationPeer(PEER_ID);
    write(UTIL1, 0, COUNT);
    UTIL1.shutdownMiniCluster();

    JVMClusterUtil.MasterThread activeMaster = UTIL2.getMiniHBaseCluster().getMasterThread();
    String threadName = "RegionServer-Restarter";
    Thread t = new Thread(() -> {
      try {
        restartAllRegionServers(activeMaster);
      } catch (InterruptedException e) {
        // Keep the interrupt status visible instead of silently consuming it.
        Thread.currentThread().interrupt();
        LOG.error("Failed to kill RS", e);
      } catch (Exception e) {
        LOG.error("Failed to kill RS", e);
      }
    }, threadName);
    t.start();

    LOG.debug("Going to transit peer {} to {} state", PEER_ID,
      SyncReplicationState.DOWNGRADE_ACTIVE);
    // Transit standby to DA to replay logs
    try {
      UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
        SyncReplicationState.DOWNGRADE_ACTIVE);
    } catch (Exception e) {
      // Deliberately not rethrown: region servers are being stopped concurrently, and the peer
      // state is polled below until it reaches DOWNGRADE_ACTIVE.
      LOG.error("Failed to transit standby cluster to {}", SyncReplicationState.DOWNGRADE_ACTIVE,
        e);
    }

    LOG.debug("Waiting for the restarter thread {} to quit", threadName);
    t.join();

    waitForPeerState(SyncReplicationState.DOWNGRADE_ACTIVE);
    LOG.debug("Going to verify the result, {} records expected", COUNT);
    verify(UTIL2, 0, COUNT);
    LOG.debug("Verification successfully done");
  }

  /**
   * Stops every region server that is live in {@code UTIL2} when this method is entered, one at
   * a time, starting a replacement for each before moving on to the next.
   * @param activeMaster master thread whose server manager is consulted for dead server state
   */
  private void restartAllRegionServers(JVMClusterUtil.MasterThread activeMaster)
    throws InterruptedException, IOException {
    // Snapshot the list so that servers started below are not stopped again.
    List<JVMClusterUtil.RegionServerThread> regionServers =
      new ArrayList<>(UTIL2.getMiniHBaseCluster().getLiveRegionServerThreads());
    LOG.debug("Going to stop {} RSes: [{}]", regionServers.size(),
      regionServers.stream().map(rst -> rst.getRegionServer().getServerName().getServerName())
        .collect(Collectors.joining(", ")));
    for (JVMClusterUtil.RegionServerThread rst : regionServers) {
      restartRegionServer(activeMaster, rst);
    }
    LOG.debug("All RSes restarted");
  }

  /**
   * Stops one region server, waits until the master is done processing it as dead, starts a new
   * region server, waits for that one to be online and finally joins the old server thread.
   * @param activeMaster master thread whose server manager is consulted for dead server state
   * @param rst          thread of the region server to stop
   */
  private void restartRegionServer(JVMClusterUtil.MasterThread activeMaster,
    JVMClusterUtil.RegionServerThread rst) throws InterruptedException, IOException {
    ServerName serverName = rst.getRegionServer().getServerName();
    LOG.debug("Going to RS stop [{}]", serverName);
    rst.getRegionServer().stop("Stop RS for test");
    waitForRSShutdownToStartAndFinish(activeMaster, serverName);
    LOG.debug("Going to start a new RS");
    JVMClusterUtil.RegionServerThread restarted = UTIL2.getMiniHBaseCluster().startRegionServer();
    LOG.debug("Waiting RS [{}] to online", restarted.getRegionServer().getServerName());
    restarted.waitForServerOnline();
    LOG.debug("Waiting the old RS {} thread to quit", rst.getName());
    rst.join();
    LOG.debug("Done stop RS [{}] and restart [{}]", serverName,
      restarted.getRegionServer().getServerName());
  }

  /**
   * Polls the sync replication state of the peer on {@code UTIL2} every {@link #SLEEP_TIME}
   * milliseconds until it equals the expected state. There is no timeout in this method.
   * @param expected state to wait for
   */
  private void waitForPeerState(SyncReplicationState expected)
    throws InterruptedException, IOException {
    while (UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID) != expected) {
      LOG.debug("Waiting for peer {} to be in {} state", PEER_ID, expected);
      Thread.sleep(SLEEP_TIME);
    }
  }

  /**
   * Blocks until the master lists the given server as dead and then until the server manager
   * reports that no dead servers are in progress any more.
   * @param activeMaster master thread whose server manager is polled
   * @param serverName   name of the region server that has been stopped
   */
  private void waitForRSShutdownToStartAndFinish(JVMClusterUtil.MasterThread activeMaster,
    ServerName serverName) throws InterruptedException, IOException {
    ServerManager sm = activeMaster.getMaster().getServerManager();
    // First wait for it to be in dead list
    while (!sm.getDeadServers().isDeadServer(serverName)) {
      LOG.debug("Waiting for {} to be listed as dead in master", serverName);
      Thread.sleep(SLEEP_TIME);
    }
    LOG.debug("Server {} marked as dead, waiting for it to finish dead processing", serverName);
    while (sm.areDeadServersInProgress()) {
      LOG.debug("Server {} still being processed, waiting", serverName);
      Thread.sleep(SLEEP_TIME);
    }
    LOG.debug("Server {} done with server shutdown processing", serverName);
  }
}