001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.chaos.actions; 019 020import java.util.HashSet; 021import java.util.List; 022import java.util.Set; 023import org.apache.hadoop.hbase.ServerName; 024import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey; 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028/** 029 * Restarts a ratio of the running regionservers at the same time 030 */ 031public class BatchRestartRsAction extends RestartActionBaseAction { 032 float ratio; // ratio of regionservers to restart 033 private static final Logger LOG = LoggerFactory.getLogger(BatchRestartRsAction.class); 034 035 public BatchRestartRsAction(long sleepTime, float ratio) { 036 super(sleepTime); 037 this.ratio = ratio; 038 } 039 040 @Override 041 protected Logger getLogger() { 042 return LOG; 043 } 044 045 @Override 046 public void perform() throws Exception { 047 getLogger().info(String.format("Performing action: Batch restarting %d%% of region servers", 048 (int) (ratio * 100))); 049 List<ServerName> selectedServers = 050 PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio); 051 052 Set<ServerName> killedServers = new HashSet<>(); 053 054 for (ServerName server : selectedServers) { 055 // Don't keep killing servers if we're 056 // trying to stop the monkey. 057 if (context.isStopping()) { 058 break; 059 } 060 getLogger().info("Killing region server:" + server); 061 cluster.killRegionServer(server); 062 killedServers.add(server); 063 } 064 065 for (ServerName server : killedServers) { 066 cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT); 067 } 068 069 getLogger().info("Killed " + killedServers.size() + " region servers. Reported num of rs:" 070 + cluster.getClusterMetrics().getLiveServerMetrics().size()); 071 072 sleep(sleepTime); 073 074 for (ServerName server : killedServers) { 075 getLogger().info("Starting region server:" + server.getHostname()); 076 cluster.startRegionServer(server.getHostname(), server.getPort()); 077 078 } 079 for (ServerName server : killedServers) { 080 cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), 081 PolicyBasedChaosMonkey.TIMEOUT); 082 } 083 getLogger().info("Started " + killedServers.size() + " region servers. Reported num of rs:" 084 + cluster.getClusterMetrics().getLiveServerMetrics().size()); 085 } 086}