001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.chaos.actions;
019
020import java.util.HashSet;
021import java.util.List;
022import java.util.Set;
023import org.apache.hadoop.hbase.ServerName;
024import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028/**
029 * Restarts a ratio of the running regionservers at the same time
030 */
031public class BatchRestartRsAction extends RestartActionBaseAction {
032  float ratio; // ratio of regionservers to restart
033  private static final Logger LOG = LoggerFactory.getLogger(BatchRestartRsAction.class);
034
035  public BatchRestartRsAction(long sleepTime, float ratio) {
036    super(sleepTime);
037    this.ratio = ratio;
038  }
039
040  @Override
041  protected Logger getLogger() {
042    return LOG;
043  }
044
045  @Override
046  public void perform() throws Exception {
047    getLogger().info(String.format("Performing action: Batch restarting %d%% of region servers",
048      (int) (ratio * 100)));
049    List<ServerName> selectedServers =
050      PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
051
052    Set<ServerName> killedServers = new HashSet<>();
053
054    for (ServerName server : selectedServers) {
055      // Don't keep killing servers if we're
056      // trying to stop the monkey.
057      if (context.isStopping()) {
058        break;
059      }
060      getLogger().info("Killing region server:" + server);
061      cluster.killRegionServer(server);
062      killedServers.add(server);
063    }
064
065    for (ServerName server : killedServers) {
066      cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
067    }
068
069    getLogger().info("Killed " + killedServers.size() + " region servers. Reported num of rs:"
070      + cluster.getClusterMetrics().getLiveServerMetrics().size());
071
072    sleep(sleepTime);
073
074    for (ServerName server : killedServers) {
075      getLogger().info("Starting region server:" + server.getHostname());
076      cluster.startRegionServer(server.getHostname(), server.getPort());
077
078    }
079    for (ServerName server : killedServers) {
080      cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(),
081        PolicyBasedChaosMonkey.TIMEOUT);
082    }
083    getLogger().info("Started " + killedServers.size() + " region servers. Reported num of rs:"
084      + cluster.getClusterMetrics().getLiveServerMetrics().size());
085  }
086}