001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import org.apache.hadoop.conf.Configuration; 021import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus; 022import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 023import org.apache.hadoop.util.StringUtils; 024import org.apache.yetus.audience.InterfaceAudience; 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028/** 029 * The Class HealthCheckChore for running health checker regularly. 030 */ 031@InterfaceAudience.Private 032public class HealthCheckChore extends ScheduledChore { 033 private static final Logger LOG = LoggerFactory.getLogger(HealthCheckChore.class); 034 private HealthChecker healthChecker; 035 private Configuration config; 036 private int threshold; 037 private int numTimesUnhealthy = 0; 038 private long failureWindow; 039 private long startWindow; 040 041 public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) { 042 super("HealthChecker", stopper, sleepTime); 043 LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime)); 044 this.config = conf; 045 String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC); 046 long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 047 HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT); 048 healthChecker = new HealthChecker(); 049 healthChecker.init(healthCheckScript, scriptTimeout); 050 this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD, 051 HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD); 052 this.failureWindow = (long) this.threshold * (long) sleepTime; 053 } 054 055 @Override 056 protected void chore() { 057 HealthReport report = healthChecker.checkHealth(); 058 boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS); 059 if (!isHealthy) { 060 boolean needToStop = decideToStop(); 061 if (needToStop) { 062 this.getStopper() 063 .stop("The node reported unhealthy " + threshold + " number of times consecutively."); 064 } 065 // Always log health report. 066 LOG.info("Health status at " + StringUtils.formatTime(EnvironmentEdgeManager.currentTime()) 067 + " : " + report.getHealthReport()); 068 } 069 } 070 071 private boolean decideToStop() { 072 boolean stop = false; 073 if (numTimesUnhealthy == 0) { 074 // First time we are seeing a failure. No need to stop, just 075 // record the time. 076 numTimesUnhealthy++; 077 startWindow = EnvironmentEdgeManager.currentTime(); 078 } else { 079 if ((EnvironmentEdgeManager.currentTime() - startWindow) < failureWindow) { 080 numTimesUnhealthy++; 081 if (numTimesUnhealthy == threshold) { 082 stop = true; 083 } else { 084 stop = false; 085 } 086 } else { 087 // Outside of failure window, so we reset to 1. 088 numTimesUnhealthy = 1; 089 startWindow = EnvironmentEdgeManager.currentTime(); 090 stop = false; 091 } 092 } 093 return stop; 094 } 095 096}