001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
026import org.apache.hadoop.hbase.util.Threads;
027import org.apache.yetus.audience.InterfaceAudience;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
032import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
033import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
034
035/**
036 * This class defines methods that can help with managing HBase clusters from unit tests and system
037 * tests. There are 3 types of cluster deployments:
038 * <ul>
039 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, used by unit
040 * tests</li>
041 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
042 * interact with the cluster.</li>
043 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
044 * </li>
045 * </ul>
046 * <p>
047 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
048 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
049 * of nodes during execution of integration tests.
050 * <p>
051 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
052 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
053 * some tests will still need to mock stuff and introspect internal state. For those use cases from
054 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
055 * this class does not abstract away <strong>every</strong> interface that MiniHBaseCluster or
056 * DistributedHBaseCluster provide.
057 */
058@InterfaceAudience.Public
059public abstract class HBaseCluster implements Closeable, Configurable {
060  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
061  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
062  protected Configuration conf;
063
064  /** the status of the cluster before we begin */
065  protected ClusterMetrics initialClusterStatus;
066
067  /**
068   * Construct an HBaseCluster
069   * @param conf Configuration to be used for cluster
070   */
071  public HBaseCluster(Configuration conf) {
072    setConf(conf);
073  }
074
075  @Override
076  public void setConf(Configuration conf) {
077    this.conf = conf;
078  }
079
080  @Override
081  public Configuration getConf() {
082    return conf;
083  }
084
085  /**
086   * Returns a ClusterMetrics for this HBase cluster.
087   * @see #getInitialClusterMetrics()
088   */
089  public abstract ClusterMetrics getClusterMetrics() throws IOException;
090
091  /**
092   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
093   */
094  public ClusterMetrics getInitialClusterMetrics() throws IOException {
095    return initialClusterStatus;
096  }
097
098  /**
099   * Returns an {@link MasterService.BlockingInterface} to the active master
100   */
101  public abstract MasterService.BlockingInterface getMasterAdminService() throws IOException;
102
103  /**
104   * Returns an AdminProtocol interface to the regionserver
105   */
106  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
107    throws IOException;
108
109  /**
110   * Returns a ClientProtocol interface to the regionserver
111   */
112  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
113    throws IOException;
114
115  /**
116   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
117   * region server locally.
118   * @param hostname the hostname to start the regionserver on
119   * @throws IOException if something goes wrong
120   */
121  public abstract void startRegionServer(String hostname, int port) throws IOException;
122
123  /**
124   * Kills the region server process if this is a distributed cluster, otherwise this causes the
125   * region server to exit doing basic clean up only.
126   * @throws IOException if something goes wrong
127   */
128  public abstract void killRegionServer(ServerName serverName) throws IOException;
129
130  /**
131   * Keeping track of killed servers and being able to check if a particular server was killed makes
132   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
133   * example of such case is - killing servers and waiting for all regions of a particular table to
134   * be assigned. We can check for server column in META table and that its value is not one of the
135   * killed servers.
136   */
137  public abstract boolean isKilledRS(ServerName serverName);
138
139  /**
140   * Stops the given region server, by attempting a gradual stop.
141   * @throws IOException if something goes wrong
142   */
143  public abstract void stopRegionServer(ServerName serverName) throws IOException;
144
145  /**
146   * Wait for the specified region server to join the cluster
147   * @throws IOException if something goes wrong or timeout occurs
148   */
149  public void waitForRegionServerToStart(String hostname, int port, long timeout)
150    throws IOException {
151    long start = EnvironmentEdgeManager.currentTime();
152    while ((EnvironmentEdgeManager.currentTime() - start) < timeout) {
153      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
154        if (server.getHostname().equals(hostname) && server.getPort() == port) {
155          return;
156        }
157      }
158      Threads.sleep(100);
159    }
160    throw new IOException(
161      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
162  }
163
164  /**
165   * Wait for the specified region server to stop the thread / process.
166   * @throws IOException if something goes wrong or timeout occurs
167   */
168  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
169    throws IOException;
170
171  /**
172   * Suspend the region server
173   * @param serverName the hostname to suspend the regionserver on
174   * @throws IOException if something goes wrong
175   */
176  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
177
178  /**
179   * Resume the region server
180   * @param serverName the hostname to resume the regionserver on
181   * @throws IOException if something goes wrong
182   */
183  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
184
185  /**
186   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
187   * logs warning message.
188   * @param hostname the hostname to start the regionserver on
189   * @throws IOException if something goes wrong
190   */
191  public abstract void startZkNode(String hostname, int port) throws IOException;
192
193  /**
194   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
195   * master to exit doing basic clean up only.
196   * @throws IOException if something goes wrong
197   */
198  public abstract void killZkNode(ServerName serverName) throws IOException;
199
200  /**
201   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
202   * message.
203   * @throws IOException if something goes wrong
204   */
205  public abstract void stopZkNode(ServerName serverName) throws IOException;
206
207  /**
208   * Wait for the specified zookeeper node to join the cluster
209   * @throws IOException if something goes wrong or timeout occurs
210   */
211  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
212
213  /**
214   * Wait for the specified zookeeper node to stop the thread / process.
215   * @throws IOException if something goes wrong or timeout occurs
216   */
217  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
218
219  /**
220   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
221   * warning message.
222   * @throws IOException if something goes wrong
223   */
224  public abstract void startDataNode(ServerName serverName) throws IOException;
225
226  /**
227   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
228   * exit doing basic clean up only.
229   * @throws IOException if something goes wrong
230   */
231  public abstract void killDataNode(ServerName serverName) throws IOException;
232
233  /**
234   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
235   * @throws IOException if something goes wrong
236   */
237  public abstract void stopDataNode(ServerName serverName) throws IOException;
238
239  /**
240   * Wait for the specified datanode to join the cluster
241   * @throws IOException if something goes wrong or timeout occurs
242   */
243  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
244    throws IOException;
245
246  /**
247   * Wait for the specified datanode to stop the thread / process.
248   * @throws IOException if something goes wrong or timeout occurs
249   */
250  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
251    throws IOException;
252
253  /**
254   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
255   * warning message.
256   * @throws IOException if something goes wrong
257   */
258  public abstract void startNameNode(ServerName serverName) throws IOException;
259
260  /**
261   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
262   * exit doing basic clean up only.
263   * @throws IOException if something goes wrong
264   */
265  public abstract void killNameNode(ServerName serverName) throws IOException;
266
267  /**
268   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
269   * @throws IOException if something goes wrong
270   */
271  public abstract void stopNameNode(ServerName serverName) throws IOException;
272
273  /**
274   * Wait for the specified namenode to join the cluster
275   * @throws IOException if something goes wrong or timeout occurs
276   */
277  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
278    throws IOException;
279
280  /**
281   * Wait for the specified namenode to stop
282   * @throws IOException if something goes wrong or timeout occurs
283   */
284  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
285    throws IOException;
286
287  /**
288   * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
289   * logs warning message.
290   * @throws IOException if something goes wrong
291   */
292  public abstract void startJournalNode(ServerName serverName) throws IOException;
293
294  /**
295   * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
296   * to exit doing basic clean up only.
297   * @throws IOException if something goes wrong
298   */
299  public abstract void killJournalNode(ServerName serverName) throws IOException;
300
301  /**
302   * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
303   * message.
304   * @throws IOException if something goes wrong
305   */
306  public abstract void stopJournalNode(ServerName serverName) throws IOException;
307
308  /**
309   * Wait for the specified journalnode to join the cluster
310   * @throws IOException if something goes wrong or timeout occurs
311   */
312  public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
313    throws IOException;
314
315  /**
316   * Wait for the specified journalnode to stop
317   * @throws IOException if something goes wrong or timeout occurs
318   */
319  public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
320    throws IOException;
321
322  /**
323   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
324   * locally.
325   * @param hostname the hostname to start the master on
326   * @throws IOException if something goes wrong
327   */
328  public abstract void startMaster(String hostname, int port) throws IOException;
329
330  /**
331   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
332   * exit doing basic clean up only.
333   * @throws IOException if something goes wrong
334   */
335  public abstract void killMaster(ServerName serverName) throws IOException;
336
337  /**
338   * Stops the given master, by attempting a gradual stop.
339   * @throws IOException if something goes wrong
340   */
341  public abstract void stopMaster(ServerName serverName) throws IOException;
342
343  /**
344   * Wait for the specified master to stop the thread / process.
345   * @throws IOException if something goes wrong or timeout occurs
346   */
347  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
348
349  /**
350   * Blocks until there is an active master and that master has completed initialization.
351   * @return true if an active master becomes available. false if there are no masters left.
352   * @throws IOException if something goes wrong or timeout occurs
353   */
354  public boolean waitForActiveAndReadyMaster() throws IOException {
355    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
356  }
357
358  /**
359   * Blocks until there is an active master and that master has completed initialization.
360   * @param timeout the timeout limit in ms
361   * @return true if an active master becomes available. false if there are no masters left.
362   */
363  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
364
365  /**
366   * Wait for HBase Cluster to shut down.
367   */
368  public abstract void waitUntilShutDown() throws IOException;
369
370  /**
371   * Shut down the HBase cluster
372   */
373  public abstract void shutdown() throws IOException;
374
375  /**
376   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
377   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
378   * etc. restoration might be partial.
379   * @return whether restoration is complete
380   */
381  public boolean restoreInitialStatus() throws IOException {
382    return restoreClusterMetrics(getInitialClusterMetrics());
383  }
384
385  /**
386   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
387   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
388   * restoration might be partial.
389   * @return whether restoration is complete
390   */
391  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
392    return true;
393  }
394
395  /**
396   * Get the ServerName of region server serving the first hbase:meta region
397   */
398  public ServerName getServerHoldingMeta() throws IOException {
399    return getServerHoldingRegion(TableName.META_TABLE_NAME,
400      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
401  }
402
403  /**
404   * Get the ServerName of region server serving the specified region
405   * @param regionName Name of the region in bytes
406   * @param tn         Table name that has the region.
407   * @return ServerName that hosts the region or null
408   */
409  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
410    throws IOException;
411
412  /**
413   * @return whether we are interacting with a distributed cluster as opposed to an in-process
414   *         mini/local cluster.
415   */
416  public boolean isDistributedCluster() {
417    return false;
418  }
419
420  /**
421   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
422   * cluster.
423   * @see #shutdown()
424   */
425  @Override
426  public abstract void close() throws IOException;
427
428  /**
429   * Wait for the namenode.
430   */
431  public void waitForNamenodeAvailable() throws InterruptedException {
432  }
433
434  public void waitForDatanodesRegistered(int nbDN) throws Exception {
435  }
436}