/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication.regionserver;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.replication.TableCFs;
import org.apache.hadoop.hbase.io.WALLink;
import org.apache.hadoop.hbase.procedure2.util.StringUtils;
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.zookeeper.ZKDump;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.util.concurrent.AtomicLongMap;

/**
 * Provides information about the existing states of replication, replication peers and queues.
 * <p>
 * Usage: hbase org.apache.hadoop.hbase.replication.regionserver.DumpReplicationQueues [args]
 * <p>
 * Arguments:
 * <ul>
 * <li>--distributed: polls each RS to dump information about its queues</li>
 * <li>--hdfs: reports HDFS usage by the replication queues (note: can be overestimated)</li>
 * </ul>
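 * <p>
 * Example invocation (using the flags documented above):
 *
 * <pre>
 * hbase org.apache.hadoop.hbase.replication.regionserver.DumpReplicationQueues --distributed --hdfs
 * </pre>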
 */
@InterfaceAudience.Private
public class DumpReplicationQueues extends Configured implements Tool {

  private static final Logger LOG = LoggerFactory.getLogger(DumpReplicationQueues.class.getName());

  private List<String> deadRegionServers;
  private List<String> deletedQueues;
  private AtomicLongMap<String> peersQueueSize;
  private long totalSizeOfWALs;
  private long numWalsNotFound;

  public DumpReplicationQueues() {
    deadRegionServers = new ArrayList<>();
    deletedQueues = new ArrayList<>();
    peersQueueSize = AtomicLongMap.create();
    totalSizeOfWALs = 0;
    numWalsNotFound = 0;
  }

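  /**
   * Holder for the command-line flags understood by this tool.
   */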
  static class DumpOptions {
    boolean hdfs = false;
    boolean distributed = false;

    public DumpOptions() {
    }

    public DumpOptions(DumpOptions that) {
      this.hdfs = that.hdfs;
      this.distributed = that.distributed;
    }

    boolean isHdfs() {
      return hdfs;
    }

    boolean isDistributed() {
      return distributed;
    }

    void setHdfs(boolean hdfs) {
      this.hdfs = hdfs;
    }

    void setDistributed(boolean distributed) {
      this.distributed = distributed;
    }
  }

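  /**
   * Parses the command-line arguments into a {@link DumpOptions}. A help flag is placed back onto
   * the queue so the caller can detect that parsing was incomplete.
   */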
  static DumpOptions parseOpts(Queue<String> args) {
    DumpOptions opts = new DumpOptions();

    String cmd = null;
    while ((cmd = args.poll()) != null) {
      if (cmd.equals("-h") || cmd.equals("--h") || cmd.equals("--help")) {
        // place item back onto queue so that caller knows parsing was incomplete
        args.add(cmd);
        break;
      }
      final String hdfs = "--hdfs";
      if (cmd.equals(hdfs)) {
        opts.setHdfs(true);
        continue;
      }
      final String distributed = "--distributed";
      if (cmd.equals(distributed)) {
        opts.setDistributed(true);
        continue;
      } else {
        printUsageAndExit("ERROR: Unrecognized option/command: " + cmd, -1);
      }
    }
    // check that --distributed is present when --hdfs is in the arguments; this must happen
    // after the loop (inside the loop it was unreachable, and the flags may come in any order)
    if (opts.isHdfs() && !opts.isDistributed()) {
      printUsageAndExit("ERROR: --hdfs option can only be used with --distributed", -1);
    }
    return opts;
  }

  /**
   * Main entry point.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    int ret = ToolRunner.run(conf, new DumpReplicationQueues(), args);
    System.exit(ret);
  }

  @Override
  public int run(String[] args) throws Exception {
    LinkedList<String> argv = new LinkedList<>(Arrays.asList(args));
    DumpOptions opts = parseOpts(argv);

    // if any arguments remain (e.g. a help flag), print usage and exit
    if (!argv.isEmpty()) {
      printUsage();
      return 0;
    }
    return dumpReplicationQueues(opts);
  }

  protected void printUsage() {
    printUsage(this.getClass().getName(), null);
  }

  protected static void printUsage(final String message) {
    printUsage(DumpReplicationQueues.class.getName(), message);
  }

  protected static void printUsage(final String className, final String message) {
    if (message != null && !message.isEmpty()) {
      System.err.println(message);
    }
    System.err.println("Usage: hbase " + className + " \\");
    System.err.println("  <OPTIONS> [-D<property=value>]*");
    System.err.println();
    System.err.println("General Options:");
    System.err.println(" -h|--h|--help  Show this help and exit.");
    System.err.println(" --distributed  Poll each RS and print its own replication queue. "
      + "Default only polls ZooKeeper.");
    System.err.println(" --hdfs         Use HDFS to calculate usage of WALs by replication."
      + " It could be overestimated if replicating to multiple peers."
      + " The --distributed flag is also required.");
  }

  protected static void printUsageAndExit(final String message, final int exitCode) {
    printUsage(message);
    System.exit(exitCode);
  }

  private int dumpReplicationQueues(DumpOptions opts) throws Exception {

    Configuration conf = getConf();
    HBaseAdmin.available(conf);
    ClusterConnection connection = (ClusterConnection) ConnectionFactory.createConnection(conf);
    Admin admin = connection.getAdmin();

    ZKWatcher zkw =
      new ZKWatcher(conf, "DumpReplicationQueues" + EnvironmentEdgeManager.currentTime(),
        new WarnOnlyAbortable(), true);

    try {
      LOG.info("Our Quorum: " + zkw.getQuorum());
      List<TableCFs> replicatedTableCFs = admin.listReplicatedTableCFs();
      if (replicatedTableCFs.isEmpty()) {
        LOG.info("No tables with a configured replication peer were found.");
        return 0;
      } else {
        LOG.info("Replicated Tables: " + replicatedTableCFs);
      }

      List<ReplicationPeerDescription> peers = admin.listReplicationPeers();

      if (peers.isEmpty()) {
        LOG.info("Replication is enabled but no peer configuration was found.");
      }

      System.out.println("Dumping replication peers and configurations:");
      System.out.println(dumpPeersState(peers));

      if (opts.isDistributed()) {
        LOG.info("Found [--distributed], will poll each RegionServer.");
        Set<String> peerIds =
          peers.stream().map(peer -> peer.getPeerId()).collect(Collectors.toSet());
        System.out.println(dumpQueues(zkw, peerIds, opts.isHdfs()));
        System.out.println(dumpReplicationSummary());
      } else {
        // use ZK instead
        System.out.print("Dumping replication znodes via ZooKeeper:");
        System.out.println(ZKDump.getReplicationZnodesDump(zkw));
      }
      return 0;
    } catch (IOException e) {
      // log the failure instead of returning silently
      LOG.error("Failed to dump replication queues", e);
      return -1;
    } finally {
      zkw.close();
      admin.close();
      connection.close();
    }
  }

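  /**
   * Summarizes what the dump discovered: deleted queues, dead RegionServers, the number of WALs
   * queued per peer, and the total size of WALs on HDFS.
   */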
  public String dumpReplicationSummary() {
    StringBuilder sb = new StringBuilder();
    if (!deletedQueues.isEmpty()) {
      sb.append("Found " + deletedQueues.size() + " deleted queues"
        + ", run hbck -fixReplication in order to remove the deleted replication queues\n");
      for (String deletedQueue : deletedQueues) {
        sb.append("    " + deletedQueue + "\n");
      }
    }
    if (!deadRegionServers.isEmpty()) {
      sb.append("Found " + deadRegionServers.size() + " dead regionservers"
        + ", restart one regionserver to transfer the queues of dead regionservers\n");
      for (String deadRs : deadRegionServers) {
        sb.append("    " + deadRs + "\n");
      }
    }
    if (!peersQueueSize.isEmpty()) {
      sb.append("Dumping each peer's number of WALs in the replication queue\n");
      for (Map.Entry<String, Long> entry : peersQueueSize.asMap().entrySet()) {
        sb.append(
          "    PeerId: " + entry.getKey() + " , sizeOfLogQueue: " + entry.getValue() + "\n");
      }
    }
    sb.append("    Total size of WALs on HDFS: " + StringUtils.humanSize(totalSizeOfWALs) + "\n");
    if (numWalsNotFound > 0) {
      sb.append("    ERROR: There are " + numWalsNotFound + " WALs not found!!!\n");
    }
    return sb.toString();
  }

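  /**
   * Renders the id, state, cluster key, endpoint, and table/namespace scope of each replication
   * peer.
   */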
  public String dumpPeersState(List<ReplicationPeerDescription> peers) throws Exception {
    StringBuilder sb = new StringBuilder();
    for (ReplicationPeerDescription peer : peers) {
      ReplicationPeerConfig peerConfig = peer.getPeerConfig();
      sb.append("Peer: " + peer.getPeerId() + "\n");
      sb.append("    " + "State: " + (peer.isEnabled() ? "ENABLED" : "DISABLED") + "\n");
      sb.append("    " + "Cluster Name: " + peerConfig.getClusterKey() + "\n");
      sb.append("    " + "Replication Endpoint: " + peerConfig.getReplicationEndpointImpl() + "\n");
      Map<String, String> currentConf = peerConfig.getConfiguration();
      // Only show when we have a custom configuration for the peer
      if (currentConf.size() > 1) {
        sb.append("    " + "Peer Configuration: " + currentConf + "\n");
      }
      sb.append("    " + "Peer Table CFs: " + peerConfig.getTableCFsMap() + "\n");
      sb.append("    " + "Peer Namespaces: " + peerConfig.getNamespaces() + "\n");
    }
    return sb.toString();
  }

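  /**
   * Walks every replicator znode, recording dead RegionServers and queues whose peer no longer
   * exists, and formats each queue it finds.
   */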
  public String dumpQueues(ZKWatcher zkw, Set<String> peerIds, boolean hdfs) throws Exception {
    StringBuilder sb = new StringBuilder();

    ReplicationQueueStorage queueStorage =
      ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
    Set<ServerName> liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode)
      .stream().map(ServerName::parseServerName).collect(Collectors.toSet());

    // Loop through each peer on each RS and dump its queues
    List<ServerName> regionservers = queueStorage.getListOfReplicators();
    if (regionservers == null || regionservers.isEmpty()) {
      return sb.toString();
    }
    for (ServerName regionserver : regionservers) {
      List<String> queueIds = queueStorage.getAllQueues(regionserver);
      if (!liveRegionServers.contains(regionserver)) {
        deadRegionServers.add(regionserver.getServerName());
      }
      for (String queueId : queueIds) {
        ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
        List<String> wals = queueStorage.getWALsInQueue(regionserver, queueId);
        Collections.sort(wals);
        // a queue whose peer id is no longer configured has been deleted
        boolean isDeleted = !peerIds.contains(queueInfo.getPeerId());
        if (isDeleted) {
          deletedQueues.add(regionserver + "/" + queueId);
        }
        sb.append(
          formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, isDeleted, hdfs));
      }
    }
    return sb.toString();
  }

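  /**
   * Formats a single replication queue: its znode, peer id, recovery status, WAL count, and the
   * replication position of every WAL. Optionally also reports the total WAL size on HDFS.
   */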
  private String formatQueue(ServerName regionserver, ReplicationQueueStorage queueStorage,
    ReplicationQueueInfo queueInfo, String queueId, List<String> wals, boolean isDeleted,
    boolean hdfs) throws Exception {
    StringBuilder sb = new StringBuilder();

    sb.append("Dumping replication queue info for RegionServer: [" + regionserver + "]" + "\n");
    sb.append("    Queue znode: " + queueId + "\n");
    sb.append("    PeerID: " + queueInfo.getPeerId() + "\n");
    sb.append("    Recovered: " + queueInfo.isQueueRecovered() + "\n");
    List<ServerName> deadServers = queueInfo.getDeadRegionServers();
    if (deadServers.isEmpty()) {
      sb.append("    No dead RegionServers found in this queue." + "\n");
    } else {
      sb.append("    Dead RegionServers: " + deadServers + "\n");
    }
    sb.append("    Was deleted: " + isDeleted + "\n");
    sb.append("    Number of WALs in replication queue: " + wals.size() + "\n");
    peersQueueSize.addAndGet(queueInfo.getPeerId(), wals.size());

    for (String wal : wals) {
      long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal);
      sb.append("    Replication position for " + wal + ": "
        + (position > 0 ? position : "0 (not started or nothing to replicate)") + "\n");
    }

    if (hdfs) {
      FileSystem fs = FileSystem.get(getConf());
      sb.append("    Total size of WALs on HDFS for this queue: "
        + StringUtils.humanSize(getTotalWALSize(fs, wals, regionserver)) + "\n");
    }
    return sb.toString();
  }

  /**
   * Returns the total size in bytes of the given WALs, counting and skipping any WAL that can no
   * longer be found on the filesystem.
   */
  private long getTotalWALSize(FileSystem fs, List<String> wals, ServerName server)
    throws IOException {
    long size = 0;

    for (String wal : wals) {
      FileStatus fileStatus;
      try {
        fileStatus = (new WALLink(getConf(), server.getServerName(), wal)).getFileStatus(fs);
      } catch (FileNotFoundException e) {
        numWalsNotFound++;
        LOG.warn("WAL " + wal + " couldn't be found, skipping", e);
        continue;
      } catch (IOException e) {
        LOG.warn("Can't get file status of WAL " + wal + ", skipping", e);
        continue;
      }
      size += fileStatus.getLen();
    }

    totalSizeOfWALs += size;
    return size;
  }

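  /**
   * An Abortable that merely logs abort requests instead of acting on them, since this tool only
   * reads replication state.
   */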
  private static class WarnOnlyAbortable implements Abortable {
    @Override
    public void abort(String why, Throwable e) {
      LOG.warn("DumpReplicationQueues received abort, ignoring.  Reason: " + why);
      if (LOG.isDebugEnabled()) {
        LOG.debug(e.toString(), e);
      }
    }

    @Override
    public boolean isAborted() {
      return false;
    }
  }
}