001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile;
019
020import static com.codahale.metrics.MetricRegistry.name;
021
022import com.codahale.metrics.ConsoleReporter;
023import com.codahale.metrics.Histogram;
024import com.codahale.metrics.MetricRegistry;
025import com.codahale.metrics.Snapshot;
026import com.codahale.metrics.UniformReservoir;
027import java.io.ByteArrayOutputStream;
028import java.io.DataInput;
029import java.io.IOException;
030import java.io.PrintStream;
031import java.text.DateFormat;
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.HashMap;
035import java.util.Iterator;
036import java.util.LinkedHashSet;
037import java.util.List;
038import java.util.Locale;
039import java.util.Map;
040import java.util.Optional;
041import java.util.Set;
042import java.util.TimeZone;
043import java.util.concurrent.atomic.LongAdder;
044import org.apache.hadoop.conf.Configuration;
045import org.apache.hadoop.conf.Configured;
046import org.apache.hadoop.fs.FileSystem;
047import org.apache.hadoop.fs.Path;
048import org.apache.hadoop.hbase.Cell;
049import org.apache.hadoop.hbase.CellComparator;
050import org.apache.hadoop.hbase.CellUtil;
051import org.apache.hadoop.hbase.ExtendedCell;
052import org.apache.hadoop.hbase.HBaseConfiguration;
053import org.apache.hadoop.hbase.HBaseInterfaceAudience;
054import org.apache.hadoop.hbase.HConstants;
055import org.apache.hadoop.hbase.KeyValue;
056import org.apache.hadoop.hbase.KeyValueUtil;
057import org.apache.hadoop.hbase.PrivateCellUtil;
058import org.apache.hadoop.hbase.TableName;
059import org.apache.hadoop.hbase.Tag;
060import org.apache.hadoop.hbase.client.RegionInfo;
061import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
062import org.apache.hadoop.hbase.mob.MobUtils;
063import org.apache.hadoop.hbase.regionserver.HStoreFile;
064import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
065import org.apache.hadoop.hbase.util.BloomFilter;
066import org.apache.hadoop.hbase.util.BloomFilterFactory;
067import org.apache.hadoop.hbase.util.BloomFilterUtil;
068import org.apache.hadoop.hbase.util.Bytes;
069import org.apache.hadoop.hbase.util.CommonFSUtils;
070import org.apache.hadoop.hbase.util.HFileArchiveUtil;
071import org.apache.hadoop.util.Tool;
072import org.apache.hadoop.util.ToolRunner;
073import org.apache.yetus.audience.InterfaceAudience;
074import org.apache.yetus.audience.InterfaceStability;
075import org.slf4j.Logger;
076import org.slf4j.LoggerFactory;
077
078import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;
079import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser;
080import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter;
081import org.apache.hbase.thirdparty.org.apache.commons.cli.Option;
082import org.apache.hbase.thirdparty.org.apache.commons.cli.OptionGroup;
083import org.apache.hbase.thirdparty.org.apache.commons.cli.Options;
084import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException;
085import org.apache.hbase.thirdparty.org.apache.commons.cli.PosixParser;
086
087/**
088 * Implements pretty-printing functionality for {@link HFile}s.
089 */
090@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
091@InterfaceStability.Evolving
092public class HFilePrettyPrinter extends Configured implements Tool {
093
094  private static final Logger LOG = LoggerFactory.getLogger(HFilePrettyPrinter.class);
095
096  private Options options = new Options();
097
098  private boolean verbose;
099  private boolean printValue;
100  private boolean printKey;
101  private boolean shouldPrintMeta;
102  private boolean printBlockIndex;
103  private boolean printBlockHeaders;
104  private boolean printStats;
105  private boolean printStatRanges;
106  private boolean checkRow;
107  private boolean checkFamily;
108  private boolean isSeekToRow = false;
109  private boolean checkMobIntegrity = false;
110  private Map<String, List<Path>> mobFileLocations;
111  private static final int FOUND_MOB_FILES_CACHE_CAPACITY = 50;
112  private static final int MISSING_MOB_FILES_CACHE_CAPACITY = 20;
113  private PrintStream out = System.out;
114  private PrintStream err = System.err;
115
116  /**
117   * The row which the user wants to specify and print all the KeyValues for.
118   */
119  private byte[] row = null;
120
121  private List<Path> files = new ArrayList<>();
122  private int count;
123
124  private static final String FOUR_SPACES = "    ";
125
126  public HFilePrettyPrinter() {
127    super();
128    init();
129  }
130
131  public HFilePrettyPrinter(Configuration conf) {
132    super(conf);
133    init();
134  }
135
136  private void init() {
137    options.addOption("v", "verbose", false, "Verbose output; emits file and meta data delimiters");
138    options.addOption("p", "printkv", false, "Print key/value pairs");
139    options.addOption("e", "printkey", false, "Print keys");
140    options.addOption("m", "printmeta", false, "Print meta data of file");
141    options.addOption("b", "printblocks", false, "Print block index meta data");
142    options.addOption("h", "printblockheaders", false, "Print block headers for each block.");
143    options.addOption("k", "checkrow", false,
144      "Enable row order check; looks for out-of-order keys");
145    options.addOption("a", "checkfamily", false, "Enable family check");
146    options.addOption("w", "seekToRow", true,
147      "Seek to this row and print all the kvs for this row only");
148    options.addOption("s", "stats", false, "Print statistics");
149    options.addOption("d", "details", false,
150      "Print detailed statistics, including counts by range");
151    options.addOption("i", "checkMobIntegrity", false,
152      "Print all cells whose mob files are missing");
153
154    OptionGroup files = new OptionGroup();
155    files.addOption(new Option("f", "file", true,
156      "File to scan. Pass full-path; e.g. hdfs://a:9000/hbase/hbase:meta/12/34"));
157    files.addOption(
158      new Option("r", "region", true, "Region to scan. Pass region name; e.g. 'hbase:meta,,1'"));
159    options.addOptionGroup(files);
160  }
161
162  public void setPrintStreams(PrintStream out, PrintStream err) {
163    this.out = out;
164    this.err = err;
165  }
166
167  public boolean parseOptions(String args[]) throws ParseException, IOException {
168    if (args.length == 0) {
169      HelpFormatter formatter = new HelpFormatter();
170      formatter.printHelp("hfile", options, true);
171      return false;
172    }
173    CommandLineParser parser = new PosixParser();
174    CommandLine cmd = parser.parse(options, args);
175
176    verbose = cmd.hasOption("v");
177    printValue = cmd.hasOption("p");
178    printKey = cmd.hasOption("e") || printValue;
179    shouldPrintMeta = cmd.hasOption("m");
180    printBlockIndex = cmd.hasOption("b");
181    printBlockHeaders = cmd.hasOption("h");
182    printStatRanges = cmd.hasOption("d");
183    printStats = cmd.hasOption("s") || printStatRanges;
184    checkRow = cmd.hasOption("k");
185    checkFamily = cmd.hasOption("a");
186    checkMobIntegrity = cmd.hasOption("i");
187
188    if (cmd.hasOption("f")) {
189      files.add(new Path(cmd.getOptionValue("f")));
190    }
191
192    if (cmd.hasOption("w")) {
193      String key = cmd.getOptionValue("w");
194      if (key != null && key.length() != 0) {
195        row = Bytes.toBytesBinary(key);
196        isSeekToRow = true;
197      } else {
198        err.println("Invalid row is specified.");
199        System.exit(-1);
200      }
201    }
202
203    if (cmd.hasOption("r")) {
204      String regionName = cmd.getOptionValue("r");
205      byte[] rn = Bytes.toBytes(regionName);
206      byte[][] hri = RegionInfo.parseRegionName(rn);
207      Path rootDir = CommonFSUtils.getRootDir(getConf());
208      Path tableDir = CommonFSUtils.getTableDir(rootDir, TableName.valueOf(hri[0]));
209      String enc = RegionInfo.encodeRegionName(rn);
210      Path regionDir = new Path(tableDir, enc);
211      if (verbose) out.println("region dir -> " + regionDir);
212      List<Path> regionFiles = HFile.getStoreFiles(FileSystem.get(getConf()), regionDir);
213      if (verbose) out.println("Number of region files found -> " + regionFiles.size());
214      if (verbose) {
215        int i = 1;
216        for (Path p : regionFiles) {
217          if (verbose) out.println("Found file[" + i++ + "] -> " + p);
218        }
219      }
220      files.addAll(regionFiles);
221    }
222
223    if (checkMobIntegrity) {
224      if (verbose) {
225        System.out.println("checkMobIntegrity is enabled");
226      }
227      mobFileLocations = new HashMap<>();
228    }
229
230    cmd.getArgList().forEach((file) -> files.add(new Path(file)));
231
232    return true;
233  }
234
235  /**
236   * Runs the command-line pretty-printer, and returns the desired command exit code (zero for
237   * success, non-zero for failure).
238   */
239  @Override
240  public int run(String[] args) {
241    if (getConf() == null) {
242      throw new RuntimeException("A Configuration instance must be provided.");
243    }
244    try {
245      CommonFSUtils.setFsDefault(getConf(), CommonFSUtils.getRootDir(getConf()));
246      if (!parseOptions(args)) {
247        return 1;
248      }
249    } catch (IOException ex) {
250      LOG.error("Error parsing command-line options", ex);
251      return 1;
252    } catch (ParseException ex) {
253      LOG.error("Error parsing command-line options", ex);
254      return 1;
255    }
256
257    // iterate over all files found
258    for (Path fileName : files) {
259      try {
260        int exitCode = processFile(fileName, false);
261        if (exitCode != 0) {
262          return exitCode;
263        }
264      } catch (IOException ex) {
265        LOG.error("Error reading " + fileName, ex);
266        return -2;
267      }
268    }
269
270    if (verbose || printKey) {
271      out.println("Scanned kv count -> " + count);
272    }
273
274    return 0;
275  }
276
277  // HBASE-22561 introduces boolean checkRootDir for WebUI specificly
278  public int processFile(Path file, boolean checkRootDir) throws IOException {
279    if (verbose) {
280      out.println("Scanning -> " + file);
281    }
282
283    if (checkRootDir) {
284      Path rootPath = CommonFSUtils.getRootDir(getConf());
285      String rootString = rootPath + Path.SEPARATOR;
286      if (!file.toString().startsWith(rootString)) {
287        // First we see if fully-qualified URI matches the root dir. It might
288        // also be an absolute path in the same filesystem, so we prepend the FS
289        // of the root dir and see if that fully-qualified URI matches.
290        FileSystem rootFS = rootPath.getFileSystem(getConf());
291        String qualifiedFile = rootFS.getUri().toString() + file.toString();
292        if (!qualifiedFile.startsWith(rootString)) {
293          err.println(
294            "ERROR, file (" + file + ") is not in HBase's root directory (" + rootString + ")");
295          return -2;
296        }
297      }
298    }
299
300    FileSystem fs = file.getFileSystem(getConf());
301    if (!fs.exists(file)) {
302      err.println("ERROR, file doesnt exist: " + file);
303      return -2;
304    }
305
306    HFile.Reader reader = HFile.createReader(fs, file, CacheConfig.DISABLED, true, getConf());
307
308    Map<byte[], byte[]> fileInfo = reader.getHFileInfo();
309
310    KeyValueStatsCollector fileStats = null;
311
312    if (verbose || printKey || checkRow || checkFamily || printStats || checkMobIntegrity) {
313      // scan over file and read key/value's and check if requested
314      HFileScanner scanner = reader.getScanner(getConf(), false, false, false);
315      fileStats = new KeyValueStatsCollector();
316      boolean shouldScanKeysValues;
317      if (this.isSeekToRow && !Bytes.equals(row, reader.getFirstRowKey().orElse(null))) {
318        // seek to the first kv on this row
319        shouldScanKeysValues = (scanner.seekTo(PrivateCellUtil.createFirstOnRow(this.row)) != -1);
320      } else {
321        shouldScanKeysValues = scanner.seekTo();
322      }
323      if (shouldScanKeysValues) {
324        scanKeysValues(file, fileStats, scanner, row);
325      }
326    }
327
328    // print meta data
329    if (shouldPrintMeta) {
330      printMeta(reader, fileInfo);
331    }
332
333    if (printBlockIndex) {
334      out.println("Block Index:");
335      out.println(reader.getDataBlockIndexReader());
336    }
337
338    if (printBlockHeaders) {
339      out.println("Block Headers:");
340      /*
341       * TODO: this same/similar block iteration logic is used in HFileBlock#blockRange and
342       * TestLazyDataBlockDecompression. Refactor?
343       */
344      FSDataInputStreamWrapper fsdis = new FSDataInputStreamWrapper(fs, file);
345      long fileSize = fs.getFileStatus(file).getLen();
346      FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis.getStream(false), fileSize);
347      long offset = trailer.getFirstDataBlockOffset(), max = trailer.getLastDataBlockOffset();
348      HFileBlock block;
349      while (offset <= max) {
350        block = reader.readBlock(offset, -1, /* cacheBlock */ false, /* pread */ false,
351          /* isCompaction */ false, /* updateCacheMetrics */ false, null, null);
352        offset += block.getOnDiskSizeWithHeader();
353        out.println(block);
354      }
355    }
356
357    if (printStats) {
358      fileStats.finish(printStatRanges);
359      out.println("Stats:\n" + fileStats);
360    }
361
362    reader.close();
363    return 0;
364  }
365
366  private void scanKeysValues(Path file, KeyValueStatsCollector fileStats, HFileScanner scanner,
367    byte[] row) throws IOException {
368    Cell pCell = null;
369    FileSystem fs = FileSystem.get(getConf());
370    Set<String> foundMobFiles = new LinkedHashSet<>(FOUND_MOB_FILES_CACHE_CAPACITY);
371    Set<String> missingMobFiles = new LinkedHashSet<>(MISSING_MOB_FILES_CACHE_CAPACITY);
372    do {
373      ExtendedCell cell = scanner.getCell();
374      if (row != null && row.length != 0) {
375        int result = CellComparator.getInstance().compareRows(cell, row, 0, row.length);
376        if (result > 0) {
377          break;
378        } else if (result < 0) {
379          continue;
380        }
381      }
382      // collect stats
383      if (printStats) {
384        fileStats.collect(cell, printStatRanges);
385      }
386      // dump key value
387      if (printKey) {
388        out.print("K: " + cell);
389        if (printValue) {
390          out.print(" V: " + Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(),
391            cell.getValueLength()));
392          int i = 0;
393          List<Tag> tags = PrivateCellUtil.getTags(cell);
394          for (Tag tag : tags) {
395            out.print(String.format(" T[%d]: %s", i++, tag.toString()));
396          }
397        }
398        out.println();
399      }
400      // check if rows are in order
401      if (checkRow && pCell != null) {
402        if (CellComparator.getInstance().compareRows(pCell, cell) > 0) {
403          err.println("WARNING, previous row is greater then" + " current row\n\tfilename -> "
404            + file + "\n\tprevious -> " + CellUtil.getCellKeyAsString(pCell) + "\n\tcurrent  -> "
405            + CellUtil.getCellKeyAsString(cell));
406        }
407      }
408      // check if families are consistent
409      if (checkFamily) {
410        String fam =
411          Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
412        if (!file.toString().contains(fam)) {
413          err.println("WARNING, filename does not match kv family," + "\n\tfilename -> " + file
414            + "\n\tkeyvalue -> " + CellUtil.getCellKeyAsString(cell));
415        }
416        if (pCell != null && CellComparator.getInstance().compareFamilies(pCell, cell) != 0) {
417          err.println(
418            "WARNING, previous kv has different family" + " compared to current key\n\tfilename -> "
419              + file + "\n\tprevious -> " + CellUtil.getCellKeyAsString(pCell) + "\n\tcurrent  -> "
420              + CellUtil.getCellKeyAsString(cell));
421        }
422      }
423      // check if mob files are missing.
424      if (checkMobIntegrity && MobUtils.isMobReferenceCell(cell)) {
425        Optional<TableName> tn = MobUtils.getTableName(cell);
426        if (!tn.isPresent()) {
427          System.err.println(
428            "ERROR, wrong tag format in mob reference cell " + CellUtil.getCellKeyAsString(cell));
429        } else if (!MobUtils.hasValidMobRefCellValue(cell)) {
430          System.err.println(
431            "ERROR, wrong value format in mob reference cell " + CellUtil.getCellKeyAsString(cell));
432        } else {
433          String mobFileName = MobUtils.getMobFileName(cell);
434          boolean exist = mobFileExists(fs, tn.get(), mobFileName,
435            Bytes.toString(CellUtil.cloneFamily(cell)), foundMobFiles, missingMobFiles);
436          if (!exist) {
437            // report error
438            System.err.println("ERROR, the mob file [" + mobFileName
439              + "] is missing referenced by cell " + CellUtil.getCellKeyAsString(cell));
440          }
441        }
442      }
443      pCell = cell;
444      ++count;
445    } while (scanner.next());
446  }
447
448  /**
449   * Checks whether the referenced mob file exists.
450   */
451  private boolean mobFileExists(FileSystem fs, TableName tn, String mobFileName, String family,
452    Set<String> foundMobFiles, Set<String> missingMobFiles) throws IOException {
453    if (foundMobFiles.contains(mobFileName)) {
454      return true;
455    }
456    if (missingMobFiles.contains(mobFileName)) {
457      return false;
458    }
459    String tableName = tn.getNameAsString();
460    List<Path> locations = mobFileLocations.get(tableName);
461    if (locations == null) {
462      locations = new ArrayList<>(2);
463      locations.add(MobUtils.getMobFamilyPath(getConf(), tn, family));
464      locations.add(HFileArchiveUtil.getStoreArchivePath(getConf(), tn,
465        MobUtils.getMobRegionInfo(tn).getEncodedName(), family));
466      mobFileLocations.put(tn.getNameAsString(), locations);
467    }
468    boolean exist = false;
469    for (Path location : locations) {
470      Path mobFilePath = new Path(location, mobFileName);
471      if (fs.exists(mobFilePath)) {
472        exist = true;
473        break;
474      }
475    }
476    if (exist) {
477      evictMobFilesIfNecessary(foundMobFiles, FOUND_MOB_FILES_CACHE_CAPACITY);
478      foundMobFiles.add(mobFileName);
479    } else {
480      evictMobFilesIfNecessary(missingMobFiles, MISSING_MOB_FILES_CACHE_CAPACITY);
481      missingMobFiles.add(mobFileName);
482    }
483    return exist;
484  }
485
486  /**
487   * Evicts the cached mob files if the set is larger than the limit.
488   */
489  private void evictMobFilesIfNecessary(Set<String> mobFileNames, int limit) {
490    if (mobFileNames.size() < limit) {
491      return;
492    }
493    int index = 0;
494    int evict = limit / 2;
495    Iterator<String> fileNamesItr = mobFileNames.iterator();
496    while (index < evict && fileNamesItr.hasNext()) {
497      fileNamesItr.next();
498      fileNamesItr.remove();
499      index++;
500    }
501  }
502
503  /**
504   * Format a string of the form "k1=v1, k2=v2, ..." into separate lines with a four-space
505   * indentation.
506   */
507  private static String asSeparateLines(String keyValueStr) {
508    return keyValueStr.replaceAll(", ([a-zA-Z]+=)", ",\n" + FOUR_SPACES + "$1");
509  }
510
511  private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo) throws IOException {
512    out.println("Block index size as per heapsize: " + reader.indexSize());
513    out.println(asSeparateLines(reader.toString()));
514    out.println("Trailer:\n    " + asSeparateLines(reader.getTrailer().toString()));
515    out.println("Fileinfo:");
516    for (Map.Entry<byte[], byte[]> e : fileInfo.entrySet()) {
517      out.print(FOUR_SPACES + Bytes.toString(e.getKey()) + " = ");
518      if (
519        Bytes.equals(e.getKey(), HStoreFile.MAX_SEQ_ID_KEY)
520          || Bytes.equals(e.getKey(), HStoreFile.DELETE_FAMILY_COUNT)
521          || Bytes.equals(e.getKey(), HStoreFile.EARLIEST_PUT_TS)
522          || Bytes.equals(e.getKey(), HFileWriterImpl.MAX_MEMSTORE_TS_KEY)
523          || Bytes.equals(e.getKey(), HFileInfo.CREATE_TIME_TS)
524          || Bytes.equals(e.getKey(), HStoreFile.BULKLOAD_TIME_KEY)
525      ) {
526        out.println(Bytes.toLong(e.getValue()));
527      } else if (Bytes.equals(e.getKey(), HStoreFile.TIMERANGE_KEY)) {
528        TimeRangeTracker timeRangeTracker = TimeRangeTracker.parseFrom(e.getValue());
529        out.println(timeRangeTracker.getMin() + "...." + timeRangeTracker.getMax());
530      } else if (
531        Bytes.equals(e.getKey(), HFileInfo.AVG_KEY_LEN)
532          || Bytes.equals(e.getKey(), HFileInfo.AVG_VALUE_LEN)
533          || Bytes.equals(e.getKey(), HFileWriterImpl.KEY_VALUE_VERSION)
534          || Bytes.equals(e.getKey(), HFileInfo.MAX_TAGS_LEN)
535      ) {
536        out.println(Bytes.toInt(e.getValue()));
537      } else if (
538        Bytes.equals(e.getKey(), HStoreFile.MAJOR_COMPACTION_KEY)
539          || Bytes.equals(e.getKey(), HFileInfo.TAGS_COMPRESSED)
540          || Bytes.equals(e.getKey(), HStoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY)
541          || Bytes.equals(e.getKey(), HStoreFile.HISTORICAL_KEY)
542      ) {
543        out.println(Bytes.toBoolean(e.getValue()));
544      } else if (Bytes.equals(e.getKey(), HFileInfo.LASTKEY)) {
545        out.println(new KeyValue.KeyOnlyKeyValue(e.getValue()).toString());
546      } else {
547        out.println(Bytes.toStringBinary(e.getValue()));
548      }
549    }
550
551    try {
552      out.println("Mid-key: " + reader.midKey().map(CellUtil::getCellKeyAsString));
553    } catch (Exception e) {
554      out.println("Unable to retrieve the midkey");
555    }
556
557    // Printing general bloom information
558    DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
559    BloomFilter bloomFilter = null;
560    if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
561
562    out.println("Bloom filter:");
563    if (bloomFilter != null) {
564      out.println(FOUR_SPACES
565        + bloomFilter.toString().replaceAll(BloomFilterUtil.STATS_RECORD_SEP, "\n" + FOUR_SPACES));
566    } else {
567      out.println(FOUR_SPACES + "Not present");
568    }
569
570    // Printing delete bloom information
571    bloomMeta = reader.getDeleteBloomFilterMetadata();
572    bloomFilter = null;
573    if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
574
575    out.println("Delete Family Bloom filter:");
576    if (bloomFilter != null) {
577      out.println(FOUR_SPACES
578        + bloomFilter.toString().replaceAll(BloomFilterUtil.STATS_RECORD_SEP, "\n" + FOUR_SPACES));
579    } else {
580      out.println(FOUR_SPACES + "Not present");
581    }
582  }
583
584  // Default reservoir is exponentially decaying, but we're doing a point-in-time analysis
585  // of a store file. It doesn't make sense to prefer keys later in the store file.
586  private static final MetricRegistry.MetricSupplier<Histogram> UNIFORM_RESERVOIR =
587    () -> new Histogram(new UniformReservoir());
588
589  // Useful ranges for viewing distribution of small to large keys, values, and rows.
590  // we only print ranges which actually have values, so more here doesn't add much overhead
591  private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000,
592    50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 };
593
594  /**
595   * Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key
596   * bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram,
597   * and getting counts per range.
598   */
599  static class KeyValueStats {
600    private final Histogram histogram;
601    private final String name;
602    private long max = Long.MIN_VALUE;
603    private long min = Long.MAX_VALUE;
604    private boolean collectRanges = false;
605    private final LongAdder[] rangeCounts;
606
607    KeyValueStats(MetricRegistry metricRegistry, String statName) {
608      this.histogram =
609        metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR);
610      this.name = statName;
611      this.rangeCounts = new LongAdder[RANGES.length];
612      for (int i = 0; i < rangeCounts.length; i++) {
613        rangeCounts[i] = new LongAdder();
614      }
615    }
616
617    void update(long value, boolean collectRanges) {
618      histogram.update(value);
619      min = Math.min(value, min);
620      max = Math.max(value, max);
621
622      if (collectRanges) {
623        this.collectRanges = true;
624        int result = Arrays.binarySearch(RANGES, value);
625        int idx = result >= 0 ? result : Math.abs(result) - 1;
626        rangeCounts[idx].increment();
627      }
628    }
629
630    Histogram getHistogram() {
631      return histogram;
632    }
633
634    String getName() {
635      return name;
636    }
637
638    long getMax() {
639      return max;
640    }
641
642    long getMin() {
643      return min;
644    }
645
646    long[] getRanges() {
647      return RANGES;
648    }
649
650    long getCountAtOrBelow(long range) {
651      long count = 0;
652      for (int i = 0; i < RANGES.length; i++) {
653        if (RANGES[i] <= range) {
654          count += rangeCounts[i].sum();
655        } else {
656          break;
657        }
658      }
659      return count;
660    }
661
662    boolean hasRangeCounts() {
663      return collectRanges;
664    }
665  }
666
667  private static class KeyValueStatsCollector {
668    private final MetricRegistry metricsRegistry = new MetricRegistry();
669    private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream();
670
671    KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length");
672    KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length");
673    KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)");
674    KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)");
675
676    private final SimpleReporter simpleReporter =
677      SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen)
678        .addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build();
679
680    long curRowBytes = 0;
681    long curRowCols = 0;
682
683    byte[] biggestRow = null;
684
685    private Cell prevCell = null;
686    private long maxRowBytes = 0;
687    private long curRowKeyLength;
688
689    public void collect(Cell cell, boolean printStatRanges) {
690      valLen.update(cell.getValueLength(), printStatRanges);
691      if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) {
692        // new row
693        collectRow(printStatRanges);
694      }
695      curRowBytes += cell.getSerializedSize();
696      curRowKeyLength = KeyValueUtil.keyLength(cell);
697      curRowCols++;
698      prevCell = cell;
699    }
700
701    private void collectRow(boolean printStatRanges) {
702      rowSizeBytes.update(curRowBytes, printStatRanges);
703      rowSizeCols.update(curRowCols, printStatRanges);
704      keyLen.update(curRowKeyLength, printStatRanges);
705
706      if (curRowBytes > maxRowBytes && prevCell != null) {
707        biggestRow = CellUtil.cloneRow(prevCell);
708        maxRowBytes = curRowBytes;
709      }
710
711      curRowBytes = 0;
712      curRowCols = 0;
713    }
714
715    public void finish(boolean printStatRanges) {
716      if (curRowCols > 0) {
717        collectRow(printStatRanges);
718      }
719    }
720
721    @Override
722    public String toString() {
723      if (prevCell == null) return "no data available for statistics";
724
725      // Dump the metrics to the output stream
726      simpleReporter.report();
727
728      return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
729    }
730  }
731
732  /**
733   * Simple reporter which collects registered histograms for printing to an output stream in
734   * {@link #report()}.
735   */
736  private static final class SimpleReporter {
737    /**
738     * Returns a new {@link Builder} for {@link SimpleReporter}.
739     * @return a {@link Builder} instance for a {@link SimpleReporter}
740     */
741    public static Builder newBuilder() {
742      return new Builder();
743    }
744
745    /**
746     * A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time
747     * zone, writing to {@code System.out}.
748     */
749    public static class Builder {
750      private final List<KeyValueStats> stats = new ArrayList<>();
751      private PrintStream output;
752      private Locale locale;
753      private TimeZone timeZone;
754
755      private Builder() {
756        this.output = System.out;
757        this.locale = Locale.getDefault();
758        this.timeZone = TimeZone.getDefault();
759      }
760
761      /**
762       * Write to the given {@link PrintStream}.
763       * @param output a {@link PrintStream} instance.
764       * @return {@code this}
765       */
766      public Builder outputTo(PrintStream output) {
767        this.output = output;
768        return this;
769      }
770
771      /**
772       * Add the given {@link KeyValueStats} to be reported
773       * @param stat the stat to be reported
774       * @return {@code this}
775       */
776      public Builder addStats(KeyValueStats stat) {
777        this.stats.add(stat);
778        return this;
779      }
780
781      /**
782       * Builds a {@link ConsoleReporter} with the given properties.
783       * @return a {@link ConsoleReporter}
784       */
785      public SimpleReporter build() {
786        return new SimpleReporter(output, stats, locale, timeZone);
787      }
788    }
789
790    private final PrintStream output;
791    private final List<KeyValueStats> stats;
792    private final Locale locale;
793    private final DateFormat dateFormat;
794
795    private SimpleReporter(PrintStream output, List<KeyValueStats> stats, Locale locale,
796      TimeZone timeZone) {
797      this.output = output;
798      this.stats = stats;
799      this.locale = locale;
800      this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale);
801      dateFormat.setTimeZone(timeZone);
802    }
803
804    public void report() {
805      // we know we only have histograms
806      if (!stats.isEmpty()) {
807        for (KeyValueStats stat : stats) {
808          output.print("   " + stat.getName());
809          output.println(':');
810          printHistogram(stat);
811        }
812        output.println();
813      }
814
815      output.println();
816      output.flush();
817    }
818
819    private void printHistogram(KeyValueStats stats) {
820      Histogram histogram = stats.getHistogram();
821      Snapshot snapshot = histogram.getSnapshot();
822
823      output.printf(locale, "               min = %d%n", stats.getMin());
824      output.printf(locale, "               max = %d%n", stats.getMax());
825      output.printf(locale, "              mean = %2.2f%n", snapshot.getMean());
826      output.printf(locale, "            stddev = %2.2f%n", snapshot.getStdDev());
827      output.printf(locale, "            median = %2.2f%n", snapshot.getMedian());
828      output.printf(locale, "              75%% <= %2.2f%n", snapshot.get75thPercentile());
829      output.printf(locale, "              95%% <= %2.2f%n", snapshot.get95thPercentile());
830      output.printf(locale, "              98%% <= %2.2f%n", snapshot.get98thPercentile());
831      output.printf(locale, "              99%% <= %2.2f%n", snapshot.get99thPercentile());
832      output.printf(locale, "            99.9%% <= %2.2f%n", snapshot.get999thPercentile());
833      output.printf(locale, "             count = %d%n", histogram.getCount());
834
835      // if printStatRanges was enabled with -d arg, below we'll create an approximate histogram
836      // of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <=
837      // 100, etc) will have a count printed if any values were seen in that range. If no values
838      // were seen for a range, that range will be excluded to keep the output small.
839      if (stats.hasRangeCounts()) {
840        output.printf(locale, "           (range <= count):%n");
841        long lastVal = 0;
842        long lastRange = 0;
843        for (long range : stats.getRanges()) {
844          long val = stats.getCountAtOrBelow(range);
845          if (val - lastVal > 0) {
846            // print the last zero value before this one, to give context
847            if (lastVal == 0 && lastRange != 0) {
848              printRangeCount(lastRange, lastVal);
849            }
850            printRangeCount(range, val - lastVal);
851          }
852          lastVal = val;
853          lastRange = range;
854        }
855        if (histogram.getCount() - lastVal > 0) {
856          // print any remaining that might have been outside our buckets
857          printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal);
858        }
859      }
860    }
861
862    private void printRangeCount(long range, long countAtOrBelow) {
863      String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range);
864      output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow);
865    }
866  }
867
868  public static void main(String[] args) throws Exception {
869    Configuration conf = HBaseConfiguration.create();
870    // no need for a block cache
871    conf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
872    int ret = ToolRunner.run(conf, new HFilePrettyPrinter(), args);
873    System.exit(ret);
874  }
875}