001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile;
019
020import static com.codahale.metrics.MetricRegistry.name;
021
022import com.codahale.metrics.ConsoleReporter;
023import com.codahale.metrics.Histogram;
024import com.codahale.metrics.MetricRegistry;
025import com.codahale.metrics.Snapshot;
026import com.codahale.metrics.UniformReservoir;
027import java.io.ByteArrayOutputStream;
028import java.io.DataInput;
029import java.io.IOException;
030import java.io.PrintStream;
031import java.text.DateFormat;
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.HashMap;
035import java.util.Iterator;
036import java.util.LinkedHashSet;
037import java.util.List;
038import java.util.Locale;
039import java.util.Map;
040import java.util.Optional;
041import java.util.Set;
042import java.util.TimeZone;
043import java.util.concurrent.atomic.LongAdder;
044import org.apache.hadoop.conf.Configuration;
045import org.apache.hadoop.conf.Configured;
046import org.apache.hadoop.fs.FileSystem;
047import org.apache.hadoop.fs.Path;
048import org.apache.hadoop.hbase.Cell;
049import org.apache.hadoop.hbase.CellComparator;
050import org.apache.hadoop.hbase.CellUtil;
051import org.apache.hadoop.hbase.HBaseConfiguration;
052import org.apache.hadoop.hbase.HBaseInterfaceAudience;
053import org.apache.hadoop.hbase.HConstants;
054import org.apache.hadoop.hbase.HRegionInfo;
055import org.apache.hadoop.hbase.KeyValue;
056import org.apache.hadoop.hbase.KeyValueUtil;
057import org.apache.hadoop.hbase.PrivateCellUtil;
058import org.apache.hadoop.hbase.TableName;
059import org.apache.hadoop.hbase.Tag;
060import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
061import org.apache.hadoop.hbase.mob.MobUtils;
062import org.apache.hadoop.hbase.regionserver.HStoreFile;
063import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
064import org.apache.hadoop.hbase.util.BloomFilter;
065import org.apache.hadoop.hbase.util.BloomFilterFactory;
066import org.apache.hadoop.hbase.util.BloomFilterUtil;
067import org.apache.hadoop.hbase.util.Bytes;
068import org.apache.hadoop.hbase.util.CommonFSUtils;
069import org.apache.hadoop.hbase.util.HFileArchiveUtil;
070import org.apache.hadoop.util.Tool;
071import org.apache.hadoop.util.ToolRunner;
072import org.apache.yetus.audience.InterfaceAudience;
073import org.apache.yetus.audience.InterfaceStability;
074import org.slf4j.Logger;
075import org.slf4j.LoggerFactory;
076
077import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;
078import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser;
079import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter;
080import org.apache.hbase.thirdparty.org.apache.commons.cli.Option;
081import org.apache.hbase.thirdparty.org.apache.commons.cli.OptionGroup;
082import org.apache.hbase.thirdparty.org.apache.commons.cli.Options;
083import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException;
084import org.apache.hbase.thirdparty.org.apache.commons.cli.PosixParser;
085
086/**
087 * Implements pretty-printing functionality for {@link HFile}s.
088 */
089@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
090@InterfaceStability.Evolving
091public class HFilePrettyPrinter extends Configured implements Tool {
092
093  private static final Logger LOG = LoggerFactory.getLogger(HFilePrettyPrinter.class);
094
095  private Options options = new Options();
096
097  private boolean verbose;
098  private boolean printValue;
099  private boolean printKey;
100  private boolean shouldPrintMeta;
101  private boolean printBlockIndex;
102  private boolean printBlockHeaders;
103  private boolean printStats;
104  private boolean printStatRanges;
105  private boolean checkRow;
106  private boolean checkFamily;
107  private boolean isSeekToRow = false;
108  private boolean checkMobIntegrity = false;
109  private Map<String, List<Path>> mobFileLocations;
110  private static final int FOUND_MOB_FILES_CACHE_CAPACITY = 50;
111  private static final int MISSING_MOB_FILES_CACHE_CAPACITY = 20;
112  private PrintStream out = System.out;
113  private PrintStream err = System.err;
114
115  /**
116   * The row which the user wants to specify and print all the KeyValues for.
117   */
118  private byte[] row = null;
119
120  private List<Path> files = new ArrayList<>();
121  private int count;
122
123  private static final String FOUR_SPACES = "    ";
124
125  public HFilePrettyPrinter() {
126    super();
127    init();
128  }
129
130  public HFilePrettyPrinter(Configuration conf) {
131    super(conf);
132    init();
133  }
134
135  private void init() {
136    options.addOption("v", "verbose", false, "Verbose output; emits file and meta data delimiters");
137    options.addOption("p", "printkv", false, "Print key/value pairs");
138    options.addOption("e", "printkey", false, "Print keys");
139    options.addOption("m", "printmeta", false, "Print meta data of file");
140    options.addOption("b", "printblocks", false, "Print block index meta data");
141    options.addOption("h", "printblockheaders", false, "Print block headers for each block.");
142    options.addOption("k", "checkrow", false,
143      "Enable row order check; looks for out-of-order keys");
144    options.addOption("a", "checkfamily", false, "Enable family check");
145    options.addOption("w", "seekToRow", true,
146      "Seek to this row and print all the kvs for this row only");
147    options.addOption("s", "stats", false, "Print statistics");
148    options.addOption("d", "details", false,
149      "Print detailed statistics, including counts by range");
150    options.addOption("i", "checkMobIntegrity", false,
151      "Print all cells whose mob files are missing");
152
153    OptionGroup files = new OptionGroup();
154    files.addOption(new Option("f", "file", true,
155      "File to scan. Pass full-path; e.g. hdfs://a:9000/hbase/hbase:meta/12/34"));
156    files.addOption(
157      new Option("r", "region", true, "Region to scan. Pass region name; e.g. 'hbase:meta,,1'"));
158    options.addOptionGroup(files);
159  }
160
161  public void setPrintStreams(PrintStream out, PrintStream err) {
162    this.out = out;
163    this.err = err;
164  }
165
166  public boolean parseOptions(String args[]) throws ParseException, IOException {
167    if (args.length == 0) {
168      HelpFormatter formatter = new HelpFormatter();
169      formatter.printHelp("hfile", options, true);
170      return false;
171    }
172    CommandLineParser parser = new PosixParser();
173    CommandLine cmd = parser.parse(options, args);
174
175    verbose = cmd.hasOption("v");
176    printValue = cmd.hasOption("p");
177    printKey = cmd.hasOption("e") || printValue;
178    shouldPrintMeta = cmd.hasOption("m");
179    printBlockIndex = cmd.hasOption("b");
180    printBlockHeaders = cmd.hasOption("h");
181    printStatRanges = cmd.hasOption("d");
182    printStats = cmd.hasOption("s") || printStatRanges;
183    checkRow = cmd.hasOption("k");
184    checkFamily = cmd.hasOption("a");
185    checkMobIntegrity = cmd.hasOption("i");
186
187    if (cmd.hasOption("f")) {
188      files.add(new Path(cmd.getOptionValue("f")));
189    }
190
191    if (cmd.hasOption("w")) {
192      String key = cmd.getOptionValue("w");
193      if (key != null && key.length() != 0) {
194        row = Bytes.toBytesBinary(key);
195        isSeekToRow = true;
196      } else {
197        err.println("Invalid row is specified.");
198        System.exit(-1);
199      }
200    }
201
202    if (cmd.hasOption("r")) {
203      String regionName = cmd.getOptionValue("r");
204      byte[] rn = Bytes.toBytes(regionName);
205      byte[][] hri = HRegionInfo.parseRegionName(rn);
206      Path rootDir = CommonFSUtils.getRootDir(getConf());
207      Path tableDir = CommonFSUtils.getTableDir(rootDir, TableName.valueOf(hri[0]));
208      String enc = HRegionInfo.encodeRegionName(rn);
209      Path regionDir = new Path(tableDir, enc);
210      if (verbose) out.println("region dir -> " + regionDir);
211      List<Path> regionFiles = HFile.getStoreFiles(FileSystem.get(getConf()), regionDir);
212      if (verbose) out.println("Number of region files found -> " + regionFiles.size());
213      if (verbose) {
214        int i = 1;
215        for (Path p : regionFiles) {
216          if (verbose) out.println("Found file[" + i++ + "] -> " + p);
217        }
218      }
219      files.addAll(regionFiles);
220    }
221
222    if (checkMobIntegrity) {
223      if (verbose) {
224        System.out.println("checkMobIntegrity is enabled");
225      }
226      mobFileLocations = new HashMap<>();
227    }
228
229    cmd.getArgList().forEach((file) -> files.add(new Path(file)));
230
231    return true;
232  }
233
234  /**
235   * Runs the command-line pretty-printer, and returns the desired command exit code (zero for
236   * success, non-zero for failure).
237   */
238  @Override
239  public int run(String[] args) {
240    if (getConf() == null) {
241      throw new RuntimeException("A Configuration instance must be provided.");
242    }
243    try {
244      CommonFSUtils.setFsDefault(getConf(), CommonFSUtils.getRootDir(getConf()));
245      if (!parseOptions(args)) {
246        return 1;
247      }
248    } catch (IOException ex) {
249      LOG.error("Error parsing command-line options", ex);
250      return 1;
251    } catch (ParseException ex) {
252      LOG.error("Error parsing command-line options", ex);
253      return 1;
254    }
255
256    // iterate over all files found
257    for (Path fileName : files) {
258      try {
259        int exitCode = processFile(fileName, false);
260        if (exitCode != 0) {
261          return exitCode;
262        }
263      } catch (IOException ex) {
264        LOG.error("Error reading " + fileName, ex);
265        return -2;
266      }
267    }
268
269    if (verbose || printKey) {
270      out.println("Scanned kv count -> " + count);
271    }
272
273    return 0;
274  }
275
276  // HBASE-22561 introduces boolean checkRootDir for WebUI specificly
277  public int processFile(Path file, boolean checkRootDir) throws IOException {
278    if (verbose) {
279      out.println("Scanning -> " + file);
280    }
281
282    if (checkRootDir) {
283      Path rootPath = CommonFSUtils.getRootDir(getConf());
284      String rootString = rootPath + Path.SEPARATOR;
285      if (!file.toString().startsWith(rootString)) {
286        // First we see if fully-qualified URI matches the root dir. It might
287        // also be an absolute path in the same filesystem, so we prepend the FS
288        // of the root dir and see if that fully-qualified URI matches.
289        FileSystem rootFS = rootPath.getFileSystem(getConf());
290        String qualifiedFile = rootFS.getUri().toString() + file.toString();
291        if (!qualifiedFile.startsWith(rootString)) {
292          err.println(
293            "ERROR, file (" + file + ") is not in HBase's root directory (" + rootString + ")");
294          return -2;
295        }
296      }
297    }
298
299    FileSystem fs = file.getFileSystem(getConf());
300    if (!fs.exists(file)) {
301      err.println("ERROR, file doesnt exist: " + file);
302      return -2;
303    }
304
305    HFile.Reader reader = HFile.createReader(fs, file, CacheConfig.DISABLED, true, getConf());
306
307    Map<byte[], byte[]> fileInfo = reader.getHFileInfo();
308
309    KeyValueStatsCollector fileStats = null;
310
311    if (verbose || printKey || checkRow || checkFamily || printStats || checkMobIntegrity) {
312      // scan over file and read key/value's and check if requested
313      HFileScanner scanner = reader.getScanner(getConf(), false, false, false);
314      fileStats = new KeyValueStatsCollector();
315      boolean shouldScanKeysValues;
316      if (this.isSeekToRow && !Bytes.equals(row, reader.getFirstRowKey().orElse(null))) {
317        // seek to the first kv on this row
318        shouldScanKeysValues = (scanner.seekTo(PrivateCellUtil.createFirstOnRow(this.row)) != -1);
319      } else {
320        shouldScanKeysValues = scanner.seekTo();
321      }
322      if (shouldScanKeysValues) {
323        scanKeysValues(file, fileStats, scanner, row);
324      }
325    }
326
327    // print meta data
328    if (shouldPrintMeta) {
329      printMeta(reader, fileInfo);
330    }
331
332    if (printBlockIndex) {
333      out.println("Block Index:");
334      out.println(reader.getDataBlockIndexReader());
335    }
336
337    if (printBlockHeaders) {
338      out.println("Block Headers:");
339      /*
340       * TODO: this same/similar block iteration logic is used in HFileBlock#blockRange and
341       * TestLazyDataBlockDecompression. Refactor?
342       */
343      FSDataInputStreamWrapper fsdis = new FSDataInputStreamWrapper(fs, file);
344      long fileSize = fs.getFileStatus(file).getLen();
345      FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis.getStream(false), fileSize);
346      long offset = trailer.getFirstDataBlockOffset(), max = trailer.getLastDataBlockOffset();
347      HFileBlock block;
348      while (offset <= max) {
349        block = reader.readBlock(offset, -1, /* cacheBlock */ false, /* pread */ false,
350          /* isCompaction */ false, /* updateCacheMetrics */ false, null, null);
351        offset += block.getOnDiskSizeWithHeader();
352        out.println(block);
353      }
354    }
355
356    if (printStats) {
357      fileStats.finish(printStatRanges);
358      out.println("Stats:\n" + fileStats);
359    }
360
361    reader.close();
362    return 0;
363  }
364
365  private void scanKeysValues(Path file, KeyValueStatsCollector fileStats, HFileScanner scanner,
366    byte[] row) throws IOException {
367    Cell pCell = null;
368    FileSystem fs = FileSystem.get(getConf());
369    Set<String> foundMobFiles = new LinkedHashSet<>(FOUND_MOB_FILES_CACHE_CAPACITY);
370    Set<String> missingMobFiles = new LinkedHashSet<>(MISSING_MOB_FILES_CACHE_CAPACITY);
371    do {
372      Cell cell = scanner.getCell();
373      if (row != null && row.length != 0) {
374        int result = CellComparator.getInstance().compareRows(cell, row, 0, row.length);
375        if (result > 0) {
376          break;
377        } else if (result < 0) {
378          continue;
379        }
380      }
381      // collect stats
382      if (printStats) {
383        fileStats.collect(cell, printStatRanges);
384      }
385      // dump key value
386      if (printKey) {
387        out.print("K: " + cell);
388        if (printValue) {
389          out.print(" V: " + Bytes.toStringBinary(cell.getValueArray(), cell.getValueOffset(),
390            cell.getValueLength()));
391          int i = 0;
392          List<Tag> tags = PrivateCellUtil.getTags(cell);
393          for (Tag tag : tags) {
394            out.print(String.format(" T[%d]: %s", i++, tag.toString()));
395          }
396        }
397        out.println();
398      }
399      // check if rows are in order
400      if (checkRow && pCell != null) {
401        if (CellComparator.getInstance().compareRows(pCell, cell) > 0) {
402          err.println("WARNING, previous row is greater then" + " current row\n\tfilename -> "
403            + file + "\n\tprevious -> " + CellUtil.getCellKeyAsString(pCell) + "\n\tcurrent  -> "
404            + CellUtil.getCellKeyAsString(cell));
405        }
406      }
407      // check if families are consistent
408      if (checkFamily) {
409        String fam =
410          Bytes.toString(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength());
411        if (!file.toString().contains(fam)) {
412          err.println("WARNING, filename does not match kv family," + "\n\tfilename -> " + file
413            + "\n\tkeyvalue -> " + CellUtil.getCellKeyAsString(cell));
414        }
415        if (pCell != null && CellComparator.getInstance().compareFamilies(pCell, cell) != 0) {
416          err.println(
417            "WARNING, previous kv has different family" + " compared to current key\n\tfilename -> "
418              + file + "\n\tprevious -> " + CellUtil.getCellKeyAsString(pCell) + "\n\tcurrent  -> "
419              + CellUtil.getCellKeyAsString(cell));
420        }
421      }
422      // check if mob files are missing.
423      if (checkMobIntegrity && MobUtils.isMobReferenceCell(cell)) {
424        Optional<TableName> tn = MobUtils.getTableName(cell);
425        if (!tn.isPresent()) {
426          System.err.println(
427            "ERROR, wrong tag format in mob reference cell " + CellUtil.getCellKeyAsString(cell));
428        } else if (!MobUtils.hasValidMobRefCellValue(cell)) {
429          System.err.println(
430            "ERROR, wrong value format in mob reference cell " + CellUtil.getCellKeyAsString(cell));
431        } else {
432          String mobFileName = MobUtils.getMobFileName(cell);
433          boolean exist = mobFileExists(fs, tn.get(), mobFileName,
434            Bytes.toString(CellUtil.cloneFamily(cell)), foundMobFiles, missingMobFiles);
435          if (!exist) {
436            // report error
437            System.err.println("ERROR, the mob file [" + mobFileName
438              + "] is missing referenced by cell " + CellUtil.getCellKeyAsString(cell));
439          }
440        }
441      }
442      pCell = cell;
443      ++count;
444    } while (scanner.next());
445  }
446
447  /**
448   * Checks whether the referenced mob file exists.
449   */
450  private boolean mobFileExists(FileSystem fs, TableName tn, String mobFileName, String family,
451    Set<String> foundMobFiles, Set<String> missingMobFiles) throws IOException {
452    if (foundMobFiles.contains(mobFileName)) {
453      return true;
454    }
455    if (missingMobFiles.contains(mobFileName)) {
456      return false;
457    }
458    String tableName = tn.getNameAsString();
459    List<Path> locations = mobFileLocations.get(tableName);
460    if (locations == null) {
461      locations = new ArrayList<>(2);
462      locations.add(MobUtils.getMobFamilyPath(getConf(), tn, family));
463      locations.add(HFileArchiveUtil.getStoreArchivePath(getConf(), tn,
464        MobUtils.getMobRegionInfo(tn).getEncodedName(), family));
465      mobFileLocations.put(tn.getNameAsString(), locations);
466    }
467    boolean exist = false;
468    for (Path location : locations) {
469      Path mobFilePath = new Path(location, mobFileName);
470      if (fs.exists(mobFilePath)) {
471        exist = true;
472        break;
473      }
474    }
475    if (exist) {
476      evictMobFilesIfNecessary(foundMobFiles, FOUND_MOB_FILES_CACHE_CAPACITY);
477      foundMobFiles.add(mobFileName);
478    } else {
479      evictMobFilesIfNecessary(missingMobFiles, MISSING_MOB_FILES_CACHE_CAPACITY);
480      missingMobFiles.add(mobFileName);
481    }
482    return exist;
483  }
484
485  /**
486   * Evicts the cached mob files if the set is larger than the limit.
487   */
488  private void evictMobFilesIfNecessary(Set<String> mobFileNames, int limit) {
489    if (mobFileNames.size() < limit) {
490      return;
491    }
492    int index = 0;
493    int evict = limit / 2;
494    Iterator<String> fileNamesItr = mobFileNames.iterator();
495    while (index < evict && fileNamesItr.hasNext()) {
496      fileNamesItr.next();
497      fileNamesItr.remove();
498      index++;
499    }
500  }
501
502  /**
503   * Format a string of the form "k1=v1, k2=v2, ..." into separate lines with a four-space
504   * indentation.
505   */
506  private static String asSeparateLines(String keyValueStr) {
507    return keyValueStr.replaceAll(", ([a-zA-Z]+=)", ",\n" + FOUR_SPACES + "$1");
508  }
509
510  private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo) throws IOException {
511    out.println("Block index size as per heapsize: " + reader.indexSize());
512    out.println(asSeparateLines(reader.toString()));
513    out.println("Trailer:\n    " + asSeparateLines(reader.getTrailer().toString()));
514    out.println("Fileinfo:");
515    for (Map.Entry<byte[], byte[]> e : fileInfo.entrySet()) {
516      out.print(FOUR_SPACES + Bytes.toString(e.getKey()) + " = ");
517      if (
518        Bytes.equals(e.getKey(), HStoreFile.MAX_SEQ_ID_KEY)
519          || Bytes.equals(e.getKey(), HStoreFile.DELETE_FAMILY_COUNT)
520          || Bytes.equals(e.getKey(), HStoreFile.EARLIEST_PUT_TS)
521          || Bytes.equals(e.getKey(), HFileWriterImpl.MAX_MEMSTORE_TS_KEY)
522          || Bytes.equals(e.getKey(), HFileInfo.CREATE_TIME_TS)
523          || Bytes.equals(e.getKey(), HStoreFile.BULKLOAD_TIME_KEY)
524      ) {
525        out.println(Bytes.toLong(e.getValue()));
526      } else if (Bytes.equals(e.getKey(), HStoreFile.TIMERANGE_KEY)) {
527        TimeRangeTracker timeRangeTracker = TimeRangeTracker.parseFrom(e.getValue());
528        out.println(timeRangeTracker.getMin() + "...." + timeRangeTracker.getMax());
529      } else if (
530        Bytes.equals(e.getKey(), HFileInfo.AVG_KEY_LEN)
531          || Bytes.equals(e.getKey(), HFileInfo.AVG_VALUE_LEN)
532          || Bytes.equals(e.getKey(), HFileWriterImpl.KEY_VALUE_VERSION)
533          || Bytes.equals(e.getKey(), HFileInfo.MAX_TAGS_LEN)
534      ) {
535        out.println(Bytes.toInt(e.getValue()));
536      } else if (
537        Bytes.equals(e.getKey(), HStoreFile.MAJOR_COMPACTION_KEY)
538          || Bytes.equals(e.getKey(), HFileInfo.TAGS_COMPRESSED)
539          || Bytes.equals(e.getKey(), HStoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY)
540      ) {
541        out.println(Bytes.toBoolean(e.getValue()));
542      } else if (Bytes.equals(e.getKey(), HFileInfo.LASTKEY)) {
543        out.println(new KeyValue.KeyOnlyKeyValue(e.getValue()).toString());
544      } else {
545        out.println(Bytes.toStringBinary(e.getValue()));
546      }
547    }
548
549    try {
550      out.println("Mid-key: " + reader.midKey().map(CellUtil::getCellKeyAsString));
551    } catch (Exception e) {
552      out.println("Unable to retrieve the midkey");
553    }
554
555    // Printing general bloom information
556    DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
557    BloomFilter bloomFilter = null;
558    if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
559
560    out.println("Bloom filter:");
561    if (bloomFilter != null) {
562      out.println(FOUR_SPACES
563        + bloomFilter.toString().replaceAll(BloomFilterUtil.STATS_RECORD_SEP, "\n" + FOUR_SPACES));
564    } else {
565      out.println(FOUR_SPACES + "Not present");
566    }
567
568    // Printing delete bloom information
569    bloomMeta = reader.getDeleteBloomFilterMetadata();
570    bloomFilter = null;
571    if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
572
573    out.println("Delete Family Bloom filter:");
574    if (bloomFilter != null) {
575      out.println(FOUR_SPACES
576        + bloomFilter.toString().replaceAll(BloomFilterUtil.STATS_RECORD_SEP, "\n" + FOUR_SPACES));
577    } else {
578      out.println(FOUR_SPACES + "Not present");
579    }
580  }
581
582  // Default reservoir is exponentially decaying, but we're doing a point-in-time analysis
583  // of a store file. It doesn't make sense to prefer keys later in the store file.
584  private static final MetricRegistry.MetricSupplier<Histogram> UNIFORM_RESERVOIR =
585    () -> new Histogram(new UniformReservoir());
586
587  // Useful ranges for viewing distribution of small to large keys, values, and rows.
588  // we only print ranges which actually have values, so more here doesn't add much overhead
589  private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000,
590    50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 };
591
592  /**
593   * Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key
594   * bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram,
595   * and getting counts per range.
596   */
597  static class KeyValueStats {
598    private final Histogram histogram;
599    private final String name;
600    private long max = Long.MIN_VALUE;
601    private long min = Long.MAX_VALUE;
602    private boolean collectRanges = false;
603    private final LongAdder[] rangeCounts;
604
605    KeyValueStats(MetricRegistry metricRegistry, String statName) {
606      this.histogram =
607        metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR);
608      this.name = statName;
609      this.rangeCounts = new LongAdder[RANGES.length];
610      for (int i = 0; i < rangeCounts.length; i++) {
611        rangeCounts[i] = new LongAdder();
612      }
613    }
614
615    void update(long value, boolean collectRanges) {
616      histogram.update(value);
617      min = Math.min(value, min);
618      max = Math.max(value, max);
619
620      if (collectRanges) {
621        this.collectRanges = true;
622        int result = Arrays.binarySearch(RANGES, value);
623        int idx = result >= 0 ? result : Math.abs(result) - 1;
624        rangeCounts[idx].increment();
625      }
626    }
627
628    Histogram getHistogram() {
629      return histogram;
630    }
631
632    String getName() {
633      return name;
634    }
635
636    long getMax() {
637      return max;
638    }
639
640    long getMin() {
641      return min;
642    }
643
644    long[] getRanges() {
645      return RANGES;
646    }
647
648    long getCountAtOrBelow(long range) {
649      long count = 0;
650      for (int i = 0; i < RANGES.length; i++) {
651        if (RANGES[i] <= range) {
652          count += rangeCounts[i].sum();
653        } else {
654          break;
655        }
656      }
657      return count;
658    }
659
660    boolean hasRangeCounts() {
661      return collectRanges;
662    }
663  }
664
665  private static class KeyValueStatsCollector {
666    private final MetricRegistry metricsRegistry = new MetricRegistry();
667    private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream();
668
669    KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length");
670    KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length");
671    KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)");
672    KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)");
673
674    private final SimpleReporter simpleReporter =
675      SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen)
676        .addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build();
677
678    long curRowBytes = 0;
679    long curRowCols = 0;
680
681    byte[] biggestRow = null;
682
683    private Cell prevCell = null;
684    private long maxRowBytes = 0;
685    private long curRowKeyLength;
686
687    public void collect(Cell cell, boolean printStatRanges) {
688      valLen.update(cell.getValueLength(), printStatRanges);
689      if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) {
690        // new row
691        collectRow(printStatRanges);
692      }
693      curRowBytes += cell.getSerializedSize();
694      curRowKeyLength = KeyValueUtil.keyLength(cell);
695      curRowCols++;
696      prevCell = cell;
697    }
698
699    private void collectRow(boolean printStatRanges) {
700      rowSizeBytes.update(curRowBytes, printStatRanges);
701      rowSizeCols.update(curRowCols, printStatRanges);
702      keyLen.update(curRowKeyLength, printStatRanges);
703
704      if (curRowBytes > maxRowBytes && prevCell != null) {
705        biggestRow = CellUtil.cloneRow(prevCell);
706        maxRowBytes = curRowBytes;
707      }
708
709      curRowBytes = 0;
710      curRowCols = 0;
711    }
712
713    public void finish(boolean printStatRanges) {
714      if (curRowCols > 0) {
715        collectRow(printStatRanges);
716      }
717    }
718
719    @Override
720    public String toString() {
721      if (prevCell == null) return "no data available for statistics";
722
723      // Dump the metrics to the output stream
724      simpleReporter.report();
725
726      return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
727    }
728  }
729
730  /**
731   * Simple reporter which collects registered histograms for printing to an output stream in
732   * {@link #report()}.
733   */
734  private static final class SimpleReporter {
735    /**
736     * Returns a new {@link Builder} for {@link SimpleReporter}.
737     * @return a {@link Builder} instance for a {@link SimpleReporter}
738     */
739    public static Builder newBuilder() {
740      return new Builder();
741    }
742
743    /**
744     * A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time
745     * zone, writing to {@code System.out}.
746     */
747    public static class Builder {
748      private final List<KeyValueStats> stats = new ArrayList<>();
749      private PrintStream output;
750      private Locale locale;
751      private TimeZone timeZone;
752
753      private Builder() {
754        this.output = System.out;
755        this.locale = Locale.getDefault();
756        this.timeZone = TimeZone.getDefault();
757      }
758
759      /**
760       * Write to the given {@link PrintStream}.
761       * @param output a {@link PrintStream} instance.
762       * @return {@code this}
763       */
764      public Builder outputTo(PrintStream output) {
765        this.output = output;
766        return this;
767      }
768
769      /**
770       * Add the given {@link KeyValueStats} to be reported
771       * @param stat the stat to be reported
772       * @return {@code this}
773       */
774      public Builder addStats(KeyValueStats stat) {
775        this.stats.add(stat);
776        return this;
777      }
778
779      /**
780       * Builds a {@link ConsoleReporter} with the given properties.
781       * @return a {@link ConsoleReporter}
782       */
783      public SimpleReporter build() {
784        return new SimpleReporter(output, stats, locale, timeZone);
785      }
786    }
787
788    private final PrintStream output;
789    private final List<KeyValueStats> stats;
790    private final Locale locale;
791    private final DateFormat dateFormat;
792
793    private SimpleReporter(PrintStream output, List<KeyValueStats> stats, Locale locale,
794      TimeZone timeZone) {
795      this.output = output;
796      this.stats = stats;
797      this.locale = locale;
798      this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale);
799      dateFormat.setTimeZone(timeZone);
800    }
801
802    public void report() {
803      // we know we only have histograms
804      if (!stats.isEmpty()) {
805        for (KeyValueStats stat : stats) {
806          output.print("   " + stat.getName());
807          output.println(':');
808          printHistogram(stat);
809        }
810        output.println();
811      }
812
813      output.println();
814      output.flush();
815    }
816
817    private void printHistogram(KeyValueStats stats) {
818      Histogram histogram = stats.getHistogram();
819      Snapshot snapshot = histogram.getSnapshot();
820
821      output.printf(locale, "               min = %d%n", stats.getMin());
822      output.printf(locale, "               max = %d%n", stats.getMax());
823      output.printf(locale, "              mean = %2.2f%n", snapshot.getMean());
824      output.printf(locale, "            stddev = %2.2f%n", snapshot.getStdDev());
825      output.printf(locale, "            median = %2.2f%n", snapshot.getMedian());
826      output.printf(locale, "              75%% <= %2.2f%n", snapshot.get75thPercentile());
827      output.printf(locale, "              95%% <= %2.2f%n", snapshot.get95thPercentile());
828      output.printf(locale, "              98%% <= %2.2f%n", snapshot.get98thPercentile());
829      output.printf(locale, "              99%% <= %2.2f%n", snapshot.get99thPercentile());
830      output.printf(locale, "            99.9%% <= %2.2f%n", snapshot.get999thPercentile());
831      output.printf(locale, "             count = %d%n", histogram.getCount());
832
833      // if printStatRanges was enabled with -d arg, below we'll create an approximate histogram
834      // of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <=
835      // 100, etc) will have a count printed if any values were seen in that range. If no values
836      // were seen for a range, that range will be excluded to keep the output small.
837      if (stats.hasRangeCounts()) {
838        output.printf(locale, "           (range <= count):%n");
839        long lastVal = 0;
840        long lastRange = 0;
841        for (long range : stats.getRanges()) {
842          long val = stats.getCountAtOrBelow(range);
843          if (val - lastVal > 0) {
844            // print the last zero value before this one, to give context
845            if (lastVal == 0 && lastRange != 0) {
846              printRangeCount(lastRange, lastVal);
847            }
848            printRangeCount(range, val - lastVal);
849          }
850          lastVal = val;
851          lastRange = range;
852        }
853        if (histogram.getCount() - lastVal > 0) {
854          // print any remaining that might have been outside our buckets
855          printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal);
856        }
857      }
858    }
859
860    private void printRangeCount(long range, long countAtOrBelow) {
861      String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range);
862      output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow);
863    }
864  }
865
866  public static void main(String[] args) throws Exception {
867    Configuration conf = HBaseConfiguration.create();
868    // no need for a block cache
869    conf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
870    int ret = ToolRunner.run(conf, new HFilePrettyPrinter(), args);
871    System.exit(ret);
872  }
873}