/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import static org.apache.hadoop.hbase.io.hfile.HFileBlock.FILL_HEADER;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.metrics.impl.FastLongHistogram;
import org.apache.hadoop.hbase.nio.ByteBuff;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChecksumType;
import org.apache.hadoop.hbase.util.GsonUtil;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.gson.Gson;
import org.apache.hbase.thirdparty.com.google.gson.TypeAdapter;
import org.apache.hbase.thirdparty.com.google.gson.stream.JsonReader;
import org.apache.hbase.thirdparty.com.google.gson.stream.JsonWriter;

/**
 * Utility for aggregating counts in CachedBlocks and toString/toJSON CachedBlocks and BlockCaches.
 * No attempt has been made at making this thread safe.
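 * <p>
 * A minimal usage sketch (the {@code conf} and {@code blockCache} references are assumed to be
 * supplied by the caller):
 *
 * <pre>{@code
 * BlockCacheUtil.CachedBlocksByFile cbsbf =
 *   BlockCacheUtil.getLoadedCachedBlocksByFile(conf, blockCache);
 * String json = BlockCacheUtil.toJSON(cbsbf);
 * }</pre>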
 */
@InterfaceAudience.Private
public class BlockCacheUtil {

  private static final Logger LOG = LoggerFactory.getLogger(BlockCacheUtil.class);

  public static final long NANOS_PER_SECOND = 1000000000;

  /**
   * Needed for generating JSON.
   */
  private static final Gson GSON = GsonUtil.createGson()
    .registerTypeAdapter(FastLongHistogram.class, new TypeAdapter<FastLongHistogram>() {

      @Override
      public void write(JsonWriter out, FastLongHistogram value) throws IOException {
        AgeSnapshot snapshot = new AgeSnapshot(value);
        out.beginObject();
        out.name("mean").value(snapshot.getMean());
        out.name("min").value(snapshot.getMin());
        out.name("max").value(snapshot.getMax());
        out.name("75thPercentile").value(snapshot.get75thPercentile());
        out.name("95thPercentile").value(snapshot.get95thPercentile());
        out.name("98thPercentile").value(snapshot.get98thPercentile());
        out.name("99thPercentile").value(snapshot.get99thPercentile());
        out.name("999thPercentile").value(snapshot.get999thPercentile());
        out.endObject();
      }

      @Override
      public FastLongHistogram read(JsonReader in) throws IOException {
        throw new UnsupportedOperationException();
      }
    }).setPrettyPrinting().create();

  /** Returns the block content as a String. */
  public static String toString(final CachedBlock cb, final long now) {
    return "filename=" + cb.getFilename() + ", " + toStringMinusFileName(cb, now);
  }

  /**
   * Little data structure to hold counts for a file. Used when doing a toJSON.
   */
  static class CachedBlockCountsPerFile {
    private int count = 0;
    private long size = 0;
    private int countData = 0;
    private long sizeData = 0;
    private final String filename;

    CachedBlockCountsPerFile(final String filename) {
      this.filename = filename;
    }

    public int getCount() {
      return count;
    }

    public long getSize() {
      return size;
    }

    public int getCountData() {
      return countData;
    }

    public long getSizeData() {
      return sizeData;
    }

    public String getFilename() {
      return filename;
    }
  }

  /** Returns a JSON String of <code>filename</code> and counts of <code>blocks</code>. */
  public static String toJSON(String filename, NavigableSet<CachedBlock> blocks)
    throws IOException {
    CachedBlockCountsPerFile counts = new CachedBlockCountsPerFile(filename);
    for (CachedBlock cb : blocks) {
      counts.count++;
      counts.size += cb.getSize();
      BlockType bt = cb.getBlockType();
      if (bt != null && bt.isData()) {
        counts.countData++;
        counts.sizeData += cb.getSize();
      }
    }
    return GSON.toJson(counts);
  }

  /** Returns a JSON string of <code>cbsbf</code> aggregated. */
  public static String toJSON(CachedBlocksByFile cbsbf) throws IOException {
    return GSON.toJson(cbsbf);
  }

  /** Returns a JSON string of <code>bc</code> content. */
  public static String toJSON(BlockCache bc) throws IOException {
    return GSON.toJson(bc);
  }

  /** Returns the block content of <code>cb</code> as a String, minus the filename. */
  public static String toStringMinusFileName(final CachedBlock cb, final long now) {
    return "offset=" + cb.getOffset() + ", size=" + cb.getSize() + ", age="
      + (now - cb.getCachedTime()) + ", type=" + cb.getBlockType() + ", priority="
      + cb.getBlockPriority();
  }

  /**
   * Get a {@link CachedBlocksByFile} instance and load it up by iterating content in
   * {@link BlockCache}.
   * @param conf Used to read configurations
   * @param bc   Block Cache to iterate.
   * @return Loaded up instance of CachedBlocksByFile
   */
  public static CachedBlocksByFile getLoadedCachedBlocksByFile(final Configuration conf,
    final BlockCache bc) {
    CachedBlocksByFile cbsbf = new CachedBlocksByFile(conf);
    for (CachedBlock cb : bc) {
      if (cbsbf.update(cb)) break;
    }
    return cbsbf;
  }

  private static int compareCacheBlock(Cacheable left, Cacheable right,
    boolean includeNextBlockMetadata) {
    ByteBuffer l = ByteBuffer.allocate(left.getSerializedLength());
    left.serialize(l, includeNextBlockMetadata);
    ByteBuffer r = ByteBuffer.allocate(right.getSerializedLength());
    right.serialize(r, includeNextBlockMetadata);
    return Bytes.compareTo(l.array(), l.arrayOffset(), l.limit(), r.array(), r.arrayOffset(),
      r.limit());
  }

  /**
   * Validate that the existing block and newBlock are the same without including the
   * nextBlockMetadata; if not, throw an exception. If they are the same without the
   * nextBlockMetadata, return the comparison.
   * @param existing block that is existing in the cache.
   * @param newBlock block that is trying to be cached.
   * @param cacheKey the cache key of the blocks.
   * @return comparison of the existing block to the newBlock.
   */
  public static int validateBlockAddition(Cacheable existing, Cacheable newBlock,
    BlockCacheKey cacheKey) {
    int comparison = compareCacheBlock(existing, newBlock, false);
    if (comparison != 0) {
      throw new RuntimeException(
194        "Cached block contents differ, which should not have happened." + "cacheKey:" + cacheKey);
    }
    if ((existing instanceof HFileBlock) && (newBlock instanceof HFileBlock)) {
      comparison = ((HFileBlock) existing).getNextBlockOnDiskSize()
        - ((HFileBlock) newBlock).getNextBlockOnDiskSize();
    }
    return comparison;
  }

  /**
   * Because of region splitting, it's possible that the split key is located in the middle of a
   * block, so both daughter regions may load the same block from their parent HFile. When doing a
   * pread, we don't force the read to include all of the next block's header. So when two threads
   * try to cache the same block, it's possible that one thread read all of the next block header
   * but the other one didn't. If the already cached block doesn't have the next block header but
   * the new block to cache does, then we can replace the existing block with the new block for
   * better performance. (HBASE-20447)
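   * <p>
   * A minimal usage sketch (the {@code cache}, {@code cacheKey} and {@code newBlock} references
   * are assumed to be supplied by the caller):
   *
   * <pre>{@code
   * if (BlockCacheUtil.shouldReplaceExistingCacheBlock(cache, cacheKey, newBlock)) {
   *   cache.cacheBlock(cacheKey, newBlock);
   * }
   * }</pre>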
   * @param blockCache BlockCache to check
   * @param cacheKey   the block cache key
   * @param newBlock   the new block which is trying to be put into the block cache.
   * @return true if the existing block should be replaced by the new block for the same block
   *         cache key; false if the existing block should be kept.
   */
  public static boolean shouldReplaceExistingCacheBlock(BlockCache blockCache,
    BlockCacheKey cacheKey, Cacheable newBlock) {
    // NOTICE: The getBlock has retained the existingBlock inside.
    Cacheable existingBlock = blockCache.getBlock(cacheKey, false, false, false);
    if (existingBlock == null) {
      return true;
    }
    try {
      int comparison = BlockCacheUtil.validateBlockAddition(existingBlock, newBlock, cacheKey);
      if (comparison < 0) {
        LOG.warn("Cached block contents differ by nextBlockOnDiskSize, the new block has "
          + "nextBlockOnDiskSize set. Caching new block.");
        return true;
      } else if (comparison > 0) {
        LOG.warn("Cached block contents differ by nextBlockOnDiskSize, the existing block has "
          + "nextBlockOnDiskSize set. Keeping cached block.");
        return false;
      } else {
        LOG.debug("Caching an already cached block: {}. This is harmless and can happen in rare "
          + "cases (see HBASE-8547)", cacheKey);
        return false;
      }
    } finally {
      // Release this block to decrement the reference count.
      existingBlock.release();
    }
  }

  private static final int DEFAULT_MAX = 1000000;

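  /**
   * Returns the maximum number of blocks to visit when accumulating per-file block cache stats,
   * read from <code>hbase.ui.blockcache.by.file.max</code> (default 1000000).
   * <p>
   * A minimal sketch of overriding the limit; the value 500000 here is only an illustrative
   * choice:
   *
   * <pre>{@code
   * Configuration conf = HBaseConfiguration.create();
   * conf.setInt("hbase.ui.blockcache.by.file.max", 500000);
   * int max = BlockCacheUtil.getMaxCachedBlocksByFile(conf);
   * }</pre>
   */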
  public static int getMaxCachedBlocksByFile(Configuration conf) {
    return conf == null ? DEFAULT_MAX : conf.getInt("hbase.ui.blockcache.by.file.max", DEFAULT_MAX);
  }

  /**
   * Similarly to HFileBlock.Writer.getBlockForCaching(), creates an HFileBlock instance without
   * checksum for caching. This is needed when we cache blocks via readers (either prefetch or
   * client read), otherwise we may fail the equality comparison when checking against the same
   * block that may already have been cached at write time.
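   * <p>
   * A minimal usage sketch (the {@code cacheConf}, {@code cacheKey}, {@code blockCache} and
   * {@code readBlock} references are assumed to be supplied by the caller):
   *
   * <pre>{@code
   * HFileBlock toCache = BlockCacheUtil.getBlockForCaching(cacheConf, readBlock);
   * blockCache.cacheBlock(cacheKey, toCache);
   * }</pre>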
   * @param cacheConf the related CacheConfig object.
   * @param block     the HFileBlock instance to be converted.
   * @return the resulting HFileBlock instance without checksum.
   */
  public static HFileBlock getBlockForCaching(CacheConfig cacheConf, HFileBlock block) {
    // Calculate how many bytes we need for checksum on the tail of the block.
    int numBytes = cacheConf.shouldCacheCompressed(block.getBlockType().getCategory())
      ? 0
      : (int) ChecksumUtil.numBytes(block.getOnDiskDataSizeWithHeader(),
        block.getHFileContext().getBytesPerChecksum());
    ByteBuff buff = block.getBufferReadOnly();
    HFileBlockBuilder builder = new HFileBlockBuilder();
    return builder.withBlockType(block.getBlockType())
      .withOnDiskSizeWithoutHeader(block.getOnDiskSizeWithoutHeader())
      .withUncompressedSizeWithoutHeader(block.getUncompressedSizeWithoutHeader())
      .withPrevBlockOffset(block.getPrevBlockOffset()).withByteBuff(buff)
      .withFillHeader(FILL_HEADER).withOffset(block.getOffset()).withNextBlockOnDiskSize(-1)
      .withOnDiskDataSizeWithHeader(block.getOnDiskDataSizeWithHeader() + numBytes)
      .withHFileContext(cloneContext(block.getHFileContext()))
      .withByteBuffAllocator(cacheConf.getByteBuffAllocator()).withShared(!buff.hasArray()).build();
  }

  public static HFileContext cloneContext(HFileContext context) {
    HFileContext newContext = new HFileContextBuilder().withBlockSize(context.getBlocksize())
      .withBytesPerCheckSum(0).withChecksumType(ChecksumType.NULL) // no checksums in cached data
      .withCompression(context.getCompression())
      .withDataBlockEncoding(context.getDataBlockEncoding())
      .withHBaseCheckSum(context.isUseHBaseChecksum()).withCompressTags(context.isCompressTags())
      .withIncludesMvcc(context.isIncludesMvcc()).withIncludesTags(context.isIncludesTags())
      .withColumnFamily(context.getColumnFamily()).withTableName(context.getTableName()).build();
    return newContext;
  }

  /**
   * Use one of these to keep a running account of cached blocks by file. Throw it away when done.
   * This differs from metrics in that it is a snapshot of the current state of a cache. See
   * getLoadedCachedBlocksByFile.
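   * <p>
   * A minimal usage sketch from within this package (the {@code conf} and {@code blockCache}
   * references are assumed to be supplied by the caller):
   *
   * <pre>{@code
   * CachedBlocksByFile cbsbf = new CachedBlocksByFile(conf);
   * for (CachedBlock block : blockCache) {
   *   if (cbsbf.update(block)) {
   *     break; // stop once the configured maximum number of blocks has been visited
   *   }
   * }
   * LOG.info("Block cache stats: {}", cbsbf);
   * }</pre>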
   */
  public static class CachedBlocksByFile {
    private int count;
    private int dataBlockCount;
    private long size;
    private long dataSize;
    private final long now = System.nanoTime();
    /**
     * How many blocks to look at before we give up. There could be many millions of blocks. We
     * don't want the UI to freeze while we run through 1B blocks... users will think HBase is
     * dead. The UI displays a warning in red when stats are incomplete.
     */
    private final int max;

    CachedBlocksByFile() {
      this(null);
    }

    CachedBlocksByFile(final Configuration c) {
      this.max = getMaxCachedBlocksByFile(c);
    }

    /**
     * Map by filename. Use concurrent utils because we want our Map and contained blocks sorted.
     */
    private transient NavigableMap<String, NavigableSet<CachedBlock>> cachedBlockByFile =
      new ConcurrentSkipListMap<>();
    FastLongHistogram hist = new FastLongHistogram();

    /** Returns true if full, i.e. if we won't be adding any more. */
    public boolean update(final CachedBlock cb) {
      if (isFull()) return true;
      NavigableSet<CachedBlock> set = this.cachedBlockByFile.get(cb.getFilename());
      if (set == null) {
        set = new ConcurrentSkipListSet<>();
        this.cachedBlockByFile.put(cb.getFilename(), set);
      }
      set.add(cb);
      this.size += cb.getSize();
      this.count++;
      BlockType bt = cb.getBlockType();
      if (bt != null && bt.isData()) {
        this.dataBlockCount++;
        this.dataSize += cb.getSize();
      }
      long age = (this.now - cb.getCachedTime()) / NANOS_PER_SECOND;
      this.hist.add(age, 1);
      return false;
    }

    /**
     * @return True if full; i.e. there are more items in the cache but we only loaded up the
     *         maximum set in configuration <code>hbase.ui.blockcache.by.file.max</code> (Default:
     *         DEFAULT_MAX).
     */
    public boolean isFull() {
      return this.count >= this.max;
    }

    public NavigableMap<String, NavigableSet<CachedBlock>> getCachedBlockStatsByFile() {
      return this.cachedBlockByFile;
    }

    /** Returns count of blocks in the cache */
    public int getCount() {
      return count;
    }

    public int getDataCount() {
      return dataBlockCount;
    }

    /** Returns size of blocks in the cache */
    public long getSize() {
      return size;
    }

    /** Returns Size of data. */
    public long getDataSize() {
      return dataSize;
    }

    public AgeSnapshot getAgeInCacheSnapshot() {
      return new AgeSnapshot(this.hist);
    }

    @Override
    public String toString() {
      AgeSnapshot snapshot = getAgeInCacheSnapshot();
      return "count=" + count + ", dataBlockCount=" + dataBlockCount + ", size=" + size
        + ", dataSize=" + getDataSize() + ", mean age=" + snapshot.getMean() + ", min age="
        + snapshot.getMin() + ", max age=" + snapshot.getMax() + ", 75th percentile age="
        + snapshot.get75thPercentile() + ", 95th percentile age=" + snapshot.get95thPercentile()
        + ", 98th percentile age=" + snapshot.get98thPercentile() + ", 99th percentile age="
        + snapshot.get99thPercentile() + ", 99.9th percentile age=" + snapshot.get999thPercentile();
    }
  }
}