001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile;
019
020import static org.apache.hadoop.hbase.regionserver.CompactSplit.HBASE_REGION_SERVER_ENABLE_COMPACTION;
021import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.BLOCK_CACHE_KEY_KEY;
022
023import io.opentelemetry.api.common.Attributes;
024import io.opentelemetry.api.trace.Span;
025import java.io.DataInput;
026import java.io.IOException;
027import java.nio.ByteBuffer;
028import java.util.ArrayList;
029import java.util.Optional;
030import java.util.function.IntConsumer;
031import org.apache.hadoop.conf.Configurable;
032import org.apache.hadoop.conf.Configuration;
033import org.apache.hadoop.fs.Path;
034import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue;
035import org.apache.hadoop.hbase.Cell;
036import org.apache.hadoop.hbase.CellComparator;
037import org.apache.hadoop.hbase.CellUtil;
038import org.apache.hadoop.hbase.HConstants;
039import org.apache.hadoop.hbase.KeyValue;
040import org.apache.hadoop.hbase.PrivateCellUtil;
041import org.apache.hadoop.hbase.SizeCachedByteBufferKeyValue;
042import org.apache.hadoop.hbase.SizeCachedKeyValue;
043import org.apache.hadoop.hbase.SizeCachedNoTagsByteBufferKeyValue;
044import org.apache.hadoop.hbase.SizeCachedNoTagsKeyValue;
045import org.apache.hadoop.hbase.io.HFileLink;
046import org.apache.hadoop.hbase.io.compress.Compression;
047import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
048import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
049import org.apache.hadoop.hbase.io.encoding.HFileBlockDecodingContext;
050import org.apache.hadoop.hbase.nio.ByteBuff;
051import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
052import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
053import org.apache.hadoop.hbase.util.ByteBufferUtils;
054import org.apache.hadoop.hbase.util.Bytes;
055import org.apache.hadoop.hbase.util.IdLock;
056import org.apache.hadoop.hbase.util.ObjectIntPair;
057import org.apache.hadoop.io.WritableUtils;
058import org.apache.yetus.audience.InterfaceAudience;
059import org.slf4j.Logger;
060import org.slf4j.LoggerFactory;
061
062/**
063 * Implementation that can handle all hfile versions of {@link HFile.Reader}.
064 */
065@InterfaceAudience.Private
066@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
067public abstract class HFileReaderImpl implements HFile.Reader, Configurable {
  // This class is HFileReaderV3 + HFileReaderV2 + AbstractHFileReader all squashed together into
  // one file. Ditto for all the HFileReader.ScannerV? implementations. I was running up against
  // the MaxInlineLevel limit because too many tiers were involved in reading from an hfile. It was
  // also hard to navigate the source code with so many classes participating in a read.
  private static final Logger LOG = LoggerFactory.getLogger(HFileReaderImpl.class);

  /** Data block index reader keeping the root data index in memory */
  protected HFileBlockIndex.CellBasedKeyBlockIndexReader dataBlockIndexReader;

  /** Meta block index reader -- always single level */
  protected HFileBlockIndex.ByteArrayKeyBlockIndexReader metaBlockIndexReader;

  /** File trailer, taken from the {@link HFileInfo} handed to the constructor. */
  protected FixedFileTrailer trailer;

  /** Primary-replica flag, copied from the reader context at construction time. */
  private final boolean primaryReplicaReader;

  /**
   * What kind of data block encoding should be used while reading, writing, and handling cache.
   * Starts out as the no-op encoder; the constructor replaces it with one created from the file
   * info.
   */
  protected HFileDataBlockEncoder dataBlockEncoder = NoOpDataBlockEncoder.INSTANCE;

  /** Block cache configuration. */
  protected final CacheConfig cacheConf;

  /** Reader context this reader was opened with; supplies the file path and file size. */
  protected ReaderContext context;

  /** HFile info this reader was opened with. */
  protected final HFileInfo fileInfo;

  /** Path of file */
  protected final Path path;

  /** File name to be used for block names */
  protected final String name;

  /** Configuration handed to the block reader whenever the data block encoder is (re)set. */
  private Configuration conf;

  /** HFile context, taken from the {@link HFileInfo} handed to the constructor. */
  protected HFileContext hfileContext;

  /** Filesystem-level block reader. */
  protected HFileBlock.FSReader fsBlockReader;

  /**
   * A "sparse lock" implementation allowing to lock on a particular block identified by offset. The
   * purpose of this is to avoid two clients loading the same block, and have all but one client
   * wait to get the block from the cache.
   */
  private IdLock offsetLock = new IdLock();

  /** Minimum minor version supported by this HFile format */
  static final int MIN_MINOR_VERSION = 0;

  /** Maximum minor version supported by this HFile format */
  // We went to version 2 when we moved to pb'ing fileinfo and the trailer on
  // the file. This version can read Writables version 1.
  static final int MAX_MINOR_VERSION = 3;

  /** Minor versions starting with this number have faked index key */
  static final int MINOR_VERSION_WITH_FAKED_KEY = 3;
126
127  /**
128   * Opens a HFile.
129   * @param context   Reader context info
130   * @param fileInfo  HFile info
131   * @param cacheConf Cache configuration.
132   * @param conf      Configuration
133   */
134  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
135  public HFileReaderImpl(ReaderContext context, HFileInfo fileInfo, CacheConfig cacheConf,
136    Configuration conf) throws IOException {
137    this.cacheConf = cacheConf;
138    this.context = context;
139    this.path = context.getFilePath();
140    this.name = path.getName();
141    this.conf = conf;
142    this.primaryReplicaReader = context.isPrimaryReplicaReader();
143    this.fileInfo = fileInfo;
144    this.trailer = fileInfo.getTrailer();
145    this.hfileContext = fileInfo.getHFileContext();
146    this.fsBlockReader =
147      new HFileBlock.FSReaderImpl(context, hfileContext, cacheConf.getByteBuffAllocator(), conf);
148    this.dataBlockEncoder = HFileDataBlockEncoderImpl.createFromFileInfo(fileInfo);
149    fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
150    dataBlockIndexReader = fileInfo.getDataBlockIndexReader();
151    metaBlockIndexReader = fileInfo.getMetaBlockIndexReader();
152  }
153
154  @SuppressWarnings("serial")
155  public static class BlockIndexNotLoadedException extends IllegalStateException {
156    public BlockIndexNotLoadedException(Path path) {
157      // Add a message in case anyone relies on it as opposed to class name.
158      super(path + " block index not loaded");
159    }
160  }
161
162  private Optional<String> toStringFirstKey() {
163    return getFirstKey().map(CellUtil::getCellKeyAsString);
164  }
165
166  private Optional<String> toStringLastKey() {
167    return getLastKey().map(CellUtil::getCellKeyAsString);
168  }
169
170  @Override
171  public String toString() {
172    return "reader=" + path.toString()
173      + (!isFileInfoLoaded()
174        ? ""
175        : ", compression=" + trailer.getCompressionCodec().getName() + ", cacheConf=" + cacheConf
176          + ", firstKey=" + toStringFirstKey() + ", lastKey=" + toStringLastKey())
177      + ", avgKeyLen=" + fileInfo.getAvgKeyLen() + ", avgValueLen=" + fileInfo.getAvgValueLen()
178      + ", entries=" + trailer.getEntryCount() + ", length=" + context.getFileSize();
179  }
180
181  @Override
182  public long length() {
183    return context.getFileSize();
184  }
185
186  /**
187   * @return the first key in the file. May be null if file has no entries. Note that this is not
188   *         the first row key, but rather the byte form of the first KeyValue.
189   */
190  @Override
191  public Optional<Cell> getFirstKey() {
192    if (dataBlockIndexReader == null) {
193      throw new BlockIndexNotLoadedException(path);
194    }
195    return dataBlockIndexReader.isEmpty()
196      ? Optional.empty()
197      : Optional.of(dataBlockIndexReader.getRootBlockKey(0));
198  }
199
200  /**
201   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
202   * eliminate {@link KeyValue} here.
203   * @return the first row key, or null if the file is empty.
204   */
205  @Override
206  public Optional<byte[]> getFirstRowKey() {
207    // We have to copy the row part to form the row key alone
208    return getFirstKey().map(CellUtil::cloneRow);
209  }
210
211  /**
212   * TODO left from {@link HFile} version 1: move this to StoreFile after Ryan's patch goes in to
213   * eliminate {@link KeyValue} here.
214   * @return the last row key, or null if the file is empty.
215   */
216  @Override
217  public Optional<byte[]> getLastRowKey() {
218    // We have to copy the row part to form the row key alone
219    return getLastKey().map(CellUtil::cloneRow);
220  }
221
222  /** Returns number of KV entries in this HFile */
223  @Override
224  public long getEntries() {
225    return trailer.getEntryCount();
226  }
227
228  /** Returns comparator */
229  @Override
230  public CellComparator getComparator() {
231    return this.hfileContext.getCellComparator();
232  }
233
234  public Compression.Algorithm getCompressionAlgorithm() {
235    return trailer.getCompressionCodec();
236  }
237
238  /**
239   * @return the total heap size of data and meta block indexes in bytes. Does not take into account
240   *         non-root blocks of a multilevel data index.
241   */
242  @Override
243  public long indexSize() {
244    return (dataBlockIndexReader != null ? dataBlockIndexReader.heapSize() : 0)
245      + ((metaBlockIndexReader != null) ? metaBlockIndexReader.heapSize() : 0);
246  }
247
248  @Override
249  public String getName() {
250    return name;
251  }
252
253  @Override
254  public void setDataBlockEncoder(HFileDataBlockEncoder dataBlockEncoder) {
255    this.dataBlockEncoder = dataBlockEncoder;
256    this.fsBlockReader.setDataBlockEncoder(dataBlockEncoder, conf);
257  }
258
259  @Override
260  public void setDataBlockIndexReader(HFileBlockIndex.CellBasedKeyBlockIndexReader reader) {
261    this.dataBlockIndexReader = reader;
262  }
263
264  @Override
265  public HFileBlockIndex.CellBasedKeyBlockIndexReader getDataBlockIndexReader() {
266    return dataBlockIndexReader;
267  }
268
269  @Override
270  public void setMetaBlockIndexReader(HFileBlockIndex.ByteArrayKeyBlockIndexReader reader) {
271    this.metaBlockIndexReader = reader;
272  }
273
274  @Override
275  public HFileBlockIndex.ByteArrayKeyBlockIndexReader getMetaBlockIndexReader() {
276    return metaBlockIndexReader;
277  }
278
279  @Override
280  public FixedFileTrailer getTrailer() {
281    return trailer;
282  }
283
284  @Override
285  public ReaderContext getContext() {
286    return this.context;
287  }
288
289  @Override
290  public HFileInfo getHFileInfo() {
291    return this.fileInfo;
292  }
293
294  @Override
295  public boolean isPrimaryReplicaReader() {
296    return primaryReplicaReader;
297  }
298
299  /**
300   * An exception thrown when an operation requiring a scanner to be seeked is invoked on a scanner
301   * that is not seeked.
302   */
303  @SuppressWarnings("serial")
304  public static class NotSeekedException extends IllegalStateException {
305    public NotSeekedException(Path path) {
306      super(path + " not seeked to a key/value");
307    }
308  }
309
310  protected static class HFileScannerImpl implements HFileScanner {
    // Buffer the scanner reads cells from. isSeeked() reports true once this is non-null.
    private ByteBuff blockBuffer;
    // The three flags below are passed through unchanged to the block index / block reads.
    protected final boolean cacheBlocks;
    protected final boolean pread;
    protected final boolean isCompaction;
    // Key length and value length of the current cell, as decoded by readKeyValueLen()/blockSeek().
    private int currKeyLen;
    private int currValueLen;
    // Serialized length and value of the current cell's mvcc; maintained by readMvccVersion().
    private int currMemstoreTSLen;
    private long currMemstoreTS;
    // The reader this scanner reads from.
    protected final HFile.Reader reader;
    // Tags length of the current cell; only decoded when the file context includes tags.
    private int currTagsLen;
    // Short read right after the key/value length fields; handed to bufBackedKeyOnlyKv as the
    // row length.
    private short rowLen;
    // Buffer-backed key-only KV, re-pointed at each candidate key during blockSeek().
    private ByteBufferKeyOnlyKeyValue bufBackedKeyOnlyKv = new ByteBufferKeyOnlyKeyValue();
    // A pair that is reused in blockSeek() so that we don't create a lot of garbage objects.
    final ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>();

    /**
     * The next indexed key is to keep track of the indexed key of the next data block. If the
     * nextIndexedKey is KeyValueScanner.NO_NEXT_INDEXED_KEY, it means that the current data block
     * is the last data block. If the nextIndexedKey is null, it means the nextIndexedKey has not
     * been loaded yet.
     */
    protected Cell nextIndexedKey;
    // Current block being used. NOTICE: DON'T release curBlock separately except in shipped() or
    // close() methods. Because the shipped() or close() will do the release finally, even if any
    // exception occurs the curBlock will be released by the close() method (see
    // RegionScannerImpl#handleException). Please call releaseIfNotCurBlock() to release an
    // unreferenced block.
    protected HFileBlock curBlock;
    // Whether we returned a result for curBlock's size in recordBlockSize().
    // Reset by updateCurrBlockRef() whenever it installs a different block as curBlock.
    private boolean providedCurrentBlockSize = false;
    // Previous blocks that were used in the course of the read. Only shared-memory blocks are
    // added here; returnBlocks() releases them all and clears the list.
    protected final ArrayList<HFileBlock> prevBlocks = new ArrayList<>();
345
346    public HFileScannerImpl(final HFile.Reader reader, final boolean cacheBlocks,
347      final boolean pread, final boolean isCompaction) {
348      this.reader = reader;
349      this.cacheBlocks = cacheBlocks;
350      this.pread = pread;
351      this.isCompaction = isCompaction;
352    }
353
354    void updateCurrBlockRef(HFileBlock block) {
355      if (block != null && curBlock != null && block.getOffset() == curBlock.getOffset()) {
356        return;
357      }
358      if (this.curBlock != null && this.curBlock.isSharedMem()) {
359        prevBlocks.add(this.curBlock);
360      }
361      this.curBlock = block;
362      this.providedCurrentBlockSize = false;
363    }
364
365    void reset() {
366      // We don't have to keep ref to heap block
367      if (this.curBlock != null && this.curBlock.isSharedMem()) {
368        this.prevBlocks.add(this.curBlock);
369      }
370      this.curBlock = null;
371    }
372
373    private void returnBlocks(boolean returnAll) {
374      this.prevBlocks.forEach(HFileBlock::release);
375      this.prevBlocks.clear();
376      if (returnAll && this.curBlock != null) {
377        this.curBlock.release();
378        this.curBlock = null;
379      }
380    }
381
382    @Override
383    public boolean isSeeked() {
384      return blockBuffer != null;
385    }
386
387    @Override
388    public String toString() {
389      return "HFileScanner for reader " + String.valueOf(getReader());
390    }
391
392    protected void assertSeeked() {
393      if (!isSeeked()) {
394        throw new NotSeekedException(reader.getPath());
395      }
396    }
397
398    @Override
399    public HFile.Reader getReader() {
400      return reader;
401    }
402
403    // From non encoded HFiles, we always read back KeyValue or its descendant.(Note: When HFile
404    // block is in DBB, it will be OffheapKV). So all parts of the Cell is in a contiguous
405    // array/buffer. How many bytes we should wrap to make the KV is what this method returns.
406    private int getKVBufSize() {
407      int kvBufSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen;
408      if (currTagsLen > 0) {
409        kvBufSize += Bytes.SIZEOF_SHORT + currTagsLen;
410      }
411      return kvBufSize;
412    }
413
414    @Override
415    public void close() {
416      if (!pread) {
417        // For seek + pread stream socket should be closed when the scanner is closed. HBASE-9393
418        reader.unbufferStream();
419      }
420      this.returnBlocks(true);
421    }
422
423    @Override
424    public void recordBlockSize(IntConsumer blockSizeConsumer) {
425      if (!providedCurrentBlockSize && curBlock != null) {
426        providedCurrentBlockSize = true;
427        blockSizeConsumer.accept(curBlock.getUncompressedSizeWithoutHeader());
428      }
429    }
430
431    // Returns the #bytes in HFile for the current cell. Used to skip these many bytes in current
432    // HFile block's buffer so as to position to the next cell.
433    private int getCurCellSerializedSize() {
434      int curCellSize = KEY_VALUE_LEN_SIZE + currKeyLen + currValueLen + currMemstoreTSLen;
435      if (this.reader.getFileContext().isIncludesTags()) {
436        curCellSize += Bytes.SIZEOF_SHORT + currTagsLen;
437      }
438      return curCellSize;
439    }
440
    /**
     * Decodes the length fields of the cell at the block buffer's current position into
     * {@code currKeyLen}, {@code currValueLen}, {@code rowLen} and, when the file includes tags,
     * {@code currTagsLen}; finally lets {@link #readMvccVersion(int)} handle the mvcc that follows
     * the cell. All reads are made relative to the current position; this method itself does not
     * call any position-moving method on the buffer.
     */
    protected void readKeyValueLen() {
      // This is a hot method. We go out of our way to make this method short so it can be
      // inlined and is not too big to compile. We also manage position in ByteBuffer ourselves
      // because it is faster than going via range-checked ByteBuffer methods or going through a
      // byte buffer array a byte at a time.
      // Get a long at a time rather than read two individual ints. In micro-benchmarking, even
      // with the extra bit-fiddling, this is order-of-magnitude faster than getting two ints.
      // Trying to imitate what was done - need to profile if this is better or
      // earlier way is better by doing mark and reset?
      // But ensure that you read long instead of two ints
      long ll = blockBuffer.getLongAfterPosition(0);
      // Read top half as an int of key length and bottom int as value length
      this.currKeyLen = (int) (ll >> Integer.SIZE);
      // NOTE(review): presumably the mask only has bits set in its upper 32 bits, so the XOR
      // leaves the low word untouched and the int cast yields the value length -- confirm
      // against Bytes.MASK_FOR_LOWER_INT_IN_LONG.
      this.currValueLen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
      checkKeyValueLen();
      // The short right after the two length ints is kept as the row length.
      this.rowLen = blockBuffer.getShortAfterPosition(Bytes.SIZEOF_LONG);
      // Compute the offset (relative to the current position) just past the key and value
      // lengths and the key and value themselves; the position itself is not moved here.
      int p = (Bytes.SIZEOF_LONG + currKeyLen + currValueLen);
      if (reader.getFileContext().isIncludesTags()) {
        // Tags length is a short.
        this.currTagsLen = blockBuffer.getShortAfterPosition(p);
        checkTagsLen();
        p += (Bytes.SIZEOF_SHORT + currTagsLen);
      }
      // p now points at where the mvcc would start.
      readMvccVersion(p);
    }
467
468    private final void checkTagsLen() {
469      if (checkLen(this.currTagsLen)) {
470        throw new IllegalStateException(
471          "Invalid currTagsLen " + this.currTagsLen + ". Block offset: " + curBlock.getOffset()
472            + ", block length: " + this.blockBuffer.limit() + ", position: "
473            + this.blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
474      }
475    }
476
477    /**
478     * Read mvcc. Does checks to see if we even need to read the mvcc at all.
479     */
480    protected void readMvccVersion(final int offsetFromPos) {
481      // See if we even need to decode mvcc.
482      if (!this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
483        return;
484      }
485      if (!this.reader.getHFileInfo().isDecodeMemstoreTS()) {
486        currMemstoreTS = 0;
487        currMemstoreTSLen = 1;
488        return;
489      }
490      _readMvccVersion(offsetFromPos);
491    }
492
    /**
     * Actually do the mvcc read. Does no checks. Decodes a variable-length integer starting at the
     * given offset and stores its value in {@code currMemstoreTS} and its encoded length in
     * {@code currMemstoreTSLen}.
     * @param offsetFromPos offset of the first vint byte, relative to the block buffer's current
     *                      position
     */
    private void _readMvccVersion(int offsetFromPos) {
      // This is Bytes#bytesToVint inlined so can save a few instructions in this hot method; i.e.
      // previous if one-byte vint, we'd redo the vint call to find int size.
      // Also the method is kept small so can be inlined.
      byte firstByte = blockBuffer.getByteAfterPosition(offsetFromPos);
      int len = WritableUtils.decodeVIntSize(firstByte);
      if (len == 1) {
        // Single-byte vint: the byte is the value.
        this.currMemstoreTS = firstByte;
      } else {
        // Multi-byte vint: the remaining len - 1 bytes are accumulated into i, widest reads first
        // (an int, then a short, then single bytes).
        int remaining = len - 1;
        long i = 0;
        offsetFromPos++;
        if (remaining >= Bytes.SIZEOF_INT) {
          // The int read has to be treated as unsigned, hence the mask while widening to long.
          i = (blockBuffer.getIntAfterPosition(offsetFromPos) & 0x00000000ffffffffL);
          remaining -= Bytes.SIZEOF_INT;
          offsetFromPos += Bytes.SIZEOF_INT;
        }
        if (remaining >= Bytes.SIZEOF_SHORT) {
          short s = blockBuffer.getShortAfterPosition(offsetFromPos);
          i = i << 16;
          // Mask so the short is appended as an unsigned 16-bit quantity.
          i = i | (s & 0xFFFF);
          remaining -= Bytes.SIZEOF_SHORT;
          offsetFromPos += Bytes.SIZEOF_SHORT;
        }
        for (int idx = 0; idx < remaining; idx++) {
          byte b = blockBuffer.getByteAfterPosition(offsetFromPos + idx);
          i = i << 8;
          i = i | (b & 0xFF);
        }
        // The first byte also says whether the value is stored complemented.
        currMemstoreTS = (WritableUtils.isNegativeVInt(firstByte) ? ~i : i);
      }
      this.currMemstoreTSLen = len;
    }
530
    /**
     * Within a loaded block, seek looking for the last key that is smaller than (or equal to?) the
     * key we are interested in. A note on the seekBefore: if you have seekBefore = true, AND the
     * first key in the block = key, then you'll get thrown exceptions. The caller has to check for
     * that case and load the previous block as appropriate.
     * @param key        the key to find
     * @param seekBefore find the key before the given key in case of exact match
     * @return 0 in case of an exact key match, 1 in case of an inexact match,
     *         {@link HConstants#INDEX_KEY_MAGIC} (documented here as -2) in case of an inexact
     *         match and furthermore, the input key less than the first key of current block (e.g.
     *         using a faked index key)
     * @throws IllegalStateException if a decoded key, value or tags length fails its check, or if
     *                               seekBefore is set and the exact match is the first key visited
     */
    protected int blockSeek(Cell key, boolean seekBefore) {
      int klen, vlen, tlen = 0;
      // Serialized size of the previously visited cell; -1 until one cell has been skipped.
      int lastKeyValueSize = -1;
      int offsetFromPos;
      do {
        offsetFromPos = 0;
        // Better to ensure that we use the BB Utils here
        long ll = blockBuffer.getLongAfterPosition(offsetFromPos);
        klen = (int) (ll >> Integer.SIZE);
        vlen = (int) (Bytes.MASK_FOR_LOWER_INT_IN_LONG ^ ll);
        if (checkKeyLen(klen) || checkLen(vlen)) {
          throw new IllegalStateException(
            "Invalid klen " + klen + " or vlen " + vlen + ". Block offset: " + curBlock.getOffset()
              + ", block length: " + blockBuffer.limit() + ", position: " + blockBuffer.position()
              + " (without header)." + " path=" + reader.getPath());
        }
        offsetFromPos += Bytes.SIZEOF_LONG;
        this.rowLen = blockBuffer.getShortAfterPosition(offsetFromPos);
        // Point the reusable key-only KV at the candidate key and compare it with the target.
        blockBuffer.asSubByteBuffer(blockBuffer.position() + offsetFromPos, klen, pair);
        bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), klen, rowLen);
        int comp =
          PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, bufBackedKeyOnlyKv);
        offsetFromPos += klen + vlen;
        if (this.reader.getFileContext().isIncludesTags()) {
          // Read short as unsigned, high byte first
          tlen = ((blockBuffer.getByteAfterPosition(offsetFromPos) & 0xff) << 8)
            ^ (blockBuffer.getByteAfterPosition(offsetFromPos + 1) & 0xff);
          if (checkLen(tlen)) {
            throw new IllegalStateException("Invalid tlen " + tlen + ". Block offset: "
              + curBlock.getOffset() + ", block length: " + blockBuffer.limit() + ", position: "
              + blockBuffer.position() + " (without header)." + " path=" + reader.getPath());
          }
          // add the two bytes read for the tags.
          offsetFromPos += tlen + (Bytes.SIZEOF_SHORT);
        }
        if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
          // Directly read the mvcc based on current position
          readMvccVersion(offsetFromPos);
        }
        if (comp == 0) {
          // The candidate key equals the target key.
          if (seekBefore) {
            if (lastKeyValueSize < 0) {
              throw new IllegalStateException("blockSeek with seekBefore "
                + "at the first key of the block: key=" + CellUtil.getCellKeyAsString(key)
                + ", blockOffset=" + curBlock.getOffset() + ", onDiskSize="
                + curBlock.getOnDiskSizeWithHeader() + ", path=" + reader.getPath());
            }
            // Step back onto the previously visited cell and decode its lengths.
            blockBuffer.moveBack(lastKeyValueSize);
            readKeyValueLen();
            return 1; // non exact match.
          }
          currKeyLen = klen;
          currValueLen = vlen;
          currTagsLen = tlen;
          return 0; // indicate exact match
        } else if (comp < 0) {
          // A negative comparison ends the scan: step back to the previously visited cell, if
          // there is one, and decode its lengths.
          if (lastKeyValueSize > 0) {
            blockBuffer.moveBack(lastKeyValueSize);
          }
          readKeyValueLen();
          if (lastKeyValueSize == -1 && blockBuffer.position() == 0) {
            // No cell was skipped and we are still at the start of the buffer.
            return HConstants.INDEX_KEY_MAGIC;
          }
          return 1;
        }
        // The size of this key/value tuple, including key/value length fields.
        lastKeyValueSize = klen + vlen + currMemstoreTSLen + KEY_VALUE_LEN_SIZE;
        // include tag length also if tags included with KV
        if (reader.getFileContext().isIncludesTags()) {
          lastKeyValueSize += tlen + Bytes.SIZEOF_SHORT;
        }
        blockBuffer.skip(lastKeyValueSize);
      } while (blockBuffer.hasRemaining());

      // Seek to the last key we successfully read. This will happen if this is
      // the last key/value pair in the file, in which case the following call
      // to next() has to return false.
      blockBuffer.moveBack(lastKeyValueSize);
      readKeyValueLen();
      return 1; // didn't exactly find it.
    }
622
623    @Override
624    public Cell getNextIndexedKey() {
625      return nextIndexedKey;
626    }
627
628    @Override
629    public int seekTo(Cell key) throws IOException {
630      return seekTo(key, true);
631    }
632
633    @Override
634    public int reseekTo(Cell key) throws IOException {
635      int compared;
636      if (isSeeked()) {
637        compared = compareKey(reader.getComparator(), key);
638        if (compared < 1) {
639          // If the required key is less than or equal to current key, then
640          // don't do anything.
641          return compared;
642        } else {
643          // The comparison with no_next_index_key has to be checked
644          if (
645            this.nextIndexedKey != null && (this.nextIndexedKey
646                == KeyValueScanner.NO_NEXT_INDEXED_KEY
647              || PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), key, nextIndexedKey)
648                  < 0)
649          ) {
650            // The reader shall continue to scan the current data block instead
651            // of querying the
652            // block index as long as it knows the target key is strictly
653            // smaller than
654            // the next indexed key or the current data block is the last data
655            // block.
656            return loadBlockAndSeekToKey(this.curBlock, nextIndexedKey, false, key, false);
657          }
658        }
659      }
660      // Don't rewind on a reseek operation, because reseek implies that we are
661      // always going forward in the file.
662      return seekTo(key, false);
663    }
664
665    /**
666     * An internal API function. Seek to the given key, optionally rewinding to the first key of the
667     * block before doing the seek.
668     * @param key    - a cell representing the key that we need to fetch
669     * @param rewind whether to rewind to the first key of the block before doing the seek. If this
670     *               is false, we are assuming we never go back, otherwise the result is undefined.
671     * @return -1 if the key is earlier than the first key of the file, 0 if we are at the given
672     *         key, 1 if we are past the given key -2 if the key is earlier than the first key of
673     *         the file while using a faked index key
674     */
675    public int seekTo(Cell key, boolean rewind) throws IOException {
676      HFileBlockIndex.BlockIndexReader indexReader = reader.getDataBlockIndexReader();
677      BlockWithScanInfo blockWithScanInfo = indexReader.loadDataBlockWithScanInfo(key, curBlock,
678        cacheBlocks, pread, isCompaction, getEffectiveDataBlockEncoding(), reader);
679      if (blockWithScanInfo == null || blockWithScanInfo.getHFileBlock() == null) {
680        // This happens if the key e.g. falls before the beginning of the file.
681        return -1;
682      }
683      return loadBlockAndSeekToKey(blockWithScanInfo.getHFileBlock(),
684        blockWithScanInfo.getNextIndexedKey(), rewind, key, false);
685    }
686
    /**
     * Positions the scanner on the cell just before the given key.
     * @param key the key to seek before
     * @return false if the block index yields no block for the key, or if the located block starts
     *         at or after the key and has no previous block; true otherwise
     * @throws IOException if reading a block fails
     */
    @Override
    public boolean seekBefore(Cell key) throws IOException {
      HFileBlock seekToBlock = reader.getDataBlockIndexReader().seekToDataBlock(key, curBlock,
        cacheBlocks, pread, isCompaction, reader.getEffectiveEncodingInCache(isCompaction), reader);
      if (seekToBlock == null) {
        return false;
      }
      Cell firstKey = getFirstKeyCellInBlock(seekToBlock);
      if (PrivateCellUtil.compareKeyIgnoresMvcc(reader.getComparator(), firstKey, key) >= 0) {
        // The located block starts at or after the key, so the cell before the key has to be
        // looked for in the previous block.
        long previousBlockOffset = seekToBlock.getPrevBlockOffset();
        // The key we are interested in
        if (previousBlockOffset == -1) {
          // we have a 'problem', the key we want is the first of the file.
          releaseIfNotCurBlock(seekToBlock);
          return false;
        }

        // The first key in the current block 'seekToBlock' is greater than or equal to the given
        // seekBefore key. We will go ahead by reading the previous block instead. Return the
        // current block before reading that one.
        releaseIfNotCurBlock(seekToBlock);
        // It is important that we compute and pass onDiskSize to the block
        // reader so that it does not have to read the header separately to
        // figure out the size. Currently, we do not have a way to do this
        // correctly in the general case however.
        // TODO: See https://issues.apache.org/jira/browse/HBASE-14576
        int prevBlockSize = -1;
        seekToBlock = reader.readBlock(previousBlockOffset, prevBlockSize, cacheBlocks, pread,
          isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
        // TODO shortcut: seek forward in this block to the last key of the
        // block.
      }
      // NOTE(review): firstKey (first key of the originally located block) is passed in the slot
      // where the other callers in this class pass the next indexed key -- confirm against
      // loadBlockAndSeekToKey.
      loadBlockAndSeekToKey(seekToBlock, firstKey, true, key, true);
      return true;
    }
722
723    /**
724     * The curBlock will be released by shipping or close method, so only need to consider releasing
725     * the block, which was read from HFile before and not referenced by curBlock.
726     */
727    protected void releaseIfNotCurBlock(HFileBlock block) {
728      if (curBlock != block) {
729        block.release();
730      }
731    }
732
    /**
     * Scans blocks in the "scanned" section of the {@link HFile} until the next data block is
     * found. Starts from the current block and repeatedly reads the block that follows on disk,
     * releasing every non-data block it comes across.
     * @return the next block, or null if there are no more data blocks (also when there is no
     *         current block)
     * @throws IOException if a visited block reports a negative offset or a block read fails
     */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_NULL_ON_SOME_PATH",
        justification = "Yeah, unnecessary null check; could do w/ clean up")
    protected HFileBlock readNextDataBlock() throws IOException {
      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
      if (curBlock == null) {
        return null;
      }
      HFileBlock block = this.curBlock;
      do {
        if (block.getOffset() >= lastDataBlockOffset) {
          // At or past the last data block: nothing more to read.
          releaseIfNotCurBlock(block);
          return null;
        }
        if (block.getOffset() < 0) {
          releaseIfNotCurBlock(block);
          throw new IOException("Invalid block offset=" + block + ", path=" + reader.getPath());
        }
        // We are reading the next block without block type validation, because
        // it might turn out to be a non-data block.
        block = reader.readBlock(block.getOffset() + block.getOnDiskSizeWithHeader(),
          block.getNextBlockOnDiskSize(), cacheBlocks, pread, isCompaction, true, null,
          getEffectiveDataBlockEncoding());
        // NOTE(review): the null test below is inconsistent with the loop condition, which
        // dereferences block unconditionally; the findbugs suppression on this method
        // acknowledges that.
        if (block != null && !block.getBlockType().isData()) {
          // Whatever block we read we will be returning it unless
          // it is a datablock. Just in case the blocks are non data blocks
          block.release();
        }
      } while (!block.getBlockType().isData());
      return block;
    }
768
769    public DataBlockEncoding getEffectiveDataBlockEncoding() {
770      return this.reader.getEffectiveEncodingInCache(isCompaction);
771    }
772
773    @Override
774    public Cell getCell() {
775      if (!isSeeked()) {
776        return null;
777      }
778
779      Cell ret;
780      int cellBufSize = getKVBufSize();
781      long seqId = 0L;
782      if (this.reader.getHFileInfo().shouldIncludeMemStoreTS()) {
783        seqId = currMemstoreTS;
784      }
785      if (blockBuffer.hasArray()) {
786        // TODO : reduce the varieties of KV here. Check if based on a boolean
787        // we can handle the 'no tags' case.
788        if (currTagsLen > 0) {
789          ret = new SizeCachedKeyValue(blockBuffer.array(),
790            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
791            rowLen);
792        } else {
793          ret = new SizeCachedNoTagsKeyValue(blockBuffer.array(),
794            blockBuffer.arrayOffset() + blockBuffer.position(), cellBufSize, seqId, currKeyLen,
795            rowLen);
796        }
797      } else {
798        ByteBuffer buf = blockBuffer.asSubByteBuffer(cellBufSize);
799        if (buf.isDirect()) {
800          ret = currTagsLen > 0
801            ? new SizeCachedByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId, currKeyLen,
802              rowLen)
803            : new SizeCachedNoTagsByteBufferKeyValue(buf, buf.position(), cellBufSize, seqId,
804              currKeyLen, rowLen);
805        } else {
806          if (currTagsLen > 0) {
807            ret = new SizeCachedKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
808              cellBufSize, seqId, currKeyLen, rowLen);
809          } else {
810            ret = new SizeCachedNoTagsKeyValue(buf.array(), buf.arrayOffset() + buf.position(),
811              cellBufSize, seqId, currKeyLen, rowLen);
812          }
813        }
814      }
815      return ret;
816    }
817
818    @Override
819    public Cell getKey() {
820      assertSeeked();
821      // Create a new object so that this getKey is cached as firstKey, lastKey
822      ObjectIntPair<ByteBuffer> keyPair = new ObjectIntPair<>();
823      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, keyPair);
824      ByteBuffer keyBuf = keyPair.getFirst();
825      if (keyBuf.hasArray()) {
826        return new KeyValue.KeyOnlyKeyValue(keyBuf.array(),
827          keyBuf.arrayOffset() + keyPair.getSecond(), currKeyLen);
828      } else {
829        // Better to do a copy here instead of holding on to this BB so that
830        // we could release the blocks referring to this key. This key is specifically used
831        // in HalfStoreFileReader to get the firstkey and lastkey by creating a new scanner
832        // every time. So holding onto the BB (incase of DBB) is not advised here.
833        byte[] key = new byte[currKeyLen];
834        ByteBufferUtils.copyFromBufferToArray(key, keyBuf, keyPair.getSecond(), 0, currKeyLen);
835        return new KeyValue.KeyOnlyKeyValue(key, 0, currKeyLen);
836      }
837    }
838
839    @Override
840    public ByteBuffer getValue() {
841      assertSeeked();
842      // Okie to create new Pair. Not used in hot path
843      ObjectIntPair<ByteBuffer> valuePair = new ObjectIntPair<>();
844      this.blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE + currKeyLen,
845        currValueLen, valuePair);
846      ByteBuffer valBuf = valuePair.getFirst().duplicate();
847      valBuf.position(valuePair.getSecond());
848      valBuf.limit(currValueLen + valuePair.getSecond());
849      return valBuf.slice();
850    }
851
852    protected void setNonSeekedState() {
853      reset();
854      blockBuffer = null;
855      currKeyLen = 0;
856      currValueLen = 0;
857      currMemstoreTS = 0;
858      currMemstoreTSLen = 0;
859      currTagsLen = 0;
860    }
861
862    /**
863     * Set the position on current backing blockBuffer.
864     */
865    private void positionThisBlockBuffer() {
866      try {
867        blockBuffer.skip(getCurCellSerializedSize());
868      } catch (IllegalArgumentException e) {
869        LOG.error("Current pos = " + blockBuffer.position() + "; currKeyLen = " + currKeyLen
870          + "; currValLen = " + currValueLen + "; block limit = " + blockBuffer.limit()
871          + "; currBlock currBlockOffset = " + this.curBlock.getOffset() + "; path="
872          + reader.getPath());
873        throw e;
874      }
875    }
876
877    /**
878     * Set our selves up for the next 'next' invocation, set up next block.
879     * @return True is more to read else false if at the end.
880     */
881    private boolean positionForNextBlock() throws IOException {
882      // Methods are small so they get inlined because they are 'hot'.
883      long lastDataBlockOffset = reader.getTrailer().getLastDataBlockOffset();
884      if (this.curBlock.getOffset() >= lastDataBlockOffset) {
885        setNonSeekedState();
886        return false;
887      }
888      return isNextBlock();
889    }
890
891    private boolean isNextBlock() throws IOException {
892      // Methods are small so they get inlined because they are 'hot'.
893      HFileBlock nextBlock = readNextDataBlock();
894      if (nextBlock == null) {
895        setNonSeekedState();
896        return false;
897      }
898      updateCurrentBlock(nextBlock);
899      return true;
900    }
901
902    private final boolean _next() throws IOException {
903      // Small method so can be inlined. It is a hot one.
904      if (blockBuffer.remaining() <= 0) {
905        return positionForNextBlock();
906      }
907
908      // We are still in the same block.
909      readKeyValueLen();
910      return true;
911    }
912
    /**
     * Go to the next key/value in the block section. Loads the next block if necessary. If
     * successful, {@link #getKey()} and {@link #getValue()} can be called.
     * @return true if successfully navigated to the next key/value
     * @throws IOException if moving on to the next block fails
     */
    @Override
    public boolean next() throws IOException {
      // This is a hot method so extreme measures taken to ensure it is small and inlineable.
      // Checked by setting: -XX:+UnlockDiagnosticVMOptions -XX:+PrintInlining -XX:+PrintCompilation
      assertSeeked();
      // Step the block buffer over the cell we are currently positioned on ...
      positionThisBlockBuffer();
      // ... then read the next cell's lengths, or roll over to the next block if this one is done.
      return _next();
    }
926
927    /**
928     * Positions this scanner at the start of the file.
929     * @return false if empty file; i.e. a call to next would return false and the current key and
930     *         value are undefined.
931     */
932    @Override
933    public boolean seekTo() throws IOException {
934      if (reader == null) {
935        return false;
936      }
937
938      if (reader.getTrailer().getEntryCount() == 0) {
939        // No data blocks.
940        return false;
941      }
942
943      long firstDataBlockOffset = reader.getTrailer().getFirstDataBlockOffset();
944      if (curBlock != null && curBlock.getOffset() == firstDataBlockOffset) {
945        return processFirstDataBlock();
946      }
947
948      readAndUpdateNewBlock(firstDataBlockOffset);
949      return true;
950    }
951
952    protected boolean processFirstDataBlock() throws IOException {
953      blockBuffer.rewind();
954      readKeyValueLen();
955      return true;
956    }
957
958    protected void readAndUpdateNewBlock(long firstDataBlockOffset) throws IOException {
959      HFileBlock newBlock = reader.readBlock(firstDataBlockOffset, -1, cacheBlocks, pread,
960        isCompaction, true, BlockType.DATA, getEffectiveDataBlockEncoding());
961      if (newBlock.getOffset() < 0) {
962        releaseIfNotCurBlock(newBlock);
963        throw new IOException(
964          "Invalid offset=" + newBlock.getOffset() + ", path=" + reader.getPath());
965      }
966      updateCurrentBlock(newBlock);
967    }
968
969    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, Cell nextIndexedKey, boolean rewind,
970      Cell key, boolean seekBefore) throws IOException {
971      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
972        updateCurrentBlock(seekToBlock);
973      } else if (rewind) {
974        blockBuffer.rewind();
975      }
976      // Update the nextIndexedKey
977      this.nextIndexedKey = nextIndexedKey;
978      return blockSeek(key, seekBefore);
979    }
980
981    /** Returns True if v &lt;= 0 or v &gt; current block buffer limit. */
982    protected final boolean checkKeyLen(final int v) {
983      return v <= 0 || v > this.blockBuffer.limit();
984    }
985
986    /** Returns True if v &lt; 0 or v &gt; current block buffer limit. */
987    protected final boolean checkLen(final int v) {
988      return v < 0 || v > this.blockBuffer.limit();
989    }
990
991    /**
992     * Check key and value lengths are wholesome.
993     */
994    protected final void checkKeyValueLen() {
995      if (checkKeyLen(this.currKeyLen) || checkLen(this.currValueLen)) {
996        throw new IllegalStateException("Invalid currKeyLen " + this.currKeyLen
997          + " or currValueLen " + this.currValueLen + ". Block offset: " + this.curBlock.getOffset()
998          + ", block length: " + this.blockBuffer.limit() + ", position: "
999          + this.blockBuffer.position() + " (without header)." + ", path=" + reader.getPath());
1000      }
1001    }
1002
1003    /**
1004     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1005     * key/value pair.
1006     * @param newBlock the block read by {@link HFileReaderImpl#readBlock}, it's a totally new block
1007     *                 with new allocated {@link ByteBuff}, so if no further reference to this
1008     *                 block, we should release it carefully.
1009     */
1010    protected void updateCurrentBlock(HFileBlock newBlock) throws IOException {
1011      try {
1012        if (newBlock.getBlockType() != BlockType.DATA) {
1013          throw new IllegalStateException(
1014            "ScannerV2 works only on data blocks, got " + newBlock.getBlockType() + "; "
1015              + "HFileName=" + reader.getPath() + ", " + "dataBlockEncoder="
1016              + reader.getDataBlockEncoding() + ", " + "isCompaction=" + isCompaction);
1017        }
1018        updateCurrBlockRef(newBlock);
1019        blockBuffer = newBlock.getBufferWithoutHeader();
1020        readKeyValueLen();
1021      } finally {
1022        releaseIfNotCurBlock(newBlock);
1023      }
1024      // Reset the next indexed key
1025      this.nextIndexedKey = null;
1026    }
1027
1028    protected Cell getFirstKeyCellInBlock(HFileBlock curBlock) {
1029      ByteBuff buffer = curBlock.getBufferWithoutHeader();
1030      // It is safe to manipulate this buffer because we own the buffer object.
1031      buffer.rewind();
1032      int klen = buffer.getInt();
1033      buffer.skip(Bytes.SIZEOF_INT);// Skip value len part
1034      ByteBuffer keyBuff = buffer.asSubByteBuffer(klen);
1035      if (keyBuff.hasArray()) {
1036        return new KeyValue.KeyOnlyKeyValue(keyBuff.array(),
1037          keyBuff.arrayOffset() + keyBuff.position(), klen);
1038      } else {
1039        return new ByteBufferKeyOnlyKeyValue(keyBuff, keyBuff.position(), klen);
1040      }
1041    }
1042
1043    @Override
1044    public String getKeyString() {
1045      return CellUtil.toString(getKey(), false);
1046    }
1047
1048    @Override
1049    public String getValueString() {
1050      return ByteBufferUtils.toStringBinary(getValue());
1051    }
1052
1053    public int compareKey(CellComparator comparator, Cell key) {
1054      blockBuffer.asSubByteBuffer(blockBuffer.position() + KEY_VALUE_LEN_SIZE, currKeyLen, pair);
1055      this.bufBackedKeyOnlyKv.setKey(pair.getFirst(), pair.getSecond(), currKeyLen, rowLen);
1056      return PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, this.bufBackedKeyOnlyKv);
1057    }
1058
1059    @Override
1060    public void shipped() throws IOException {
1061      this.returnBlocks(false);
1062    }
1063  }
1064
1065  @Override
1066  public Path getPath() {
1067    return path;
1068  }
1069
1070  @Override
1071  public DataBlockEncoding getDataBlockEncoding() {
1072    return dataBlockEncoder.getDataBlockEncoding();
1073  }
1074
1075  @Override
1076  public Configuration getConf() {
1077    return conf;
1078  }
1079
  /**
   * Replaces the {@link Configuration} held by this reader.
   * @param conf the configuration to store
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }
1084
  /** Minor versions in HFile starting with this number have hbase checksums */
  public static final int MINOR_VERSION_WITH_CHECKSUM = 1;
  /** HFile minor version that does not support checksums */
  public static final int MINOR_VERSION_NO_CHECKSUM = 0;

  /** HFile minor version that introduced the pbuf file trailer */
  public static final int PBUF_TRAILER_MINOR_VERSION = 2;

  /**
   * The size of a (key length, value length) tuple that prefixes each entry in a data block, i.e.
   * two ints.
   */
  public final static int KEY_VALUE_LEN_SIZE = 2 * Bytes.SIZEOF_INT;
1097
1098  /**
1099   * Retrieve block from cache. Validates the retrieved block's type vs {@code expectedBlockType}
1100   * and its encoding vs. {@code expectedDataBlockEncoding}. Unpacks the block as necessary.
1101   */
1102  private HFileBlock getCachedBlock(BlockCacheKey cacheKey, boolean cacheBlock, boolean useLock,
1103    boolean updateCacheMetrics, BlockType expectedBlockType,
1104    DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1105    // Check cache for block. If found return.
1106    BlockCache cache = cacheConf.getBlockCache().orElse(null);
1107    if (cache != null) {
1108      HFileBlock cachedBlock =
1109        (HFileBlock) cache.getBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics);
1110      if (cachedBlock != null) {
1111        if (cacheConf.shouldCacheCompressed(cachedBlock.getBlockType().getCategory())) {
1112          HFileBlock compressedBlock = cachedBlock;
1113          cachedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1114          // In case of compressed block after unpacking we can release the compressed block
1115          if (compressedBlock != cachedBlock) {
1116            compressedBlock.release();
1117          }
1118        }
1119        try {
1120          validateBlockType(cachedBlock, expectedBlockType);
1121        } catch (IOException e) {
1122          returnAndEvictBlock(cache, cacheKey, cachedBlock);
1123          throw e;
1124        }
1125
1126        if (expectedDataBlockEncoding == null) {
1127          return cachedBlock;
1128        }
1129        DataBlockEncoding actualDataBlockEncoding = cachedBlock.getDataBlockEncoding();
1130        // Block types other than data blocks always have
1131        // DataBlockEncoding.NONE. To avoid false negative cache misses, only
1132        // perform this check if cached block is a data block.
1133        if (
1134          cachedBlock.getBlockType().isData()
1135            && !actualDataBlockEncoding.equals(expectedDataBlockEncoding)
1136        ) {
1137          // This mismatch may happen if a Scanner, which is used for say a
1138          // compaction, tries to read an encoded block from the block cache.
1139          // The reverse might happen when an EncodedScanner tries to read
1140          // un-encoded blocks which were cached earlier.
1141          //
1142          // Because returning a data block with an implicit BlockType mismatch
1143          // will cause the requesting scanner to throw a disk read should be
1144          // forced here. This will potentially cause a significant number of
1145          // cache misses, so update so we should keep track of this as it might
1146          // justify the work on a CompoundScanner.
1147          if (
1148            !expectedDataBlockEncoding.equals(DataBlockEncoding.NONE)
1149              && !actualDataBlockEncoding.equals(DataBlockEncoding.NONE)
1150          ) {
1151            // If the block is encoded but the encoding does not match the
1152            // expected encoding it is likely the encoding was changed but the
1153            // block was not yet evicted. Evictions on file close happen async
1154            // so blocks with the old encoding still linger in cache for some
1155            // period of time. This event should be rare as it only happens on
1156            // schema definition change.
1157            LOG.info(
1158              "Evicting cached block with key {} because data block encoding mismatch; "
1159                + "expected {}, actual {}, path={}",
1160              cacheKey, actualDataBlockEncoding, expectedDataBlockEncoding, path);
1161            // This is an error scenario. so here we need to release the block.
1162            returnAndEvictBlock(cache, cacheKey, cachedBlock);
1163          }
1164          return null;
1165        }
1166        return cachedBlock;
1167      }
1168    }
1169    return null;
1170  }
1171
  /**
   * Releases {@code block} and then evicts the entry stored under {@code cacheKey} from
   * {@code cache}.
   * @param cache    the block cache to evict from
   * @param cacheKey the key of the cache entry to evict
   * @param block    the block to release
   */
  private void returnAndEvictBlock(BlockCache cache, BlockCacheKey cacheKey, Cacheable block) {
    // Release first, then evict the cache entry.
    block.release();
    cache.evictBlock(cacheKey);
  }
1176
1177  /**
1178   * @param cacheBlock Add block to cache, if found
1179   * @return block wrapped in a ByteBuffer, with header skipped
1180   */
1181  @Override
1182  public HFileBlock getMetaBlock(String metaBlockName, boolean cacheBlock) throws IOException {
1183    if (trailer.getMetaIndexCount() == 0) {
1184      return null; // there are no meta blocks
1185    }
1186    if (metaBlockIndexReader == null) {
1187      throw new IOException(path + " meta index not loaded");
1188    }
1189
1190    byte[] mbname = Bytes.toBytes(metaBlockName);
1191    int block = metaBlockIndexReader.rootBlockContainingKey(mbname, 0, mbname.length);
1192    if (block == -1) {
1193      return null;
1194    }
1195    long blockSize = metaBlockIndexReader.getRootBlockDataSize(block);
1196
1197    // Per meta key from any given file, synchronize reads for said block. This
1198    // is OK to do for meta blocks because the meta block index is always
1199    // single-level.
1200    synchronized (metaBlockIndexReader.getRootBlockKey(block)) {
1201      // Check cache for block. If found return.
1202      long metaBlockOffset = metaBlockIndexReader.getRootBlockOffset(block);
1203      BlockCacheKey cacheKey =
1204        new BlockCacheKey(name, metaBlockOffset, this.isPrimaryReplicaReader(), BlockType.META);
1205
1206      cacheBlock &= cacheConf.shouldCacheBlockOnRead(BlockType.META.getCategory());
1207      HFileBlock cachedBlock =
1208        getCachedBlock(cacheKey, cacheBlock, false, true, BlockType.META, null);
1209      if (cachedBlock != null) {
1210        assert cachedBlock.isUnpacked() : "Packed block leak.";
1211        // Return a distinct 'shallow copy' of the block,
1212        // so pos does not get messed by the scanner
1213        return cachedBlock;
1214      }
1215      // Cache Miss, please load.
1216
1217      HFileBlock compressedBlock =
1218        fsBlockReader.readBlockData(metaBlockOffset, blockSize, true, false, true);
1219      HFileBlock uncompressedBlock = compressedBlock.unpack(hfileContext, fsBlockReader);
1220      if (compressedBlock != uncompressedBlock) {
1221        compressedBlock.release();
1222      }
1223
1224      // Cache the block
1225      if (cacheBlock) {
1226        cacheConf.getBlockCache().ifPresent(
1227          cache -> cache.cacheBlock(cacheKey, uncompressedBlock, cacheConf.isInMemory()));
1228      }
1229      return uncompressedBlock;
1230    }
1231  }
1232
1233  /**
1234   * Whether we use heap or not depends on our intent to cache the block. We want to avoid
1235   * allocating to off-heap if we intend to cache into the on-heap L1 cache. Otherwise, it's more
1236   * efficient to allocate to off-heap since we can control GC ourselves for those. So our decision
1237   * here breaks down as follows: <br>
1238   * If block cache is disabled, don't use heap. If we're not using the CombinedBlockCache, use heap
1239   * unless caching is disabled for the request. Otherwise, only use heap if caching is enabled and
1240   * the expected block type is not DATA (which goes to off-heap L2 in combined cache).
1241   * @see org.apache.hadoop.hbase.io.hfile.HFileBlock.FSReader#readBlockData(long, long, boolean,
1242   *      boolean, boolean)
1243   */
1244  private boolean shouldUseHeap(BlockType expectedBlockType, boolean cacheBlock) {
1245    if (!cacheConf.getBlockCache().isPresent()) {
1246      return false;
1247    }
1248
1249    // we only cache a block if cacheBlock is true and caching-on-read is enabled in CacheConfig
1250    // we can really only check for that if have an expectedBlockType
1251    if (expectedBlockType != null) {
1252      cacheBlock &= cacheConf.shouldCacheBlockOnRead(expectedBlockType.getCategory());
1253    }
1254
1255    if (!cacheConf.isCombinedBlockCache()) {
1256      // Block to cache in LruBlockCache must be an heap one, if caching enabled. So just allocate
1257      // block memory from heap for saving an extra off-heap to heap copying in that case.
1258      return cacheBlock;
1259    }
1260
1261    return cacheBlock && expectedBlockType != null && !expectedBlockType.isData();
1262  }
1263
1264  @Override
1265  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
1266    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
1267    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding) throws IOException {
1268    return readBlock(dataBlockOffset, onDiskBlockSize, cacheBlock, pread, isCompaction,
1269      updateCacheMetrics, expectedBlockType, expectedDataBlockEncoding, false);
1270  }
1271
  /**
   * Reads the block at {@code dataBlockOffset}. Unless {@code cacheOnly} is set (and provided the
   * cache configuration allows reading this block type from cache), the block cache is consulted
   * first; on a miss the block is loaded through {@code fsBlockReader}, optionally added to the
   * block cache, and returned.
   * @param dataBlockOffset           offset of the block; must be &gt;= 0 and below the trailer's
   *                                  load-on-open data offset
   * @param onDiskBlockSize           on-disk size handed to {@code fsBlockReader.readBlockData}
   * @param cacheBlock                whether the caller wants the block cached
   * @param pread                     passed through to {@code fsBlockReader.readBlockData}
   * @param isCompaction              whether the read is on behalf of a compaction
   * @param updateCacheMetrics        whether cache metrics and the data block read counter are
   *                                  updated
   * @param expectedBlockType         the expected block type, or null to skip type validation
   * @param expectedDataBlockEncoding the expected encoding for data blocks found in cache
   * @param cacheOnly                 when true the cache lookup is skipped; additionally, if the
   *                                  block's category is configured to be cached compressed and
   *                                  cached on read, the block is returned without being unpacked
   * @return the block, unpacked except in the {@code cacheOnly} case described above
   * @throws IOException if the block index is not loaded, the offset is out of range, the block
   *                     type does not match, or a cached data block has an unexpected encoding
   */
  @Override
  public HFileBlock readBlock(long dataBlockOffset, long onDiskBlockSize, final boolean cacheBlock,
    boolean pread, final boolean isCompaction, boolean updateCacheMetrics,
    BlockType expectedBlockType, DataBlockEncoding expectedDataBlockEncoding, boolean cacheOnly)
    throws IOException {
    if (dataBlockIndexReader == null) {
      throw new IOException(path + " block index not loaded");
    }
    long trailerOffset = trailer.getLoadOnOpenDataOffset();
    if (dataBlockOffset < 0 || dataBlockOffset >= trailerOffset) {
      throw new IOException("Requested block is out of range: " + dataBlockOffset
        + ", lastDataBlockOffset: " + trailer.getLastDataBlockOffset()
        + ", trailer.getLoadOnOpenDataOffset: " + trailerOffset + ", path=" + path);
    }
    // For any given block from any given file, synchronize reads for said
    // block.
    // Without a cache, this synchronizing is needless overhead, but really
    // the other choice is to duplicate work (which the cache would prevent you
    // from doing).

    BlockCacheKey cacheKey =
      new BlockCacheKey(name, dataBlockOffset, this.isPrimaryReplicaReader(), expectedBlockType);
    Attributes attributes = Attributes.of(BLOCK_CACHE_KEY_KEY, cacheKey.toString());

    // Caching requires the caller's consent and cacheIfCompactionsOff().
    boolean cacheable = cacheBlock && cacheIfCompactionsOff();

    // The first pass through the loop is lock-free; useLock is flipped to true for a second pass
    // that holds the per-offset lock.
    boolean useLock = false;
    IdLock.Entry lockEntry = null;
    final Span span = Span.current();
    try {
      while (true) {
        // Check cache for block. If found return.
        if (cacheConf.shouldReadBlockFromCache(expectedBlockType) && !cacheOnly) {
          if (useLock) {
            lockEntry = offsetLock.getLockEntry(dataBlockOffset);
          }
          // Try and get the block from the block cache. If the useLock variable is true then this
          // is the second time through the loop and it should not be counted as a block cache miss.
          HFileBlock cachedBlock = getCachedBlock(cacheKey, cacheBlock, useLock, updateCacheMetrics,
            expectedBlockType, expectedDataBlockEncoding);
          if (cachedBlock != null) {
            if (LOG.isTraceEnabled()) {
              LOG.trace("Block for file {} is coming from Cache {}",
                Bytes.toString(cachedBlock.getHFileContext().getTableName()), cachedBlock);
            }
            span.addEvent("block cache hit", attributes);
            assert cachedBlock.isUnpacked() : "Packed block leak.";
            if (cachedBlock.getBlockType().isData()) {
              if (updateCacheMetrics) {
                HFile.DATABLOCK_READ_COUNT.increment();
              }
              // Validate encoding type for data blocks. We include encoding
              // type in the cache key, and we expect it to match on a cache hit.
              if (cachedBlock.getDataBlockEncoding() != dataBlockEncoder.getDataBlockEncoding()) {
                // Remember to release the block when in exceptional path.
                cacheConf.getBlockCache().ifPresent(cache -> {
                  returnAndEvictBlock(cache, cacheKey, cachedBlock);
                });
                throw new IOException("Cached block under key " + cacheKey + " "
                  + "has wrong encoding: " + cachedBlock.getDataBlockEncoding() + " (expected: "
                  + dataBlockEncoder.getDataBlockEncoding() + "), path=" + path);
              }
            }
            // Cache-hit. Return!
            return cachedBlock;
          }

          if (!useLock && cacheable && cacheConf.shouldLockOnCacheMiss(expectedBlockType)) {
            // check cache again with lock
            useLock = true;
            continue;
          }
          // Carry on, please load.
        }

        span.addEvent("block cache miss", attributes);
        // Load block from filesystem.
        HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset, onDiskBlockSize, pread,
          !isCompaction, shouldUseHeap(expectedBlockType, cacheable));
        try {
          validateBlockType(hfileBlock, expectedBlockType);
        } catch (IOException e) {
          // Wrong block type: release the freshly read block before propagating.
          hfileBlock.release();
          throw e;
        }
        BlockType.BlockCategory category = hfileBlock.getBlockType().getCategory();
        final boolean cacheCompressed = cacheConf.shouldCacheCompressed(category);
        final boolean cacheOnRead = cacheConf.shouldCacheBlockOnRead(category);

        // Don't need the unpacked block back and we're storing the block in the cache compressed
        if (cacheOnly && cacheCompressed && cacheOnRead) {
          cacheConf.getBlockCache().ifPresent(cache -> {
            LOG.debug("Skipping decompression of block {} in prefetch", cacheKey);
            // Cache the block if necessary
            if (cacheable && cacheConf.shouldCacheBlockOnRead(category)) {
              cache.cacheBlock(cacheKey, hfileBlock, cacheConf.isInMemory(), cacheOnly);
            }
          });

          if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
            HFile.DATABLOCK_READ_COUNT.increment();
          }
          // Returned as read from disk, without unpacking.
          return hfileBlock;
        }
        HFileBlock unpacked = hfileBlock.unpack(hfileContext, fsBlockReader);
        // Cache the block if necessary
        cacheConf.getBlockCache().ifPresent(cache -> {
          if (cacheable && cacheConf.shouldCacheBlockOnRead(category)) {
            // Using the wait on cache during compaction and prefetching.
            cache.cacheBlock(cacheKey, cacheCompressed ? hfileBlock : unpacked,
              cacheConf.isInMemory(), cacheOnly);
          }
        });
        if (unpacked != hfileBlock) {
          // End of life here if hfileBlock is an independent block.
          hfileBlock.release();
        }
        if (updateCacheMetrics && hfileBlock.getBlockType().isData()) {
          HFile.DATABLOCK_READ_COUNT.increment();
        }

        return unpacked;
      }
    } finally {
      // Only set when the second, locked pass was taken.
      if (lockEntry != null) {
        offsetLock.releaseLockEntry(lockEntry);
      }
    }
  }
1401
1402  @Override
1403  public boolean hasMVCCInfo() {
1404    return fileInfo.shouldIncludeMemStoreTS() && fileInfo.isDecodeMemstoreTS();
1405  }
1406
1407  /**
1408   * Compares the actual type of a block retrieved from cache or disk with its expected type and
1409   * throws an exception in case of a mismatch. Expected block type of {@link BlockType#DATA} is
1410   * considered to match the actual block type [@link {@link BlockType#ENCODED_DATA} as well.
1411   * @param block             a block retrieved from cache or disk
1412   * @param expectedBlockType the expected block type, or null to skip the check
1413   */
1414  private void validateBlockType(HFileBlock block, BlockType expectedBlockType) throws IOException {
1415    if (expectedBlockType == null) {
1416      return;
1417    }
1418    BlockType actualBlockType = block.getBlockType();
1419    if (expectedBlockType.isData() && actualBlockType.isData()) {
1420      // We consider DATA to match ENCODED_DATA for the purpose of this
1421      // verification.
1422      return;
1423    }
1424    if (actualBlockType != expectedBlockType) {
1425      throw new IOException("Expected block type " + expectedBlockType + ", " + "but got "
1426        + actualBlockType + ": " + block + ", path=" + path);
1427    }
1428  }
1429
1430  /**
1431   * @return Last key as cell in the file. May be null if file has no entries. Note that this is not
1432   *         the last row key, but it is the Cell representation of the last key
1433   */
1434  @Override
1435  public Optional<Cell> getLastKey() {
1436    return dataBlockIndexReader.isEmpty()
1437      ? Optional.empty()
1438      : Optional.of(fileInfo.getLastKeyCell());
1439  }
1440
1441  /**
1442   * @return Midkey for this file. We work with block boundaries only so returned midkey is an
1443   *         approximation only.
1444   */
1445  @Override
1446  public Optional<Cell> midKey() throws IOException {
1447    return Optional.ofNullable(dataBlockIndexReader.midkey(this));
1448  }
1449
1450  @Override
1451  public void close() throws IOException {
1452    close(cacheConf.shouldEvictOnClose());
1453  }
1454
1455  @Override
1456  public DataBlockEncoding getEffectiveEncodingInCache(boolean isCompaction) {
1457    return dataBlockEncoder.getEffectiveEncodingInCache(isCompaction);
1458  }
1459
1460  /** For testing */
1461  @Override
1462  public HFileBlock.FSReader getUncachedBlockReader() {
1463    return fsBlockReader;
1464  }
1465
1466  /**
1467   * Scanner that operates on encoded data blocks.
1468   */
1469  protected static class EncodedScanner extends HFileScannerImpl {
1470    private final HFileBlockDecodingContext decodingCtx;
1471    private final DataBlockEncoder.EncodedSeeker seeker;
1472    private final DataBlockEncoder dataBlockEncoder;
1473
1474    public EncodedScanner(HFile.Reader reader, boolean cacheBlocks, boolean pread,
1475      boolean isCompaction, HFileContext meta, Configuration conf) {
1476      super(reader, cacheBlocks, pread, isCompaction);
1477      DataBlockEncoding encoding = reader.getDataBlockEncoding();
1478      dataBlockEncoder = encoding.getEncoder();
1479      decodingCtx = dataBlockEncoder.newDataBlockDecodingContext(conf, meta);
1480      seeker = dataBlockEncoder.createSeeker(decodingCtx);
1481    }
1482
1483    @Override
1484    public boolean isSeeked() {
1485      return curBlock != null;
1486    }
1487
1488    @Override
1489    public void setNonSeekedState() {
1490      reset();
1491    }
1492
1493    /**
1494     * Updates the current block to be the given {@link HFileBlock}. Seeks to the the first
1495     * key/value pair.
1496     * @param newBlock the block to make current, and read by {@link HFileReaderImpl#readBlock},
1497     *                 it's a totally new block with new allocated {@link ByteBuff}, so if no
1498     *                 further reference to this block, we should release it carefully.
1499     */
1500    @Override
1501    protected void updateCurrentBlock(HFileBlock newBlock) throws CorruptHFileException {
1502      try {
1503        // sanity checks
1504        if (newBlock.getBlockType() != BlockType.ENCODED_DATA) {
1505          throw new IllegalStateException("EncodedScanner works only on encoded data blocks");
1506        }
1507        short dataBlockEncoderId = newBlock.getDataBlockEncodingId();
1508        if (!DataBlockEncoding.isCorrectEncoder(dataBlockEncoder, dataBlockEncoderId)) {
1509          String encoderCls = dataBlockEncoder.getClass().getName();
1510          throw new CorruptHFileException(
1511            "Encoder " + encoderCls + " doesn't support data block encoding "
1512              + DataBlockEncoding.getNameFromId(dataBlockEncoderId) + ",path=" + reader.getPath());
1513        }
1514        updateCurrBlockRef(newBlock);
1515        ByteBuff encodedBuffer = getEncodedBuffer(newBlock);
1516        seeker.setCurrentBuffer(encodedBuffer);
1517      } finally {
1518        releaseIfNotCurBlock(newBlock);
1519      }
1520      // Reset the next indexed key
1521      this.nextIndexedKey = null;
1522    }
1523
1524    private ByteBuff getEncodedBuffer(HFileBlock newBlock) {
1525      ByteBuff origBlock = newBlock.getBufferReadOnly();
1526      int pos = newBlock.headerSize() + DataBlockEncoding.ID_SIZE;
1527      origBlock.position(pos);
1528      origBlock
1529        .limit(pos + newBlock.getUncompressedSizeWithoutHeader() - DataBlockEncoding.ID_SIZE);
1530      return origBlock.slice();
1531    }
1532
1533    @Override
1534    protected boolean processFirstDataBlock() throws IOException {
1535      seeker.rewind();
1536      return true;
1537    }
1538
1539    @Override
1540    public boolean next() throws IOException {
1541      boolean isValid = seeker.next();
1542      if (!isValid) {
1543        HFileBlock newBlock = readNextDataBlock();
1544        isValid = newBlock != null;
1545        if (isValid) {
1546          updateCurrentBlock(newBlock);
1547        } else {
1548          setNonSeekedState();
1549        }
1550      }
1551      return isValid;
1552    }
1553
1554    @Override
1555    public Cell getKey() {
1556      assertValidSeek();
1557      return seeker.getKey();
1558    }
1559
1560    @Override
1561    public ByteBuffer getValue() {
1562      assertValidSeek();
1563      return seeker.getValueShallowCopy();
1564    }
1565
1566    @Override
1567    public Cell getCell() {
1568      if (this.curBlock == null) {
1569        return null;
1570      }
1571      return seeker.getCell();
1572    }
1573
1574    @Override
1575    public String getKeyString() {
1576      return CellUtil.toString(getKey(), false);
1577    }
1578
1579    @Override
1580    public String getValueString() {
1581      ByteBuffer valueBuffer = getValue();
1582      return ByteBufferUtils.toStringBinary(valueBuffer);
1583    }
1584
1585    private void assertValidSeek() {
1586      if (this.curBlock == null) {
1587        throw new NotSeekedException(reader.getPath());
1588      }
1589    }
1590
1591    @Override
1592    protected Cell getFirstKeyCellInBlock(HFileBlock curBlock) {
1593      return dataBlockEncoder.getFirstKeyCellInBlock(getEncodedBuffer(curBlock));
1594    }
1595
1596    @Override
1597    protected int loadBlockAndSeekToKey(HFileBlock seekToBlock, Cell nextIndexedKey, boolean rewind,
1598      Cell key, boolean seekBefore) throws IOException {
1599      if (this.curBlock == null || this.curBlock.getOffset() != seekToBlock.getOffset()) {
1600        updateCurrentBlock(seekToBlock);
1601      } else if (rewind) {
1602        seeker.rewind();
1603      }
1604      this.nextIndexedKey = nextIndexedKey;
1605      return seeker.seekToKeyInBlock(key, seekBefore);
1606    }
1607
1608    @Override
1609    public int compareKey(CellComparator comparator, Cell key) {
1610      return seeker.compareKey(comparator, key);
1611    }
1612  }
1613
1614  /**
1615   * Returns a buffer with the Bloom filter metadata. The caller takes ownership of the buffer.
1616   */
1617  @Override
1618  public DataInput getGeneralBloomFilterMetadata() throws IOException {
1619    return this.getBloomFilterMetadata(BlockType.GENERAL_BLOOM_META);
1620  }
1621
1622  @Override
1623  public DataInput getDeleteBloomFilterMetadata() throws IOException {
1624    return this.getBloomFilterMetadata(BlockType.DELETE_FAMILY_BLOOM_META);
1625  }
1626
1627  private DataInput getBloomFilterMetadata(BlockType blockType) throws IOException {
1628    if (
1629      blockType != BlockType.GENERAL_BLOOM_META && blockType != BlockType.DELETE_FAMILY_BLOOM_META
1630    ) {
1631      throw new RuntimeException(
1632        "Block Type: " + blockType.toString() + " is not supported, path=" + path);
1633    }
1634
1635    for (HFileBlock b : fileInfo.getLoadOnOpenBlocks()) {
1636      if (b.getBlockType() == blockType) {
1637        return b.getByteStream();
1638      }
1639    }
1640    return null;
1641  }
1642
1643  public boolean isFileInfoLoaded() {
1644    return true; // We load file info in constructor in version 2.
1645  }
1646
1647  @Override
1648  public HFileContext getFileContext() {
1649    return hfileContext;
1650  }
1651
1652  /**
1653   * Returns false if block prefetching was requested for this file and has not completed, true
1654   * otherwise
1655   */
1656  @Override
1657  public boolean prefetchComplete() {
1658    return PrefetchExecutor.isCompleted(path);
1659  }
1660
1661  /**
1662   * Returns true if block prefetching was started after waiting for specified delay, false
1663   * otherwise
1664   */
1665  @Override
1666  public boolean prefetchStarted() {
1667    return PrefetchExecutor.isPrefetchStarted();
1668  }
1669
1670  /**
1671   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1672   * {@link HFileScanner#seekTo(Cell)} to position an start the read. There is nothing to clean up
1673   * in a Scanner. Letting go of your references to the scanner is sufficient. NOTE: Do not use this
1674   * overload of getScanner for compactions. See
1675   * {@link #getScanner(Configuration, boolean, boolean, boolean)}
1676   * @param conf        Store configuration.
1677   * @param cacheBlocks True if we should cache blocks read in by this scanner.
1678   * @param pread       Use positional read rather than seek+read if true (pread is better for
1679   *                    random reads, seek+read is better scanning).
1680   * @return Scanner on this file.
1681   */
1682  @Override
1683  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread) {
1684    return getScanner(conf, cacheBlocks, pread, false);
1685  }
1686
1687  /**
1688   * Create a Scanner on this file. No seeks or reads are done on creation. Call
1689   * {@link HFileScanner#seekTo(Cell)} to position an start the read. There is nothing to clean up
1690   * in a Scanner. Letting go of your references to the scanner is sufficient.
1691   * @param conf         Store configuration.
1692   * @param cacheBlocks  True if we should cache blocks read in by this scanner.
1693   * @param pread        Use positional read rather than seek+read if true (pread is better for
1694   *                     random reads, seek+read is better scanning).
1695   * @param isCompaction is scanner being used for a compaction?
1696   * @return Scanner on this file.
1697   */
1698  @Override
1699  public HFileScanner getScanner(Configuration conf, boolean cacheBlocks, final boolean pread,
1700    final boolean isCompaction) {
1701    if (dataBlockEncoder.useEncodedScanner()) {
1702      return new EncodedScanner(this, cacheBlocks, pread, isCompaction, this.hfileContext, conf);
1703    }
1704    return new HFileScannerImpl(this, cacheBlocks, pread, isCompaction);
1705  }
1706
1707  public int getMajorVersion() {
1708    return 3;
1709  }
1710
1711  @Override
1712  public void unbufferStream() {
1713    fsBlockReader.unbufferStream();
1714  }
1715
1716  protected boolean cacheIfCompactionsOff() {
1717    return (!StoreFileInfo.isReference(name) && !HFileLink.isHFileLink(name))
1718      || !conf.getBoolean(HBASE_REGION_SERVER_ENABLE_COMPACTION, true);
1719  }
1720}