001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.io.hfile; 019 020import java.io.ByteArrayOutputStream; 021import java.io.DataInput; 022import java.io.DataInputStream; 023import java.io.DataOutput; 024import java.io.DataOutputStream; 025import java.io.IOException; 026import java.nio.ByteBuffer; 027import java.util.ArrayList; 028import java.util.Collections; 029import java.util.List; 030import java.util.concurrent.atomic.AtomicReference; 031import org.apache.hadoop.conf.Configuration; 032import org.apache.hadoop.fs.FSDataOutputStream; 033import org.apache.hadoop.hbase.ByteBufferKeyOnlyKeyValue; 034import org.apache.hadoop.hbase.Cell; 035import org.apache.hadoop.hbase.CellComparator; 036import org.apache.hadoop.hbase.CellUtil; 037import org.apache.hadoop.hbase.KeyValue; 038import org.apache.hadoop.hbase.KeyValue.KeyOnlyKeyValue; 039import org.apache.hadoop.hbase.PrivateCellUtil; 040import org.apache.hadoop.hbase.io.HeapSize; 041import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; 042import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader; 043import org.apache.hadoop.hbase.nio.ByteBuff; 044import org.apache.hadoop.hbase.regionserver.KeyValueScanner; 045import org.apache.hadoop.hbase.util.Bytes; 046import org.apache.hadoop.hbase.util.ClassSize; 047import org.apache.hadoop.hbase.util.ObjectIntPair; 048import org.apache.hadoop.io.WritableUtils; 049import org.apache.hadoop.util.StringUtils; 050import org.apache.yetus.audience.InterfaceAudience; 051import org.slf4j.Logger; 052import org.slf4j.LoggerFactory; 053 054/** 055 * Provides functionality to write ({@link BlockIndexWriter}) and read BlockIndexReader single-level 056 * and multi-level block indexes. Examples of how to use the block index writer can be found in 057 * {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter} and {@link HFileWriterImpl}. 058 * Examples of how to use the reader can be found in {@link HFileReaderImpl} and 059 * org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex. 060 */ 061@InterfaceAudience.Private 062public class HFileBlockIndex { 063 064 private static final Logger LOG = LoggerFactory.getLogger(HFileBlockIndex.class); 065 066 static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024; 067 068 /** 069 * The maximum size guideline for index blocks (both leaf, intermediate, and root). If not 070 * specified, <code>DEFAULT_MAX_CHUNK_SIZE</code> is used. 071 */ 072 public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size"; 073 074 /** 075 * Minimum number of entries in a single index block. Even if we are above the 076 * hfile.index.block.max.size we will keep writing to the same block unless we have that many 077 * entries. 
We should have at least a few entries so that we don't have too many levels in the 078 * multi-level index. This should be at least 2 to make sure there is no infinite recursion. 079 */ 080 public static final String MIN_INDEX_NUM_ENTRIES_KEY = "hfile.index.block.min.entries"; 081 082 static final int DEFAULT_MIN_INDEX_NUM_ENTRIES = 16; 083 084 /** 085 * The number of bytes stored in each "secondary index" entry in addition to key bytes in the 086 * non-root index block format. The first long is the file offset of the deeper-level block the 087 * entry points to, and the int that follows is that block's on-disk size without including 088 * header. 089 */ 090 static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG; 091 092 /** 093 * Error message when trying to use inline block API in single-level mode. 094 */ 095 private static final String INLINE_BLOCKS_NOT_ALLOWED = 096 "Inline blocks are not allowed in the single-level-only mode"; 097 098 /** 099 * The size of a meta-data record used for finding the mid-key in a multi-level index. Consists of 100 * the middle leaf-level index block offset (long), its on-disk size without header included 101 * (int), and the mid-key entry's zero-based index in that leaf index block. 102 */ 103 protected static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG + 2 * Bytes.SIZEOF_INT; 104 105 /** 106 * An implementation of the BlockIndexReader that deals with block keys which are plain byte[] 107 * like MetaBlock or the Bloom Block for ROW bloom. Does not need a comparator. It can work on 108 * Bytes.BYTES_RAWCOMPARATOR 109 */ 110 static class ByteArrayKeyBlockIndexReader extends BlockIndexReader { 111 112 private byte[][] blockKeys; 113 114 public ByteArrayKeyBlockIndexReader(final int treeLevel) { 115 // Can be null for METAINDEX block 116 searchTreeLevel = treeLevel; 117 } 118 119 @Override 120 protected long calculateHeapSizeForBlockKeys(long heapSize) { 121 // Calculating the size of blockKeys 122 if (blockKeys != null) { 123 heapSize += ClassSize.REFERENCE; 124 // Adding array + references overhead 125 heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE); 126 127 // Adding bytes 128 for (byte[] key : blockKeys) { 129 heapSize += ClassSize.align(ClassSize.ARRAY + key.length); 130 } 131 } 132 return heapSize; 133 } 134 135 @Override 136 public boolean isEmpty() { 137 return blockKeys.length == 0; 138 } 139 140 /** 141 * from 0 to {@link #getRootBlockCount() - 1} 142 */ 143 public byte[] getRootBlockKey(int i) { 144 return blockKeys[i]; 145 } 146 147 @Override 148 public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, 149 boolean cacheBlocks, boolean pread, boolean isCompaction, 150 DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader) 151 throws IOException { 152 // this would not be needed 153 return null; 154 } 155 156 @Override 157 public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException { 158 // Not needed here 159 return null; 160 } 161 162 @Override 163 protected void initialize(int numEntries) { 164 blockKeys = new byte[numEntries][]; 165 } 166 167 @Override 168 protected void add(final byte[] key, final long offset, final int dataSize) { 169 blockOffsets[rootCount] = offset; 170 blockKeys[rootCount] = key; 171 blockDataSizes[rootCount] = dataSize; 172 rootCount++; 173 } 174 175 @Override 176 public int rootBlockContainingKey(byte[] key, int offset, int length, CellComparator comp) { 177 int pos = 
Bytes.binarySearch(blockKeys, key, offset, length); 178 // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see 179 // binarySearch's javadoc. 180 181 if (pos >= 0) { 182 // This means this is an exact match with an element of blockKeys. 183 assert pos < blockKeys.length; 184 return pos; 185 } 186 187 // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i], 188 // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that 189 // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if 190 // key < blockKeys[0], meaning the file does not contain the given key. 191 192 int i = -pos - 1; 193 assert 0 <= i && i <= blockKeys.length; 194 return i - 1; 195 } 196 197 @Override 198 public int rootBlockContainingKey(Cell key) { 199 // Should not be called on this because here it deals only with byte[] 200 throw new UnsupportedOperationException( 201 "Cannot search for a key that is of Cell type. Only plain byte array keys " 202 + "can be searched for"); 203 } 204 205 @Override 206 public String toString() { 207 StringBuilder sb = new StringBuilder(); 208 sb.append("size=" + rootCount).append("\n"); 209 for (int i = 0; i < rootCount; i++) { 210 sb.append("key=").append(KeyValue.keyToString(blockKeys[i])).append("\n offset=") 211 .append(blockOffsets[i]).append(", dataSize=" + blockDataSizes[i]).append("\n"); 212 } 213 return sb.toString(); 214 } 215 } 216 217 /** 218 * An implementation of the BlockIndexReader that deals with block keys which are the key part of 219 * a cell like the Data block index or the ROW_COL bloom blocks This needs a comparator to work 220 * with the Cells 221 */ 222 static class CellBasedKeyBlockIndexReader extends BlockIndexReader { 223 224 private Cell[] blockKeys; 225 /** Pre-computed mid-key */ 226 private AtomicReference<Cell> midKey = new AtomicReference<>(); 227 /** Needed doing lookup on blocks. */ 228 protected CellComparator comparator; 229 230 public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel) { 231 // Can be null for METAINDEX block 232 comparator = c; 233 searchTreeLevel = treeLevel; 234 } 235 236 @Override 237 protected long calculateHeapSizeForBlockKeys(long heapSize) { 238 if (blockKeys != null) { 239 heapSize += ClassSize.REFERENCE; 240 // Adding array + references overhead 241 heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE); 242 243 // Adding blockKeys 244 for (Cell key : blockKeys) { 245 heapSize += ClassSize.align(key.heapSize()); 246 } 247 } 248 // Add comparator and the midkey atomicreference 249 heapSize += 2 * ClassSize.REFERENCE; 250 return heapSize; 251 } 252 253 @Override 254 public boolean isEmpty() { 255 return blockKeys.length == 0; 256 } 257 258 /** 259 * from 0 to {@link #getRootBlockCount() - 1} 260 */ 261 public Cell getRootBlockKey(int i) { 262 return blockKeys[i]; 263 } 264 265 @Override 266 public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, 267 boolean cacheBlocks, boolean pread, boolean isCompaction, 268 DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader) 269 throws IOException { 270 int rootLevelIndex = rootBlockContainingKey(key); 271 if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) { 272 return null; 273 } 274 275 // the next indexed key 276 Cell nextIndexedKey = null; 277 278 // Read the next-level (intermediate or leaf) index block. 
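// (Added walk-through, derived from the code below rather than the original comments.) With a
// three-level index, i.e. searchTreeLevel == 3, the loop below reads an INTERMEDIATE_INDEX block
// while lookupLevel is 1, a LEAF_INDEX block while it is 2, and the DATA block while it is 3,
// at which point it breaks out and verifies that lookupLevel == searchTreeLevel.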
279 long currentOffset = blockOffsets[rootLevelIndex]; 280 int currentOnDiskSize = blockDataSizes[rootLevelIndex]; 281 282 if (rootLevelIndex < blockKeys.length - 1) { 283 nextIndexedKey = blockKeys[rootLevelIndex + 1]; 284 } else { 285 nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY; 286 } 287 288 int lookupLevel = 1; // How many levels deep we are in our lookup. 289 int index = -1; 290 291 HFileBlock block = null; 292 KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue(); 293 while (true) { 294 try { 295 // Must initialize it with null here, because if don't and once an exception happen in 296 // readBlock, then we'll release the previous assigned block twice in the finally block. 297 // (See HBASE-22422) 298 block = null; 299 if (currentBlock != null && currentBlock.getOffset() == currentOffset) { 300 // Avoid reading the same block again, even with caching turned off. 301 // This is crucial for compaction-type workload which might have 302 // caching turned off. This is like a one-block cache inside the 303 // scanner. 304 block = currentBlock; 305 } else { 306 // Call HFile's caching block reader API. We always cache index 307 // blocks, otherwise we might get terrible performance. 308 boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel); 309 BlockType expectedBlockType; 310 if (lookupLevel < searchTreeLevel - 1) { 311 expectedBlockType = BlockType.INTERMEDIATE_INDEX; 312 } else if (lookupLevel == searchTreeLevel - 1) { 313 expectedBlockType = BlockType.LEAF_INDEX; 314 } else { 315 // this also accounts for ENCODED_DATA 316 expectedBlockType = BlockType.DATA; 317 } 318 block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache, 319 pread, isCompaction, true, expectedBlockType, expectedDataBlockEncoding); 320 } 321 322 if (block == null) { 323 throw new IOException("Failed to read block at offset " + currentOffset 324 + ", onDiskSize=" + currentOnDiskSize); 325 } 326 327 // Found a data block, break the loop and check our level in the tree. 328 if (block.getBlockType().isData()) { 329 break; 330 } 331 332 // Not a data block. This must be a leaf-level or intermediate-level 333 // index block. We don't allow going deeper than searchTreeLevel. 334 if (++lookupLevel > searchTreeLevel) { 335 throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel 336 + ", searchTreeLevel=" + searchTreeLevel); 337 } 338 339 // Locate the entry corresponding to the given key in the non-root 340 // (leaf or intermediate-level) index block. 
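// (Added note.) locateNonRootIndexEntry() leaves 'buffer' positioned at the matching
// (offset, onDiskSize, firstKey) entry, so the getLong()/getInt() calls below read the child
// block's file offset and its on-disk size (header not included) for the next iteration.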
341 ByteBuff buffer = block.getBufferWithoutHeader(); 342 index = locateNonRootIndexEntry(buffer, key, comparator); 343 if (index == -1) { 344 // This has to be changed 345 // For now change this to key value 346 throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the" 347 + " first key of the non-root index block " + block); 348 } 349 350 currentOffset = buffer.getLong(); 351 currentOnDiskSize = buffer.getInt(); 352 353 // Only update next indexed key if there is a next indexed key in the current level 354 byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1); 355 if (nonRootIndexedKey != null) { 356 tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length); 357 nextIndexedKey = tmpNextIndexKV; 358 } 359 } finally { 360 if (block != null && !block.getBlockType().isData()) { 361 // Release the block immediately if it is not the data block 362 block.release(); 363 } 364 } 365 } 366 367 if (lookupLevel != searchTreeLevel) { 368 assert block.getBlockType().isData(); 369 // Though we have retrieved a data block we have found an issue 370 // in the retrieved data block. Hence returned the block so that 371 // the ref count can be decremented 372 if (block != null) { 373 block.release(); 374 } 375 throw new IOException("Reached a data block at level " + lookupLevel 376 + " but the number of levels is " + searchTreeLevel); 377 } 378 379 // set the next indexed key for the current block. 380 return new BlockWithScanInfo(block, nextIndexedKey); 381 } 382 383 @Override 384 public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException { 385 if (rootCount == 0) throw new IOException("HFile empty"); 386 387 Cell targetMidKey = this.midKey.get(); 388 if (targetMidKey != null) { 389 return targetMidKey; 390 } 391 392 if (midLeafBlockOffset >= 0) { 393 if (cachingBlockReader == null) { 394 throw new IOException( 395 "Have to read the middle leaf block but " + "no block reader available"); 396 } 397 398 // Caching, using pread, assuming this is not a compaction. 399 HFileBlock midLeafBlock = cachingBlockReader.readBlock(midLeafBlockOffset, 400 midLeafBlockOnDiskSize, true, true, false, true, BlockType.LEAF_INDEX, null); 401 try { 402 byte[] bytes = getNonRootIndexedKey(midLeafBlock.getBufferWithoutHeader(), midKeyEntry); 403 assert bytes != null; 404 targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length); 405 } finally { 406 midLeafBlock.release(); 407 } 408 } else { 409 // The middle of the root-level index. 410 targetMidKey = blockKeys[rootCount / 2]; 411 } 412 413 this.midKey.set(targetMidKey); 414 return targetMidKey; 415 } 416 417 @Override 418 protected void initialize(int numEntries) { 419 blockKeys = new Cell[numEntries]; 420 } 421 422 /** 423 * Adds a new entry in the root block index. Only used when reading. 
424 * @param key Last key in the block 425 * @param offset file offset where the block is stored 426 * @param dataSize the uncompressed data size 427 */ 428 @Override 429 protected void add(final byte[] key, final long offset, final int dataSize) { 430 blockOffsets[rootCount] = offset; 431 // Create the blockKeys as Cells once when the reader is opened 432 blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length); 433 blockDataSizes[rootCount] = dataSize; 434 rootCount++; 435 } 436 437 @Override 438 public int rootBlockContainingKey(final byte[] key, int offset, int length, 439 CellComparator comp) { 440 // This should always be called with Cell not with a byte[] key 441 throw new UnsupportedOperationException("Cannot find for a key containing plain byte " 442 + "array. Only cell based keys can be searched for"); 443 } 444 445 @Override 446 public int rootBlockContainingKey(Cell key) { 447 // Here the comparator should not be null as this happens for the root-level block 448 int pos = Bytes.binarySearch(blockKeys, key, comparator); 449 // pos is between -(blockKeys.length + 1) to blockKeys.length - 1, see 450 // binarySearch's javadoc. 451 452 if (pos >= 0) { 453 // This means this is an exact match with an element of blockKeys. 454 assert pos < blockKeys.length; 455 return pos; 456 } 457 458 // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i], 459 // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that 460 // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if 461 // key < blockKeys[0], meaning the file does not contain the given key. 462 463 int i = -pos - 1; 464 assert 0 <= i && i <= blockKeys.length; 465 return i - 1; 466 } 467 468 @Override 469 public String toString() { 470 StringBuilder sb = new StringBuilder(); 471 sb.append("size=" + rootCount).append("\n"); 472 for (int i = 0; i < rootCount; i++) { 473 sb.append("key=").append((blockKeys[i])).append("\n offset=").append(blockOffsets[i]) 474 .append(", dataSize=" + blockDataSizes[i]).append("\n"); 475 } 476 return sb.toString(); 477 } 478 } 479 480 static class CellBasedKeyBlockIndexReaderV2 extends CellBasedKeyBlockIndexReader { 481 482 private HFileIndexBlockEncoder indexBlockEncoder; 483 484 private HFileIndexBlockEncoder.EncodedSeeker seeker; 485 486 public CellBasedKeyBlockIndexReaderV2(final CellComparator c, final int treeLevel) { 487 this(c, treeLevel, null); 488 } 489 490 public CellBasedKeyBlockIndexReaderV2(final CellComparator c, final int treeLevel, 491 HFileIndexBlockEncoder indexBlockEncoder) { 492 super(c, treeLevel); 493 // Can be null for METAINDEX block 494 this.indexBlockEncoder = 495 indexBlockEncoder != null ? 
indexBlockEncoder : NoOpIndexBlockEncoder.INSTANCE; 496 } 497 498 @Override 499 public boolean isEmpty() { 500 return seeker.isEmpty(); 501 } 502 503 @Override 504 public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, 505 boolean cacheBlocks, boolean pread, boolean isCompaction, 506 DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader) 507 throws IOException { 508 return seeker.loadDataBlockWithScanInfo(key, currentBlock, cacheBlocks, pread, isCompaction, 509 expectedDataBlockEncoding, cachingBlockReader); 510 } 511 512 @Override 513 public Cell midkey(CachingBlockReader cachingBlockReader) throws IOException { 514 return seeker.midkey(cachingBlockReader); 515 } 516 517 /** 518 * from 0 to {@link #getRootBlockCount() - 1} 519 */ 520 public Cell getRootBlockKey(int i) { 521 return seeker.getRootBlockKey(i); 522 } 523 524 @Override 525 public int getRootBlockCount() { 526 return seeker.getRootBlockCount(); 527 } 528 529 @Override 530 public int rootBlockContainingKey(Cell key) { 531 return seeker.rootBlockContainingKey(key); 532 } 533 534 @Override 535 protected long calculateHeapSizeForBlockKeys(long heapSize) { 536 heapSize = super.calculateHeapSizeForBlockKeys(heapSize); 537 if (seeker != null) { 538 heapSize += ClassSize.REFERENCE; 539 heapSize += ClassSize.align(seeker.heapSize()); 540 } 541 return heapSize; 542 } 543 544 @Override 545 public void readMultiLevelIndexRoot(HFileBlock blk, final int numEntries) throws IOException { 546 seeker = indexBlockEncoder.createSeeker(); 547 seeker.initRootIndex(blk, numEntries, comparator, searchTreeLevel); 548 } 549 550 @Override 551 public String toString() { 552 return seeker.toString(); 553 } 554 } 555 556 /** 557 * The reader will always hold the root level index in the memory. Index blocks at all other 558 * levels will be cached in the LRU cache in practice, although this API does not enforce that. 559 * <p> 560 * All non-root (leaf and intermediate) index blocks contain what we call a "secondary index": an 561 * array of offsets to the entries within the block. This allows us to do binary search for the 562 * entry corresponding to the given key without having to deserialize the block. 563 */ 564 static abstract class BlockIndexReader implements HeapSize { 565 566 protected long[] blockOffsets; 567 protected int[] blockDataSizes; 568 protected int rootCount = 0; 569 570 // Mid-key metadata. 571 protected long midLeafBlockOffset = -1; 572 protected int midLeafBlockOnDiskSize = -1; 573 protected int midKeyEntry = -1; 574 575 /** 576 * The number of levels in the block index tree. One if there is only root level, two for root 577 * and leaf levels, etc. 578 */ 579 protected int searchTreeLevel; 580 581 /** Returns true if the block index is empty. */ 582 public abstract boolean isEmpty(); 583 584 /** 585 * Verifies that the block index is non-empty and throws an {@link IllegalStateException} 586 * otherwise. 587 */ 588 public void ensureNonEmpty() { 589 if (isEmpty()) { 590 throw new IllegalStateException("Block index is empty or not loaded"); 591 } 592 } 593 594 /** 595 * Return the data block which contains this key. This function will only be called when the 596 * HFile version is larger than 1. 
597 * @param key the key we are looking for 598 * @param currentBlock the current block, to avoid re-reading the same block 599 * @param expectedDataBlockEncoding the data block encoding the caller is expecting the data 600 * block to be in, or null to not perform this check and return 601 * the block irrespective of the encoding 602 * @return reader a basic way to load blocks 603 */ 604 public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks, 605 boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding, 606 CachingBlockReader cachingBlockReader) throws IOException { 607 BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock, 608 cacheBlocks, pread, isCompaction, expectedDataBlockEncoding, cachingBlockReader); 609 if (blockWithScanInfo == null) { 610 return null; 611 } else { 612 return blockWithScanInfo.getHFileBlock(); 613 } 614 } 615 616 /** 617 * Return the BlockWithScanInfo, a data structure which contains the Data HFileBlock with other 618 * scan info such as the key that starts the next HFileBlock. This function will only be called 619 * when the HFile version is larger than 1. 620 * @param key the key we are looking for 621 * @param currentBlock the current block, to avoid re-reading the same block 622 * @param expectedDataBlockEncoding the data block encoding the caller is expecting the data 623 * block to be in, or null to not perform this check and return 624 * the block irrespective of the encoding. 625 * @return the BlockWithScanInfo which contains the DataBlock with other scan info such as 626 * nextIndexedKey. 627 */ 628 public abstract BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, 629 boolean cacheBlocks, boolean pread, boolean isCompaction, 630 DataBlockEncoding expectedDataBlockEncoding, CachingBlockReader cachingBlockReader) 631 throws IOException; 632 633 /** 634 * An approximation to the {@link HFile}'s mid-key. Operates on block boundaries, and does not 635 * go inside blocks. In other words, returns the first key of the middle block of the file. 636 * @return the first key of the middle block 637 */ 638 public abstract Cell midkey(CachingBlockReader cachingBlockReader) throws IOException; 639 640 /** 641 * @param i from 0 to {@link #getRootBlockCount() - 1} 642 */ 643 public long getRootBlockOffset(int i) { 644 return blockOffsets[i]; 645 } 646 647 /** 648 * @param i zero-based index of a root-level block 649 * @return the on-disk size of the root-level block for version 2, or the uncompressed size for 650 * version 1 651 */ 652 public int getRootBlockDataSize(int i) { 653 return blockDataSizes[i]; 654 } 655 656 /** Returns the number of root-level blocks in this block index */ 657 public int getRootBlockCount() { 658 return rootCount; 659 } 660 661 /** 662 * Finds the root-level index block containing the given key. Key to find the comparator to be 663 * used 664 * @return Offset of block containing <code>key</code> (between 0 and the number of blocks - 1) 665 * or -1 if this file does not contain the request. 666 */ 667 // When we want to find the meta index block or bloom block for ROW bloom 668 // type Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case we need the 669 // CellComparator. 670 public abstract int rootBlockContainingKey(final byte[] key, int offset, int length, 671 CellComparator comp); 672 673 /** 674 * Finds the root-level index block containing the given key. 
Key to find 675 * @return Offset of block containing <code>key</code> (between 0 and the number of blocks - 1) 676 * or -1 if this file does not contain the request. 677 */ 678 // When we want to find the meta index block or bloom block for ROW bloom 679 // type 680 // Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case we 681 // need the CellComparator. 682 public int rootBlockContainingKey(final byte[] key, int offset, int length) { 683 return rootBlockContainingKey(key, offset, length, null); 684 } 685 686 /** 687 * Finds the root-level index block containing the given key. Key to find 688 */ 689 public abstract int rootBlockContainingKey(final Cell key); 690 691 /** 692 * The indexed key at the ith position in the nonRootIndex. The position starts at 0. 693 * @param i the ith position 694 * @return The indexed key at the ith position in the nonRootIndex. 695 */ 696 static byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) { 697 int numEntries = nonRootIndex.getInt(0); 698 if (i < 0 || i >= numEntries) { 699 return null; 700 } 701 702 // Entries start after the number of entries and the secondary index. 703 // The secondary index takes numEntries + 1 ints. 704 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); 705 // Targetkey's offset relative to the end of secondary index 706 int targetKeyRelOffset = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 1)); 707 708 // The offset of the target key in the blockIndex buffer 709 int targetKeyOffset = entriesOffset // Skip secondary index 710 + targetKeyRelOffset // Skip all entries until mid 711 + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size 712 713 // We subtract the two consecutive secondary index elements, which 714 // gives us the size of the whole (offset, onDiskSize, key) tuple. We 715 // then need to subtract the overhead of offset and onDiskSize. 716 int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) - targetKeyRelOffset 717 - SECONDARY_INDEX_ENTRY_OVERHEAD; 718 719 // TODO check whether we can make BB backed Cell here? So can avoid bytes copy. 720 return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength); 721 } 722 723 /** 724 * Performs a binary search over a non-root level index block. Utilizes the secondary index, 725 * which records the offsets of (offset, onDiskSize, firstKey) tuples of all entries. the key we 726 * are searching for offsets to individual entries in the blockIndex buffer the non-root index 727 * block buffer, starting with the secondary index. The position is ignored. 728 * @return the index i in [0, numEntries - 1] such that keys[i] <= key < keys[i + 1], if keys is 729 * the array of all keys being searched, or -1 otherwise 730 */ 731 static int binarySearchNonRootIndex(Cell key, ByteBuff nonRootIndex, 732 CellComparator comparator) { 733 734 int numEntries = nonRootIndex.getIntAfterPosition(0); 735 int low = 0; 736 int high = numEntries - 1; 737 int mid = 0; 738 739 // Entries start after the number of entries and the secondary index. 740 // The secondary index takes numEntries + 1 ints. 741 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); 742 743 // If we imagine that keys[-1] = -Infinity and 744 // keys[numEntries] = Infinity, then we are maintaining an invariant that 745 // keys[low - 1] < key < keys[high + 1] while narrowing down the range. 
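// (Added worked example with hypothetical numbers.) For numEntries = 3, the secondary index is
// the (numEntries + 1) = 4 ints that follow the entry count, so the entries themselves start at
// entriesOffset = SIZEOF_INT * (3 + 2) = 20. For mid = 1, the relative offset r is read at offset
// SIZEOF_INT * (mid + 1) = 8 from the start of the non-root block body; the key bytes then start
// at 20 + r + SECONDARY_INDEX_ENTRY_OVERHEAD (= 12), and the key length is the next relative
// offset minus r minus that same 12-byte overhead.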
746 ByteBufferKeyOnlyKeyValue nonRootIndexkeyOnlyKV = new ByteBufferKeyOnlyKeyValue(); 747 ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<>(); 748 while (low <= high) { 749 mid = low + ((high - low) >> 1); 750 751 // Midkey's offset relative to the end of secondary index 752 int midKeyRelOffset = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 1)); 753 754 // The offset of the middle key in the blockIndex buffer 755 int midKeyOffset = entriesOffset // Skip secondary index 756 + midKeyRelOffset // Skip all entries until mid 757 + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size 758 759 // We subtract the two consecutive secondary index elements, which 760 // gives us the size of the whole (offset, onDiskSize, key) tuple. We 761 // then need to subtract the overhead of offset and onDiskSize. 762 int midLength = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 2)) 763 - midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD; 764 765 // we have to compare in this order, because the comparator order 766 // has special logic when the 'left side' is a special key. 767 // TODO make KeyOnlyKeyValue to be Buffer backed and avoid array() call. This has to be 768 // done after HBASE-12224 & HBASE-12282 769 // TODO avoid array call. 770 nonRootIndex.asSubByteBuffer(midKeyOffset, midLength, pair); 771 nonRootIndexkeyOnlyKV.setKey(pair.getFirst(), pair.getSecond(), midLength); 772 int cmp = PrivateCellUtil.compareKeyIgnoresMvcc(comparator, key, nonRootIndexkeyOnlyKV); 773 774 // key lives above the midpoint 775 if (cmp > 0) low = mid + 1; // Maintain the invariant that keys[low - 1] < key 776 // key lives below the midpoint 777 else if (cmp < 0) high = mid - 1; // Maintain the invariant that key < keys[high + 1] 778 else return mid; // exact match 779 } 780 781 // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning 782 // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break 783 // condition, low >= high + 1. Therefore, low = high + 1. 784 785 if (low != high + 1) { 786 throw new IllegalStateException( 787 "Binary search broken: low=" + low + " " + "instead of " + (high + 1)); 788 } 789 790 // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to 791 // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1. 792 int i = low - 1; 793 794 // Some extra validation on the result. 795 if (i < -1 || i >= numEntries) { 796 throw new IllegalStateException("Binary search broken: result is " + i 797 + " but expected to be between -1 and (numEntries - 1) = " + (numEntries - 1)); 798 } 799 800 return i; 801 } 802 803 /** 804 * Search for one key using the secondary index in a non-root block. In case of success, 805 * positions the provided buffer at the entry of interest, where the file offset and the 806 * on-disk-size can be read. a non-root block without header. Initial position does not matter. 807 * the byte array containing the key 808 * @return the index position where the given key was found, otherwise return -1 in the case the 809 * given key is before the first key. 810 */ 811 static int locateNonRootIndexEntry(ByteBuff nonRootBlock, Cell key, CellComparator comparator) { 812 int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator); 813 814 if (entryIndex != -1) { 815 int numEntries = nonRootBlock.getIntAfterPosition(0); 816 817 // The end of secondary index and the beginning of entries themselves. 
818 int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); 819 820 // The offset of the entry we are interested in relative to the end of 821 // the secondary index. 822 int entryRelOffset = nonRootBlock.getIntAfterPosition(Bytes.SIZEOF_INT * (1 + entryIndex)); 823 824 nonRootBlock.position(entriesOffset + entryRelOffset); 825 } 826 827 return entryIndex; 828 } 829 830 /** 831 * Read in the root-level index from the given input stream. Must match what was written into 832 * the root level by {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the offset 833 * that function returned. 834 * @param in the buffered input stream or wrapped byte input stream 835 * @param numEntries the number of root-level index entries 836 */ 837 public void readRootIndex(DataInput in, final int numEntries) throws IOException { 838 blockOffsets = new long[numEntries]; 839 initialize(numEntries); 840 blockDataSizes = new int[numEntries]; 841 842 // If index size is zero, no index was written. 843 if (numEntries > 0) { 844 for (int i = 0; i < numEntries; ++i) { 845 long offset = in.readLong(); 846 int dataSize = in.readInt(); 847 byte[] key = Bytes.readByteArray(in); 848 add(key, offset, dataSize); 849 } 850 } 851 } 852 853 protected abstract void initialize(int numEntries); 854 855 protected abstract void add(final byte[] key, final long offset, final int dataSize); 856 857 /** 858 * Read in the root-level index from the given input stream. Must match what was written into 859 * the root level by {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the offset 860 * that function returned. 861 * @param blk the HFile block 862 * @param numEntries the number of root-level index entries 863 * @return the buffered input stream or wrapped byte input stream 864 */ 865 public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException { 866 DataInputStream in = blk.getByteStream(); 867 readRootIndex(in, numEntries); 868 return in; 869 } 870 871 /** 872 * Read the root-level metadata of a multi-level block index. Based on 873 * {@link #readRootIndex(DataInput, int)}, but also reads metadata necessary to compute the 874 * mid-key in a multi-level index. 875 * @param blk the HFile block 876 * @param numEntries the number of root-level index entries 877 */ 878 public void readMultiLevelIndexRoot(HFileBlock blk, final int numEntries) throws IOException { 879 DataInputStream in = readRootIndex(blk, numEntries); 880 // HFileBlock.getByteStream() returns a byte stream for reading the data(excluding checksum) 881 // of root index block, so after reading the root index there is no need to subtract the 882 // checksum bytes. 883 if (in.available() < MID_KEY_METADATA_SIZE) { 884 // No mid-key metadata available. 885 return; 886 } 887 midLeafBlockOffset = in.readLong(); 888 midLeafBlockOnDiskSize = in.readInt(); 889 midKeyEntry = in.readInt(); 890 } 891 892 @Override 893 public long heapSize() { 894 // The BlockIndexReader does not have the blockKey, comparator and the midkey atomic reference 895 long heapSize = 896 ClassSize.align(3 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT + ClassSize.OBJECT); 897 898 // Mid-key metadata. 
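// (Added note.) MID_KEY_METADATA_SIZE accounts for the three mid-key fields held by this reader:
// midLeafBlockOffset (long), midLeafBlockOnDiskSize (int) and midKeyEntry (int).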
899 heapSize += MID_KEY_METADATA_SIZE; 900 901 heapSize = calculateHeapSizeForBlockKeys(heapSize); 902 903 if (blockOffsets != null) { 904 heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length * Bytes.SIZEOF_LONG); 905 } 906 907 if (blockDataSizes != null) { 908 heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length * Bytes.SIZEOF_INT); 909 } 910 911 return ClassSize.align(heapSize); 912 } 913 914 protected abstract long calculateHeapSizeForBlockKeys(long heapSize); 915 } 916 917 /** 918 * Writes the block index into the output stream. Generates the tree from the bottom up. The leaf level 919 * is written to disk as a sequence of inline blocks, if it is larger than a certain number of 920 * bytes. If the leaf level is not large enough, we write all entries to the root level instead. 921 * After all leaf blocks have been written, we end up with an index referencing the resulting leaf 922 * index blocks. If that index is larger than the allowed root index size, the writer will break 923 * it up into reasonable-size intermediate-level index block chunks, write those chunks out, and 924 * create another index referencing those chunks. This will be repeated until the remaining index 925 * is small enough to become the root index. However, in most practical cases we will only have 926 * leaf-level blocks and the root index, or just the root index. 927 */ 928 public static class BlockIndexWriter implements InlineBlockWriter { 929 /** 930 * While the index is being written, this represents the current block index referencing all 931 * leaf blocks, with one exception. If the file is being closed and there are not enough blocks 932 * to complete even a single leaf block, no leaf blocks get written and this contains the entire 933 * block index. After all levels of the index have been written by 934 * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final root-level index. 935 */ 936 private BlockIndexChunk rootChunk = new BlockIndexChunkImpl(); 937 938 /** 939 * Current leaf-level chunk. New entries referencing data blocks get added to this chunk until 940 * it grows large enough to be written to disk. 941 */ 942 private BlockIndexChunk curInlineChunk = new BlockIndexChunkImpl(); 943 944 /** 945 * The number of block index levels. This is one if there is only a root level (even empty), two 946 * if there are a leaf level and a root level, and is higher if there are intermediate levels. This is 947 * only final after {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The initial 948 * value accounts for the root level, and will be increased to two as soon as we find out there 949 * is a leaf-level in {@link #blockWritten(long, int, int)}. 950 */ 951 private int numLevels = 1; 952 953 private HFileBlock.Writer blockWriter; 954 private byte[] firstKey = null; 955 956 /** 957 * The total number of leaf-level entries, i.e. entries referenced by leaf-level blocks. For the 958 * data block index this is equal to the number of data blocks. 959 */ 960 private long totalNumEntries; 961 962 /** Total compressed size of all index blocks. */ 963 private long totalBlockOnDiskSize; 964 965 /** Total uncompressed size of all index blocks. */ 966 private long totalBlockUncompressedSize; 967 968 /** The maximum size guideline of all multi-level index blocks. */ 969 private int maxChunkSize; 970 971 /** The minimum number of entries in a single index block. */ 972 private int minIndexNumEntries; 973 974 /** Whether we require this block index to always be single-level.
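* Set by the no-argument constructor; such a writer rejects the inline-block API (see
* {@link #shouldWriteBlock(boolean)}) and is written out with
* {@link #writeSingleLevelIndex(DataOutput, String)} instead.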
*/ 975 private boolean singleLevelOnly; 976 977 /** CacheConfig, or null if cache-on-write is disabled */ 978 private CacheConfig cacheConf; 979 980 /** Name to use for computing cache keys */ 981 private String nameForCaching; 982 983 /** Type of encoding used for index blocks in HFile */ 984 private HFileIndexBlockEncoder indexBlockEncoder; 985 986 /** Creates a single-level block index writer */ 987 public BlockIndexWriter() { 988 this(null, null, null, null); 989 singleLevelOnly = true; 990 } 991 992 /** 993 * Creates a multi-level block index writer. 994 * @param blockWriter the block writer to use to write index blocks 995 * @param cacheConf used to determine when and how a block should be cached-on-write. 996 */ 997 public BlockIndexWriter(HFileBlock.Writer blockWriter, CacheConfig cacheConf, 998 String nameForCaching, HFileIndexBlockEncoder indexBlockEncoder) { 999 if ((cacheConf == null) != (nameForCaching == null)) { 1000 throw new IllegalArgumentException( 1001 "Block cache and file name for " + "caching must be both specified or both null"); 1002 } 1003 1004 this.blockWriter = blockWriter; 1005 this.cacheConf = cacheConf; 1006 this.nameForCaching = nameForCaching; 1007 this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE; 1008 this.minIndexNumEntries = HFileBlockIndex.DEFAULT_MIN_INDEX_NUM_ENTRIES; 1009 this.indexBlockEncoder = 1010 indexBlockEncoder != null ? indexBlockEncoder : NoOpIndexBlockEncoder.INSTANCE; 1011 } 1012 1013 public void setMaxChunkSize(int maxChunkSize) { 1014 if (maxChunkSize <= 0) { 1015 throw new IllegalArgumentException("Invalid maximum index block size"); 1016 } 1017 this.maxChunkSize = maxChunkSize; 1018 } 1019 1020 public void setMinIndexNumEntries(int minIndexNumEntries) { 1021 if (minIndexNumEntries <= 1) { 1022 throw new IllegalArgumentException("Invalid maximum index level, should be >= 2"); 1023 } 1024 this.minIndexNumEntries = minIndexNumEntries; 1025 } 1026 1027 /** 1028 * Writes the root level and intermediate levels of the block index into the output stream, 1029 * generating the tree from bottom up. Assumes that the leaf level has been inline-written to 1030 * the disk if there is enough data for more than one leaf block. We iterate by breaking the 1031 * current level of the block index, starting with the index of all leaf-level blocks, into 1032 * chunks small enough to be written to disk, and generate its parent level, until we end up 1033 * with a level small enough to become the root level. If the leaf level is not large enough, 1034 * there is no inline block index anymore, so we only write that level of block index to disk as 1035 * the root level. 1036 * @param out FSDataOutputStream 1037 * @return position at which we entered the root-level index. 1038 */ 1039 public long writeIndexBlocks(FSDataOutputStream out) throws IOException { 1040 if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) { 1041 throw new IOException("Trying to write a multi-level block index, " + "but are " 1042 + curInlineChunk.getNumEntries() + " entries in the " + "last inline chunk."); 1043 } 1044 1045 // We need to get mid-key metadata before we create intermediate 1046 // indexes and overwrite the root chunk. 1047 byte[] midKeyMetadata = numLevels > 1 ? 
rootChunk.getMidKeyMetadata() : null; 1048 1049 if (curInlineChunk != null) { 1050 while ( 1051 rootChunk.getRootSize() > maxChunkSize 1052 // HBASE-16288: if firstKey is larger than maxChunkSize we will loop indefinitely 1053 && rootChunk.getNumEntries() > minIndexNumEntries 1054 // Sanity check. We will not hit this (minIndexNumEntries ^ 16) blocks can be addressed 1055 && numLevels < 16 1056 ) { 1057 rootChunk = writeIntermediateLevel(out, rootChunk); 1058 numLevels += 1; 1059 } 1060 } 1061 1062 // write the root level 1063 long rootLevelIndexPos = out.getPos(); 1064 1065 { 1066 DataOutput blockStream = blockWriter.startWriting(BlockType.ROOT_INDEX); 1067 indexBlockEncoder.encode(rootChunk, true, blockStream); 1068 if (midKeyMetadata != null) blockStream.write(midKeyMetadata); 1069 blockWriter.writeHeaderAndData(out); 1070 if (cacheConf != null) { 1071 cacheConf.getBlockCache().ifPresent(cache -> { 1072 HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf); 1073 cache.cacheBlock(new BlockCacheKey(nameForCaching, rootLevelIndexPos, true, 1074 blockForCaching.getBlockType()), blockForCaching); 1075 }); 1076 } 1077 } 1078 1079 // Add root index block size 1080 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader(); 1081 totalBlockUncompressedSize += blockWriter.getUncompressedSizeWithoutHeader(); 1082 1083 if (LOG.isTraceEnabled()) { 1084 LOG.trace("Wrote a " + numLevels + "-level index with root level at pos " 1085 + rootLevelIndexPos + ", " + rootChunk.getNumEntries() + " root-level entries, " 1086 + totalNumEntries + " total entries, " 1087 + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) + " on-disk size, " 1088 + StringUtils.humanReadableInt(totalBlockUncompressedSize) + " total uncompressed size."); 1089 } 1090 return rootLevelIndexPos; 1091 } 1092 1093 /** 1094 * Writes the block index data as a single level only. Does not do any block framing. 1095 * @param out the buffered output stream to write the index to. Typically a stream 1096 * writing into an {@link HFile} block. 1097 * @param description a short description of the index being written. Used in a log message. 1098 */ 1099 public void writeSingleLevelIndex(DataOutput out, String description) throws IOException { 1100 expectNumLevels(1); 1101 1102 if (!singleLevelOnly) throw new IOException("Single-level mode is turned off"); 1103 1104 if (rootChunk.getNumEntries() > 0) 1105 throw new IOException("Root-level entries already added in " + "single-level mode"); 1106 1107 rootChunk = curInlineChunk; 1108 curInlineChunk = new BlockIndexChunkImpl(); 1109 1110 if (LOG.isTraceEnabled()) { 1111 LOG.trace("Wrote a single-level " + description + " index with " + rootChunk.getNumEntries() 1112 + " entries, " + rootChunk.getRootSize() + " bytes"); 1113 } 1114 indexBlockEncoder.encode(rootChunk, true, out); 1115 } 1116 1117 /** 1118 * Split the current level of the block index into intermediate index blocks of permitted size 1119 * and write those blocks to disk. Return the next level of the block index referencing those 1120 * intermediate-level blocks. 
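* Invoked repeatedly from {@link #writeIndexBlocks(FSDataOutputStream)} while the current level is
* still larger than {@code maxChunkSize} and has more than {@code minIndexNumEntries} entries
* (subject to a sanity cap on the number of levels), each call replacing that level with a smaller
* parent level.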
1121 * @param currentLevel the current level of the block index, such as the a chunk referencing all 1122 * leaf-level index blocks 1123 * @return the parent level block index, which becomes the root index after a few (usually zero) 1124 * iterations 1125 */ 1126 private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out, 1127 BlockIndexChunk currentLevel) throws IOException { 1128 // Entries referencing intermediate-level blocks we are about to create. 1129 BlockIndexChunk parent = new BlockIndexChunkImpl(); 1130 1131 // The current intermediate-level block index chunk. 1132 BlockIndexChunk curChunk = new BlockIndexChunkImpl(); 1133 1134 for (int i = 0; i < currentLevel.getNumEntries(); ++i) { 1135 curChunk.add(currentLevel.getBlockKey(i), currentLevel.getBlockOffset(i), 1136 currentLevel.getOnDiskDataSize(i)); 1137 1138 // HBASE-16288: We have to have at least minIndexNumEntries(16) items in the index so that 1139 // we won't end up with too-many levels for a index with very large rowKeys. Also, if the 1140 // first key is larger than maxChunkSize this will cause infinite recursion. 1141 if (i >= minIndexNumEntries && curChunk.getRootSize() >= maxChunkSize) { 1142 writeIntermediateBlock(out, parent, curChunk); 1143 } 1144 } 1145 1146 if (curChunk.getNumEntries() > 0) { 1147 writeIntermediateBlock(out, parent, curChunk); 1148 } 1149 1150 return parent; 1151 } 1152 1153 private void writeIntermediateBlock(FSDataOutputStream out, BlockIndexChunk parent, 1154 BlockIndexChunk curChunk) throws IOException { 1155 long beginOffset = out.getPos(); 1156 DataOutputStream dos = blockWriter.startWriting(BlockType.INTERMEDIATE_INDEX); 1157 indexBlockEncoder.encode(curChunk, false, dos); 1158 byte[] curFirstKey = curChunk.getBlockKey(0); 1159 blockWriter.writeHeaderAndData(out); 1160 1161 if (getCacheOnWrite()) { 1162 cacheConf.getBlockCache().ifPresent(cache -> { 1163 HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf); 1164 cache.cacheBlock( 1165 new BlockCacheKey(nameForCaching, beginOffset, true, blockForCaching.getBlockType()), 1166 blockForCaching); 1167 }); 1168 } 1169 1170 // Add intermediate index block size 1171 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader(); 1172 totalBlockUncompressedSize += blockWriter.getUncompressedSizeWithoutHeader(); 1173 1174 // OFFSET is the beginning offset the chunk of block index entries. 1175 // SIZE is the total byte size of the chunk of block index entries 1176 // + the secondary index size 1177 // FIRST_KEY is the first key in the chunk of block index 1178 // entries. 1179 parent.add(curFirstKey, beginOffset, blockWriter.getOnDiskSizeWithHeader()); 1180 1181 // clear current block index chunk 1182 curChunk.clear(); 1183 curFirstKey = null; 1184 } 1185 1186 /** Returns how many block index entries there are in the root level */ 1187 public final int getNumRootEntries() { 1188 return rootChunk.getNumEntries(); 1189 } 1190 1191 /** Returns the number of levels in this block index. */ 1192 public int getNumLevels() { 1193 return numLevels; 1194 } 1195 1196 private void expectNumLevels(int expectedNumLevels) { 1197 if (numLevels != expectedNumLevels) { 1198 throw new IllegalStateException("Number of block index levels is " + numLevels 1199 + "but is expected to be " + expectedNumLevels); 1200 } 1201 } 1202 1203 /** 1204 * Whether there is an inline block ready to be written. 
In general, we write an leaf-level 1205 * index block as an inline block as soon as its size as serialized in the non-root format 1206 * reaches a certain threshold. 1207 */ 1208 @Override 1209 public boolean shouldWriteBlock(boolean closing) { 1210 if (singleLevelOnly) { 1211 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); 1212 } 1213 1214 if (curInlineChunk == null) { 1215 throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been " 1216 + "called with closing=true and then called again?"); 1217 } 1218 1219 if (curInlineChunk.getNumEntries() == 0) { 1220 return false; 1221 } 1222 1223 // We do have some entries in the current inline chunk. 1224 if (closing) { 1225 if (rootChunk.getNumEntries() == 0) { 1226 // We did not add any leaf-level blocks yet. Instead of creating a 1227 // leaf level with one block, move these entries to the root level. 1228 1229 expectNumLevels(1); 1230 rootChunk = curInlineChunk; 1231 curInlineChunk = null; // Disallow adding any more index entries. 1232 return false; 1233 } 1234 1235 return true; 1236 } else { 1237 return curInlineChunk.getNonRootSize() >= maxChunkSize; 1238 } 1239 } 1240 1241 /** 1242 * Write out the current inline index block. Inline blocks are non-root blocks, so the non-root 1243 * index format is used. 1244 */ 1245 @Override 1246 public void writeInlineBlock(DataOutput out) throws IOException { 1247 if (singleLevelOnly) throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); 1248 1249 // Write the inline block index to the output stream in the non-root 1250 // index block format. 1251 indexBlockEncoder.encode(curInlineChunk, false, out); 1252 1253 // Save the first key of the inline block so that we can add it to the 1254 // parent-level index. 1255 firstKey = curInlineChunk.getBlockKey(0); 1256 1257 // Start a new inline index block 1258 curInlineChunk.clear(); 1259 } 1260 1261 /** 1262 * Called after an inline block has been written so that we can add an entry referring to that 1263 * block to the parent-level index. 1264 */ 1265 @Override 1266 public void blockWritten(long offset, int onDiskSize, int uncompressedSize) { 1267 // Add leaf index block size 1268 totalBlockOnDiskSize += onDiskSize; 1269 totalBlockUncompressedSize += uncompressedSize; 1270 1271 if (singleLevelOnly) throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); 1272 1273 if (firstKey == null) { 1274 throw new IllegalStateException( 1275 "Trying to add second-level index " + "entry with offset=" + offset + " and onDiskSize=" 1276 + onDiskSize + "but the first key was not set in writeInlineBlock"); 1277 } 1278 1279 if (rootChunk.getNumEntries() == 0) { 1280 // We are writing the first leaf block, so increase index level. 1281 expectNumLevels(1); 1282 numLevels = 2; 1283 } 1284 1285 // Add another entry to the second-level index. Include the number of 1286 // entries in all previous leaf-level chunks for mid-key calculation. 1287 rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries); 1288 firstKey = null; 1289 } 1290 1291 @Override 1292 public BlockType getInlineBlockType() { 1293 return BlockType.LEAF_INDEX; 1294 } 1295 1296 /** 1297 * Add one index entry to the current leaf-level block. When the leaf-level block gets large 1298 * enough, it will be flushed to disk as an inline block. 
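* <p>
* A simplified sketch of how an enclosing writer typically drives this class (the real call sites
* are {@link HFileWriterImpl} and CompoundBloomFilterWriter; the local variable names below are
* illustrative only, not part of this API):
*
* <pre>{@code
* // after each data block is finished:
* indexWriter.addEntry(firstKeyOfBlock, blockOffset, blockOnDiskSize);
*
* // periodically, through the InlineBlockWriter contract:
* if (indexWriter.shouldWriteBlock(false)) {
*   DataOutputStream out = hfileBlockWriter.startWriting(indexWriter.getInlineBlockType());
*   indexWriter.writeInlineBlock(out);
*   hfileBlockWriter.writeHeaderAndData(fsOut);
*   indexWriter.blockWritten(leafBlockOffset, onDiskSize, uncompressedSize);
* }
*
* // when the file is closed, write the intermediate levels (if any) and the root index:
* long rootIndexOffset = indexWriter.writeIndexBlocks(fsOut);
* }</pre>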
1299 * @param firstKey the first key of the data block 1300 * @param blockOffset the offset of the data block 1301 * @param blockDataSize the on-disk size of the data block ({@link HFile} format version 2), or 1302 * the uncompressed size of the data block ( {@link HFile} format version 1303 * 1). 1304 */ 1305 public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) { 1306 curInlineChunk.add(firstKey, blockOffset, blockDataSize); 1307 ++totalNumEntries; 1308 } 1309 1310 /** 1311 * @throws IOException if we happened to write a multi-level index. 1312 */ 1313 public void ensureSingleLevel() throws IOException { 1314 if (numLevels > 1) { 1315 throw new IOException( 1316 "Wrote a " + numLevels + "-level index with " + rootChunk.getNumEntries() 1317 + " root-level entries, but " + "this is expected to be a single-level block index."); 1318 } 1319 } 1320 1321 /** 1322 * @return true if we are using cache-on-write. This is configured by the caller of the 1323 * constructor by either passing a valid block cache or null. 1324 */ 1325 @Override 1326 public boolean getCacheOnWrite() { 1327 return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite(); 1328 } 1329 1330 /** 1331 * The total uncompressed size of the root index block, intermediate-level index blocks, and 1332 * leaf-level index blocks. 1333 * @return the total uncompressed size of all index blocks 1334 */ 1335 public long getTotalUncompressedSize() { 1336 return totalBlockUncompressedSize; 1337 } 1338 1339 } 1340 1341 /** 1342 * A single chunk of the block index in the process of writing. The data in this chunk can become 1343 * a leaf-level, intermediate-level, or root index block. 1344 */ 1345 static class BlockIndexChunkImpl implements BlockIndexChunk { 1346 1347 /** First keys of the key range corresponding to each index entry. */ 1348 private final List<byte[]> blockKeys = new ArrayList<>(); 1349 1350 /** Block offset in backing stream. */ 1351 private final List<Long> blockOffsets = new ArrayList<>(); 1352 1353 /** On-disk data sizes of lower-level data or index blocks. */ 1354 private final List<Integer> onDiskDataSizes = new ArrayList<>(); 1355 1356 /** 1357 * The cumulative number of sub-entries, i.e. entries on deeper-level block index entries. 1358 * numSubEntriesAt[i] is the number of sub-entries in the blocks corresponding to this chunk's 1359 * entries #0 through #i inclusively. 1360 */ 1361 private final List<Long> numSubEntriesAt = new ArrayList<>(); 1362 1363 /** 1364 * The offset of the next entry to be added, relative to the end of the "secondary index" in the 1365 * "non-root" format representation of this index chunk. This is the next value to be added to 1366 * the secondary index. 1367 */ 1368 private int curTotalNonRootEntrySize = 0; 1369 1370 /** 1371 * The accumulated size of this chunk if stored in the root index format. 1372 */ 1373 private int curTotalRootSize = 0; 1374 1375 /** 1376 * The "secondary index" used for binary search over variable-length records in a "non-root" 1377 * format block. These offsets are relative to the end of this secondary index. 1378 */ 1379 private final List<Integer> secondaryIndexOffsetMarks = new ArrayList<>(); 1380 1381 /** 1382 * Adds a new entry to this block index chunk. 
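* Besides recording the entry, this updates the running serialized sizes behind
* {@link #getNonRootSize()} and {@link #getRootSize()}, and, when a cumulative sub-entry count is
* supplied, the per-entry counts that {@link #getMidKeyMetadata()} relies on.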
1383 * @param firstKey the first key in the block pointed to by this entry 1384 * @param blockOffset the offset of the next-level block pointed to by this entry 1385 * @param onDiskDataSize the on-disk data of the block pointed to by this entry, 1386 * including header size 1387 * @param curTotalNumSubEntries if this chunk is the root index chunk under construction, this 1388 * specifies the current total number of sub-entries in all 1389 * leaf-level chunks, including the one corresponding to the 1390 * second-level entry being added. 1391 */ 1392 @Override 1393 public void add(byte[] firstKey, long blockOffset, int onDiskDataSize, 1394 long curTotalNumSubEntries) { 1395 // Record the offset for the secondary index 1396 secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize); 1397 curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD + firstKey.length; 1398 1399 curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT 1400 + WritableUtils.getVIntSize(firstKey.length) + firstKey.length; 1401 1402 blockKeys.add(firstKey); 1403 blockOffsets.add(blockOffset); 1404 onDiskDataSizes.add(onDiskDataSize); 1405 1406 if (curTotalNumSubEntries != -1) { 1407 numSubEntriesAt.add(curTotalNumSubEntries); 1408 1409 // Make sure the parallel arrays are in sync. 1410 if (numSubEntriesAt.size() != blockKeys.size()) { 1411 throw new IllegalStateException("Only have key/value count " + "stats for " 1412 + numSubEntriesAt.size() + " block index " + "entries out of " + blockKeys.size()); 1413 } 1414 } 1415 } 1416 1417 /** 1418 * The same as {@link #add(byte[], long, int, long)} but does not take the key/value into 1419 * account. Used for single-level indexes. 1420 * @see #add(byte[], long, int, long) 1421 */ 1422 @Override 1423 public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) { 1424 add(firstKey, blockOffset, onDiskDataSize, -1); 1425 } 1426 1427 @Override 1428 public void clear() { 1429 blockKeys.clear(); 1430 blockOffsets.clear(); 1431 onDiskDataSizes.clear(); 1432 secondaryIndexOffsetMarks.clear(); 1433 numSubEntriesAt.clear(); 1434 curTotalNonRootEntrySize = 0; 1435 curTotalRootSize = 0; 1436 } 1437 1438 /** 1439 * Finds the entry corresponding to the deeper-level index block containing the given 1440 * deeper-level entry (a "sub-entry"), assuming a global 0-based ordering of sub-entries. 1441 * <p> 1442 * <i> Implementation note. </i> We are looking for i such that numSubEntriesAt[i - 1] <= k < 1443 * numSubEntriesAt[i], because a deeper-level block #i (0-based) contains sub-entries # 1444 * numSubEntriesAt[i - 1]'th through numSubEntriesAt[i] - 1, assuming a global 0-based ordering 1445 * of sub-entries. i is by definition the insertion point of k in numSubEntriesAt. 1446 * @param k sub-entry index, from 0 to the total number sub-entries - 1 1447 * @return the 0-based index of the entry corresponding to the given sub-entry 1448 */ 1449 @Override 1450 public int getEntryBySubEntry(long k) { 1451 // We define mid-key as the key corresponding to k'th sub-entry 1452 // (0-based). 1453 1454 int i = Collections.binarySearch(numSubEntriesAt, k); 1455 1456 // Exact match: cumulativeWeight[i] = k. This means chunks #0 through 1457 // #i contain exactly k sub-entries, and the sub-entry #k (0-based) 1458 // is in the (i + 1)'th chunk. 1459 if (i >= 0) return i + 1; 1460 1461 // Inexact match. Return the insertion point. 1462 return -i - 1; 1463 } 1464 1465 /** 1466 * Used when writing the root block index of a multi-level block index. 
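* For example (hypothetical counts): if {@code numSubEntriesAt} is {10, 25, 40}, there are 40
* sub-entries in total, the mid-key sub-entry is (40 - 1) / 2 = 19, {@link #getEntryBySubEntry(long)}
* maps it to entry 1, and the index recorded within that entry's leaf block is 19 - 10 = 9.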
Serializes additional 1467 * information allowing to efficiently identify the mid-key. 1468 * @return a few serialized fields for finding the mid-key 1469 * @throws IOException if could not create metadata for computing mid-key 1470 */ 1471 @Override 1472 public byte[] getMidKeyMetadata() throws IOException { 1473 ByteArrayOutputStream baos = new ByteArrayOutputStream(MID_KEY_METADATA_SIZE); 1474 DataOutputStream baosDos = new DataOutputStream(baos); 1475 long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1); 1476 if (totalNumSubEntries == 0) { 1477 throw new IOException("No leaf-level entries, mid-key unavailable"); 1478 } 1479 long midKeySubEntry = (totalNumSubEntries - 1) / 2; 1480 int midKeyEntry = getEntryBySubEntry(midKeySubEntry); 1481 1482 baosDos.writeLong(blockOffsets.get(midKeyEntry)); 1483 baosDos.writeInt(onDiskDataSizes.get(midKeyEntry)); 1484 1485 long numSubEntriesBefore = midKeyEntry > 0 ? numSubEntriesAt.get(midKeyEntry - 1) : 0; 1486 long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore; 1487 if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE) { 1488 throw new IOException("Could not identify mid-key index within the " 1489 + "leaf-level block containing mid-key: out of range (" + subEntryWithinEntry 1490 + ", numSubEntriesBefore=" + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry 1491 + ")"); 1492 } 1493 1494 baosDos.writeInt((int) subEntryWithinEntry); 1495 1496 if (baosDos.size() != MID_KEY_METADATA_SIZE) { 1497 throw new IOException("Could not write mid-key metadata: size=" + baosDos.size() 1498 + ", correct size: " + MID_KEY_METADATA_SIZE); 1499 } 1500 1501 // Close just to be good citizens, although this has no effect. 1502 baos.close(); 1503 1504 return baos.toByteArray(); 1505 } 1506 1507 /** Returns the size of this chunk if stored in the non-root index block format */ 1508 @Override 1509 public int getNonRootSize() { 1510 return Bytes.SIZEOF_INT // Number of entries 1511 + Bytes.SIZEOF_INT * (blockKeys.size() + 1) // Secondary index 1512 + curTotalNonRootEntrySize; // All entries 1513 } 1514 1515 @Override 1516 public int getCurTotalNonRootEntrySize() { 1517 return curTotalNonRootEntrySize; 1518 } 1519 1520 @Override 1521 public List<byte[]> getBlockKeys() { 1522 return blockKeys; 1523 } 1524 1525 @Override 1526 public List<Integer> getSecondaryIndexOffsetMarks() { 1527 return secondaryIndexOffsetMarks; 1528 } 1529 1530 /** Returns the size of this chunk if stored in the root index block format */ 1531 @Override 1532 public int getRootSize() { 1533 return curTotalRootSize; 1534 } 1535 1536 /** Returns the number of entries in this block index chunk */ 1537 public int getNumEntries() { 1538 return blockKeys.size(); 1539 } 1540 1541 public byte[] getBlockKey(int i) { 1542 return blockKeys.get(i); 1543 } 1544 1545 public long getBlockOffset(int i) { 1546 return blockOffsets.get(i); 1547 } 1548 1549 public int getOnDiskDataSize(int i) { 1550 return onDiskDataSizes.get(i); 1551 } 1552 1553 public long getCumulativeNumKV(int i) { 1554 if (i < 0) return 0; 1555 return numSubEntriesAt.get(i); 1556 } 1557 1558 } 1559 1560 public static int getMaxChunkSize(Configuration conf) { 1561 return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE); 1562 } 1563 1564 public static int getMinIndexNumEntries(Configuration conf) { 1565 return conf.getInt(MIN_INDEX_NUM_ENTRIES_KEY, DEFAULT_MIN_INDEX_NUM_ENTRIES); 1566 } 1567}
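// A minimal configuration sketch (added for illustration; the surrounding variables are
// hypothetical). The HFile writer picks these settings up through getMaxChunkSize(conf) and
// getMinIndexNumEntries(conf) above and applies them to its data block index writer:
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setInt(HFileBlockIndex.MAX_CHUNK_SIZE_KEY, 64 * 1024);   // default 128 KB
//   conf.setInt(HFileBlockIndex.MIN_INDEX_NUM_ENTRIES_KEY, 16);   // default 16
//   dataBlockIndexWriter.setMaxChunkSize(HFileBlockIndex.getMaxChunkSize(conf));
//   dataBlockIndexWriter.setMinIndexNumEntries(HFileBlockIndex.getMinIndexNumEntries(conf));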