/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KVComparator;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.util.ByteBufferUtils;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.StringUtils;

/**
 * Provides functionality to write ({@link BlockIndexWriter}) and read
 * ({@link BlockIndexReader}) single-level and multi-level block indexes.
 *
 * Examples of how to use the block index writer can be found in
 * {@link org.apache.hadoop.hbase.util.CompoundBloomFilterWriter} and
 * {@link HFileWriterV2}. Examples of how to use the reader can be
 * found in {@link HFileReaderV2} and
 * {@link org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex}.
 */
@InterfaceAudience.Private
public class HFileBlockIndex {

  private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class);

  static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;

  /**
   * The maximum size guideline for index blocks (leaf, intermediate, and
   * root). If not specified, <code>DEFAULT_MAX_CHUNK_SIZE</code> is used.
   */
  public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";

  /**
   * The number of bytes stored in each "secondary index" entry in addition to
   * key bytes in the non-root index block format. The first long is the file
   * offset of the deeper-level block the entry points to, and the int that
   * follows is that block's on-disk size without including the header.
   */
  static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT
      + Bytes.SIZEOF_LONG;
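
  // For reference, a sketch of the non-root ("leaf" and "intermediate") index
  // block layout that the overhead above refers to, as serialized by
  // BlockIndexChunk#writeNonRoot and decoded by the reader below:
  //
  //   int                 numEntries
  //   int[numEntries + 1] secondary index: each entry's offset relative to the
  //                       end of this array, plus one final element equal to
  //                       the total byte size of all entries
  //   for each entry:
  //     long   file offset of the deeper-level block
  //     int    on-disk size of that block, header excluded
  //     byte[] key (not length-prefixed; lengths are derived by subtracting
  //            consecutive secondary index elements)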

  /**
   * Error message when trying to use the inline block API in single-level mode.
   */
  private static final String INLINE_BLOCKS_NOT_ALLOWED =
      "Inline blocks are not allowed in the single-level-only mode";

  /**
   * The size of a meta-data record used for finding the mid-key in a
   * multi-level index. Consists of the middle leaf-level index block offset
   * (long), its on-disk size without header included (int), and the mid-key
   * entry's zero-based index in that leaf index block.
   */
  private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG +
      2 * Bytes.SIZEOF_INT;

  /**
   * The reader will always hold the root level index in memory. Index
   * blocks at all other levels will be cached in the LRU cache in practice,
   * although this API does not enforce that.
   *
   * All non-root (leaf and intermediate) index blocks contain what we call a
   * "secondary index": an array of offsets to the entries within the block.
   * This allows us to do binary search for the entry corresponding to the
   * given key without having to deserialize the block.
   */
  public static class BlockIndexReader implements HeapSize {
    /** Needed to do lookups on blocks. */
    private final KVComparator comparator;

    // Root-level data.
    private byte[][] blockKeys;
    private long[] blockOffsets;
    private int[] blockDataSizes;
    private int rootCount = 0;

    // Mid-key metadata.
    private long midLeafBlockOffset = -1;
    private int midLeafBlockOnDiskSize = -1;
    private int midKeyEntry = -1;

    /** Pre-computed mid-key */
    private AtomicReference<byte[]> midKey = new AtomicReference<byte[]>();

    /**
     * The number of levels in the block index tree. One if there is only the
     * root level, two for root and leaf levels, etc.
     */
    private int searchTreeLevel;

    /** A way to read {@link HFile} blocks at a given offset */
    private CachingBlockReader cachingBlockReader;

    public BlockIndexReader(final KVComparator c, final int treeLevel,
        final CachingBlockReader cachingBlockReader) {
      this(c, treeLevel);
      this.cachingBlockReader = cachingBlockReader;
    }

    public BlockIndexReader(final KVComparator c, final int treeLevel)
    {
      comparator = c;
      searchTreeLevel = treeLevel;
    }

    /**
     * @return true if the block index is empty.
     */
    public boolean isEmpty() {
      return blockKeys.length == 0;
    }

    /**
     * Verifies that the block index is non-empty and throws an
     * {@link IllegalStateException} otherwise.
     */
    public void ensureNonEmpty() {
      if (blockKeys.length == 0) {
        throw new IllegalStateException("Block index is empty or not loaded");
      }
    }
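
    // A minimal sketch of how a reader is typically wired up and used
    // (hedged; see HFileReaderV2 for the real call sequence -- the trailer
    // accessor names below are assumptions for illustration):
    //
    //   BlockIndexReader dataBlockIndexReader = new BlockIndexReader(
    //       comparator, trailer.getNumDataIndexLevels(), cachingBlockReader);
    //   dataBlockIndexReader.readMultiLevelIndexRoot(
    //       rootIndexBlock, trailer.getDataIndexCount());
    //   ...
    //   HFileBlock b = dataBlockIndexReader.seekToDataBlock(
    //       key, null, cacheBlocks, pread, isCompaction, expectedEncoding);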

    /**
     * Return the data block which contains this key. This function will only
     * be called when the HFile version is larger than 1.
     *
     * @param key the key we are looking for
     * @param currentBlock the current block, to avoid re-reading the same block
     * @param cacheBlocks whether the blocks read should be added to the block cache
     * @param pread whether to use a positional read
     * @param isCompaction whether the read is part of a compaction
     * @param expectedDataBlockEncoding the data block encoding the caller is
     *          expecting the data block to be in, or null to not perform this
     *          check and return the block irrespective of the encoding
     * @return the data block containing the given key, or null if the key
     *         cannot be located
     * @throws IOException
     */
    public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks,
        boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
        throws IOException {
      BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock,
          cacheBlocks, pread, isCompaction, expectedDataBlockEncoding);
      if (blockWithScanInfo == null) {
        return null;
      } else {
        return blockWithScanInfo.getHFileBlock();
      }
    }

    /**
     * Return the BlockWithScanInfo, which contains the DataBlock with other
     * scan info such as nextIndexedKey. This function will only be called when
     * the HFile version is larger than 1.
     *
     * @param key the key we are looking for
     * @param currentBlock the current block, to avoid re-reading the same block
     * @param cacheBlocks whether the blocks read should be added to the block cache
     * @param pread whether to use a positional read
     * @param isCompaction whether the read is part of a compaction
     * @param expectedDataBlockEncoding the data block encoding the caller is
     *          expecting the data block to be in, or null to not perform this
     *          check and return the block irrespective of the encoding.
     * @return the BlockWithScanInfo, which contains the DataBlock with other
     *         scan info such as nextIndexedKey.
     * @throws IOException
     */
    public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
        boolean cacheBlocks,
        boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
        throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // The next indexed key.
      Cell nextIndexedKey = null;

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      if (rootLevelIndex < blockKeys.length - 1) {
        nextIndexedKey = new KeyValue.KeyOnlyKeyValue(blockKeys[rootLevelIndex + 1]);
      } else {
        nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY;
      }

      int lookupLevel = 1; // How many levels deep we are in our lookup.
      int index = -1;

      HFileBlock block;
      while (true) {

        if (currentBlock != null && currentBlock.getOffset() == currentOffset)
        {
          // Avoid reading the same block again, even with caching turned off.
          // This is crucial for compaction-type workloads, which might have
          // caching turned off. This is like a one-block cache inside the
          // scanner.
          block = currentBlock;
        } else {
          // Call HFile's caching block reader API. We always cache index
          // blocks, otherwise we might get terrible performance.
          boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
          BlockType expectedBlockType;
          if (lookupLevel < searchTreeLevel - 1) {
            expectedBlockType = BlockType.INTERMEDIATE_INDEX;
          } else if (lookupLevel == searchTreeLevel - 1) {
            expectedBlockType = BlockType.LEAF_INDEX;
          } else {
            // This also accounts for ENCODED_DATA.
            expectedBlockType = BlockType.DATA;
          }
          block = cachingBlockReader.readBlock(currentOffset,
              currentOnDiskSize, shouldCache, pread, isCompaction, true,
              expectedBlockType, expectedDataBlockEncoding);
        }

        if (block == null) {
          throw new IOException("Failed to read block at offset " +
              currentOffset + ", onDiskSize=" + currentOnDiskSize);
        }

        // Found a data block, break the loop and check our level in the tree.
        if (block.getBlockType().isData()) {
          break;
        }

        // Not a data block. This must be a leaf-level or intermediate-level
        // index block. We don't allow going deeper than searchTreeLevel.
        if (++lookupLevel > searchTreeLevel) {
          throw new IOException("Search Tree Level overflow: lookupLevel=" +
              lookupLevel + ", searchTreeLevel=" + searchTreeLevel);
        }

        // Locate the entry corresponding to the given key in the non-root
        // (leaf or intermediate-level) index block.
        ByteBuffer buffer = block.getBufferWithoutHeader();
        index = locateNonRootIndexEntry(buffer, key, comparator);
        if (index == -1) {
          // This has to be changed
          // For now change this to key value
          KeyValue kv = KeyValueUtil.ensureKeyValue(key);
          throw new IOException("The key "
              + Bytes.toStringBinary(kv.getKey(), kv.getKeyOffset(), kv.getKeyLength())
              + " is before the first key of the non-root index block "
              + block);
        }

        currentOffset = buffer.getLong();
        currentOnDiskSize = buffer.getInt();

        // Only update the next indexed key if there is a next indexed key in the current level.
        byte[] tmpNextIndexedKey = getNonRootIndexedKey(buffer, index + 1);
        if (tmpNextIndexedKey != null) {
          nextIndexedKey = new KeyValue.KeyOnlyKeyValue(tmpNextIndexedKey);
        }
      }

      if (lookupLevel != searchTreeLevel) {
        throw new IOException("Reached a data block at level " + lookupLevel +
            " but the number of levels is " + searchTreeLevel);
      }

      // Set the next indexed key for the current block.
      BlockWithScanInfo blockWithScanInfo = new BlockWithScanInfo(block, nextIndexedKey);
      return blockWithScanInfo;
    }
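
    // Worked example of the descent above, assuming searchTreeLevel = 3
    // (root + intermediate + leaf):
    //   lookupLevel = 1: 1 < searchTreeLevel - 1 = 2, so we expect an
    //                    INTERMEDIATE_INDEX block at the root entry's offset;
    //   lookupLevel = 2: 2 == searchTreeLevel - 1, so we expect a LEAF_INDEX
    //                    block;
    //   lookupLevel = 3: neither condition holds, so we expect a DATA (or
    //                    ENCODED_DATA) block, break out of the loop, and the
    //                    lookupLevel == searchTreeLevel sanity check passes.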

    /**
     * An approximation to the {@link HFile}'s mid-key. Operates on block
     * boundaries, and does not go inside blocks. In other words, returns the
     * first key of the middle block of the file.
     *
     * @return the first key of the middle block
     */
    public byte[] midkey() throws IOException {
      if (rootCount == 0)
        throw new IOException("HFile empty");

      byte[] targetMidKey = this.midKey.get();
      if (targetMidKey != null) {
        return targetMidKey;
      }

      if (midLeafBlockOffset >= 0) {
        if (cachingBlockReader == null) {
          throw new IOException("Have to read the middle leaf block but " +
              "no block reader available");
        }

        // Caching, using pread, assuming this is not a compaction.
        HFileBlock midLeafBlock = cachingBlockReader.readBlock(
            midLeafBlockOffset, midLeafBlockOnDiskSize, true, true, false, true,
            BlockType.LEAF_INDEX, null);

        ByteBuffer b = midLeafBlock.getBufferWithoutHeader();
        int numDataBlocks = b.getInt();
        int keyRelOffset = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 1));
        int keyLen = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 2)) -
            keyRelOffset;
        int keyOffset = Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset
            + SECONDARY_INDEX_ENTRY_OVERHEAD;
        targetMidKey = ByteBufferUtils.toBytes(b, keyOffset, keyLen);
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[rootCount / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public byte[] getRootBlockKey(int i) {
      return blockKeys[i];
    }

    /**
     * @param i from 0 to {@link #getRootBlockCount()} - 1
     */
    public long getRootBlockOffset(int i) {
      return blockOffsets[i];
    }

    /**
     * @param i zero-based index of a root-level block
     * @return the on-disk size of the root-level block for version 2, or the
     *         uncompressed size for version 1
     */
    public int getRootBlockDataSize(int i) {
      return blockDataSizes[i];
    }

    /**
     * @return the number of root-level blocks in this block index
     */
    public int getRootBlockCount() {
      return rootCount;
    }

    /**
     * Finds the root-level index block containing the given key.
     *
     * @param key Key to find
     * @return Offset of block containing <code>key</code> (between 0 and the
     *         number of blocks - 1) or -1 if this file does not contain the
     *         request.
     */
    public int rootBlockContainingKey(final byte[] key, int offset, int length) {
      int pos = Bytes.binarySearch(blockKeys, key, offset, length, comparator);
      // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }
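
    // Worked example of the pos -> j mapping used by both overloads of
    // rootBlockContainingKey: suppose blockKeys = { "c", "f", "k" }.
    //   key = "f": binarySearch returns pos = 1 (exact match), so block #1,
    //              which starts at "f", is returned directly.
    //   key = "g": "f" < "g" < "k", so binarySearch returns pos = -(2 + 1) = -3;
    //              i = -pos - 1 = 2, and we return j = i - 1 = 1, the block
    //              whose first key "f" satisfies blockKeys[1] <= "g" < blockKeys[2].
    //   key = "a": "a" < "c" gives pos = -1, i = 0, j = -1: the key precedes
    //              the first block, so the file cannot contain it.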

    /**
     * Finds the root-level index block containing the given key.
     *
     * @param key Key to find
     */
    public int rootBlockContainingKey(final Cell key) {
      int pos = Bytes.binarySearch(blockKeys, key, comparator);
      // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see
      // binarySearch's javadoc.

      if (pos >= 0) {
        // This means this is an exact match with an element of blockKeys.
        assert pos < blockKeys.length;
        return pos;
      }

      // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i],
      // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that
      // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if
      // key < blockKeys[0], meaning the file does not contain the given key.

      int i = -pos - 1;
      assert 0 <= i && i <= blockKeys.length;
      return i - 1;
    }

    /**
     * Adds a new entry to the root block index. Only used when reading.
     *
     * @param key Last key in the block
     * @param offset file offset where the block is stored
     * @param dataSize the uncompressed data size
     */
    private void add(final byte[] key, final long offset, final int dataSize) {
      blockOffsets[rootCount] = offset;
      blockKeys[rootCount] = key;
      blockDataSizes[rootCount] = dataSize;
      rootCount++;
    }

    /**
     * The indexed key at the ith position in the nonRootIndex. The position starts at 0.
     *
     * @param nonRootIndex the non-root index block buffer
     * @param i the ith position
     * @return The indexed key at the ith position in the nonRootIndex.
     */
    private byte[] getNonRootIndexedKey(ByteBuffer nonRootIndex, int i) {
      int numEntries = nonRootIndex.getInt(0);
      if (i < 0 || i >= numEntries) {
        return null;
      }

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
      // The target key's offset relative to the end of the secondary index.
      int targetKeyRelOffset = nonRootIndex.getInt(
          Bytes.SIZEOF_INT * (i + 1));

      // The offset of the target key in the blockIndex buffer
      int targetKeyOffset = entriesOffset     // Skip secondary index
          + targetKeyRelOffset                // Skip all entries until mid
          + SECONDARY_INDEX_ENTRY_OVERHEAD;   // Skip offset and on-disk-size

      // We subtract the two consecutive secondary index elements, which
      // gives us the size of the whole (offset, onDiskSize, key) tuple. We
      // then need to subtract the overhead of offset and onDiskSize.
      int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) -
          targetKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

      return ByteBufferUtils.toBytes(nonRootIndex, targetKeyOffset, targetKeyLength);
    }
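
    // Worked example of the secondary-index arithmetic above (illustrative
    // key lengths): a non-root block with numEntries = 2 whose keys are 5 and
    // 7 bytes long stores the relative offsets { 0, 17, 36 } after the entry
    // count, since each entry occupies SECONDARY_INDEX_ENTRY_OVERHEAD (12)
    // bytes plus its key. For i = 1:
    //   entriesOffset      = 4 * (2 + 2)             = 16
    //   targetKeyRelOffset = getInt(4 * 2)           = 17
    //   targetKeyOffset    = 16 + 17 + 12            = 45
    //   targetKeyLength    = getInt(4 * 3) - 17 - 12 = 7
    // which is exactly the second key's position and length.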

    /**
     * Performs a binary search over a non-root level index block. Utilizes the
     * secondary index, which records the offsets of (offset, onDiskSize,
     * firstKey) tuples of all entries.
     *
     * @param key the key we are searching for offsets to individual entries in
     *          the blockIndex buffer
     * @param nonRootIndex the non-root index block buffer, starting with the
     *          secondary index. The position is ignored.
     * @return the index i in [0, numEntries - 1] such that keys[i] <= key <
     *         keys[i + 1], if keys is the array of all keys being searched, or
     *         -1 otherwise
     */
    static int binarySearchNonRootIndex(Cell key, ByteBuffer nonRootIndex,
        KVComparator comparator) {

      int numEntries = nonRootIndex.getInt(0);
      int low = 0;
      int high = numEntries - 1;
      int mid = 0;

      // Entries start after the number of entries and the secondary index.
      // The secondary index takes numEntries + 1 ints.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

      // If we imagine that keys[-1] = -Infinity and
      // keys[numEntries] = Infinity, then we are maintaining an invariant that
      // keys[low - 1] < key < keys[high + 1] while narrowing down the range.
      KeyValue.KeyOnlyKeyValue nonRootIndexKV = new KeyValue.KeyOnlyKeyValue();
      while (low <= high) {
        mid = (low + high) >>> 1;

        // The mid-key's offset relative to the end of the secondary index.
        int midKeyRelOffset = nonRootIndex.getInt(
            Bytes.SIZEOF_INT * (mid + 1));

        // The offset of the middle key in the blockIndex buffer
        int midKeyOffset = entriesOffset       // Skip secondary index
            + midKeyRelOffset                  // Skip all entries until mid
            + SECONDARY_INDEX_ENTRY_OVERHEAD;  // Skip offset and on-disk-size

        // We subtract the two consecutive secondary index elements, which
        // gives us the size of the whole (offset, onDiskSize, key) tuple. We
        // then need to subtract the overhead of offset and onDiskSize.
        int midLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (mid + 2)) -
            midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

        // We have to compare in this order, because the comparator order
        // has special logic when the 'left side' is a special key.
        // TODO make KeyOnlyKeyValue to be Buffer backed and avoid array() call. This has to be
        // done after HBASE-12224 & HBASE-12282
        nonRootIndexKV.setKey(nonRootIndex.array(),
            nonRootIndex.arrayOffset() + midKeyOffset, midLength);
        int cmp = comparator.compareOnlyKeyPortion(key, nonRootIndexKV);

        // key lives above the midpoint
        if (cmp > 0)
          low = mid + 1; // Maintain the invariant that keys[low - 1] < key
        // key lives below the midpoint
        else if (cmp < 0)
          high = mid - 1; // Maintain the invariant that key < keys[high + 1]
        else
          return mid; // exact match
      }

      // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning
      // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break
      // condition, low >= high + 1. Therefore, low = high + 1.

      if (low != high + 1) {
        throw new IllegalStateException("Binary search broken: low=" + low
            + " instead of " + (high + 1));
      }

      // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to
      // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1.
      int i = low - 1;

      // Some extra validation on the result.
      if (i < -1 || i >= numEntries) {
        throw new IllegalStateException("Binary search broken: result is " +
            i + " but expected to be between -1 and (numEntries - 1) = " +
            (numEntries - 1));
      }

      return i;
    }

    /**
     * Search for one key using the secondary index in a non-root block. In case
     * of success, positions the provided buffer at the entry of interest, where
     * the file offset and the on-disk size can be read.
     *
     * @param nonRootBlock a non-root block without header. Initial position
     *          does not matter.
     * @param key the key to search for
     * @return the index position where the given key was found, otherwise
     *         return -1 in the case the given key is before the first key.
     */
    static int locateNonRootIndexEntry(ByteBuffer nonRootBlock, Cell key,
        KVComparator comparator) {
      int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator);

      if (entryIndex != -1) {
        int numEntries = nonRootBlock.getInt(0);

        // The end of the secondary index and the beginning of the entries themselves.
        int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

        // The offset of the entry we are interested in relative to the end of
        // the secondary index.
        int entryRelOffset = nonRootBlock.getInt(Bytes.SIZEOF_INT * (1 + entryIndex));

        nonRootBlock.position(entriesOffset + entryRelOffset);
      }

      return entryIndex;
    }

    /**
     * Read in the root-level index from the given input stream. Must match
     * what was written into the root level by
     * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
     * offset that function returned.
     *
     * @param in the buffered input stream or wrapped byte input stream
     * @param numEntries the number of root-level index entries
     * @throws IOException
     */
    public void readRootIndex(DataInput in, final int numEntries)
        throws IOException {
      blockOffsets = new long[numEntries];
      blockKeys = new byte[numEntries][];
      blockDataSizes = new int[numEntries];

      // If the index size is zero, no index was written.
      if (numEntries > 0) {
        for (int i = 0; i < numEntries; ++i) {
          long offset = in.readLong();
          int dataSize = in.readInt();
          byte[] key = Bytes.readByteArray(in);
          add(key, offset, dataSize);
        }
      }
    }

    /**
     * Read in the root-level index from the given block. Must match what was
     * written into the root level by
     * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
     * offset that function returned.
     *
     * @param blk the HFile block
     * @param numEntries the number of root-level index entries
     * @return the buffered input stream or wrapped byte input stream
     * @throws IOException
     */
    public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
      DataInputStream in = blk.getByteStream();
      readRootIndex(in, numEntries);
      return in;
    }

    /**
     * Read the root-level metadata of a multi-level block index. Based on
     * {@link #readRootIndex(DataInput, int)}, but also reads metadata
     * necessary to compute the mid-key in a multi-level index.
     *
     * @param blk the HFile block
     * @param numEntries the number of root-level index entries
     * @throws IOException
     */
    public void readMultiLevelIndexRoot(HFileBlock blk,
        final int numEntries) throws IOException {
      DataInputStream in = readRootIndex(blk, numEntries);
      // After reading the root index, the checksum bytes have to
      // be subtracted to know if the mid-key metadata exists.
      int checkSumBytes = blk.totalChecksumBytes();
      if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) {
        // No mid-key metadata available.
        return;
      }
      midLeafBlockOffset = in.readLong();
      midLeafBlockOnDiskSize = in.readInt();
      midKeyEntry = in.readInt();
    }
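
    // Sketch of the root-level serialization read above, matching
    // BlockIndexChunk#writeRoot and BlockIndexChunk#getMidKeyMetadata:
    //
    //   for each of the numEntries root entries:
    //     long   blockOffset     (file offset of the referenced block)
    //     int    onDiskDataSize  (its on-disk size)
    //     vint-prefixed byte[]   (the entry's key, via Bytes.writeByteArray)
    //   optionally, for multi-level indexes (MID_KEY_METADATA_SIZE bytes):
    //     long   midLeafBlockOffset
    //     int    midLeafBlockOnDiskSize
    //     int    midKeyEntry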

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("size=" + rootCount).append("\n");
      for (int i = 0; i < rootCount; i++) {
        sb.append("key=").append(KeyValue.keyToString(blockKeys[i]))
            .append("\n  offset=").append(blockOffsets[i])
            .append(", dataSize=" + blockDataSizes[i]).append("\n");
      }
      return sb.toString();
    }

    @Override
    public long heapSize() {
      long heapSize = ClassSize.align(6 * ClassSize.REFERENCE +
          2 * Bytes.SIZEOF_INT + ClassSize.OBJECT);

      // Mid-key metadata.
      heapSize += MID_KEY_METADATA_SIZE;

      // Calculating the size of blockKeys
      if (blockKeys != null) {
        // Adding array + references overhead
        heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length
            * ClassSize.REFERENCE);

        // Adding bytes
        for (byte[] key : blockKeys) {
          heapSize += ClassSize.align(ClassSize.ARRAY + key.length);
        }
      }

      if (blockOffsets != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length
            * Bytes.SIZEOF_LONG);
      }

      if (blockDataSizes != null) {
        heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length
            * Bytes.SIZEOF_INT);
      }

      return ClassSize.align(heapSize);
    }

  }

  /**
   * Writes the block index into the output stream. Generates the tree from
   * bottom up. The leaf level is written to disk as a sequence of inline
   * blocks, if it is larger than a certain number of bytes. If the leaf level
   * is not large enough, we write all entries to the root level instead.
   *
   * After all leaf blocks have been written, we end up with an index
   * referencing the resulting leaf index blocks. If that index is larger than
   * the allowed root index size, the writer will break it up into
   * reasonable-size intermediate-level index block chunks, write those chunks
   * out, and create another index referencing those chunks. This will be
   * repeated until the remaining index is small enough to become the root
   * index. However, in most practical cases we will only have leaf-level
   * blocks and the root index, or just the root index.
   */
  public static class BlockIndexWriter implements InlineBlockWriter {
    /**
     * While the index is being written, this represents the current block
     * index referencing all leaf blocks, with one exception. If the file is
     * being closed and there are not enough blocks to complete even a single
     * leaf block, no leaf blocks get written and this contains the entire
     * block index. After all levels of the index have been written by
     * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final
     * root-level index.
     */
    private BlockIndexChunk rootChunk = new BlockIndexChunk();

    /**
     * Current leaf-level chunk. New entries referencing data blocks get added
     * to this chunk until it grows large enough to be written to disk.
     */
    private BlockIndexChunk curInlineChunk = new BlockIndexChunk();

    /**
     * The number of block index levels. This is one if there is only the root
     * level (even empty), two if there are a leaf level and a root level, and
     * higher if there are intermediate levels. This is only final after
     * {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The
     * initial value accounts for the root level, and will be increased to two
     * as soon as we find out there is a leaf-level in
     * {@link #blockWritten(long, int, int)}.
     */
    private int numLevels = 1;

    private HFileBlock.Writer blockWriter;
    private byte[] firstKey = null;

    /**
     * The total number of leaf-level entries, i.e. entries referenced by
     * leaf-level blocks. For the data block index this is equal to the number
     * of data blocks.
     */
    private long totalNumEntries;

    /** Total compressed size of all index blocks. */
    private long totalBlockOnDiskSize;

    /** Total uncompressed size of all index blocks. */
    private long totalBlockUncompressedSize;

    /** The maximum size guideline of all multi-level index blocks. */
    private int maxChunkSize;

    /** Whether we require this block index to always be single-level. */
    private boolean singleLevelOnly;

    /** CacheConfig, or null if cache-on-write is disabled */
    private CacheConfig cacheConf;

    /** Name to use for computing cache keys */
    private String nameForCaching;

    /** Creates a single-level block index writer */
    public BlockIndexWriter() {
      this(null, null, null);
      singleLevelOnly = true;
    }

    /**
     * Creates a multi-level block index writer.
     *
     * @param blockWriter the block writer to use to write index blocks
     * @param cacheConf used to determine when and how a block should be cached-on-write.
     */
    public BlockIndexWriter(HFileBlock.Writer blockWriter,
        CacheConfig cacheConf, String nameForCaching) {
      if ((cacheConf == null) != (nameForCaching == null)) {
        throw new IllegalArgumentException("Block cache and file name for " +
            "caching must be both specified or both null");
      }

      this.blockWriter = blockWriter;
      this.cacheConf = cacheConf;
      this.nameForCaching = nameForCaching;
      this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
    }

    public void setMaxChunkSize(int maxChunkSize) {
      if (maxChunkSize <= 0) {
        throw new IllegalArgumentException("Invalid maximum index block size");
      }
      this.maxChunkSize = maxChunkSize;
    }
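
    // A minimal usage sketch (hedged; see HFileWriterV2 for the real call
    // sequence -- the variable names below are illustrative). The writer is
    // driven through the InlineBlockWriter API as data blocks are flushed:
    //
    //   BlockIndexWriter idx = new BlockIndexWriter(blockWriter, cacheConf, name);
    //   idx.addEntry(firstKeyOfDataBlock, blockOffset, onDiskSize); // per data block
    //   ...
    //   if (idx.shouldWriteBlock(false)) {   // leaf chunk reached maxChunkSize
    //     idx.writeInlineBlock(out);         // serialize it in non-root format
    //     idx.blockWritten(offset, onDiskSize, uncompressedSize);
    //   }
    //   ...
    //   long rootOffset = idx.writeIndexBlocks(fsOut); // at close: intermediate + root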

    /**
     * Writes the root level and intermediate levels of the block index into
     * the output stream, generating the tree from bottom up. Assumes that the
     * leaf level has been inline-written to the disk if there is enough data
     * for more than one leaf block. We iterate by breaking the current level
     * of the block index, starting with the index of all leaf-level blocks,
     * into chunks small enough to be written to disk, and generate its parent
     * level, until we end up with a level small enough to become the root
     * level.
     *
     * If the leaf level is not large enough, there is no inline block index
     * anymore, so we only write that level of block index to disk as the root
     * level.
     *
     * @param out FSDataOutputStream
     * @return position at which we entered the root-level index.
     * @throws IOException
     */
    public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
      if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) {
        throw new IOException("Trying to write a multi-level block index, " +
            "but there are " + curInlineChunk.getNumEntries() + " entries in the " +
            "last inline chunk.");
      }

      // We need to get the mid-key metadata before we create intermediate
      // indexes and overwrite the root chunk.
      byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata()
          : null;

      if (curInlineChunk != null) {
        while (rootChunk.getRootSize() > maxChunkSize) {
          rootChunk = writeIntermediateLevel(out, rootChunk);
          numLevels += 1;
        }
      }

      // Write the root level.
      long rootLevelIndexPos = out.getPos();

      {
        DataOutput blockStream =
            blockWriter.startWriting(BlockType.ROOT_INDEX);
        rootChunk.writeRoot(blockStream);
        if (midKeyMetadata != null)
          blockStream.write(midKeyMetadata);
        blockWriter.writeHeaderAndData(out);
      }

      // Add the root index block size.
      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
      totalBlockUncompressedSize +=
          blockWriter.getUncompressedSizeWithoutHeader();

      if (LOG.isTraceEnabled()) {
        LOG.trace("Wrote a " + numLevels + "-level index with root level at pos "
            + rootLevelIndexPos + ", " + rootChunk.getNumEntries()
            + " root-level entries, " + totalNumEntries + " total entries, "
            + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) +
            " on-disk size, "
            + StringUtils.humanReadableInt(totalBlockUncompressedSize) +
            " total uncompressed size.");
      }
      return rootLevelIndexPos;
    }

    /**
     * Writes the block index data as a single level only. Does not do any
     * block framing.
     *
     * @param out the buffered output stream to write the index to. Typically a
     *          stream writing into an {@link HFile} block.
     * @param description a short description of the index being written. Used
     *          in a log message.
     * @throws IOException
     */
    public void writeSingleLevelIndex(DataOutput out, String description)
        throws IOException {
      expectNumLevels(1);

      if (!singleLevelOnly)
        throw new IOException("Single-level mode is turned off");

      if (rootChunk.getNumEntries() > 0)
        throw new IOException("Root-level entries already added in " +
            "single-level mode");

      rootChunk = curInlineChunk;
      curInlineChunk = new BlockIndexChunk();

      if (LOG.isTraceEnabled()) {
        LOG.trace("Wrote a single-level " + description + " index with "
            + rootChunk.getNumEntries() + " entries, " + rootChunk.getRootSize()
            + " bytes");
      }
      rootChunk.writeRoot(out);
    }

    /**
     * Split the current level of the block index into intermediate index
     * blocks of permitted size and write those blocks to disk. Return the next
     * level of the block index referencing those intermediate-level blocks.
     *
     * @param out the output stream to write the intermediate blocks to
     * @param currentLevel the current level of the block index, such as a
     *          chunk referencing all leaf-level index blocks
     * @return the parent-level block index, which becomes the root index after
     *         a few (usually zero) iterations
     * @throws IOException
     */
    private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
        BlockIndexChunk currentLevel) throws IOException {
      // Entries referencing intermediate-level blocks we are about to create.
      BlockIndexChunk parent = new BlockIndexChunk();

      // The current intermediate-level block index chunk.
      BlockIndexChunk curChunk = new BlockIndexChunk();

      for (int i = 0; i < currentLevel.getNumEntries(); ++i) {
        curChunk.add(currentLevel.getBlockKey(i),
            currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i));

        if (curChunk.getRootSize() >= maxChunkSize)
          writeIntermediateBlock(out, parent, curChunk);
      }

      if (curChunk.getNumEntries() > 0) {
        writeIntermediateBlock(out, parent, curChunk);
      }

      return parent;
    }

    private void writeIntermediateBlock(FSDataOutputStream out,
        BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException {
      long beginOffset = out.getPos();
      DataOutputStream dos = blockWriter.startWriting(
          BlockType.INTERMEDIATE_INDEX);
      curChunk.writeNonRoot(dos);
      byte[] curFirstKey = curChunk.getBlockKey(0);
      blockWriter.writeHeaderAndData(out);

      if (cacheConf != null) {
        HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
        cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching,
            beginOffset), blockForCaching);
      }

      // Add the intermediate index block size.
      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
      totalBlockUncompressedSize +=
          blockWriter.getUncompressedSizeWithoutHeader();

      // OFFSET is the beginning offset of the chunk of block index entries.
      // SIZE is the total byte size of the chunk of block index entries
      //      + the secondary index size.
      // FIRST_KEY is the first key in the chunk of block index entries.
      parent.add(curFirstKey, beginOffset,
          blockWriter.getOnDiskSizeWithHeader());

      // Clear the current block index chunk.
      curChunk.clear();
      curFirstKey = null;
    }
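
    // For a sense of scale (illustrative numbers, not derived from a real
    // file): with the default 128 KB maxChunkSize, a leaf-level index whose
    // root-format size is around 1 MB would be split by the loop in
    // writeIndexBlocks into roughly eight intermediate blocks; the resulting
    // parent chunk then fits under maxChunkSize, so the index settles at
    // three levels (root -> intermediate -> leaf) after a single iteration.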

    /**
     * @return how many block index entries there are in the root level
     */
    public final int getNumRootEntries() {
      return rootChunk.getNumEntries();
    }

    /**
     * @return the number of levels in this block index.
     */
    public int getNumLevels() {
      return numLevels;
    }

    private void expectNumLevels(int expectedNumLevels) {
      if (numLevels != expectedNumLevels) {
        throw new IllegalStateException("Number of block index levels is "
            + numLevels + " but is expected to be " + expectedNumLevels);
      }
    }

    /**
     * Whether there is an inline block ready to be written. In general, we
     * write a leaf-level index block as an inline block as soon as its size
     * as serialized in the non-root format reaches a certain threshold.
     */
    @Override
    public boolean shouldWriteBlock(boolean closing) {
      if (singleLevelOnly) {
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
      }

      if (curInlineChunk == null) {
        throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been " +
            "called with closing=true and then called again?");
      }

      if (curInlineChunk.getNumEntries() == 0) {
        return false;
      }

      // We do have some entries in the current inline chunk.
      if (closing) {
        if (rootChunk.getNumEntries() == 0) {
          // We did not add any leaf-level blocks yet. Instead of creating a
          // leaf level with one block, move these entries to the root level.

          expectNumLevels(1);
          rootChunk = curInlineChunk;
          curInlineChunk = null; // Disallow adding any more index entries.
          return false;
        }

        return true;
      } else {
        return curInlineChunk.getNonRootSize() >= maxChunkSize;
      }
    }

    /**
     * Write out the current inline index block. Inline blocks are non-root
     * blocks, so the non-root index format is used.
     *
     * @param out the output to write the inline block to
     */
    @Override
    public void writeInlineBlock(DataOutput out) throws IOException {
      if (singleLevelOnly)
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);

      // Write the inline block index to the output stream in the non-root
      // index block format.
      curInlineChunk.writeNonRoot(out);

      // Save the first key of the inline block so that we can add it to the
      // parent-level index.
      firstKey = curInlineChunk.getBlockKey(0);

      // Start a new inline index block.
      curInlineChunk.clear();
    }

    /**
     * Called after an inline block has been written so that we can add an
     * entry referring to that block to the parent-level index.
     */
    @Override
    public void blockWritten(long offset, int onDiskSize, int uncompressedSize) {
      // Add the leaf index block size.
      totalBlockOnDiskSize += onDiskSize;
      totalBlockUncompressedSize += uncompressedSize;

      if (singleLevelOnly)
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);

      if (firstKey == null) {
        throw new IllegalStateException("Trying to add second-level index " +
            "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
            " but the first key was not set in writeInlineBlock");
      }

      if (rootChunk.getNumEntries() == 0) {
        // We are writing the first leaf block, so increase the index level.
        expectNumLevels(1);
        numLevels = 2;
      }

      // Add another entry to the second-level index. Include the number of
      // entries in all previous leaf-level chunks for mid-key calculation.
      rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
      firstKey = null;
    }

    @Override
    public BlockType getInlineBlockType() {
      return BlockType.LEAF_INDEX;
    }

    /**
     * Add one index entry to the current leaf-level block. When the leaf-level
     * block gets large enough, it will be flushed to disk as an inline block.
     *
     * @param firstKey the first key of the data block
     * @param blockOffset the offset of the data block
     * @param blockDataSize the on-disk size of the data block ({@link HFile}
     *          format version 2), or the uncompressed size of the data block
     *          ({@link HFile} format version 1).
     */
    public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) {
      curInlineChunk.add(firstKey, blockOffset, blockDataSize);
      ++totalNumEntries;
    }

    /**
     * @throws IOException if we happened to write a multi-level index.
     */
    public void ensureSingleLevel() throws IOException {
      if (numLevels > 1) {
        throw new IOException("Wrote a " + numLevels + "-level index with " +
            rootChunk.getNumEntries() + " root-level entries, but " +
            "this is expected to be a single-level block index.");
      }
    }

    /**
     * @return true if we are using cache-on-write. This is configured by the
     *         caller of the constructor by either passing a valid block cache
     *         or null.
     */
    @Override
    public boolean getCacheOnWrite() {
      return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite();
    }

    /**
     * The total uncompressed size of the root index block, intermediate-level
     * index blocks, and leaf-level index blocks.
     *
     * @return the total uncompressed size of all index blocks
     */
    public long getTotalUncompressedSize() {
      return totalBlockUncompressedSize;
    }

  }

  /**
   * A single chunk of the block index in the process of writing. The data in
   * this chunk can become a leaf-level, intermediate-level, or root index
   * block.
   */
  static class BlockIndexChunk {

    /** First keys of the key range corresponding to each index entry. */
    private final List<byte[]> blockKeys = new ArrayList<byte[]>();

    /** Block offset in backing stream. */
    private final List<Long> blockOffsets = new ArrayList<Long>();

    /** On-disk data sizes of lower-level data or index blocks. */
    private final List<Integer> onDiskDataSizes = new ArrayList<Integer>();

    /**
     * The cumulative number of sub-entries, i.e. entries on deeper-level block
     * index entries. numSubEntriesAt[i] is the number of sub-entries in the
     * blocks corresponding to this chunk's entries #0 through #i inclusively.
     */
    private final List<Long> numSubEntriesAt = new ArrayList<Long>();

    /**
     * The offset of the next entry to be added, relative to the end of the
     * "secondary index" in the "non-root" format representation of this index
     * chunk. This is the next value to be added to the secondary index.
     */
    private int curTotalNonRootEntrySize = 0;

    /**
     * The accumulated size of this chunk if stored in the root index format.
     */
    private int curTotalRootSize = 0;

    /**
     * The "secondary index" used for binary search over variable-length
     * records in a "non-root" format block. These offsets are relative to the
     * end of this secondary index.
     */
    private final List<Integer> secondaryIndexOffsetMarks =
        new ArrayList<Integer>();
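
    // Worked example of the two size counters maintained below (illustrative
    // key lengths): after add()-ing two entries with 5- and 7-byte keys,
    //   curTotalNonRootEntrySize = (12 + 5) + (12 + 7) = 36
    //     (SECONDARY_INDEX_ENTRY_OVERHEAD = 12 bytes per entry plus the key),
    //   so getNonRootSize() = 4 + 4 * (2 + 1) + 36 = 52
    //     (entry count + secondary index + entries), while
    //   curTotalRootSize = (8 + 4 + 1 + 5) + (8 + 4 + 1 + 7) = 38
    //     (long offset + int size + 1-byte vint key length + key per entry).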

    /**
     * Adds a new entry to this block index chunk.
     *
     * @param firstKey the first key in the block pointed to by this entry
     * @param blockOffset the offset of the next-level block pointed to by this
     *          entry
     * @param onDiskDataSize the on-disk data size of the block pointed to by
     *          this entry, including header size
     * @param curTotalNumSubEntries if this chunk is the root index chunk under
     *          construction, this specifies the current total number of
     *          sub-entries in all leaf-level chunks, including the one
     *          corresponding to the second-level entry being added.
     */
    void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
        long curTotalNumSubEntries) {
      // Record the offset for the secondary index.
      secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
      curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
          + firstKey.length;

      curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
          + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;

      blockKeys.add(firstKey);
      blockOffsets.add(blockOffset);
      onDiskDataSizes.add(onDiskDataSize);

      if (curTotalNumSubEntries != -1) {
        numSubEntriesAt.add(curTotalNumSubEntries);

        // Make sure the parallel arrays are in sync.
        if (numSubEntriesAt.size() != blockKeys.size()) {
          throw new IllegalStateException("Only have key/value count " +
              "stats for " + numSubEntriesAt.size() + " block index " +
              "entries out of " + blockKeys.size());
        }
      }
    }

    /**
     * The same as {@link #add(byte[], long, int, long)} but does not take the
     * key/value count into account. Used for single-level indexes.
     *
     * @see #add(byte[], long, int, long)
     */
    public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
      add(firstKey, blockOffset, onDiskDataSize, -1);
    }

    public void clear() {
      blockKeys.clear();
      blockOffsets.clear();
      onDiskDataSizes.clear();
      secondaryIndexOffsetMarks.clear();
      numSubEntriesAt.clear();
      curTotalNonRootEntrySize = 0;
      curTotalRootSize = 0;
    }

    /**
     * Finds the entry corresponding to the deeper-level index block containing
     * the given deeper-level entry (a "sub-entry"), assuming a global 0-based
     * ordering of sub-entries.
     *
     * <p>
     * <i>Implementation note.</i> We are looking for i such that
     * numSubEntriesAt[i - 1] <= k < numSubEntriesAt[i], because a deeper-level
     * block #i (0-based) contains sub-entries # numSubEntriesAt[i - 1]
     * through numSubEntriesAt[i] - 1, assuming a global 0-based ordering of
     * sub-entries. i is by definition the insertion point of k in
     * numSubEntriesAt.
     *
     * @param k sub-entry index, from 0 to the total number of sub-entries - 1
     * @return the 0-based index of the entry corresponding to the given
     *         sub-entry
     */
    public int getEntryBySubEntry(long k) {
      // We define the mid-key as the key corresponding to the k'th sub-entry
      // (0-based).

      int i = Collections.binarySearch(numSubEntriesAt, k);

      // Exact match: cumulativeWeight[i] = k. This means chunks #0 through
      // #i contain exactly k sub-entries, and the sub-entry #k (0-based)
      // is in the (i + 1)'th chunk.
      if (i >= 0)
        return i + 1;

      // Inexact match. Return the insertion point.
      return -i - 1;
    }
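
    // Worked example: suppose numSubEntriesAt = { 3, 7, 10 }, i.e. leaf
    // blocks #0..#2 hold 3, 4, and 3 sub-entries respectively.
    //   k = 5: binarySearch returns -(1 + 1) = -2 (insertion point 1), so
    //          sub-entry #5 lives in leaf block #1 (which holds #3..#6).
    //   k = 7: exact match at index 1 (blocks #0..#1 hold exactly 7
    //          sub-entries), so sub-entry #7 is the first one of block #2.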

    /**
     * Used when writing the root block index of a multi-level block index.
     * Serializes additional information allowing us to efficiently identify
     * the mid-key.
     *
     * @return a few serialized fields for finding the mid-key
     * @throws IOException if we could not create metadata for computing the mid-key
     */
    public byte[] getMidKeyMetadata() throws IOException {
      ByteArrayOutputStream baos = new ByteArrayOutputStream(
          MID_KEY_METADATA_SIZE);
      DataOutputStream baosDos = new DataOutputStream(baos);
      long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1);
      if (totalNumSubEntries == 0) {
        throw new IOException("No leaf-level entries, mid-key unavailable");
      }
      long midKeySubEntry = (totalNumSubEntries - 1) / 2;
      int midKeyEntry = getEntryBySubEntry(midKeySubEntry);

      baosDos.writeLong(blockOffsets.get(midKeyEntry));
      baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));

      long numSubEntriesBefore = midKeyEntry > 0
          ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
      long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
      if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE)
      {
        throw new IOException("Could not identify mid-key index within the "
            + "leaf-level block containing mid-key: out of range ("
            + subEntryWithinEntry + ", numSubEntriesBefore="
            + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
            + ")");
      }

      baosDos.writeInt((int) subEntryWithinEntry);

      if (baosDos.size() != MID_KEY_METADATA_SIZE) {
        throw new IOException("Could not write mid-key metadata: size=" +
            baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE);
      }

      // Close just to be good citizens, although this has no effect.
      baos.close();

      return baos.toByteArray();
    }

    /**
     * Writes the block index chunk in the non-root index block format. This
     * format contains the number of entries, an index of integer offsets
     * for quick binary search on variable-length records, and tuples of
     * block offset, on-disk block size, and the first key for each entry.
     *
     * @param out the output to write the chunk to
     * @throws IOException
     */
    void writeNonRoot(DataOutput out) throws IOException {
      // The number of entries in the block.
      out.writeInt(blockKeys.size());

      if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
        throw new IOException("Corrupted block index chunk writer: " +
            blockKeys.size() + " entries but " +
            secondaryIndexOffsetMarks.size() + " secondary index items");
      }

      // For each entry, write a "secondary index" of relative offsets to the
      // entries from the end of the secondary index. This works, because at
      // read time we read the number of entries and know where the secondary
      // index ends.
      for (int currentSecondaryIndex : secondaryIndexOffsetMarks)
        out.writeInt(currentSecondaryIndex);

      // We include one other element in the secondary index to calculate the
      // size of each entry more easily by subtracting secondary index elements.
      out.writeInt(curTotalNonRootEntrySize);

      for (int i = 0; i < blockKeys.size(); ++i) {
        out.writeLong(blockOffsets.get(i));
        out.writeInt(onDiskDataSizes.get(i));
        out.write(blockKeys.get(i));
      }
    }

    /**
     * @return the size of this chunk if stored in the non-root index block
     *         format
     */
    int getNonRootSize() {
      return Bytes.SIZEOF_INT                          // Number of entries
          + Bytes.SIZEOF_INT * (blockKeys.size() + 1)  // Secondary index
          + curTotalNonRootEntrySize;                  // All entries
    }

    /**
     * Writes this chunk into the given output stream in the root block index
     * format. This format is similar to the {@link HFile} version 1 block
     * index format, except that we store the on-disk size of the block instead
     * of its uncompressed size.
     *
     * @param out the data output stream to write the block index to. Typically
     *          a stream writing into an {@link HFile} block.
     * @throws IOException
     */
    void writeRoot(DataOutput out) throws IOException {
      for (int i = 0; i < blockKeys.size(); ++i) {
        out.writeLong(blockOffsets.get(i));
        out.writeInt(onDiskDataSizes.get(i));
        Bytes.writeByteArray(out, blockKeys.get(i));
      }
    }

    /**
     * @return the size of this chunk if stored in the root index block format
     */
    int getRootSize() {
      return curTotalRootSize;
    }

    /**
     * @return the number of entries in this block index chunk
     */
    public int getNumEntries() {
      return blockKeys.size();
    }

    public byte[] getBlockKey(int i) {
      return blockKeys.get(i);
    }

    public long getBlockOffset(int i) {
      return blockOffsets.get(i);
    }

    public int getOnDiskDataSize(int i) {
      return onDiskDataSizes.get(i);
    }

    public long getCumulativeNumKV(int i) {
      if (i < 0)
        return 0;
      return numSubEntriesAt.get(i);
    }

  }

  public static int getMaxChunkSize(Configuration conf) {
    return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
  }
}