001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.regionserver; 019 020import static org.apache.hadoop.hbase.HConstants.REPLICATION_SCOPE_LOCAL; 021import static org.apache.hadoop.hbase.regionserver.HStoreFile.MAJOR_COMPACTION_KEY; 022import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.REGION_NAMES_KEY; 023import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.ROW_LOCK_READ_LOCK_KEY; 024import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent; 025 026import com.google.errorprone.annotations.RestrictedApi; 027import edu.umd.cs.findbugs.annotations.Nullable; 028import io.opentelemetry.api.trace.Span; 029import java.io.EOFException; 030import java.io.FileNotFoundException; 031import java.io.IOException; 032import java.io.InterruptedIOException; 033import java.lang.reflect.Constructor; 034import java.nio.ByteBuffer; 035import java.nio.charset.StandardCharsets; 036import java.text.ParseException; 037import java.util.ArrayList; 038import java.util.Arrays; 039import java.util.Collection; 040import java.util.Collections; 041import java.util.HashMap; 042import java.util.HashSet; 043import 
java.util.Iterator; 044import java.util.List; 045import java.util.Map; 046import java.util.Map.Entry; 047import java.util.NavigableMap; 048import java.util.NavigableSet; 049import java.util.Objects; 050import java.util.Optional; 051import java.util.RandomAccess; 052import java.util.Set; 053import java.util.TreeMap; 054import java.util.UUID; 055import java.util.concurrent.Callable; 056import java.util.concurrent.CompletionService; 057import java.util.concurrent.ConcurrentHashMap; 058import java.util.concurrent.ConcurrentMap; 059import java.util.concurrent.ConcurrentSkipListMap; 060import java.util.concurrent.ExecutionException; 061import java.util.concurrent.ExecutorCompletionService; 062import java.util.concurrent.ExecutorService; 063import java.util.concurrent.Executors; 064import java.util.concurrent.Future; 065import java.util.concurrent.FutureTask; 066import java.util.concurrent.ThreadFactory; 067import java.util.concurrent.ThreadPoolExecutor; 068import java.util.concurrent.TimeUnit; 069import java.util.concurrent.TimeoutException; 070import java.util.concurrent.atomic.AtomicBoolean; 071import java.util.concurrent.atomic.AtomicInteger; 072import java.util.concurrent.atomic.LongAdder; 073import java.util.concurrent.locks.Lock; 074import java.util.concurrent.locks.ReadWriteLock; 075import java.util.concurrent.locks.ReentrantReadWriteLock; 076import java.util.function.Function; 077import java.util.stream.Collectors; 078import java.util.stream.Stream; 079import org.apache.hadoop.conf.Configuration; 080import org.apache.hadoop.fs.FileStatus; 081import org.apache.hadoop.fs.FileSystem; 082import org.apache.hadoop.fs.LocatedFileStatus; 083import org.apache.hadoop.fs.Path; 084import org.apache.hadoop.hbase.Cell; 085import org.apache.hadoop.hbase.CellBuilderType; 086import org.apache.hadoop.hbase.CellComparator; 087import org.apache.hadoop.hbase.CellComparatorImpl; 088import org.apache.hadoop.hbase.CellScanner; 089import org.apache.hadoop.hbase.CellUtil; 090import 
org.apache.hadoop.hbase.CompareOperator; 091import org.apache.hadoop.hbase.CompoundConfiguration; 092import org.apache.hadoop.hbase.DoNotRetryIOException; 093import org.apache.hadoop.hbase.DroppedSnapshotException; 094import org.apache.hadoop.hbase.ExtendedCellBuilderFactory; 095import org.apache.hadoop.hbase.HConstants; 096import org.apache.hadoop.hbase.HConstants.OperationStatusCode; 097import org.apache.hadoop.hbase.HDFSBlocksDistribution; 098import org.apache.hadoop.hbase.KeyValue; 099import org.apache.hadoop.hbase.MetaCellComparator; 100import org.apache.hadoop.hbase.NamespaceDescriptor; 101import org.apache.hadoop.hbase.NotServingRegionException; 102import org.apache.hadoop.hbase.PrivateCellUtil; 103import org.apache.hadoop.hbase.RegionTooBusyException; 104import org.apache.hadoop.hbase.TableName; 105import org.apache.hadoop.hbase.Tag; 106import org.apache.hadoop.hbase.TagUtil; 107import org.apache.hadoop.hbase.client.Append; 108import org.apache.hadoop.hbase.client.CheckAndMutate; 109import org.apache.hadoop.hbase.client.CheckAndMutateResult; 110import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; 111import org.apache.hadoop.hbase.client.CompactionState; 112import org.apache.hadoop.hbase.client.Delete; 113import org.apache.hadoop.hbase.client.Durability; 114import org.apache.hadoop.hbase.client.Get; 115import org.apache.hadoop.hbase.client.Increment; 116import org.apache.hadoop.hbase.client.IsolationLevel; 117import org.apache.hadoop.hbase.client.Mutation; 118import org.apache.hadoop.hbase.client.Put; 119import org.apache.hadoop.hbase.client.RegionInfo; 120import org.apache.hadoop.hbase.client.RegionInfoBuilder; 121import org.apache.hadoop.hbase.client.RegionReplicaUtil; 122import org.apache.hadoop.hbase.client.Result; 123import org.apache.hadoop.hbase.client.Row; 124import org.apache.hadoop.hbase.client.RowMutations; 125import org.apache.hadoop.hbase.client.Scan; 126import org.apache.hadoop.hbase.client.TableDescriptor; 127import 
org.apache.hadoop.hbase.client.TableDescriptorBuilder; 128import org.apache.hadoop.hbase.conf.ConfigurationManager; 129import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver; 130import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; 131import org.apache.hadoop.hbase.coprocessor.ReadOnlyConfiguration; 132import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare; 133import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException; 134import org.apache.hadoop.hbase.exceptions.TimeoutIOException; 135import org.apache.hadoop.hbase.exceptions.UnknownProtocolException; 136import org.apache.hadoop.hbase.filter.BinaryComparator; 137import org.apache.hadoop.hbase.filter.ByteArrayComparable; 138import org.apache.hadoop.hbase.filter.Filter; 139import org.apache.hadoop.hbase.io.HFileLink; 140import org.apache.hadoop.hbase.io.HeapSize; 141import org.apache.hadoop.hbase.io.TimeRange; 142import org.apache.hadoop.hbase.io.hfile.BlockCache; 143import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache; 144import org.apache.hadoop.hbase.io.hfile.HFile; 145import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; 146import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils; 147import org.apache.hadoop.hbase.ipc.RpcCall; 148import org.apache.hadoop.hbase.ipc.RpcServer; 149import org.apache.hadoop.hbase.mob.MobFileCache; 150import org.apache.hadoop.hbase.monitoring.MonitoredTask; 151import org.apache.hadoop.hbase.monitoring.TaskMonitor; 152import org.apache.hadoop.hbase.quotas.RegionServerSpaceQuotaManager; 153import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl.WriteEntry; 154import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; 155import org.apache.hadoop.hbase.regionserver.compactions.CompactionLifeCycleTracker; 156import org.apache.hadoop.hbase.regionserver.metrics.MetricsTableRequests; 157import org.apache.hadoop.hbase.regionserver.throttle.CompactionThroughputControllerFactory; 158import 
org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController; 159import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector; 160import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController; 161import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException; 162import org.apache.hadoop.hbase.regionserver.wal.WALUtil; 163import org.apache.hadoop.hbase.replication.ReplicationUtils; 164import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver; 165import org.apache.hadoop.hbase.security.User; 166import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; 167import org.apache.hadoop.hbase.snapshot.SnapshotManifest; 168import org.apache.hadoop.hbase.trace.TraceUtil; 169import org.apache.hadoop.hbase.util.Bytes; 170import org.apache.hadoop.hbase.util.CancelableProgressable; 171import org.apache.hadoop.hbase.util.ClassSize; 172import org.apache.hadoop.hbase.util.CommonFSUtils; 173import org.apache.hadoop.hbase.util.CoprocessorConfigurationUtil; 174import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 175import org.apache.hadoop.hbase.util.FSUtils; 176import org.apache.hadoop.hbase.util.HashedBytes; 177import org.apache.hadoop.hbase.util.NonceKey; 178import org.apache.hadoop.hbase.util.Pair; 179import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; 180import org.apache.hadoop.hbase.util.TableDescriptorChecker; 181import org.apache.hadoop.hbase.util.Threads; 182import org.apache.hadoop.hbase.wal.WAL; 183import org.apache.hadoop.hbase.wal.WALEdit; 184import org.apache.hadoop.hbase.wal.WALFactory; 185import org.apache.hadoop.hbase.wal.WALKey; 186import org.apache.hadoop.hbase.wal.WALKeyImpl; 187import org.apache.hadoop.hbase.wal.WALSplitUtil; 188import org.apache.hadoop.hbase.wal.WALSplitUtil.MutationReplay; 189import org.apache.hadoop.hbase.wal.WALStreamReader; 190import org.apache.hadoop.util.StringUtils; 191import org.apache.yetus.audience.InterfaceAudience; 192import org.slf4j.Logger; 
193import org.slf4j.LoggerFactory; 194 195import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; 196import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; 197import org.apache.hbase.thirdparty.com.google.common.collect.Lists; 198import org.apache.hbase.thirdparty.com.google.common.collect.Maps; 199import org.apache.hbase.thirdparty.com.google.common.io.Closeables; 200import org.apache.hbase.thirdparty.com.google.protobuf.Service; 201import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat; 202import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; 203import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils; 204 205import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 206import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos; 207import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.CoprocessorServiceCall; 208import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionLoad; 209import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId; 210import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription; 211import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; 212import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor; 213import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor; 214import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.FlushAction; 215import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor; 216import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor; 217import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor.EventType; 218import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.StoreDescriptor; 219 220/** 221 * Regions 
store data for a certain region of a table. It stores all columns for each row. A given 222 * table consists of one or more Regions. 223 * <p> 224 * An Region is defined by its table and its key extent. 225 * <p> 226 * Locking at the Region level serves only one purpose: preventing the region from being closed (and 227 * consequently split) while other operations are ongoing. Each row level operation obtains both a 228 * row lock and a region read lock for the duration of the operation. While a scanner is being 229 * constructed, getScanner holds a read lock. If the scanner is successfully constructed, it holds a 230 * read lock until it is closed. A close takes out a write lock and consequently will block for 231 * ongoing operations and will block new operations from starting while the close is in progress. 232 */ 233@SuppressWarnings("deprecation") 234@InterfaceAudience.Private 235public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region { 236 private static final Logger LOG = LoggerFactory.getLogger(HRegion.class); 237 238 public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY = 239 "hbase.hregion.scan.loadColumnFamiliesOnDemand"; 240 241 public static final String HBASE_MAX_CELL_SIZE_KEY = "hbase.server.keyvalue.maxsize"; 242 public static final int DEFAULT_MAX_CELL_SIZE = 10485760; 243 244 public static final String HBASE_REGIONSERVER_MINIBATCH_SIZE = 245 "hbase.regionserver.minibatch.size"; 246 public static final int DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE = 20000; 247 248 public static final String WAL_HSYNC_CONF_KEY = "hbase.wal.hsync"; 249 public static final boolean DEFAULT_WAL_HSYNC = false; 250 251 /** Parameter name for compaction after bulkload */ 252 public static final String COMPACTION_AFTER_BULKLOAD_ENABLE = 253 "hbase.compaction.after.bulkload.enable"; 254 255 /** Config for allow split when file count greater than the configured blocking file count */ 256 public static final String SPLIT_IGNORE_BLOCKING_ENABLED_KEY 
= 257 "hbase.hregion.split.ignore.blocking.enabled"; 258 259 public static final String REGION_STORAGE_POLICY_KEY = "hbase.hregion.block.storage.policy"; 260 public static final String DEFAULT_REGION_STORAGE_POLICY = "NONE"; 261 262 /** 263 * This is for for using HRegion as a local storage, where we may put the recovered edits in a 264 * special place. Once this is set, we will only replay the recovered edits under this directory 265 * and ignore the original replay directory configs. 266 */ 267 public static final String SPECIAL_RECOVERED_EDITS_DIR = 268 "hbase.hregion.special.recovered.edits.dir"; 269 270 /** 271 * Mainly used for master local region, where we will replay the WAL file directly without 272 * splitting, so it is possible to have WAL files which are not closed cleanly, in this way, 273 * hitting EOF is expected so should not consider it as a critical problem. 274 */ 275 public static final String RECOVERED_EDITS_IGNORE_EOF = 276 "hbase.hregion.recovered.edits.ignore.eof"; 277 278 /** 279 * Whether to use {@link MetaCellComparator} even if we are not meta region. Used when creating 280 * master local region. 281 */ 282 public static final String USE_META_CELL_COMPARATOR = "hbase.region.use.meta.cell.comparator"; 283 284 public static final boolean DEFAULT_USE_META_CELL_COMPARATOR = false; 285 286 final AtomicBoolean closed = new AtomicBoolean(false); 287 288 /* 289 * Closing can take some time; use the closing flag if there is stuff we don't want to do while in 290 * closing state; e.g. like offer this region up to the master as a region to close if the 291 * carrying regionserver is overloaded. Once set, it is never cleared. 292 */ 293 final AtomicBoolean closing = new AtomicBoolean(false); 294 295 /** 296 * The max sequence id of flushed data on this region. There is no edit in memory that is less 297 * that this sequence id. 
298 */ 299 private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM; 300 301 /** 302 * Record the sequence id of last flush operation. Can be in advance of {@link #maxFlushedSeqId} 303 * when flushing a single column family. In this case, {@link #maxFlushedSeqId} will be older than 304 * the oldest edit in memory. 305 */ 306 private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM; 307 308 /** 309 * The sequence id of the last replayed open region event from the primary region. This is used to 310 * skip entries before this due to the possibility of replay edits coming out of order from 311 * replication. 312 */ 313 protected volatile long lastReplayedOpenRegionSeqId = -1L; 314 protected volatile long lastReplayedCompactionSeqId = -1L; 315 316 ////////////////////////////////////////////////////////////////////////////// 317 // Members 318 ////////////////////////////////////////////////////////////////////////////// 319 320 // map from a locked row to the context for that lock including: 321 // - CountDownLatch for threads waiting on that row 322 // - the thread that owns the lock (allow reentrancy) 323 // - reference count of (reentrant) locks held by the thread 324 // - the row itself 325 private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows = 326 new ConcurrentHashMap<>(); 327 328 protected final Map<byte[], HStore> stores = 329 new ConcurrentSkipListMap<>(Bytes.BYTES_RAWCOMPARATOR); 330 331 // TODO: account for each registered handler in HeapSize computation 332 private Map<String, com.google.protobuf.Service> coprocessorServiceHandlers = Maps.newHashMap(); 333 334 // Track data size in all memstores 335 private final MemStoreSizing memStoreSizing = new ThreadSafeMemStoreSizing(); 336 RegionServicesForStores regionServicesForStores; 337 338 // Debug possible data loss due to WAL off 339 final LongAdder numMutationsWithoutWAL = new LongAdder(); 340 final LongAdder dataInMemoryWithoutWAL = new LongAdder(); 341 342 // Debug why CAS 
operations are taking a while. 343 final LongAdder checkAndMutateChecksPassed = new LongAdder(); 344 final LongAdder checkAndMutateChecksFailed = new LongAdder(); 345 346 // Number of requests 347 // Count rows for scan 348 final LongAdder readRequestsCount = new LongAdder(); 349 final LongAdder filteredReadRequestsCount = new LongAdder(); 350 // Count rows for multi row mutations 351 final LongAdder writeRequestsCount = new LongAdder(); 352 353 // Number of requests blocked by memstore size. 354 private final LongAdder blockedRequestsCount = new LongAdder(); 355 356 // Compaction LongAdders 357 final LongAdder compactionsFinished = new LongAdder(); 358 final LongAdder compactionsFailed = new LongAdder(); 359 final LongAdder compactionNumFilesCompacted = new LongAdder(); 360 final LongAdder compactionNumBytesCompacted = new LongAdder(); 361 final LongAdder compactionsQueued = new LongAdder(); 362 final LongAdder flushesQueued = new LongAdder(); 363 364 private BlockCache blockCache; 365 private MobFileCache mobFileCache; 366 private final WAL wal; 367 private final HRegionFileSystem fs; 368 protected final Configuration conf; 369 private final Configuration baseConf; 370 private final int rowLockWaitDuration; 371 static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; 372 373 private Path regionWalDir; 374 private FileSystem walFS; 375 376 // set to true if the region is restored from snapshot 377 private boolean isRestoredRegion = false; 378 379 public void setRestoredRegion(boolean restoredRegion) { 380 isRestoredRegion = restoredRegion; 381 } 382 383 public MetricsTableRequests getMetricsTableRequests() { 384 return metricsTableRequests; 385 } 386 387 // Handle table latency metrics 388 private MetricsTableRequests metricsTableRequests; 389 390 // The internal wait duration to acquire a lock before read/update 391 // from the region. It is not per row. 
The purpose of this wait time 392 // is to avoid waiting a long time while the region is busy, so that 393 // we can release the IPC handler soon enough to improve the 394 // availability of the region server. It can be adjusted by 395 // tuning configuration "hbase.busy.wait.duration". 396 final long busyWaitDuration; 397 static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT; 398 399 // If updating multiple rows in one call, wait longer, 400 // i.e. waiting for busyWaitDuration * # of rows. However, 401 // we can limit the max multiplier. 402 final int maxBusyWaitMultiplier; 403 404 // Max busy wait duration. There is no point to wait longer than the RPC 405 // purge timeout, when a RPC call will be terminated by the RPC engine. 406 final long maxBusyWaitDuration; 407 408 // Max cell size. If nonzero, the maximum allowed size for any given cell 409 // in bytes 410 final long maxCellSize; 411 412 // Number of mutations for minibatch processing. 413 private final int miniBatchSize; 414 415 // negative number indicates infinite timeout 416 static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L; 417 final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool(); 418 419 final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints; 420 final ReadPointCalculationLock smallestReadPointCalcLock; 421 422 /** 423 * The sequence ID that was enLongAddered when this region was opened. 424 */ 425 private long openSeqNum = HConstants.NO_SEQNUM; 426 427 /** 428 * The default setting for whether to enable on-demand CF loading for scan requests to this 429 * region. Requests can override it. 430 */ 431 private boolean isLoadingCfsOnDemandDefault = false; 432 433 private final AtomicInteger majorInProgress = new AtomicInteger(0); 434 private final AtomicInteger minorInProgress = new AtomicInteger(0); 435 436 // 437 // Context: During replay we want to ensure that we do not lose any data. 
So, we 438 // have to be conservative in how we replay wals. For each store, we calculate 439 // the maxSeqId up to which the store was flushed. And, skip the edits which 440 // are equal to or lower than maxSeqId for each store. 441 // The following map is populated when opening the region 442 Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR); 443 444 /** Saved state from replaying prepare flush cache */ 445 private PrepareFlushResult prepareFlushResult = null; 446 447 private volatile ConfigurationManager configurationManager; 448 449 // Used for testing. 450 private volatile Long timeoutForWriteLock = null; 451 452 private final CellComparator cellComparator; 453 454 private final int minBlockSizeBytes; 455 456 /** 457 * @return The smallest mvcc readPoint across all the scanners in this region. Writes older than 458 * this readPoint, are included in every read operation. 459 */ 460 public long getSmallestReadPoint() { 461 // We need to ensure that while we are calculating the smallestReadPoint 462 // no new RegionScanners can grab a readPoint that we are unaware of. 463 smallestReadPointCalcLock.lock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); 464 try { 465 long minimumReadPoint = mvcc.getReadPoint(); 466 for (Long readPoint : this.scannerReadPoints.values()) { 467 minimumReadPoint = Math.min(minimumReadPoint, readPoint); 468 } 469 return minimumReadPoint; 470 } finally { 471 smallestReadPointCalcLock.unlock(ReadPointCalculationLock.LockType.CALCULATION_LOCK); 472 } 473 } 474 475 /* 476 * Data structure of write state flags used coordinating flushes, compactions and closes. 477 */ 478 static class WriteState { 479 // Set while a memstore flush is happening. 480 volatile boolean flushing = false; 481 // Set when a flush has been requested. 482 volatile boolean flushRequested = false; 483 // Number of compactions running. 484 AtomicInteger compacting = new AtomicInteger(0); 485 // Gets set in close. 
If set, cannot compact or flush again. 486 volatile boolean writesEnabled = true; 487 // Set if region is read-only 488 volatile boolean readOnly = false; 489 // whether the reads are enabled. This is different than readOnly, because readOnly is 490 // static in the lifetime of the region, while readsEnabled is dynamic 491 volatile boolean readsEnabled = true; 492 493 /** 494 * Set flags that make this region read-only. 495 * @param onOff flip value for region r/o setting 496 */ 497 synchronized void setReadOnly(final boolean onOff) { 498 this.writesEnabled = !onOff; 499 this.readOnly = onOff; 500 } 501 502 boolean isReadOnly() { 503 return this.readOnly; 504 } 505 506 boolean isFlushRequested() { 507 return this.flushRequested; 508 } 509 510 void setReadsEnabled(boolean readsEnabled) { 511 this.readsEnabled = readsEnabled; 512 } 513 514 static final long HEAP_SIZE = ClassSize.align(ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN); 515 } 516 517 /** 518 * Objects from this class are created when flushing to describe all the different states that 519 * that method ends up in. The Result enum describes those states. The sequence id should only be 520 * specified if the flush was successful, and the failure message should only be specified if it 521 * didn't flush. 522 */ 523 public static class FlushResultImpl implements FlushResult { 524 final Result result; 525 final String failureReason; 526 final long flushSequenceId; 527 final boolean wroteFlushWalMarker; 528 529 /** 530 * Convenience constructor to use when the flush is successful, the failure message is set to 531 * null. 532 * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED. 533 * @param flushSequenceId Generated sequence id that comes right after the edits in the 534 * memstores. 
535 */ 536 FlushResultImpl(Result result, long flushSequenceId) { 537 this(result, flushSequenceId, null, false); 538 assert result == Result.FLUSHED_NO_COMPACTION_NEEDED 539 || result == Result.FLUSHED_COMPACTION_NEEDED; 540 } 541 542 /** 543 * Convenience constructor to use when we cannot flush. 544 * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH. 545 * @param failureReason Reason why we couldn't flush. 546 */ 547 FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) { 548 this(result, -1, failureReason, wroteFlushMarker); 549 assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH; 550 } 551 552 /** 553 * Constructor with all the parameters. 554 * @param result Any of the Result. 555 * @param flushSequenceId Generated sequence id if the memstores were flushed else -1. 556 * @param failureReason Reason why we couldn't flush, or null. 557 */ 558 FlushResultImpl(Result result, long flushSequenceId, String failureReason, 559 boolean wroteFlushMarker) { 560 this.result = result; 561 this.flushSequenceId = flushSequenceId; 562 this.failureReason = failureReason; 563 this.wroteFlushWalMarker = wroteFlushMarker; 564 } 565 566 /** 567 * Convenience method, the equivalent of checking if result is FLUSHED_NO_COMPACTION_NEEDED or 568 * FLUSHED_NO_COMPACTION_NEEDED. 569 * @return true if the memstores were flushed, else false. 570 */ 571 @Override 572 public boolean isFlushSucceeded() { 573 return result == Result.FLUSHED_NO_COMPACTION_NEEDED 574 || result == Result.FLUSHED_COMPACTION_NEEDED; 575 } 576 577 /** 578 * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED. 579 * @return True if the flush requested a compaction, else false (doesn't even mean it flushed). 
580 */ 581 @Override 582 public boolean isCompactionNeeded() { 583 return result == Result.FLUSHED_COMPACTION_NEEDED; 584 } 585 586 @Override 587 public String toString() { 588 return new StringBuilder().append("flush result:").append(result).append(", ") 589 .append("failureReason:").append(failureReason).append(",").append("flush seq id") 590 .append(flushSequenceId).toString(); 591 } 592 593 @Override 594 public Result getResult() { 595 return result; 596 } 597 } 598 599 /** A result object from prepare flush cache stage */ 600 static class PrepareFlushResult { 601 final FlushResultImpl result; // indicating a failure result from prepare 602 final TreeMap<byte[], StoreFlushContext> storeFlushCtxs; 603 final TreeMap<byte[], List<Path>> committedFiles; 604 final TreeMap<byte[], MemStoreSize> storeFlushableSize; 605 final long startTime; 606 final long flushOpSeqId; 607 final long flushedSeqId; 608 final MemStoreSizing totalFlushableSize; 609 610 /** Constructs an early exit case */ 611 PrepareFlushResult(FlushResultImpl result, long flushSeqId) { 612 this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, MemStoreSizing.DUD); 613 } 614 615 /** Constructs a successful prepare flush result */ 616 PrepareFlushResult(TreeMap<byte[], StoreFlushContext> storeFlushCtxs, 617 TreeMap<byte[], List<Path>> committedFiles, TreeMap<byte[], MemStoreSize> storeFlushableSize, 618 long startTime, long flushSeqId, long flushedSeqId, MemStoreSizing totalFlushableSize) { 619 this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime, flushSeqId, 620 flushedSeqId, totalFlushableSize); 621 } 622 623 private PrepareFlushResult(FlushResultImpl result, 624 TreeMap<byte[], StoreFlushContext> storeFlushCtxs, TreeMap<byte[], List<Path>> committedFiles, 625 TreeMap<byte[], MemStoreSize> storeFlushableSize, long startTime, long flushSeqId, 626 long flushedSeqId, MemStoreSizing totalFlushableSize) { 627 this.result = result; 628 this.storeFlushCtxs = storeFlushCtxs; 629 
this.committedFiles = committedFiles; 630 this.storeFlushableSize = storeFlushableSize; 631 this.startTime = startTime; 632 this.flushOpSeqId = flushSeqId; 633 this.flushedSeqId = flushedSeqId; 634 this.totalFlushableSize = totalFlushableSize; 635 } 636 637 public FlushResult getResult() { 638 return this.result; 639 } 640 } 641 642 /** 643 * A class that tracks exceptions that have been observed in one batch. Not thread safe. 644 */ 645 static class ObservedExceptionsInBatch { 646 private boolean wrongRegion = false; 647 private boolean failedSanityCheck = false; 648 private boolean wrongFamily = false; 649 650 /** Returns If a {@link WrongRegionException} has been observed. */ 651 boolean hasSeenWrongRegion() { 652 return wrongRegion; 653 } 654 655 /** 656 * Records that a {@link WrongRegionException} has been observed. 657 */ 658 void sawWrongRegion() { 659 wrongRegion = true; 660 } 661 662 /** Returns If a {@link FailedSanityCheckException} has been observed. */ 663 boolean hasSeenFailedSanityCheck() { 664 return failedSanityCheck; 665 } 666 667 /** 668 * Records that a {@link FailedSanityCheckException} has been observed. 669 */ 670 void sawFailedSanityCheck() { 671 failedSanityCheck = true; 672 } 673 674 /** Returns If a {@link NoSuchColumnFamilyException} has been observed. */ 675 boolean hasSeenNoSuchFamily() { 676 return wrongFamily; 677 } 678 679 /** 680 * Records that a {@link NoSuchColumnFamilyException} has been observed. 681 */ 682 void sawNoSuchFamily() { 683 wrongFamily = true; 684 } 685 } 686 687 final WriteState writestate = new WriteState(); 688 689 long memstoreFlushSize; 690 final long timestampSlop; 691 final long rowProcessorTimeout; 692 693 // Last flush time for each Store. 
Useful when we are flushing for each column 694 private final ConcurrentMap<HStore, Long> lastStoreFlushTimeMap = new ConcurrentHashMap<>(); 695 696 protected RegionServerServices rsServices; 697 private RegionServerAccounting rsAccounting; 698 private long flushCheckInterval; 699 // flushPerChanges is to prevent too many changes in memstore 700 private long flushPerChanges; 701 private long blockingMemStoreSize; 702 // Used to guard closes 703 final ReentrantReadWriteLock lock; 704 // Used to track interruptible holders of the region lock. Currently that is only RPC handler 705 // threads. Boolean value in map determines if lock holder can be interrupted, normally true, 706 // but may be false when thread is transiting a critical section. 707 final ConcurrentHashMap<Thread, Boolean> regionLockHolders; 708 709 // Stop updates lock 710 private final ReentrantReadWriteLock updatesLock = new ReentrantReadWriteLock(); 711 712 private final MultiVersionConcurrencyControl mvcc; 713 714 // Coprocessor host 715 private volatile RegionCoprocessorHost coprocessorHost; 716 717 private TableDescriptor htableDescriptor = null; 718 private RegionSplitPolicy splitPolicy; 719 private RegionSplitRestriction splitRestriction; 720 private FlushPolicy flushPolicy; 721 722 private final MetricsRegion metricsRegion; 723 private final MetricsRegionWrapperImpl metricsRegionWrapper; 724 private final Durability regionDurability; 725 private final boolean regionStatsEnabled; 726 // Stores the replication scope of the various column families of the table 727 // that has non-default scope 728 private final NavigableMap<byte[], Integer> replicationScope = 729 new TreeMap<>(Bytes.BYTES_COMPARATOR); 730 731 private final StoreHotnessProtector storeHotnessProtector; 732 733 /** 734 * HRegion constructor. This constructor should only be used for testing and extensions. 
Instances 735 * of HRegion should be instantiated with the {@link HRegion#createHRegion} or 736 * {@link HRegion#openHRegion} method. 737 * @param tableDir qualified path of directory where region should be located, usually the table 738 * directory. 739 * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a 740 * logfile from the previous execution that's custom-computed for this HRegion. 741 * The HRegionServer computes and sorts the appropriate wal info for this 742 * HRegion. If there is a previous wal file (implying that the HRegion has been 743 * written-to before), then read it from the supplied path. 744 * @param fs is the filesystem. 745 * @param confParam is global configuration settings. 746 * @param regionInfo - RegionInfo that describes the region is new), then read them from the 747 * supplied path. 748 * @param htd the table descriptor 749 * @param rsServices reference to {@link RegionServerServices} or null 750 * @deprecated Use other constructors. 751 */ 752 @Deprecated 753 public HRegion(final Path tableDir, final WAL wal, final FileSystem fs, 754 final Configuration confParam, final RegionInfo regionInfo, final TableDescriptor htd, 755 final RegionServerServices rsServices) { 756 this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo), wal, confParam, htd, 757 rsServices); 758 } 759 760 /** 761 * HRegion constructor. This constructor should only be used for testing and extensions. Instances 762 * of HRegion should be instantiated with the {@link HRegion#createHRegion} or 763 * {@link HRegion#openHRegion} method. 764 * @param fs is the filesystem. 765 * @param wal The WAL is the outbound log for any updates to the HRegion The wal file is a 766 * logfile from the previous execution that's custom-computed for this HRegion. 767 * The HRegionServer computes and sorts the appropriate wal info for this 768 * HRegion. 
If there is a previous wal file (implying that the HRegion has been 769 * written-to before), then read it from the supplied path. 770 * @param confParam is global configuration settings. 771 * @param htd the table descriptor 772 * @param rsServices reference to {@link RegionServerServices} or null 773 */ 774 public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam, 775 final TableDescriptor htd, final RegionServerServices rsServices) { 776 if (htd == null) { 777 throw new IllegalArgumentException("Need table descriptor"); 778 } 779 780 if (confParam instanceof CompoundConfiguration) { 781 throw new IllegalArgumentException("Need original base configuration"); 782 } 783 784 this.wal = wal; 785 this.fs = fs; 786 this.mvcc = new MultiVersionConcurrencyControl(getRegionInfo().getShortNameToLog()); 787 788 // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor 789 this.baseConf = confParam; 790 this.conf = new CompoundConfiguration().add(confParam).addBytesMap(htd.getValues()); 791 this.cellComparator = htd.isMetaTable() 792 || conf.getBoolean(USE_META_CELL_COMPARATOR, DEFAULT_USE_META_CELL_COMPARATOR) 793 ? 
MetaCellComparator.META_COMPARATOR 794 : CellComparatorImpl.COMPARATOR; 795 this.lock = new ReentrantReadWriteLock( 796 conf.getBoolean(FAIR_REENTRANT_CLOSE_LOCK, DEFAULT_FAIR_REENTRANT_CLOSE_LOCK)); 797 this.regionLockHolders = new ConcurrentHashMap<>(); 798 this.flushCheckInterval = 799 conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, DEFAULT_CACHE_FLUSH_INTERVAL); 800 this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES); 801 if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) { 802 throw new IllegalArgumentException( 803 MEMSTORE_FLUSH_PER_CHANGES + " can not exceed " + MAX_FLUSH_PER_CHANGES); 804 } 805 int tmpRowLockDuration = 806 conf.getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); 807 if (tmpRowLockDuration <= 0) { 808 LOG.info("Found hbase.rowlock.wait.duration set to {}. values <= 0 will cause all row " 809 + "locking to fail. Treating it as 1ms to avoid region failure.", tmpRowLockDuration); 810 tmpRowLockDuration = 1; 811 } 812 this.rowLockWaitDuration = tmpRowLockDuration; 813 814 this.smallestReadPointCalcLock = new ReadPointCalculationLock(conf); 815 816 this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true); 817 this.htableDescriptor = htd; 818 Set<byte[]> families = this.htableDescriptor.getColumnFamilyNames(); 819 for (byte[] family : families) { 820 if (!replicationScope.containsKey(family)) { 821 int scope = htd.getColumnFamily(family).getScope(); 822 // Only store those families that has NON-DEFAULT scope 823 if (scope != REPLICATION_SCOPE_LOCAL) { 824 // Do a copy before storing it here. 
825 replicationScope.put(Bytes.copy(family), scope); 826 } 827 } 828 } 829 830 this.rsServices = rsServices; 831 if (rsServices != null) { 832 this.blockCache = rsServices.getBlockCache().orElse(null); 833 this.mobFileCache = rsServices.getMobFileCache().orElse(null); 834 } 835 this.regionServicesForStores = new RegionServicesForStores(this, rsServices); 836 837 setHTableSpecificConf(); 838 this.scannerReadPoints = new ConcurrentHashMap<>(); 839 840 this.busyWaitDuration = conf.getLong("hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION); 841 this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2); 842 if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) { 843 throw new IllegalArgumentException("Invalid hbase.busy.wait.duration (" + busyWaitDuration 844 + ") or hbase.busy.wait.multiplier.max (" + maxBusyWaitMultiplier 845 + "). Their product should be positive"); 846 } 847 this.maxBusyWaitDuration = 848 conf.getLong("hbase.ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT); 849 850 /* 851 * timestamp.slop provides a server-side constraint on the timestamp. This assumes that you base 852 * your TS around EnvironmentEdgeManager.currentTime(). In this case, throw an error to the user 853 * if the user-specified TS is newer than now + slop. LATEST_TIMESTAMP == don't use this 854 * functionality 855 */ 856 this.timestampSlop = 857 conf.getLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", HConstants.LATEST_TIMESTAMP); 858 859 /** 860 * Timeout for the process time in processRowsWithLocks(). Use -1 to switch off time bound. 861 */ 862 this.rowProcessorTimeout = 863 conf.getLong("hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT); 864 865 this.storeHotnessProtector = new StoreHotnessProtector(this, conf); 866 867 boolean forceSync = conf.getBoolean(WAL_HSYNC_CONF_KEY, DEFAULT_WAL_HSYNC); 868 /** 869 * This is the global default value for durability. 
All tables/mutations not defining a 870 * durability or using USE_DEFAULT will default to this value. 871 */ 872 Durability defaultDurability = forceSync ? Durability.FSYNC_WAL : Durability.SYNC_WAL; 873 this.regionDurability = this.htableDescriptor.getDurability() == Durability.USE_DEFAULT 874 ? defaultDurability 875 : this.htableDescriptor.getDurability(); 876 877 decorateRegionConfiguration(conf); 878 if (rsServices != null) { 879 this.rsAccounting = this.rsServices.getRegionServerAccounting(); 880 // don't initialize coprocessors if not running within a regionserver 881 // TODO: revisit if coprocessors should load in other cases 882 this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); 883 this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this); 884 this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper, conf); 885 } else { 886 this.metricsRegionWrapper = null; 887 this.metricsRegion = null; 888 } 889 if (LOG.isDebugEnabled()) { 890 // Write out region name, its encoded name and storeHotnessProtector as string. 891 LOG.debug("Instantiated " + this + "; " + storeHotnessProtector.toString()); 892 } 893 894 configurationManager = null; 895 896 // disable stats tracking system tables, but check the config for everything else 897 this.regionStatsEnabled = htd.getTableName().getNamespaceAsString() 898 .equals(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) 899 ? 
false 900 : conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE, 901 HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE); 902 903 this.maxCellSize = conf.getLong(HBASE_MAX_CELL_SIZE_KEY, DEFAULT_MAX_CELL_SIZE); 904 this.miniBatchSize = 905 conf.getInt(HBASE_REGIONSERVER_MINIBATCH_SIZE, DEFAULT_HBASE_REGIONSERVER_MINIBATCH_SIZE); 906 907 // recover the metrics of read and write requests count if they were retained 908 if (rsServices != null && rsServices.getRegionServerAccounting() != null) { 909 Pair<Long, Long> retainedRWRequestsCnt = rsServices.getRegionServerAccounting() 910 .getRetainedRegionRWRequestsCnt().get(getRegionInfo().getEncodedName()); 911 if (retainedRWRequestsCnt != null) { 912 this.addReadRequestsCount(retainedRWRequestsCnt.getFirst()); 913 this.addWriteRequestsCount(retainedRWRequestsCnt.getSecond()); 914 // remove them since won't use again 915 rsServices.getRegionServerAccounting().getRetainedRegionRWRequestsCnt() 916 .remove(getRegionInfo().getEncodedName()); 917 } 918 } 919 920 minBlockSizeBytes = Arrays.stream(this.htableDescriptor.getColumnFamilies()) 921 .mapToInt(ColumnFamilyDescriptor::getBlocksize).min().orElse(HConstants.DEFAULT_BLOCKSIZE); 922 } 923 924 private void setHTableSpecificConf() { 925 if (this.htableDescriptor == null) { 926 return; 927 } 928 long flushSize = this.htableDescriptor.getMemStoreFlushSize(); 929 930 if (flushSize <= 0) { 931 flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE, 932 TableDescriptorBuilder.DEFAULT_MEMSTORE_FLUSH_SIZE); 933 } 934 this.memstoreFlushSize = flushSize; 935 long mult = conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER, 936 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER); 937 this.blockingMemStoreSize = this.memstoreFlushSize * mult; 938 } 939 940 /** 941 * Initialize this region. Used only by tests and SplitTransaction to reopen the region. You 942 * should use createHRegion() or openHRegion() 943 * @return What the next sequence (edit) id should be. 
944 * @throws IOException e 945 * @deprecated use HRegion.createHRegion() or HRegion.openHRegion() 946 */ 947 @Deprecated 948 public long initialize() throws IOException { 949 return initialize(null); 950 } 951 952 /** 953 * Initialize this region. 954 * @param reporter Tickle every so often if initialize is taking a while. 955 * @return What the next sequence (edit) id should be. 956 */ 957 long initialize(final CancelableProgressable reporter) throws IOException { 958 959 // Refuse to open the region if there is no column family in the table 960 if (htableDescriptor.getColumnFamilyCount() == 0) { 961 throw new DoNotRetryIOException("Table " + htableDescriptor.getTableName().getNameAsString() 962 + " should have at least one column family."); 963 } 964 965 MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this, true); 966 long nextSeqId = -1; 967 try { 968 nextSeqId = initializeRegionInternals(reporter, status); 969 return nextSeqId; 970 } catch (IOException e) { 971 LOG.warn("Failed initialize of region= {}, starting to roll back memstore", 972 getRegionInfo().getRegionNameAsString(), e); 973 // global memstore size will be decreased when dropping memstore 974 try { 975 // drop the memory used by memstore if open region fails 976 dropMemStoreContents(); 977 } catch (IOException ioE) { 978 if (conf.getBoolean(MemStoreLAB.USEMSLAB_KEY, MemStoreLAB.USEMSLAB_DEFAULT)) { 979 LOG.warn( 980 "Failed drop memstore of region= {}, " 981 + "some chunks may not released forever since MSLAB is enabled", 982 getRegionInfo().getRegionNameAsString()); 983 } 984 985 } 986 if (metricsTableRequests != null) { 987 metricsTableRequests.removeRegistry(); 988 } 989 throw e; 990 } finally { 991 // nextSeqid will be -1 if the initialization fails. 992 // At least it will be 0 otherwise. 
993 if (nextSeqId == -1) { 994 status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() 995 + " initialization."); 996 } 997 if (LOG.isDebugEnabled()) { 998 LOG.debug("Region open journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 999 status.prettyPrintJournal()); 1000 } 1001 status.cleanup(); 1002 } 1003 } 1004 1005 private long initializeRegionInternals(final CancelableProgressable reporter, 1006 final MonitoredTask status) throws IOException { 1007 if (coprocessorHost != null) { 1008 status.setStatus("Running coprocessor pre-open hook"); 1009 coprocessorHost.preOpen(); 1010 } 1011 1012 String policyName = this.conf.get(REGION_STORAGE_POLICY_KEY, DEFAULT_REGION_STORAGE_POLICY); 1013 this.fs.setStoragePolicy(policyName.trim()); 1014 1015 // Write HRI to a file in case we need to recover hbase:meta 1016 // Only the primary replica should write .regioninfo 1017 if (this.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID) { 1018 status.setStatus("Writing region info on filesystem"); 1019 fs.checkRegionInfoOnFilesystem(); 1020 } 1021 1022 // Initialize all the HStores 1023 status.setStatus("Initializing all the Stores"); 1024 long maxSeqId = initializeStores(reporter, status); 1025 this.mvcc.advanceTo(maxSeqId); 1026 if (!isRestoredRegion && ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this)) { 1027 Collection<HStore> stores = this.stores.values(); 1028 try { 1029 // update the stores that we are replaying 1030 stores.forEach(HStore::startReplayingFromWAL); 1031 // Recover any edits if available. 1032 maxSeqId = 1033 Math.max(maxSeqId, replayRecoveredEditsIfAny(maxSeqIdInStores, reporter, status)); 1034 // Recover any hfiles if available 1035 maxSeqId = Math.max(maxSeqId, loadRecoveredHFilesIfAny(stores)); 1036 // Make sure mvcc is up to max. 
1037 this.mvcc.advanceTo(maxSeqId); 1038 } finally { 1039 // update the stores that we are done replaying 1040 stores.forEach(HStore::stopReplayingFromWAL); 1041 } 1042 } 1043 this.lastReplayedOpenRegionSeqId = maxSeqId; 1044 1045 this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this)); 1046 this.writestate.flushRequested = false; 1047 this.writestate.compacting.set(0); 1048 1049 if (this.writestate.writesEnabled) { 1050 // Remove temporary data left over from old regions 1051 status.setStatus("Cleaning up temporary data from old regions"); 1052 fs.cleanupTempDir(); 1053 } 1054 1055 // Initialize split policy 1056 this.splitPolicy = RegionSplitPolicy.create(this, conf); 1057 1058 // Initialize split restriction 1059 splitRestriction = RegionSplitRestriction.create(getTableDescriptor(), conf); 1060 1061 // Initialize flush policy 1062 this.flushPolicy = FlushPolicyFactory.create(this, conf); 1063 1064 long lastFlushTime = EnvironmentEdgeManager.currentTime(); 1065 for (HStore store : stores.values()) { 1066 this.lastStoreFlushTimeMap.put(store, lastFlushTime); 1067 } 1068 1069 // Use maximum of log sequenceid or that which was found in stores 1070 // (particularly if no recovered edits, seqid will be -1). 1071 long nextSeqId = maxSeqId + 1; 1072 if (!isRestoredRegion) { 1073 // always get openSeqNum from the default replica, even if we are secondary replicas 1074 long maxSeqIdFromFile = WALSplitUtil.getMaxRegionSequenceId(conf, 1075 RegionReplicaUtil.getRegionInfoForDefaultReplica(getRegionInfo()), this::getFilesystem, 1076 this::getWalFileSystem); 1077 nextSeqId = Math.max(maxSeqId, maxSeqIdFromFile) + 1; 1078 // The openSeqNum will always be increase even for read only region, as we rely on it to 1079 // determine whether a region has been successfully reopened, so here we always need to update 1080 // the max sequence id file. 
1081 if (RegionReplicaUtil.isDefaultReplica(getRegionInfo())) { 1082 LOG.debug("writing seq id for {}", this.getRegionInfo().getEncodedName()); 1083 WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), 1084 nextSeqId - 1); 1085 // This means we have replayed all the recovered edits and also written out the max sequence 1086 // id file, let's delete the wrong directories introduced in HBASE-20734, see HBASE-22617 1087 // for more details. 1088 Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, 1089 getRegionInfo().getTable(), getRegionInfo().getEncodedName()); 1090 FileSystem walFs = getWalFileSystem(); 1091 if (walFs.exists(wrongRegionWALDir)) { 1092 if (!walFs.delete(wrongRegionWALDir, true)) { 1093 LOG.debug("Failed to clean up wrong region WAL directory {}", wrongRegionWALDir); 1094 } 1095 } 1096 } 1097 } 1098 1099 LOG.info("Opened {}; next sequenceid={}; {}, {}", this.getRegionInfo().getShortNameToLog(), 1100 nextSeqId, this.splitPolicy, this.flushPolicy); 1101 1102 // A region can be reopened if failed a split; reset flags 1103 this.closing.set(false); 1104 this.closed.set(false); 1105 1106 if (coprocessorHost != null) { 1107 status.setStatus("Running coprocessor post-open hooks"); 1108 coprocessorHost.postOpen(); 1109 } 1110 1111 metricsTableRequests = new MetricsTableRequests(htableDescriptor.getTableName(), conf); 1112 1113 status.markComplete("Region opened successfully"); 1114 return nextSeqId; 1115 } 1116 1117 /** 1118 * Open all Stores. 1119 * @return Highest sequenceId found out in a Store. 1120 */ 1121 private long initializeStores(CancelableProgressable reporter, MonitoredTask status) 1122 throws IOException { 1123 return initializeStores(reporter, status, false); 1124 } 1125 1126 private long initializeStores(CancelableProgressable reporter, MonitoredTask status, 1127 boolean warmup) throws IOException { 1128 // Load in all the HStores. 
1129 long maxSeqId = -1; 1130 // initialized to -1 so that we pick up MemstoreTS from column families 1131 long maxMemstoreTS = -1; 1132 1133 if (htableDescriptor.getColumnFamilyCount() != 0) { 1134 // initialize the thread pool for opening stores in parallel. 1135 ThreadPoolExecutor storeOpenerThreadPool = 1136 getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog()); 1137 CompletionService<HStore> completionService = 1138 new ExecutorCompletionService<>(storeOpenerThreadPool); 1139 1140 // initialize each store in parallel 1141 for (final ColumnFamilyDescriptor family : htableDescriptor.getColumnFamilies()) { 1142 status.setStatus("Instantiating store for column family " + family); 1143 completionService.submit(new Callable<HStore>() { 1144 @Override 1145 public HStore call() throws IOException { 1146 return instantiateHStore(family, warmup); 1147 } 1148 }); 1149 } 1150 boolean allStoresOpened = false; 1151 boolean hasSloppyStores = false; 1152 try { 1153 for (int i = 0; i < htableDescriptor.getColumnFamilyCount(); i++) { 1154 Future<HStore> future = completionService.take(); 1155 HStore store = future.get(); 1156 this.stores.put(store.getColumnFamilyDescriptor().getName(), store); 1157 if (store.isSloppyMemStore()) { 1158 hasSloppyStores = true; 1159 } 1160 1161 long storeMaxSequenceId = store.getMaxSequenceId().orElse(0L); 1162 maxSeqIdInStores.put(Bytes.toBytes(store.getColumnFamilyName()), storeMaxSequenceId); 1163 if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) { 1164 maxSeqId = storeMaxSequenceId; 1165 } 1166 long maxStoreMemstoreTS = store.getMaxMemStoreTS().orElse(0L); 1167 if (maxStoreMemstoreTS > maxMemstoreTS) { 1168 maxMemstoreTS = maxStoreMemstoreTS; 1169 } 1170 } 1171 allStoresOpened = true; 1172 if (hasSloppyStores) { 1173 htableDescriptor = TableDescriptorBuilder.newBuilder(htableDescriptor) 1174 .setFlushPolicyClassName(FlushNonSloppyStoresFirstPolicy.class.getName()).build(); 1175 LOG.info("Setting 
FlushNonSloppyStoresFirstPolicy for the region=" + this); 1176 } 1177 } catch (InterruptedException e) { 1178 throw throwOnInterrupt(e); 1179 } catch (ExecutionException e) { 1180 throw new IOException(e.getCause()); 1181 } finally { 1182 storeOpenerThreadPool.shutdownNow(); 1183 if (!allStoresOpened) { 1184 // something went wrong, close all opened stores 1185 LOG.error("Could not initialize all stores for the region=" + this); 1186 for (HStore store : this.stores.values()) { 1187 try { 1188 store.close(); 1189 } catch (IOException e) { 1190 LOG.warn("close store {} failed in region {}", store.toString(), this, e); 1191 } 1192 } 1193 } 1194 } 1195 } 1196 return Math.max(maxSeqId, maxMemstoreTS + 1); 1197 } 1198 1199 private void initializeWarmup(final CancelableProgressable reporter) throws IOException { 1200 MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this); 1201 // Initialize all the HStores 1202 status.setStatus("Warmup all stores of " + this.getRegionInfo().getRegionNameAsString()); 1203 try { 1204 initializeStores(reporter, status, true); 1205 } finally { 1206 status.markComplete("Warmed up " + this.getRegionInfo().getRegionNameAsString()); 1207 } 1208 } 1209 1210 /** Returns Map of StoreFiles by column family */ 1211 private NavigableMap<byte[], List<Path>> getStoreFiles() { 1212 NavigableMap<byte[], List<Path>> allStoreFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); 1213 for (HStore store : stores.values()) { 1214 Collection<HStoreFile> storeFiles = store.getStorefiles(); 1215 if (storeFiles == null) { 1216 continue; 1217 } 1218 List<Path> storeFileNames = new ArrayList<>(); 1219 for (HStoreFile storeFile : storeFiles) { 1220 storeFileNames.add(storeFile.getPath()); 1221 } 1222 allStoreFiles.put(store.getColumnFamilyDescriptor().getName(), storeFileNames); 1223 } 1224 return allStoreFiles; 1225 } 1226 1227 protected void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException { 1228 Map<byte[], List<Path>> 
storeFiles = getStoreFiles(); 1229 RegionEventDescriptor regionOpenDesc = 1230 ProtobufUtil.toRegionEventDescriptor(RegionEventDescriptor.EventType.REGION_OPEN, 1231 getRegionInfo(), openSeqId, getRegionServerServices().getServerName(), storeFiles); 1232 WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionOpenDesc, 1233 mvcc); 1234 } 1235 1236 private void writeRegionCloseMarker(WAL wal) throws IOException { 1237 Map<byte[], List<Path>> storeFiles = getStoreFiles(); 1238 RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor( 1239 RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), mvcc.getReadPoint(), 1240 getRegionServerServices().getServerName(), storeFiles); 1241 WALUtil.writeRegionEventMarker(wal, getReplicationScope(), getRegionInfo(), regionEventDesc, 1242 mvcc); 1243 1244 // Store SeqId in WAL FileSystem when a region closes 1245 // checking region folder exists is due to many tests which delete the table folder while a 1246 // table is still online 1247 if (getWalFileSystem().exists(getWALRegionDir())) { 1248 WALSplitUtil.writeRegionSequenceIdFile(getWalFileSystem(), getWALRegionDir(), 1249 mvcc.getReadPoint()); 1250 } 1251 } 1252 1253 /** Returns True if this region has references. 
*/ 1254 public boolean hasReferences() { 1255 return stores.values().stream().anyMatch(HStore::hasReferences); 1256 } 1257 1258 public void blockUpdates() { 1259 this.updatesLock.writeLock().lock(); 1260 } 1261 1262 public void unblockUpdates() { 1263 this.updatesLock.writeLock().unlock(); 1264 } 1265 1266 public HDFSBlocksDistribution getHDFSBlocksDistribution() { 1267 HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); 1268 stores.values().stream().filter(s -> s.getStorefiles() != null) 1269 .flatMap(s -> s.getStorefiles().stream()).map(HStoreFile::getHDFSBlockDistribution) 1270 .forEachOrdered(hdfsBlocksDistribution::add); 1271 return hdfsBlocksDistribution; 1272 } 1273 1274 /** 1275 * This is a helper function to compute HDFS block distribution on demand 1276 * @param conf configuration 1277 * @param tableDescriptor TableDescriptor of the table 1278 * @param regionInfo encoded name of the region 1279 * @return The HDFS blocks distribution for the given region. 1280 */ 1281 public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, 1282 TableDescriptor tableDescriptor, RegionInfo regionInfo) throws IOException { 1283 Path tablePath = 1284 CommonFSUtils.getTableDir(CommonFSUtils.getRootDir(conf), tableDescriptor.getTableName()); 1285 return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath); 1286 } 1287 1288 /** 1289 * This is a helper function to compute HDFS block distribution on demand 1290 * @param conf configuration 1291 * @param tableDescriptor TableDescriptor of the table 1292 * @param regionInfo encoded name of the region 1293 * @param tablePath the table directory 1294 * @return The HDFS blocks distribution for the given region. 
1295 */ 1296 public static HDFSBlocksDistribution computeHDFSBlocksDistribution(Configuration conf, 1297 TableDescriptor tableDescriptor, RegionInfo regionInfo, Path tablePath) throws IOException { 1298 HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution(); 1299 FileSystem fs = tablePath.getFileSystem(conf); 1300 1301 HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo); 1302 for (ColumnFamilyDescriptor family : tableDescriptor.getColumnFamilies()) { 1303 List<LocatedFileStatus> locatedFileStatusList = 1304 HRegionFileSystem.getStoreFilesLocatedStatus(regionFs, family.getNameAsString(), true); 1305 if (locatedFileStatusList == null) { 1306 continue; 1307 } 1308 1309 for (LocatedFileStatus status : locatedFileStatusList) { 1310 Path p = status.getPath(); 1311 if (StoreFileInfo.isReference(p) || HFileLink.isHFileLink(p)) { 1312 // Only construct StoreFileInfo object if its not a hfile, save obj 1313 // creation 1314 StoreFileInfo storeFileInfo = new StoreFileInfo(conf, fs, status); 1315 hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs)); 1316 } else if (StoreFileInfo.isHFile(p)) { 1317 // If its a HFile, then lets just add to the block distribution 1318 // lets not create more objects here, not even another HDFSBlocksDistribution 1319 FSUtils.addToHDFSBlocksDistribution(hdfsBlocksDistribution, status.getBlockLocations()); 1320 } else { 1321 throw new IOException("path=" + p + " doesn't look like a valid StoreFile"); 1322 } 1323 } 1324 } 1325 return hdfsBlocksDistribution; 1326 } 1327 1328 /** 1329 * Increase the size of mem store in this region and the size of global mem store 1330 */ 1331 private void incMemStoreSize(MemStoreSize mss) { 1332 incMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), 1333 mss.getCellsCount()); 1334 } 1335 1336 void incMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, 1337 int cellsCountDelta) { 1338 if 
(this.rsAccounting != null) { 1339 rsAccounting.incGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); 1340 } 1341 long dataSize = this.memStoreSizing.incMemStoreSize(dataSizeDelta, heapSizeDelta, 1342 offHeapSizeDelta, cellsCountDelta); 1343 checkNegativeMemStoreDataSize(dataSize, dataSizeDelta); 1344 } 1345 1346 void decrMemStoreSize(MemStoreSize mss) { 1347 decrMemStoreSize(mss.getDataSize(), mss.getHeapSize(), mss.getOffHeapSize(), 1348 mss.getCellsCount()); 1349 } 1350 1351 private void decrMemStoreSize(long dataSizeDelta, long heapSizeDelta, long offHeapSizeDelta, 1352 int cellsCountDelta) { 1353 if (this.rsAccounting != null) { 1354 rsAccounting.decGlobalMemStoreSize(dataSizeDelta, heapSizeDelta, offHeapSizeDelta); 1355 } 1356 long dataSize = this.memStoreSizing.decMemStoreSize(dataSizeDelta, heapSizeDelta, 1357 offHeapSizeDelta, cellsCountDelta); 1358 checkNegativeMemStoreDataSize(dataSize, -dataSizeDelta); 1359 } 1360 1361 private void checkNegativeMemStoreDataSize(long memStoreDataSize, long delta) { 1362 // This is extremely bad if we make memStoreSizing negative. Log as much info on the offending 1363 // caller as possible. (memStoreSizing might be a negative value already -- freeing memory) 1364 if (memStoreDataSize < 0) { 1365 LOG.error("Asked to modify this region's (" + this.toString() 1366 + ") memStoreSizing to a negative value which is incorrect. Current memStoreSizing=" 1367 + (memStoreDataSize - delta) + ", delta=" + delta, new Exception()); 1368 } 1369 } 1370 1371 @Override 1372 public RegionInfo getRegionInfo() { 1373 return this.fs.getRegionInfo(); 1374 } 1375 1376 /** 1377 * Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. 
1378 */ 1379 RegionServerServices getRegionServerServices() { 1380 return this.rsServices; 1381 } 1382 1383 @Override 1384 public long getReadRequestsCount() { 1385 return readRequestsCount.sum(); 1386 } 1387 1388 @Override 1389 public long getFilteredReadRequestsCount() { 1390 return filteredReadRequestsCount.sum(); 1391 } 1392 1393 @Override 1394 public long getWriteRequestsCount() { 1395 return writeRequestsCount.sum(); 1396 } 1397 1398 @Override 1399 public long getMemStoreDataSize() { 1400 return memStoreSizing.getDataSize(); 1401 } 1402 1403 @Override 1404 public long getMemStoreHeapSize() { 1405 return memStoreSizing.getHeapSize(); 1406 } 1407 1408 @Override 1409 public long getMemStoreOffHeapSize() { 1410 return memStoreSizing.getOffHeapSize(); 1411 } 1412 1413 /** Returns store services for this region, to access services required by store level needs */ 1414 public RegionServicesForStores getRegionServicesForStores() { 1415 return regionServicesForStores; 1416 } 1417 1418 @Override 1419 public long getNumMutationsWithoutWAL() { 1420 return numMutationsWithoutWAL.sum(); 1421 } 1422 1423 @Override 1424 public long getDataInMemoryWithoutWAL() { 1425 return dataInMemoryWithoutWAL.sum(); 1426 } 1427 1428 @Override 1429 public long getBlockedRequestsCount() { 1430 return blockedRequestsCount.sum(); 1431 } 1432 1433 @Override 1434 public long getCheckAndMutateChecksPassed() { 1435 return checkAndMutateChecksPassed.sum(); 1436 } 1437 1438 @Override 1439 public long getCheckAndMutateChecksFailed() { 1440 return checkAndMutateChecksFailed.sum(); 1441 } 1442 1443 // TODO Needs to check whether we should expose our metrics system to CPs. If CPs themselves doing 1444 // the op and bypassing the core, this might be needed? Should be stop supporting the bypass 1445 // feature? 
1446 public MetricsRegion getMetrics() { 1447 return metricsRegion; 1448 } 1449 1450 @Override 1451 public boolean isClosed() { 1452 return this.closed.get(); 1453 } 1454 1455 @Override 1456 public boolean isClosing() { 1457 return this.closing.get(); 1458 } 1459 1460 @Override 1461 public boolean isReadOnly() { 1462 return this.writestate.isReadOnly(); 1463 } 1464 1465 @Override 1466 public boolean isAvailable() { 1467 return !isClosed() && !isClosing(); 1468 } 1469 1470 @Override 1471 public boolean isSplittable() { 1472 return splitPolicy.canSplit(); 1473 } 1474 1475 @Override 1476 public boolean isMergeable() { 1477 if (!isAvailable()) { 1478 LOG.debug("Region " + this + " is not mergeable because it is closing or closed"); 1479 return false; 1480 } 1481 if (hasReferences()) { 1482 LOG.debug("Region " + this + " is not mergeable because it has references"); 1483 return false; 1484 } 1485 1486 return true; 1487 } 1488 1489 public boolean areWritesEnabled() { 1490 synchronized (this.writestate) { 1491 return this.writestate.writesEnabled; 1492 } 1493 } 1494 1495 public MultiVersionConcurrencyControl getMVCC() { 1496 return mvcc; 1497 } 1498 1499 @Override 1500 public long getMaxFlushedSeqId() { 1501 return maxFlushedSeqId; 1502 } 1503 1504 /** Returns readpoint considering given IsolationLevel. Pass {@code null} for default */ 1505 public long getReadPoint(IsolationLevel isolationLevel) { 1506 if (isolationLevel != null && isolationLevel == IsolationLevel.READ_UNCOMMITTED) { 1507 // This scan can read even uncommitted transactions 1508 return Long.MAX_VALUE; 1509 } 1510 return mvcc.getReadPoint(); 1511 } 1512 1513 public boolean isLoadingCfsOnDemandDefault() { 1514 return this.isLoadingCfsOnDemandDefault; 1515 } 1516 1517 /** 1518 * Close down this HRegion. Flush the cache, shut down each HStore, don't service any more calls. 1519 * <p> 1520 * This method could take some time to execute, so don't call it from a time-sensitive thread. 
* @return Map, keyed by column family name, of the store files that the HRegion's component
 *         HStores make use of. Returns null if the region is already closed.
 * @throws IOException e
 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
 *                                  not properly persisted. The region is put in closing mode, and
 *                                  the caller MUST abort after this.
 */
public Map<byte[], List<HStoreFile>> close() throws IOException {
  return close(false);
}

/** Monitor that serializes concurrent callers of the three-argument close. */
private final Object closeLock = new Object();

/** Conf key for fair locking policy */
public static final String FAIR_REENTRANT_CLOSE_LOCK =
  "hbase.regionserver.fair.region.close.lock";
public static final boolean DEFAULT_FAIR_REENTRANT_CLOSE_LOCK = true;
/** Conf key for the periodic flush interval */
public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
  "hbase.regionserver.optionalcacheflushinterval";
/** Default interval for the memstore flush */
// NOTE(review): presumably milliseconds (1 hour), by analogy with the constant below - confirm.
public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
/** Default interval for System tables memstore flush */
public static final int SYSTEM_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes

/** Conf key to force a flush if there are already enough changes for one region in memstore */
public static final String MEMSTORE_FLUSH_PER_CHANGES = "hbase.regionserver.flush.per.changes";
public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 millions
/**
 * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
 * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
 */
public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G

/**
 * Conf key that enables the time-limited wait for the close lock in doClose, interrupting of
 * running region operations while waiting, and aborting the server if the lock is not acquired.
 */
public static final String CLOSE_WAIT_ABORT = "hbase.regionserver.close.wait.abort";
public static final boolean DEFAULT_CLOSE_WAIT_ABORT = false;
/** Conf key for the overall time limit, in milliseconds, for acquiring the close lock. */
public static final String CLOSE_WAIT_TIME = "hbase.regionserver.close.wait.time.ms";
public static final long DEFAULT_CLOSE_WAIT_TIME = 60000; // 1 minute
/** Conf key for the length, in milliseconds, of each individual close lock acquisition try. */
public static final String CLOSE_WAIT_INTERVAL = "hbase.regionserver.close.wait.interval.ms";
public static final long DEFAULT_CLOSE_WAIT_INTERVAL = 10000; // 10 seconds

/**
 * Close this HRegion; equivalent to {@code close(abort, false)}.
 * @param abort true if server is aborting (only during testing)
 * @return Map, keyed by column family name, of the store files in use. Can be null if we are
 *         already closed.
 * @throws IOException e
 */
public Map<byte[], List<HStoreFile>> close(boolean abort) throws IOException {
  return close(abort, false);
}

/**
 * Close this HRegion; equivalent to {@code close(abort, ignoreStatus, false)}.
 * @param abort        true if server is aborting (only during testing)
 * @param ignoreStatus true if ignore the status (won't be showed on task list)
 * @return Map, keyed by column family name, of the store files that the HRegion's component
 *         HStores make use of. Can be null if we are not to close at this time, or we are
 *         already closed.
 * @throws IOException e
 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
 *                                  not properly persisted. The region is put in closing mode, and
 *                                  the caller MUST abort after this.
 */
public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus)
  throws IOException {
  return close(abort, ignoreStatus, false);
}

/**
 * Close down this HRegion. Flush the cache unless abort parameter is true, Shut down each HStore,
 * don't service any more calls. This method could take some time to execute, so don't call it
 * from a time-sensitive thread.
1588 * @param abort true if server is aborting (only during testing) 1589 * @param ignoreStatus true if ignore the status (wont be showed on task list) 1590 * @param isGracefulStop true if region is being closed during graceful stop and the blocks in the 1591 * BucketCache should not be evicted. 1592 * @return Vector of all the storage files that the HRegion's component HStores make use of. It's 1593 * a list of StoreFile objects. Can be null if we are not to close at this time or we are 1594 * already closed. 1595 * @throws IOException e 1596 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was 1597 * not properly persisted. The region is put in closing mode, and 1598 * the caller MUST abort after this. 1599 */ 1600 public Map<byte[], List<HStoreFile>> close(boolean abort, boolean ignoreStatus, 1601 boolean isGracefulStop) throws IOException { 1602 // Only allow one thread to close at a time. Serialize them so dual 1603 // threads attempting to close will run up against each other. 1604 MonitoredTask status = TaskMonitor.get().createStatus( 1605 "Closing region " + this.getRegionInfo().getEncodedName() + (abort ? " due to abort" : ""), 1606 true); 1607 status.setStatus("Waiting for close lock"); 1608 try { 1609 synchronized (closeLock) { 1610 if (isGracefulStop && rsServices != null) { 1611 rsServices.getBlockCache().ifPresent(blockCache -> { 1612 if (blockCache instanceof CombinedBlockCache) { 1613 BlockCache l2 = ((CombinedBlockCache) blockCache).getSecondLevelCache(); 1614 if (l2 instanceof BucketCache) { 1615 if (((BucketCache) l2).isCachePersistenceEnabled()) { 1616 LOG.info( 1617 "Closing region {} during a graceful stop, and cache persistence is on, " 1618 + "so setting evict on close to false. 
", 1619 this.getRegionInfo().getRegionNameAsString()); 1620 this.getStores().forEach(s -> s.getCacheConfig().setEvictOnClose(false)); 1621 } 1622 } 1623 } 1624 }); 1625 } 1626 return doClose(abort, status); 1627 } 1628 } finally { 1629 if (LOG.isDebugEnabled()) { 1630 LOG.debug("Region close journal for {}:\n{}", this.getRegionInfo().getEncodedName(), 1631 status.prettyPrintJournal()); 1632 } 1633 status.cleanup(); 1634 } 1635 } 1636 1637 /** 1638 * Exposed for some very specific unit tests. 1639 */ 1640 public void setClosing(boolean closing) { 1641 this.closing.set(closing); 1642 } 1643 1644 /** 1645 * The {@link HRegion#doClose} will block forever if someone tries proving the dead lock via the 1646 * unit test. Instead of blocking, the {@link HRegion#doClose} will throw exception if you set the 1647 * timeout. 1648 * @param timeoutForWriteLock the second time to wait for the write lock in 1649 * {@link HRegion#doClose} 1650 */ 1651 public void setTimeoutForWriteLock(long timeoutForWriteLock) { 1652 assert timeoutForWriteLock >= 0; 1653 this.timeoutForWriteLock = timeoutForWriteLock; 1654 } 1655 1656 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "UL_UNRELEASED_LOCK_EXCEPTION_PATH", 1657 justification = "I think FindBugs is confused") 1658 private Map<byte[], List<HStoreFile>> doClose(boolean abort, MonitoredTask status) 1659 throws IOException { 1660 if (isClosed()) { 1661 LOG.warn("Region " + this + " already closed"); 1662 return null; 1663 } 1664 1665 if (coprocessorHost != null) { 1666 status.setStatus("Running coprocessor pre-close hooks"); 1667 this.coprocessorHost.preClose(abort); 1668 } 1669 status.setStatus("Disabling compacts and flushes for region"); 1670 boolean canFlush = true; 1671 synchronized (writestate) { 1672 // Disable compacting and flushing by background threads for this 1673 // region. 
1674 canFlush = !writestate.readOnly; 1675 writestate.writesEnabled = false; 1676 LOG.debug("Closing {}, disabling compactions & flushes", 1677 this.getRegionInfo().getEncodedName()); 1678 waitForFlushesAndCompactions(); 1679 } 1680 // If we were not just flushing, is it worth doing a preflush...one 1681 // that will clear out of the bulk of the memstore before we put up 1682 // the close flag? 1683 if (!abort && worthPreFlushing() && canFlush) { 1684 status.setStatus("Pre-flushing region before close"); 1685 LOG.info("Running close preflush of {}", this.getRegionInfo().getEncodedName()); 1686 try { 1687 internalFlushcache(status); 1688 } catch (IOException ioe) { 1689 // Failed to flush the region. Keep going. 1690 status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage()); 1691 } 1692 } 1693 1694 // Set the closing flag 1695 // From this point new arrivals at the region lock will get NSRE. 1696 1697 this.closing.set(true); 1698 LOG.info("Closing region {}", this); 1699 1700 // Acquire the close lock 1701 1702 // The configuration parameter CLOSE_WAIT_ABORT is overloaded to enable both 1703 // the new regionserver abort condition and interrupts for running requests. 1704 // If CLOSE_WAIT_ABORT is not enabled there is no change from earlier behavior, 1705 // we will not attempt to interrupt threads servicing requests nor crash out 1706 // the regionserver if something remains stubborn. 
1707 1708 final boolean canAbort = conf.getBoolean(CLOSE_WAIT_ABORT, DEFAULT_CLOSE_WAIT_ABORT); 1709 boolean useTimedWait = false; 1710 if (timeoutForWriteLock != null && timeoutForWriteLock != Long.MAX_VALUE) { 1711 // convert legacy use of timeoutForWriteLock in seconds to new use in millis 1712 timeoutForWriteLock = TimeUnit.SECONDS.toMillis(timeoutForWriteLock); 1713 useTimedWait = true; 1714 } else if (canAbort) { 1715 timeoutForWriteLock = conf.getLong(CLOSE_WAIT_TIME, DEFAULT_CLOSE_WAIT_TIME); 1716 useTimedWait = true; 1717 } 1718 if (LOG.isDebugEnabled()) { 1719 LOG.debug((useTimedWait ? "Time limited wait" : "Waiting without time limit") 1720 + " for close lock on " + this); 1721 } 1722 final long closeWaitInterval = conf.getLong(CLOSE_WAIT_INTERVAL, DEFAULT_CLOSE_WAIT_INTERVAL); 1723 long elapsedWaitTime = 0; 1724 if (useTimedWait) { 1725 // Sanity check configuration 1726 long remainingWaitTime = timeoutForWriteLock; 1727 if (remainingWaitTime < closeWaitInterval) { 1728 LOG.warn("Time limit for close wait of " + timeoutForWriteLock 1729 + " ms is less than the configured lock acquisition wait interval " + closeWaitInterval 1730 + " ms, using wait interval as time limit"); 1731 remainingWaitTime = closeWaitInterval; 1732 } 1733 boolean acquired = false; 1734 do { 1735 long start = EnvironmentEdgeManager.currentTime(); 1736 try { 1737 acquired = lock.writeLock().tryLock(Math.min(remainingWaitTime, closeWaitInterval), 1738 TimeUnit.MILLISECONDS); 1739 } catch (InterruptedException e) { 1740 // Interrupted waiting for close lock. More likely the server is shutting down, not 1741 // normal operation, so aborting upon interrupt while waiting on this lock would not 1742 // provide much value. Throw an IOE (as IIOE) like we would in the case where we 1743 // fail to acquire the lock. 
1744 String msg = "Interrupted while waiting for close lock on " + this; 1745 LOG.warn(msg, e); 1746 throw (InterruptedIOException) new InterruptedIOException(msg).initCause(e); 1747 } 1748 long elapsed = EnvironmentEdgeManager.currentTime() - start; 1749 elapsedWaitTime += elapsed; 1750 remainingWaitTime -= elapsed; 1751 if (canAbort && !acquired && remainingWaitTime > 0) { 1752 // Before we loop to wait again, interrupt all region operations that might 1753 // still be in progress, to encourage them to break out of waiting states or 1754 // inner loops, throw an exception to clients, and release the read lock via 1755 // endRegionOperation. 1756 if (LOG.isDebugEnabled()) { 1757 LOG.debug("Interrupting region operations after waiting for close lock for " 1758 + elapsedWaitTime + " ms on " + this + ", " + remainingWaitTime + " ms remaining"); 1759 } 1760 interruptRegionOperations(); 1761 } 1762 } while (!acquired && remainingWaitTime > 0); 1763 1764 // If we fail to acquire the lock, trigger an abort if we can; otherwise throw an IOE 1765 // to let the caller know we could not proceed with the close. 
1766 if (!acquired) { 1767 String msg = 1768 "Failed to acquire close lock on " + this + " after waiting " + elapsedWaitTime + " ms"; 1769 LOG.error(msg); 1770 if (canAbort) { 1771 // If we failed to acquire the write lock, abort the server 1772 rsServices.abort(msg, null); 1773 } 1774 throw new IOException(msg); 1775 } 1776 1777 } else { 1778 1779 long start = EnvironmentEdgeManager.currentTime(); 1780 lock.writeLock().lock(); 1781 elapsedWaitTime = EnvironmentEdgeManager.currentTime() - start; 1782 1783 } 1784 1785 if (LOG.isDebugEnabled()) { 1786 LOG.debug("Acquired close lock on " + this + " after waiting " + elapsedWaitTime + " ms"); 1787 } 1788 1789 status.setStatus("Disabling writes for close"); 1790 try { 1791 if (this.isClosed()) { 1792 status.abort("Already got closed by another process"); 1793 // SplitTransaction handles the null 1794 return null; 1795 } 1796 LOG.debug("Updates disabled for region " + this); 1797 // Don't flush the cache if we are aborting 1798 if (!abort && canFlush) { 1799 int failedfFlushCount = 0; 1800 int flushCount = 0; 1801 long tmp = 0; 1802 long remainingSize = this.memStoreSizing.getDataSize(); 1803 while (remainingSize > 0) { 1804 try { 1805 internalFlushcache(status); 1806 if (flushCount > 0) { 1807 LOG.info("Running extra flush, " + flushCount + " (carrying snapshot?) 
" + this); 1808 } 1809 flushCount++; 1810 tmp = this.memStoreSizing.getDataSize(); 1811 if (tmp >= remainingSize) { 1812 failedfFlushCount++; 1813 } 1814 remainingSize = tmp; 1815 if (failedfFlushCount > 5) { 1816 // If we failed 5 times and are unable to clear memory, abort 1817 // so we do not lose data 1818 throw new DroppedSnapshotException("Failed clearing memory after " + flushCount 1819 + " attempts on region: " + Bytes.toStringBinary(getRegionInfo().getRegionName())); 1820 } 1821 } catch (IOException ioe) { 1822 status.setStatus("Failed flush " + this + ", putting online again"); 1823 synchronized (writestate) { 1824 writestate.writesEnabled = true; 1825 } 1826 // Have to throw to upper layers. I can't abort server from here. 1827 throw ioe; 1828 } 1829 } 1830 } 1831 1832 Map<byte[], List<HStoreFile>> result = new TreeMap<>(Bytes.BYTES_COMPARATOR); 1833 if (!stores.isEmpty()) { 1834 // initialize the thread pool for closing stores in parallel. 1835 ThreadPoolExecutor storeCloserThreadPool = 1836 getStoreOpenAndCloseThreadPool("StoreCloser-" + getRegionInfo().getRegionNameAsString()); 1837 CompletionService<Pair<byte[], Collection<HStoreFile>>> completionService = 1838 new ExecutorCompletionService<>(storeCloserThreadPool); 1839 1840 // close each store in parallel 1841 for (HStore store : stores.values()) { 1842 MemStoreSize mss = store.getFlushableSize(); 1843 if (!(abort || mss.getDataSize() == 0 || writestate.readOnly)) { 1844 if (getRegionServerServices() != null) { 1845 getRegionServerServices().abort("Assertion failed while closing store " 1846 + getRegionInfo().getRegionNameAsString() + " " + store 1847 + ". flushableSize expected=0, actual={" + mss + "}. Current memStoreSize=" 1848 + this.memStoreSizing.getMemStoreSize() + ". 
Maybe a coprocessor " 1849 + "operation failed and left the memstore in a partially updated state.", null); 1850 } 1851 } 1852 completionService.submit(new Callable<Pair<byte[], Collection<HStoreFile>>>() { 1853 @Override 1854 public Pair<byte[], Collection<HStoreFile>> call() throws IOException { 1855 return new Pair<>(store.getColumnFamilyDescriptor().getName(), store.close()); 1856 } 1857 }); 1858 } 1859 try { 1860 for (int i = 0; i < stores.size(); i++) { 1861 Future<Pair<byte[], Collection<HStoreFile>>> future = completionService.take(); 1862 Pair<byte[], Collection<HStoreFile>> storeFiles = future.get(); 1863 List<HStoreFile> familyFiles = result.get(storeFiles.getFirst()); 1864 if (familyFiles == null) { 1865 familyFiles = new ArrayList<>(); 1866 result.put(storeFiles.getFirst(), familyFiles); 1867 } 1868 familyFiles.addAll(storeFiles.getSecond()); 1869 } 1870 } catch (InterruptedException e) { 1871 throw throwOnInterrupt(e); 1872 } catch (ExecutionException e) { 1873 Throwable cause = e.getCause(); 1874 if (cause instanceof IOException) { 1875 throw (IOException) cause; 1876 } 1877 throw new IOException(cause); 1878 } finally { 1879 storeCloserThreadPool.shutdownNow(); 1880 } 1881 } 1882 1883 status.setStatus("Writing region close event to WAL"); 1884 // Always write close marker to wal even for read only table. This is not a big problem as we 1885 // do not write any data into the region; it is just a meta edit in the WAL file. 1886 if ( 1887 !abort && wal != null && getRegionServerServices() != null 1888 && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) 1889 ) { 1890 writeRegionCloseMarker(wal); 1891 } 1892 1893 this.closed.set(true); 1894 1895 // Decrease refCount of table latency metric registry. 1896 // Do this after closed#set to make sure only -1. 
1897 if (metricsTableRequests != null) { 1898 metricsTableRequests.removeRegistry(); 1899 } 1900 1901 if (!canFlush) { 1902 decrMemStoreSize(this.memStoreSizing.getMemStoreSize()); 1903 } else if (this.memStoreSizing.getDataSize() != 0) { 1904 LOG.error("Memstore data size is {} in region {}", this.memStoreSizing.getDataSize(), this); 1905 } 1906 if (coprocessorHost != null) { 1907 status.setStatus("Running coprocessor post-close hooks"); 1908 this.coprocessorHost.postClose(abort); 1909 } 1910 if (this.metricsRegion != null) { 1911 this.metricsRegion.close(); 1912 } 1913 if (this.metricsRegionWrapper != null) { 1914 Closeables.close(this.metricsRegionWrapper, true); 1915 } 1916 status.markComplete("Closed"); 1917 LOG.info("Closed {}", this); 1918 return result; 1919 } finally { 1920 lock.writeLock().unlock(); 1921 } 1922 } 1923 1924 /** Wait for all current flushes and compactions of the region to complete */ 1925 // TODO HBASE-18906. Check the usage (if any) in Phoenix and expose this or give alternate way for 1926 // Phoenix needs. 1927 public void waitForFlushesAndCompactions() { 1928 synchronized (writestate) { 1929 if (this.writestate.readOnly) { 1930 // we should not wait for replayed flushed if we are read only (for example in case the 1931 // region is a secondary replica). 1932 return; 1933 } 1934 boolean interrupted = false; 1935 try { 1936 while (writestate.compacting.get() > 0 || writestate.flushing) { 1937 LOG.debug("waiting for " + writestate.compacting + " compactions" 1938 + (writestate.flushing ? 
" & cache flush" : "") + " to complete for region " + this); 1939 try { 1940 writestate.wait(); 1941 } catch (InterruptedException iex) { 1942 // essentially ignore and propagate the interrupt back up 1943 LOG.warn("Interrupted while waiting in region {}", this); 1944 interrupted = true; 1945 break; 1946 } 1947 } 1948 } finally { 1949 if (interrupted) { 1950 Thread.currentThread().interrupt(); 1951 } 1952 } 1953 } 1954 } 1955 1956 /** 1957 * Wait for all current flushes of the region to complete 1958 */ 1959 public void waitForFlushes() { 1960 waitForFlushes(0);// Unbound wait 1961 } 1962 1963 @Override 1964 public boolean waitForFlushes(long timeout) { 1965 synchronized (writestate) { 1966 if (this.writestate.readOnly) { 1967 // we should not wait for replayed flushed if we are read only (for example in case the 1968 // region is a secondary replica). 1969 return true; 1970 } 1971 if (!writestate.flushing) return true; 1972 long start = EnvironmentEdgeManager.currentTime(); 1973 long duration = 0; 1974 boolean interrupted = false; 1975 LOG.debug("waiting for cache flush to complete for region " + this); 1976 try { 1977 while (writestate.flushing) { 1978 if (timeout > 0 && duration >= timeout) break; 1979 try { 1980 long toWait = timeout == 0 ? 
0 : (timeout - duration); 1981 writestate.wait(toWait); 1982 } catch (InterruptedException iex) { 1983 // essentially ignore and propagate the interrupt back up 1984 LOG.warn("Interrupted while waiting in region {}", this); 1985 interrupted = true; 1986 break; 1987 } finally { 1988 duration = EnvironmentEdgeManager.currentTime() - start; 1989 } 1990 } 1991 } finally { 1992 if (interrupted) { 1993 Thread.currentThread().interrupt(); 1994 } 1995 } 1996 LOG.debug("Waited {} ms for region {} flush to complete", duration, this); 1997 return !(writestate.flushing); 1998 } 1999 } 2000 2001 @Override 2002 public Configuration getReadOnlyConfiguration() { 2003 return new ReadOnlyConfiguration(this.conf); 2004 } 2005 2006 @Override 2007 public int getMinBlockSizeBytes() { 2008 return minBlockSizeBytes; 2009 } 2010 2011 private ThreadPoolExecutor getStoreOpenAndCloseThreadPool(final String threadNamePrefix) { 2012 int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); 2013 int maxThreads = Math.min(numStores, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, 2014 HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)); 2015 return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); 2016 } 2017 2018 ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(final String threadNamePrefix) { 2019 int numStores = Math.max(1, this.htableDescriptor.getColumnFamilyCount()); 2020 int maxThreads = Math.max(1, conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, 2021 HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX) / numStores); 2022 return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix); 2023 } 2024 2025 private static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads, 2026 final String threadNamePrefix) { 2027 return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS, 2028 new ThreadFactory() { 2029 private int count = 1; 2030 2031 @Override 2032 public Thread newThread(Runnable r) { 2033 return new Thread(r, 
threadNamePrefix + "-" + count++); 2034 } 2035 }); 2036 } 2037 2038 /** Returns True if its worth doing a flush before we put up the close flag. */ 2039 private boolean worthPreFlushing() { 2040 return this.memStoreSizing.getDataSize() 2041 > this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5); 2042 } 2043 2044 ////////////////////////////////////////////////////////////////////////////// 2045 // HRegion accessors 2046 ////////////////////////////////////////////////////////////////////////////// 2047 2048 @Override 2049 public TableDescriptor getTableDescriptor() { 2050 return this.htableDescriptor; 2051 } 2052 2053 @RestrictedApi(explanation = "Should only be called in tests", link = "", 2054 allowedOnPath = ".*/src/test/.*") 2055 public void setTableDescriptor(TableDescriptor desc) { 2056 htableDescriptor = desc; 2057 } 2058 2059 /** Returns WAL in use for this region */ 2060 public WAL getWAL() { 2061 return this.wal; 2062 } 2063 2064 public BlockCache getBlockCache() { 2065 return this.blockCache; 2066 } 2067 2068 /** 2069 * Only used for unit test which doesn't start region server. 2070 */ 2071 public void setBlockCache(BlockCache blockCache) { 2072 this.blockCache = blockCache; 2073 } 2074 2075 public MobFileCache getMobFileCache() { 2076 return this.mobFileCache; 2077 } 2078 2079 /** 2080 * Only used for unit test which doesn't start region server. 2081 */ 2082 public void setMobFileCache(MobFileCache mobFileCache) { 2083 this.mobFileCache = mobFileCache; 2084 } 2085 2086 /** Returns split policy for this region. */ 2087 RegionSplitPolicy getSplitPolicy() { 2088 return this.splitPolicy; 2089 } 2090 2091 /** 2092 * A split takes the config from the parent region & passes it to the daughter region's 2093 * constructor. If 'conf' was passed, you would end up using the HTD of the parent region in 2094 * addition to the new daughter HTD. Pass 'baseConf' to the daughter regions to avoid this tricky 2095 * dedupe problem. 
2096 * @return Configuration object 2097 */ 2098 Configuration getBaseConf() { 2099 return this.baseConf; 2100 } 2101 2102 /** Returns {@link FileSystem} being used by this region */ 2103 public FileSystem getFilesystem() { 2104 return fs.getFileSystem(); 2105 } 2106 2107 /** Returns the {@link HRegionFileSystem} used by this region */ 2108 public HRegionFileSystem getRegionFileSystem() { 2109 return this.fs; 2110 } 2111 2112 /** Returns the WAL {@link HRegionFileSystem} used by this region */ 2113 HRegionWALFileSystem getRegionWALFileSystem() throws IOException { 2114 return new HRegionWALFileSystem(conf, getWalFileSystem(), 2115 CommonFSUtils.getWALTableDir(conf, htableDescriptor.getTableName()), fs.getRegionInfo()); 2116 } 2117 2118 /** Returns the WAL {@link FileSystem} being used by this region */ 2119 FileSystem getWalFileSystem() throws IOException { 2120 if (walFS == null) { 2121 walFS = CommonFSUtils.getWALFileSystem(conf); 2122 } 2123 return walFS; 2124 } 2125 2126 /** 2127 * @return the Region directory under WALRootDirectory 2128 * @throws IOException if there is an error getting WALRootDir 2129 */ 2130 public Path getWALRegionDir() throws IOException { 2131 if (regionWalDir == null) { 2132 regionWalDir = CommonFSUtils.getWALRegionDir(conf, getRegionInfo().getTable(), 2133 getRegionInfo().getEncodedName()); 2134 } 2135 return regionWalDir; 2136 } 2137 2138 @Override 2139 public long getEarliestFlushTimeForAllStores() { 2140 return Collections.min(lastStoreFlushTimeMap.values()); 2141 } 2142 2143 @Override 2144 public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException { 2145 long result = Long.MAX_VALUE; 2146 for (HStore store : stores.values()) { 2147 Collection<HStoreFile> storeFiles = store.getStorefiles(); 2148 if (storeFiles == null) { 2149 continue; 2150 } 2151 for (HStoreFile file : storeFiles) { 2152 StoreFileReader sfReader = file.getReader(); 2153 if (sfReader == null) { 2154 continue; 2155 } 2156 HFile.Reader reader = 
sfReader.getHFileReader(); 2157 if (reader == null) { 2158 continue; 2159 } 2160 if (majorCompactionOnly) { 2161 byte[] val = reader.getHFileInfo().get(MAJOR_COMPACTION_KEY); 2162 if (val == null || !Bytes.toBoolean(val)) { 2163 continue; 2164 } 2165 } 2166 result = Math.min(result, reader.getFileContext().getFileCreateTime()); 2167 } 2168 } 2169 return result == Long.MAX_VALUE ? 0 : result; 2170 } 2171 2172 RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) { 2173 long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId; 2174 byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes(); 2175 regionLoadBldr.clearStoreCompleteSequenceId(); 2176 for (byte[] familyName : this.stores.keySet()) { 2177 long earliest = this.wal.getEarliestMemStoreSeqNum(encodedRegionName, familyName); 2178 // Subtract - 1 to go earlier than the current oldest, unflushed edit in memstore; this will 2179 // give us a sequence id that is for sure flushed. We want edit replay to start after this 2180 // sequence id in this region. If NO_SEQNUM, use the regions maximum flush id. 2181 long csid = (earliest == HConstants.NO_SEQNUM) ? lastFlushOpSeqIdLocal : earliest - 1; 2182 regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId.newBuilder() 2183 .setFamilyName(UnsafeByteOperations.unsafeWrap(familyName)).setSequenceId(csid).build()); 2184 } 2185 return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId()); 2186 } 2187 2188 ////////////////////////////////////////////////////////////////////////////// 2189 // HRegion maintenance. 2190 // 2191 // These methods are meant to be called periodically by the HRegionServer for 2192 // upkeep. 2193 ////////////////////////////////////////////////////////////////////////////// 2194 /** 2195 * Do preparation for pending compaction. 2196 */ 2197 protected void doRegionCompactionPrep() throws IOException { 2198 } 2199 2200 /** 2201 * Synchronously compact all stores in the region. 
2202 * <p> 2203 * This operation could block for a long time, so don't call it from a time-sensitive thread. 2204 * <p> 2205 * Note that no locks are taken to prevent possible conflicts between compaction and splitting 2206 * activities. The regionserver does not normally compact and split in parallel. However by 2207 * calling this method you may introduce unexpected and unhandled concurrency. Don't do this 2208 * unless you know what you are doing. 2209 * @param majorCompaction True to force a major compaction regardless of thresholds 2210 */ 2211 public void compact(boolean majorCompaction) throws IOException { 2212 if (majorCompaction) { 2213 stores.values().forEach(HStore::triggerMajorCompaction); 2214 } 2215 for (HStore s : stores.values()) { 2216 Optional<CompactionContext> compaction = s.requestCompaction(); 2217 if (compaction.isPresent()) { 2218 ThroughputController controller = null; 2219 if (rsServices != null) { 2220 controller = CompactionThroughputControllerFactory.create(rsServices, conf); 2221 } 2222 if (controller == null) { 2223 controller = NoLimitThroughputController.INSTANCE; 2224 } 2225 compact(compaction.get(), s, controller, null); 2226 } 2227 } 2228 } 2229 2230 /** 2231 * This is a helper function that compact all the stores synchronously. 2232 * <p> 2233 * It is used by utilities and testing 2234 */ 2235 public void compactStores() throws IOException { 2236 for (HStore s : stores.values()) { 2237 Optional<CompactionContext> compaction = s.requestCompaction(); 2238 if (compaction.isPresent()) { 2239 compact(compaction.get(), s, NoLimitThroughputController.INSTANCE, null); 2240 } 2241 } 2242 } 2243 2244 /** 2245 * This is a helper function that compact the given store. 
* <p>
 * It is used by utilities and testing
 * @param family               name of the column family whose store is compacted
 * @param throughputController controller handed to the compaction
 */
void compactStore(byte[] family, ThroughputController throughputController) throws IOException {
  HStore s = getStore(family);
  Optional<CompactionContext> compaction = s.requestCompaction();
  if (compaction.isPresent()) {
    compact(compaction.get(), s, throughputController, null);
  }
}

/**
 * Called by compaction thread and after region is opened to compact the HStores if necessary.
 * <p>
 * This operation could block for a long time, so don't call it from a time-sensitive thread. Note
 * that no locking is necessary at this level because compaction only conflicts with a region
 * split, and that cannot happen because the region server does them sequentially and not in
 * parallel.
 * @param compaction Compaction details, obtained by requestCompaction()
 * @return whether the compaction completed
 */
public boolean compact(CompactionContext compaction, HStore store,
  ThroughputController throughputController) throws IOException {
  return compact(compaction, store, throughputController, null);
}

/**
 * Runs the given compaction on the given store, passing {@code user} through to
 * {@code store.compact}.
 * <p>
 * Returns false without compacting when the region is closing or closed, when {@code store} is
 * no longer the instance registered for its column family, when writes are disabled, or when
 * the compaction is interrupted. While the compaction runs, {@code writestate.compacting} is
 * incremented; waiters on {@code writestate} are notified once it drops back to zero.
 * @param compaction           Compaction details, obtained by requestCompaction()
 * @param store                the store to compact
 * @param throughputController controller handed to {@code store.compact}
 * @param user                 user handed to {@code store.compact}; other overloads pass null
 * @return whether the compaction completed
 */
public boolean compact(CompactionContext compaction, HStore store,
  ThroughputController throughputController, User user) throws IOException {
  assert compaction != null && compaction.hasSelection();
  assert !compaction.getRequest().getFiles().isEmpty();
  if (this.closing.get() || this.closed.get()) {
    LOG.debug("Skipping compaction on " + this + " because closing/closed");
    store.cancelRequestedCompaction(compaction);
    return false;
  }
  MonitoredTask status = null;
  // Stays true until store.compact is about to run; the outer finally cancels the request when
  // we bail out before that point.
  boolean requestNeedsCancellation = true;
  /*
   * We are trying to remove / relax the region read lock for compaction. Let's see what are the
   * potential race conditions among the operations (user scan, region split, region close and
   * region bulk load).
   *
   *   user scan        ---> region read lock
   *   region split     --> region close first --> region write lock
   *   region close     --> region write lock
   *   region bulk load --> region write lock
   *
   * read lock is compatible with read lock. ---> no problem with user scan/read
   * region bulk load does not cause problem for compaction (no consistency problem, store lock
   * will help the store file accounting). They can run almost concurrently at the region level.
   *
   * The only remaining race condition is between the region close and compaction. So we will
   * evaluate, below, how region close intervenes with compaction if compaction does not acquire
   * region read lock.
   *
   * Here are the steps for compaction:
   *   1. obtain list of StoreFile's
   *   2. create StoreFileScanner's based on list from #1
   *   3. perform compaction and save resulting files under tmp dir
   *   4. swap in compacted files
   *
   * #1 is guarded by store lock. This patch does not change this --> no worse or better
   *
   * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
   * compactor and stripe compactor). The read points are for user scans. Region keeps the read
   * points for all currently open user scanners. Compaction needs to know the smallest read
   * point so that during re-write of the hfiles, it can remove the mvcc points for the cells if
   * their mvccs are older than the smallest since they are not needed anymore. This will not
   * conflict with compaction.
   *
   * For #3, it can be performed in parallel to other operations.
   *
   * For #4 bulk load and compaction don't conflict with each other on the region level (for
   * multi-family atomicy).
   *
   * Region close and compaction are guarded pretty well by the 'writestate'. In
   * HRegion#doClose(), we have :
   *
   *   synchronized (writestate) {
   *     // Disable compacting and flushing by background threads for this
   *     // region.
   *     canFlush = !writestate.readOnly;
   *     writestate.writesEnabled = false;
   *     LOG.debug("Closing " + this + ": disabling compactions & flushes");
   *     waitForFlushesAndCompactions();
   *   }
   *
   * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
   * and in HRegion.compact()
   *
   *   try {
   *     synchronized (writestate) {
   *       if (writestate.writesEnabled) {
   *         wasStateSet = true;
   *         ++writestate.compacting;
   *       } else {
   *         String msg = "NOT compacting region " + this + ". Writes disabled.";
   *         LOG.info(msg);
   *         status.abort(msg);
   *         return false;
   *       }
   *     }
   *
   * Also in compactor.performCompaction(): check periodically to see if a system stop is
   * requested
   *
   *   if (closeChecker != null && closeChecker.isTimeLimit(store, now)) {
   *     progress.cancel();
   *     return false;
   *   }
   *   if (closeChecker != null && closeChecker.isSizeLimit(store, len)) {
   *     progress.cancel();
   *     return false;
   *   }
   */
  try {
    byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
    // Identity comparison: the request is only valid for the store instance currently registered.
    if (stores.get(cf) != store) {
      LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
        + " has been re-instantiated, cancel this compaction request. "
        + " It may be caused by the roll back of split transaction");
      return false;
    }

    status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
    if (this.closed.get()) {
      String msg = "Skipping compaction on " + this + " because closed";
      LOG.debug(msg);
      status.abort(msg);
      return false;
    }
    // True only if we incremented writestate.compacting and therefore must decrement it again.
    boolean wasStateSet = false;
    try {
      synchronized (writestate) {
        if (writestate.writesEnabled) {
          wasStateSet = true;
          writestate.compacting.incrementAndGet();
        } else {
          String msg = "NOT compacting region " + this + ". Writes disabled.";
          LOG.info(msg);
          status.abort(msg);
          return false;
        }
      }
      LOG.info("Starting compaction of {} in {}{}", store, this,
        (compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : ""));
      doRegionCompactionPrep();
      try {
        status.setStatus("Compacting store " + store);
        // We no longer need to cancel the request on the way out of this
        // method because Store#compact will clean up unconditionally
        requestNeedsCancellation = false;
        store.compact(compaction, throughputController, user);
      } catch (InterruptedIOException iioe) {
        String msg = "region " + this + " compaction interrupted";
        LOG.info(msg, iioe);
        status.abort(msg);
        return false;
      }
    } finally {
      if (wasStateSet) {
        synchronized (writestate) {
          writestate.compacting.decrementAndGet();
          // Wake up threads waiting on writestate once no compaction is running any more.
          if (writestate.compacting.get() <= 0) {
            writestate.notifyAll();
          }
        }
      }
    }
    status.markComplete("Compaction complete");
    return true;
  } finally {
    if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
    if (status != null) {
      LOG.debug("Compaction status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
        status.prettyPrintJournal());
      status.cleanup();
    }
  }
}

/**
 * Flush the cache.
 * <p>
 * When this method is called the cache will be flushed unless:
 * <ol>
 * <li>the cache is empty</li>
 * <li>the region is closed.</li>
 * <li>a flush is already in progress</li>
 * <li>writes are disabled</li>
 * </ol>
 * <p>
 * This method may block for some time, so it should not be called from a time-sensitive thread.
 * @param flushAllStores whether we want to force a flush of all stores
 * @return FlushResult indicating whether the flush was successful or not and if the region needs
 *         compacting
 * @throws IOException general io exceptions because a snapshot was not properly persisted.
 */
// TODO HBASE-18905.
We might have to expose a requestFlush API for CPs 2401 public FlushResult flush(boolean flushAllStores) throws IOException { 2402 return flushcache(flushAllStores, false, FlushLifeCycleTracker.DUMMY); 2403 } 2404 2405 public interface FlushResult { 2406 enum Result { 2407 FLUSHED_NO_COMPACTION_NEEDED, 2408 FLUSHED_COMPACTION_NEEDED, 2409 // Special case where a flush didn't run because there's nothing in the memstores. Used when 2410 // bulk loading to know when we can still load even if a flush didn't happen. 2411 CANNOT_FLUSH_MEMSTORE_EMPTY, 2412 CANNOT_FLUSH 2413 } 2414 2415 /** Returns the detailed result code */ 2416 Result getResult(); 2417 2418 /** Returns true if the memstores were flushed, else false */ 2419 boolean isFlushSucceeded(); 2420 2421 /** Returns True if the flush requested a compaction, else false */ 2422 boolean isCompactionNeeded(); 2423 } 2424 2425 FlushResultImpl flushcache(boolean flushAllStores, boolean writeFlushRequestWalMarker, 2426 FlushLifeCycleTracker tracker) throws IOException { 2427 List<byte[]> families = null; 2428 if (flushAllStores) { 2429 families = new ArrayList<>(); 2430 families.addAll(this.getTableDescriptor().getColumnFamilyNames()); 2431 } 2432 return this.flushcache(families, writeFlushRequestWalMarker, tracker); 2433 } 2434 2435 /** 2436 * Flush the cache. When this method is called the cache will be flushed unless: 2437 * <ol> 2438 * <li>the cache is empty</li> 2439 * <li>the region is closed.</li> 2440 * <li>a flush is already in progress</li> 2441 * <li>writes are disabled</li> 2442 * </ol> 2443 * <p> 2444 * This method may block for some time, so it should not be called from a time-sensitive thread. 2445 * @param families stores of region to flush. 
* @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
 * @param tracker                    used to track the life cycle of this flush
 * @return whether the flush is success and whether the region needs compacting
 * @throws IOException              general io exceptions
 * @throws DroppedSnapshotException Thrown when replay of wal is required because a Snapshot was
 *                                  not properly persisted. The region is put in closing mode, and
 *                                  the caller MUST abort after this.
 */
public FlushResultImpl flushcache(List<byte[]> families, boolean writeFlushRequestWalMarker,
  FlushLifeCycleTracker tracker) throws IOException {
  // fail-fast instead of waiting on the lock
  if (this.closing.get()) {
    String msg = "Skipping flush on " + this + " because closing";
    LOG.debug(msg);
    return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
  }
  MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
  status.setStatus("Acquiring readlock on region");
  // block waiting for the lock for flushing cache
  lock.readLock().lock();
  // Cleared on the early-exit paths below so the outer finally skips the journal dump when no
  // flush was attempted.
  boolean flushed = true;
  try {
    // Re-check under the read lock: the region may have been closed while we waited for it.
    if (this.closed.get()) {
      String msg = "Skipping flush on " + this + " because closed";
      LOG.debug(msg);
      status.abort(msg);
      flushed = false;
      return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
    }
    if (coprocessorHost != null) {
      status.setStatus("Running coprocessor pre-flush hooks");
      coprocessorHost.preFlush(tracker);
    }
    // TODO: this should be managed within memstore with the snapshot, updated only after flush
    // successful
    if (numMutationsWithoutWAL.sum() > 0) {
      numMutationsWithoutWAL.reset();
      dataInMemoryWithoutWAL.reset();
    }
    // Claim the single "flushing" slot; bail out if another flush holds it or writes are off.
    synchronized (writestate) {
      if (!writestate.flushing && writestate.writesEnabled) {
        this.writestate.flushing = true;
      } else {
        String msg = "NOT flushing " + this + " as "
          + (writestate.flushing ? "already flushing" : "writes are not enabled");
        LOG.debug(msg);
        status.abort(msg);
        flushed = false;
        return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
      }
    }

    try {
      // The reason that we do not always use flushPolicy is, when the flush is
      // caused by logRoller, we should select stores which must be flushed
      // rather than could be flushed.
      Collection<HStore> specificStoresToFlush = null;
      if (families != null) {
        specificStoresToFlush = getSpecificStores(families);
      } else {
        specificStoresToFlush = flushPolicy.selectStoresToFlush();
      }
      FlushResultImpl fs =
        internalFlushcache(specificStoresToFlush, status, writeFlushRequestWalMarker, tracker);

      if (coprocessorHost != null) {
        status.setStatus("Running post-flush coprocessor hooks");
        coprocessorHost.postFlush(tracker);
      }

      if (fs.isFlushSucceeded()) {
        flushesQueued.reset();
      }

      status.markComplete("Flush successful " + fs.toString());
      return fs;
    } finally {
      // Release the "flushing" slot claimed above and wake anyone waiting on writestate,
      // whether the flush succeeded or threw.
      synchronized (writestate) {
        writestate.flushing = false;
        this.writestate.flushRequested = false;
        writestate.notifyAll();
      }
    }
  } finally {
    lock.readLock().unlock();
    if (flushed) {
      // Don't log this journal stuff if no flush -- confusing.
      LOG.debug("Flush status journal for {}:\n{}", this.getRegionInfo().getEncodedName(),
        status.prettyPrintJournal());
    }
    status.cleanup();
  }
}

/**
 * Get the stores that match the specified families.
 * @return the stores that need to be flushed.
2543 */ 2544 private Collection<HStore> getSpecificStores(List<byte[]> families) { 2545 Collection<HStore> specificStoresToFlush = new ArrayList<>(); 2546 for (byte[] family : families) { 2547 specificStoresToFlush.add(stores.get(family)); 2548 } 2549 return specificStoresToFlush; 2550 } 2551 2552 /** 2553 * Should the store be flushed because it is old enough. 2554 * <p> 2555 * Every FlushPolicy should call this to determine whether a store is old enough to flush (except 2556 * that you always flush all stores). Otherwise the method will always returns true which will 2557 * make a lot of flush requests. 2558 */ 2559 boolean shouldFlushStore(HStore store) { 2560 long earliest = this.wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), 2561 store.getColumnFamilyDescriptor().getName()) - 1; 2562 if (earliest > 0 && earliest + flushPerChanges < mvcc.getReadPoint()) { 2563 if (LOG.isDebugEnabled()) { 2564 LOG.debug("Flush column family " + store.getColumnFamilyName() + " of " 2565 + getRegionInfo().getEncodedName() + " because unflushed sequenceid=" + earliest 2566 + " is > " + this.flushPerChanges + " from current=" + mvcc.getReadPoint()); 2567 } 2568 return true; 2569 } 2570 if (this.flushCheckInterval <= 0) { 2571 return false; 2572 } 2573 long now = EnvironmentEdgeManager.currentTime(); 2574 if (store.timeOfOldestEdit() < now - this.flushCheckInterval) { 2575 if (LOG.isDebugEnabled()) { 2576 LOG.debug("Flush column family: " + store.getColumnFamilyName() + " of " 2577 + getRegionInfo().getEncodedName() + " because time of oldest edit=" 2578 + store.timeOfOldestEdit() + " is > " + this.flushCheckInterval + " from now =" + now); 2579 } 2580 return true; 2581 } 2582 return false; 2583 } 2584 2585 /** 2586 * Should the memstore be flushed now 2587 */ 2588 boolean shouldFlush(final StringBuilder whyFlush) { 2589 whyFlush.setLength(0); 2590 // This is a rough measure. 
2591 if ( 2592 this.maxFlushedSeqId > 0 2593 && (this.maxFlushedSeqId + this.flushPerChanges < this.mvcc.getReadPoint()) 2594 ) { 2595 whyFlush.append("more than max edits, " + this.flushPerChanges + ", since last flush"); 2596 return true; 2597 } 2598 long modifiedFlushCheckInterval = flushCheckInterval; 2599 if ( 2600 getRegionInfo().getTable().isSystemTable() 2601 && getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID 2602 ) { 2603 modifiedFlushCheckInterval = SYSTEM_CACHE_FLUSH_INTERVAL; 2604 } 2605 if (modifiedFlushCheckInterval <= 0) { // disabled 2606 return false; 2607 } 2608 long now = EnvironmentEdgeManager.currentTime(); 2609 // if we flushed in the recent past, we don't need to do again now 2610 if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) { 2611 return false; 2612 } 2613 // since we didn't flush in the recent past, flush now if certain conditions 2614 // are met. Return true on first such memstore hit. 2615 for (HStore s : stores.values()) { 2616 if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) { 2617 // we have an old enough edit in the memstore, flush 2618 whyFlush.append(s.toString() + " has an old edit so flush to free WALs"); 2619 return true; 2620 } 2621 } 2622 return false; 2623 } 2624 2625 /** 2626 * Flushing all stores. 2627 * @see #internalFlushcache(Collection, MonitoredTask, boolean, FlushLifeCycleTracker) 2628 */ 2629 private FlushResult internalFlushcache(MonitoredTask status) throws IOException { 2630 return internalFlushcache(stores.values(), status, false, FlushLifeCycleTracker.DUMMY); 2631 } 2632 2633 /** 2634 * Flushing given stores. 
2635 * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean, FlushLifeCycleTracker) 2636 */ 2637 private FlushResultImpl internalFlushcache(Collection<HStore> storesToFlush, MonitoredTask status, 2638 boolean writeFlushWalMarker, FlushLifeCycleTracker tracker) throws IOException { 2639 return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush, status, 2640 writeFlushWalMarker, tracker); 2641 } 2642 2643 /** 2644 * Flush the memstore. Flushing the memstore is a little tricky. We have a lot of updates in the 2645 * memstore, all of which have also been written to the wal. We need to write those updates in the 2646 * memstore out to disk, while being able to process reads/writes as much as possible during the 2647 * flush operation. 2648 * <p> 2649 * This method may block for some time. Every time you call it, we up the regions sequence id even 2650 * if we don't flush; i.e. the returned region id will be at least one larger than the last edit 2651 * applied to this region. The returned id does not refer to an actual edit. The returned id can 2652 * be used for say installing a bulk loaded file just ahead of the last hfile that was the result 2653 * of this flush, etc. 2654 * @param wal Null if we're NOT to go via wal. 2655 * @param myseqid The seqid to use if <code>wal</code> is null writing out flush file. 2656 * @param storesToFlush The list of stores to flush. 2657 * @return object describing the flush's state 2658 * @throws IOException general io exceptions 2659 * @throws DroppedSnapshotException Thrown when replay of WAL is required. 
2660 */ 2661 protected FlushResultImpl internalFlushcache(WAL wal, long myseqid, 2662 Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, 2663 FlushLifeCycleTracker tracker) throws IOException { 2664 PrepareFlushResult result = 2665 internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker, tracker); 2666 if (result.result == null) { 2667 return internalFlushCacheAndCommit(wal, status, result, storesToFlush); 2668 } else { 2669 return result.result; // early exit due to failure from prepare stage 2670 } 2671 } 2672 2673 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DLS_DEAD_LOCAL_STORE", 2674 justification = "FindBugs seems confused about trxId") 2675 protected PrepareFlushResult internalPrepareFlushCache(WAL wal, long myseqid, 2676 Collection<HStore> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker, 2677 FlushLifeCycleTracker tracker) throws IOException { 2678 if (this.rsServices != null && this.rsServices.isAborted()) { 2679 // Don't flush when server aborting, it's unsafe 2680 throw new IOException("Aborting flush because server is aborted..."); 2681 } 2682 final long startTime = EnvironmentEdgeManager.currentTime(); 2683 // If nothing to flush, return, but return with a valid unused sequenceId. 2684 // Its needed by bulk upload IIRC. It flushes until no edits in memory so it can insert a 2685 // bulk loaded file between memory and existing hfiles. It wants a good seqeunceId that belongs 2686 // to no other that it can use to associate with the bulk load. Hence this little dance below 2687 // to go get one. 2688 if (this.memStoreSizing.getDataSize() <= 0) { 2689 // Take an update lock so no edits can come into memory just yet. 
2690 this.updatesLock.writeLock().lock(); 2691 WriteEntry writeEntry = null; 2692 try { 2693 if (this.memStoreSizing.getDataSize() <= 0) { 2694 // Presume that if there are still no edits in the memstore, then there are no edits for 2695 // this region out in the WAL subsystem so no need to do any trickery clearing out 2696 // edits in the WAL sub-system. Up the sequence number so the resulting flush id is for 2697 // sure just beyond the last appended region edit and not associated with any edit 2698 // (useful as marker when bulk loading, etc.). 2699 if (wal != null) { 2700 writeEntry = mvcc.begin(); 2701 long flushOpSeqId = writeEntry.getWriteNumber(); 2702 FlushResultImpl flushResult = 2703 new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, 2704 "Nothing to flush", writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker)); 2705 mvcc.completeAndWait(writeEntry); 2706 // Set to null so we don't complete it again down in finally block. 2707 writeEntry = null; 2708 return new PrepareFlushResult(flushResult, myseqid); 2709 } else { 2710 return new PrepareFlushResult(new FlushResultImpl( 2711 FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush", false), myseqid); 2712 } 2713 } 2714 } finally { 2715 if (writeEntry != null) { 2716 // If writeEntry is non-null, this operation failed; the mvcc transaction failed... 2717 // but complete it anyways so it doesn't block the mvcc queue. 2718 mvcc.complete(writeEntry); 2719 } 2720 this.updatesLock.writeLock().unlock(); 2721 } 2722 } 2723 logFatLineOnFlush(storesToFlush, myseqid); 2724 // Stop updates while we snapshot the memstore of all of these regions' stores. We only have 2725 // to do this for a moment. It is quick. 
We also set the memstore size to zero here before we 2726 // allow updates again so its value will represent the size of the updates received 2727 // during flush 2728 2729 // We have to take an update lock during snapshot, or else a write could end up in both snapshot 2730 // and memstore (makes it difficult to do atomic rows then) 2731 status.setStatus("Obtaining lock to block concurrent updates"); 2732 // block waiting for the lock for internal flush 2733 this.updatesLock.writeLock().lock(); 2734 status.setStatus("Preparing flush snapshotting stores in " + getRegionInfo().getEncodedName()); 2735 MemStoreSizing totalSizeOfFlushableStores = new NonThreadSafeMemStoreSizing(); 2736 2737 Map<byte[], Long> flushedFamilyNamesToSeq = new HashMap<>(); 2738 for (HStore store : storesToFlush) { 2739 flushedFamilyNamesToSeq.put(store.getColumnFamilyDescriptor().getName(), 2740 store.preFlushSeqIDEstimation()); 2741 } 2742 2743 TreeMap<byte[], StoreFlushContext> storeFlushCtxs = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2744 TreeMap<byte[], List<Path>> committedFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2745 TreeMap<byte[], MemStoreSize> storeFlushableSize = new TreeMap<>(Bytes.BYTES_COMPARATOR); 2746 // The sequence id of this flush operation which is used to log FlushMarker and pass to 2747 // createFlushContext to use as the store file's sequence id. It can be in advance of edits 2748 // still in the memstore, edits that are in other column families yet to be flushed. 2749 long flushOpSeqId = HConstants.NO_SEQNUM; 2750 // The max flushed sequence id after this flush operation completes. All edits in memstore 2751 // will be in advance of this sequence id. 
2752 long flushedSeqId = HConstants.NO_SEQNUM; 2753 byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes(); 2754 try { 2755 if (wal != null) { 2756 Long earliestUnflushedSequenceIdForTheRegion = 2757 wal.startCacheFlush(encodedRegionName, flushedFamilyNamesToSeq); 2758 if (earliestUnflushedSequenceIdForTheRegion == null) { 2759 // This should never happen. This is how startCacheFlush signals flush cannot proceed. 2760 String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing."; 2761 status.setStatus(msg); 2762 return new PrepareFlushResult( 2763 new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false), myseqid); 2764 } 2765 flushOpSeqId = getNextSequenceId(wal); 2766 // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit 2767 flushedSeqId = earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM 2768 ? flushOpSeqId 2769 : earliestUnflushedSequenceIdForTheRegion.longValue() - 1; 2770 } else { 2771 // use the provided sequence Id as WAL is not being used for this flush. 2772 flushedSeqId = flushOpSeqId = myseqid; 2773 } 2774 2775 for (HStore s : storesToFlush) { 2776 storeFlushCtxs.put(s.getColumnFamilyDescriptor().getName(), 2777 s.createFlushContext(flushOpSeqId, tracker)); 2778 // for writing stores to WAL 2779 committedFiles.put(s.getColumnFamilyDescriptor().getName(), null); 2780 } 2781 2782 // write the snapshot start to WAL 2783 if (wal != null && !writestate.readOnly) { 2784 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH, 2785 getRegionInfo(), flushOpSeqId, committedFiles); 2786 // No sync. 
Sync is below where no updates lock and we do FlushAction.COMMIT_FLUSH 2787 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, 2788 mvcc); 2789 } 2790 2791 // Prepare flush (take a snapshot) 2792 storeFlushCtxs.forEach((name, flush) -> { 2793 MemStoreSize snapshotSize = flush.prepare(); 2794 totalSizeOfFlushableStores.incMemStoreSize(snapshotSize); 2795 storeFlushableSize.put(name, snapshotSize); 2796 }); 2797 } catch (IOException ex) { 2798 doAbortFlushToWAL(wal, flushOpSeqId, committedFiles); 2799 throw ex; 2800 } finally { 2801 this.updatesLock.writeLock().unlock(); 2802 } 2803 String s = "Finished memstore snapshotting " + this + ", syncing WAL and waiting on mvcc, " 2804 + "flushsize=" + totalSizeOfFlushableStores; 2805 status.setStatus(s); 2806 doSyncOfUnflushedWALChanges(wal, getRegionInfo()); 2807 return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime, 2808 flushOpSeqId, flushedSeqId, totalSizeOfFlushableStores); 2809 } 2810 2811 /** 2812 * Utility method broken out of internalPrepareFlushCache so that method is smaller. 2813 */ 2814 private void logFatLineOnFlush(Collection<HStore> storesToFlush, long sequenceId) { 2815 if (!LOG.isInfoEnabled()) { 2816 return; 2817 } 2818 // Log a fat line detailing what is being flushed. 
2819 StringBuilder perCfExtras = null; 2820 if (!isAllFamilies(storesToFlush)) { 2821 perCfExtras = new StringBuilder(); 2822 for (HStore store : storesToFlush) { 2823 MemStoreSize mss = store.getFlushableSize(); 2824 perCfExtras.append("; ").append(store.getColumnFamilyName()); 2825 perCfExtras.append("={dataSize=").append(StringUtils.byteDesc(mss.getDataSize())); 2826 perCfExtras.append(", heapSize=").append(StringUtils.byteDesc(mss.getHeapSize())); 2827 perCfExtras.append(", offHeapSize=").append(StringUtils.byteDesc(mss.getOffHeapSize())); 2828 perCfExtras.append("}"); 2829 } 2830 } 2831 MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); 2832 LOG.info("Flushing " + this.getRegionInfo().getEncodedName() + " " + storesToFlush.size() + "/" 2833 + stores.size() + " column families," + " dataSize=" + StringUtils.byteDesc(mss.getDataSize()) 2834 + " heapSize=" + StringUtils.byteDesc(mss.getHeapSize()) 2835 + ((perCfExtras != null && perCfExtras.length() > 0) ? perCfExtras.toString() : "") 2836 + ((wal != null) ? "" : "; WAL is null, using passed sequenceid=" + sequenceId)); 2837 } 2838 2839 private void doAbortFlushToWAL(final WAL wal, final long flushOpSeqId, 2840 final Map<byte[], List<Path>> committedFiles) { 2841 if (wal == null) return; 2842 try { 2843 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, 2844 getRegionInfo(), flushOpSeqId, committedFiles); 2845 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, false, mvcc); 2846 } catch (Throwable t) { 2847 LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL: {} in " 2848 + " region {}", StringUtils.stringifyException(t), this); 2849 // ignore this since we will be aborting the RS with DSE. 2850 } 2851 // we have called wal.startCacheFlush(), now we have to abort it 2852 wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); 2853 } 2854 2855 /** 2856 * Sync unflushed WAL changes. 
See HBASE-8208 for details 2857 */ 2858 private static void doSyncOfUnflushedWALChanges(final WAL wal, final RegionInfo hri) 2859 throws IOException { 2860 if (wal == null) { 2861 return; 2862 } 2863 try { 2864 wal.sync(); // ensure that flush marker is sync'ed 2865 } catch (IOException ioe) { 2866 wal.abortCacheFlush(hri.getEncodedNameAsBytes()); 2867 throw ioe; 2868 } 2869 } 2870 2871 /** Returns True if passed Set is all families in the region. */ 2872 private boolean isAllFamilies(Collection<HStore> families) { 2873 return families == null || this.stores.size() == families.size(); 2874 } 2875 2876 /** 2877 * Writes a marker to WAL indicating a flush is requested but cannot be complete due to various 2878 * reasons. Ignores exceptions from WAL. Returns whether the write succeeded. 2879 * @return whether WAL write was successful 2880 */ 2881 private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) { 2882 if (writeFlushWalMarker && wal != null && !writestate.readOnly) { 2883 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH, 2884 getRegionInfo(), -1, new TreeMap<>(Bytes.BYTES_COMPARATOR)); 2885 try { 2886 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, 2887 mvcc); 2888 return true; 2889 } catch (IOException e) { 2890 LOG.warn(getRegionInfo().getEncodedName() + " : " 2891 + "Received exception while trying to write the flush request to wal", e); 2892 } 2893 } 2894 return false; 2895 } 2896 2897 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 2898 justification = "Intentional; notify is about completed flush") 2899 FlushResultImpl internalFlushCacheAndCommit(WAL wal, MonitoredTask status, 2900 PrepareFlushResult prepareResult, Collection<HStore> storesToFlush) throws IOException { 2901 // prepare flush context is carried via PrepareFlushResult 2902 TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs; 2903 
TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles; 2904 long startTime = prepareResult.startTime; 2905 long flushOpSeqId = prepareResult.flushOpSeqId; 2906 long flushedSeqId = prepareResult.flushedSeqId; 2907 2908 String s = "Flushing stores of " + this; 2909 status.setStatus(s); 2910 if (LOG.isTraceEnabled()) LOG.trace(s); 2911 2912 // Any failure from here on out will be catastrophic requiring server 2913 // restart so wal content can be replayed and put back into the memstore. 2914 // Otherwise, the snapshot content while backed up in the wal, it will not 2915 // be part of the current running servers state. 2916 boolean compactionRequested = false; 2917 long flushedOutputFileSize = 0; 2918 try { 2919 // A. Flush memstore to all the HStores. 2920 // Keep running vector of all store files that includes both old and the 2921 // just-made new flush store file. The new flushed file is still in the 2922 // tmp directory. 2923 2924 for (StoreFlushContext flush : storeFlushCtxs.values()) { 2925 flush.flushCache(status); 2926 } 2927 2928 // Switch snapshot (in memstore) -> new hfile (thus causing 2929 // all the store scanners to reset/reseek). 
2930 for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) { 2931 StoreFlushContext sfc = flushEntry.getValue(); 2932 boolean needsCompaction = sfc.commit(status); 2933 if (needsCompaction) { 2934 compactionRequested = true; 2935 } 2936 byte[] storeName = flushEntry.getKey(); 2937 List<Path> storeCommittedFiles = sfc.getCommittedFiles(); 2938 committedFiles.put(storeName, storeCommittedFiles); 2939 // Flush committed no files, indicating flush is empty or flush was canceled 2940 if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) { 2941 MemStoreSize storeFlushableSize = prepareResult.storeFlushableSize.get(storeName); 2942 prepareResult.totalFlushableSize.decMemStoreSize(storeFlushableSize); 2943 } 2944 flushedOutputFileSize += sfc.getOutputFileSize(); 2945 } 2946 storeFlushCtxs.clear(); 2947 2948 // Set down the memstore size by amount of flush. 2949 MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); 2950 this.decrMemStoreSize(mss); 2951 2952 // Increase the size of this Region for the purposes of quota. Noop if quotas are disabled. 2953 // During startup, quota manager may not be initialized yet. 2954 if (rsServices != null) { 2955 RegionServerSpaceQuotaManager quotaManager = rsServices.getRegionServerSpaceQuotaManager(); 2956 if (quotaManager != null) { 2957 quotaManager.getRegionSizeStore().incrementRegionSize(this.getRegionInfo(), 2958 flushedOutputFileSize); 2959 } 2960 } 2961 2962 if (wal != null) { 2963 // write flush marker to WAL. If fail, we should throw DroppedSnapshotException 2964 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH, 2965 getRegionInfo(), flushOpSeqId, committedFiles); 2966 WALUtil.writeFlushMarker(wal, this.getReplicationScope(), getRegionInfo(), desc, true, 2967 mvcc); 2968 } 2969 } catch (Throwable t) { 2970 // An exception here means that the snapshot was not persisted. 
2971 // The wal needs to be replayed so its content is restored to memstore. 2972 // Currently, only a server restart will do this. 2973 // We used to only catch IOEs but its possible that we'd get other 2974 // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch 2975 // all and sundry. 2976 if (wal != null) { 2977 try { 2978 FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH, 2979 getRegionInfo(), flushOpSeqId, committedFiles); 2980 WALUtil.writeFlushMarker(wal, this.replicationScope, getRegionInfo(), desc, false, mvcc); 2981 } catch (Throwable ex) { 2982 LOG.warn( 2983 getRegionInfo().getEncodedName() + " : " + "failed writing ABORT_FLUSH marker to WAL", 2984 ex); 2985 // ignore this since we will be aborting the RS with DSE. 2986 } 2987 wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes()); 2988 } 2989 DroppedSnapshotException dse = new DroppedSnapshotException( 2990 "region: " + Bytes.toStringBinary(getRegionInfo().getRegionName()), t); 2991 status.abort("Flush failed: " + StringUtils.stringifyException(t)); 2992 2993 // Callers for flushcache() should catch DroppedSnapshotException and abort the region server. 2994 // However, since we may have the region read lock, we cannot call close(true) here since 2995 // we cannot promote to a write lock. Instead we are setting closing so that all other region 2996 // operations except for close will be rejected. 2997 this.closing.set(true); 2998 2999 if (rsServices != null) { 3000 // This is a safeguard against the case where the caller fails to explicitly handle aborting 3001 rsServices.abort("Replay of WAL required. Forcing server shutdown", dse); 3002 } 3003 3004 throw dse; 3005 } 3006 3007 // If we get to here, the HStores have been written. 
3008 if (wal != null) { 3009 wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes(), flushedSeqId); 3010 } 3011 3012 // Record latest flush time 3013 for (HStore store : storesToFlush) { 3014 this.lastStoreFlushTimeMap.put(store, startTime); 3015 } 3016 3017 this.maxFlushedSeqId = flushedSeqId; 3018 this.lastFlushOpSeqId = flushOpSeqId; 3019 3020 // C. Finally notify anyone waiting on memstore to clear: 3021 // e.g. checkResources(). 3022 synchronized (this) { 3023 notifyAll(); // FindBugs NN_NAKED_NOTIFY 3024 } 3025 3026 long time = EnvironmentEdgeManager.currentTime() - startTime; 3027 MemStoreSize mss = prepareResult.totalFlushableSize.getMemStoreSize(); 3028 long memstoresize = this.memStoreSizing.getMemStoreSize().getDataSize(); 3029 String msg = "Finished flush of" + " dataSize ~" + StringUtils.byteDesc(mss.getDataSize()) + "/" 3030 + mss.getDataSize() + ", heapSize ~" + StringUtils.byteDesc(mss.getHeapSize()) + "/" 3031 + mss.getHeapSize() + ", currentSize=" + StringUtils.byteDesc(memstoresize) + "/" 3032 + memstoresize + " for " + this.getRegionInfo().getEncodedName() + " in " + time 3033 + "ms, sequenceid=" + flushOpSeqId + ", compaction requested=" + compactionRequested 3034 + ((wal == null) ? "; wal=null" : ""); 3035 LOG.info(msg); 3036 status.setStatus(msg); 3037 3038 if (rsServices != null && rsServices.getMetrics() != null) { 3039 rsServices.getMetrics().updateFlush(getTableDescriptor().getTableName().getNameAsString(), 3040 time, mss.getDataSize(), flushedOutputFileSize); 3041 } 3042 3043 return new FlushResultImpl(compactionRequested 3044 ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED 3045 : FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId); 3046 } 3047 3048 /** 3049 * Method to safely get the next sequence number. 3050 * @return Next sequence number unassociated with any actual edit. 
3051 */ 3052 protected long getNextSequenceId(final WAL wal) throws IOException { 3053 WriteEntry we = mvcc.begin(); 3054 mvcc.completeAndWait(we); 3055 return we.getWriteNumber(); 3056 } 3057 3058 ////////////////////////////////////////////////////////////////////////////// 3059 // get() methods for client use. 3060 ////////////////////////////////////////////////////////////////////////////// 3061 3062 @Override 3063 public RegionScannerImpl getScanner(Scan scan) throws IOException { 3064 return getScanner(scan, null); 3065 } 3066 3067 @Override 3068 public RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners) 3069 throws IOException { 3070 return getScanner(scan, additionalScanners, HConstants.NO_NONCE, HConstants.NO_NONCE); 3071 } 3072 3073 private RegionScannerImpl getScanner(Scan scan, List<KeyValueScanner> additionalScanners, 3074 long nonceGroup, long nonce) throws IOException { 3075 return TraceUtil.trace(() -> { 3076 startRegionOperation(Operation.SCAN); 3077 try { 3078 // Verify families are all valid 3079 if (!scan.hasFamilies()) { 3080 // Adding all families to scanner 3081 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 3082 scan.addFamily(family); 3083 } 3084 } else { 3085 for (byte[] family : scan.getFamilyMap().keySet()) { 3086 checkFamily(family); 3087 } 3088 } 3089 return instantiateRegionScanner(scan, additionalScanners, nonceGroup, nonce); 3090 } finally { 3091 closeRegionOperation(Operation.SCAN); 3092 } 3093 }, () -> createRegionSpan("Region.getScanner")); 3094 } 3095 3096 protected RegionScannerImpl instantiateRegionScanner(Scan scan, 3097 List<KeyValueScanner> additionalScanners, long nonceGroup, long nonce) throws IOException { 3098 if (scan.isReversed()) { 3099 if (scan.getFilter() != null) { 3100 scan.getFilter().setReversed(true); 3101 } 3102 return new ReversedRegionScannerImpl(scan, additionalScanners, this, nonceGroup, nonce); 3103 } 3104 return new RegionScannerImpl(scan, 
additionalScanners, this, nonceGroup, nonce); 3105 } 3106 3107 /** 3108 * Prepare a delete for a row mutation processor 3109 * @param delete The passed delete is modified by this method. WARNING! 3110 */ 3111 private void prepareDelete(Delete delete) throws IOException { 3112 // Check to see if this is a deleteRow insert 3113 if (delete.getFamilyCellMap().isEmpty()) { 3114 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 3115 // Don't eat the timestamp 3116 delete.addFamily(family, delete.getTimestamp()); 3117 } 3118 } else { 3119 for (byte[] family : delete.getFamilyCellMap().keySet()) { 3120 if (family == null) { 3121 throw new NoSuchColumnFamilyException("Empty family is invalid"); 3122 } 3123 checkFamily(family); 3124 } 3125 } 3126 } 3127 3128 @Override 3129 public void delete(Delete delete) throws IOException { 3130 TraceUtil.trace(() -> { 3131 checkReadOnly(); 3132 checkResources(); 3133 startRegionOperation(Operation.DELETE); 3134 try { 3135 // All edits for the given row (across all column families) must happen atomically. 3136 return mutate(delete); 3137 } finally { 3138 closeRegionOperation(Operation.DELETE); 3139 } 3140 }, () -> createRegionSpan("Region.delete")); 3141 } 3142 3143 /** 3144 * Set up correct timestamps in the KVs in Delete object. 3145 * <p/> 3146 * Caller should have the row and region locks. 3147 */ 3148 private void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap, 3149 byte[] byteNow) throws IOException { 3150 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 3151 3152 byte[] family = e.getKey(); 3153 List<Cell> cells = e.getValue(); 3154 assert cells instanceof RandomAccess; 3155 3156 Map<byte[], Integer> kvCount = new TreeMap<>(Bytes.BYTES_COMPARATOR); 3157 int listSize = cells.size(); 3158 for (int i = 0; i < listSize; i++) { 3159 Cell cell = cells.get(i); 3160 // Check if time is LATEST, change to time of most recent addition if so 3161 // This is expensive. 
3162 if ( 3163 cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && PrivateCellUtil.isDeleteType(cell) 3164 ) { 3165 byte[] qual = CellUtil.cloneQualifier(cell); 3166 3167 Integer count = kvCount.get(qual); 3168 if (count == null) { 3169 kvCount.put(qual, 1); 3170 } else { 3171 kvCount.put(qual, count + 1); 3172 } 3173 count = kvCount.get(qual); 3174 3175 Get get = new Get(CellUtil.cloneRow(cell)); 3176 get.setMaxVersions(count); 3177 get.addColumn(family, qual); 3178 if (coprocessorHost != null) { 3179 if ( 3180 !coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell, byteNow, get) 3181 ) { 3182 updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); 3183 } 3184 } else { 3185 updateDeleteLatestVersionTimestamp(cell, get, count, byteNow); 3186 } 3187 } else { 3188 PrivateCellUtil.updateLatestStamp(cell, byteNow); 3189 } 3190 } 3191 } 3192 } 3193 3194 private void updateDeleteLatestVersionTimestamp(Cell cell, Get get, int count, byte[] byteNow) 3195 throws IOException { 3196 try (RegionScanner scanner = getScanner(new Scan(get))) { 3197 // NOTE: Please don't use HRegion.get() instead, 3198 // because it will copy cells to heap. See HBASE-26036 3199 List<Cell> result = new ArrayList<>(); 3200 scanner.next(result); 3201 3202 if (result.size() < count) { 3203 // Nothing to delete 3204 PrivateCellUtil.updateLatestStamp(cell, byteNow); 3205 return; 3206 } 3207 if (result.size() > count) { 3208 throw new RuntimeException("Unexpected size: " + result.size()); 3209 } 3210 Cell getCell = result.get(count - 1); 3211 PrivateCellUtil.setTimestamp(cell, getCell.getTimestamp()); 3212 } 3213 } 3214 3215 @Override 3216 public void put(Put put) throws IOException { 3217 TraceUtil.trace(() -> { 3218 checkReadOnly(); 3219 3220 // Do a rough check that we have resources to accept a write. The check is 3221 // 'rough' in that between the resource check and the call to obtain a 3222 // read lock, resources may run out. 
For now, the thought is that this 3223 // will be extremely rare; we'll deal with it when it happens. 3224 checkResources(); 3225 startRegionOperation(Operation.PUT); 3226 try { 3227 // All edits for the given row (across all column families) must happen atomically. 3228 return mutate(put); 3229 } finally { 3230 closeRegionOperation(Operation.PUT); 3231 } 3232 }, () -> createRegionSpan("Region.put")); 3233 } 3234 3235 /** 3236 * Class that tracks the progress of a batch operations, accumulating status codes and tracking 3237 * the index at which processing is proceeding. These batch operations may get split into 3238 * mini-batches for processing. 3239 */ 3240 private abstract static class BatchOperation<T> { 3241 protected final T[] operations; 3242 protected final OperationStatus[] retCodeDetails; 3243 protected final WALEdit[] walEditsFromCoprocessors; 3244 // reference family cell maps directly so coprocessors can mutate them if desired 3245 protected final Map<byte[], List<Cell>>[] familyCellMaps; 3246 // For Increment/Append operations 3247 protected final Result[] results; 3248 3249 protected final HRegion region; 3250 protected int nextIndexToProcess = 0; 3251 protected final ObservedExceptionsInBatch observedExceptions; 3252 // Durability of the batch (highest durability of all operations) 3253 protected Durability durability; 3254 protected boolean atomic = false; 3255 3256 public BatchOperation(final HRegion region, T[] operations) { 3257 this.operations = operations; 3258 this.retCodeDetails = new OperationStatus[operations.length]; 3259 Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN); 3260 this.walEditsFromCoprocessors = new WALEdit[operations.length]; 3261 familyCellMaps = new Map[operations.length]; 3262 this.results = new Result[operations.length]; 3263 3264 this.region = region; 3265 observedExceptions = new ObservedExceptionsInBatch(); 3266 durability = Durability.USE_DEFAULT; 3267 } 3268 3269 /** 3270 * Visitor interface for batch 
operations 3271 */ 3272 @FunctionalInterface 3273 interface Visitor { 3274 /** 3275 * @param index operation index 3276 * @return If true continue visiting remaining entries, break otherwise 3277 */ 3278 boolean visit(int index) throws IOException; 3279 } 3280 3281 /** 3282 * Helper method for visiting pending/ all batch operations 3283 */ 3284 public void visitBatchOperations(boolean pendingOnly, int lastIndexExclusive, Visitor visitor) 3285 throws IOException { 3286 assert lastIndexExclusive <= this.size(); 3287 for (int i = nextIndexToProcess; i < lastIndexExclusive; i++) { 3288 if (!pendingOnly || isOperationPending(i)) { 3289 if (!visitor.visit(i)) { 3290 break; 3291 } 3292 } 3293 } 3294 } 3295 3296 public abstract Mutation getMutation(int index); 3297 3298 public abstract long getNonceGroup(int index); 3299 3300 public abstract long getNonce(int index); 3301 3302 /** 3303 * This method is potentially expensive and useful mostly for non-replay CP path. 3304 */ 3305 public abstract Mutation[] getMutationsForCoprocs(); 3306 3307 public abstract boolean isInReplay(); 3308 3309 public abstract long getOrigLogSeqNum(); 3310 3311 public abstract void startRegionOperation() throws IOException; 3312 3313 public abstract void closeRegionOperation() throws IOException; 3314 3315 /** 3316 * Validates each mutation and prepares a batch for write. If necessary (non-replay case), runs 3317 * CP prePut()/preDelete()/preIncrement()/preAppend() hooks for all mutations in a batch. This 3318 * is intended to operate on entire batch and will be called from outside of class to check and 3319 * prepare batch. This can be implemented by calling helper method 3320 * {@link #checkAndPrepareMutation(int, long)} in a 'for' loop over mutations. 3321 */ 3322 public abstract void checkAndPrepare() throws IOException; 3323 3324 /** 3325 * Implement any Put request specific check and prepare logic here. Please refer to 3326 * {@link #checkAndPrepareMutation(Mutation, long)} for how its used. 
*/
    protected abstract void checkAndPreparePut(final Put p) throws IOException;

    /**
     * If necessary, calls preBatchMutate() CP hook for a mini-batch and updates metrics, cell
     * count, tags and timestamp for all cells of all operations in a mini-batch.
     */
    public abstract void prepareMiniBatchOperations(
      MiniBatchOperationInProgress<Mutation> miniBatchOp, long timestamp,
      final List<RowLock> acquiredRowLocks) throws IOException;

    /**
     * Write mini-batch operations to MemStore
     */
    public abstract WriteEntry writeMiniBatchOperationsToMemStore(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
      throws IOException;

    /**
     * Applies the family cell maps of all pending operations of the mini-batch to the memstore and
     * then adds the accumulated sizes to the region's memstore size in one call.
     * @param miniBatchOp mini-batch whose last index bounds the operations visited
     * @param writeNumber sequence id stamped onto the cells when the batch is in replay or the
     *                    mutation asks for SKIP_WAL
     */
    protected void writeMiniBatchOperationsToMemStore(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, final long writeNumber)
      throws IOException {
      // Sizes are collected locally across all visited operations and published once at the end.
      MemStoreSizing memStoreAccounting = new NonThreadSafeMemStoreSizing();
      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
        // We need to update the sequence id for following reasons.
        // 1) If the op is in replay mode, FSWALEntry#stampRegionSequenceId won't stamp sequence id.
        // 2) If no WAL, FSWALEntry won't be used
        // we use durability of the original mutation for the mutation passed by CP.
        if (isInReplay() || getMutation(index).getDurability() == Durability.SKIP_WAL) {
          region.updateSequenceId(familyCellMaps[index].values(), writeNumber);
        }
        applyFamilyMapToMemStore(familyCellMaps[index], memStoreAccounting);
        return true;
      });
      // update memStore size
      region.incMemStoreSize(memStoreAccounting.getDataSize(), memStoreAccounting.getHeapSize(),
        memStoreAccounting.getOffHeapSize(), memStoreAccounting.getCellsCount());
    }

    /** Returns true once {@code nextIndexToProcess} has reached the number of operations. */
    public boolean isDone() {
      return nextIndexToProcess == operations.length;
    }

    /** Returns the number of operations in the batch. */
    public int size() {
      return operations.length;
    }

    /** Returns true while the status of the operation at {@code index} is still NOT_RUN. */
    public boolean isOperationPending(int index) {
      return retCodeDetails[index].getOperationStatusCode() == OperationStatusCode.NOT_RUN;
    }

    /** Returns the cluster ids of the first mutation; the batch must not be empty. */
    public List<UUID> getClusterIds() {
      assert size() != 0;
      return getMutation(0).getClusterIds();
    }

    boolean isAtomic() {
      return atomic;
    }

    /**
     * Helper method that checks and prepares only one mutation. This can be used to implement
     * {@link #checkAndPrepare()} for entire Batch. NOTE: As CP
     * prePut()/preDelete()/preIncrement()/preAppend() hooks may modify mutations, this method
     * should be called after prePut()/preDelete()/preIncrement()/preAppend() CP hooks are run for
     * the mutation
     * @param mutation  mutation to validate
     * @param timestamp time handed to the timestamp check for Puts
     */
    protected void checkAndPrepareMutation(Mutation mutation, final long timestamp)
      throws IOException {
      // The row check applies to every mutation type; the rest depends on the type.
      region.checkRow(mutation.getRow(), "batchMutate");
      if (mutation instanceof Put) {
        // Check the families in the put. If bad, skip this one.
        checkAndPreparePut((Put) mutation);
        region.checkTimestamps(mutation.getFamilyCellMap(), timestamp);
      } else if (mutation instanceof Delete) {
        region.prepareDelete((Delete) mutation);
      } else if (mutation instanceof Increment || mutation instanceof Append) {
        region.checkFamilies(mutation.getFamilyCellMap().keySet());
      }
    }

    /**
     * Checks and prepares the mutation at {@code index}, records its family cell map (Put/Delete
     * only) and raises the batch durability when this mutation's effective durability ranks
     * higher. A bad family, a failed sanity check or a row of the wrong region is turned into a
     * per-operation status; for an atomic batch the exception is additionally rethrown.
     */
    protected void checkAndPrepareMutation(int index, long timestamp) throws IOException {
      Mutation mutation = getMutation(index);
      try {
        this.checkAndPrepareMutation(mutation, timestamp);

        if (mutation instanceof Put || mutation instanceof Delete) {
          // store the family map reference to allow for mutations
          familyCellMaps[index] = mutation.getFamilyCellMap();
        }

        // store durability for the batch (highest durability of all operations in the batch)
        Durability tmpDur = region.getEffectiveDurability(mutation.getDurability());
        if (tmpDur.ordinal() > durability.ordinal()) {
          durability = tmpDur;
        }
      } catch (NoSuchColumnFamilyException nscfe) {
        // NOTE(review): 'this' is the batch operation, not the region, so unless toString() is
        // overridden the message does not name the region -- confirm whether 'region' was meant.
        final String msg = "No such column family in batch mutation in region " + this;
        // The stack trace is logged only for the first occurrence in this batch; later ones get
        // the message text appended instead.
        if (observedExceptions.hasSeenNoSuchFamily()) {
          LOG.warn(msg + nscfe.getMessage());
        } else {
          LOG.warn(msg, nscfe);
          observedExceptions.sawNoSuchFamily();
        }
        retCodeDetails[index] =
          new OperationStatus(OperationStatusCode.BAD_FAMILY, nscfe.getMessage());
        if (isAtomic()) { // fail, atomic means all or none
          throw nscfe;
        }
      } catch (FailedSanityCheckException fsce) {
        final String msg = "Batch Mutation did not pass sanity check in region " + this;
        if (observedExceptions.hasSeenFailedSanityCheck()) {
          LOG.warn(msg + fsce.getMessage());
        } else {
          LOG.warn(msg, fsce);
          observedExceptions.sawFailedSanityCheck();
        }
        retCodeDetails[index] =
          new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
        if (isAtomic()) {
          throw fsce;
        }
      } catch (WrongRegionException we) {
        final String msg = "Batch mutation had a row that does not belong to this region " + this;
        if (observedExceptions.hasSeenWrongRegion()) {
          LOG.warn(msg + we.getMessage());
        } else {
          LOG.warn(msg, we);
          observedExceptions.sawWrongRegion();
        }
        // A wrong-region row is reported with the same status code as a failed sanity check.
        retCodeDetails[index] =
          new OperationStatus(OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
        if (isAtomic()) {
          throw we;
        }
      }
    }

    /**
     * Creates Mini-batch of all operations [nextIndexToProcess, lastIndexExclusive) for which a row
     * lock can be acquired. All mutations with locked rows are considered to be In-progress
     * operations and hence the name {@link MiniBatchOperationInProgress}. Mini batch is window over
     * {@link BatchOperation} and contains contiguous pending operations.
     * @param acquiredRowLocks keeps track of rowLocks acquired.
     * @return the mini-batch built from the last index reached and the number of locked operations
     * @throws IOException for an atomic batch, when the store is too busy or a row lock cannot be
     *                     obtained; for any batch, when lock acquisition times out or is
     *                     interrupted
     */
    public MiniBatchOperationInProgress<Mutation>
      lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException {
      int readyToWriteCount = 0;
      // The scan starts at index 0; operations that are no longer pending are skipped below.
      int lastIndexExclusive = 0;
      RowLock prevRowLock = null;
      for (; lastIndexExclusive < size(); lastIndexExclusive++) {
        // It reaches the miniBatchSize, stop here and process the miniBatch
        // This only applies to non-atomic batch operations.
        if (!isAtomic() && (readyToWriteCount == region.miniBatchSize)) {
          break;
        }

        if (!isOperationPending(lastIndexExclusive)) {
          continue;
        }

        // HBASE-19389 Limit concurrency of put with dense (hundreds) columns to avoid exhausting
        // RS handlers, covering both MutationBatchOperation and ReplayBatchOperation
        // The BAD_FAMILY/SANITY_CHECK_FAILURE cases are handled in checkAndPrepare phase and won't
        // pass the isOperationPending check
        Map<byte[], List<Cell>> curFamilyCellMap =
          getMutation(lastIndexExclusive).getFamilyCellMap();
        try {
          // start the protector before acquiring row lock considering performance, and will finish
          // it when encountering exception
          region.storeHotnessProtector.start(curFamilyCellMap);
        } catch (RegionTooBusyException rtbe) {
          region.storeHotnessProtector.finish(curFamilyCellMap);
          if (isAtomic()) {
            throw rtbe;
          }
          // Non-atomic: mark only this operation as rejected and move on to the next one.
          retCodeDetails[lastIndexExclusive] =
            new OperationStatus(OperationStatusCode.STORE_TOO_BUSY, rtbe.getMessage());
          continue;
        }

        Mutation mutation = getMutation(lastIndexExclusive);
        // If we haven't got any rows in our batch, we should block to get the next one.
        RowLock rowLock = null;
        // Set on every path that rethrows, so the finally block releases the protector that was
        // started above for this operation.
        boolean throwException = false;
        try {
          // if atomic then get exclusive lock, else shared lock
          rowLock = region.getRowLock(mutation.getRow(), !isAtomic(), prevRowLock);
        } catch (TimeoutIOException | InterruptedIOException e) {
          // NOTE: We will retry when other exceptions, but we should stop if we receive
          // TimeoutIOException or InterruptedIOException as operation has timed out or
          // interrupted respectively.
          throwException = true;
          throw e;
        } catch (IOException ioe) {
          LOG.warn("Failed getting lock, row={}, in region {}",
            Bytes.toStringBinary(mutation.getRow()), this, ioe);
          if (isAtomic()) { // fail, atomic means all or none
            throwException = true;
            throw ioe;
          }
          // Non-atomic: fall through with rowLock == null, which ends this mini-batch below.
        } catch (Throwable throwable) {
          throwException = true;
          throw throwable;
        } finally {
          if (throwException) {
            region.storeHotnessProtector.finish(curFamilyCellMap);
          }
        }
        if (rowLock == null) {
          // We failed to grab another lock
          if (isAtomic()) {
            region.storeHotnessProtector.finish(curFamilyCellMap);
            throw new IOException("Can't apply all operations atomically!");
          }
          break; // Stop acquiring more rows for this batch
        } else {
          if (rowLock != prevRowLock) {
            // It is a different row now, add this to the acquiredRowLocks and
            // set prevRowLock to the new returned rowLock
            acquiredRowLocks.add(rowLock);
            prevRowLock = rowLock;
          }
        }

        readyToWriteCount++;
      }
      return createMiniBatch(lastIndexExclusive, readyToWriteCount);
    }

    /**
     * Wraps the batch state into a {@link MiniBatchOperationInProgress} covering
     * [nextIndexToProcess, lastIndexExclusive).
     */
    protected MiniBatchOperationInProgress<Mutation> createMiniBatch(final int lastIndexExclusive,
      final int readyToWriteCount) {
      return new MiniBatchOperationInProgress<>(getMutationsForCoprocs(), retCodeDetails,
        walEditsFromCoprocessors, nextIndexToProcess, lastIndexExclusive, readyToWriteCount);
    }

    /**
     * Builds separate WALEdit per nonce by applying input mutations. If WALEdits from CP are
     * present, they are merged to result WALEdit.
3565 */ 3566 public List<Pair<NonceKey, WALEdit>> 3567 buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { 3568 List<Pair<NonceKey, WALEdit>> walEdits = new ArrayList<>(); 3569 3570 visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), new Visitor() { 3571 private Pair<NonceKey, WALEdit> curWALEditForNonce; 3572 3573 @Override 3574 public boolean visit(int index) throws IOException { 3575 Mutation m = getMutation(index); 3576 // we use durability of the original mutation for the mutation passed by CP. 3577 if (region.getEffectiveDurability(m.getDurability()) == Durability.SKIP_WAL) { 3578 region.recordMutationWithoutWal(m.getFamilyCellMap()); 3579 return true; 3580 } 3581 3582 // the batch may contain multiple nonce keys (replay case). If so, write WALEdit for each. 3583 // Given how nonce keys are originally written, these should be contiguous. 3584 // They don't have to be, it will still work, just write more WALEdits than needed. 3585 long nonceGroup = getNonceGroup(index); 3586 long nonce = getNonce(index); 3587 if ( 3588 curWALEditForNonce == null 3589 || curWALEditForNonce.getFirst().getNonceGroup() != nonceGroup 3590 || curWALEditForNonce.getFirst().getNonce() != nonce 3591 ) { 3592 curWALEditForNonce = new Pair<>(new NonceKey(nonceGroup, nonce), 3593 new WALEdit(miniBatchOp.getCellCount(), isInReplay())); 3594 walEdits.add(curWALEditForNonce); 3595 } 3596 WALEdit walEdit = curWALEditForNonce.getSecond(); 3597 3598 // Add WAL edits from CPs. 3599 WALEdit fromCP = walEditsFromCoprocessors[index]; 3600 if (fromCP != null) { 3601 for (Cell cell : fromCP.getCells()) { 3602 walEdit.add(cell); 3603 } 3604 } 3605 walEdit.add(familyCellMaps[index]); 3606 3607 return true; 3608 } 3609 }); 3610 return walEdits; 3611 } 3612 3613 /** 3614 * This method completes mini-batch operations by calling postBatchMutate() CP hook (if 3615 * required) and completing mvcc. 
3616 */ 3617 public void completeMiniBatchOperations( 3618 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry) 3619 throws IOException { 3620 if (writeEntry != null) { 3621 region.mvcc.completeAndWait(writeEntry); 3622 } 3623 } 3624 3625 public void doPostOpCleanupForMiniBatch( 3626 final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WALEdit walEdit, 3627 boolean success) throws IOException { 3628 doFinishHotnessProtector(miniBatchOp); 3629 } 3630 3631 private void 3632 doFinishHotnessProtector(final MiniBatchOperationInProgress<Mutation> miniBatchOp) { 3633 // check and return if the protector is not enabled 3634 if (!region.storeHotnessProtector.isEnable()) { 3635 return; 3636 } 3637 // miniBatchOp is null, if and only if lockRowsAndBuildMiniBatch throwing exception. 3638 // This case was handled. 3639 if (miniBatchOp == null) { 3640 return; 3641 } 3642 3643 final int finalLastIndexExclusive = miniBatchOp.getLastIndexExclusive(); 3644 3645 for (int i = nextIndexToProcess; i < finalLastIndexExclusive; i++) { 3646 switch (retCodeDetails[i].getOperationStatusCode()) { 3647 case SUCCESS: 3648 case FAILURE: 3649 region.storeHotnessProtector.finish(getMutation(i).getFamilyCellMap()); 3650 break; 3651 default: 3652 // do nothing 3653 // We won't start the protector for NOT_RUN/BAD_FAMILY/SANITY_CHECK_FAILURE and the 3654 // STORE_TOO_BUSY case is handled in StoreHotnessProtector#start 3655 break; 3656 } 3657 } 3658 } 3659 3660 /** 3661 * Atomically apply the given map of family->edits to the memstore. This handles the consistency 3662 * control on its own, but the caller should already have locked updatesLock.readLock(). This 3663 * also does <b>not</b> check the families for validity. 
* @param familyMap          Map of Cells by family
     * @param memstoreAccounting sizing object handed to every per-store apply call
     */
    protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap,
      MemStoreSizing memstoreAccounting) {
      for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
        byte[] family = e.getKey();
        List<Cell> cells = e.getValue();
        assert cells instanceof RandomAccess;
        region.applyToMemStore(region.getStore(family), cells, false, memstoreAccounting);
      }
    }
  }

  /**
   * Batch of mutation operations. Base class is shared with {@link ReplayBatchOperation} as most of
   * the logic is same.
   */
  private static class MutationBatchOperation extends BatchOperation<Mutation> {

    // For nonce operations
    private long nonceGroup;
    private long nonce;
    // Result of startNonceOperation(), assigned at the start of prepareMiniBatchOperations.
    protected boolean canProceed;

    /**
     * @param region     region the batch is applied to
     * @param operations mutations of the batch
     * @param atomic     whether the batch is all-or-nothing
     * @param nonceGroup nonce group shared by every operation of the batch
     * @param nonce      nonce shared by every operation of the batch
     */
    public MutationBatchOperation(final HRegion region, Mutation[] operations, boolean atomic,
      long nonceGroup, long nonce) {
      super(region, operations);
      this.atomic = atomic;
      this.nonceGroup = nonceGroup;
      this.nonce = nonce;
    }

    @Override
    public Mutation getMutation(int index) {
      return this.operations[index];
    }

    // The nonce group and nonce are per batch, so the index is ignored by both getters.
    @Override
    public long getNonceGroup(int index) {
      return nonceGroup;
    }

    @Override
    public long getNonce(int index) {
      return nonce;
    }

    @Override
    public Mutation[] getMutationsForCoprocs() {
      return this.operations;
    }

    @Override
    public boolean isInReplay() {
      return false;
    }

    @Override
    public long getOrigLogSeqNum() {
      return SequenceId.NO_SEQUENCE_ID;
    }

    @Override
    public void startRegionOperation() throws IOException {
      region.startRegionOperation(Operation.BATCH_MUTATE);
    }

    @Override
    public void closeRegionOperation() throws IOException {
      region.closeRegionOperation(Operation.BATCH_MUTATE);
    }

    /** Validates the families named by the Put against the region. */
    @Override
    public void checkAndPreparePut(Put p) throws IOException {
      region.checkFamilies(p.getFamilyCellMap().keySet());
    }

    /**
     * Runs the coprocessor pre-mutate hook (when a coprocessor host exists) and then the
     * per-mutation check for every pending operation of the batch, and finally updates the
     * region metrics for each operation type counted in the {@code metrics} array.
     */
    @Override
    public void checkAndPrepare() throws IOException {
      // index 0: puts, index 1: deletes, index 2: increments, index 3: append
      final int[] metrics = { 0, 0, 0, 0 };

      visitBatchOperations(true, this.size(), new Visitor() {
        // One timestamp, taken when the visitor is created, is used to validate every mutation.
        private long now = EnvironmentEdgeManager.currentTime();
        // Scratch edit handed to the pre hook; replaced only after a hook has put cells into it.
        private WALEdit walEdit;

        @Override
        public boolean visit(int index) throws IOException {
          // Run coprocessor pre hook outside of locks to avoid deadlock
          if (region.coprocessorHost != null) {
            if (walEdit == null) {
              walEdit = new WALEdit();
            }
            callPreMutateCPHook(index, walEdit, metrics);
            if (!walEdit.isEmpty()) {
              // The hook added cells: keep this edit for the operation and start a fresh one
              // for the next operation.
              walEditsFromCoprocessors[index] = walEdit;
              walEdit = null;
            }
          }
          // Re-checked because the status may no longer be NOT_RUN after the hook ran.
          if (isOperationPending(index)) {
            // TODO: Currently validation is done with current time before acquiring locks and
            // updates are done with different timestamps after acquiring locks. This behavior is
            // inherited from the code prior to this change. Can this be changed?
            checkAndPrepareMutation(index, now);
          }
          return true;
        }
      });

      // FIXME: we may update metrics twice! here for all operations bypassed by CP and later in
      // normal processing.
      // Update metrics in same way as it is done when we go the normal processing route (we now
      // update general metrics though a Coprocessor did the work).
      if (region.metricsRegion != null) {
        if (metrics[0] > 0) {
          // There were some Puts in the batch.
          region.metricsRegion.updatePut();
        }
        if (metrics[1] > 0) {
          // There were some Deletes in the batch.
          region.metricsRegion.updateDelete();
        }
        if (metrics[2] > 0) {
          // There were some Increment in the batch.
          region.metricsRegion.updateIncrement();
        }
        if (metrics[3] > 0) {
          // There were some Append in the batch.
          region.metricsRegion.updateAppend();
        }
      }
    }

    /**
     * Prepares every pending operation of the mini-batch: stamps Put and Delete cells with
     * {@code timestamp}, computes the cells to write for Increment/Append, rewrites cell tags and
     * accumulates the cell count on the mini-batch. Afterwards, when a coprocessor host exists,
     * calls its preBatchMutate hook and merges the mutations it added.
     * @param miniBatchOp      mini-batch being prepared; its counters are updated here
     * @param timestamp        time applied to the cells of the mini-batch
     * @param acquiredRowLocks row locks held so far, passed on to the coprocessor merge step
     */
    @Override
    public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
      long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
      // For nonce operations
      canProceed = startNonceOperation();

      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
        Mutation mutation = getMutation(index);
        if (mutation instanceof Put) {
          HRegion.updateCellTimestamps(familyCellMaps[index].values(), Bytes.toBytes(timestamp));
          miniBatchOp.incrementNumOfPuts();
        } else if (mutation instanceof Delete) {
          region.prepareDeleteTimestamps(mutation, familyCellMaps[index], Bytes.toBytes(timestamp));
          miniBatchOp.incrementNumOfDeletes();
        } else if (mutation instanceof Increment || mutation instanceof Append) {
          boolean returnResults;
          if (mutation instanceof Increment) {
            returnResults = ((Increment) mutation).isReturnResults();
          } else {
            returnResults = ((Append) mutation).isReturnResults();
          }

          // For nonce operations
          if (!canProceed) {
            // The nonce start call said not to proceed: nothing is written, the operation is
            // reported as SUCCESS and, if results were requested, answered with a read.
            Result result;
            if (returnResults) {
              // convert duplicate increment/append to get
              List<Cell> results = region.get(toGet(mutation), false, nonceGroup, nonce);
              result = Result.create(results);
            } else {
              result = Result.EMPTY_RESULT;
            }
            retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
            return true;
          }

          Result result = null;
          if (region.coprocessorHost != null) {
            if (mutation instanceof Increment) {
              result = region.coprocessorHost.preIncrementAfterRowLock((Increment) mutation);
            } else {
              result = region.coprocessorHost.preAppendAfterRowLock((Append) mutation);
            }
          }
          if (result != null) {
            // A non-null hook result completes the operation here; the steps below are skipped.
            retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS,
              returnResults ? result : Result.EMPTY_RESULT);
            return true;
          }

          // A null list tells reckonDeltas not to collect cells for the client.
          List<Cell> results = returnResults ? new ArrayList<>(mutation.size()) : null;
          familyCellMaps[index] = reckonDeltas(mutation, results, timestamp);
          this.results[index] = results != null ? Result.create(results) : Result.EMPTY_RESULT;

          if (mutation instanceof Increment) {
            miniBatchOp.incrementNumOfIncrements();
          } else {
            miniBatchOp.incrementNumOfAppends();
          }
        }
        region.rewriteCellTags(familyCellMaps[index], mutation);

        // update cell count
        // NOTE(review): the count is taken from mutation.getFamilyCellMap(), not from
        // familyCellMaps[index]; for Increment/Append those are different maps -- confirm this
        // is intended.
        if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
          for (List<Cell> cells : mutation.getFamilyCellMap().values()) {
            miniBatchOp.addCellCount(cells.size());
          }
        }

        WALEdit fromCP = walEditsFromCoprocessors[index];
        if (fromCP != null) {
          miniBatchOp.addCellCount(fromCP.size());
        }
        return true;
      });

      if (region.coprocessorHost != null) {
        // calling the pre CP hook for batch mutation
        region.coprocessorHost.preBatchMutate(miniBatchOp);
        checkAndMergeCPMutations(miniBatchOp, acquiredRowLocks, timestamp);
      }
    }

    /**
     * Starts the nonce operation for a mutation, if needed.
     * @return whether to proceed this mutation.
3883 */ 3884 private boolean startNonceOperation() throws IOException { 3885 if ( 3886 region.rsServices == null || region.rsServices.getNonceManager() == null 3887 || nonce == HConstants.NO_NONCE 3888 ) { 3889 return true; 3890 } 3891 boolean canProceed; 3892 try { 3893 canProceed = 3894 region.rsServices.getNonceManager().startOperation(nonceGroup, nonce, region.rsServices); 3895 } catch (InterruptedException ex) { 3896 throw new InterruptedIOException("Nonce start operation interrupted"); 3897 } 3898 return canProceed; 3899 } 3900 3901 /** 3902 * Ends nonce operation for a mutation, if needed. 3903 * @param success Whether the operation for this nonce has succeeded. 3904 */ 3905 private void endNonceOperation(boolean success) { 3906 if ( 3907 region.rsServices != null && region.rsServices.getNonceManager() != null 3908 && nonce != HConstants.NO_NONCE 3909 ) { 3910 region.rsServices.getNonceManager().endOperation(nonceGroup, nonce, success); 3911 } 3912 } 3913 3914 private static Get toGet(final Mutation mutation) throws IOException { 3915 assert mutation instanceof Increment || mutation instanceof Append; 3916 Get get = new Get(mutation.getRow()); 3917 CellScanner cellScanner = mutation.cellScanner(); 3918 while (!cellScanner.advance()) { 3919 Cell cell = cellScanner.current(); 3920 get.addColumn(CellUtil.cloneFamily(cell), CellUtil.cloneQualifier(cell)); 3921 } 3922 if (mutation instanceof Increment) { 3923 // Increment 3924 Increment increment = (Increment) mutation; 3925 get.setTimeRange(increment.getTimeRange().getMin(), increment.getTimeRange().getMax()); 3926 } else { 3927 // Append 3928 Append append = (Append) mutation; 3929 get.setTimeRange(append.getTimeRange().getMin(), append.getTimeRange().getMax()); 3930 } 3931 for (Entry<String, byte[]> entry : mutation.getAttributesMap().entrySet()) { 3932 get.setAttribute(entry.getKey(), entry.getValue()); 3933 } 3934 return get; 3935 } 3936 3937 private Map<byte[], List<Cell>> reckonDeltas(Mutation mutation, 
List<Cell> results, long now) 3938 throws IOException { 3939 assert mutation instanceof Increment || mutation instanceof Append; 3940 Map<byte[], List<Cell>> ret = new TreeMap<>(Bytes.BYTES_COMPARATOR); 3941 // Process a Store/family at a time. 3942 for (Map.Entry<byte[], List<Cell>> entry : mutation.getFamilyCellMap().entrySet()) { 3943 final byte[] columnFamilyName = entry.getKey(); 3944 List<Cell> deltas = entry.getValue(); 3945 // Reckon for the Store what to apply to WAL and MemStore. 3946 List<Cell> toApply = 3947 reckonDeltasByStore(region.stores.get(columnFamilyName), mutation, now, deltas, results); 3948 if (!toApply.isEmpty()) { 3949 for (Cell cell : toApply) { 3950 HStore store = region.getStore(cell); 3951 if (store == null) { 3952 region.checkFamily(CellUtil.cloneFamily(cell)); 3953 } else { 3954 ret.computeIfAbsent(store.getColumnFamilyDescriptor().getName(), 3955 key -> new ArrayList<>()).add(cell); 3956 } 3957 } 3958 } 3959 } 3960 return ret; 3961 } 3962 3963 /** 3964 * Reckon the Cells to apply to WAL, memstore, and to return to the Client in passed column 3965 * family/Store. Does Get of current value and then adds passed in deltas for this Store 3966 * returning the result. 3967 * @param mutation The encompassing Mutation object 3968 * @param deltas Changes to apply to this Store; either increment amount or data to append 3969 * @param results In here we accumulate all the Cells we are to return to the client. If null, 3970 * client doesn't want results returned. 3971 * @return Resulting Cells after <code>deltas</code> have been applied to current values. Side 3972 * effect is our filling out of the <code>results</code> List. 
*/
    private List<Cell> reckonDeltasByStore(HStore store, Mutation mutation, long now,
      List<Cell> deltas, List<Cell> results) throws IOException {
      assert mutation instanceof Increment || mutation instanceof Append;
      byte[] columnFamily = store.getColumnFamilyDescriptor().getName();
      // Pairs of (current cell or null, new cell), in the order of the sorted deltas.
      List<Pair<Cell, Cell>> cellPairs = new ArrayList<>(deltas.size());

      // Sort the cells so that they match the order that they appear in the Get results.
      // Otherwise, we won't be able to find the existing values if the cells are not specified
      // in order by the client since cells are in an array list.
      deltas.sort(store.getComparator());

      // Get previous values for all columns in this family.
      Get get = new Get(mutation.getRow());
      for (Cell cell : deltas) {
        get.addColumn(columnFamily, CellUtil.cloneQualifier(cell));
      }
      TimeRange tr;
      if (mutation instanceof Increment) {
        tr = ((Increment) mutation).getTimeRange();
      } else {
        tr = ((Append) mutation).getTimeRange();
      }

      if (tr != null) {
        get.setTimeRange(tr.getMin(), tr.getMax());
      }

      try (RegionScanner scanner = region.getScanner(new Scan(get))) {
        // NOTE: Please don't use HRegion.get() instead,
        // because it will copy cells to heap. See HBASE-26036
        List<Cell> currentValues = new ArrayList<>();
        scanner.next(currentValues);
        // Iterate the input columns and update existing values if they were found, otherwise
        // add new column initialized to the delta amount
        int currentValuesIndex = 0;
        for (int i = 0; i < deltas.size(); i++) {
          Cell delta = deltas.get(i);
          Cell currentValue = null;
          if (
            currentValuesIndex < currentValues.size()
              && CellUtil.matchingQualifier(currentValues.get(currentValuesIndex), delta)
          ) {
            currentValue = currentValues.get(currentValuesIndex);
            // Only move on to the next current value when the next delta targets a different
            // qualifier; consecutive deltas on the same qualifier reuse the same current value.
            if (i < (deltas.size() - 1) && !CellUtil.matchingQualifier(delta, deltas.get(i + 1))) {
              currentValuesIndex++;
            }
          }
          // Switch on whether this an increment or an append building the new Cell to apply.
          Cell newCell;
          if (mutation instanceof Increment) {
            long deltaAmount = getLongValue(delta);
            final long newValue =
              currentValue == null ? deltaAmount : getLongValue(currentValue) + deltaAmount;
            newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation,
              (oldCell) -> Bytes.toBytes(newValue));
          } else {
            // Append: new value is the old value's bytes followed by the delta's bytes. The
            // lambda is only applied by reckonDelta when a current cell exists.
            newCell = reckonDelta(delta, currentValue, columnFamily, now, mutation,
              (oldCell) -> ByteBuffer
                .wrap(new byte[delta.getValueLength() + oldCell.getValueLength()])
                .put(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength())
                .put(delta.getValueArray(), delta.getValueOffset(), delta.getValueLength())
                .array());
          }
          // Enforce the region's cell size limit (a limit <= 0 disables the check).
          if (region.maxCellSize > 0) {
            int newCellSize = PrivateCellUtil.estimatedSerializedSizeOf(newCell);
            if (newCellSize > region.maxCellSize) {
              String msg = "Cell with size " + newCellSize + " exceeds limit of "
                + region.maxCellSize + " bytes in region " + this;
              LOG.debug(msg);
              throw new DoNotRetryIOException(msg);
            }
          }
          cellPairs.add(new Pair<>(currentValue, newCell));
          // Add to results to get returned to the Client. If null, client does not want results.
          if (results != null) {
            results.add(newCell);
          }
        }
        // Give coprocessors a chance to update the new cells before apply to WAL or memstore
        if (region.coprocessorHost != null) {
          // Here the operation must be increment or append.
          cellPairs = mutation instanceof Increment
            ? region.coprocessorHost.postIncrementBeforeWAL(mutation, cellPairs)
            : region.coprocessorHost.postAppendBeforeWAL(mutation, cellPairs);
        }
      }
      // Only the new cells (second of each pair) are handed back for application.
      return cellPairs.stream().map(Pair::getSecond).collect(Collectors.toList());
    }

    /**
     * Builds the cell to write for one delta of an Increment/Append.
     * <p>
     * When a current cell exists, a new Put-type cell is built on the mutation's row with the
     * value produced by <code>supplier</code>, tags carried forward from both the delta and the
     * current cell (plus the mutation's TTL), and a timestamp of
     * <code>max(currentCell timestamp + 1, now)</code>. When there is no current cell, the delta
     * itself is used after its latest-timestamp is updated to <code>now</code>; it is re-created
     * with tags only if the tag list is non-empty.
     * @param delta        the cell supplied by the client for this column
     * @param currentCell  the existing cell for this column, or null if none was found
     * @param columnFamily family name for the new cell
     * @param now          timestamp to apply
     * @param mutation     the encompassing Increment/Append (source of row and TTL)
     * @param supplier     computes the new value bytes from the current cell; only invoked when
     *                     <code>currentCell</code> is not null
     * @return the cell to apply
     */
    private static Cell reckonDelta(final Cell delta, final Cell currentCell,
      final byte[] columnFamily, final long now, Mutation mutation, Function<Cell, byte[]> supplier)
      throws IOException {
      // Forward any tags found on the delta.
      List<Tag> tags = TagUtil.carryForwardTags(delta);
      if (currentCell != null) {
        tags = TagUtil.carryForwardTags(tags, currentCell);
        tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
        byte[] newValue = supplier.apply(currentCell);
        return ExtendedCellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
          .setRow(mutation.getRow(), 0, mutation.getRow().length)
          .setFamily(columnFamily, 0, columnFamily.length)
          // copy the qualifier if the cell is located in shared memory.
          .setQualifier(CellUtil.cloneQualifier(delta))
          // Timestamp is strictly greater than the current cell's, and at least 'now'.
          .setTimestamp(Math.max(currentCell.getTimestamp() + 1, now))
          .setType(KeyValue.Type.Put.getCode()).setValue(newValue, 0, newValue.length)
          .setTags(TagUtil.fromList(tags)).build();
      } else {
        tags = TagUtil.carryForwardTTLTag(tags, mutation.getTTL());
        PrivateCellUtil.updateLatestStamp(delta, now);
        return CollectionUtils.isEmpty(tags) ? delta : PrivateCellUtil.createCell(delta, tags);
      }
    }

    /**
     * Returns the long out of the passed in Cell.
     * @param cell cell whose value is read as a long
     * @return the cell value as a long
     * @throws DoNotRetryIOException if the value is not exactly <code>Bytes.SIZEOF_LONG</code>
     *                               bytes wide
     */
    private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
      int len = cell.getValueLength();
      if (len != Bytes.SIZEOF_LONG) {
        // throw DoNotRetryIOException instead of IllegalArgumentException
        throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
      }
      return PrivateCellUtil.getValueAsLong(cell);
    }

    /**
     * Builds the WAL edits via the superclass and rejects a result holding more than one
     * (nonce key, edit) pair.
     * @throws IOException if more than one pair was produced
     */
    @Override
    public List<Pair<NonceKey, WALEdit>>
      buildWALEdits(final MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException {
      List<Pair<NonceKey, WALEdit>> walEdits = super.buildWALEdits(miniBatchOp);
      // for MutationBatchOperation, more than one nonce is not allowed
      if (walEdits.size() > 1) {
        throw new IOException("Found multiple nonce keys per batch!");
      }
      return walEdits;
    }

    /**
     * Writes the mini-batch to the memstore under the given write entry's write number. If no
     * write entry is passed in, a new one is begun on the region's mvcc.
     * @return the write entry that was used (the one passed in, or the newly begun one)
     */
    @Override
    public WriteEntry writeMiniBatchOperationsToMemStore(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, @Nullable WriteEntry writeEntry)
      throws IOException {
      if (writeEntry == null) {
        writeEntry = region.mvcc.begin();
      }
      super.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry.getWriteNumber());
      return writeEntry;
    }

    /**
     * Calls the postBatchMutate coprocessor hook (if a coprocessor host exists), then delegates
     * to the superclass, and finally, for nonce operations, records the write number on the
     * nonce manager's operation context.
     */
    @Override
    public void completeMiniBatchOperations(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
      throws IOException {
      // TODO: can it be done after completing mvcc?
      // calling the post CP hook for batch mutation
      if (region.coprocessorHost != null) {
        region.coprocessorHost.postBatchMutate(miniBatchOp);
      }
      super.completeMiniBatchOperations(miniBatchOp, writeEntry);

      if (nonce != HConstants.NO_NONCE) {
        if (region.rsServices != null && region.rsServices.getNonceManager() != null) {
          region.rsServices.getNonceManager().addMvccToOperationContext(nonceGroup, nonce,
            writeEntry.getWriteNumber());
        }
      }
    }

    /**
     * Post-operation cleanup for a mini-batch: runs the per-mutation post hooks for successful
     * operations, ends the nonce operation, updates per-type region metrics, and finally calls
     * the postBatchMutateIndispensably hook.
     * @param miniBatchOp the mini-batch, or null (in which case only the superclass cleanup and
     *                    the final hook run, the latter with an empty mini-batch)
     * @param walEdit     the WAL edit passed to the post hooks
     * @param success     whether the mini-batch succeeded
     */
    @Override
    public void doPostOpCleanupForMiniBatch(MiniBatchOperationInProgress<Mutation> miniBatchOp,
      final WALEdit walEdit, boolean success) throws IOException {

      super.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, success);
      if (miniBatchOp != null) {
        // synced so that the coprocessor contract is adhered to.
        if (region.coprocessorHost != null) {
          visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
            // only for successful puts/deletes/increments/appends
            if (retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.SUCCESS) {
              Mutation m = getMutation(i);
              if (m instanceof Put) {
                region.coprocessorHost.postPut((Put) m, walEdit);
              } else if (m instanceof Delete) {
                region.coprocessorHost.postDelete((Delete) m, walEdit);
              } else if (m instanceof Increment) {
                Result result =
                  region.getCoprocessorHost().postIncrement((Increment) m, results[i], walEdit);
                // The hook may substitute a different Result; if so, report that one.
                if (result != results[i]) {
                  retCodeDetails[i] =
                    new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result);
                }
              } else if (m instanceof Append) {
                Result result =
                  region.getCoprocessorHost().postAppend((Append) m, results[i], walEdit);
                // The hook may substitute a different Result; if so, report that one.
                if (result != results[i]) {
                  retCodeDetails[i] =
                    new OperationStatus(retCodeDetails[i].getOperationStatusCode(), result);
                }
              }
            }
            return true;
          });
        }

        // For nonce operations
        if (canProceed && nonce != HConstants.NO_NONCE) {
          // Single-element array so the lambda below can write to it.
          boolean[] areAllIncrementsAndAppendsSuccessful = new boolean[] { true };
          visitBatchOperations(false, miniBatchOp.getLastIndexExclusive(), (int i) -> {
            Mutation mutation = getMutation(i);
            if (mutation instanceof Increment || mutation instanceof Append) {
              if (retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) {
                areAllIncrementsAndAppendsSuccessful[0] = false;
                // Returning false presumably stops the visit early - confirm against
                // visitBatchOperations.
                return false;
              }
            }
            return true;
          });
          endNonceOperation(areAllIncrementsAndAppendsSuccessful[0]);
        }

        // See if the column families were consistent through the whole thing.
        // if they were then keep them. If they were not then pass a null.
        // null will be treated as unknown.
        // Total time taken might be involving Puts, Deletes, Increments and Appends.
        // Split the time for puts and deletes based on the total number of Puts, Deletes,
        // Increments and Appends.
        if (region.metricsRegion != null) {
          if (miniBatchOp.getNumOfPuts() > 0) {
            // There were some Puts in the batch.
            region.metricsRegion.updatePut();
          }
          if (miniBatchOp.getNumOfDeletes() > 0) {
            // There were some Deletes in the batch.
            region.metricsRegion.updateDelete();
          }
          if (miniBatchOp.getNumOfIncrements() > 0) {
            // There were some Increments in the batch.
            region.metricsRegion.updateIncrement();
          }
          if (miniBatchOp.getNumOfAppends() > 0) {
            // There were some Appends in the batch.
            region.metricsRegion.updateAppend();
          }
        }
      }

      if (region.coprocessorHost != null) {
        // call the coprocessor hook to do any finalization steps after the put is done
        region.coprocessorHost.postBatchMutateIndispensably(
          miniBatchOp != null ? miniBatchOp : createMiniBatch(size(), 0), success);
      }
    }

    /**
     * Runs prePut/preDelete/preIncrement/preAppend coprocessor hook for input mutation in a batch.
     * When a hook asks to bypass the mutation, the mutation is marked as SUCCESS and the matching
     * counter in <code>metrics</code> is incremented. Any other mutation type is marked FAILURE
     * and, for an atomic batch, an IOException is thrown.
     * @param index   index of the mutation in this batch
     * @param walEdit WAL edit handed to the hook
     * @param metrics Array of 4 ints. index 0: count of puts, index 1: count of deletes, index 2:
     *                count of increments and 3: count of appends
     */
    private void callPreMutateCPHook(int index, final WALEdit walEdit, final int[] metrics)
      throws IOException {
      Mutation m = getMutation(index);
      if (m instanceof Put) {
        if (region.coprocessorHost.prePut((Put) m, walEdit)) {
          // pre hook says skip this Put
          // mark as success and skip in doMiniBatchMutation
          metrics[0]++;
          retCodeDetails[index] = OperationStatus.SUCCESS;
        }
      } else if (m instanceof Delete) {
        Delete curDel = (Delete) m;
        if (curDel.getFamilyCellMap().isEmpty()) {
          // handle deleting a row case
          // TODO: prepareDelete() has been called twice, before and after preDelete() CP hook.
          // Can this be avoided?
          region.prepareDelete(curDel);
        }
        if (region.coprocessorHost.preDelete(curDel, walEdit)) {
          // pre hook says skip this Delete
          // mark as success and skip in doMiniBatchMutation
          metrics[1]++;
          retCodeDetails[index] = OperationStatus.SUCCESS;
        }
      } else if (m instanceof Increment) {
        Increment increment = (Increment) m;
        Result result = region.coprocessorHost.preIncrement(increment, walEdit);
        if (result != null) {
          // pre hook says skip this Increment
          // mark as success and skip in doMiniBatchMutation
          metrics[2]++;
          retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
        }
      } else if (m instanceof Append) {
        Append append = (Append) m;
        Result result = region.coprocessorHost.preAppend(append, walEdit);
        if (result != null) {
          // pre hook says skip this Append
          // mark as success and skip in doMiniBatchMutation
          metrics[3]++;
          retCodeDetails[index] = new OperationStatus(OperationStatusCode.SUCCESS, result);
        }
      } else {
        String msg = "Put/Delete/Increment/Append mutations only supported in a batch";
        retCodeDetails[index] = new OperationStatus(OperationStatusCode.FAILURE, msg);
        if (isAtomic()) { // fail, atomic means all or none
          throw new IOException(msg);
        }
      }
    }

    /**
     * Validates the extra mutations that coprocessors attached to the mutations of this
     * mini-batch, takes a row lock for each, and merges their cells into the family map of the
     * mutation they were attached to.
     * @param miniBatchOp      the mini-batch in progress
     * @param acquiredRowLocks list to which every newly acquired row lock is appended
     * @param timestamp        timestamp used when checking/preparing the added mutations
     */
    // TODO Support Increment/Append operations
    private void checkAndMergeCPMutations(final MiniBatchOperationInProgress<Mutation> miniBatchOp,
      final List<RowLock> acquiredRowLocks, final long timestamp) throws IOException {
      visitBatchOperations(true, nextIndexToProcess + miniBatchOp.size(), (int i) -> {
        // we pass (i - firstIndex) below since the call expects a relative index
        Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - nextIndexToProcess);
        if (cpMutations == null) {
          return true;
        }
        // Else Coprocessor added more Mutations corresponding to the Mutation at this index.
        Mutation mutation = getMutation(i);
        for (Mutation cpMutation : cpMutations) {
          this.checkAndPrepareMutation(cpMutation, timestamp);

          // Acquire row locks. If not, the whole batch will fail.
          acquiredRowLocks.add(region.getRowLock(cpMutation.getRow(), true, null));

          // Returned mutations from coprocessor correspond to the Mutation at index i. We can
          // directly add the cells from those mutations to the familyMaps of this mutation.
          Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
          region.rewriteCellTags(cpFamilyMap, mutation);
          // will get added to the memStore later
          mergeFamilyMaps(familyCellMaps[i], cpFamilyMap);

          // The durability of returned mutation is replaced by the corresponding mutation.
          // If the corresponding mutation contains the SKIP_WAL, we shouldn't count the
          // cells of returned mutation.
          if (region.getEffectiveDurability(mutation.getDurability()) != Durability.SKIP_WAL) {
            for (List<Cell> cells : cpFamilyMap.values()) {
              miniBatchOp.addCellCount(cells.size());
            }
          }
        }
        return true;
      });
    }

    /**
     * Merges <code>toBeMerged</code> into <code>familyMap</code>: for a family already present
     * the cells are appended to the existing list; otherwise the incoming list itself is put
     * into the map (not copied).
     */
    private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
      Map<byte[], List<Cell>> toBeMerged) {
      for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
        List<Cell> cells = familyMap.get(entry.getKey());
        if (cells == null) {
          familyMap.put(entry.getKey(), entry.getValue());
        } else {
          cells.addAll(entry.getValue());
        }
      }
    }
  }

  /**
   * Batch of mutations for replay. Base class is shared with {@link MutationBatchOperation} as most
   * of the logic is same.
*/
  private static final class ReplayBatchOperation extends BatchOperation<MutationReplay> {

    // Sequence id supplied at construction; returned by getOrigLogSeqNum().
    private long origLogSeqNum = 0;

    public ReplayBatchOperation(final HRegion region, MutationReplay[] operations,
      long origLogSeqNum) {
      super(region, operations);
      this.origLogSeqNum = origLogSeqNum;
    }

    @Override
    public Mutation getMutation(int index) {
      return this.operations[index].mutation;
    }

    @Override
    public long getNonceGroup(int index) {
      return this.operations[index].nonceGroup;
    }

    @Override
    public long getNonce(int index) {
      return this.operations[index].nonce;
    }

    // Replay exposes no mutations to coprocessors.
    @Override
    public Mutation[] getMutationsForCoprocs() {
      return null;
    }

    @Override
    public boolean isInReplay() {
      return true;
    }

    @Override
    public long getOrigLogSeqNum() {
      return this.origLogSeqNum;
    }

    @Override
    public void startRegionOperation() throws IOException {
      region.startRegionOperation(Operation.REPLAY_BATCH_MUTATE);
    }

    @Override
    public void closeRegionOperation() throws IOException {
      region.closeRegionOperation(Operation.REPLAY_BATCH_MUTATE);
    }

    /**
     * During replay, there could exist column families which are removed between region server
     * failure and replay. Such families are logged and removed from the Put's family cell map
     * instead of failing the Put.
     */
    @Override
    protected void checkAndPreparePut(Put p) throws IOException {
      Map<byte[], List<Cell>> familyCellMap = p.getFamilyCellMap();
      // Collected first and removed afterwards, so the map is not modified while iterating it.
      List<byte[]> nonExistentList = null;
      for (byte[] family : familyCellMap.keySet()) {
        if (!region.htableDescriptor.hasColumnFamily(family)) {
          if (nonExistentList == null) {
            nonExistentList = new ArrayList<>();
          }
          nonExistentList.add(family);
        }
      }
      if (nonExistentList != null) {
        for (byte[] family : nonExistentList) {
          // Perhaps schema was changed between crash and replay
          LOG.info("No family for {} omit from reply in region {}.", Bytes.toString(family), this);
          familyCellMap.remove(family);
        }
      }
    }

    /** Checks and prepares every mutation of the batch using a single current timestamp. */
    @Override
    public void checkAndPrepare() throws IOException {
      long now = EnvironmentEdgeManager.currentTime();
      visitBatchOperations(true, this.size(), (int index) -> {
        checkAndPrepareMutation(index, now);
        return true;
      });
    }

    /**
     * For replay, preparation only adds each visited mutation's cells to the mini-batch cell
     * count; <code>timestamp</code> and <code>acquiredRowLocks</code> are not used here.
     */
    @Override
    public void prepareMiniBatchOperations(MiniBatchOperationInProgress<Mutation> miniBatchOp,
      long timestamp, final List<RowLock> acquiredRowLocks) throws IOException {
      visitBatchOperations(true, miniBatchOp.getLastIndexExclusive(), (int index) -> {
        // update cell count
        for (List<Cell> cells : getMutation(index).getFamilyCellMap().values()) {
          miniBatchOp.addCellCount(cells.size());
        }
        return true;
      });
    }

    /**
     * Writes the mini-batch to the memstore using the original log sequence number rather than
     * the write entry's number; the passed-in write entry is returned unchanged.
     */
    @Override
    public WriteEntry writeMiniBatchOperationsToMemStore(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
      throws IOException {
      super.writeMiniBatchOperationsToMemStore(miniBatchOp, getOrigLogSeqNum());
      return writeEntry;
    }

    /** Completes via the superclass, then advances the region's mvcc to the original seq num. */
    @Override
    public void completeMiniBatchOperations(
      final MiniBatchOperationInProgress<Mutation> miniBatchOp, final WriteEntry writeEntry)
      throws IOException {
      super.completeMiniBatchOperations(miniBatchOp, writeEntry);
      region.mvcc.advanceTo(getOrigLogSeqNum());
    }
  }

  /**
   * Runs the given mutations as a {@link MutationBatchOperation} with the given atomicity and
   * nonce.
   * @return one OperationStatus per mutation
   */
  public OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic, long nonceGroup,
    long nonce) throws IOException {
    // As it stands, this is used for 3 things
    // * batchMutate with single mutation - put/delete/increment/append, separate or from
    // checkAndMutate.
    // * coprocessor calls (see ex. BulkDeleteEndpoint).
    // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
    return batchMutate(new MutationBatchOperation(this, mutations, atomic, nonceGroup, nonce));
  }

  @Override
  public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
    // If the mutations has any Increment/Append operations, we need to do batchMutate atomically
    boolean atomic =
      Arrays.stream(mutations).anyMatch(m -> m instanceof Increment || m instanceof Append);
    return batchMutate(mutations, atomic);
  }

  /** Runs the mutations without a nonce, inside a "Region.batchMutate" trace span. */
  OperationStatus[] batchMutate(Mutation[] mutations, boolean atomic) throws IOException {
    return TraceUtil.trace(
      () -> batchMutate(mutations, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE),
      () -> createRegionSpan("Region.batchMutate"));
  }

  /**
   * Replays the given mutations as a {@link ReplayBatchOperation}. On a non-default replica,
   * entries whose <code>replaySeqId</code> is below <code>lastReplayedOpenRegionSeqId</code> are
   * skipped and reported as SUCCESS.
   * @return one OperationStatus per mutation
   */
  public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
    throws IOException {
    if (
      !RegionReplicaUtil.isDefaultReplica(getRegionInfo())
        && replaySeqId < lastReplayedOpenRegionSeqId
    ) {
      // if it is a secondary replica we should ignore these entries silently
      // since they are coming out of order
      if (LOG.isTraceEnabled()) {
        LOG.trace(getRegionInfo().getEncodedName() + " : " + "Skipping " + mutations.length
          + " mutations with replaySeqId=" + replaySeqId
          + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
        for (MutationReplay mut : mutations) {
          LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
        }
      }

      OperationStatus[] statuses = new OperationStatus[mutations.length];
      for (int i = 0; i < statuses.length; i++) {
        statuses[i] = OperationStatus.SUCCESS;
      }
      return statuses;
    }
    return batchMutate(new ReplayBatchOperation(this, mutations, replaySeqId));
  }

  /**
   * Perform a batch of mutations.
* <p/>
   * Operations in a batch are stored with highest durability specified for all operations in a
   * batch, except for {@link Durability#SKIP_WAL}.
   * <p/>
   * This function is called from {@link #batchReplay(WALSplitUtil.MutationReplay[], long)} with
   * {@link ReplayBatchOperation} instance and {@link #batchMutate(Mutation[])} with
   * {@link MutationBatchOperation} instance as an argument. As the processing of replay batch and
   * mutation batch is very similar, lot of code is shared by providing generic methods in base
   * class {@link BatchOperation}. The logic for this method and
   * {@link #doMiniBatchMutate(BatchOperation)} is implemented using methods in base class which are
   * overridden by derived classes to implement special behavior.
   * @param batchOp contains the list of mutations
   * @return an array of OperationStatus which internally contains the OperationStatusCode and the
   *         exceptionMessage if any.
   * @throws IOException if an IO problem is encountered
   */
  private OperationStatus[] batchMutate(BatchOperation<?> batchOp) throws IOException {
    // Guards the one-time accounting and checkAndPrepare() call inside the loop.
    boolean initialized = false;
    batchOp.startRegionOperation();
    try {
      // Each iteration processes one mini-batch until the whole batch is done.
      while (!batchOp.isDone()) {
        // The read-only check is skipped for replay.
        if (!batchOp.isInReplay()) {
          checkReadOnly();
        }
        checkResources();

        if (!initialized) {
          this.writeRequestsCount.add(batchOp.size());
          // validate and prepare batch for write, for MutationBatchOperation it also calls CP
          // prePut()/preDelete()/preIncrement()/preAppend() hooks
          batchOp.checkAndPrepare();
          initialized = true;
        }
        doMiniBatchMutate(batchOp);
        requestFlushIfNeeded();
      }
    } finally {
      // Runs on both success and failure: update the write meter, then close the region
      // operation started above.
      if (rsServices != null && rsServices.getMetrics() != null) {
        rsServices.getMetrics().updateWriteQueryMeter(this, batchOp.size());
      }
      batchOp.closeRegionOperation();
    }
    return batchOp.retCodeDetails;
  }

  /**
   * Called to do a
piece of the batch that came in to {@link #batchMutate(Mutation[])} In here we 4545 * also handle replay of edits on region recover. Also gets change in size brought about by 4546 * applying {@code batchOp}. 4547 */ 4548 private void doMiniBatchMutate(BatchOperation<?> batchOp) throws IOException { 4549 boolean success = false; 4550 WALEdit walEdit = null; 4551 WriteEntry writeEntry = null; 4552 boolean locked = false; 4553 // We try to set up a batch in the range [batchOp.nextIndexToProcess,lastIndexExclusive) 4554 MiniBatchOperationInProgress<Mutation> miniBatchOp = null; 4555 /** Keep track of the locks we hold so we can release them in finally clause */ 4556 List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.size()); 4557 4558 // Check for thread interrupt status in case we have been signaled from 4559 // #interruptRegionOperation. 4560 checkInterrupt(); 4561 4562 try { 4563 // STEP 1. Try to acquire as many locks as we can and build mini-batch of operations with 4564 // locked rows 4565 miniBatchOp = batchOp.lockRowsAndBuildMiniBatch(acquiredRowLocks); 4566 4567 // We've now grabbed as many mutations off the list as we can 4568 // Ensure we acquire at least one. 4569 if (miniBatchOp.getReadyToWriteCount() <= 0) { 4570 // Nothing to put/delete/increment/append -- an exception in the above such as 4571 // NoSuchColumnFamily? 4572 return; 4573 } 4574 4575 // Check for thread interrupt status in case we have been signaled from 4576 // #interruptRegionOperation. Do it before we take the lock and disable interrupts for 4577 // the WAL append. 4578 checkInterrupt(); 4579 4580 lock(this.updatesLock.readLock(), miniBatchOp.getReadyToWriteCount()); 4581 locked = true; 4582 4583 // From this point until memstore update this operation should not be interrupted. 4584 disableInterrupts(); 4585 4586 // STEP 2. 
Update mini batch of all operations in progress with LATEST_TIMESTAMP timestamp 4587 // We should record the timestamp only after we have acquired the rowLock, 4588 // otherwise, newer puts/deletes/increment/append are not guaranteed to have a newer 4589 // timestamp 4590 4591 long now = EnvironmentEdgeManager.currentTime(); 4592 batchOp.prepareMiniBatchOperations(miniBatchOp, now, acquiredRowLocks); 4593 4594 // STEP 3. Build WAL edit 4595 4596 List<Pair<NonceKey, WALEdit>> walEdits = batchOp.buildWALEdits(miniBatchOp); 4597 4598 // STEP 4. Append the WALEdits to WAL and sync. 4599 4600 for (Iterator<Pair<NonceKey, WALEdit>> it = walEdits.iterator(); it.hasNext();) { 4601 Pair<NonceKey, WALEdit> nonceKeyWALEditPair = it.next(); 4602 walEdit = nonceKeyWALEditPair.getSecond(); 4603 NonceKey nonceKey = nonceKeyWALEditPair.getFirst(); 4604 4605 if (walEdit != null && !walEdit.isEmpty()) { 4606 writeEntry = doWALAppend(walEdit, batchOp.durability, batchOp.getClusterIds(), now, 4607 nonceKey.getNonceGroup(), nonceKey.getNonce(), batchOp.getOrigLogSeqNum()); 4608 } 4609 4610 // Complete mvcc for all but last writeEntry (for replay case) 4611 if (it.hasNext() && writeEntry != null) { 4612 mvcc.complete(writeEntry); 4613 writeEntry = null; 4614 } 4615 } 4616 4617 // STEP 5. Write back to memStore 4618 // NOTE: writeEntry can be null here 4619 writeEntry = batchOp.writeMiniBatchOperationsToMemStore(miniBatchOp, writeEntry); 4620 4621 // STEP 6. 
Complete MiniBatchOperations: If required calls postBatchMutate() CP hook and 4622 // complete mvcc for last writeEntry 4623 batchOp.completeMiniBatchOperations(miniBatchOp, writeEntry); 4624 writeEntry = null; 4625 success = true; 4626 } finally { 4627 // Call complete rather than completeAndWait because we probably had error if walKey != null 4628 if (writeEntry != null) mvcc.complete(writeEntry); 4629 4630 if (locked) { 4631 this.updatesLock.readLock().unlock(); 4632 } 4633 releaseRowLocks(acquiredRowLocks); 4634 4635 enableInterrupts(); 4636 4637 final int finalLastIndexExclusive = 4638 miniBatchOp != null ? miniBatchOp.getLastIndexExclusive() : batchOp.size(); 4639 final boolean finalSuccess = success; 4640 batchOp.visitBatchOperations(true, finalLastIndexExclusive, (int i) -> { 4641 Mutation mutation = batchOp.getMutation(i); 4642 if (mutation instanceof Increment || mutation instanceof Append) { 4643 if (finalSuccess) { 4644 batchOp.retCodeDetails[i] = 4645 new OperationStatus(OperationStatusCode.SUCCESS, batchOp.results[i]); 4646 } else { 4647 batchOp.retCodeDetails[i] = OperationStatus.FAILURE; 4648 } 4649 } else { 4650 batchOp.retCodeDetails[i] = 4651 finalSuccess ? OperationStatus.SUCCESS : OperationStatus.FAILURE; 4652 } 4653 return true; 4654 }); 4655 4656 batchOp.doPostOpCleanupForMiniBatch(miniBatchOp, walEdit, finalSuccess); 4657 4658 batchOp.nextIndexToProcess = finalLastIndexExclusive; 4659 } 4660 } 4661 4662 /** 4663 * Returns effective durability from the passed durability and the table descriptor. 4664 */ 4665 private Durability getEffectiveDurability(Durability d) { 4666 return d == Durability.USE_DEFAULT ? 
this.regionDurability : d; 4667 } 4668 4669 @Override 4670 @Deprecated 4671 public boolean checkAndMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, 4672 ByteArrayComparable comparator, TimeRange timeRange, Mutation mutation) throws IOException { 4673 CheckAndMutate checkAndMutate; 4674 try { 4675 CheckAndMutate.Builder builder = CheckAndMutate.newBuilder(row) 4676 .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange); 4677 if (mutation instanceof Put) { 4678 checkAndMutate = builder.build((Put) mutation); 4679 } else if (mutation instanceof Delete) { 4680 checkAndMutate = builder.build((Delete) mutation); 4681 } else { 4682 throw new DoNotRetryIOException( 4683 "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); 4684 } 4685 } catch (IllegalArgumentException e) { 4686 throw new DoNotRetryIOException(e.getMessage()); 4687 } 4688 return checkAndMutate(checkAndMutate).isSuccess(); 4689 } 4690 4691 @Override 4692 @Deprecated 4693 public boolean checkAndMutate(byte[] row, Filter filter, TimeRange timeRange, Mutation mutation) 4694 throws IOException { 4695 CheckAndMutate checkAndMutate; 4696 try { 4697 CheckAndMutate.Builder builder = 4698 CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange); 4699 if (mutation instanceof Put) { 4700 checkAndMutate = builder.build((Put) mutation); 4701 } else if (mutation instanceof Delete) { 4702 checkAndMutate = builder.build((Delete) mutation); 4703 } else { 4704 throw new DoNotRetryIOException( 4705 "Unsupported mutate type: " + mutation.getClass().getSimpleName().toUpperCase()); 4706 } 4707 } catch (IllegalArgumentException e) { 4708 throw new DoNotRetryIOException(e.getMessage()); 4709 } 4710 return checkAndMutate(checkAndMutate).isSuccess(); 4711 } 4712 4713 @Override 4714 @Deprecated 4715 public boolean checkAndRowMutate(byte[] row, byte[] family, byte[] qualifier, CompareOperator op, 4716 ByteArrayComparable comparator, TimeRange 
timeRange, RowMutations rm) throws IOException { 4717 CheckAndMutate checkAndMutate; 4718 try { 4719 checkAndMutate = CheckAndMutate.newBuilder(row) 4720 .ifMatches(family, qualifier, op, comparator.getValue()).timeRange(timeRange).build(rm); 4721 } catch (IllegalArgumentException e) { 4722 throw new DoNotRetryIOException(e.getMessage()); 4723 } 4724 return checkAndMutate(checkAndMutate).isSuccess(); 4725 } 4726 4727 @Override 4728 @Deprecated 4729 public boolean checkAndRowMutate(byte[] row, Filter filter, TimeRange timeRange, RowMutations rm) 4730 throws IOException { 4731 CheckAndMutate checkAndMutate; 4732 try { 4733 checkAndMutate = 4734 CheckAndMutate.newBuilder(row).ifMatches(filter).timeRange(timeRange).build(rm); 4735 } catch (IllegalArgumentException e) { 4736 throw new DoNotRetryIOException(e.getMessage()); 4737 } 4738 return checkAndMutate(checkAndMutate).isSuccess(); 4739 } 4740 4741 @Override 4742 public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate) throws IOException { 4743 return checkAndMutate(checkAndMutate, HConstants.NO_NONCE, HConstants.NO_NONCE); 4744 } 4745 4746 public CheckAndMutateResult checkAndMutate(CheckAndMutate checkAndMutate, long nonceGroup, 4747 long nonce) throws IOException { 4748 return TraceUtil.trace(() -> checkAndMutateInternal(checkAndMutate, nonceGroup, nonce), 4749 () -> createRegionSpan("Region.checkAndMutate")); 4750 } 4751 4752 private CheckAndMutateResult checkAndMutateInternal(CheckAndMutate checkAndMutate, 4753 long nonceGroup, long nonce) throws IOException { 4754 byte[] row = checkAndMutate.getRow(); 4755 Filter filter = null; 4756 byte[] family = null; 4757 byte[] qualifier = null; 4758 CompareOperator op = null; 4759 ByteArrayComparable comparator = null; 4760 if (checkAndMutate.hasFilter()) { 4761 filter = checkAndMutate.getFilter(); 4762 } else { 4763 family = checkAndMutate.getFamily(); 4764 qualifier = checkAndMutate.getQualifier(); 4765 op = checkAndMutate.getCompareOp(); 4766 comparator 
= new BinaryComparator(checkAndMutate.getValue()); 4767 } 4768 TimeRange timeRange = checkAndMutate.getTimeRange(); 4769 4770 Mutation mutation = null; 4771 RowMutations rowMutations = null; 4772 if (checkAndMutate.getAction() instanceof Mutation) { 4773 mutation = (Mutation) checkAndMutate.getAction(); 4774 } else { 4775 rowMutations = (RowMutations) checkAndMutate.getAction(); 4776 } 4777 4778 if (mutation != null) { 4779 checkMutationType(mutation); 4780 checkRow(mutation, row); 4781 } else { 4782 checkRow(rowMutations, row); 4783 } 4784 checkReadOnly(); 4785 // TODO, add check for value length also move this check to the client 4786 checkResources(); 4787 startRegionOperation(); 4788 try { 4789 Get get = new Get(row); 4790 if (family != null) { 4791 checkFamily(family); 4792 get.addColumn(family, qualifier); 4793 } 4794 if (filter != null) { 4795 get.setFilter(filter); 4796 } 4797 if (timeRange != null) { 4798 get.setTimeRange(timeRange.getMin(), timeRange.getMax()); 4799 } 4800 // Lock row - note that doBatchMutate will relock this row if called 4801 checkRow(row, "doCheckAndRowMutate"); 4802 RowLock rowLock = getRowLock(get.getRow(), false, null); 4803 try { 4804 if (this.getCoprocessorHost() != null) { 4805 CheckAndMutateResult result = 4806 getCoprocessorHost().preCheckAndMutateAfterRowLock(checkAndMutate); 4807 if (result != null) { 4808 return result; 4809 } 4810 } 4811 4812 // NOTE: We used to wait here until mvcc caught up: mvcc.await(); 4813 // Supposition is that now all changes are done under row locks, then when we go to read, 4814 // we'll get the latest on this row. 4815 boolean matches = false; 4816 long cellTs = 0; 4817 try (RegionScanner scanner = getScanner(new Scan(get))) { 4818 // NOTE: Please don't use HRegion.get() instead, 4819 // because it will copy cells to heap. 
See HBASE-26036 4820 List<Cell> result = new ArrayList<>(1); 4821 scanner.next(result); 4822 if (filter != null) { 4823 if (!result.isEmpty()) { 4824 matches = true; 4825 cellTs = result.get(0).getTimestamp(); 4826 } 4827 } else { 4828 boolean valueIsNull = 4829 comparator.getValue() == null || comparator.getValue().length == 0; 4830 if (result.isEmpty() && valueIsNull) { 4831 matches = op != CompareOperator.NOT_EQUAL; 4832 } else if (result.size() > 0 && valueIsNull) { 4833 matches = (result.get(0).getValueLength() == 0) == (op != CompareOperator.NOT_EQUAL); 4834 cellTs = result.get(0).getTimestamp(); 4835 } else if (result.size() == 1) { 4836 Cell kv = result.get(0); 4837 cellTs = kv.getTimestamp(); 4838 int compareResult = PrivateCellUtil.compareValue(kv, comparator); 4839 matches = matches(op, compareResult); 4840 } 4841 } 4842 } 4843 4844 // If matches, perform the mutation or the rowMutations 4845 if (matches) { 4846 // We have acquired the row lock already. If the system clock is NOT monotonically 4847 // non-decreasing (see HBASE-14070) we should make sure that the mutation has a 4848 // larger timestamp than what was observed via Get. doBatchMutate already does this, but 4849 // there is no way to pass the cellTs. See HBASE-14054. 4850 long now = EnvironmentEdgeManager.currentTime(); 4851 long ts = Math.max(now, cellTs); // ensure write is not eclipsed 4852 byte[] byteTs = Bytes.toBytes(ts); 4853 if (mutation != null) { 4854 if (mutation instanceof Put) { 4855 updateCellTimestamps(mutation.getFamilyCellMap().values(), byteTs); 4856 } 4857 // And else 'delete' is not needed since it already does a second get, and sets the 4858 // timestamp from get (see prepareDeleteTimestamps). 
4859 } else { 4860 for (Mutation m : rowMutations.getMutations()) { 4861 if (m instanceof Put) { 4862 updateCellTimestamps(m.getFamilyCellMap().values(), byteTs); 4863 } 4864 } 4865 // And else 'delete' is not needed since it already does a second get, and sets the 4866 // timestamp from get (see prepareDeleteTimestamps). 4867 } 4868 // All edits for the given row (across all column families) must happen atomically. 4869 Result r; 4870 if (mutation != null) { 4871 r = mutate(mutation, true, nonceGroup, nonce).getResult(); 4872 } else { 4873 r = mutateRow(rowMutations, nonceGroup, nonce); 4874 } 4875 this.checkAndMutateChecksPassed.increment(); 4876 return new CheckAndMutateResult(true, r); 4877 } 4878 this.checkAndMutateChecksFailed.increment(); 4879 return new CheckAndMutateResult(false, null); 4880 } finally { 4881 rowLock.release(); 4882 } 4883 } finally { 4884 closeRegionOperation(); 4885 } 4886 } 4887 4888 private void checkMutationType(final Mutation mutation) throws DoNotRetryIOException { 4889 if ( 4890 !(mutation instanceof Put) && !(mutation instanceof Delete) 4891 && !(mutation instanceof Increment) && !(mutation instanceof Append) 4892 ) { 4893 throw new org.apache.hadoop.hbase.DoNotRetryIOException( 4894 "Action must be Put or Delete or Increment or Delete"); 4895 } 4896 } 4897 4898 private void checkRow(final Row action, final byte[] row) throws DoNotRetryIOException { 4899 if (!Bytes.equals(row, action.getRow())) { 4900 throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's getRow must match"); 4901 } 4902 } 4903 4904 private boolean matches(final CompareOperator op, final int compareResult) { 4905 boolean matches = false; 4906 switch (op) { 4907 case LESS: 4908 matches = compareResult < 0; 4909 break; 4910 case LESS_OR_EQUAL: 4911 matches = compareResult <= 0; 4912 break; 4913 case EQUAL: 4914 matches = compareResult == 0; 4915 break; 4916 case NOT_EQUAL: 4917 matches = compareResult != 0; 4918 break; 4919 case GREATER_OR_EQUAL: 4920 
matches = compareResult >= 0; 4921 break; 4922 case GREATER: 4923 matches = compareResult > 0; 4924 break; 4925 default: 4926 throw new RuntimeException("Unknown Compare op " + op.name()); 4927 } 4928 return matches; 4929 } 4930 4931 private OperationStatus mutate(Mutation mutation) throws IOException { 4932 return mutate(mutation, false); 4933 } 4934 4935 private OperationStatus mutate(Mutation mutation, boolean atomic) throws IOException { 4936 return mutate(mutation, atomic, HConstants.NO_NONCE, HConstants.NO_NONCE); 4937 } 4938 4939 private OperationStatus mutate(Mutation mutation, boolean atomic, long nonceGroup, long nonce) 4940 throws IOException { 4941 OperationStatus[] status = 4942 this.batchMutate(new Mutation[] { mutation }, atomic, nonceGroup, nonce); 4943 if (status[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) { 4944 throw new FailedSanityCheckException(status[0].getExceptionMsg()); 4945 } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) { 4946 throw new NoSuchColumnFamilyException(status[0].getExceptionMsg()); 4947 } else if (status[0].getOperationStatusCode().equals(OperationStatusCode.STORE_TOO_BUSY)) { 4948 throw new RegionTooBusyException(status[0].getExceptionMsg()); 4949 } 4950 return status[0]; 4951 } 4952 4953 /** 4954 * Complete taking the snapshot on the region. Writes the region info and adds references to the 4955 * working snapshot directory. TODO for api consistency, consider adding another version with no 4956 * {@link ForeignExceptionSnare} arg. (In the future other cancellable HRegion methods could 4957 * eventually add a {@link ForeignExceptionSnare}, or we could do something fancier). 4958 * @param desc snapshot description object 4959 * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to bail 4960 * out. This is allowed to be null and will just be ignored in that case. 
4961 * @throws IOException if there is an external or internal error causing the snapshot to fail 4962 */ 4963 public void addRegionToSnapshot(SnapshotDescription desc, ForeignExceptionSnare exnSnare) 4964 throws IOException { 4965 Path rootDir = CommonFSUtils.getRootDir(conf); 4966 Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir, conf); 4967 4968 SnapshotManifest manifest = 4969 SnapshotManifest.create(conf, getFilesystem(), snapshotDir, desc, exnSnare); 4970 manifest.addRegion(this); 4971 } 4972 4973 private void updateSequenceId(final Iterable<List<Cell>> cellItr, final long sequenceId) 4974 throws IOException { 4975 for (List<Cell> cells : cellItr) { 4976 if (cells == null) return; 4977 for (Cell cell : cells) { 4978 PrivateCellUtil.setSequenceId(cell, sequenceId); 4979 } 4980 } 4981 } 4982 4983 /** 4984 * Replace any cell timestamps set to {@link org.apache.hadoop.hbase.HConstants#LATEST_TIMESTAMP} 4985 * provided current timestamp. 4986 */ 4987 private static void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now) 4988 throws IOException { 4989 for (List<Cell> cells : cellItr) { 4990 if (cells == null) continue; 4991 // Optimization: 'foreach' loop is not used. See: 4992 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 4993 assert cells instanceof RandomAccess; 4994 int listSize = cells.size(); 4995 for (int i = 0; i < listSize; i++) { 4996 PrivateCellUtil.updateLatestStamp(cells.get(i), now); 4997 } 4998 } 4999 } 5000 5001 /** 5002 * Possibly rewrite incoming cell tags. 
5003 */ 5004 private void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) { 5005 // Check if we have any work to do and early out otherwise 5006 // Update these checks as more logic is added here 5007 if (m.getTTL() == Long.MAX_VALUE) { 5008 return; 5009 } 5010 5011 // From this point we know we have some work to do 5012 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 5013 List<Cell> cells = e.getValue(); 5014 assert cells instanceof RandomAccess; 5015 int listSize = cells.size(); 5016 for (int i = 0; i < listSize; i++) { 5017 Cell cell = cells.get(i); 5018 List<Tag> newTags = TagUtil.carryForwardTags(null, cell); 5019 newTags = TagUtil.carryForwardTTLTag(newTags, m.getTTL()); 5020 // Rewrite the cell with the updated set of tags 5021 cells.set(i, PrivateCellUtil.createCell(cell, newTags)); 5022 } 5023 } 5024 } 5025 5026 /** 5027 * Check if resources to support an update. 5028 * <p/> 5029 * We throw RegionTooBusyException if above memstore limit and expect client to retry using some 5030 * kind of backoff 5031 */ 5032 private void checkResources() throws RegionTooBusyException { 5033 // If catalog region, do not impose resource constraints or block updates. 5034 if (this.getRegionInfo().isMetaRegion()) { 5035 return; 5036 } 5037 5038 MemStoreSize mss = this.memStoreSizing.getMemStoreSize(); 5039 if (mss.getHeapSize() + mss.getOffHeapSize() > this.blockingMemStoreSize) { 5040 blockedRequestsCount.increment(); 5041 requestFlush(); 5042 // Don't print current limit because it will vary too much. The message is used as a key 5043 // over in RetriesExhaustedWithDetailsException processing. 5044 final String regionName = 5045 this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getEncodedName(); 5046 final String serverName = this.getRegionServerServices() == null 5047 ? "unknown" 5048 : (this.getRegionServerServices().getServerName() == null 5049 ? 
"unknown" 5050 : this.getRegionServerServices().getServerName().toString()); 5051 RegionTooBusyException rtbe = new RegionTooBusyException("Over memstore limit=" 5052 + org.apache.hadoop.hbase.procedure2.util.StringUtils.humanSize(this.blockingMemStoreSize) 5053 + ", regionName=" + regionName + ", server=" + serverName); 5054 LOG.warn("Region is too busy due to exceeding memstore size limit.", rtbe); 5055 throw rtbe; 5056 } 5057 } 5058 5059 /** 5060 * @throws IOException Throws exception if region is in read-only mode. 5061 */ 5062 private void checkReadOnly() throws IOException { 5063 if (isReadOnly()) { 5064 throw new DoNotRetryIOException("region is read only"); 5065 } 5066 } 5067 5068 private void checkReadsEnabled() throws IOException { 5069 if (!this.writestate.readsEnabled) { 5070 throw new IOException(getRegionInfo().getEncodedName() 5071 + ": The region's reads are disabled. Cannot serve the request"); 5072 } 5073 } 5074 5075 public void setReadsEnabled(boolean readsEnabled) { 5076 if (readsEnabled && !this.writestate.readsEnabled) { 5077 LOG.info("Enabling reads for {}", getRegionInfo().getEncodedName()); 5078 } 5079 this.writestate.setReadsEnabled(readsEnabled); 5080 } 5081 5082 /** 5083 * @param delta If we are doing delta changes -- e.g. increment/append -- then this flag will be 5084 * set; when set we will run operations that make sense in the increment/append 5085 * scenario but that do not make sense otherwise. 5086 * @see #applyToMemStore(HStore, Cell, MemStoreSizing) 5087 */ 5088 private void applyToMemStore(HStore store, List<Cell> cells, boolean delta, 5089 MemStoreSizing memstoreAccounting) { 5090 // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! 
5091 boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1; 5092 if (upsert) { 5093 store.upsert(cells, getSmallestReadPoint(), memstoreAccounting); 5094 } else { 5095 store.add(cells, memstoreAccounting); 5096 } 5097 } 5098 5099 /** 5100 * @see #applyToMemStore(HStore, List, boolean, MemStoreSizing) 5101 */ 5102 private void applyToMemStore(HStore store, Cell cell, MemStoreSizing memstoreAccounting) 5103 throws IOException { 5104 // Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!! 5105 if (store == null) { 5106 checkFamily(CellUtil.cloneFamily(cell)); 5107 // Unreachable because checkFamily will throw exception 5108 } 5109 store.add(cell, memstoreAccounting); 5110 } 5111 5112 /** 5113 * Check the collection of families for validity. 5114 */ 5115 public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException { 5116 for (byte[] family : families) { 5117 checkFamily(family); 5118 } 5119 } 5120 5121 /** 5122 * Check the collection of families for valid timestamps 5123 * @param now current timestamp 5124 */ 5125 public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now) 5126 throws FailedSanityCheckException { 5127 if (timestampSlop == HConstants.LATEST_TIMESTAMP) { 5128 return; 5129 } 5130 long maxTs = now + timestampSlop; 5131 for (List<Cell> kvs : familyMap.values()) { 5132 // Optimization: 'foreach' loop is not used. See: 5133 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 5134 assert kvs instanceof RandomAccess; 5135 int listSize = kvs.size(); 5136 for (int i = 0; i < listSize; i++) { 5137 Cell cell = kvs.get(i); 5138 // see if the user-side TS is out of range. 
latest = server-side 5139 long ts = cell.getTimestamp(); 5140 if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) { 5141 throw new FailedSanityCheckException( 5142 "Timestamp for KV out of range " + cell + " (too.new=" + timestampSlop + ")"); 5143 } 5144 } 5145 } 5146 } 5147 5148 /* 5149 * @return True if size is over the flush threshold 5150 */ 5151 private boolean isFlushSize(MemStoreSize size) { 5152 return size.getHeapSize() + size.getOffHeapSize() > getMemStoreFlushSize(); 5153 } 5154 5155 private void deleteRecoveredEdits(FileSystem fs, Iterable<Path> files) throws IOException { 5156 for (Path file : files) { 5157 if (!fs.delete(file, false)) { 5158 LOG.error("Failed delete of {}", file); 5159 } else { 5160 LOG.debug("Deleted recovered.edits file={}", file); 5161 } 5162 } 5163 } 5164 5165 /** 5166 * Read the edits put under this region by wal splitting process. Put the recovered edits back up 5167 * into this region. 5168 * <p> 5169 * We can ignore any wal message that has a sequence ID that's equal to or lower than minSeqId. 5170 * (Because we know such messages are already reflected in the HFiles.) 5171 * <p> 5172 * While this is running we are putting pressure on memory yet we are outside of our usual 5173 * accounting because we are not yet an onlined region (this stuff is being run as part of Region 5174 * initialization). This means that if we're up against global memory limits, we'll not be flagged 5175 * to flush because we are not online. We can't be flushed by usual mechanisms anyways; we're not 5176 * yet online so our relative sequenceids are not yet aligned with WAL sequenceids -- not till we 5177 * come up online, post processing of split edits. 5178 * <p> 5179 * But to help relieve memory pressure, at least manage our own heap size flushing if are in 5180 * excess of per-region limits. Flushing, though, we have to be careful and avoid using the 5181 * regionserver/wal sequenceid. 
Its running on a different line to whats going on in here in this 5182 * region context so if we crashed replaying these edits, but in the midst had a flush that used 5183 * the regionserver wal with a sequenceid in excess of whats going on in here in this region and 5184 * with its split editlogs, then we could miss edits the next time we go to recover. So, we have 5185 * to flush inline, using seqids that make sense in a this single region context only -- until we 5186 * online. 5187 * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of the maxSeqId 5188 * for the store to be applied, else its skipped. 5189 * @return the sequence id of the last edit added to this region out of the recovered edits log or 5190 * <code>minSeqId</code> if nothing added from editlogs. 5191 */ 5192 long replayRecoveredEditsIfAny(Map<byte[], Long> maxSeqIdInStores, 5193 final CancelableProgressable reporter, final MonitoredTask status) throws IOException { 5194 long minSeqIdForTheRegion = -1; 5195 for (Long maxSeqIdInStore : maxSeqIdInStores.values()) { 5196 if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) { 5197 minSeqIdForTheRegion = maxSeqIdInStore; 5198 } 5199 } 5200 long seqId = minSeqIdForTheRegion; 5201 String specialRecoveredEditsDirStr = conf.get(SPECIAL_RECOVERED_EDITS_DIR); 5202 if (org.apache.commons.lang3.StringUtils.isBlank(specialRecoveredEditsDirStr)) { 5203 FileSystem walFS = getWalFileSystem(); 5204 FileSystem rootFS = getFilesystem(); 5205 Path wrongRegionWALDir = CommonFSUtils.getWrongWALRegionDir(conf, getRegionInfo().getTable(), 5206 getRegionInfo().getEncodedName()); 5207 Path regionWALDir = getWALRegionDir(); 5208 Path regionDir = 5209 FSUtils.getRegionDirFromRootDir(CommonFSUtils.getRootDir(conf), getRegionInfo()); 5210 5211 // We made a mistake in HBASE-20734 so we need to do this dirty hack... 
5212 NavigableSet<Path> filesUnderWrongRegionWALDir = 5213 WALSplitUtil.getSplitEditFilesSorted(walFS, wrongRegionWALDir); 5214 seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, 5215 filesUnderWrongRegionWALDir, reporter, regionDir)); 5216 // This is to ensure backwards compatability with HBASE-20723 where recovered edits can appear 5217 // under the root dir even if walDir is set. 5218 NavigableSet<Path> filesUnderRootDir = Collections.emptyNavigableSet(); 5219 if (!regionWALDir.equals(regionDir)) { 5220 filesUnderRootDir = WALSplitUtil.getSplitEditFilesSorted(rootFS, regionDir); 5221 seqId = Math.max(seqId, replayRecoveredEditsForPaths(minSeqIdForTheRegion, rootFS, 5222 filesUnderRootDir, reporter, regionDir)); 5223 } 5224 5225 NavigableSet<Path> files = WALSplitUtil.getSplitEditFilesSorted(walFS, regionWALDir); 5226 seqId = Math.max(seqId, 5227 replayRecoveredEditsForPaths(minSeqIdForTheRegion, walFS, files, reporter, regionWALDir)); 5228 if (seqId > minSeqIdForTheRegion) { 5229 // Then we added some edits to memory. Flush and cleanup split edit files. 5230 internalFlushcache(null, seqId, stores.values(), status, false, 5231 FlushLifeCycleTracker.DUMMY); 5232 } 5233 // Now delete the content of recovered edits. We're done w/ them. 5234 if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) { 5235 // For debugging data loss issues! 5236 // If this flag is set, make use of the hfile archiving by making recovered.edits a fake 5237 // column family. 
Have to fake out file type too by casting our recovered.edits as 5238 // storefiles 5239 String fakeFamilyName = WALSplitUtil.getRegionDirRecoveredEditsDir(regionWALDir).getName(); 5240 Set<HStoreFile> fakeStoreFiles = new HashSet<>(files.size()); 5241 for (Path file : files) { 5242 fakeStoreFiles.add(new HStoreFile(walFS, file, this.conf, null, null, true)); 5243 } 5244 getRegionWALFileSystem().archiveRecoveredEdits(fakeFamilyName, fakeStoreFiles); 5245 } else { 5246 deleteRecoveredEdits(walFS, Iterables.concat(files, filesUnderWrongRegionWALDir)); 5247 deleteRecoveredEdits(rootFS, filesUnderRootDir); 5248 } 5249 } else { 5250 Path recoveredEditsDir = new Path(specialRecoveredEditsDirStr); 5251 FileSystem fs = recoveredEditsDir.getFileSystem(conf); 5252 FileStatus[] files = fs.listStatus(recoveredEditsDir); 5253 LOG.debug("Found {} recovered edits file(s) under {}", files == null ? 0 : files.length, 5254 recoveredEditsDir); 5255 if (files != null) { 5256 for (FileStatus file : files) { 5257 // it is safe to trust the zero-length in this case because we've been through rename and 5258 // lease recovery in the above. 5259 if (isZeroLengthThenDelete(fs, file, file.getPath())) { 5260 continue; 5261 } 5262 seqId = 5263 Math.max(seqId, replayRecoveredEdits(file.getPath(), maxSeqIdInStores, reporter, fs)); 5264 } 5265 } 5266 if (seqId > minSeqIdForTheRegion) { 5267 // Then we added some edits to memory. Flush and cleanup split edit files. 
5268 internalFlushcache(null, seqId, stores.values(), status, false, 5269 FlushLifeCycleTracker.DUMMY); 5270 } 5271 deleteRecoveredEdits(fs, 5272 Stream.of(files).map(FileStatus::getPath).collect(Collectors.toList())); 5273 } 5274 5275 return seqId; 5276 } 5277 5278 private long replayRecoveredEditsForPaths(long minSeqIdForTheRegion, FileSystem fs, 5279 final NavigableSet<Path> files, final CancelableProgressable reporter, final Path regionDir) 5280 throws IOException { 5281 long seqid = minSeqIdForTheRegion; 5282 if (LOG.isDebugEnabled()) { 5283 LOG.debug("Found " + (files == null ? 0 : files.size()) + " recovered edits file(s) under " 5284 + regionDir); 5285 } 5286 5287 if (files == null || files.isEmpty()) { 5288 return minSeqIdForTheRegion; 5289 } 5290 5291 for (Path edits : files) { 5292 if (edits == null || !fs.exists(edits)) { 5293 LOG.warn("Null or non-existent edits file: " + edits); 5294 continue; 5295 } 5296 if (isZeroLengthThenDelete(fs, fs.getFileStatus(edits), edits)) { 5297 continue; 5298 } 5299 5300 long maxSeqId; 5301 String fileName = edits.getName(); 5302 maxSeqId = Math.abs(Long.parseLong(fileName)); 5303 if (maxSeqId <= minSeqIdForTheRegion) { 5304 if (LOG.isDebugEnabled()) { 5305 String msg = "Maximum sequenceid for this wal is " + maxSeqId 5306 + " and minimum sequenceid for the region " + this + " is " + minSeqIdForTheRegion 5307 + ", skipped the whole file, path=" + edits; 5308 LOG.debug(msg); 5309 } 5310 continue; 5311 } 5312 5313 try { 5314 // replay the edits. 
Replay can return -1 if everything is skipped, only update 5315 // if seqId is greater 5316 seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter, fs)); 5317 } catch (IOException e) { 5318 handleException(fs, edits, e); 5319 } 5320 } 5321 return seqid; 5322 } 5323 5324 private void handleException(FileSystem fs, Path edits, IOException e) throws IOException { 5325 boolean skipErrors = conf.getBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, 5326 conf.getBoolean("hbase.skip.errors", HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS)); 5327 if (conf.get("hbase.skip.errors") != null) { 5328 LOG.warn("The property 'hbase.skip.errors' has been deprecated. Please use " 5329 + HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead."); 5330 } 5331 if (skipErrors) { 5332 Path p = WALSplitUtil.moveAsideBadEditsFile(fs, edits); 5333 LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + "=true so continuing. Renamed " 5334 + edits + " as " + p, e); 5335 } else { 5336 throw e; 5337 } 5338 } 5339 5340 /** 5341 * @param edits File of recovered edits. 5342 * @param maxSeqIdInStores Maximum sequenceid found in each store. Edits in wal must be larger 5343 * than this to be replayed for each store. 5344 * @return the sequence id of the last edit added to this region out of the recovered edits log or 5345 * <code>minSeqId</code> if nothing added from editlogs. 
5346 */ 5347 private long replayRecoveredEdits(final Path edits, Map<byte[], Long> maxSeqIdInStores, 5348 final CancelableProgressable reporter, FileSystem fs) throws IOException { 5349 String msg = "Replaying edits from " + edits; 5350 LOG.info(msg); 5351 MonitoredTask status = TaskMonitor.get().createStatus(msg); 5352 5353 status.setStatus("Opening recovered edits"); 5354 try (WALStreamReader reader = WALFactory.createStreamReader(fs, edits, conf)) { 5355 long currentEditSeqId = -1; 5356 long currentReplaySeqId = -1; 5357 long firstSeqIdInLog = -1; 5358 long skippedEdits = 0; 5359 long editsCount = 0; 5360 long intervalEdits = 0; 5361 WAL.Entry entry; 5362 HStore store = null; 5363 boolean reported_once = false; 5364 ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager(); 5365 5366 try { 5367 // How many edits seen before we check elapsed time 5368 int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000); 5369 // How often to send a progress report (default 1/2 master timeout) 5370 int period = this.conf.getInt("hbase.hstore.report.period", 300000); 5371 long lastReport = EnvironmentEdgeManager.currentTime(); 5372 5373 if (coprocessorHost != null) { 5374 coprocessorHost.preReplayWALs(this.getRegionInfo(), edits); 5375 } 5376 5377 while ((entry = reader.next()) != null) { 5378 WALKey key = entry.getKey(); 5379 WALEdit val = entry.getEdit(); 5380 5381 if (ng != null) { // some test, or nonces disabled 5382 ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime()); 5383 } 5384 5385 if (reporter != null) { 5386 intervalEdits += val.size(); 5387 if (intervalEdits >= interval) { 5388 // Number of edits interval reached 5389 intervalEdits = 0; 5390 long cur = EnvironmentEdgeManager.currentTime(); 5391 if (lastReport + period <= cur) { 5392 status.setStatus( 5393 "Replaying edits..." 
+ " skipped=" + skippedEdits + " edits=" + editsCount); 5394 // Timeout reached 5395 if (!reporter.progress()) { 5396 msg = "Progressable reporter failed, stopping replay for region " + this; 5397 LOG.warn(msg); 5398 status.abort(msg); 5399 throw new IOException(msg); 5400 } 5401 reported_once = true; 5402 lastReport = cur; 5403 } 5404 } 5405 } 5406 5407 if (firstSeqIdInLog == -1) { 5408 firstSeqIdInLog = key.getSequenceId(); 5409 } 5410 if (currentEditSeqId > key.getSequenceId()) { 5411 // when this condition is true, it means we have a serious defect because we need to 5412 // maintain increasing SeqId for WAL edits per region 5413 LOG.error(getRegionInfo().getEncodedName() + " : " + "Found decreasing SeqId. PreId=" 5414 + currentEditSeqId + " key=" + key + "; edit=" + val); 5415 } else { 5416 currentEditSeqId = key.getSequenceId(); 5417 } 5418 currentReplaySeqId = 5419 (key.getOrigLogSeqNum() > 0) ? key.getOrigLogSeqNum() : currentEditSeqId; 5420 5421 // Start coprocessor replay here. The coprocessor is for each WALEdit 5422 // instead of a KeyValue. 5423 if (coprocessorHost != null) { 5424 status.setStatus("Running pre-WAL-restore hook in coprocessors"); 5425 if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) { 5426 // if bypass this wal entry, ignore it ... 5427 continue; 5428 } 5429 } 5430 boolean checkRowWithinBoundary = false; 5431 // Check this edit is for this region. 5432 if ( 5433 !Bytes.equals(key.getEncodedRegionName(), this.getRegionInfo().getEncodedNameAsBytes()) 5434 ) { 5435 checkRowWithinBoundary = true; 5436 } 5437 5438 boolean flush = false; 5439 MemStoreSizing memStoreSizing = new NonThreadSafeMemStoreSizing(); 5440 for (Cell cell : val.getCells()) { 5441 // Check this edit is for me. 
Also, guard against writing the special 5442 // METACOLUMN info such as HBASE::CACHEFLUSH entries 5443 if (WALEdit.isMetaEditFamily(cell)) { 5444 // if region names don't match, skipp replaying compaction marker 5445 if (!checkRowWithinBoundary) { 5446 // this is a special edit, we should handle it 5447 CompactionDescriptor compaction = WALEdit.getCompaction(cell); 5448 if (compaction != null) { 5449 // replay the compaction 5450 replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE); 5451 } 5452 } 5453 skippedEdits++; 5454 continue; 5455 } 5456 // Figure which store the edit is meant for. 5457 if ( 5458 store == null 5459 || !CellUtil.matchingFamily(cell, store.getColumnFamilyDescriptor().getName()) 5460 ) { 5461 store = getStore(cell); 5462 } 5463 if (store == null) { 5464 // This should never happen. Perhaps schema was changed between 5465 // crash and redeploy? 5466 LOG.warn("No family for cell {} in region {}", cell, this); 5467 skippedEdits++; 5468 continue; 5469 } 5470 if ( 5471 checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(), cell.getRowArray(), 5472 cell.getRowOffset(), cell.getRowLength()) 5473 ) { 5474 LOG.warn("Row of {} is not within region boundary for region {}", cell, this); 5475 skippedEdits++; 5476 continue; 5477 } 5478 // Now, figure if we should skip this edit. 
5479 if ( 5480 key.getSequenceId() 5481 <= maxSeqIdInStores.get(store.getColumnFamilyDescriptor().getName()) 5482 ) { 5483 skippedEdits++; 5484 continue; 5485 } 5486 PrivateCellUtil.setSequenceId(cell, currentReplaySeqId); 5487 5488 restoreEdit(store, cell, memStoreSizing); 5489 editsCount++; 5490 } 5491 MemStoreSize mss = memStoreSizing.getMemStoreSize(); 5492 incMemStoreSize(mss); 5493 flush = isFlushSize(this.memStoreSizing.getMemStoreSize()); 5494 if (flush) { 5495 internalFlushcache(null, currentEditSeqId, stores.values(), status, false, 5496 FlushLifeCycleTracker.DUMMY); 5497 } 5498 5499 if (coprocessorHost != null) { 5500 coprocessorHost.postWALRestore(this.getRegionInfo(), key, val); 5501 } 5502 } 5503 5504 if (coprocessorHost != null) { 5505 coprocessorHost.postReplayWALs(this.getRegionInfo(), edits); 5506 } 5507 } catch (EOFException eof) { 5508 if (!conf.getBoolean(RECOVERED_EDITS_IGNORE_EOF, false)) { 5509 Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); 5510 msg = "EnLongAddered EOF. Most likely due to Master failure during " 5511 + "wal splitting, so we have this data in another edit. Continuing, but renaming " 5512 + edits + " as " + p + " for region " + this; 5513 LOG.warn(msg, eof); 5514 status.abort(msg); 5515 } else { 5516 LOG.warn("EOF while replaying recover edits and config '{}' is true so " 5517 + "we will ignore it and continue", RECOVERED_EDITS_IGNORE_EOF, eof); 5518 } 5519 } catch (IOException ioe) { 5520 // If the IOE resulted from bad file format, 5521 // then this problem is idempotent and retrying won't help 5522 if (ioe.getCause() instanceof ParseException) { 5523 Path p = WALSplitUtil.moveAsideBadEditsFile(walFS, edits); 5524 msg = 5525 "File corruption enLongAddered! 
" + "Continuing, but renaming " + edits + " as " + p; 5526 LOG.warn(msg, ioe); 5527 status.setStatus(msg); 5528 } else { 5529 status.abort(StringUtils.stringifyException(ioe)); 5530 // other IO errors may be transient (bad network connection, 5531 // checksum exception on one datanode, etc). throw & retry 5532 throw ioe; 5533 } 5534 } 5535 if (reporter != null && !reported_once) { 5536 reporter.progress(); 5537 } 5538 msg = "Applied " + editsCount + ", skipped " + skippedEdits + ", firstSequenceIdInLog=" 5539 + firstSeqIdInLog + ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits; 5540 status.markComplete(msg); 5541 LOG.debug(msg); 5542 return currentEditSeqId; 5543 } finally { 5544 status.cleanup(); 5545 } 5546 } 5547 5548 /** 5549 * Call to complete a compaction. Its for the case where we find in the WAL a compaction that was 5550 * not finished. We could find one recovering a WAL after a regionserver crash. See HBASE-2331. 5551 */ 5552 void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles, 5553 boolean removeFiles, long replaySeqId) throws IOException { 5554 try { 5555 checkTargetRegion(compaction.getEncodedRegionName().toByteArray(), 5556 "Compaction marker from WAL ", compaction); 5557 } catch (WrongRegionException wre) { 5558 if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 5559 // skip the compaction marker since it is not for this region 5560 return; 5561 } 5562 throw wre; 5563 } 5564 5565 synchronized (writestate) { 5566 if (replaySeqId < lastReplayedOpenRegionSeqId) { 5567 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying compaction event :" 5568 + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId 5569 + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " 5570 + lastReplayedOpenRegionSeqId); 5571 return; 5572 } 5573 if (replaySeqId < lastReplayedCompactionSeqId) { 5574 LOG.warn(getRegionInfo().getEncodedName() + " : " + 
"Skipping replaying compaction event :" 5575 + TextFormat.shortDebugString(compaction) + " because its sequence id " + replaySeqId 5576 + " is smaller than this regions " + "lastReplayedCompactionSeqId of " 5577 + lastReplayedCompactionSeqId); 5578 return; 5579 } else { 5580 lastReplayedCompactionSeqId = replaySeqId; 5581 } 5582 5583 if (LOG.isDebugEnabled()) { 5584 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying compaction marker " 5585 + TextFormat.shortDebugString(compaction) + " with seqId=" + replaySeqId 5586 + " and lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId); 5587 } 5588 5589 startRegionOperation(Operation.REPLAY_EVENT); 5590 try { 5591 HStore store = this.getStore(compaction.getFamilyName().toByteArray()); 5592 if (store == null) { 5593 LOG.warn(getRegionInfo().getEncodedName() + " : " 5594 + "Found Compaction WAL edit for deleted family:" 5595 + Bytes.toString(compaction.getFamilyName().toByteArray())); 5596 return; 5597 } 5598 store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles); 5599 logRegionFiles(); 5600 } catch (FileNotFoundException ex) { 5601 LOG.warn(getRegionInfo().getEncodedName() + " : " 5602 + "At least one of the store files in compaction: " 5603 + TextFormat.shortDebugString(compaction) 5604 + " doesn't exist any more. 
Skip loading the file(s)", ex); 5605 } finally { 5606 closeRegionOperation(Operation.REPLAY_EVENT); 5607 } 5608 } 5609 } 5610 5611 void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException { 5612 checkTargetRegion(flush.getEncodedRegionName().toByteArray(), "Flush marker from WAL ", flush); 5613 5614 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 5615 return; // if primary nothing to do 5616 } 5617 5618 if (LOG.isDebugEnabled()) { 5619 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying flush marker " 5620 + TextFormat.shortDebugString(flush)); 5621 } 5622 5623 startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close 5624 try { 5625 FlushAction action = flush.getAction(); 5626 switch (action) { 5627 case START_FLUSH: 5628 replayWALFlushStartMarker(flush); 5629 break; 5630 case COMMIT_FLUSH: 5631 replayWALFlushCommitMarker(flush); 5632 break; 5633 case ABORT_FLUSH: 5634 replayWALFlushAbortMarker(flush); 5635 break; 5636 case CANNOT_FLUSH: 5637 replayWALFlushCannotFlushMarker(flush, replaySeqId); 5638 break; 5639 default: 5640 LOG.warn(getRegionInfo().getEncodedName() + " : " 5641 + "Received a flush event with unknown action, ignoring. " 5642 + TextFormat.shortDebugString(flush)); 5643 break; 5644 } 5645 5646 logRegionFiles(); 5647 } finally { 5648 closeRegionOperation(Operation.REPLAY_EVENT); 5649 } 5650 } 5651 5652 /** 5653 * Replay the flush marker from primary region by creating a corresponding snapshot of the store 5654 * memstores, only if the memstores do not have a higher seqId from an earlier wal edit (because 5655 * the events may be coming out of order). 
5656 */ 5657 PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException { 5658 long flushSeqId = flush.getFlushSequenceNumber(); 5659 5660 HashSet<HStore> storesToFlush = new HashSet<>(); 5661 for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { 5662 byte[] family = storeFlush.getFamilyName().toByteArray(); 5663 HStore store = getStore(family); 5664 if (store == null) { 5665 LOG.warn(getRegionInfo().getEncodedName() + " : " 5666 + "Received a flush start marker from primary, but the family is not found. Ignoring" 5667 + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush)); 5668 continue; 5669 } 5670 storesToFlush.add(store); 5671 } 5672 5673 MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this); 5674 5675 // we will use writestate as a coarse-grain lock for all the replay events 5676 // (flush, compaction, region open etc) 5677 synchronized (writestate) { 5678 try { 5679 if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { 5680 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 5681 + TextFormat.shortDebugString(flush) 5682 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 5683 + " of " + lastReplayedOpenRegionSeqId); 5684 return null; 5685 } 5686 if (numMutationsWithoutWAL.sum() > 0) { 5687 numMutationsWithoutWAL.reset(); 5688 dataInMemoryWithoutWAL.reset(); 5689 } 5690 5691 if (!writestate.flushing) { 5692 // we do not have an active snapshot and corresponding this.prepareResult. This means 5693 // we can just snapshot our memstores and continue as normal. 5694 5695 // invoke prepareFlushCache. 
Send null as wal since we do not want the flush events in wal 5696 PrepareFlushResult prepareResult = internalPrepareFlushCache(null, flushSeqId, 5697 storesToFlush, status, false, FlushLifeCycleTracker.DUMMY); 5698 if (prepareResult.result == null) { 5699 // save the PrepareFlushResult so that we can use it later from commit flush 5700 this.writestate.flushing = true; 5701 this.prepareFlushResult = prepareResult; 5702 status.markComplete("Flush prepare successful"); 5703 if (LOG.isDebugEnabled()) { 5704 LOG.debug(getRegionInfo().getEncodedName() + " : " + " Prepared flush with seqId:" 5705 + flush.getFlushSequenceNumber()); 5706 } 5707 } else { 5708 // special case empty memstore. We will still save the flush result in this case, since 5709 // our memstore ie empty, but the primary is still flushing 5710 if ( 5711 prepareResult.getResult().getResult() 5712 == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY 5713 ) { 5714 this.writestate.flushing = true; 5715 this.prepareFlushResult = prepareResult; 5716 if (LOG.isDebugEnabled()) { 5717 LOG.debug(getRegionInfo().getEncodedName() + " : " 5718 + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber()); 5719 } 5720 } 5721 status.abort("Flush prepare failed with " + prepareResult.result); 5722 // nothing much to do. prepare flush failed because of some reason. 5723 } 5724 return prepareResult; 5725 } else { 5726 // we already have an active snapshot. 5727 if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) { 5728 // They define the same flush. Log and continue. 5729 LOG.warn(getRegionInfo().getEncodedName() + " : " 5730 + "Received a flush prepare marker with the same seqId: " 5731 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5732 + prepareFlushResult.flushOpSeqId + ". 
Ignoring"); 5733 // ignore 5734 } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) { 5735 // We received a flush with a smaller seqNum than what we have prepared. We can only 5736 // ignore this prepare flush request. 5737 LOG.warn(getRegionInfo().getEncodedName() + " : " 5738 + "Received a flush prepare marker with a smaller seqId: " 5739 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5740 + prepareFlushResult.flushOpSeqId + ". Ignoring"); 5741 // ignore 5742 } else { 5743 // We received a flush with a larger seqNum than what we have prepared 5744 LOG.warn(getRegionInfo().getEncodedName() + " : " 5745 + "Received a flush prepare marker with a larger seqId: " 5746 + +flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: " 5747 + prepareFlushResult.flushOpSeqId + ". Ignoring"); 5748 // We do not have multiple active snapshots in the memstore or a way to merge current 5749 // memstore snapshot with the contents and resnapshot for now. We cannot take 5750 // another snapshot and drop the previous one because that will cause temporary 5751 // data loss in the secondary. So we ignore this for now, deferring the resolution 5752 // to happen when we see the corresponding flush commit marker. If we have a memstore 5753 // snapshot with x, and later received another prepare snapshot with y (where x < y), 5754 // when we see flush commit for y, we will drop snapshot for x, and can also drop all 5755 // the memstore edits if everything in memstore is < y. This is the usual case for 5756 // RS crash + recovery where we might see consequtive prepare flush wal markers. 5757 // Otherwise, this will cause more memory to be used in secondary replica until a 5758 // further prapare + commit flush is seen and replayed. 
5759 } 5760 } 5761 } finally { 5762 status.cleanup(); 5763 writestate.notifyAll(); 5764 } 5765 } 5766 return null; 5767 } 5768 5769 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 5770 justification = "Intentional; post memstore flush") 5771 void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException { 5772 MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this); 5773 5774 // check whether we have the memstore snapshot with the corresponding seqId. Replay to 5775 // secondary region replicas are in order, except for when the region moves or then the 5776 // region server crashes. In those cases, we may receive replay requests out of order from 5777 // the original seqIds. 5778 synchronized (writestate) { 5779 try { 5780 if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) { 5781 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 5782 + TextFormat.shortDebugString(flush) 5783 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 5784 + " of " + lastReplayedOpenRegionSeqId); 5785 return; 5786 } 5787 5788 if (writestate.flushing) { 5789 PrepareFlushResult prepareFlushResult = this.prepareFlushResult; 5790 if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) { 5791 if (LOG.isDebugEnabled()) { 5792 LOG.debug(getRegionInfo().getEncodedName() + " : " 5793 + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber() 5794 + " and a previous prepared snapshot was found"); 5795 } 5796 // This is the regular case where we received commit flush after prepare flush 5797 // corresponding to the same seqId. 5798 replayFlushInStores(flush, prepareFlushResult, true); 5799 5800 // Set down the memstore size by amount of flush. 
5801 this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); 5802 this.prepareFlushResult = null; 5803 writestate.flushing = false; 5804 } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) { 5805 // This should not happen normally. However, lets be safe and guard against these cases 5806 // we received a flush commit with a smaller seqId than what we have prepared 5807 // we will pick the flush file up from this commit (if we have not seen it), but we 5808 // will not drop the memstore 5809 LOG.warn(getRegionInfo().getEncodedName() + " : " 5810 + "Received a flush commit marker with smaller seqId: " 5811 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " 5812 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping" 5813 + " prepared memstore snapshot"); 5814 replayFlushInStores(flush, prepareFlushResult, false); 5815 5816 // snapshot is not dropped, so memstore sizes should not be decremented 5817 // we still have the prepared snapshot, flushing should still be true 5818 } else { 5819 // This should not happen normally. However, lets be safe and guard against these cases 5820 // we received a flush commit with a larger seqId than what we have prepared 5821 // we will pick the flush file for this. We will also obtain the updates lock and 5822 // look for contents of the memstore to see whether we have edits after this seqId. 5823 // If not, we will drop all the memstore edits and the snapshot as well. 5824 LOG.warn(getRegionInfo().getEncodedName() + " : " 5825 + "Received a flush commit marker with larger seqId: " 5826 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " 5827 + prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared" 5828 + " memstore snapshot"); 5829 5830 replayFlushInStores(flush, prepareFlushResult, true); 5831 5832 // Set down the memstore size by amount of flush. 
5833 this.decrMemStoreSize(prepareFlushResult.totalFlushableSize.getMemStoreSize()); 5834 5835 // Inspect the memstore contents to see whether the memstore contains only edits 5836 // with seqId smaller than the flush seqId. If so, we can discard those edits. 5837 dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); 5838 5839 this.prepareFlushResult = null; 5840 writestate.flushing = false; 5841 } 5842 // If we were waiting for observing a flush or region opening event for not showing 5843 // partial data after a secondary region crash, we can allow reads now. We can only make 5844 // sure that we are not showing partial data (for example skipping some previous edits) 5845 // until we observe a full flush start and flush commit. So if we were not able to find 5846 // a previous flush we will not enable reads now. 5847 this.setReadsEnabled(true); 5848 } else { 5849 LOG.warn( 5850 getRegionInfo().getEncodedName() + " : " + "Received a flush commit marker with seqId:" 5851 + flush.getFlushSequenceNumber() + ", but no previous prepared snapshot was found"); 5852 // There is no corresponding prepare snapshot from before. 5853 // We will pick up the new flushed file 5854 replayFlushInStores(flush, null, false); 5855 5856 // Inspect the memstore contents to see whether the memstore contains only edits 5857 // with seqId smaller than the flush seqId. If so, we can discard those edits. 5858 dropMemStoreContentsForSeqId(flush.getFlushSequenceNumber(), null); 5859 } 5860 5861 status.markComplete("Flush commit successful"); 5862 5863 // Update the last flushed sequence id for region. 5864 this.maxFlushedSeqId = flush.getFlushSequenceNumber(); 5865 5866 // advance the mvcc read point so that the new flushed file is visible. 
5867 mvcc.advanceTo(flush.getFlushSequenceNumber()); 5868 5869 } catch (FileNotFoundException ex) { 5870 LOG.warn(getRegionInfo().getEncodedName() + " : " 5871 + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush) 5872 + " doesn't exist any more. Skip loading the file(s)", ex); 5873 } finally { 5874 status.cleanup(); 5875 writestate.notifyAll(); 5876 } 5877 } 5878 5879 // C. Finally notify anyone waiting on memstore to clear: 5880 // e.g. checkResources(). 5881 synchronized (this) { 5882 notifyAll(); // FindBugs NN_NAKED_NOTIFY 5883 } 5884 } 5885 5886 /** 5887 * Replays the given flush descriptor by opening the flush files in stores and dropping the 5888 * memstore snapshots if requested. 5889 */ 5890 private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult, 5891 boolean dropMemstoreSnapshot) throws IOException { 5892 for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) { 5893 byte[] family = storeFlush.getFamilyName().toByteArray(); 5894 HStore store = getStore(family); 5895 if (store == null) { 5896 LOG.warn(getRegionInfo().getEncodedName() + " : " 5897 + "Received a flush commit marker from primary, but the family is not found." 5898 + "Ignoring StoreFlushDescriptor:" + storeFlush); 5899 continue; 5900 } 5901 List<String> flushFiles = storeFlush.getFlushOutputList(); 5902 StoreFlushContext ctx = null; 5903 long startTime = EnvironmentEdgeManager.currentTime(); 5904 if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) { 5905 ctx = store.createFlushContext(flush.getFlushSequenceNumber(), FlushLifeCycleTracker.DUMMY); 5906 } else { 5907 ctx = prepareFlushResult.storeFlushCtxs.get(family); 5908 startTime = prepareFlushResult.startTime; 5909 } 5910 5911 if (ctx == null) { 5912 LOG.warn(getRegionInfo().getEncodedName() + " : " 5913 + "Unexpected: flush commit marker received from store " + Bytes.toString(family) 5914 + " but no associated flush context. 
Ignoring"); 5915 continue; 5916 } 5917 5918 ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush 5919 5920 // Record latest flush time 5921 this.lastStoreFlushTimeMap.put(store, startTime); 5922 } 5923 } 5924 5925 private long loadRecoveredHFilesIfAny(Collection<HStore> stores) throws IOException { 5926 Path regionDir = fs.getRegionDir(); 5927 long maxSeqId = -1; 5928 for (HStore store : stores) { 5929 String familyName = store.getColumnFamilyName(); 5930 FileStatus[] files = 5931 WALSplitUtil.getRecoveredHFiles(fs.getFileSystem(), regionDir, familyName); 5932 if (files != null && files.length != 0) { 5933 for (FileStatus file : files) { 5934 Path filePath = file.getPath(); 5935 // If file length is zero then delete it 5936 if (isZeroLengthThenDelete(fs.getFileSystem(), file, filePath)) { 5937 continue; 5938 } 5939 try { 5940 HStoreFile storefile = store.tryCommitRecoveredHFile(file.getPath()); 5941 maxSeqId = Math.max(maxSeqId, storefile.getReader().getSequenceID()); 5942 } catch (IOException e) { 5943 handleException(fs.getFileSystem(), filePath, e); 5944 continue; 5945 } 5946 } 5947 if (this.rsServices != null && store.needsCompaction()) { 5948 this.rsServices.getCompactionRequestor().requestCompaction(this, store, 5949 "load recovered hfiles request compaction", Store.PRIORITY_USER + 1, 5950 CompactionLifeCycleTracker.DUMMY, null); 5951 } 5952 } 5953 } 5954 return maxSeqId; 5955 } 5956 5957 /** 5958 * Be careful, this method will drop all data in the memstore of this region. Currently, this 5959 * method is used to drop memstore to prevent memory leak when replaying recovered.edits while 5960 * opening region. 
5961 */ 5962 private MemStoreSize dropMemStoreContents() throws IOException { 5963 MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); 5964 this.updatesLock.writeLock().lock(); 5965 try { 5966 for (HStore s : stores.values()) { 5967 MemStoreSize memStoreSize = doDropStoreMemStoreContentsForSeqId(s, HConstants.NO_SEQNUM); 5968 LOG.info("Drop memstore for Store " + s.getColumnFamilyName() + " in region " 5969 + this.getRegionInfo().getRegionNameAsString() + " , dropped memstoresize: [" 5970 + memStoreSize + " }"); 5971 totalFreedSize.incMemStoreSize(memStoreSize); 5972 } 5973 return totalFreedSize.getMemStoreSize(); 5974 } finally { 5975 this.updatesLock.writeLock().unlock(); 5976 } 5977 } 5978 5979 /** 5980 * Drops the memstore contents after replaying a flush descriptor or region open event replay if 5981 * the memstore edits have seqNums smaller than the given seq id 5982 */ 5983 private MemStoreSize dropMemStoreContentsForSeqId(long seqId, HStore store) throws IOException { 5984 MemStoreSizing totalFreedSize = new NonThreadSafeMemStoreSizing(); 5985 this.updatesLock.writeLock().lock(); 5986 try { 5987 5988 long currentSeqId = mvcc.getReadPoint(); 5989 if (seqId >= currentSeqId) { 5990 // then we can drop the memstore contents since everything is below this seqId 5991 LOG.info(getRegionInfo().getEncodedName() + " : " 5992 + "Dropping memstore contents as well since replayed flush seqId: " + seqId 5993 + " is greater than current seqId:" + currentSeqId); 5994 5995 // Prepare flush (take a snapshot) and then abort (drop the snapshot) 5996 if (store == null) { 5997 for (HStore s : stores.values()) { 5998 totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(s, currentSeqId)); 5999 } 6000 } else { 6001 totalFreedSize.incMemStoreSize(doDropStoreMemStoreContentsForSeqId(store, currentSeqId)); 6002 } 6003 } else { 6004 LOG.info(getRegionInfo().getEncodedName() + " : " 6005 + "Not dropping memstore contents since replayed flush seqId: " + seqId 
6006 + " is smaller than current seqId:" + currentSeqId); 6007 } 6008 } finally { 6009 this.updatesLock.writeLock().unlock(); 6010 } 6011 return totalFreedSize.getMemStoreSize(); 6012 } 6013 6014 private MemStoreSize doDropStoreMemStoreContentsForSeqId(HStore s, long currentSeqId) 6015 throws IOException { 6016 MemStoreSize flushableSize = s.getFlushableSize(); 6017 this.decrMemStoreSize(flushableSize); 6018 StoreFlushContext ctx = s.createFlushContext(currentSeqId, FlushLifeCycleTracker.DUMMY); 6019 ctx.prepare(); 6020 ctx.abort(); 6021 return flushableSize; 6022 } 6023 6024 private void replayWALFlushAbortMarker(FlushDescriptor flush) { 6025 // nothing to do for now. A flush abort will cause a RS abort which means that the region 6026 // will be opened somewhere else later. We will see the region open event soon, and replaying 6027 // that will drop the snapshot 6028 } 6029 6030 private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) { 6031 synchronized (writestate) { 6032 if (this.lastReplayedOpenRegionSeqId > replaySeqId) { 6033 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying flush event :" 6034 + TextFormat.shortDebugString(flush) + " because its sequence id " + replaySeqId 6035 + " is smaller than this regions " + "lastReplayedOpenRegionSeqId of " 6036 + lastReplayedOpenRegionSeqId); 6037 return; 6038 } 6039 6040 // If we were waiting for observing a flush or region opening event for not showing partial 6041 // data after a secondary region crash, we can allow reads now. This event means that the 6042 // primary was not able to flush because memstore is empty when we requested flush. By the 6043 // time we observe this, we are guaranteed to have up to date seqId with our previous 6044 // assignment. 
6045 this.setReadsEnabled(true); 6046 } 6047 } 6048 6049 PrepareFlushResult getPrepareFlushResult() { 6050 return prepareFlushResult; 6051 } 6052 6053 @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY", 6054 justification = "Intentional; cleared the memstore") 6055 void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException { 6056 checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(), 6057 "RegionEvent marker from WAL ", regionEvent); 6058 6059 startRegionOperation(Operation.REPLAY_EVENT); 6060 try { 6061 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 6062 return; // if primary nothing to do 6063 } 6064 6065 if (regionEvent.getEventType() == EventType.REGION_CLOSE) { 6066 // nothing to do on REGION_CLOSE for now. 6067 return; 6068 } 6069 if (regionEvent.getEventType() != EventType.REGION_OPEN) { 6070 LOG.warn(getRegionInfo().getEncodedName() + " : " 6071 + "Unknown region event received, ignoring :" + TextFormat.shortDebugString(regionEvent)); 6072 return; 6073 } 6074 6075 if (LOG.isDebugEnabled()) { 6076 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying region open event marker " 6077 + TextFormat.shortDebugString(regionEvent)); 6078 } 6079 6080 // we will use writestate as a coarse-grain lock for all the replay events 6081 synchronized (writestate) { 6082 // Replication can deliver events out of order when primary region moves or the region 6083 // server crashes, since there is no coordination between replication of different wal files 6084 // belonging to different region servers. We have to safe guard against this case by using 6085 // region open event's seqid. 
Since this is the first event that the region puts (after 6086 // possibly flushing recovered.edits), after seeing this event, we can ignore every edit 6087 // smaller than this seqId 6088 if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) { 6089 this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber(); 6090 } else { 6091 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying region event :" 6092 + TextFormat.shortDebugString(regionEvent) 6093 + " because its sequence id is smaller than this regions lastReplayedOpenRegionSeqId " 6094 + " of " + lastReplayedOpenRegionSeqId); 6095 return; 6096 } 6097 6098 // region open lists all the files that the region has at the time of the opening. Just pick 6099 // all the files and drop prepared flushes and empty memstores 6100 for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) { 6101 // stores of primary may be different now 6102 byte[] family = storeDescriptor.getFamilyName().toByteArray(); 6103 HStore store = getStore(family); 6104 if (store == null) { 6105 LOG.warn(getRegionInfo().getEncodedName() + " : " 6106 + "Received a region open marker from primary, but the family is not found. " 6107 + "Ignoring. StoreDescriptor:" + storeDescriptor); 6108 continue; 6109 } 6110 6111 long storeSeqId = store.getMaxSequenceId().orElse(0L); 6112 List<String> storeFiles = storeDescriptor.getStoreFileList(); 6113 try { 6114 store.refreshStoreFiles(storeFiles); // replace the files with the new ones 6115 } catch (FileNotFoundException ex) { 6116 LOG.warn(getRegionInfo().getEncodedName() + " : " + "At least one of the store files: " 6117 + storeFiles + " doesn't exist any more. 
Skip loading the file(s)", ex); 6118 continue; 6119 } 6120 if (store.getMaxSequenceId().orElse(0L) != storeSeqId) { 6121 // Record latest flush time if we picked up new files 6122 lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime()); 6123 } 6124 6125 if (writestate.flushing) { 6126 // only drop memstore snapshots if they are smaller than last flush for the store 6127 if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) { 6128 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null 6129 ? null 6130 : this.prepareFlushResult.storeFlushCtxs.get(family); 6131 if (ctx != null) { 6132 MemStoreSize mss = store.getFlushableSize(); 6133 ctx.abort(); 6134 this.decrMemStoreSize(mss); 6135 this.prepareFlushResult.storeFlushCtxs.remove(family); 6136 } 6137 } 6138 } 6139 6140 // Drop the memstore contents if they are now smaller than the latest seen flushed file 6141 dropMemStoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store); 6142 if (storeSeqId > this.maxFlushedSeqId) { 6143 this.maxFlushedSeqId = storeSeqId; 6144 } 6145 } 6146 6147 // if all stores ended up dropping their snapshots, we can safely drop the 6148 // prepareFlushResult 6149 dropPrepareFlushIfPossible(); 6150 6151 // advance the mvcc read point so that the new flushed file is visible. 6152 mvcc.await(); 6153 6154 // If we were waiting for observing a flush or region opening event for not showing partial 6155 // data after a secondary region crash, we can allow reads now. 6156 this.setReadsEnabled(true); 6157 6158 // C. Finally notify anyone waiting on memstore to clear: 6159 // e.g. checkResources(). 
6160 synchronized (this) { 6161 notifyAll(); // FindBugs NN_NAKED_NOTIFY 6162 } 6163 } 6164 logRegionFiles(); 6165 } finally { 6166 closeRegionOperation(Operation.REPLAY_EVENT); 6167 } 6168 } 6169 6170 void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException { 6171 checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(), 6172 "BulkLoad marker from WAL ", bulkLoadEvent); 6173 6174 if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) { 6175 return; // if primary nothing to do 6176 } 6177 6178 if (LOG.isDebugEnabled()) { 6179 LOG.debug(getRegionInfo().getEncodedName() + " : " + "Replaying bulkload event marker " 6180 + TextFormat.shortDebugString(bulkLoadEvent)); 6181 } 6182 // check if multiple families involved 6183 boolean multipleFamilies = false; 6184 byte[] family = null; 6185 for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { 6186 byte[] fam = storeDescriptor.getFamilyName().toByteArray(); 6187 if (family == null) { 6188 family = fam; 6189 } else if (!Bytes.equals(family, fam)) { 6190 multipleFamilies = true; 6191 break; 6192 } 6193 } 6194 6195 startBulkRegionOperation(multipleFamilies); 6196 try { 6197 // we will use writestate as a coarse-grain lock for all the replay events 6198 synchronized (writestate) { 6199 // Replication can deliver events out of order when primary region moves or the region 6200 // server crashes, since there is no coordination between replication of different wal files 6201 // belonging to different region servers. We have to safe guard against this case by using 6202 // region open event's seqid. 
Since this is the first event that the region puts (after 6203 // possibly flushing recovered.edits), after seeing this event, we can ignore every edit 6204 // smaller than this seqId 6205 if ( 6206 bulkLoadEvent.getBulkloadSeqNum() >= 0 6207 && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum() 6208 ) { 6209 LOG.warn(getRegionInfo().getEncodedName() + " : " + "Skipping replaying bulkload event :" 6210 + TextFormat.shortDebugString(bulkLoadEvent) 6211 + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId" 6212 + " =" + lastReplayedOpenRegionSeqId); 6213 6214 return; 6215 } 6216 6217 for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) { 6218 // stores of primary may be different now 6219 family = storeDescriptor.getFamilyName().toByteArray(); 6220 HStore store = getStore(family); 6221 if (store == null) { 6222 LOG.warn(getRegionInfo().getEncodedName() + " : " 6223 + "Received a bulk load marker from primary, but the family is not found. " 6224 + "Ignoring. StoreDescriptor:" + storeDescriptor); 6225 continue; 6226 } 6227 6228 List<String> storeFiles = storeDescriptor.getStoreFileList(); 6229 for (String storeFile : storeFiles) { 6230 StoreFileInfo storeFileInfo = null; 6231 try { 6232 storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile); 6233 store.bulkLoadHFile(storeFileInfo); 6234 } catch (FileNotFoundException ex) { 6235 LOG.warn(getRegionInfo().getEncodedName() + " : " 6236 + ((storeFileInfo != null) 6237 ? storeFileInfo.toString() 6238 : (new Path(Bytes.toString(family), storeFile)).toString()) 6239 + " doesn't exist any more. 
Skip loading the file");
            }
          }
        }
      }
      if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
        mvcc.advanceTo(bulkLoadEvent.getBulkloadSeqNum());
      }
    } finally {
      closeBulkRegionOperation();
    }
  }

  /**
   * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult.
   * <p>
   * Does nothing unless {@code writestate.flushing} is set. Each family listed in
   * {@code prepareFlushResult.storeFlushCtxs} is looked up; families without a store are ignored.
   * If no remaining store reports a snapshot data size above zero, the flushing flag is cleared
   * and {@code prepareFlushResult} is set to null.
   */
  private void dropPrepareFlushIfPossible() {
    if (writestate.flushing) {
      boolean canDrop = true;
      if (prepareFlushResult.storeFlushCtxs != null) {
        for (Entry<byte[], StoreFlushContext> entry : prepareFlushResult.storeFlushCtxs
          .entrySet()) {
          HStore store = getStore(entry.getKey());
          if (store == null) {
            continue;
          }
          if (store.getSnapshotSize().getDataSize() > 0) {
            // at least one store still holds snapshot data, keep the prepared flush
            canDrop = false;
            break;
          }
        }
      }

      // this means that all the stores in the region has finished flushing, but the WAL marker
      // may not have been written or we did not receive it yet.
      if (canDrop) {
        writestate.flushing = false;
        this.prepareFlushResult = null;
      }
    }
  }

  /**
   * Non-forced variant; delegates to {@link #refreshStoreFiles(boolean)} with {@code false}.
   */
  @Override
  public boolean refreshStoreFiles() throws IOException {
    return refreshStoreFiles(false);
  }

  /**
   * Refreshes the store files of every store and frees memstore data that the newly seen files
   * already cover.
   * @param force when false and this region is the default replica, the method returns false
   *              without doing anything
   * @return true if the total freed data size is greater than zero
   * @throws IOException propagated from the store refresh or from the region operation guards
   */
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NN_NAKED_NOTIFY",
      justification = "Notify is about post replay. Intentional")
  protected boolean refreshStoreFiles(boolean force) throws IOException {
    if (!force && ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
      return false; // if primary nothing to do
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug(getRegionInfo().getEncodedName() + " : "
        + "Refreshing store files to see whether we can free up memstore");
    }

    long totalFreedDataSize = 0;

    long smallestSeqIdInStores = Long.MAX_VALUE;

    startRegionOperation(); // obtain region close lock
    try {
      // stores whose max sequence id advanced during the refresh, mapped to the new sequence id;
      // their memstore contents are dropped after the writestate monitor is released
      Map<HStore, Long> map = new HashMap<>();
      synchronized (writestate) {
        for (HStore store : stores.values()) {
          // TODO: some stores might see new data from flush, while others do not which
          // MIGHT break atomic edits across column families.
          long maxSeqIdBefore = store.getMaxSequenceId().orElse(0L);

          // refresh the store files. This is similar to observing a region open wal marker.
          store.refreshStoreFiles();

          long storeSeqId = store.getMaxSequenceId().orElse(0L);
          if (storeSeqId < smallestSeqIdInStores) {
            smallestSeqIdInStores = storeSeqId;
          }

          // see whether we can drop the memstore or the snapshot
          if (storeSeqId > maxSeqIdBefore) {
            if (writestate.flushing) {
              // only drop memstore snapshots if they are smaller than last flush for the store
              if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
                StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null
                  ? null
                  : this.prepareFlushResult.storeFlushCtxs
                    .get(store.getColumnFamilyDescriptor().getName());
                if (ctx != null) {
                  // read the flushable size before aborting the prepared flush, then account
                  // for it as freed
                  MemStoreSize mss = store.getFlushableSize();
                  ctx.abort();
                  this.decrMemStoreSize(mss);
                  this.prepareFlushResult.storeFlushCtxs
                    .remove(store.getColumnFamilyDescriptor().getName());
                  totalFreedDataSize += mss.getDataSize();
                }
              }
            }

            map.put(store, storeSeqId);
          }
        }

        // if all stores ended up dropping their snapshots, we can safely drop the
        // prepareFlushResult
        dropPrepareFlushIfPossible();

        // advance the mvcc read point so that the new flushed files are visible.
        // either greater than flush seq number or they were already picked up via flush.
        for (HStore s : stores.values()) {
          mvcc.advanceTo(s.getMaxMemStoreTS().orElse(0L));
        }

        // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
        // skip all edits that are to be replayed in the future with that has a smaller seqId
        // than this. We are updating lastReplayedOpenRegionSeqId so that we can skip all edits
        // that we have picked the flush files for
        if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
          this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
        }
      }
      if (!map.isEmpty()) {
        for (Map.Entry<HStore, Long> entry : map.entrySet()) {
          // Drop the memstore contents if they are now smaller than the latest seen flushed file
          totalFreedDataSize +=
            dropMemStoreContentsForSeqId(entry.getValue(), entry.getKey()).getDataSize();
        }
      }
      // C. Finally notify anyone waiting on memstore to clear:
      // e.g. checkResources().
      synchronized (this) {
        notifyAll(); // FindBugs NN_NAKED_NOTIFY
      }
      return totalFreedDataSize > 0;
    } finally {
      closeRegionOperation();
    }
  }

  /**
   * When TRACE logging is enabled, logs every store file of every store that reports a non-null
   * store file collection, each line prefixed with the encoded region name.
   */
  private void logRegionFiles() {
    if (LOG.isTraceEnabled()) {
      LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
      stores.values().stream().filter(s -> s.getStorefiles() != null)
        .flatMap(s -> s.getStorefiles().stream())
        .forEachOrdered(sf -> LOG.trace(getRegionInfo().getEncodedName() + " : " + sf));
    }
  }

  /**
   * Checks whether the given regionName is either equal to our region, or that the regionName is
   * the primary region to our corresponding range for the secondary replica.
   * @param encodedRegionName encoded name of the region the payload was addressed to
   * @param exceptionMsg      prefix of the exception message
   * @param payload           object appended to the exception message after the prefix
   * @throws WrongRegionException if the name matches neither this region nor, for a non-default
   *                              replica, the region info used for the file system
   */
  private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
    throws WrongRegionException {
    if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
      return;
    }

    if (
      !RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())
        && Bytes.equals(encodedRegionName, this.fs.getRegionInfoForFS().getEncodedNameAsBytes())
    ) {
      return;
    }

    throw new WrongRegionException(
      exceptionMsg + payload + " targetted for region " + Bytes.toStringBinary(encodedRegionName)
        + " does not match this region: " + this.getRegionInfo());
  }

  /**
   * Used by tests
   * @param s                  Store to add edit to.
   * @param cell               Cell to add.
   * @param memstoreAccounting sizing object handed through to the store's add call
   */
  protected void restoreEdit(HStore s, Cell cell, MemStoreSizing memstoreAccounting) {
    s.add(cell, memstoreAccounting);
  }

  /**
   * make sure have been through lease recovery before get file status, so the file length can be
   * trusted.
   * @param p File to check.
   * @return True if file was zero-length (and if so, we'll delete it in here).
6424 */ 6425 private static boolean isZeroLengthThenDelete(final FileSystem fs, final FileStatus stat, 6426 final Path p) throws IOException { 6427 if (stat.getLen() > 0) { 6428 return false; 6429 } 6430 LOG.warn("File " + p + " is zero-length, deleting."); 6431 fs.delete(p, false); 6432 return true; 6433 } 6434 6435 protected HStore instantiateHStore(final ColumnFamilyDescriptor family, boolean warmup) 6436 throws IOException { 6437 if (family.isMobEnabled()) { 6438 if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) { 6439 throw new IOException("A minimum HFile version of " + HFile.MIN_FORMAT_VERSION_WITH_TAGS 6440 + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY 6441 + " accordingly."); 6442 } 6443 return new HMobStore(this, family, this.conf, warmup); 6444 } 6445 return new HStore(this, family, this.conf, warmup); 6446 } 6447 6448 @Override 6449 public HStore getStore(byte[] column) { 6450 return this.stores.get(column); 6451 } 6452 6453 /** 6454 * Return HStore instance. Does not do any copy: as the number of store is limited, we iterate on 6455 * the list. 
6456 */ 6457 private HStore getStore(Cell cell) { 6458 return stores.entrySet().stream().filter(e -> CellUtil.matchingFamily(cell, e.getKey())) 6459 .map(e -> e.getValue()).findFirst().orElse(null); 6460 } 6461 6462 @Override 6463 public List<HStore> getStores() { 6464 return new ArrayList<>(stores.values()); 6465 } 6466 6467 @Override 6468 public List<String> getStoreFileList(byte[][] columns) throws IllegalArgumentException { 6469 List<String> storeFileNames = new ArrayList<>(); 6470 synchronized (closeLock) { 6471 for (byte[] column : columns) { 6472 HStore store = this.stores.get(column); 6473 if (store == null) { 6474 throw new IllegalArgumentException( 6475 "No column family : " + new String(column, StandardCharsets.UTF_8) + " available"); 6476 } 6477 Collection<HStoreFile> storeFiles = store.getStorefiles(); 6478 if (storeFiles == null) { 6479 continue; 6480 } 6481 for (HStoreFile storeFile : storeFiles) { 6482 storeFileNames.add(storeFile.getPath().toString()); 6483 } 6484 6485 logRegionFiles(); 6486 } 6487 } 6488 return storeFileNames; 6489 } 6490 6491 ////////////////////////////////////////////////////////////////////////////// 6492 // Support code 6493 ////////////////////////////////////////////////////////////////////////////// 6494 6495 /** Make sure this is a valid row for the HRegion */ 6496 void checkRow(byte[] row, String op) throws IOException { 6497 if (!rowIsInRange(getRegionInfo(), row)) { 6498 throw new WrongRegionException("Requested row out of range for " + op + " on HRegion " + this 6499 + ", startKey='" + Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" 6500 + Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" + Bytes.toStringBinary(row) 6501 + "'"); 6502 } 6503 } 6504 6505 /** 6506 * Get an exclusive ( write lock ) lock on a given row. 6507 * @param row Which row to lock. 6508 * @return A locked RowLock. The lock is exclusive and already aqquired. 
6509 */ 6510 public RowLock getRowLock(byte[] row) throws IOException { 6511 return getRowLock(row, false); 6512 } 6513 6514 @Override 6515 public RowLock getRowLock(byte[] row, boolean readLock) throws IOException { 6516 checkRow(row, "row lock"); 6517 return getRowLock(row, readLock, null); 6518 } 6519 6520 Span createRegionSpan(String name) { 6521 return TraceUtil.createSpan(name).setAttribute(REGION_NAMES_KEY, 6522 Collections.singletonList(getRegionInfo().getRegionNameAsString())); 6523 } 6524 6525 // will be override in tests 6526 protected RowLock getRowLockInternal(byte[] row, boolean readLock, RowLock prevRowLock) 6527 throws IOException { 6528 // create an object to use a a key in the row lock map 6529 HashedBytes rowKey = new HashedBytes(row); 6530 6531 RowLockContext rowLockContext = null; 6532 RowLockImpl result = null; 6533 6534 boolean success = false; 6535 try { 6536 // Keep trying until we have a lock or error out. 6537 // TODO: do we need to add a time component here? 6538 while (result == null) { 6539 rowLockContext = computeIfAbsent(lockedRows, rowKey, () -> new RowLockContext(rowKey)); 6540 // Now try an get the lock. 6541 // This can fail as 6542 if (readLock) { 6543 // For read lock, if the caller has locked the same row previously, it will not try 6544 // to acquire the same read lock. It simply returns the previous row lock. 
6545 RowLockImpl prevRowLockImpl = (RowLockImpl) prevRowLock; 6546 if ( 6547 (prevRowLockImpl != null) 6548 && (prevRowLockImpl.getLock() == rowLockContext.readWriteLock.readLock()) 6549 ) { 6550 success = true; 6551 return prevRowLock; 6552 } 6553 result = rowLockContext.newReadLock(); 6554 } else { 6555 result = rowLockContext.newWriteLock(); 6556 } 6557 } 6558 6559 int timeout = rowLockWaitDuration; 6560 boolean reachDeadlineFirst = false; 6561 Optional<RpcCall> call = RpcServer.getCurrentCall(); 6562 if (call.isPresent()) { 6563 long deadline = call.get().getDeadline(); 6564 if (deadline < Long.MAX_VALUE) { 6565 int timeToDeadline = (int) (deadline - EnvironmentEdgeManager.currentTime()); 6566 if (timeToDeadline <= this.rowLockWaitDuration) { 6567 reachDeadlineFirst = true; 6568 timeout = timeToDeadline; 6569 } 6570 } 6571 } 6572 6573 if (timeout <= 0 || !result.getLock().tryLock(timeout, TimeUnit.MILLISECONDS)) { 6574 String message = "Timed out waiting for lock for row: " + rowKey + " in region " 6575 + getRegionInfo().getEncodedName(); 6576 if (reachDeadlineFirst) { 6577 throw new TimeoutIOException(message); 6578 } else { 6579 // If timeToDeadline is larger than rowLockWaitDuration, we can not drop the request. 6580 throw new IOException(message); 6581 } 6582 } 6583 rowLockContext.setThreadName(Thread.currentThread().getName()); 6584 success = true; 6585 return result; 6586 } catch (InterruptedException ie) { 6587 if (LOG.isDebugEnabled()) { 6588 LOG.debug("Thread interrupted waiting for lock on row: {}, in region {}", rowKey, 6589 getRegionInfo().getRegionNameAsString()); 6590 } 6591 throw throwOnInterrupt(ie); 6592 } catch (Error error) { 6593 // The maximum lock count for read lock is 64K (hardcoded), when this maximum count 6594 // is reached, it will throw out an Error. This Error needs to be caught so it can 6595 // go ahead to process the minibatch with lock acquired. 
6596 LOG.warn("Error to get row lock for {}, in region {}, cause: {}", Bytes.toStringBinary(row), 6597 getRegionInfo().getRegionNameAsString(), error); 6598 IOException ioe = new IOException(error); 6599 throw ioe; 6600 } finally { 6601 // Clean up the counts just in case this was the thing keeping the context alive. 6602 if (!success && rowLockContext != null) { 6603 rowLockContext.cleanUp(); 6604 } 6605 } 6606 } 6607 6608 private RowLock getRowLock(byte[] row, boolean readLock, final RowLock prevRowLock) 6609 throws IOException { 6610 return TraceUtil.trace(() -> getRowLockInternal(row, readLock, prevRowLock), 6611 () -> createRegionSpan("Region.getRowLock").setAttribute(ROW_LOCK_READ_LOCK_KEY, readLock)); 6612 } 6613 6614 private void releaseRowLocks(List<RowLock> rowLocks) { 6615 if (rowLocks != null) { 6616 for (RowLock rowLock : rowLocks) { 6617 rowLock.release(); 6618 } 6619 rowLocks.clear(); 6620 } 6621 } 6622 6623 public int getReadLockCount() { 6624 return lock.getReadLockCount(); 6625 } 6626 6627 public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() { 6628 return lockedRows; 6629 } 6630 6631 class RowLockContext { 6632 private final HashedBytes row; 6633 final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(true); 6634 final AtomicBoolean usable = new AtomicBoolean(true); 6635 final AtomicInteger count = new AtomicInteger(0); 6636 final Object lock = new Object(); 6637 private String threadName; 6638 6639 RowLockContext(HashedBytes row) { 6640 this.row = row; 6641 } 6642 6643 RowLockImpl newWriteLock() { 6644 Lock l = readWriteLock.writeLock(); 6645 return getRowLock(l); 6646 } 6647 6648 RowLockImpl newReadLock() { 6649 Lock l = readWriteLock.readLock(); 6650 return getRowLock(l); 6651 } 6652 6653 private RowLockImpl getRowLock(Lock l) { 6654 count.incrementAndGet(); 6655 synchronized (lock) { 6656 if (usable.get()) { 6657 return new RowLockImpl(this, l); 6658 } else { 6659 return null; 6660 } 6661 } 6662 } 6663 6664 void 
cleanUp() { 6665 long c = count.decrementAndGet(); 6666 if (c <= 0) { 6667 synchronized (lock) { 6668 if (count.get() <= 0 && usable.get()) { // Don't attempt to remove row if already removed 6669 usable.set(false); 6670 RowLockContext removed = lockedRows.remove(row); 6671 assert removed == this : "we should never remove a different context"; 6672 } 6673 } 6674 } 6675 } 6676 6677 public void setThreadName(String threadName) { 6678 this.threadName = threadName; 6679 } 6680 6681 @Override 6682 public String toString() { 6683 return "RowLockContext{" + "row=" + row + ", readWriteLock=" + readWriteLock + ", count=" 6684 + count + ", threadName=" + threadName + '}'; 6685 } 6686 } 6687 6688 /** 6689 * Class used to represent a lock on a row. 6690 */ 6691 public static class RowLockImpl implements RowLock { 6692 private final RowLockContext context; 6693 private final Lock lock; 6694 6695 public RowLockImpl(RowLockContext context, Lock lock) { 6696 this.context = context; 6697 this.lock = lock; 6698 } 6699 6700 public Lock getLock() { 6701 return lock; 6702 } 6703 6704 public RowLockContext getContext() { 6705 return context; 6706 } 6707 6708 @Override 6709 public void release() { 6710 lock.unlock(); 6711 context.cleanUp(); 6712 } 6713 6714 @Override 6715 public String toString() { 6716 return "RowLockImpl{" + "context=" + context + ", lock=" + lock + '}'; 6717 } 6718 } 6719 6720 /** 6721 * Determines whether multiple column families are present Precondition: familyPaths is not null 6722 * @param familyPaths List of (column family, hfilePath) 6723 */ 6724 private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) { 6725 boolean multipleFamilies = false; 6726 byte[] family = null; 6727 for (Pair<byte[], String> pair : familyPaths) { 6728 byte[] fam = pair.getFirst(); 6729 if (family == null) { 6730 family = fam; 6731 } else if (!Bytes.equals(family, fam)) { 6732 multipleFamilies = true; 6733 break; 6734 } 6735 } 6736 return 
multipleFamilies; 6737 } 6738 6739 /** 6740 * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple 6741 * column families atomically. 6742 * @param familyPaths List of Pair<byte[] column family, String hfilePath> 6743 * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be 6744 * bulk loaded 6745 * @return Map from family to List of store file paths if successful, null if failed recoverably 6746 * @throws IOException if failed unrecoverably. 6747 */ 6748 public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, 6749 boolean assignSeqId, BulkLoadListener bulkLoadListener) throws IOException { 6750 return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false, null, true); 6751 } 6752 6753 /** 6754 * Listener class to enable callers of bulkLoadHFile() to perform any necessary pre/post 6755 * processing of a given bulkload call 6756 */ 6757 public interface BulkLoadListener { 6758 /** 6759 * Called before an HFile is actually loaded 6760 * @param family family being loaded to 6761 * @param srcPath path of HFile 6762 * @return final path to be used for actual loading 6763 */ 6764 String prepareBulkLoad(byte[] family, String srcPath, boolean copyFile, String customStaging) 6765 throws IOException; 6766 6767 /** 6768 * Called after a successful HFile load 6769 * @param family family being loaded to 6770 * @param srcPath path of HFile 6771 */ 6772 void doneBulkLoad(byte[] family, String srcPath) throws IOException; 6773 6774 /** 6775 * Called after a failed HFile load 6776 * @param family family being loaded to 6777 * @param srcPath path of HFile 6778 */ 6779 void failedBulkLoad(byte[] family, String srcPath) throws IOException; 6780 } 6781 6782 /** 6783 * Attempts to atomically load a group of hfiles. This is critical for loading rows with multiple 6784 * column families atomically. 
* @param familyPaths      List of Pair&lt;byte[] column family, String hfilePath&gt;
   * @param assignSeqId      when true, the memstore is flushed first and the flush sequence id is
   *                         used for the loaded files
   * @param bulkLoadListener Internal hooks enabling massaging/preparation of a file about to be
   *                         bulk loaded
   * @param copyFile         always copy hfiles if true
   * @param clusterIds       ids from clusters that had already handled the given bulkload event.
   * @param replicate        forwarded to the bulk load descriptor that is written to the WAL
   * @return Map from family to List of store file paths if successful, null if failed recoverably
   * @throws IOException if failed unrecoverably.
   */
  public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
    boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile,
    List<String> clusterIds, boolean replicate) throws IOException {
    long seqId = -1;
    // committed files per family; also what gets described in the WAL marker in the finally block
    Map<byte[], List<Path>> storeFiles = new TreeMap<>(Bytes.BYTES_COMPARATOR);
    Map<String, Long> storeFilesSizes = new HashMap<>();
    Preconditions.checkNotNull(familyPaths);
    // we need writeLock for multi-family bulk load
    startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
    boolean isSuccessful = false;
    try {
      this.writeRequestsCount.increment();

      // There possibly was a split that happened between when the split keys
      // were gathered and before the HRegion's write lock was taken. We need
      // to validate the HFile region before attempting to bulk load all of them
      IOException ioException = null;
      List<Pair<byte[], String>> failures = new ArrayList<>();
      for (Pair<byte[], String> p : familyPaths) {
        byte[] familyName = p.getFirst();
        String path = p.getSecond();

        HStore store = getStore(familyName);
        if (store == null) {
          ioException = new org.apache.hadoop.hbase.DoNotRetryIOException(
            "No such column family " + Bytes.toStringBinary(familyName));
        } else {
          try {
            store.assertBulkLoadHFileOk(new Path(path));
          } catch (WrongRegionException wre) {
            // recoverable (file doesn't fit in region)
            failures.add(p);
          } catch (IOException ioe) {
            // unrecoverable (hdfs problem)
            ioException = ioe;
          }
        }

        // validation failed because of some sort of IO problem.
        if (ioException != null) {
          LOG.error("There was IO error when checking if the bulk load is ok in region {}.", this,
            ioException);
          throw ioException;
        }
      }
      // validation failed, bail out before doing anything permanent.
      if (failures.size() != 0) {
        StringBuilder list = new StringBuilder();
        for (Pair<byte[], String> p : failures) {
          list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
            .append(p.getSecond());
        }
        // problem when validating
        LOG.warn("There was a recoverable bulk load failure likely due to a split. These (family,"
          + " HFile) pairs were not loaded: {}, in region {}", list.toString(), this);
        return null;
      }

      // We need to assign a sequential ID that's in between two memstores in order to preserve
      // the guarantee that all the edits lower than the highest sequential ID from all the
      // HFiles are flushed on disk. See HBASE-10958. The sequence id returned when we flush is
      // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
      // a sequence id that we can be sure is beyond the last hfile written).
      if (assignSeqId) {
        // NOTE: this local is the flush result; it is only in scope inside this if-block
        FlushResult fs = flushcache(true, false, FlushLifeCycleTracker.DUMMY);
        if (fs.isFlushSucceeded()) {
          seqId = ((FlushResultImpl) fs).flushSequenceId;
        } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
          seqId = ((FlushResultImpl) fs).flushSequenceId;
        } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) {
          // CANNOT_FLUSH may mean that a flush is already on-going
          // we need to wait for that flush to complete
          // (seqId keeps its initial value of -1 on this path)
          waitForFlushes();
        } else {
          throw new IOException("Could not bulk load with an assigned sequential ID because the "
            + "flush didn't run. Reason for not flushing: " + ((FlushResultImpl) fs).failureReason);
        }
      }

      // family -> list of (path to load from, final committed path)
      Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath =
        new TreeMap<>(Bytes.BYTES_COMPARATOR);
      for (Pair<byte[], String> p : familyPaths) {
        byte[] familyName = p.getFirst();
        String path = p.getSecond();
        HStore store = getStore(familyName);
        if (!familyWithFinalPath.containsKey(familyName)) {
          familyWithFinalPath.put(familyName, new ArrayList<>());
        }
        List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName);
        String finalPath = path;
        try {
          boolean reqTmp = store.storeEngine.requireWritingToTmpDirFirst();
          if (bulkLoadListener != null) {
            // 'fs' here is no longer the flush result above (out of scope); presumably the
            // region file system field
            finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile,
              reqTmp ? null : fs.getRegionDir().toString());
          }
          Pair<Path, Path> pair = null;
          if (reqTmp || !StoreFileInfo.isHFile(finalPath)) {
            pair = store.preBulkLoadHFile(finalPath, seqId);
          } else {
            // the prepared file is used in place: source and committed path are the same
            Path livePath = new Path(finalPath);
            pair = new Pair<>(livePath, livePath);
          }
          lst.add(pair);
        } catch (IOException ioe) {
          // A failure here can cause an atomicity violation that we currently
          // cannot recover from since it is likely a failed HDFS operation.

          LOG.error("There was a partial failure due to IO when attempting to" + " load "
            + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
          if (bulkLoadListener != null) {
            try {
              bulkLoadListener.failedBulkLoad(familyName, finalPath);
            } catch (Exception ex) {
              // listener failures are only logged so that the original IOException is rethrown
              LOG.error("Error while calling failedBulkLoad for family "
                + Bytes.toString(familyName) + " with path " + path, ex);
            }
          }
          throw ioe;
        }
      }

      if (this.getCoprocessorHost() != null) {
        for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
          this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue());
        }
      }
      for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
        byte[] familyName = entry.getKey();
        for (Pair<Path, Path> p : entry.getValue()) {
          String path = p.getFirst().toString();
          Path commitedStoreFile = p.getSecond();
          HStore store = getStore(familyName);
          try {
            store.bulkLoadHFile(familyName, path, commitedStoreFile);
            // Note the size of the store file
            try {
              FileSystem fs = commitedStoreFile.getFileSystem(baseConf);
              storeFilesSizes.put(commitedStoreFile.getName(),
                fs.getFileStatus(commitedStoreFile).getLen());
            } catch (IOException e) {
              // size lookup is best effort: record 0 and carry on
              LOG.warn("Failed to find the size of hfile " + commitedStoreFile, e);
              storeFilesSizes.put(commitedStoreFile.getName(), 0L);
            }

            if (storeFiles.containsKey(familyName)) {
              storeFiles.get(familyName).add(commitedStoreFile);
            } else {
              List<Path> storeFileNames = new ArrayList<>();
              storeFileNames.add(commitedStoreFile);
              storeFiles.put(familyName, storeFileNames);
            }
            if (bulkLoadListener != null) {
              bulkLoadListener.doneBulkLoad(familyName, path);
            }
          } catch (IOException ioe) {
            // A failure here can cause an atomicity violation that we currently
            // cannot recover from since it is likely a failed HDFS operation.

            // TODO Need a better story for reverting partial failures due to HDFS.
            LOG.error("There was a partial failure due to IO when attempting to" + " load "
              + Bytes.toString(familyName) + " : " + p.getSecond(), ioe);
            if (bulkLoadListener != null) {
              try {
                bulkLoadListener.failedBulkLoad(familyName, path);
              } catch (Exception ex) {
                LOG.error("Error while calling failedBulkLoad for family "
                  + Bytes.toString(familyName) + " with path " + path, ex);
              }
            }
            throw ioe;
          }
        }
      }

      isSuccessful = true;
      if (conf.getBoolean(COMPACTION_AFTER_BULKLOAD_ENABLE, true)) {
        // request compaction
        familyWithFinalPath.keySet().forEach(family -> {
          HStore store = getStore(family);
          try {
            if (this.rsServices != null && store.needsCompaction()) {
              this.rsServices.getCompactionRequestor().requestSystemCompaction(this, store,
                "bulkload hfiles request compaction", true);
              LOG.info("Request compaction for region {} family {} after bulk load",
                this.getRegionInfo().getEncodedName(), store.getColumnFamilyName());
            }
          } catch (IOException e) {
            // a failed compaction request does not fail the bulk load
            LOG.error("bulkload hfiles request compaction error ", e);
          }
        });
      }
    } finally {
      // runs on failure paths too, so files committed before an exception still get a WAL marker
      if (wal != null && !storeFiles.isEmpty()) {
        // Write a bulk load event for hfiles that are loaded
        try {
          WALProtos.BulkLoadDescriptor loadDescriptor =
            ProtobufUtil.toBulkLoadDescriptor(this.getRegionInfo().getTable(),
              UnsafeByteOperations.unsafeWrap(this.getRegionInfo().getEncodedNameAsBytes()),
              storeFiles, storeFilesSizes, seqId, clusterIds, replicate);
          WALUtil.writeBulkLoadMarkerAndSync(this.wal, this.getReplicationScope(), getRegionInfo(),
            loadDescriptor, mvcc);
        } catch (IOException ioe) {
          if (this.rsServices != null) {
            // Have to abort region server because some hfiles has been loaded but we can't write
            // the event into WAL
            isSuccessful = false;
            this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
          }
        }
      }

      closeBulkRegionOperation();
    }
    return isSuccessful ? storeFiles : null;
  }

  /** Two regions are equal when their region names are byte-wise equal. */
  @Override
  public boolean equals(Object o) {
    return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
      ((HRegion) o).getRegionInfo().getRegionName());
  }

  /** Hash of the region name bytes, consistent with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return Bytes.hashCode(getRegionInfo().getRegionName());
  }

  /** Returns the region name as a string. */
  @Override
  public String toString() {
    return getRegionInfo().getRegionNameAsString();
  }

  // Utility methods
  /**
   * A utility method to create new instances of HRegion based on the {@link HConstants#REGION_IMPL}
   * configuration property.
   * @param tableDir   qualified path of directory where region should be located, usually the table
   *                   directory.
   * @param wal        The WAL is the outbound log for any updates to the HRegion The wal file is a
   *                   logfile from the previous execution that's custom-computed for this HRegion.
   *                   The HRegionServer computes and sorts the appropriate wal info for this
   *                   HRegion. If there is a previous file (implying that the HRegion has been
   *                   written-to before), then read it from the supplied path.
   * @param fs         is the filesystem.
   * @param conf       is global configuration settings.
7039 * @param regionInfo - RegionInfo that describes the region is new), then read them from the 7040 * supplied path. 7041 * @param htd the table descriptor 7042 * @return the new instance 7043 */ 7044 public static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs, Configuration conf, 7045 RegionInfo regionInfo, final TableDescriptor htd, RegionServerServices rsServices) { 7046 try { 7047 @SuppressWarnings("unchecked") 7048 Class<? extends HRegion> regionClass = 7049 (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class); 7050 7051 Constructor<? extends HRegion> c = 7052 regionClass.getConstructor(Path.class, WAL.class, FileSystem.class, Configuration.class, 7053 RegionInfo.class, TableDescriptor.class, RegionServerServices.class); 7054 7055 return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices); 7056 } catch (Throwable e) { 7057 // todo: what should I throw here? 7058 throw new IllegalStateException("Could not instantiate a region instance.", e); 7059 } 7060 } 7061 7062 /** 7063 * Convenience method creating new HRegions. Used by createTable. 7064 * @param info Info for region to create. 7065 * @param rootDir Root directory for HBase instance 7066 * @param wal shared WAL 7067 * @param initialize - true to initialize the region 7068 * @return new HRegion 7069 */ 7070 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7071 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, 7072 final boolean initialize) throws IOException { 7073 return createHRegion(info, rootDir, conf, hTableDescriptor, wal, initialize, null); 7074 } 7075 7076 /** 7077 * Convenience method creating new HRegions. Used by createTable. 7078 * @param info Info for region to create. 7079 * @param rootDir Root directory for HBase instance 7080 * @param wal shared WAL 7081 * @param initialize - true to initialize the region 7082 * @param rsRpcServices An interface we can request flushes against. 
7083 * @return new HRegion 7084 */ 7085 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7086 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal, 7087 final boolean initialize, RegionServerServices rsRpcServices) throws IOException { 7088 LOG.info("creating " + info + ", tableDescriptor=" 7089 + (hTableDescriptor == null ? "null" : hTableDescriptor) + ", regionDir=" + rootDir); 7090 createRegionDir(conf, info, rootDir); 7091 FileSystem fs = rootDir.getFileSystem(conf); 7092 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7093 HRegion region = 7094 HRegion.newHRegion(tableDir, wal, fs, conf, info, hTableDescriptor, rsRpcServices); 7095 if (initialize) { 7096 region.initialize(null); 7097 } 7098 return region; 7099 } 7100 7101 /** 7102 * Create a region under the given table directory. 7103 */ 7104 public static HRegion createHRegion(Configuration conf, RegionInfo regionInfo, FileSystem fs, 7105 Path tableDir, TableDescriptor tableDesc) throws IOException { 7106 LOG.info("Creating {}, tableDescriptor={}, under table dir {}", regionInfo, tableDesc, 7107 tableDir); 7108 HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo); 7109 HRegion region = HRegion.newHRegion(tableDir, null, fs, conf, regionInfo, tableDesc, null); 7110 return region; 7111 } 7112 7113 /** 7114 * Create the region directory in the filesystem. 7115 */ 7116 public static HRegionFileSystem createRegionDir(Configuration configuration, RegionInfo ri, 7117 Path rootDir) throws IOException { 7118 FileSystem fs = rootDir.getFileSystem(configuration); 7119 Path tableDir = CommonFSUtils.getTableDir(rootDir, ri.getTable()); 7120 // If directory already exists, will log warning and keep going. Will try to create 7121 // .regioninfo. If one exists, will overwrite. 
7122 return HRegionFileSystem.createRegionOnFileSystem(configuration, fs, tableDir, ri); 7123 } 7124 7125 public static HRegion createHRegion(final RegionInfo info, final Path rootDir, 7126 final Configuration conf, final TableDescriptor hTableDescriptor, final WAL wal) 7127 throws IOException { 7128 return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true); 7129 } 7130 7131 /** 7132 * Open a Region. 7133 * @param info Info for region to be opened. 7134 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7135 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7136 * properly kept up. HRegionStore does this every time it opens a new region. 7137 * @return new HRegion 7138 */ 7139 public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, 7140 final Configuration conf) throws IOException { 7141 return openHRegion(info, htd, wal, conf, null, null); 7142 } 7143 7144 /** 7145 * Open a Region. 7146 * @param info Info for region to be opened 7147 * @param htd the table descriptor 7148 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7149 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7150 * wal id is properly kept up. HRegionStore does this every time it opens a new 7151 * region. 7152 * @param conf The Configuration object to use. 7153 * @param rsServices An interface we can request flushes against. 7154 * @param reporter An interface we can report progress against. 7155 * @return new HRegion 7156 */ 7157 public static HRegion openHRegion(final RegionInfo info, final TableDescriptor htd, final WAL wal, 7158 final Configuration conf, final RegionServerServices rsServices, 7159 final CancelableProgressable reporter) throws IOException { 7160 return openHRegion(CommonFSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter); 7161 } 7162 7163 /** 7164 * Open a Region. 
7165 * @param rootDir Root directory for HBase instance 7166 * @param info Info for region to be opened. 7167 * @param htd the table descriptor 7168 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7169 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7170 * properly kept up. HRegionStore does this every time it opens a new region. 7171 * @param conf The Configuration object to use. 7172 * @return new HRegion 7173 */ 7174 public static HRegion openHRegion(Path rootDir, final RegionInfo info, final TableDescriptor htd, 7175 final WAL wal, final Configuration conf) throws IOException { 7176 return openHRegion(rootDir, info, htd, wal, conf, null, null); 7177 } 7178 7179 /** 7180 * Open a Region. 7181 * @param rootDir Root directory for HBase instance 7182 * @param info Info for region to be opened. 7183 * @param htd the table descriptor 7184 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7185 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7186 * wal id is properly kept up. HRegionStore does this every time it opens a new 7187 * region. 7188 * @param conf The Configuration object to use. 7189 * @param rsServices An interface we can request flushes against. 7190 * @param reporter An interface we can report progress against. 7191 * @return new HRegion 7192 */ 7193 public static HRegion openHRegion(final Path rootDir, final RegionInfo info, 7194 final TableDescriptor htd, final WAL wal, final Configuration conf, 7195 final RegionServerServices rsServices, final CancelableProgressable reporter) 7196 throws IOException { 7197 FileSystem fs = null; 7198 if (rsServices != null) { 7199 fs = rsServices.getFileSystem(); 7200 } 7201 if (fs == null) { 7202 fs = rootDir.getFileSystem(conf); 7203 } 7204 return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter); 7205 } 7206 7207 /** 7208 * Open a Region. 
7209 * @param conf The Configuration object to use. 7210 * @param fs Filesystem to use 7211 * @param rootDir Root directory for HBase instance 7212 * @param info Info for region to be opened. 7213 * @param htd the table descriptor 7214 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) passing 7215 * the result of the call to HRegion#getMinSequenceId() to ensure the wal id is 7216 * properly kept up. HRegionStore does this every time it opens a new region. 7217 * @return new HRegion 7218 */ 7219 public static HRegion openHRegion(final Configuration conf, final FileSystem fs, 7220 final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal) 7221 throws IOException { 7222 return openHRegion(conf, fs, rootDir, info, htd, wal, null, null); 7223 } 7224 7225 /** 7226 * Open a Region. 7227 * @param conf The Configuration object to use. 7228 * @param fs Filesystem to use 7229 * @param rootDir Root directory for HBase instance 7230 * @param info Info for region to be opened. 7231 * @param htd the table descriptor 7232 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7233 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7234 * wal id is properly kept up. HRegionStore does this every time it opens a new 7235 * region. 7236 * @param rsServices An interface we can request flushes against. 7237 * @param reporter An interface we can report progress against. 
7238 * @return new HRegion 7239 */ 7240 public static HRegion openHRegion(final Configuration conf, final FileSystem fs, 7241 final Path rootDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, 7242 final RegionServerServices rsServices, final CancelableProgressable reporter) 7243 throws IOException { 7244 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7245 return openHRegionFromTableDir(conf, fs, tableDir, info, htd, wal, rsServices, reporter); 7246 } 7247 7248 /** 7249 * Open a Region. 7250 * @param conf The Configuration object to use. 7251 * @param fs Filesystem to use 7252 * @param info Info for region to be opened. 7253 * @param htd the table descriptor 7254 * @param wal WAL for region to use. This method will call WAL#setSequenceNumber(long) 7255 * passing the result of the call to HRegion#getMinSequenceId() to ensure the 7256 * wal id is properly kept up. HRegionStore does this every time it opens a new 7257 * region. 7258 * @param rsServices An interface we can request flushes against. 7259 * @param reporter An interface we can report progress against. 7260 * @return new HRegion 7261 */ 7262 public static HRegion openHRegionFromTableDir(final Configuration conf, final FileSystem fs, 7263 final Path tableDir, final RegionInfo info, final TableDescriptor htd, final WAL wal, 7264 final RegionServerServices rsServices, final CancelableProgressable reporter) 7265 throws IOException { 7266 Objects.requireNonNull(info, "RegionInfo cannot be null"); 7267 LOG.debug("Opening region: {}", info); 7268 HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices); 7269 return r.openHRegion(reporter); 7270 } 7271 7272 public NavigableMap<byte[], Integer> getReplicationScope() { 7273 return this.replicationScope; 7274 } 7275 7276 /** 7277 * Useful when reopening a closed region (normally for unit tests) 7278 * @param other original object 7279 * @param reporter An interface we can report progress against. 
7280 * @return new HRegion 7281 */ 7282 public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter) 7283 throws IOException { 7284 HRegionFileSystem regionFs = other.getRegionFileSystem(); 7285 HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(), 7286 other.baseConf, other.getRegionInfo(), other.getTableDescriptor(), null); 7287 return r.openHRegion(reporter); 7288 } 7289 7290 public static Region openHRegion(final Region other, final CancelableProgressable reporter) 7291 throws IOException { 7292 return openHRegion((HRegion) other, reporter); 7293 } 7294 7295 /** 7296 * Open HRegion. 7297 * <p/> 7298 * Calls initialize and sets sequenceId. 7299 * @return Returns <code>this</code> 7300 */ 7301 private HRegion openHRegion(final CancelableProgressable reporter) throws IOException { 7302 try { 7303 CompoundConfiguration cConfig = 7304 new CompoundConfiguration().add(conf).addBytesMap(htableDescriptor.getValues()); 7305 // Refuse to open the region if we are missing local compression support 7306 TableDescriptorChecker.checkCompression(cConfig, htableDescriptor); 7307 // Refuse to open the region if encryption configuration is incorrect or 7308 // codec support is missing 7309 LOG.debug("checking encryption for " + this.getRegionInfo().getEncodedName()); 7310 TableDescriptorChecker.checkEncryption(cConfig, htableDescriptor); 7311 // Refuse to open the region if a required class cannot be loaded 7312 LOG.debug("checking classloading for " + this.getRegionInfo().getEncodedName()); 7313 TableDescriptorChecker.checkClassLoading(cConfig, htableDescriptor); 7314 this.openSeqNum = initialize(reporter); 7315 this.mvcc.advanceTo(openSeqNum); 7316 // The openSeqNum must be increased every time when a region is assigned, as we rely on it to 7317 // determine whether a region has been successfully reopened. So here we always write open 7318 // marker, even if the table is read only. 
7319 if ( 7320 wal != null && getRegionServerServices() != null 7321 && RegionReplicaUtil.isDefaultReplica(getRegionInfo()) 7322 ) { 7323 writeRegionOpenMarker(wal, openSeqNum); 7324 } 7325 } catch (Throwable t) { 7326 // By coprocessor path wrong region will open failed, 7327 // MetricsRegionWrapperImpl is already init and not close, 7328 // add region close when open failed 7329 try { 7330 // It is not required to write sequence id file when region open is failed. 7331 // Passing true to skip the sequence id file write. 7332 this.close(true); 7333 } catch (Throwable e) { 7334 LOG.warn("Open region: {} failed. Try close region but got exception ", 7335 this.getRegionInfo(), e); 7336 } 7337 throw t; 7338 } 7339 return this; 7340 } 7341 7342 /** 7343 * Open a Region on a read-only file-system (like hdfs snapshots) 7344 * @param conf The Configuration object to use. 7345 * @param fs Filesystem to use 7346 * @param info Info for region to be opened. 7347 * @param htd the table descriptor 7348 * @return new HRegion 7349 */ 7350 public static HRegion openReadOnlyFileSystemHRegion(final Configuration conf, final FileSystem fs, 7351 final Path tableDir, RegionInfo info, final TableDescriptor htd) throws IOException { 7352 if (info == null) { 7353 throw new NullPointerException("Passed region info is null"); 7354 } 7355 if (LOG.isDebugEnabled()) { 7356 LOG.debug("Opening region (readOnly filesystem): " + info); 7357 } 7358 if (info.getReplicaId() <= 0) { 7359 info = RegionInfoBuilder.newBuilder(info).setReplicaId(1).build(); 7360 } 7361 HRegion r = HRegion.newHRegion(tableDir, null, fs, conf, info, htd, null); 7362 r.writestate.setReadOnly(true); 7363 return r.openHRegion(null); 7364 } 7365 7366 public static HRegion warmupHRegion(final RegionInfo info, final TableDescriptor htd, 7367 final WAL wal, final Configuration conf, final RegionServerServices rsServices, 7368 final CancelableProgressable reporter) throws IOException { 7369 7370 Objects.requireNonNull(info, 
"RegionInfo cannot be null"); 7371 LOG.debug("Warmup {}", info); 7372 Path rootDir = CommonFSUtils.getRootDir(conf); 7373 Path tableDir = CommonFSUtils.getTableDir(rootDir, info.getTable()); 7374 FileSystem fs = null; 7375 if (rsServices != null) { 7376 fs = rsServices.getFileSystem(); 7377 } 7378 if (fs == null) { 7379 fs = rootDir.getFileSystem(conf); 7380 } 7381 HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, null); 7382 r.initializeWarmup(reporter); 7383 r.close(); 7384 return r; 7385 } 7386 7387 /** 7388 * Computes the Path of the HRegion 7389 * @param tabledir qualified path for table 7390 * @param name ENCODED region name 7391 * @return Path of HRegion directory 7392 * @deprecated For tests only; to be removed. 7393 */ 7394 @Deprecated 7395 public static Path getRegionDir(final Path tabledir, final String name) { 7396 return new Path(tabledir, name); 7397 } 7398 7399 /** 7400 * Determines if the specified row is within the row range specified by the specified RegionInfo 7401 * @param info RegionInfo that specifies the row range 7402 * @param row row to be checked 7403 * @return true if the row is within the range specified by the RegionInfo 7404 */ 7405 public static boolean rowIsInRange(RegionInfo info, final byte[] row) { 7406 return ((info.getStartKey().length == 0) || (Bytes.compareTo(info.getStartKey(), row) <= 0)) 7407 && ((info.getEndKey().length == 0) || (Bytes.compareTo(info.getEndKey(), row) > 0)); 7408 } 7409 7410 public static boolean rowIsInRange(RegionInfo info, final byte[] row, final int offset, 7411 final short length) { 7412 return ((info.getStartKey().length == 0) 7413 || (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length, row, offset, length) 7414 <= 0)) 7415 && ((info.getEndKey().length == 0) 7416 || (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) 7417 > 0)); 7418 } 7419 7420 @Override 7421 public Result get(final Get get) throws IOException { 7422 prepareGet(get); 
7423 List<Cell> results = get(get, true); 7424 boolean stale = this.getRegionInfo().getReplicaId() != 0; 7425 return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale); 7426 } 7427 7428 void prepareGet(final Get get) throws IOException { 7429 checkRow(get.getRow(), "Get"); 7430 // Verify families are all valid 7431 if (get.hasFamilies()) { 7432 for (byte[] family : get.familySet()) { 7433 checkFamily(family); 7434 } 7435 } else { // Adding all families to scanner 7436 for (byte[] family : this.htableDescriptor.getColumnFamilyNames()) { 7437 get.addFamily(family); 7438 } 7439 } 7440 } 7441 7442 @Override 7443 public List<Cell> get(Get get, boolean withCoprocessor) throws IOException { 7444 return get(get, withCoprocessor, HConstants.NO_NONCE, HConstants.NO_NONCE); 7445 } 7446 7447 private List<Cell> get(Get get, boolean withCoprocessor, long nonceGroup, long nonce) 7448 throws IOException { 7449 return TraceUtil.trace(() -> getInternal(get, withCoprocessor, nonceGroup, nonce), 7450 () -> createRegionSpan("Region.get")); 7451 } 7452 7453 private List<Cell> getInternal(Get get, boolean withCoprocessor, long nonceGroup, long nonce) 7454 throws IOException { 7455 List<Cell> results = new ArrayList<>(); 7456 long before = EnvironmentEdgeManager.currentTime(); 7457 7458 // pre-get CP hook 7459 if (withCoprocessor && (coprocessorHost != null)) { 7460 if (coprocessorHost.preGet(get, results)) { 7461 metricsUpdateForGet(results, before); 7462 return results; 7463 } 7464 } 7465 Scan scan = new Scan(get); 7466 if (scan.getLoadColumnFamiliesOnDemandValue() == null) { 7467 scan.setLoadColumnFamiliesOnDemand(isLoadingCfsOnDemandDefault()); 7468 } 7469 try (RegionScanner scanner = getScanner(scan, null, nonceGroup, nonce)) { 7470 List<Cell> tmp = new ArrayList<>(); 7471 scanner.next(tmp); 7472 // Copy EC to heap, then close the scanner. 7473 // This can be an EXPENSIVE call. It may make an extra copy from offheap to onheap buffers. 
7474 // See more details in HBASE-26036. 7475 for (Cell cell : tmp) { 7476 results.add(CellUtil.cloneIfNecessary(cell)); 7477 } 7478 } 7479 7480 // post-get CP hook 7481 if (withCoprocessor && (coprocessorHost != null)) { 7482 coprocessorHost.postGet(get, results); 7483 } 7484 7485 metricsUpdateForGet(results, before); 7486 7487 return results; 7488 } 7489 7490 void metricsUpdateForGet(List<Cell> results, long before) { 7491 if (this.metricsRegion != null) { 7492 this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before); 7493 } 7494 if (rsServices != null && this.rsServices.getMetrics() != null) { 7495 rsServices.getMetrics().updateReadQueryMeter(this, 1); 7496 } 7497 } 7498 7499 @Override 7500 public Result mutateRow(RowMutations rm) throws IOException { 7501 return mutateRow(rm, HConstants.NO_NONCE, HConstants.NO_NONCE); 7502 } 7503 7504 public Result mutateRow(RowMutations rm, long nonceGroup, long nonce) throws IOException { 7505 final List<Mutation> m = rm.getMutations(); 7506 OperationStatus[] statuses = batchMutate(m.toArray(new Mutation[0]), true, nonceGroup, nonce); 7507 7508 List<Result> results = new ArrayList<>(); 7509 for (OperationStatus status : statuses) { 7510 if (status.getResult() != null) { 7511 results.add(status.getResult()); 7512 } 7513 } 7514 7515 if (results.isEmpty()) { 7516 return null; 7517 } 7518 7519 // Merge the results of the Increment/Append operations 7520 List<Cell> cells = new ArrayList<>(); 7521 for (Result result : results) { 7522 if (result.rawCells() != null) { 7523 cells.addAll(Arrays.asList(result.rawCells())); 7524 } 7525 } 7526 return Result.create(cells); 7527 } 7528 7529 /** 7530 * Perform atomic (all or none) mutations within the region. 7531 * @param mutations The list of mutations to perform. <code>mutations</code> can contain 7532 * operations for multiple rows. Caller has to ensure that all rows are 7533 * contained in this region. 
7534 * @param rowsToLock Rows to lock 7535 * @param nonceGroup Optional nonce group of the operation (client Id) 7536 * @param nonce Optional nonce of the operation (unique random id to ensure "more 7537 * idempotence") If multiple rows are locked care should be taken that 7538 * <code>rowsToLock</code> is sorted in order to avoid deadlocks. 7539 */ 7540 @Override 7541 public void mutateRowsWithLocks(Collection<Mutation> mutations, Collection<byte[]> rowsToLock, 7542 long nonceGroup, long nonce) throws IOException { 7543 batchMutate(new MutationBatchOperation(this, mutations.toArray(new Mutation[mutations.size()]), 7544 true, nonceGroup, nonce) { 7545 @Override 7546 public MiniBatchOperationInProgress<Mutation> 7547 lockRowsAndBuildMiniBatch(List<RowLock> acquiredRowLocks) throws IOException { 7548 RowLock prevRowLock = null; 7549 for (byte[] row : rowsToLock) { 7550 try { 7551 RowLock rowLock = region.getRowLock(row, false, prevRowLock); // write lock 7552 if (rowLock != prevRowLock) { 7553 acquiredRowLocks.add(rowLock); 7554 prevRowLock = rowLock; 7555 } 7556 } catch (IOException ioe) { 7557 LOG.warn("Failed getting lock, row={}, in region {}", Bytes.toStringBinary(row), this, 7558 ioe); 7559 throw ioe; 7560 } 7561 } 7562 return createMiniBatch(size(), size()); 7563 } 7564 }); 7565 } 7566 7567 /** Returns statistics about the current load of the region */ 7568 public ClientProtos.RegionLoadStats getLoadStatistics() { 7569 if (!regionStatsEnabled) { 7570 return null; 7571 } 7572 ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder(); 7573 stats.setMemStoreLoad((int) (Math.min(100, 7574 (this.memStoreSizing.getMemStoreSize().getHeapSize() * 100) / this.memstoreFlushSize))); 7575 if (rsServices.getHeapMemoryManager() != null) { 7576 // the HeapMemoryManager uses -0.0 to signal a problem asking the JVM, 7577 // so we could just do the calculation below and we'll get a 0. 
7578 // treating it as a special case analogous to no HMM instead so that it can be 7579 // programatically treated different from using <1% of heap. 7580 final float occupancy = rsServices.getHeapMemoryManager().getHeapOccupancyPercent(); 7581 if (occupancy != HeapMemoryManager.HEAP_OCCUPANCY_ERROR_VALUE) { 7582 stats.setHeapOccupancy((int) (occupancy * 100)); 7583 } 7584 } 7585 stats.setCompactionPressure((int) (rsServices.getCompactionPressure() * 100 > 100 7586 ? 100 7587 : rsServices.getCompactionPressure() * 100)); 7588 return stats.build(); 7589 } 7590 7591 @Override 7592 public void processRowsWithLocks(RowProcessor<?, ?> processor) throws IOException { 7593 processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE, HConstants.NO_NONCE); 7594 } 7595 7596 @Override 7597 public void processRowsWithLocks(RowProcessor<?, ?> processor, long nonceGroup, long nonce) 7598 throws IOException { 7599 processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce); 7600 } 7601 7602 @Override 7603 public void processRowsWithLocks(RowProcessor<?, ?> processor, long timeout, long nonceGroup, 7604 long nonce) throws IOException { 7605 for (byte[] row : processor.getRowsToLock()) { 7606 checkRow(row, "processRowsWithLocks"); 7607 } 7608 if (!processor.readOnly()) { 7609 checkReadOnly(); 7610 } 7611 checkResources(); 7612 startRegionOperation(); 7613 WALEdit walEdit = new WALEdit(); 7614 7615 // STEP 1. 
Run pre-process hook 7616 preProcess(processor, walEdit); 7617 // Short circuit the read only case 7618 if (processor.readOnly()) { 7619 try { 7620 long now = EnvironmentEdgeManager.currentTime(); 7621 doProcessRowWithTimeout(processor, now, this, null, null, timeout); 7622 processor.postProcess(this, walEdit, true); 7623 } finally { 7624 closeRegionOperation(); 7625 } 7626 return; 7627 } 7628 7629 boolean locked = false; 7630 List<RowLock> acquiredRowLocks = null; 7631 List<Mutation> mutations = new ArrayList<>(); 7632 Collection<byte[]> rowsToLock = processor.getRowsToLock(); 7633 // This is assigned by mvcc either explicity in the below or in the guts of the WAL append 7634 // when it assigns the edit a sequencedid (A.K.A the mvcc write number). 7635 WriteEntry writeEntry = null; 7636 MemStoreSizing memstoreAccounting = new NonThreadSafeMemStoreSizing(); 7637 7638 // Check for thread interrupt status in case we have been signaled from 7639 // #interruptRegionOperation. 7640 checkInterrupt(); 7641 7642 try { 7643 boolean success = false; 7644 try { 7645 // STEP 2. Acquire the row lock(s) 7646 acquiredRowLocks = new ArrayList<>(rowsToLock.size()); 7647 RowLock prevRowLock = null; 7648 for (byte[] row : rowsToLock) { 7649 // Attempt to lock all involved rows, throw if any lock times out 7650 // use a writer lock for mixed reads and writes 7651 RowLock rowLock = getRowLockInternal(row, false, prevRowLock); 7652 if (rowLock != prevRowLock) { 7653 acquiredRowLocks.add(rowLock); 7654 prevRowLock = rowLock; 7655 } 7656 } 7657 7658 // Check for thread interrupt status in case we have been signaled from 7659 // #interruptRegionOperation. Do it before we take the lock and disable interrupts for 7660 // the WAL append. 7661 checkInterrupt(); 7662 7663 // STEP 3. Region lock 7664 lock(this.updatesLock.readLock(), acquiredRowLocks.isEmpty() ? 
1 : acquiredRowLocks.size()); 7665 locked = true; 7666 7667 // From this point until memstore update this operation should not be interrupted. 7668 disableInterrupts(); 7669 7670 long now = EnvironmentEdgeManager.currentTime(); 7671 // STEP 4. Let the processor scan the rows, generate mutations and add waledits 7672 doProcessRowWithTimeout(processor, now, this, mutations, walEdit, timeout); 7673 if (!mutations.isEmpty()) { 7674 writeRequestsCount.add(mutations.size()); 7675 // STEP 5. Call the preBatchMutate hook 7676 processor.preBatchMutate(this, walEdit); 7677 7678 // STEP 6. Append and sync if walEdit has data to write out. 7679 if (!walEdit.isEmpty()) { 7680 writeEntry = doWALAppend(walEdit, getEffectiveDurability(processor.useDurability()), 7681 processor.getClusterIds(), now, nonceGroup, nonce); 7682 } else { 7683 // We are here if WAL is being skipped. 7684 writeEntry = this.mvcc.begin(); 7685 } 7686 7687 // STEP 7. Apply to memstore 7688 long sequenceId = writeEntry.getWriteNumber(); 7689 for (Mutation m : mutations) { 7690 // Handle any tag based cell features. 7691 // TODO: Do we need to call rewriteCellTags down in applyToMemStore()? Why not before 7692 // so tags go into WAL? 7693 rewriteCellTags(m.getFamilyCellMap(), m); 7694 for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) { 7695 Cell cell = cellScanner.current(); 7696 if (walEdit.isEmpty()) { 7697 // If walEdit is empty, we put nothing in WAL. WAL stamps Cells with sequence id. 7698 // If no WAL, need to stamp it here. 7699 PrivateCellUtil.setSequenceId(cell, sequenceId); 7700 } 7701 applyToMemStore(getStore(cell), cell, memstoreAccounting); 7702 } 7703 } 7704 7705 // STEP 8. call postBatchMutate hook 7706 processor.postBatchMutate(this); 7707 7708 // STEP 9. Complete mvcc. 7709 mvcc.completeAndWait(writeEntry); 7710 writeEntry = null; 7711 7712 // STEP 10. 
Release region lock 7713 if (locked) { 7714 this.updatesLock.readLock().unlock(); 7715 locked = false; 7716 } 7717 7718 // STEP 11. Release row lock(s) 7719 releaseRowLocks(acquiredRowLocks); 7720 7721 if (rsServices != null && rsServices.getMetrics() != null) { 7722 rsServices.getMetrics().updateWriteQueryMeter(this, mutations.size()); 7723 } 7724 } 7725 success = true; 7726 } finally { 7727 // Call complete rather than completeAndWait because we probably had error if walKey != null 7728 if (writeEntry != null) mvcc.complete(writeEntry); 7729 if (locked) { 7730 this.updatesLock.readLock().unlock(); 7731 } 7732 // release locks if some were acquired but another timed out 7733 releaseRowLocks(acquiredRowLocks); 7734 7735 enableInterrupts(); 7736 } 7737 7738 // 12. Run post-process hook 7739 processor.postProcess(this, walEdit, success); 7740 } finally { 7741 closeRegionOperation(); 7742 if (!mutations.isEmpty()) { 7743 this.incMemStoreSize(memstoreAccounting.getMemStoreSize()); 7744 requestFlushIfNeeded(); 7745 } 7746 } 7747 } 7748 7749 private void preProcess(final RowProcessor<?, ?> processor, final WALEdit walEdit) 7750 throws IOException { 7751 try { 7752 processor.preProcess(this, walEdit); 7753 } catch (IOException e) { 7754 closeRegionOperation(); 7755 throw e; 7756 } 7757 } 7758 7759 private void doProcessRowWithTimeout(final RowProcessor<?, ?> processor, final long now, 7760 final HRegion region, final List<Mutation> mutations, final WALEdit walEdit, final long timeout) 7761 throws IOException { 7762 // Short circuit the no time bound case. 7763 if (timeout < 0) { 7764 try { 7765 processor.process(now, region, mutations, walEdit); 7766 } catch (IOException e) { 7767 String row = processor.getRowsToLock().isEmpty() 7768 ? 
"" 7769 : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) 7770 + "..."; 7771 LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", 7772 processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); 7773 throw e; 7774 } 7775 return; 7776 } 7777 7778 // Case with time bound 7779 FutureTask<Void> task = new FutureTask<>(new Callable<Void>() { 7780 @Override 7781 public Void call() throws IOException { 7782 try { 7783 processor.process(now, region, mutations, walEdit); 7784 return null; 7785 } catch (IOException e) { 7786 String row = processor.getRowsToLock().isEmpty() 7787 ? "" 7788 : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) 7789 + "..."; 7790 LOG.warn("RowProcessor: {}, in region {}, throws Exception {}", 7791 processor.getClass().getName(), getRegionInfo().getRegionNameAsString(), row, e); 7792 throw e; 7793 } 7794 } 7795 }); 7796 rowProcessorExecutor.execute(task); 7797 try { 7798 task.get(timeout, TimeUnit.MILLISECONDS); 7799 } catch (InterruptedException ie) { 7800 throw throwOnInterrupt(ie); 7801 } catch (TimeoutException te) { 7802 String row = processor.getRowsToLock().isEmpty() 7803 ? 
"" 7804 : " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "..."; 7805 LOG.error("RowProcessor timeout: {} ms, in region {}, {}", timeout, 7806 getRegionInfo().getRegionNameAsString(), row); 7807 throw new IOException(te); 7808 } catch (Exception e) { 7809 throw new IOException(e); 7810 } 7811 } 7812 7813 @Override 7814 public Result append(Append append) throws IOException { 7815 return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE); 7816 } 7817 7818 public Result append(Append append, long nonceGroup, long nonce) throws IOException { 7819 return TraceUtil.trace(() -> { 7820 checkReadOnly(); 7821 checkResources(); 7822 startRegionOperation(Operation.APPEND); 7823 try { 7824 // All edits for the given row (across all column families) must happen atomically. 7825 return mutate(append, true, nonceGroup, nonce).getResult(); 7826 } finally { 7827 closeRegionOperation(Operation.APPEND); 7828 } 7829 }, () -> createRegionSpan("Region.append")); 7830 } 7831 7832 @Override 7833 public Result increment(Increment increment) throws IOException { 7834 return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE); 7835 } 7836 7837 public Result increment(Increment increment, long nonceGroup, long nonce) throws IOException { 7838 return TraceUtil.trace(() -> { 7839 checkReadOnly(); 7840 checkResources(); 7841 startRegionOperation(Operation.INCREMENT); 7842 try { 7843 // All edits for the given row (across all column families) must happen atomically. 
7844 return mutate(increment, true, nonceGroup, nonce).getResult(); 7845 } finally { 7846 closeRegionOperation(Operation.INCREMENT); 7847 } 7848 }, () -> createRegionSpan("Region.increment")); 7849 } 7850 7851 private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds, 7852 long now, long nonceGroup, long nonce) throws IOException { 7853 return doWALAppend(walEdit, durability, clusterIds, now, nonceGroup, nonce, 7854 SequenceId.NO_SEQUENCE_ID); 7855 } 7856 7857 /** Returns writeEntry associated with this append */ 7858 private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID> clusterIds, 7859 long now, long nonceGroup, long nonce, long origLogSeqNum) throws IOException { 7860 Preconditions.checkArgument(walEdit != null && !walEdit.isEmpty(), "WALEdit is null or empty!"); 7861 Preconditions.checkArgument(!walEdit.isReplay() || origLogSeqNum != SequenceId.NO_SEQUENCE_ID, 7862 "Invalid replay sequence Id for replay WALEdit!"); 7863 // Using default cluster id, as this can only happen in the originating cluster. 7864 // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey 7865 // here instead of WALKeyImpl directly to support legacy coprocessors. 7866 WALKeyImpl walKey = walEdit.isReplay() 7867 ? 
new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), 7868 this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, 7869 nonceGroup, nonce, mvcc) 7870 : new WALKeyImpl(this.getRegionInfo().getEncodedNameAsBytes(), 7871 this.htableDescriptor.getTableName(), SequenceId.NO_SEQUENCE_ID, now, clusterIds, 7872 nonceGroup, nonce, mvcc, this.getReplicationScope()); 7873 if (walEdit.isReplay()) { 7874 walKey.setOrigLogSeqNum(origLogSeqNum); 7875 } 7876 // don't call the coproc hook for writes to the WAL caused by 7877 // system lifecycle events like flushes or compactions 7878 if (this.coprocessorHost != null && !walEdit.isMetaEdit()) { 7879 this.coprocessorHost.preWALAppend(walKey, walEdit); 7880 } 7881 WriteEntry writeEntry = null; 7882 try { 7883 long txid = this.wal.appendData(this.getRegionInfo(), walKey, walEdit); 7884 // Call sync on our edit. 7885 if (txid != 0) { 7886 sync(txid, durability); 7887 } 7888 writeEntry = walKey.getWriteEntry(); 7889 } catch (IOException ioe) { 7890 if (walKey != null && walKey.getWriteEntry() != null) { 7891 mvcc.complete(walKey.getWriteEntry()); 7892 } 7893 7894 /** 7895 * If {@link WAL#sync} get a timeout exception, the only correct way is to abort the region 7896 * server, as the design of {@link WAL#sync}, is to succeed or die, there is no 'failure'. It 7897 * is usually not a big deal is because we set a very large default value(5 minutes) for 7898 * {@link AbstractFSWAL#WAL_SYNC_TIMEOUT_MS}, usually the WAL system will abort the region 7899 * server if it can not finish the sync within 5 minutes. 
7900 */ 7901 if (ioe instanceof WALSyncTimeoutIOException) { 7902 if (rsServices != null) { 7903 rsServices.abort("WAL sync timeout,forcing server shutdown", ioe); 7904 } 7905 } 7906 throw ioe; 7907 } 7908 return writeEntry; 7909 } 7910 7911 // 7912 // New HBASE-880 Helpers 7913 // 7914 void checkFamily(final byte[] family) throws NoSuchColumnFamilyException { 7915 if (!this.htableDescriptor.hasColumnFamily(family)) { 7916 throw new NoSuchColumnFamilyException("Column family " + Bytes.toString(family) 7917 + " does not exist in region " + this + " in table " + this.htableDescriptor); 7918 } 7919 } 7920 7921 public static final long FIXED_OVERHEAD = ClassSize.estimateBase(HRegion.class, false); 7922 7923 // woefully out of date - currently missing: 7924 // 1 x HashMap - coprocessorServiceHandlers 7925 // 6 x LongAdder - numMutationsWithoutWAL, dataInMemoryWithoutWAL, 7926 // checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount, 7927 // writeRequestsCount 7928 // 1 x HRegion$WriteState - writestate 7929 // 1 x RegionCoprocessorHost - coprocessorHost 7930 // 1 x RegionSplitPolicy - splitPolicy 7931 // 1 x MetricsRegion - metricsRegion 7932 // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper 7933 // 1 x ReadPointCalculationLock - smallestReadPointCalcLock 7934 public static final long DEEP_OVERHEAD = FIXED_OVERHEAD + ClassSize.OBJECT + // closeLock 7935 (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing 7936 (3 * ClassSize.ATOMIC_LONG) + // numPutsWithoutWAL, dataInMemoryWithoutWAL, 7937 // compactionsFailed 7938 (3 * ClassSize.CONCURRENT_HASHMAP) + // lockedRows, scannerReadPoints, regionLockHolders 7939 WriteState.HEAP_SIZE + // writestate 7940 ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores 7941 (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock 7942 MultiVersionConcurrencyControl.FIXED_SIZE // mvcc 7943 + 2 * ClassSize.TREEMAP // maxSeqIdInStores, replicationScopes 7944 + 2 * ClassSize.ATOMIC_INTEGER 
// majorInProgress, minorInProgress 7945 + ClassSize.STORE_SERVICES // store services 7946 + StoreHotnessProtector.FIXED_SIZE; 7947 7948 @Override 7949 public long heapSize() { 7950 // this does not take into account row locks, recent flushes, mvcc entries, and more 7951 return DEEP_OVERHEAD + stores.values().stream().mapToLong(HStore::heapSize).sum(); 7952 } 7953 7954 /** 7955 * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to be 7956 * available for handling Region#execService(com.google.protobuf.RpcController, 7957 * org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall) calls. 7958 * <p> 7959 * Only a single instance may be registered per region for a given {@link Service} subclass (the 7960 * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}. 7961 * After the first registration, subsequent calls with the same service name will fail with a 7962 * return value of {@code false}. 7963 * </p> 7964 * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint 7965 * @return {@code true} if the registration was successful, {@code false} otherwise 7966 */ 7967 public boolean registerService(com.google.protobuf.Service instance) { 7968 /* 7969 * No stacking of instances is allowed for a single service name 7970 */ 7971 com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType(); 7972 String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc); 7973 if (coprocessorServiceHandlers.containsKey(serviceName)) { 7974 LOG.error("Coprocessor service {} already registered, rejecting request from {} in region {}", 7975 serviceName, instance, this); 7976 return false; 7977 } 7978 7979 coprocessorServiceHandlers.put(serviceName, instance); 7980 if (LOG.isDebugEnabled()) { 7981 LOG.debug("Registered coprocessor service: region=" 7982 + Bytes.toStringBinary(getRegionInfo().getRegionName()) + " service=" 
+ serviceName); 7983 } 7984 return true; 7985 } 7986 7987 /** 7988 * Executes a single protocol buffer coprocessor endpoint {@link Service} method using the 7989 * registered protocol handlers. {@link Service} implementations must be registered via the 7990 * {@link #registerService(com.google.protobuf.Service)} method before they are available. 7991 * @param controller an {@code RpcContoller} implementation to pass to the invoked service 7992 * @param call a {@code CoprocessorServiceCall} instance identifying the service, method, 7993 * and parameters for the method invocation 7994 * @return a protocol buffer {@code Message} instance containing the method's result 7995 * @throws IOException if no registered service handler is found or an error occurs during the 7996 * invocation 7997 * @see #registerService(com.google.protobuf.Service) 7998 */ 7999 public com.google.protobuf.Message execService(com.google.protobuf.RpcController controller, 8000 CoprocessorServiceCall call) throws IOException { 8001 String serviceName = call.getServiceName(); 8002 com.google.protobuf.Service service = coprocessorServiceHandlers.get(serviceName); 8003 if (service == null) { 8004 throw new UnknownProtocolException(null, "No registered coprocessor service found for " 8005 + serviceName + " in region " + Bytes.toStringBinary(getRegionInfo().getRegionName())); 8006 } 8007 com.google.protobuf.Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType(); 8008 8009 String methodName = call.getMethodName(); 8010 com.google.protobuf.Descriptors.MethodDescriptor methodDesc = 8011 CoprocessorRpcUtils.getMethodDescriptor(methodName, serviceDesc); 8012 8013 com.google.protobuf.Message.Builder builder = 8014 service.getRequestPrototype(methodDesc).newBuilderForType(); 8015 8016 org.apache.hadoop.hbase.protobuf.ProtobufUtil.mergeFrom(builder, 8017 call.getRequest().toByteArray()); 8018 com.google.protobuf.Message request = 8019 CoprocessorRpcUtils.getRequest(service, methodDesc, 
call.getRequest()); 8020 8021 if (coprocessorHost != null) { 8022 request = coprocessorHost.preEndpointInvocation(service, methodName, request); 8023 } 8024 8025 final com.google.protobuf.Message.Builder responseBuilder = 8026 service.getResponsePrototype(methodDesc).newBuilderForType(); 8027 service.callMethod(methodDesc, controller, request, 8028 new com.google.protobuf.RpcCallback<com.google.protobuf.Message>() { 8029 @Override 8030 public void run(com.google.protobuf.Message message) { 8031 if (message != null) { 8032 responseBuilder.mergeFrom(message); 8033 } 8034 } 8035 }); 8036 8037 if (coprocessorHost != null) { 8038 coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder); 8039 } 8040 IOException exception = 8041 org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils.getControllerException(controller); 8042 if (exception != null) { 8043 throw exception; 8044 } 8045 8046 return responseBuilder.build(); 8047 } 8048 8049 public Optional<byte[]> checkSplit() { 8050 return checkSplit(false); 8051 } 8052 8053 /** 8054 * Return the split point. An empty result indicates the region isn't splittable. 8055 */ 8056 public Optional<byte[]> checkSplit(boolean force) { 8057 // Can't split META 8058 if ( 8059 this.getRegionInfo().isMetaRegion() 8060 || TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable()) 8061 ) { 8062 return Optional.empty(); 8063 } 8064 8065 // Can't split a region that is closing. 
8066 if (this.isClosing()) { 8067 return Optional.empty(); 8068 } 8069 8070 if (!force && !splitPolicy.shouldSplit()) { 8071 return Optional.empty(); 8072 } 8073 8074 byte[] ret = splitPolicy.getSplitPoint(); 8075 if (ret != null && ret.length > 0) { 8076 ret = splitRestriction.getRestrictedSplitPoint(ret); 8077 } 8078 8079 if (ret != null) { 8080 try { 8081 checkRow(ret, "calculated split"); 8082 } catch (IOException e) { 8083 LOG.error("Ignoring invalid split for region {}", this, e); 8084 return Optional.empty(); 8085 } 8086 return Optional.of(ret); 8087 } else { 8088 return Optional.empty(); 8089 } 8090 } 8091 8092 /** Returns The priority that this region should have in the compaction queue */ 8093 public int getCompactPriority() { 8094 if (conf.getBoolean(SPLIT_IGNORE_BLOCKING_ENABLED_KEY, false) && checkSplit().isPresent()) { 8095 // if a region should split, split it before compact 8096 return Store.PRIORITY_USER; 8097 } 8098 return stores.values().stream().mapToInt(HStore::getCompactPriority).min() 8099 .orElse(Store.NO_PRIORITY); 8100 } 8101 8102 /** Returns the coprocessor host */ 8103 public RegionCoprocessorHost getCoprocessorHost() { 8104 return coprocessorHost; 8105 } 8106 8107 /** @param coprocessorHost the new coprocessor host */ 8108 public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) { 8109 this.coprocessorHost = coprocessorHost; 8110 } 8111 8112 @Override 8113 public void startRegionOperation() throws IOException { 8114 startRegionOperation(Operation.ANY); 8115 } 8116 8117 @Override 8118 public void startRegionOperation(Operation op) throws IOException { 8119 boolean isInterruptableOp = false; 8120 switch (op) { 8121 case GET: // interruptible read operations 8122 case SCAN: 8123 isInterruptableOp = true; 8124 checkReadsEnabled(); 8125 break; 8126 case INCREMENT: // interruptible write operations 8127 case APPEND: 8128 case PUT: 8129 case DELETE: 8130 case BATCH_MUTATE: 8131 case CHECK_AND_MUTATE: 8132 isInterruptableOp = 
true; 8133 break; 8134 default: // all others 8135 break; 8136 } 8137 if ( 8138 op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION || op == Operation.COMPACT_REGION 8139 || op == Operation.COMPACT_SWITCH 8140 ) { 8141 // split, merge or compact region doesn't need to check the closing/closed state or lock the 8142 // region 8143 return; 8144 } 8145 if (this.closing.get()) { 8146 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); 8147 } 8148 lock(lock.readLock()); 8149 // Update regionLockHolders ONLY for any startRegionOperation call that is invoked from 8150 // an RPC handler 8151 Thread thisThread = Thread.currentThread(); 8152 if (isInterruptableOp) { 8153 regionLockHolders.put(thisThread, true); 8154 } 8155 if (this.closed.get()) { 8156 lock.readLock().unlock(); 8157 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); 8158 } 8159 // The unit for snapshot is a region. So, all stores for this region must be 8160 // prepared for snapshot operation before proceeding. 
8161 if (op == Operation.SNAPSHOT) { 8162 stores.values().forEach(HStore::preSnapshotOperation); 8163 } 8164 try { 8165 if (coprocessorHost != null) { 8166 coprocessorHost.postStartRegionOperation(op); 8167 } 8168 } catch (Exception e) { 8169 if (isInterruptableOp) { 8170 // would be harmless to remove what we didn't add but we know by 'isInterruptableOp' 8171 // if we added this thread to regionLockHolders 8172 regionLockHolders.remove(thisThread); 8173 } 8174 lock.readLock().unlock(); 8175 throw new IOException(e); 8176 } 8177 } 8178 8179 @Override 8180 public void closeRegionOperation() throws IOException { 8181 closeRegionOperation(Operation.ANY); 8182 } 8183 8184 @Override 8185 public void closeRegionOperation(Operation operation) throws IOException { 8186 if (operation == Operation.SNAPSHOT) { 8187 stores.values().forEach(HStore::postSnapshotOperation); 8188 } 8189 Thread thisThread = Thread.currentThread(); 8190 regionLockHolders.remove(thisThread); 8191 lock.readLock().unlock(); 8192 if (coprocessorHost != null) { 8193 coprocessorHost.postCloseRegionOperation(operation); 8194 } 8195 } 8196 8197 /** 8198 * This method needs to be called before any public call that reads or modifies stores in bulk. It 8199 * has to be called just before a try. #closeBulkRegionOperation needs to be called in the try's 8200 * finally block Acquires a writelock and checks if the region is closing or closed. 
8201 * @throws NotServingRegionException when the region is closing or closed 8202 * @throws RegionTooBusyException if failed to get the lock in time 8203 * @throws InterruptedIOException if interrupted while waiting for a lock 8204 */ 8205 private void startBulkRegionOperation(boolean writeLockNeeded) throws IOException { 8206 if (this.closing.get()) { 8207 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing"); 8208 } 8209 if (writeLockNeeded) lock(lock.writeLock()); 8210 else lock(lock.readLock()); 8211 if (this.closed.get()) { 8212 if (writeLockNeeded) lock.writeLock().unlock(); 8213 else lock.readLock().unlock(); 8214 throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed"); 8215 } 8216 regionLockHolders.put(Thread.currentThread(), true); 8217 } 8218 8219 /** 8220 * Closes the lock. This needs to be called in the finally block corresponding to the try block of 8221 * #startRegionOperation 8222 */ 8223 private void closeBulkRegionOperation() { 8224 regionLockHolders.remove(Thread.currentThread()); 8225 if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock(); 8226 else lock.readLock().unlock(); 8227 } 8228 8229 /** 8230 * Update LongAdders for number of puts without wal and the size of possible data loss. These 8231 * information are exposed by the region server metrics. 8232 */ 8233 private void recordMutationWithoutWal(final Map<byte[], List<Cell>> familyMap) { 8234 numMutationsWithoutWAL.increment(); 8235 if (numMutationsWithoutWAL.sum() <= 1) { 8236 LOG.info("writing data to region " + this 8237 + " with WAL disabled. Data may be lost in the event of a crash."); 8238 } 8239 8240 long mutationSize = 0; 8241 for (List<Cell> cells : familyMap.values()) { 8242 // Optimization: 'foreach' loop is not used. 
See: 8243 // HBASE-12023 HRegion.applyFamilyMapToMemstore creates too many iterator objects 8244 assert cells instanceof RandomAccess; 8245 int listSize = cells.size(); 8246 for (int i = 0; i < listSize; i++) { 8247 Cell cell = cells.get(i); 8248 mutationSize += cell.getSerializedSize(); 8249 } 8250 } 8251 8252 dataInMemoryWithoutWAL.add(mutationSize); 8253 } 8254 8255 private void lock(final Lock lock) throws IOException { 8256 lock(lock, 1); 8257 } 8258 8259 /** 8260 * Try to acquire a lock. Throw RegionTooBusyException if failed to get the lock in time. Throw 8261 * InterruptedIOException if interrupted while waiting for the lock. 8262 */ 8263 private void lock(final Lock lock, final int multiplier) throws IOException { 8264 try { 8265 final long waitTime = Math.min(maxBusyWaitDuration, 8266 busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier)); 8267 if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { 8268 // Don't print millis. Message is used as a key over in 8269 // RetriesExhaustedWithDetailsException processing. 8270 final String regionName = 8271 this.getRegionInfo() == null ? "unknown" : this.getRegionInfo().getRegionNameAsString(); 8272 final String serverName = this.getRegionServerServices() == null 8273 ? "unknown" 8274 : (this.getRegionServerServices().getServerName() == null 8275 ? 
"unknown" 8276 : this.getRegionServerServices().getServerName().toString()); 8277 RegionTooBusyException rtbe = new RegionTooBusyException( 8278 "Failed to obtain lock; regionName=" + regionName + ", server=" + serverName); 8279 LOG.warn("Region is too busy to allow lock acquisition.", rtbe); 8280 throw rtbe; 8281 } 8282 } catch (InterruptedException ie) { 8283 if (LOG.isDebugEnabled()) { 8284 LOG.debug("Interrupted while waiting for a lock in region {}", this); 8285 } 8286 throw throwOnInterrupt(ie); 8287 } 8288 } 8289 8290 /** 8291 * Calls sync with the given transaction ID 8292 * @param txid should sync up to which transaction 8293 * @throws IOException If anything goes wrong with DFS 8294 */ 8295 private void sync(long txid, Durability durability) throws IOException { 8296 if (this.getRegionInfo().isMetaRegion()) { 8297 this.wal.sync(txid); 8298 } else { 8299 switch (durability) { 8300 case USE_DEFAULT: 8301 // do what table defaults to 8302 if (shouldSyncWAL()) { 8303 this.wal.sync(txid); 8304 } 8305 break; 8306 case SKIP_WAL: 8307 // nothing do to 8308 break; 8309 case ASYNC_WAL: 8310 // nothing do to 8311 break; 8312 case SYNC_WAL: 8313 this.wal.sync(txid, false); 8314 break; 8315 case FSYNC_WAL: 8316 this.wal.sync(txid, true); 8317 break; 8318 default: 8319 throw new RuntimeException("Unknown durability " + durability); 8320 } 8321 } 8322 } 8323 8324 /** 8325 * Check whether we should sync the wal from the table's durability settings 8326 */ 8327 private boolean shouldSyncWAL() { 8328 return regionDurability.ordinal() > Durability.ASYNC_WAL.ordinal(); 8329 } 8330 8331 /** Returns the latest sequence number that was read from storage when this region was opened */ 8332 public long getOpenSeqNum() { 8333 return this.openSeqNum; 8334 } 8335 8336 @Override 8337 public Map<byte[], Long> getMaxStoreSeqId() { 8338 return this.maxSeqIdInStores; 8339 } 8340 8341 public long getOldestSeqIdOfStore(byte[] familyName) { 8342 return 
wal.getEarliestMemStoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), familyName); 8343 } 8344 8345 @Override 8346 public CompactionState getCompactionState() { 8347 boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0; 8348 return (hasMajor 8349 ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR) 8350 : (hasMinor ? CompactionState.MINOR : CompactionState.NONE)); 8351 } 8352 8353 public void reportCompactionRequestStart(boolean isMajor) { 8354 (isMajor ? majorInProgress : minorInProgress).incrementAndGet(); 8355 } 8356 8357 public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) { 8358 int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet(); 8359 8360 // metrics 8361 compactionsFinished.increment(); 8362 compactionNumFilesCompacted.add(numFiles); 8363 compactionNumBytesCompacted.add(filesSizeCompacted); 8364 8365 assert newValue >= 0; 8366 } 8367 8368 public void reportCompactionRequestFailure() { 8369 compactionsFailed.increment(); 8370 } 8371 8372 public void incrementCompactionsQueuedCount() { 8373 compactionsQueued.increment(); 8374 } 8375 8376 public void decrementCompactionsQueuedCount() { 8377 compactionsQueued.decrement(); 8378 } 8379 8380 public void incrementFlushesQueuedCount() { 8381 flushesQueued.increment(); 8382 } 8383 8384 protected void decrementFlushesQueuedCount() { 8385 flushesQueued.decrement(); 8386 } 8387 8388 /** 8389 * If a handler thread is eligible for interrupt, make it ineligible. Should be paired with 8390 * {{@link #enableInterrupts()}. 8391 */ 8392 void disableInterrupts() { 8393 regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> false); 8394 } 8395 8396 /** 8397 * If a handler thread was made ineligible for interrupt via {{@link #disableInterrupts()}, make 8398 * it eligible again. No-op if interrupts are already enabled. 
8399 */ 8400 void enableInterrupts() { 8401 regionLockHolders.computeIfPresent(Thread.currentThread(), (t, b) -> true); 8402 } 8403 8404 /** 8405 * Interrupt any region options that have acquired the region lock via 8406 * {@link #startRegionOperation(org.apache.hadoop.hbase.regionserver.Region.Operation)}, or 8407 * {@link #startBulkRegionOperation(boolean)}. 8408 */ 8409 private void interruptRegionOperations() { 8410 for (Map.Entry<Thread, Boolean> entry : regionLockHolders.entrySet()) { 8411 // An entry in this map will have a boolean value indicating if it is currently 8412 // eligible for interrupt; if so, we should interrupt it. 8413 if (entry.getValue().booleanValue()) { 8414 entry.getKey().interrupt(); 8415 } 8416 } 8417 } 8418 8419 /** 8420 * Check thread interrupt status and throw an exception if interrupted. 8421 * @throws NotServingRegionException if region is closing 8422 * @throws InterruptedIOException if interrupted but region is not closing 8423 */ 8424 // Package scope for tests 8425 void checkInterrupt() throws NotServingRegionException, InterruptedIOException { 8426 if (Thread.interrupted()) { 8427 if (this.closing.get()) { 8428 throw new NotServingRegionException( 8429 getRegionInfo().getRegionNameAsString() + " is closing"); 8430 } 8431 throw new InterruptedIOException(); 8432 } 8433 } 8434 8435 /** 8436 * Throw the correct exception upon interrupt 8437 * @param t cause 8438 */ 8439 // Package scope for tests 8440 IOException throwOnInterrupt(Throwable t) { 8441 if (this.closing.get()) { 8442 return (NotServingRegionException) new NotServingRegionException( 8443 getRegionInfo().getRegionNameAsString() + " is closing").initCause(t); 8444 } 8445 return (InterruptedIOException) new InterruptedIOException().initCause(t); 8446 } 8447 8448 /** 8449 * {@inheritDoc} 8450 */ 8451 @Override 8452 public void onConfigurationChange(Configuration conf) { 8453 this.storeHotnessProtector.update(conf); 8454 // update coprocessorHost if the configuration has 
changed. 8455 if ( 8456 CoprocessorConfigurationUtil.checkConfigurationChange(getReadOnlyConfiguration(), conf, 8457 CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, 8458 CoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY) 8459 ) { 8460 LOG.info("Update the system coprocessors because the configuration has changed"); 8461 decorateRegionConfiguration(conf); 8462 this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf); 8463 } 8464 } 8465 8466 /** 8467 * {@inheritDoc} 8468 */ 8469 @Override 8470 public void registerChildren(ConfigurationManager manager) { 8471 configurationManager = manager; 8472 stores.values().forEach(manager::registerObserver); 8473 } 8474 8475 /** 8476 * {@inheritDoc} 8477 */ 8478 @Override 8479 public void deregisterChildren(ConfigurationManager manager) { 8480 stores.values().forEach(configurationManager::deregisterObserver); 8481 } 8482 8483 @Override 8484 public CellComparator getCellComparator() { 8485 return cellComparator; 8486 } 8487 8488 public long getMemStoreFlushSize() { 8489 return this.memstoreFlushSize; 8490 } 8491 8492 //// method for debugging tests 8493 void throwException(String title, String regionName) { 8494 StringBuilder buf = new StringBuilder(); 8495 buf.append(title + ", "); 8496 buf.append(getRegionInfo().toString()); 8497 buf.append(getRegionInfo().isMetaRegion() ? 
" meta region " : " "); 8498 buf.append("stores: "); 8499 for (HStore s : stores.values()) { 8500 buf.append(s.getColumnFamilyDescriptor().getNameAsString()); 8501 buf.append(" size: "); 8502 buf.append(s.getMemStoreSize().getDataSize()); 8503 buf.append(" "); 8504 } 8505 buf.append("end-of-stores"); 8506 buf.append(", memstore size "); 8507 buf.append(getMemStoreDataSize()); 8508 if (getRegionInfo().getRegionNameAsString().startsWith(regionName)) { 8509 throw new RuntimeException(buf.toString()); 8510 } 8511 } 8512 8513 @Override 8514 public void requestCompaction(String why, int priority, boolean major, 8515 CompactionLifeCycleTracker tracker) throws IOException { 8516 if (major) { 8517 stores.values().forEach(HStore::triggerMajorCompaction); 8518 } 8519 rsServices.getCompactionRequestor().requestCompaction(this, why, priority, tracker, 8520 RpcServer.getRequestUser().orElse(null)); 8521 } 8522 8523 @Override 8524 public void requestCompaction(byte[] family, String why, int priority, boolean major, 8525 CompactionLifeCycleTracker tracker) throws IOException { 8526 HStore store = stores.get(family); 8527 if (store == null) { 8528 throw new NoSuchColumnFamilyException("column family " + Bytes.toString(family) 8529 + " does not exist in region " + getRegionInfo().getRegionNameAsString()); 8530 } 8531 if (major) { 8532 store.triggerMajorCompaction(); 8533 } 8534 rsServices.getCompactionRequestor().requestCompaction(this, store, why, priority, tracker, 8535 RpcServer.getRequestUser().orElse(null)); 8536 } 8537 8538 private void requestFlushIfNeeded() throws RegionTooBusyException { 8539 if (isFlushSize(this.memStoreSizing.getMemStoreSize())) { 8540 requestFlush(); 8541 } 8542 } 8543 8544 private void requestFlush() { 8545 if (this.rsServices == null) { 8546 return; 8547 } 8548 requestFlush0(FlushLifeCycleTracker.DUMMY); 8549 } 8550 8551 private void requestFlush0(FlushLifeCycleTracker tracker) { 8552 boolean shouldFlush = false; 8553 synchronized (writestate) { 8554 
if (!this.writestate.isFlushRequested()) { 8555 shouldFlush = true; 8556 writestate.flushRequested = true; 8557 } 8558 } 8559 if (shouldFlush) { 8560 // Make request outside of synchronize block; HBASE-818. 8561 this.rsServices.getFlushRequester().requestFlush(this, tracker); 8562 if (LOG.isDebugEnabled()) { 8563 LOG.debug("Flush requested on " + this.getRegionInfo().getEncodedName()); 8564 } 8565 } else { 8566 tracker.notExecuted("Flush already requested on " + this); 8567 } 8568 } 8569 8570 @Override 8571 public void requestFlush(FlushLifeCycleTracker tracker) throws IOException { 8572 requestFlush0(tracker); 8573 } 8574 8575 /** 8576 * This method modifies the region's configuration in order to inject replication-related features 8577 * @param conf region configurations 8578 */ 8579 private static void decorateRegionConfiguration(Configuration conf) { 8580 if (ReplicationUtils.isReplicationForBulkLoadDataEnabled(conf)) { 8581 String plugins = conf.get(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, ""); 8582 String replicationCoprocessorClass = ReplicationObserver.class.getCanonicalName(); 8583 if (!plugins.contains(replicationCoprocessorClass)) { 8584 conf.set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, 8585 (plugins.equals("") ? "" : (plugins + ",")) + replicationCoprocessorClass); 8586 } 8587 } 8588 } 8589 8590 public void addReadRequestsCount(long readRequestsCount) { 8591 this.readRequestsCount.add(readRequestsCount); 8592 } 8593 8594 public void addWriteRequestsCount(long writeRequestsCount) { 8595 this.writeRequestsCount.add(writeRequestsCount); 8596 } 8597 8598 @RestrictedApi(explanation = "Should only be called in tests", link = "", 8599 allowedOnPath = ".*/src/test/.*") 8600 boolean isReadsEnabled() { 8601 return this.writestate.readsEnabled; 8602 } 8603}