/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.assignment;

import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE;
import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_SPLIT;
import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY;
import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY;
import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME;
import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT;

import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.util.FutureUtils;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;

/**
 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
 * called RIT, i.e, RegionInTransition.
 * <p/>
 * It can be used to assign/unassign/reopen/move a region, and for
 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
 * specify a targetServer, we will select one randomly.
 * <p/>
 * The typical state transition for assigning a region is:
 *
 * <pre>
 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
 * </pre>
 *
 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
 * try again.
 * <p/>
 * The typical state transition for unassigning a region is:
 *
 * <pre>
 * CLOSE -----> CONFIRM_CLOSED
 * </pre>
 *
 * Here things go a bit different, if there are failures, especially that if there is a server
 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
 * then go through the normal way to unassign it.
 * <p/>
 * The typical state transition for reopening/moving a region is:
 *
 * <pre>
 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
 * </pre>
 *
 * The retry logic is the same with the above assign/unassign.
 * <p/>
 * Notice that, although we allow specifying a target server, it just acts as a candidate, we do
 * not guarantee that the region will finally be on the target server. If this is important for
 * you, you should check whether the region is on the target server after the procedure is
 * finished.
 * <p/>
 * Alternatively, for trying retaining assignments, the
 * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with
 * <b>hbase.master.scp.retain.assignment</b>.
 * <p/>
 * When you want to schedule a TRSP, please check whether there is still one for this region, and
 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
 */
@InterfaceAudience.Private
public class TransitRegionStateProcedure
  extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {

  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  private TransitionType type;

  private RegionStateTransitionState initialState;

  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  private boolean forceNewPlan;

  private RetryCounter retryCounter;

  private RegionRemoteProcedureBase remoteProc;

  private boolean evictCache;

  private boolean isSplit;

  // Not part of the serialized state, see serializeStateData/deserializeStateData. It is created
  // by the constructor which takes a MasterProcedureEnv, and lazily in
  // checkAndWaitForOriginalServer when it is still null there.
  private RetryCounter forceRetainmentRetryCounter;

  private long forceRetainmentTotalWait;

  // The pending future handed to ProcedureFutureUtil; execute() keeps the RegionStateNode lock
  // while this is not null.
  private CompletableFuture<Void> future;

  /**
   * No-arg constructor; all the fields are expected to be filled in later by
   * {@link #deserializeStateData(ProcedureStateSerializer)}.
   */
  public TransitRegionStateProcedure() {
  }

  /**
   * Derive {@link #initialState} and {@link #lastState} from {@link #type}.
   * @throws IllegalArgumentException if the type is not one of the known transition types
   */
  private void setInitialAndLastState() {
    switch (type) {
      case ASSIGN:
        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE;
        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
        break;
      case UNASSIGN:
        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED;
        break;
      case MOVE:
      case REOPEN:
        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
        break;
      default:
        throw new IllegalArgumentException("Unknown TransitionType: " + type);
    }
  }

  /**
   * Create a procedure for the given region and transition type.
   * @param env             the master procedure environment
   * @param hri             the region to transit
   * @param assignCandidate the candidate server to assign the region to, may be null. For
   *                        {@link TransitionType#REOPEN} it is replaced with the current location
   *                        of the region.
   * @param forceNewPlan    whether to drop the current region location when queueing the assign
   * @param type            the transition type
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type) {
    super(env, hri);
    this.assignCandidate = assignCandidate;
    this.forceNewPlan = forceNewPlan;
    this.type = type;
    setInitialAndLastState();

    // when do reopen TRSP, let the rs know the targetServer so it can keep some info on close
    if (type == TransitionType.REOPEN) {
      this.assignCandidate = getRegionStateNode(env).getRegionLocation();
    }
    evictCache =
      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE);
    initForceRetainmentRetryCounter(env);
  }

  /**
   * (Re)create {@link #forceRetainmentRetryCounter} and reset the accumulated wait time, but only
   * when force region retainment is enabled on the assignment manager. Otherwise this is a no-op.
   */
  private void initForceRetainmentRetryCounter(MasterProcedureEnv env) {
    if (env.getAssignmentManager().isForceRegionRetainment()) {
      forceRetainmentRetryCounter =
        new RetryCounter(env.getAssignmentManager().getForceRegionRetainmentRetries(),
          env.getAssignmentManager().getForceRegionRetainmentWaitInterval(), TimeUnit.MILLISECONDS);
      forceRetainmentTotalWait = 0;
    }
  }

  /**
   * Same as the five-argument constructor, additionally recording whether the transition is done
   * on behalf of a split/merge.
   */
  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
    ServerName assignCandidate, boolean forceNewPlan, TransitionType type, boolean isSplit) {
    this(env, hri, assignCandidate, forceNewPlan, type);
    this.isSplit = isSplit;
  }

  @Override
  public TableOperationType getTableOperationType() {
    // TODO: maybe we should make another type here, REGION_TRANSITION?
    return TableOperationType.REGION_EDIT;
  }

  @Override
  protected boolean waitInitialized(MasterProcedureEnv env) {
    if (TableName.isMetaTableName(getTableName())) {
      return false;
    }
    // First we need meta to be loaded, and second, if meta is not online then we will likely to
    // fail when updating meta so we wait until it is assigned.
    AssignmentManager am = env.getAssignmentManager();
    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
  }

  /**
   * Used when force region retainment is on. Checks whether a server with the same hostname and
   * port as {@code lastHost} is in the destination servers list. If it is not, and the retry
   * counter still allows retrying, the procedure is suspended with a timeout. Otherwise the
   * outcome is logged, the retry counter is reset and the method returns so the caller can go on
   * with the assignment.
   * @param env      the master procedure environment
   * @param lastHost the server which we want the region to go back to
   * @throws ProcedureSuspendedException if we decide to wait for the server to come back
   */
  private void checkAndWaitForOriginalServer(MasterProcedureEnv env, ServerName lastHost)
    throws ProcedureSuspendedException {
    if (forceRetainmentRetryCounter == null) {
      // The counter is only created by the constructor which takes a MasterProcedureEnv and it is
      // not part of the serialized state, so a procedure which has been restored through
      // deserializeStateData arrives here without one. Create it lazily instead of failing with
      // a NullPointerException below.
      initForceRetainmentRetryCounter(env);
    }
    ServerManager serverManager = env.getMasterServices().getServerManager();
    ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost);
    boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer);

    if (
      !isOnline && forceRetainmentRetryCounter != null
        && forceRetainmentRetryCounter.shouldRetry()
    ) {
      int backoff =
        Math.toIntExact(forceRetainmentRetryCounter.getBackoffTimeAndIncrementAttempts());
      forceRetainmentTotalWait += backoff;
      LOG.info(
        "Suspending the TRSP PID={} for {}ms because {} is true and previous host {} "
          + "for region is not yet online.",
        this.getProcId(), backoff, FORCE_REGION_RETAINMENT, lastHost);
      setTimeout(backoff);
      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
      throw new ProcedureSuspendedException();
    }
    LOG.info(
      "{} is true. TRSP PID={} waited {}ms for host {} to come back online. "
        + "Did host come back online? {}",
      FORCE_REGION_RETAINMENT, this.getProcId(), forceRetainmentTotalWait, lastHost, isOnline);
    initForceRetainmentRetryCounter(env);
  }

  /**
   * Handle the GET_ASSIGN_CANDIDATE state: decide the location to put on the region node, queue
   * the region node to the assignment manager and move on to the OPEN state.
   * @throws ProcedureSuspendedException if we need to wait for the original server (force
   *                                     retainment) or for the region node's procedure event
   */
  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
    throws ProcedureSuspendedException {
    boolean retain = false;
    if (forceNewPlan) {
      // set the region location to null if forceNewPlan is true
      regionNode.setRegionLocation(null);
    } else {
      if (assignCandidate != null) {
        retain = assignCandidate.equals(regionNode.getLastHost());
        regionNode.setRegionLocation(assignCandidate);
      } else if (regionNode.getLastHost() != null) {
        retain = true;
        LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(),
          regionNode.getRegionInfo().getEncodedName());
        regionNode.setRegionLocation(regionNode.getLastHost());
      }
      if (
        regionNode.getRegionLocation() != null
          && env.getAssignmentManager().isForceRegionRetainment()
      ) {
        LOG.warn("{} is set to true. This may delay regions re-assignment "
          + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT);
        checkAndWaitForOriginalServer(env, regionNode.getRegionLocation());
      }
    }
    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
      forceNewPlan, retain);
    env.getAssignmentManager().queueAssign(regionNode);
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
      throw new ProcedureSuspendedException();
    }
  }

  private CompletableFuture<Void> getFuture() {
    return future;
  }

  private void setFuture(CompletableFuture<Void> f) {
    future = f;
  }

  /**
   * Schedule the {@link OpenRegionProcedure} as a child procedure and move on to the
   * CONFIRM_OPENED state. Passed to {@link ProcedureFutureUtil} as the action to run once the
   * future returned by {@code regionOpening} is done.
   */
  private void openRegionAfterUpdatingMeta(ServerName loc) {
    addChildProcedure(new OpenRegionProcedure(this, getRegion(), loc));
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
  }

  /**
   * Handle the OPEN state. If the region node has no usable location we jump back to the
   * GET_ASSIGN_CANDIDATE state by throwing an exception, otherwise we mark the region as opening
   * and schedule the open region procedure.
   * @throws IOException                 if the location is null or bogus
   * @throws ProcedureSuspendedException if we need to wait for the pending future
   */
  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    ServerName loc = regionNode.getRegionLocation();
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> openRegionAfterUpdatingMeta(loc))
    ) {
      return;
    }
    if (loc == null || BOGUS_SERVER_NAME.equals(loc)) {
      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      throw new HBaseIOException("Failed to open region, the location is null or bogus.");
    }
    ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
      env.getAssignmentManager().regionOpening(regionNode), env,
      () -> openRegionAfterUpdatingMeta(loc));
  }

  /**
   * Mark this procedure as failed because the max assign attempts have been exceeded, and detach
   * it from the region node.
   */
  private void regionFailedOpenAfterUpdatingMeta(MasterProcedureEnv env,
    RegionStateNode regionNode) {
    setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
      "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
    regionNode.unsetProcedure(this);
  }

  /**
   * Handle the CONFIRM_OPENED state. Finishes, or moves on to the CLOSE state, when the region is
   * in OPEN state; otherwise records the failed open and either gives up or goes back to the
   * GET_ASSIGN_CANDIDATE state with a new plan.
   * @throws IOException                 to let the caller back off and retry when the region
   *                                     failed to open too many times
   * @throws ProcedureSuspendedException if we need to wait for the pending future
   */
  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode))
    ) {
      return Flow.NO_MORE_STATE;
    }
    if (regionNode.isInState(State.OPEN)) {
      retryCounter = null;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        ServerCrashProcedure.updateProgress(env, getParentProcId());
        return Flow.NO_MORE_STATE;
      }
      // It is possible that we arrive here but confirm opened is not the last state, for example,
      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
      // then there will be recovered edits for this region, we'd better make the region online
      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
      // lose data.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      return Flow.HAS_MORE_STATE;
    }

    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
      .incrementAndGetRetries();
    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());

    if (retries >= maxAttempts) {
      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
        env.getAssignmentManager().regionFailedOpen(regionNode, true), env,
        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode));
      return Flow.NO_MORE_STATE;
    }

    // if not giving up, we will not update meta, so the returned CompletableFuture should be a fake
    // one, which should have been completed already. Use a distinct local name so that it does not
    // hide the 'future' field which execute() relies on when deciding whether to release the lock.
    CompletableFuture<Void> failedOpenFuture =
      env.getAssignmentManager().regionFailedOpen(regionNode, false);
    assert failedOpenFuture.isDone();
    // we failed to assign the region, force a new plan
    forceNewPlan = true;
    regionNode.setRegionLocation(null);
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);

    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
      // Throw exception to backoff and retry when failed open too many times
      throw new HBaseIOException(
        "Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
    } else {
      // Here we do not throw exception because we want the region to be online ASAP
      return Flow.HAS_MORE_STATE;
    }
  }

  /**
   * Schedule the {@link CloseRegionProcedure} as a child procedure and move on to the
   * CONFIRM_CLOSED state.
   */
  private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) {
    if (LOG.isDebugEnabled()) {
      // guard so that we only look up the configuration when the message will be logged
      LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit,
        env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT),
        evictCache);
    }
    // Splits/Merges are special cases, rather than deciding on the cache eviction behaviour here at
    // Master, we just need to tell this close is for a split/merge and let RSes decide on the
    // eviction. See HBASE-28811 for more context.
    CloseRegionProcedure closeProc = new CloseRegionProcedure(this, getRegion(),
      regionNode.getRegionLocation(), assignCandidate, isSplit);
    addChildProcedure(closeProc);
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
  }

  /**
   * Handle the CLOSE state. In the normal case we mark the region as closing and schedule the
   * close region procedure; if the region is not in a state which can be closed, we go to the
   * GET_ASSIGN_CANDIDATE state with a new plan instead.
   * @throws ProcedureSuspendedException if we need to wait for the pending future
   */
  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException, ProcedureSuspendedException {
    if (
      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
        () -> closeRegionAfterUpdatingMeta(env, regionNode))
    ) {
      return;
    }
    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
      // this is the normal case
      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
        env.getAssignmentManager().regionClosing(regionNode), env,
        () -> closeRegionAfterUpdatingMeta(env, regionNode));
    } else {
      forceNewPlan = true;
      regionNode.setRegionLocation(null);
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    }
  }

  /**
   * Handle the CONFIRM_CLOSED state.
   * @throws IOException if the region is still in CLOSING state, to let the caller back off and
   *                     retry the close
   */
  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
    throws IOException {
    if (regionNode.isInState(State.CLOSED)) {
      retryCounter = null;
      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
        // we are the last state, finish
        regionNode.unsetProcedure(this);
        return Flow.NO_MORE_STATE;
      }
      // This means we need to open the region again, should be a move or reopen
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
      return Flow.HAS_MORE_STATE;
    }
    if (regionNode.isInState(State.CLOSING)) {
      // This is possible, think the target RS crashes and restarts immediately, the close region
      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      throw new HBaseIOException("Failed to close region");
    }
    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
    // confirmOpened for more details that why we need to reopen the region first even if we just
    // want to close it.
    // The only exception is for non-default replica, where we do not need to deal with recovered
    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
    // deal with this state. For non-default replica, this is usually the same with CLOSED.
    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
    if (
      !RegionReplicaUtil.isDefaultReplica(getRegion())
        && lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED
    ) {
      regionNode.unsetProcedure(this);
      return Flow.NO_MORE_STATE;
    }
    retryCounter = null;
    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
    return Flow.HAS_MORE_STATE;
  }

  // Override to lock RegionStateNode
  @SuppressWarnings("rawtypes")
  @Override
  protected Procedure[] execute(MasterProcedureEnv env)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode =
      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
    if (!regionNode.isLockedBy(this)) {
      regionNode.lock(this, () -> ProcedureFutureUtil.wakeUp(this, env));
    }
    try {
      return super.execute(env);
    } finally {
      if (future == null) {
        // release the lock if there is no pending updating meta operation
        regionNode.unlock(this);
      }
    }
  }

  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
  }

  /**
   * Dispatch to the handler of the given state. An {@link IOException} thrown by a handler is
   * turned into a suspend with backoff, using {@link #retryCounter}.
   */
  @Override
  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    RegionStateNode regionNode = getRegionStateNode(env);
    try {
      switch (state) {
        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
          // Need to do some sanity check for replica region, if the region does not exist at
          // master, do not try to assign the replica region, log error and return.
          if (!RegionReplicaUtil.isDefaultReplica(regionNode.getRegionInfo())) {
            RegionInfo defaultRI =
              RegionReplicaUtil.getRegionInfoForDefaultReplica(regionNode.getRegionInfo());
            if (
              env.getMasterServices().getAssignmentManager().getRegionStates()
                .getRegionStateNode(defaultRI) == null
            ) {
              LOG.error(
                "Cannot assign replica region {} because its primary region {} does not exist.",
                regionNode.getRegionInfo(), defaultRI);
              regionNode.unsetProcedure(this);
              return Flow.NO_MORE_STATE;
            }
          }
          queueAssign(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_OPEN:
          openRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
          return confirmOpened(env, regionNode);
        case REGION_STATE_TRANSITION_CLOSE:
          closeRegion(env, regionNode);
          return Flow.HAS_MORE_STATE;
        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
          return confirmClosed(env, regionNode);
        default:
          throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (IOException e) {
      if (retryCounter == null) {
        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
      }
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      LOG.warn(
        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed "
          + "by other Procedure or operator intervention",
        backoff / 1000, this, regionNode.toShortString(), e);
      throw suspend(Math.toIntExact(backoff), true);
    }
  }

  /**
   * At end of timeout, wake ourselves up so we run again.
   */
  @Override
  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
    setState(ProcedureProtos.ProcedureState.RUNNABLE);
    env.getProcedureScheduler().addFront(this);
    return false; // 'false' means that this procedure handled the timeout
  }

  // Should be called with RegionStateNode locked
  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
    ServerName serverName, TransitionCode code, long seqId, long procId) throws IOException {
    if (remoteProc == null) {
      LOG.warn(
        "There is no outstanding remote region procedure for {}, serverName={}, code={},"
          + " seqId={}, proc={}, should be a retry, ignore",
        regionNode, serverName, code, seqId, this);
      return;
    }
    // The procId could be -1 if it is from an old region server, we need to deal with it so that we
    // can do rolling upgrading.
    if (procId >= 0 && remoteProc.getProcId() != procId) {
      LOG.warn(
        "The pid of remote region procedure for {} is {}, the reported pid={}, serverName={},"
          + " code={}, seqId={}, proc={}, should be a retry, ignore",
        regionNode, remoteProc.getProcId(), procId, serverName, code, seqId, this);
      return;
    }
    remoteProc.reportTransition(env, regionNode, serverName, code, seqId);
  }

  // Should be called with RegionStateNode locked
  public CompletableFuture<Void> serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
    ServerName serverName, boolean forceNewPlan) {
    this.forceNewPlan = forceNewPlan;
    if (remoteProc != null) {
      // this means we are waiting for the sub procedure, so wake it up
      try {
        remoteProc.serverCrashed(env, regionNode, serverName);
      } catch (Exception e) {
        return FutureUtils.failedFuture(e);
      }
      return CompletableFuture.completedFuture(null);
    } else {
      if (regionNode.isInState(State.ABNORMALLY_CLOSED)) {
        // should be a retry, where we have already changed the region state to abnormally closed
        return CompletableFuture.completedFuture(null);
      } else {
        // we are in RUNNING state, just update the region state, and we will process it later.
        return env.getAssignmentManager().regionClosedAbnormally(regionNode);
      }
    }
  }

  void attachRemoteProc(RegionRemoteProcedureBase proc) {
    this.remoteProc = proc;
  }

  void unattachRemoteProc(RegionRemoteProcedureBase proc) {
    assert this.remoteProc == proc;
    this.remoteProc = null;
  }

  // will be called after we finish loading the meta entry for this region.
  // used to change the state of the region node if we have a sub procedure, as we may not persist
  // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
    if (remoteProc != null) {
      remoteProc.stateLoaded(am, regionNode);
    }
  }

  @Override
  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
    throws IOException, InterruptedException {
    // no rollback
    throw new UnsupportedOperationException();
  }

  @Override
  protected RegionStateTransitionState getState(int stateId) {
    return RegionStateTransitionState.forNumber(stateId);
  }

  @Override
  protected int getStateId(RegionStateTransitionState state) {
    return state.getNumber();
  }

  @Override
  protected RegionStateTransitionState getInitialState() {
    return initialState;
  }

  /** Convert the protobuf transition type to the in-memory {@link TransitionType}. */
  private static TransitionType convert(RegionTransitionType type) {
    switch (type) {
      case ASSIGN:
        return TransitionType.ASSIGN;
      case UNASSIGN:
        return TransitionType.UNASSIGN;
      case MOVE:
        return TransitionType.MOVE;
      case REOPEN:
        return TransitionType.REOPEN;
      default:
        throw new IllegalArgumentException("Unknown RegionTransitionType: " + type);
    }
  }

  /** Convert the in-memory {@link TransitionType} to the protobuf transition type. */
  private static RegionTransitionType convert(TransitionType type) {
    switch (type) {
      case ASSIGN:
        return RegionTransitionType.ASSIGN;
      case UNASSIGN:
        return RegionTransitionType.UNASSIGN;
      case MOVE:
        return RegionTransitionType.MOVE;
      case REOPEN:
        return RegionTransitionType.REOPEN;
      default:
        throw new IllegalArgumentException("Unknown TransitionType: " + type);
    }
  }

  @Override
  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
    super.serializeStateData(serializer);
    RegionStateTransitionStateData.Builder builder =
      RegionStateTransitionStateData.newBuilder().setType(convert(type))
        .setForceNewPlan(forceNewPlan).setEvictCache(evictCache).setIsSplit(isSplit);
    if (assignCandidate != null) {
      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
    }
    serializer.serialize(builder.build());
  }

  @Override
  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
    super.deserializeStateData(serializer);
    RegionStateTransitionStateData data =
      serializer.deserialize(RegionStateTransitionStateData.class);
    type = convert(data.getType());
    setInitialAndLastState();
    forceNewPlan = data.getForceNewPlan();
    if (data.hasAssignCandidate()) {
      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
    }
    evictCache = data.getEvictCache();
    isSplit = data.getIsSplit();
  }

  @Override
  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
    MetricsAssignmentManager metrics = env.getAssignmentManager().getAssignmentManagerMetrics();
    switch (type) {
      case ASSIGN:
        return metrics.getAssignProcMetrics();
      case UNASSIGN:
        return metrics.getUnassignProcMetrics();
      case MOVE:
        return metrics.getMoveProcMetrics();
      case REOPEN:
        return metrics.getReopenProcMetrics();
      default:
        throw new IllegalArgumentException("Unknown transition type: " + type);
    }
  }

  @Override
  public void toStringClassDetails(StringBuilder sb) {
    super.toStringClassDetails(sb);
    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
      sb.append(", ASSIGN");
    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
      sb.append(", UNASSIGN");
    } else {
      sb.append(", REOPEN/MOVE");
    }
  }

  /** Set the owner of the procedure to the short name of the request user and return it. */
  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
    TransitRegionStateProcedure proc) {
    proc.setOwner(env.getRequestUser().getShortName());
    return proc;
  }

  public enum TransitionType {
    ASSIGN,
    UNASSIGN,
    MOVE,
    REOPEN
  }

  // Be careful that, when you call these 4 methods below, you need to manually attach the returned
  // procedure with the RegionStateNode, otherwise the procedure will quit immediately without doing
  // anything. See the comment in executeFromState to find out why we need this assumption.
  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
    @Nullable ServerName targetServer) {
    return assign(env, region, false, targetServer);
  }

  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
    boolean forceNewPlan, @Nullable ServerName targetServer) {
    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer, forceNewPlan,
      TransitionType.ASSIGN));
  }

  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN));
  }

  public static TransitRegionStateProcedure unassignSplitMerge(MasterProcedureEnv env,
    RegionInfo region) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN, true));
  }

  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
    return setOwner(env,
      new TransitRegionStateProcedure(env, region, null, false, TransitionType.REOPEN));
  }

  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
    @Nullable ServerName targetServer) {
    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer,
      targetServer == null, TransitionType.MOVE));
  }
}