001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.assignment;
019
020import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_CLOSE;
021import static org.apache.hadoop.hbase.io.hfile.CacheConfig.DEFAULT_EVICT_ON_SPLIT;
022import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_CLOSE_KEY;
023import static org.apache.hadoop.hbase.io.hfile.CacheConfig.EVICT_BLOCKS_ON_SPLIT_KEY;
024import static org.apache.hadoop.hbase.master.LoadBalancer.BOGUS_SERVER_NAME;
025import static org.apache.hadoop.hbase.master.assignment.AssignmentManager.FORCE_REGION_RETAINMENT;
026
027import edu.umd.cs.findbugs.annotations.Nullable;
028import java.io.IOException;
029import java.util.concurrent.CompletableFuture;
030import java.util.concurrent.TimeUnit;
031import org.apache.hadoop.hbase.HBaseIOException;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.RegionReplicaUtil;
036import org.apache.hadoop.hbase.client.RetriesExhaustedException;
037import org.apache.hadoop.hbase.master.MetricsAssignmentManager;
038import org.apache.hadoop.hbase.master.RegionState.State;
039import org.apache.hadoop.hbase.master.ServerManager;
040import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
041import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
042import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
043import org.apache.hadoop.hbase.procedure2.Procedure;
044import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil;
045import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
046import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
047import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
048import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
049import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
050import org.apache.hadoop.hbase.util.FutureUtils;
051import org.apache.hadoop.hbase.util.RetryCounter;
052import org.apache.yetus.audience.InterfaceAudience;
053import org.slf4j.Logger;
054import org.slf4j.LoggerFactory;
055
056import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
057import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionState;
058import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionStateTransitionStateData;
059import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionType;
060import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
061import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
062
063/**
064 * The procedure to deal with the state transition of a region. A region with a TRSP in place is
065 * called RIT, i.e, RegionInTransition.
066 * <p/>
067 * It can be used to assign/unassign/reopen/move a region, and for
068 * {@link #unassign(MasterProcedureEnv, RegionInfo)} and
069 * {@link #reopen(MasterProcedureEnv, RegionInfo)}, you do not need to specify a target server, and
070 * for {@link #assign(MasterProcedureEnv, RegionInfo, ServerName)} and
071 * {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you want to you can provide a
072 * target server. And for {@link #move(MasterProcedureEnv, RegionInfo, ServerName)}, if you do not
073 * specify a targetServer, we will select one randomly.
074 * <p/>
075 * <p/>
076 * The typical state transition for assigning a region is:
077 *
078 * <pre>
079 * GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
080 * </pre>
081 *
082 * Notice that, if there are failures we may go back to the {@code GET_ASSIGN_CANDIDATE} state to
083 * try again.
084 * <p/>
085 * The typical state transition for unassigning a region is:
086 *
087 * <pre>
088 * CLOSE -----> CONFIRM_CLOSED
089 * </pre>
090 *
091 * Here things go a bit different, if there are failures, especially that if there is a server
092 * crash, we will go to the {@code GET_ASSIGN_CANDIDATE} state to bring the region online first, and
093 * then go through the normal way to unassign it.
094 * <p/>
095 * The typical state transition for reopening/moving a region is:
096 *
097 * <pre>
098 * CLOSE -----> CONFIRM_CLOSED -----> GET_ASSIGN_CANDIDATE ------> OPEN -----> CONFIRM_OPENED
099 * </pre>
100 *
101 * The retry logic is the same with the above assign/unassign.
102 * <p/>
103 * Notice that, although we allow specify a target server, it just acts as a candidate, we do not
104 * guarantee that the region will finally be on the target server. If this is important for you, you
105 * should check whether the region is on the target server after the procedure is finished.
 * <p/>
 * Alternatively, to try to retain assignments, the
 * <b>hbase.master.scp.retain.assignment.force</b> option can be used together with
 * <b>hbase.master.scp.retain.assignment</b>.
110 * <p/>
111 * When you want to schedule a TRSP, please check whether there is still one for this region, and
112 * the check should be under the RegionStateNode lock. We will remove the TRSP from a
113 * RegionStateNode when we are done, see the code in {@code reportTransition} method below. There
 * could be at most one TRSP for a given region.
115 */
116@InterfaceAudience.Private
117public class TransitRegionStateProcedure
118  extends AbstractStateMachineRegionProcedure<RegionStateTransitionState> {
119
  private static final Logger LOG = LoggerFactory.getLogger(TransitRegionStateProcedure.class);

  // The kind of transition; set by the constructors or restored in deserializeStateData.
  private TransitionType type;

  // First state of the state machine, derived from type by setInitialAndLastState().
  private RegionStateTransitionState initialState;

  // State at which the procedure is considered finished, also derived from type.
  private RegionStateTransitionState lastState;

  // the candidate where we want to assign the region to.
  private ServerName assignCandidate;

  // When true, queueAssign clears the region location instead of reusing a candidate/last host.
  // Also flipped to true when an open attempt fails or the region cannot be closed normally.
  private boolean forceNewPlan;

  // Created lazily in executeFromState when an IOException is caught; reset to null once a
  // CONFIRM_OPENED/CONFIRM_CLOSED step makes progress.
  private RetryCounter retryCounter;

  // The remote procedure attached via attachRemoteProc, or null if there is none.
  private RegionRemoteProcedureBase remoteProc;

  // Read from EVICT_BLOCKS_ON_CLOSE_KEY in the constructor; serialized with the procedure.
  private boolean evictCache;

  // Whether this close is part of a split/merge; handed to CloseRegionProcedure.
  private boolean isSplit;

  // Only created when the AssignmentManager reports force region retainment; not serialized.
  private RetryCounter forceRetainmentRetryCounter;

  // Sum (ms) of the backoffs spent waiting for the previous host; not serialized.
  private long forceRetainmentTotalWait;

  // Pending asynchronous operation, if any; execute() keeps the RegionStateNode lock while this is
  // non-null.
  private CompletableFuture<Void> future;
146
147  public TransitRegionStateProcedure() {
148  }
149
150  private void setInitialAndLastState() {
151    switch (type) {
152      case ASSIGN:
153        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE;
154        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
155        break;
156      case UNASSIGN:
157        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
158        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED;
159        break;
160      case MOVE:
161      case REOPEN:
162        initialState = RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE;
163        lastState = RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED;
164        break;
165      default:
166        throw new IllegalArgumentException("Unknown TransitionType: " + type);
167    }
168  }
169
170  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
171    ServerName assignCandidate, boolean forceNewPlan, TransitionType type) {
172    super(env, hri);
173    this.assignCandidate = assignCandidate;
174    this.forceNewPlan = forceNewPlan;
175    this.type = type;
176    setInitialAndLastState();
177
178    // when do reopen TRSP, let the rs know the targetServer so it can keep some info on close
179    if (type == TransitionType.REOPEN) {
180      this.assignCandidate = getRegionStateNode(env).getRegionLocation();
181    }
182    evictCache =
183      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_CLOSE_KEY, DEFAULT_EVICT_ON_CLOSE);
184    initForceRetainmentRetryCounter(env);
185  }
186
187  private void initForceRetainmentRetryCounter(MasterProcedureEnv env) {
188    if (env.getAssignmentManager().isForceRegionRetainment()) {
189      forceRetainmentRetryCounter =
190        new RetryCounter(env.getAssignmentManager().getForceRegionRetainmentRetries(),
191          env.getAssignmentManager().getForceRegionRetainmentWaitInterval(), TimeUnit.MILLISECONDS);
192      forceRetainmentTotalWait = 0;
193    }
194  }
195
196  protected TransitRegionStateProcedure(MasterProcedureEnv env, RegionInfo hri,
197    ServerName assignCandidate, boolean forceNewPlan, TransitionType type, boolean isSplit) {
198    this(env, hri, assignCandidate, forceNewPlan, type);
199    this.isSplit = isSplit;
200  }
201
202  @Override
203  public TableOperationType getTableOperationType() {
204    // TODO: maybe we should make another type here, REGION_TRANSITION?
205    return TableOperationType.REGION_EDIT;
206  }
207
208  @Override
209  protected boolean waitInitialized(MasterProcedureEnv env) {
210    if (TableName.isMetaTableName(getTableName())) {
211      return false;
212    }
213    // First we need meta to be loaded, and second, if meta is not online then we will likely to
214    // fail when updating meta so we wait until it is assigned.
215    AssignmentManager am = env.getAssignmentManager();
216    return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, getRegion());
217  }
218
219  private void checkAndWaitForOriginalServer(MasterProcedureEnv env, ServerName lastHost)
220    throws ProcedureSuspendedException {
221    ServerManager serverManager = env.getMasterServices().getServerManager();
222    ServerName newNameForServer = serverManager.findServerWithSameHostnamePortWithLock(lastHost);
223    boolean isOnline = serverManager.createDestinationServersList().contains(newNameForServer);
224
225    if (!isOnline && forceRetainmentRetryCounter.shouldRetry()) {
226      int backoff =
227        Math.toIntExact(forceRetainmentRetryCounter.getBackoffTimeAndIncrementAttempts());
228      forceRetainmentTotalWait += backoff;
229      LOG.info(
230        "Suspending the TRSP PID={} for {}ms because {} is true and previous host {} "
231          + "for region is not yet online.",
232        this.getProcId(), backoff, FORCE_REGION_RETAINMENT, lastHost);
233      setTimeout(backoff);
234      setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
235      throw new ProcedureSuspendedException();
236    }
237    LOG.info(
238      "{} is true. TRSP PID={} waited {}ms for host {} to come back online. "
239        + "Did host come back online? {}",
240      FORCE_REGION_RETAINMENT, this.getProcId(), forceRetainmentTotalWait, lastHost, isOnline);
241    initForceRetainmentRetryCounter(env);
242  }
243
244  private void queueAssign(MasterProcedureEnv env, RegionStateNode regionNode)
245    throws ProcedureSuspendedException {
246    boolean retain = false;
247    if (forceNewPlan) {
248      // set the region location to null if forceNewPlan is true
249      regionNode.setRegionLocation(null);
250    } else {
251      if (assignCandidate != null) {
252        retain = assignCandidate.equals(regionNode.getLastHost());
253        regionNode.setRegionLocation(assignCandidate);
254      } else if (regionNode.getLastHost() != null) {
255        retain = true;
256        LOG.info("Setting lastHost {} as the location for region {}", regionNode.getLastHost(),
257          regionNode.getRegionInfo().getEncodedName());
258        regionNode.setRegionLocation(regionNode.getLastHost());
259      }
260      if (
261        regionNode.getRegionLocation() != null
262          && env.getAssignmentManager().isForceRegionRetainment()
263      ) {
264        LOG.warn("{} is set to true. This may delay regions re-assignment "
265          + "upon RegionServers crashes or restarts.", FORCE_REGION_RETAINMENT);
266        checkAndWaitForOriginalServer(env, regionNode.getRegionLocation());
267      }
268    }
269    LOG.info("Starting {}; {}; forceNewPlan={}, retain={}", this, regionNode.toShortString(),
270      forceNewPlan, retain);
271    env.getAssignmentManager().queueAssign(regionNode);
272    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_OPEN);
273    if (regionNode.getProcedureEvent().suspendIfNotReady(this)) {
274      throw new ProcedureSuspendedException();
275    }
276  }
277
278  private CompletableFuture<Void> getFuture() {
279    return future;
280  }
281
282  private void setFuture(CompletableFuture<Void> f) {
283    future = f;
284  }
285
286  private void openRegionAfterUpdatingMeta(ServerName loc) {
287    addChildProcedure(new OpenRegionProcedure(this, getRegion(), loc));
288    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED);
289  }
290
291  private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode)
292    throws IOException, ProcedureSuspendedException {
293    ServerName loc = regionNode.getRegionLocation();
294    if (
295      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
296        () -> openRegionAfterUpdatingMeta(loc))
297    ) {
298      return;
299    }
300    if (loc == null || BOGUS_SERVER_NAME.equals(loc)) {
301      LOG.warn("No location specified for {}, jump back to state {} to get one", getRegion(),
302        RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
303      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
304      throw new HBaseIOException("Failed to open region, the location is null or bogus.");
305    }
306    ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
307      env.getAssignmentManager().regionOpening(regionNode), env,
308      () -> openRegionAfterUpdatingMeta(loc));
309  }
310
311  private void regionFailedOpenAfterUpdatingMeta(MasterProcedureEnv env,
312    RegionStateNode regionNode) {
313    setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
314      "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
315    regionNode.unsetProcedure(this);
316  }
317
318  private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
319    throws IOException, ProcedureSuspendedException {
320    if (
321      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
322        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode))
323    ) {
324      return Flow.NO_MORE_STATE;
325    }
326    if (regionNode.isInState(State.OPEN)) {
327      retryCounter = null;
328      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
329        // we are the last state, finish
330        regionNode.unsetProcedure(this);
331        ServerCrashProcedure.updateProgress(env, getParentProcId());
332        return Flow.NO_MORE_STATE;
333      }
334      // It is possible that we arrive here but confirm opened is not the last state, for example,
335      // when merging or splitting a region, we unassign the region from a RS and the RS is crashed,
336      // then there will be recovered edits for this region, we'd better make the region online
337      // again and then unassign it, otherwise we have to fail the merge/split procedure as we may
338      // loss data.
339      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
340      return Flow.HAS_MORE_STATE;
341    }
342
343    int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
344      .incrementAndGetRetries();
345    int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
346    LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
347
348    if (retries >= maxAttempts) {
349      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
350        env.getAssignmentManager().regionFailedOpen(regionNode, true), env,
351        () -> regionFailedOpenAfterUpdatingMeta(env, regionNode));
352      return Flow.NO_MORE_STATE;
353    }
354
355    // if not giving up, we will not update meta, so the returned CompletableFuture should be a fake
356    // one, which should have been completed already
357    CompletableFuture<Void> future = env.getAssignmentManager().regionFailedOpen(regionNode, false);
358    assert future.isDone();
359    // we failed to assign the region, force a new plan
360    forceNewPlan = true;
361    regionNode.setRegionLocation(null);
362    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
363
364    if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
365      // Throw exception to backoff and retry when failed open too many times
366      throw new HBaseIOException(
367        "Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
368    } else {
369      // Here we do not throw exception because we want to the region to be online ASAP
370      return Flow.HAS_MORE_STATE;
371    }
372  }
373
374  private void closeRegionAfterUpdatingMeta(MasterProcedureEnv env, RegionStateNode regionNode) {
375    LOG.debug("Close region: isSplit: {}: evictOnSplit: {}: evictOnClose: {}", isSplit,
376      env.getMasterConfiguration().getBoolean(EVICT_BLOCKS_ON_SPLIT_KEY, DEFAULT_EVICT_ON_SPLIT),
377      evictCache);
378    // Splits/Merges are special cases, rather than deciding on the cache eviction behaviour here at
379    // Master, we just need to tell this close is for a split/merge and let RSes decide on the
380    // eviction. See HBASE-28811 for more context.
381    CloseRegionProcedure closeProc = new CloseRegionProcedure(this, getRegion(),
382      regionNode.getRegionLocation(), assignCandidate, isSplit);
383    addChildProcedure(closeProc);
384    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED);
385  }
386
387  private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode)
388    throws IOException, ProcedureSuspendedException {
389    if (
390      ProcedureFutureUtil.checkFuture(this, this::getFuture, this::setFuture,
391        () -> closeRegionAfterUpdatingMeta(env, regionNode))
392    ) {
393      return;
394    }
395    if (regionNode.isInState(State.OPEN, State.CLOSING, State.MERGING, State.SPLITTING)) {
396      // this is the normal case
397      ProcedureFutureUtil.suspendIfNecessary(this, this::setFuture,
398        env.getAssignmentManager().regionClosing(regionNode), env,
399        () -> closeRegionAfterUpdatingMeta(env, regionNode));
400    } else {
401      forceNewPlan = true;
402      regionNode.setRegionLocation(null);
403      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
404    }
405  }
406
407  private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
408    throws IOException {
409    if (regionNode.isInState(State.CLOSED)) {
410      retryCounter = null;
411      if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
412        // we are the last state, finish
413        regionNode.unsetProcedure(this);
414        return Flow.NO_MORE_STATE;
415      }
416      // This means we need to open the region again, should be a move or reopen
417      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
418      return Flow.HAS_MORE_STATE;
419    }
420    if (regionNode.isInState(State.CLOSING)) {
421      // This is possible, think the target RS crashes and restarts immediately, the close region
422      // operation will return a NotServingRegionException soon, we can only recover after SCP takes
423      // care of this RS. So here we throw an IOException to let upper layer to retry with backoff.
424      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
425      throw new HBaseIOException("Failed to close region");
426    }
427    // abnormally closed, need to reopen it, no matter what is the last state, see the comment in
428    // confirmOpened for more details that why we need to reopen the region first even if we just
429    // want to close it.
430    // The only exception is for non-default replica, where we do not need to deal with recovered
431    // edits. Notice that the region will remain in ABNORMALLY_CLOSED state, the upper layer need to
432    // deal with this state. For non-default replica, this is usually the same with CLOSED.
433    assert regionNode.isInState(State.ABNORMALLY_CLOSED);
434    if (
435      !RegionReplicaUtil.isDefaultReplica(getRegion())
436        && lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED
437    ) {
438      regionNode.unsetProcedure(this);
439      return Flow.NO_MORE_STATE;
440    }
441    retryCounter = null;
442    setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
443    return Flow.HAS_MORE_STATE;
444  }
445
446  // Override to lock RegionStateNode
447  @SuppressWarnings("rawtypes")
448  @Override
449  protected Procedure[] execute(MasterProcedureEnv env)
450    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
451    RegionStateNode regionNode =
452      env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
453    if (!regionNode.isLockedBy(this)) {
454      regionNode.lock(this, () -> ProcedureFutureUtil.wakeUp(this, env));
455    }
456    try {
457      return super.execute(env);
458    } finally {
459      if (future == null) {
460        // release the lock if there is no pending updating meta operation
461        regionNode.unlock(this);
462      }
463    }
464  }
465
466  private RegionStateNode getRegionStateNode(MasterProcedureEnv env) {
467    return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion());
468  }
469
470  @Override
471  protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionState state)
472    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
473    RegionStateNode regionNode = getRegionStateNode(env);
474    try {
475      switch (state) {
476        case REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE:
477          // Need to do some sanity check for replica region, if the region does not exist at
478          // master, do not try to assign the replica region, log error and return.
479          if (!RegionReplicaUtil.isDefaultReplica(regionNode.getRegionInfo())) {
480            RegionInfo defaultRI =
481              RegionReplicaUtil.getRegionInfoForDefaultReplica(regionNode.getRegionInfo());
482            if (
483              env.getMasterServices().getAssignmentManager().getRegionStates()
484                .getRegionStateNode(defaultRI) == null
485            ) {
486              LOG.error(
487                "Cannot assign replica region {} because its primary region {} does not exist.",
488                regionNode.getRegionInfo(), defaultRI);
489              regionNode.unsetProcedure(this);
490              return Flow.NO_MORE_STATE;
491            }
492          }
493          queueAssign(env, regionNode);
494          return Flow.HAS_MORE_STATE;
495        case REGION_STATE_TRANSITION_OPEN:
496          openRegion(env, regionNode);
497          return Flow.HAS_MORE_STATE;
498        case REGION_STATE_TRANSITION_CONFIRM_OPENED:
499          return confirmOpened(env, regionNode);
500        case REGION_STATE_TRANSITION_CLOSE:
501          closeRegion(env, regionNode);
502          return Flow.HAS_MORE_STATE;
503        case REGION_STATE_TRANSITION_CONFIRM_CLOSED:
504          return confirmClosed(env, regionNode);
505        default:
506          throw new UnsupportedOperationException("unhandled state=" + state);
507      }
508    } catch (IOException e) {
509      if (retryCounter == null) {
510        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
511      }
512      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
513      LOG.warn(
514        "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed "
515          + "by other Procedure or operator intervention",
516        backoff / 1000, this, regionNode.toShortString(), e);
517      throw suspend(Math.toIntExact(backoff), true);
518    }
519  }
520
521  /**
522   * At end of timeout, wake ourselves up so we run again.
523   */
524  @Override
525  protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
526    setState(ProcedureProtos.ProcedureState.RUNNABLE);
527    env.getProcedureScheduler().addFront(this);
528    return false; // 'false' means that this procedure handled the timeout
529  }
530
531  // Should be called with RegionStateNode locked
532  public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode,
533    ServerName serverName, TransitionCode code, long seqId, long procId) throws IOException {
534    if (remoteProc == null) {
535      LOG.warn(
536        "There is no outstanding remote region procedure for {}, serverName={}, code={},"
537          + " seqId={}, proc={}, should be a retry, ignore",
538        regionNode, serverName, code, seqId, this);
539      return;
540    }
541    // The procId could be -1 if it is from an old region server, we need to deal with it so that we
542    // can do rolling upgraing.
543    if (procId >= 0 && remoteProc.getProcId() != procId) {
544      LOG.warn(
545        "The pid of remote region procedure for {} is {}, the reported pid={}, serverName={},"
546          + " code={}, seqId={}, proc={}, should be a retry, ignore",
547        regionNode, remoteProc.getProcId(), procId, serverName, code, seqId, this);
548      return;
549    }
550    remoteProc.reportTransition(env, regionNode, serverName, code, seqId);
551  }
552
553  // Should be called with RegionStateNode locked
554  public CompletableFuture<Void> serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode,
555    ServerName serverName, boolean forceNewPlan) {
556    this.forceNewPlan = forceNewPlan;
557    if (remoteProc != null) {
558      // this means we are waiting for the sub procedure, so wake it up
559      try {
560        remoteProc.serverCrashed(env, regionNode, serverName);
561      } catch (Exception e) {
562        return FutureUtils.failedFuture(e);
563      }
564      return CompletableFuture.completedFuture(null);
565    } else {
566      if (regionNode.isInState(State.ABNORMALLY_CLOSED)) {
567        // should be a retry, where we have already changed the region state to abnormally closed
568        return CompletableFuture.completedFuture(null);
569      } else {
570        // we are in RUNNING state, just update the region state, and we will process it later.
571        return env.getAssignmentManager().regionClosedAbnormally(regionNode);
572      }
573    }
574  }
575
576  void attachRemoteProc(RegionRemoteProcedureBase proc) {
577    this.remoteProc = proc;
578  }
579
580  void unattachRemoteProc(RegionRemoteProcedureBase proc) {
581    assert this.remoteProc == proc;
582    this.remoteProc = null;
583  }
584
585  // will be called after we finish loading the meta entry for this region.
586  // used to change the state of the region node if we have a sub procedure, as we may not persist
587  // the state to meta yet. See the code in RegionRemoteProcedureBase.execute for more details.
588  void stateLoaded(AssignmentManager am, RegionStateNode regionNode) {
589    if (remoteProc != null) {
590      remoteProc.stateLoaded(am, regionNode);
591    }
592  }
593
594  @Override
595  protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
596    throws IOException, InterruptedException {
597    // no rollback
598    throw new UnsupportedOperationException();
599  }
600
601  @Override
602  protected RegionStateTransitionState getState(int stateId) {
603    return RegionStateTransitionState.forNumber(stateId);
604  }
605
606  @Override
607  protected int getStateId(RegionStateTransitionState state) {
608    return state.getNumber();
609  }
610
611  @Override
612  protected RegionStateTransitionState getInitialState() {
613    return initialState;
614  }
615
616  private static TransitionType convert(RegionTransitionType type) {
617    switch (type) {
618      case ASSIGN:
619        return TransitionType.ASSIGN;
620      case UNASSIGN:
621        return TransitionType.UNASSIGN;
622      case MOVE:
623        return TransitionType.MOVE;
624      case REOPEN:
625        return TransitionType.REOPEN;
626      default:
627        throw new IllegalArgumentException("Unknown RegionTransitionType: " + type);
628    }
629  }
630
631  private static RegionTransitionType convert(TransitionType type) {
632    switch (type) {
633      case ASSIGN:
634        return RegionTransitionType.ASSIGN;
635      case UNASSIGN:
636        return RegionTransitionType.UNASSIGN;
637      case MOVE:
638        return RegionTransitionType.MOVE;
639      case REOPEN:
640        return RegionTransitionType.REOPEN;
641      default:
642        throw new IllegalArgumentException("Unknown TransitionType: " + type);
643    }
644  }
645
646  @Override
647  protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
648    super.serializeStateData(serializer);
649    RegionStateTransitionStateData.Builder builder =
650      RegionStateTransitionStateData.newBuilder().setType(convert(type))
651        .setForceNewPlan(forceNewPlan).setEvictCache(evictCache).setIsSplit(isSplit);
652    if (assignCandidate != null) {
653      builder.setAssignCandidate(ProtobufUtil.toServerName(assignCandidate));
654    }
655    serializer.serialize(builder.build());
656  }
657
658  @Override
659  protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
660    super.deserializeStateData(serializer);
661    RegionStateTransitionStateData data =
662      serializer.deserialize(RegionStateTransitionStateData.class);
663    type = convert(data.getType());
664    setInitialAndLastState();
665    forceNewPlan = data.getForceNewPlan();
666    if (data.hasAssignCandidate()) {
667      assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate());
668    }
669    evictCache = data.getEvictCache();
670    isSplit = data.getIsSplit();
671  }
672
673  @Override
674  protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) {
675    MetricsAssignmentManager metrics = env.getAssignmentManager().getAssignmentManagerMetrics();
676    switch (type) {
677      case ASSIGN:
678        return metrics.getAssignProcMetrics();
679      case UNASSIGN:
680        return metrics.getUnassignProcMetrics();
681      case MOVE:
682        return metrics.getMoveProcMetrics();
683      case REOPEN:
684        return metrics.getReopenProcMetrics();
685      default:
686        throw new IllegalArgumentException("Unknown transition type: " + type);
687    }
688  }
689
690  @Override
691  public void toStringClassDetails(StringBuilder sb) {
692    super.toStringClassDetails(sb);
693    if (initialState == RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE) {
694      sb.append(", ASSIGN");
695    } else if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
696      sb.append(", UNASSIGN");
697    } else {
698      sb.append(", REOPEN/MOVE");
699    }
700  }
701
702  private static TransitRegionStateProcedure setOwner(MasterProcedureEnv env,
703    TransitRegionStateProcedure proc) {
704    proc.setOwner(env.getRequestUser().getShortName());
705    return proc;
706  }
707
  /**
   * The kind of region transition a {@link TransitRegionStateProcedure} performs. It determines
   * the initial and last states of the state machine and which metrics are reported.
   */
  public enum TransitionType {
    // starts at GET_ASSIGN_CANDIDATE, finishes at CONFIRM_OPENED
    ASSIGN,
    // starts at CLOSE, finishes at CONFIRM_CLOSED
    UNASSIGN,
    // starts at CLOSE, finishes at CONFIRM_OPENED
    MOVE,
    // same states as MOVE; the constructor uses the region's current location as the candidate
    REOPEN
  }
714
  // Be careful that, when you call the factory methods below, you need to manually attach the
  // returned procedure with the RegionStateNode, otherwise the procedure will quit immediately
  // without doing anything. See the comment in executeFromState to find out why we need this
  // assumption.
718  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
719    @Nullable ServerName targetServer) {
720    return assign(env, region, false, targetServer);
721  }
722
723  public static TransitRegionStateProcedure assign(MasterProcedureEnv env, RegionInfo region,
724    boolean forceNewPlan, @Nullable ServerName targetServer) {
725    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer, forceNewPlan,
726      TransitionType.ASSIGN));
727  }
728
729  public static TransitRegionStateProcedure unassign(MasterProcedureEnv env, RegionInfo region) {
730    return setOwner(env,
731      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN));
732  }
733
734  public static TransitRegionStateProcedure unassignSplitMerge(MasterProcedureEnv env,
735    RegionInfo region) {
736    return setOwner(env,
737      new TransitRegionStateProcedure(env, region, null, false, TransitionType.UNASSIGN, true));
738  }
739
740  public static TransitRegionStateProcedure reopen(MasterProcedureEnv env, RegionInfo region) {
741    return setOwner(env,
742      new TransitRegionStateProcedure(env, region, null, false, TransitionType.REOPEN));
743  }
744
745  public static TransitRegionStateProcedure move(MasterProcedureEnv env, RegionInfo region,
746    @Nullable ServerName targetServer) {
747    return setOwner(env, new TransitRegionStateProcedure(env, region, targetServer,
748      targetServer == null, TransitionType.MOVE));
749  }
750}