001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.procedure;
019
020import java.io.IOException;
021import org.apache.hadoop.hbase.ServerName;
022import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException;
023import org.apache.hadoop.hbase.procedure2.Procedure;
024import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
025import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
026import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
027import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher;
028import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
029import org.apache.yetus.audience.InterfaceAudience;
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
034
035@InterfaceAudience.Private
036/**
037 * The base class for Procedures that run {@link java.util.concurrent.Callable}s on a (remote)
038 * RegionServer; e.g. asking a RegionServer to split a WAL file as a sub-procedure of the
039 * ServerCrashProcedure recovery process.
040 * <p>
041 * To implement a new Procedure type, extend this class and override remoteCallBuild() and
042 * complete(). The dispatch and callback will be handled for you here, internally.
043 * <p>
044 * The Procedure works as follows. It uses {@link RSProcedureDispatcher}, the same system used
045 * dispatching Region OPEN and CLOSE RPCs, to pass a Callable to a RegionServer. Examples include
046 * {@link org.apache.hadoop.hbase.regionserver.SplitWALCallable} and
047 * {@link org.apache.hadoop.hbase.replication.regionserver.RefreshPeerCallable}. Rather than
048 * assign/unassign, the Master calls #executeProcedures against the remote RegionServer wrapping a
049 * Callable in a {@link ExecuteProceduresRequest}. Upon successful dispatch, the Procedure then
050 * suspends itself on the Master-side and relinqushes its executor worker. On receipt, the
051 * RegionServer submits the Callable to its executor service. When the Callable completes, it adds
052 * itself to a queue on the RegionServer side for processing by a background thread, the
053 * {@link RemoteProcedureResultReporter}. It picks up the completed Callable from the queue and RPCs
054 * the master at #reportProcedureDone with the procedure id and whether success or failure. The
055 * master calls complete() setting success or failure state and then reschedules the suspended
056 * Procedure so it can finish.
057 * <p>
058 * Here are some details on operation:
059 * <p>
060 * If adding the operation to the dispatcher fails, addOperationToNode will throw
061 * FailedRemoteDispatchException, and this Procedure will return 'null'. The Procedure Executor will
062 * then mark this procedure as 'complete' (though we failed to dispatch our task). In this case, the
063 * upper layer of this procedure must have a way to check if this Procedure really succeeded or not
064 * and have appropriate handling.
065 * <p>
066 * If sending the operation to remote RS failed, dispatcher will call remoteCallFailed() to handle
067 * this which calls remoteOperationDone with the exception. If the targetServer crashed but this
068 * procedure has no response or if we receive failed response, then dispatcher will call
069 * remoteOperationFailed() which also calls remoteOperationDone with the exception. If the operation
070 * is successful, then remoteOperationCompleted will be called and actually calls the
071 * remoteOperationDone without exception. In remoteOperationDone, we'll check if the procedure is
072 * already get wake up by others. Then developer could implement complete() based on their own
073 * purpose. But basic logic is that if operation succeed, set succ to true and do the clean work. If
074 * operation failed and require to resend it to the same server, leave the succ as false. If
075 * operation failed and require to resend it to another server, set succ to true and upper layer
076 * should be able to find out this operation not work and send a operation to another server.
077 */
078public abstract class ServerRemoteProcedure extends Procedure<MasterProcedureEnv>
079  implements RemoteProcedureDispatcher.RemoteProcedure<MasterProcedureEnv, ServerName> {
080  protected static final Logger LOG = LoggerFactory.getLogger(ServerRemoteProcedure.class);
081  protected ProcedureEvent<?> event;
082  protected ServerName targetServer;
083  // after remoteProcedureDone we require error field to decide the next state
084  protected Throwable remoteError;
085  protected MasterProcedureProtos.ServerRemoteProcedureState state =
086    MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_DISPATCH;
087
088  protected abstract boolean complete(MasterProcedureEnv env, Throwable error);
089
090  @Override
091  protected synchronized Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
092    throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
093    if (
094      state != MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_DISPATCH
095    ) {
096      if (complete(env, this.remoteError)) {
097        return null;
098      }
099      state = MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_DISPATCH;
100    }
101    try {
102      env.getRemoteDispatcher().addOperationToNode(targetServer, this);
103    } catch (FailedRemoteDispatchException frde) {
104      LOG.warn("Can not send remote operation {} to {}, this operation will "
105        + "be retried to send to another server", this.getProcId(), targetServer);
106      return null;
107    }
108    event = new ProcedureEvent<>(this);
109    event.suspendIfNotReady(this);
110    throw new ProcedureSuspendedException();
111  }
112
113  @Override
114  protected synchronized void completionCleanup(MasterProcedureEnv env) {
115    env.getRemoteDispatcher().removeCompletedOperation(targetServer, this);
116  }
117
118  @Override
119  public synchronized void remoteCallFailed(MasterProcedureEnv env, ServerName serverName,
120    IOException exception) {
121    state = MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_DISPATCH_FAIL;
122    remoteOperationDone(env, exception);
123  }
124
125  @Override
126  public synchronized void remoteOperationCompleted(MasterProcedureEnv env) {
127    state = MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_REPORT_SUCCEED;
128    remoteOperationDone(env, null);
129  }
130
131  @Override
132  public synchronized void remoteOperationFailed(MasterProcedureEnv env,
133    RemoteProcedureException error) {
134    state = MasterProcedureProtos.ServerRemoteProcedureState.SERVER_REMOTE_PROCEDURE_REPORT_FAILED;
135    remoteOperationDone(env, error);
136  }
137
138  synchronized void remoteOperationDone(MasterProcedureEnv env, Throwable error) {
139    if (this.isFinished()) {
140      LOG.info("This procedure {} is already finished, skip the rest processes", this.getProcId());
141      return;
142    }
143    if (event == null) {
144      LOG.warn("procedure event for {} is null, maybe the procedure is created when recovery",
145        getProcId());
146      return;
147    }
148    this.remoteError = error;
149    // below persistence is added so that if report goes to last active master, it throws exception
150    env.getMasterServices().getMasterProcedureExecutor().getStore().update(this);
151    event.wake(env.getProcedureScheduler());
152    event = null;
153  }
154}