001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.Optional; 022import org.apache.hadoop.hbase.HConstants; 023import org.apache.hadoop.hbase.ServerName; 024import org.apache.hadoop.hbase.TableName; 025import org.apache.hadoop.hbase.client.RegionInfo; 026import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; 027import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; 028import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; 029import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException; 030import org.apache.hadoop.hbase.procedure2.Procedure; 031import org.apache.hadoop.hbase.procedure2.ProcedureEvent; 032import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; 033import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; 034import org.apache.hadoop.hbase.procedure2.ProcedureUtil; 035import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; 036import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; 037import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; 038import org.apache.hadoop.hbase.procedure2.RemoteProcedureException; 039import org.apache.hadoop.hbase.util.RetryCounter; 040import org.apache.yetus.audience.InterfaceAudience; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043 044import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 045import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState; 046import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; 047import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; 048import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; 049import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; 050 051/** 052 * The base class for the remote procedures used to open/close a region. 053 * <p/> 054 * Notice that here we do not care about the result of the remote call, if the remote call is 055 * finished, either succeeded or not, we will always finish the procedure. The parent procedure 056 * should take care of the result and try to reschedule if the result is not good. 057 */ 058@InterfaceAudience.Private 059public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedureEnv> 060 implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> { 061 062 private static final Logger LOG = LoggerFactory.getLogger(RegionRemoteProcedureBase.class); 063 064 protected RegionInfo region; 065 066 protected ServerName targetServer; 067 068 private RegionRemoteProcedureBaseState state = 069 RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; 070 071 private TransitionCode transitionCode; 072 073 private long seqId; 074 075 private RetryCounter retryCounter; 076 077 protected RegionRemoteProcedureBase() { 078 } 079 080 protected RegionRemoteProcedureBase(TransitRegionStateProcedure parent, RegionInfo region, 081 ServerName targetServer) { 082 this.region = region; 083 this.targetServer = targetServer; 084 parent.attachRemoteProc(this); 085 } 086 087 @Override 088 public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(MasterProcedureEnv env, 089 ServerName remote) { 090 // REPORT_SUCCEED means that this remote open/close request already executed in RegionServer. 091 // So return empty operation and RSProcedureDispatcher no need to send it again. 092 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) { 093 return Optional.empty(); 094 } 095 return Optional.of(newRemoteOperation()); 096 } 097 098 protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation(); 099 100 @Override 101 public void remoteOperationCompleted(MasterProcedureEnv env) { 102 // should not be called since we use reportRegionStateTransition to report the result 103 throw new UnsupportedOperationException(); 104 } 105 106 @Override 107 public void remoteOperationFailed(MasterProcedureEnv env, RemoteProcedureException error) { 108 // should not be called since we use reportRegionStateTransition to report the result 109 throw new UnsupportedOperationException(); 110 } 111 112 private RegionStateNode getRegionNode(MasterProcedureEnv env) { 113 return env.getAssignmentManager().getRegionStates().getRegionStateNode(region); 114 } 115 116 @Override 117 public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, IOException exception) { 118 RegionStateNode regionNode = getRegionNode(env); 119 regionNode.lock(); 120 try { 121 if (!env.getMasterServices().getServerManager().isServerOnline(remote)) { 122 // the SCP will interrupt us, give up 123 LOG.debug("{} for region {}, targetServer {} is dead, SCP will interrupt us, give up", this, 124 regionNode, remote); 125 return; 126 } 127 if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { 128 // not sure how can this happen but anyway let's add a check here to avoid waking the wrong 129 // procedure... 130 LOG.warn("{} for region {}, targetServer={} has already been woken up, ignore", this, 131 regionNode, remote); 132 return; 133 } 134 LOG.warn("The remote operation {} for region {} to server {} failed", this, regionNode, 135 remote, exception); 136 // It is OK to not persist the state here, as we do not need to change the region state if the 137 // remote call is failed. If the master crashed before we actually execute the procedure and 138 // persist the new state, it is fine to retry on the same target server again. 139 state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH_FAIL; 140 regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); 141 } finally { 142 regionNode.unlock(); 143 } 144 } 145 146 @Override 147 public TableName getTableName() { 148 return region.getTable(); 149 } 150 151 @Override 152 protected boolean waitInitialized(MasterProcedureEnv env) { 153 if (TableName.isMetaTableName(getTableName())) { 154 return false; 155 } 156 // First we need meta to be loaded, and second, if meta is not online then we will likely to 157 // fail when updating meta so we wait until it is assigned. 158 AssignmentManager am = env.getAssignmentManager(); 159 return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, region); 160 } 161 162 @Override 163 protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException { 164 throw new UnsupportedOperationException(); 165 } 166 167 @Override 168 protected boolean abort(MasterProcedureEnv env) { 169 return false; 170 } 171 172 // do some checks to see if the report is valid 173 protected abstract void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode, 174 long seqId) throws UnexpectedStateException; 175 176 // change the in memory state of the regionNode, but do not update meta. 177 protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env, 178 RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException; 179 180 // A bit strange but the procedure store will throw RuntimeException if we can not persist the 181 // state, so upper layer should take care of this... 182 private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) { 183 // The synchronization here is to guard with ProcedureExecutor.executeRollback, as here we will 184 // not hold the procedure execution lock, but we should not persist a procedure in ROLLEDBACK 185 // state to the procedure store. 186 // The ProcedureStore.update must be inside the lock, so here the check for procedure state and 187 // update could be atomic. In ProcedureExecutor.cleanupAfterRollbackOneStep, we will set the 188 // state to ROLLEDBACK, which will hold the same lock too as the Procedure.setState method is 189 // synchronized. This is the key to keep us safe. 190 synchronized (this) { 191 if (getState() == ProcedureState.ROLLEDBACK) { 192 LOG.warn("Procedure {} has already been rolled back, skip persistent", this); 193 return; 194 } 195 env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); 196 } 197 regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); 198 } 199 200 // should be called with RegionStateNode locked, to avoid race with the execute method below 201 void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName, 202 TransitionCode transitionCode, long seqId) throws IOException { 203 if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { 204 // should be a retry 205 return; 206 } 207 if (!targetServer.equals(serverName)) { 208 throw new UnexpectedStateException("Received report from " + serverName + ", expected " 209 + targetServer + ", " + regionNode + ", proc=" + this); 210 } 211 checkTransition(regionNode, transitionCode, seqId); 212 // this state means we have received the report from RS, does not mean the result is fine, as we 213 // may received a FAILED_OPEN. 214 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED; 215 this.transitionCode = transitionCode; 216 this.seqId = seqId; 217 // Persist the transition code and openSeqNum(if provided). 218 // We should not update the hbase:meta directly as this may cause races when master restarts, 219 // as the old active master may incorrectly report back to RS and cause the new master to hang 220 // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details. 221 boolean succ = false; 222 try { 223 persistAndWake(env, regionNode); 224 succ = true; 225 } finally { 226 if (!succ) { 227 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; 228 this.transitionCode = null; 229 this.seqId = HConstants.NO_SEQNUM; 230 } 231 } 232 try { 233 updateTransitionWithoutPersistingToMeta(env, regionNode, transitionCode, seqId); 234 } catch (IOException e) { 235 throw new AssertionError("should not happen", e); 236 } 237 } 238 239 void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) { 240 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH) { 241 // should be a retry 242 return; 243 } 244 RegionRemoteProcedureBaseState oldState = state; 245 // it is possible that the state is in REGION_REMOTE_PROCEDURE_SERVER_CRASH, think of this 246 // sequence 247 // 1. region is open on the target server and the above reportTransition call is succeeded 248 // 2. before we are woken up and update the meta, the target server crashes, and then we arrive 249 // here 250 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH; 251 boolean succ = false; 252 try { 253 persistAndWake(env, regionNode); 254 succ = true; 255 } finally { 256 if (!succ) { 257 this.state = oldState; 258 } 259 } 260 } 261 262 protected abstract void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode, 263 long seqId) throws IOException; 264 265 void stateLoaded(AssignmentManager am, RegionStateNode regionNode) { 266 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) { 267 try { 268 restoreSucceedState(am, regionNode, seqId); 269 } catch (IOException e) { 270 // should not happen as we are just restoring the state 271 throw new AssertionError(e); 272 } 273 } 274 } 275 276 private TransitRegionStateProcedure getParent(MasterProcedureEnv env) { 277 return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor() 278 .getProcedure(getParentProcId()); 279 } 280 281 private void unattach(MasterProcedureEnv env) { 282 getParent(env).unattachRemoteProc(this); 283 } 284 285 @Override 286 protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) 287 throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { 288 RegionStateNode regionNode = getRegionNode(env); 289 regionNode.lock(); 290 try { 291 switch (state) { 292 case REGION_REMOTE_PROCEDURE_DISPATCH: { 293 // The code which wakes us up also needs to lock the RSN so here we do not need to 294 // synchronize 295 // on the event. 296 ProcedureEvent<?> event = regionNode.getProcedureEvent(); 297 try { 298 env.getRemoteDispatcher().addOperationToNode(targetServer, this); 299 } catch (FailedRemoteDispatchException e) { 300 LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " 301 + "because the server is alread dead, give up and mark the procedure as complete, " 302 + "the parent procedure will take care of this.", this, region, targetServer, e); 303 unattach(env); 304 return null; 305 } 306 event.suspend(); 307 event.suspendIfNotReady(this); 308 throw new ProcedureSuspendedException(); 309 } 310 case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED: 311 env.getAssignmentManager().persistToMeta(regionNode); 312 unattach(env); 313 return null; 314 case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL: 315 // the remote call is failed so we do not need to change the region state, just return. 316 unattach(env); 317 return null; 318 case REGION_REMOTE_PROCEDURE_SERVER_CRASH: 319 env.getAssignmentManager().regionClosedAbnormally(regionNode); 320 unattach(env); 321 return null; 322 default: 323 throw new IllegalStateException("Unknown state: " + state); 324 } 325 } catch (IOException e) { 326 if (retryCounter == null) { 327 retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); 328 } 329 long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); 330 LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e); 331 setTimeout(Math.toIntExact(backoff)); 332 setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); 333 skipPersistence(); 334 throw new ProcedureSuspendedException(); 335 } finally { 336 regionNode.unlock(); 337 } 338 } 339 340 @Override 341 protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { 342 setState(ProcedureProtos.ProcedureState.RUNNABLE); 343 env.getProcedureScheduler().addFront(this); 344 return false; // 'false' means that this procedure handled the timeout 345 } 346 347 @Override 348 public boolean storeInDispatchedQueue() { 349 return false; 350 } 351 352 @Override 353 protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { 354 RegionRemoteProcedureBaseStateData.Builder builder = 355 RegionRemoteProcedureBaseStateData.newBuilder().setRegion(ProtobufUtil.toRegionInfo(region)) 356 .setTargetServer(ProtobufUtil.toServerName(targetServer)).setState(state); 357 if (transitionCode != null) { 358 builder.setTransitionCode(transitionCode); 359 builder.setSeqId(seqId); 360 } 361 serializer.serialize(builder.build()); 362 } 363 364 @Override 365 protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { 366 RegionRemoteProcedureBaseStateData data = 367 serializer.deserialize(RegionRemoteProcedureBaseStateData.class); 368 region = ProtobufUtil.toRegionInfo(data.getRegion()); 369 targetServer = ProtobufUtil.toServerName(data.getTargetServer()); 370 // 'state' may not be present if we are reading an 'old' form of this pb Message. 371 if (data.hasState()) { 372 state = data.getState(); 373 } 374 if (data.hasTransitionCode()) { 375 transitionCode = data.getTransitionCode(); 376 seqId = data.getSeqId(); 377 } 378 } 379 380 @Override 381 protected void afterReplay(MasterProcedureEnv env) { 382 getParent(env).attachRemoteProc(this); 383 } 384 385 @Override 386 public String getProcName() { 387 return getClass().getSimpleName() + " " + region.getEncodedName(); 388 } 389 390 @Override 391 protected void toStringClassDetails(StringBuilder builder) { 392 builder.append(getProcName()); 393 if (targetServer != null) { 394 builder.append(", server="); 395 builder.append(this.targetServer); 396 } 397 if (this.retryCounter != null) { 398 builder.append(", retry="); 399 builder.append(this.retryCounter); 400 } 401 } 402}