/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.favored;

import static org.apache.hadoop.hbase.ServerName.NON_STARTCODE;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.FavoredNodes;

/**
 * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence for racks, meta
 * scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer} when needed (from within calls
 * like {@link FavoredNodeLoadBalancer#randomAssignment(RegionInfo, List)}). All updates to favored
 * nodes should only be done from {@link FavoredNodesManager} and not through this helper class
 * (except for tests).
 */
@InterfaceAudience.Private
public class FavoredNodeAssignmentHelper {
  private static final Logger LOG = LoggerFactory.getLogger(FavoredNodeAssignmentHelper.class);
  private RackManager rackManager;
  private Map<String, List<ServerName>> rackToRegionServerMap;
  private List<String> uniqueRackList;
  // This map serves as a cache for server name to rack lookups. The number of region server
  // entries might not match the number of entries in servers.
  private Map<String, String> regionServerToRackMap;
  private List<ServerName> servers;
  public static final byte[] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
  public final static short FAVORED_NODES_NUM = 3;
  public final static short MAX_ATTEMPTS_FN_GENERATION = 10;

  public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
    this(servers, new RackManager(conf));
  }

  public FavoredNodeAssignmentHelper(final List<ServerName> servers,
    final RackManager rackManager) {
    this.servers = servers;
    this.rackManager = rackManager;
    this.rackToRegionServerMap = new HashMap<>();
    this.regionServerToRackMap = new HashMap<>();
    this.uniqueRackList = new ArrayList<>();
  }

  // Always initialize() when FavoredNodeAssignmentHelper is constructed.
  public void initialize() {
    for (ServerName sn : this.servers) {
      String rackName = getRackOfServer(sn);
      List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
      if (serverList == null) {
        serverList = Lists.newArrayList();
        // Add the current rack to the unique rack list
        this.uniqueRackList.add(rackName);
        this.rackToRegionServerMap.put(rackName, serverList);
      }
      for (ServerName serverName : serverList) {
        if (ServerName.isSameAddress(sn, serverName)) {
          // The server is already present, ignore.
          break;
        }
      }
      serverList.add(sn);
      this.regionServerToRackMap.put(sn.getHostname(), rackName);
    }
  }
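
  // Illustrative usage sketch (not part of the original class, shown only for context): a caller
  // such as the favored node balancer would construct the helper with the live server list and a
  // Configuration, initialize it, and only then ask for favored nodes. The "servers", "conf" and
  // "regionInfo" variables below are assumed to exist in the caller.
  //
  //   FavoredNodeAssignmentHelper helper = new FavoredNodeAssignmentHelper(servers, conf);
  //   helper.initialize();
  //   if (helper.canPlaceFavoredNodes()) {
  //     List<ServerName> favoredNodes = helper.generateFavoredNodes(regionInfo);
  //   }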

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param connection           connection to be used
   */
  public static void updateMetaWithFavoredNodesInfo(
    Map<RegionInfo, List<ServerName>> regionToFavoredNodes, Connection connection)
    throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePut(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    try (Table table = connection.getTable(TableName.META_TABLE_NAME)) {
      table.put(puts);
    }
    LOG.info("Added " + puts.size() + " region favored nodes in META");
  }

  /**
   * Update meta table with favored nodes info
   */
  public static void updateMetaWithFavoredNodesInfo(
    Map<RegionInfo, List<ServerName>> regionToFavoredNodes, Configuration conf) throws IOException {
    // Write the region assignments to the meta table.
    // TODO: The override above takes a Connection rather than a Configuration only when the
    // Connection is a short-circuit connection. That is not going to work in all cases, e.g. when
    // master and meta are not colocated. Fix when this favored nodes feature is actually used
    // someday.
    try (Connection conn = ConnectionFactory.createConnection(conf)) {
      updateMetaWithFavoredNodesInfo(regionToFavoredNodes, conn);
    }
  }

  private static Put makePut(RegionInfo regionInfo, List<ServerName> favoredNodeList)
    throws IOException {
    if (CollectionUtils.isEmpty(favoredNodeList)) {
      return null;
    }
    long time = EnvironmentEdgeManager.currentTime();
    Put put = new Put(MetaTableAccessor.getMetaKeyForRegion(regionInfo), time);
    byte[] favoredNodes = getFavoredNodes(favoredNodeList);
    put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setRow(put.getRow())
      .setFamily(HConstants.CATALOG_FAMILY).setQualifier(FAVOREDNODES_QUALIFIER).setTimestamp(time)
      .setType(Cell.Type.Put).setValue(favoredNodes).build());
    LOG.debug("Create the region {} with favored nodes {}", regionInfo.getRegionNameAsString(),
      favoredNodeList);
    return put;
  }

  /**
   * Convert PB bytes to ServerName.
   * @param favoredNodes The PB'ed bytes of favored nodes
   * @return the array of {@link ServerName} for the byte array of favored nodes.
   */
  public static ServerName[] getFavoredNodesList(byte[] favoredNodes) throws IOException {
    FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
    List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
    ServerName[] servers = new ServerName[protoNodes.size()];
    int i = 0;
    for (HBaseProtos.ServerName node : protoNodes) {
      servers[i++] = ProtobufUtil.toServerName(node);
    }
    return servers;
  }

  /** Returns PB'ed bytes of {@link FavoredNodes} generated by the server list. */
  public static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
    FavoredNodes.Builder f = FavoredNodes.newBuilder();
    for (ServerName s : serverAddrList) {
      HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
      b.setHostName(s.getHostname());
      b.setPort(s.getPort());
      b.setStartCode(ServerName.NON_STARTCODE);
      f.addFavoredNode(b.build());
    }
    return f.build().toByteArray();
  }
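
  // Illustrative only (not part of the original class): getFavoredNodes and getFavoredNodesList
  // are intended to round-trip, so persisting and reading back the "fn" value behaves roughly as
  // sketched below. The "favoredNodeList" variable is an assumed List<ServerName>.
  //
  //   byte[] bytes = FavoredNodeAssignmentHelper.getFavoredNodes(favoredNodeList);
  //   ServerName[] roundTripped = FavoredNodeAssignmentHelper.getFavoredNodesList(bytes);
  //   // Note: start codes are written as ServerName.NON_STARTCODE, so only host:port survives.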

  // Place the regions round-robin across the racks, picking one server from each
  // rack at a time. Start with a random rack, and a random server from every rack.
  // If a rack doesn't have enough servers, it is skipped and we move on to the next rack, and so
  // on, for choosing a primary.
  // For example, with 4 racks (r1 .. r4) of 8 servers (s1..s8) each, one possible
  // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6..
  // If there were fewer servers in one rack, say r3, which had 3 servers, one possible
  // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
  // The regions should be distributed proportionally to the rack sizes.
  public void placePrimaryRSAsRoundRobin(Map<ServerName, List<RegionInfo>> assignmentMap,
    Map<RegionInfo, ServerName> primaryRSMap, List<RegionInfo> regions) {
    List<String> rackList = new ArrayList<>(rackToRegionServerMap.size());
    rackList.addAll(rackToRegionServerMap.keySet());
    int rackIndex = ThreadLocalRandom.current().nextInt(rackList.size());
    int maxRackSize = 0;
    for (Map.Entry<String, List<ServerName>> r : rackToRegionServerMap.entrySet()) {
      if (r.getValue().size() > maxRackSize) {
        maxRackSize = r.getValue().size();
      }
    }
    int numIterations = 0;
    // Initialize the current processing host index.
    int serverIndex = ThreadLocalRandom.current().nextInt(maxRackSize);
    for (RegionInfo regionInfo : regions) {
      List<ServerName> currentServerList;
      String rackName;
      while (true) {
        rackName = rackList.get(rackIndex);
        numIterations++;
        // Get the server list for the current rack
        currentServerList = rackToRegionServerMap.get(rackName);

        if (serverIndex >= currentServerList.size()) { // not enough machines in this rack
          if (numIterations % rackList.size() == 0) {
            if (++serverIndex >= maxRackSize) serverIndex = 0;
          }
          if (++rackIndex >= rackList.size()) {
            rackIndex = 0; // reset the rack index to 0
          }
        } else break;
      }

      // Get the region server currently being processed
      ServerName currentServer = currentServerList.get(serverIndex);

      // Place the current region with the current primary region server
      primaryRSMap.put(regionInfo, currentServer);
      if (assignmentMap != null) {
        List<RegionInfo> regionsForServer = assignmentMap.get(currentServer);
        if (regionsForServer == null) {
          regionsForServer = new ArrayList<>();
          assignmentMap.put(currentServer, regionsForServer);
        }
        regionsForServer.add(regionInfo);
      }

      // Set the next processing index
      if (numIterations % rackList.size() == 0) {
        ++serverIndex;
      }
      if (++rackIndex >= rackList.size()) {
        rackIndex = 0; // reset the rack index to 0
      }
    }
  }
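
  // Illustrative only (not part of the original class): a hypothetical driver for the round-robin
  // primary placement above. "helper" is an initialized FavoredNodeAssignmentHelper and "regions"
  // is the list of regions to place; both are assumed to exist in the caller.
  //
  //   Map<ServerName, List<RegionInfo>> assignmentMap = new HashMap<>();
  //   Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
  //   helper.placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
  //   // primaryRSMap now holds a primary for every region; assignmentMap groups regions by server.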

  public Map<RegionInfo, ServerName[]>
    placeSecondaryAndTertiaryRS(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();
    for (Map.Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Create the secondary and tertiary region server pair object.
        ServerName[] favoredNodes = getSecondaryAndTertiary(regionInfo, primaryRS);
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
            + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region " + regionInfo.getRegionNameAsString()
          + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }

  public ServerName[] getSecondaryAndTertiary(RegionInfo regionInfo, ServerName primaryRS)
    throws IOException {
    // Get the rack for the primary region server
    ServerName[] favoredNodes;
    String primaryRack = getRackOfServer(primaryRS);

    if (getTotalNumberOfRacks() == 1) {
      favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
    } else {
      favoredNodes = multiRackCase(primaryRS, primaryRack);
    }
    return favoredNodes;
  }
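
  // Illustrative only (not part of the original class): when a primary has already been chosen for
  // a single region, the remaining two favored nodes can be derived directly. "helper",
  // "regionInfo" and "chosenPrimary" are assumed to exist in the caller.
  //
  //   ServerName[] secondaryAndTertiary = helper.getSecondaryAndTertiary(regionInfo, chosenPrimary);
  //   // Index 0 is the secondary, index 1 the tertiary; rack spread depends on the cluster layout.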

  private Map<ServerName, Set<RegionInfo>>
    mapRSToPrimaries(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> primaryServerMap = new HashMap<>();
    for (Entry<RegionInfo, ServerName> e : primaryRSMap.entrySet()) {
      Set<RegionInfo> currentSet = primaryServerMap.get(e.getValue());
      if (currentSet == null) {
        currentSet = new HashSet<>();
      }
      currentSet.add(e.getKey());
      primaryServerMap.put(e.getValue(), currentSet);
    }
    return primaryServerMap;
  }

  /**
   * For regions that share the primary, avoid placing the secondary and tertiary on the same RS.
   * Used for generating new assignments for the primary/secondary/tertiary RegionServers
   * @return the map of regions to the servers the region-files should be hosted on
   */
  public Map<RegionInfo, ServerName[]>
    placeSecondaryAndTertiaryWithRestrictions(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> serverToPrimaries = mapRSToPrimaries(primaryRSMap);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Get the rack for the primary region server
        String primaryRack = getRackOfServer(primaryRS);
        ServerName[] favoredNodes = null;
        if (getTotalNumberOfRacks() == 1) {
          // Single rack case: have to pick the secondary and tertiary
          // from the same rack
          favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
        } else {
          favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries, secondaryAndTertiaryMap,
            primaryRack, primaryRS, regionInfo);
        }
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
            + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region " + regionInfo.getRegionNameAsString()
          + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }

  private ServerName[] multiRackCaseWithRestrictions(
    Map<ServerName, Set<RegionInfo>> serverToPrimaries,
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap, String primaryRack, ServerName primaryRS,
    RegionInfo regionInfo) throws IOException {
    // Randomly choose a rack other than the primary rack, then place the secondary and tertiary
    // on servers from that rack.
    Set<String> rackSkipSet = new HashSet<>();
    rackSkipSet.add(primaryRack);
    String secondaryRack = getOneRandomRack(rackSkipSet);
    List<ServerName> serverList = getServersFromRack(secondaryRack);
    Set<ServerName> serverSet = new HashSet<>(serverList);
    ServerName[] favoredNodes;
    if (serverList.size() >= 2) {
      // Randomly pick two servers from this secondary rack
      // Skip the secondary for the tertiary placement
      // Skip the servers which already share the primary
      Set<RegionInfo> primaries = serverToPrimaries.get(primaryRS);
      Set<ServerName> skipServerSet = new HashSet<>();
      while (true) {
        ServerName[] secondaryAndTertiary = null;
        if (primaries.size() > 1) {
          // Check where the other regions' secondaries and tertiaries are
          for (RegionInfo primary : primaries) {
            secondaryAndTertiary = secondaryAndTertiaryMap.get(primary);
            if (secondaryAndTertiary != null) {
              if (getRackOfServer(secondaryAndTertiary[0]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[0]);
              }
              if (getRackOfServer(secondaryAndTertiary[1]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[1]);
              }
            }
          }
        }
        if (skipServerSet.size() + 2 <= serverSet.size()) break;
        skipServerSet.clear();
        rackSkipSet.add(secondaryRack);
        // We have used all racks
        if (rackSkipSet.size() == getTotalNumberOfRacks()) {
          // remove the last two added and break
          skipServerSet.remove(secondaryAndTertiary[0]);
          skipServerSet.remove(secondaryAndTertiary[1]);
          break;
        }
        secondaryRack = getOneRandomRack(rackSkipSet);
        serverList = getServersFromRack(secondaryRack);
        serverSet = new HashSet<>(serverList);
      }

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet);
      skipServerSet.add(secondaryRS);
      // Place the tertiary RS
      ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary and tertiary" + " region server for region "
          + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
    } else {
      // Pick the secondary RS from this secondary rack
      // and pick the tertiary from another random rack
      favoredNodes = new ServerName[2];
      ServerName secondary = getOneRandomServer(secondaryRack);
      favoredNodes[0] = secondary;

      // Pick the tertiary
      if (getTotalNumberOfRacks() == 2) {
        // Pick the tertiary from the same rack as the primary RS
        Set<ServerName> serverSkipSet = new HashSet<>();
        serverSkipSet.add(primaryRS);
        favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
      } else {
        // Pick the tertiary from another rack
        rackSkipSet.add(secondaryRack);
        String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
        favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
      }
    }
    return favoredNodes;
  }

  private ServerName[] singleRackCase(RegionInfo regionInfo, ServerName primaryRS,
    String primaryRack) throws IOException {
    // Single rack case: have to pick the secondary and tertiary
    // from the same rack
    List<ServerName> serverList = getServersFromRack(primaryRack);
    if ((serverList == null) || (serverList.size() <= 2)) {
      // Single region server case: cannot place the favored nodes
      // on any server;
      return null;
    } else {
      // Randomly select two region servers from the server list and make sure
      // they do not overlap with the primary region server;
      Set<ServerName> serverSkipSet = new HashSet<>();
      serverSkipSet.add(primaryRS);

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
      // Skip the secondary for the tertiary placement
      serverSkipSet.add(secondaryRS);
      ServerName tertiaryRS = getOneRandomServer(primaryRack, serverSkipSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary, tertiary favored node for region "
          + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      ServerName[] favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
      return favoredNodes;
    }
  }

  /**
   * Place secondary and tertiary nodes in a multi rack case. If there are only two racks, then we
   * try to place the secondary and tertiary on a different rack than the primary. But if the other
   * rack has only one region server, then we place the primary and tertiary on one rack and the
   * secondary on another. The aim is to distribute the three favored nodes on >= 2 racks. TODO:
   * see how we can use the generateMissingFavoredNodeMultiRack API here.
   * @param primaryRS   The primary favored node.
   * @param primaryRack The rack of the primary favored node.
   * @return Array containing secondary and tertiary favored nodes.
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private ServerName[] multiRackCase(ServerName primaryRS, String primaryRack) throws IOException {
    List<ServerName> favoredNodes = Lists.newArrayList(primaryRS);
    // Create the secondary and tertiary pair
    ServerName secondaryRS = generateMissingFavoredNodeMultiRack(favoredNodes);
    favoredNodes.add(secondaryRS);
    String secondaryRack = getRackOfServer(secondaryRS);

    ServerName tertiaryRS;
    if (primaryRack.equals(secondaryRack)) {
      tertiaryRS = generateMissingFavoredNode(favoredNodes);
    } else {
      // Try to place the tertiary in the secondary RS rack, else place it on the primary rack.
      tertiaryRS = getOneRandomServer(secondaryRack, Sets.newHashSet(secondaryRS));
      if (tertiaryRS == null) {
        tertiaryRS = getOneRandomServer(primaryRack, Sets.newHashSet(primaryRS));
      }
      // We couldn't find anything in the secondary rack, get any FN
      if (tertiaryRS == null) {
        tertiaryRS = generateMissingFavoredNode(Lists.newArrayList(primaryRS, secondaryRS));
      }
    }
    return new ServerName[] { secondaryRS, tertiaryRS };
  }

  public boolean canPlaceFavoredNodes() {
    return (this.servers.size() >= FAVORED_NODES_NUM);
  }

  private int getTotalNumberOfRacks() {
    return this.uniqueRackList.size();
  }

  private List<ServerName> getServersFromRack(String rack) {
    return this.rackToRegionServerMap.get(rack);
  }

  /**
   * Gets a random server from the specified rack and skips anything specified.
   * @param rack          rack from which a server is needed
   * @param skipServerSet the returned server shouldn't belong to this set
   */
  protected ServerName getOneRandomServer(String rack, Set<ServerName> skipServerSet) {
    // Is the rack valid? Do we recognize it?
    if (rack == null || getServersFromRack(rack) == null || getServersFromRack(rack).isEmpty()) {
      return null;
    }

    // Let's use a set so we can eliminate duplicates
    Set<StartcodeAgnosticServerName> serversToChooseFrom = Sets.newHashSet();
    for (ServerName sn : getServersFromRack(rack)) {
      serversToChooseFrom.add(StartcodeAgnosticServerName.valueOf(sn));
    }

    if (skipServerSet != null && skipServerSet.size() > 0) {
      for (ServerName sn : skipServerSet) {
        serversToChooseFrom.remove(StartcodeAgnosticServerName.valueOf(sn));
      }
      // Do we have any servers left to choose from?
      if (serversToChooseFrom.isEmpty()) {
        return null;
      }
    }

    ServerName randomServer = null;
    int randomIndex = ThreadLocalRandom.current().nextInt(serversToChooseFrom.size());
    int j = 0;
    for (StartcodeAgnosticServerName sn : serversToChooseFrom) {
      if (j == randomIndex) {
        randomServer = sn;
        break;
      }
      j++;
    }

    if (randomServer != null) {
      return ServerName.valueOf(randomServer.getAddress(), randomServer.getStartcode());
    } else {
      return null;
    }
  }

  private ServerName getOneRandomServer(String rack) throws IOException {
    return this.getOneRandomServer(rack, null);
  }

  String getOneRandomRack(Set<String> skipRackSet) throws IOException {
    if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
      throw new IOException("Cannot randomly pick another random server");
    }

    String randomRack;
    do {
      int randomIndex = ThreadLocalRandom.current().nextInt(this.uniqueRackList.size());
      randomRack = this.uniqueRackList.get(randomIndex);
    } while (skipRackSet.contains(randomRack));

    return randomRack;
  }

  public static String getFavoredNodesAsString(List<ServerName> nodes) {
    StringBuilder strBuf = new StringBuilder();
    int i = 0;
    for (ServerName node : nodes) {
      strBuf.append(node.getAddress());
      if (++i != nodes.size()) strBuf.append(";");
    }
    return strBuf.toString();
  }
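
  // Illustrative only (not part of the original class): getFavoredNodesAsString joins the
  // host:port addresses with ";". Three favored nodes on hypothetical hosts that use the default
  // region server port would render roughly as:
  //
  //   "host1:16020;host2:16020;host3:16020"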

  /*
   * Generates a missing favored node based on the input favored nodes. This helps to generate a
   * new FN when there are already 2 FN and we need a third one, for example while generating new
   * FN for split daughters after inheriting 2 FN from the parent. If the cluster has only one rack
   * it generates from the same rack. If the cluster has multiple racks, then it ensures the new FN
   * respects the rack constraints similar to HDFS. For example: if there are 3 FN, they will be
   * spread across 2 racks.
   */
  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, null);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, null);
    }
  }

  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, excludeNodes);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, excludeNodes);
    }
  }
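
  // Illustrative only (not part of the original class): the split-daughter scenario described
  // above, sketched with assumed variables. "helper" and "parentFn" (the parent region's favored
  // nodes) are assumed to exist in the caller; the helper fills in a rack-aware third node.
  //
  //   List<ServerName> inheritedFn = Lists.newArrayList(parentFn.get(0), parentFn.get(1));
  //   ServerName thirdFn = helper.generateMissingFavoredNode(inheritedFn);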

  /*
   * Generate an FN for a single rack scenario, making sure not to generate it from one of the
   * excluded nodes. Helps when we would like to find a replacement node.
   */
  private ServerName generateMissingFavoredNodeSingleRack(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    ServerName newServer = null;
    Set<ServerName> excludeFNSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      excludeFNSet.addAll(excludeNodes);
    }
    if (favoredNodes.size() < FAVORED_NODES_NUM) {
      newServer = this.getOneRandomServer(this.uniqueRackList.get(0), excludeFNSet);
    }
    return newServer;
  }

  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes)
    throws IOException {
    return generateMissingFavoredNodeMultiRack(favoredNodes, null);
  }

  /*
   * Generates a missing FN based on the input favoredNodes and also the nodes to be skipped. Get
   * the current layout of favored nodes arrangement and nodes to be excluded, and get a random
   * node that goes with HDFS block placement. E.g. if the existing nodes are on one rack, generate
   * one from another rack. We exclude as much as possible so the random selection has more chance
   * to generate a node within a few iterations, ideally 1.
   */
  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    Set<String> racks = Sets.newHashSet();
    Map<String, Set<ServerName>> rackToFNMapping = new HashMap<>();

    // Let's understand the current rack distribution of the FN
    for (ServerName sn : favoredNodes) {
      String rack = getRackOfServer(sn);
      racks.add(rack);

      Set<ServerName> serversInRack = rackToFNMapping.get(rack);
      if (serversInRack == null) {
        serversInRack = Sets.newHashSet();
        rackToFNMapping.put(rack, serversInRack);
      }
      serversInRack.add(sn);
    }

    // What racks should be skipped while getting a FN?
    Set<String> skipRackSet = Sets.newHashSet();

    /*
     * If both the FN are from the same rack, then we don't want to generate another FN on the same
     * rack. If that rack fails, the region would be unavailable.
     */
    if (racks.size() == 1 && favoredNodes.size() > 1) {
      skipRackSet.add(racks.iterator().next());
    }

    /*
     * If there are no free nodes on the existing racks, we should skip those racks too. We can
     * reduce the number of iterations for FN selection.
     */
    for (String rack : racks) {
      if (
        getServersFromRack(rack) != null
          && rackToFNMapping.get(rack).size() == getServersFromRack(rack).size()
      ) {
        skipRackSet.add(rack);
      }
    }

    Set<ServerName> favoredNodeSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      favoredNodeSet.addAll(excludeNodes);
    }

    /*
     * Let's get a random rack by excluding skipRackSet and generate a random FN from that rack.
     */
    int i = 0;
    Set<String> randomRacks = Sets.newHashSet();
    ServerName newServer = null;
    do {
      String randomRack = this.getOneRandomRack(skipRackSet);
      newServer = this.getOneRandomServer(randomRack, favoredNodeSet);
      randomRacks.add(randomRack);
      i++;
    } while ((i < MAX_ATTEMPTS_FN_GENERATION) && (newServer == null));

    if (newServer == null) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(String.format(
          "Unable to generate additional favored nodes for %s after "
            + "considering racks %s and skip rack %s with a unique rack list of %s and rack "
            + "to RS map of %s and RS to rack map of %s",
          StringUtils.join(favoredNodes, ","), randomRacks, skipRackSet, uniqueRackList,
          rackToRegionServerMap, regionServerToRackMap));
      }
      throw new IOException(
        " Unable to generate additional favored nodes for " + StringUtils.join(favoredNodes, ","));
    }
    return newServer;
  }

  /*
   * Generate favored nodes for a region. Choose a random server as the primary and then choose
   * the secondary and tertiary FN so they are spread across two racks.
   */
  public List<ServerName> generateFavoredNodes(RegionInfo hri) throws IOException {
    List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
    ServerName primary = servers.get(ThreadLocalRandom.current().nextInt(servers.size()));
    favoredNodesForRegion.add(ServerName.valueOf(primary.getAddress(), ServerName.NON_STARTCODE));

    Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(1);
    primaryRSMap.put(hri, primary);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
      placeSecondaryAndTertiaryRS(primaryRSMap);
    ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(hri);
    if (secondaryAndTertiaryNodes != null && secondaryAndTertiaryNodes.length == 2) {
      for (ServerName sn : secondaryAndTertiaryNodes) {
        favoredNodesForRegion.add(ServerName.valueOf(sn.getAddress(), ServerName.NON_STARTCODE));
      }
      return favoredNodesForRegion;
    } else {
      throw new HBaseIOException("Unable to generate secondary and tertiary favored nodes.");
    }
  }

  public Map<RegionInfo, List<ServerName>> generateFavoredNodesRoundRobin(
    Map<ServerName, List<RegionInfo>> assignmentMap, List<RegionInfo> regions) throws IOException {
    if (regions.size() > 0) {
      if (canPlaceFavoredNodes()) {
        Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
        // Let's try to have an equal distribution for primary favored nodes
        placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
        return generateFavoredNodes(primaryRSMap);
      } else {
        throw new HBaseIOException("Not enough nodes to generate favored nodes");
      }
    }
    return null;
  }
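
  // Illustrative only (not part of the original class): generating favored nodes for a batch of
  // regions in one call. "helper", "regions" and "connection" are assumed to exist in the caller;
  // the returned map can then be persisted with updateMetaWithFavoredNodesInfo.
  //
  //   Map<ServerName, List<RegionInfo>> assignmentMap = new HashMap<>();
  //   Map<RegionInfo, List<ServerName>> regionFNMap =
  //     helper.generateFavoredNodesRoundRobin(assignmentMap, regions);
  //   FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionFNMap, connection);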

  /*
   * Generate favored nodes for a set of regions when we know where they are currently hosted.
   */
  private Map<RegionInfo, List<ServerName>>
    generateFavoredNodes(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, List<ServerName>> generatedFavNodes = new HashMap<>();
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
      placeSecondaryAndTertiaryRS(primaryRSMap);

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
      RegionInfo region = entry.getKey();
      ServerName primarySN = entry.getValue();
      favoredNodesForRegion
        .add(ServerName.valueOf(primarySN.getHostname(), primarySN.getPort(), NON_STARTCODE));
      ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(region);
      if (secondaryAndTertiaryNodes != null) {
        favoredNodesForRegion.add(ServerName.valueOf(secondaryAndTertiaryNodes[0].getHostname(),
          secondaryAndTertiaryNodes[0].getPort(), NON_STARTCODE));
        favoredNodesForRegion.add(ServerName.valueOf(secondaryAndTertiaryNodes[1].getHostname(),
          secondaryAndTertiaryNodes[1].getPort(), NON_STARTCODE));
      }
      generatedFavNodes.put(region, favoredNodesForRegion);
    }
    return generatedFavNodes;
  }

  /*
   * Get the rack of a server from the local mapping when present; this saves a lookup by the
   * RackManager.
   */
  private String getRackOfServer(ServerName sn) {
    if (this.regionServerToRackMap.containsKey(sn.getHostname())) {
      return this.regionServerToRackMap.get(sn.getHostname());
    } else {
      String rack = this.rackManager.getRack(sn);
      this.regionServerToRackMap.put(sn.getHostname(), rack);
      return rack;
    }
  }
}