/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.favored;

import static org.apache.hadoop.hbase.ServerName.NON_STARTCODE;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.FavoredNodes;

/**
 * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence for racks, meta
 * scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer} when needed (from within calls
 * like {@link FavoredNodeLoadBalancer#randomAssignment(RegionInfo, List)}). All updates to favored
 * nodes should only be done from {@link FavoredNodesManager} and not through this helper class
 * (except for tests).
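 * <p>
 * A minimal usage sketch ({@code servers}, {@code conf} and {@code regionInfo} are assumed to be
 * available from the caller; error handling omitted):
 *
 * <pre>
 * FavoredNodeAssignmentHelper helper = new FavoredNodeAssignmentHelper(servers, conf);
 * helper.initialize();
 * if (helper.canPlaceFavoredNodes()) {
 *   List&lt;ServerName&gt; favoredNodes = helper.generateFavoredNodes(regionInfo);
 * }
 * </pre>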
 */
@InterfaceAudience.Private
public class FavoredNodeAssignmentHelper {
  private static final Logger LOG = LoggerFactory.getLogger(FavoredNodeAssignmentHelper.class);
  private RackManager rackManager;
  private Map<String, List<ServerName>> rackToRegionServerMap;
  private List<String> uniqueRackList;
  // This map serves as a cache for server name to rack lookups. The number of
  // region server entries might not match what is in servers.
  private Map<String, String> regionServerToRackMap;
  private List<ServerName> servers;
  public static final byte[] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
  public final static short FAVORED_NODES_NUM = 3;
  public final static short MAX_ATTEMPTS_FN_GENERATION = 10;

  public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
    this(servers, new RackManager(conf));
  }

  public FavoredNodeAssignmentHelper(final List<ServerName> servers,
    final RackManager rackManager) {
    this.servers = servers;
    this.rackManager = rackManager;
    this.rackToRegionServerMap = new HashMap<>();
    this.regionServerToRackMap = new HashMap<>();
    this.uniqueRackList = new ArrayList<>();
  }

  // Always call initialize() after a FavoredNodeAssignmentHelper is constructed.
  public void initialize() {
    for (ServerName sn : this.servers) {
      String rackName = getRackOfServer(sn);
      List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
      if (serverList == null) {
        serverList = Lists.newArrayList();
        // Add the current rack to the unique rack list
        this.uniqueRackList.add(rackName);
        this.rackToRegionServerMap.put(rackName, serverList);
      }
      boolean alreadyPresent = false;
      for (ServerName serverName : serverList) {
        if (ServerName.isSameAddress(sn, serverName)) {
          // The server address is already present, skip adding it again.
          alreadyPresent = true;
          break;
        }
      }
      if (!alreadyPresent) {
        serverList.add(sn);
      }
      this.regionServerToRackMap.put(sn.getHostname(), rackName);
    }
  }

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param connection           connection to be used
   */
  public static void updateMetaWithFavoredNodesInfo(
    Map<RegionInfo, List<ServerName>> regionToFavoredNodes, Connection connection)
    throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePut(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    try (Table table = connection.getTable(TableName.META_TABLE_NAME)) {
      table.put(puts);
    }
    LOG.info("Added " + puts.size() + " region favored nodes in META");
  }

  /**
   * Update meta table with favored nodes info
   */
  public static void updateMetaWithFavoredNodesInfo(
    Map<RegionInfo, List<ServerName>> regionToFavoredNodes, Configuration conf) throws IOException {
    // Write the region assignments to the meta table.
    // TODO: See how the overload above takes a Connection rather than a Configuration; there the
    // Connection is a short-circuit connection. That is not going to be good in all cases, e.g.
    // when master and meta are not colocated. Fix when this favored nodes feature is actually
    // used someday.
    try (Connection conn = ConnectionFactory.createConnection(conf)) {
      updateMetaWithFavoredNodesInfo(regionToFavoredNodes, conn);
    }
  }

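  /**
   * Builds a Put against hbase:meta that records the favored nodes of the given region, or
   * returns null when the favored node list is empty.
   */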
  private static Put makePut(RegionInfo regionInfo, List<ServerName> favoredNodeList)
    throws IOException {
    if (CollectionUtils.isEmpty(favoredNodeList)) {
      return null;
    }
    long time = EnvironmentEdgeManager.currentTime();
    Put put = new Put(MetaTableAccessor.getMetaKeyForRegion(regionInfo), time);
    byte[] favoredNodes = getFavoredNodes(favoredNodeList);
    put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY).setRow(put.getRow())
      .setFamily(HConstants.CATALOG_FAMILY).setQualifier(FAVOREDNODES_QUALIFIER).setTimestamp(time)
      .setType(Cell.Type.Put).setValue(favoredNodes).build());
    LOG.debug("Create the region {} with favored nodes {}", regionInfo.getRegionNameAsString(),
      favoredNodeList);
    return put;
  }

  /**
   * Convert PB bytes to ServerName.
   * @param favoredNodes The PB'ed bytes of favored nodes
   * @return the array of {@link ServerName} for the byte array of favored nodes.
   */
  public static ServerName[] getFavoredNodesList(byte[] favoredNodes) throws IOException {
    FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
    List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
    ServerName[] servers = new ServerName[protoNodes.size()];
    int i = 0;
    for (HBaseProtos.ServerName node : protoNodes) {
      servers[i++] = ProtobufUtil.toServerName(node);
    }
    return servers;
  }

  /** Returns PB'ed bytes of {@link FavoredNodes} generated by the server list. */
  public static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
    FavoredNodes.Builder f = FavoredNodes.newBuilder();
    for (ServerName s : serverAddrList) {
      HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
      b.setHostName(s.getHostname());
      b.setPort(s.getPort());
      b.setStartCode(ServerName.NON_STARTCODE);
      f.addFavoredNode(b.build());
    }
    return f.build().toByteArray();
  }

  // Place the regions round-robin across the racks, picking one server from each
  // rack at a time. Start with a random rack, and a random server from every rack.
  // If a rack doesn't have enough servers, it will move on to the next rack and so on
  // for choosing a primary.
  // For example, if 4 racks (r1 .. r4) with 8 servers (s1..s8) each, one possible
  // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6..
  // If there were fewer servers in one rack, say r3, which had 3 servers, one possible
  // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
  // The regions should be distributed proportionately to the rack sizes.
  public void placePrimaryRSAsRoundRobin(Map<ServerName, List<RegionInfo>> assignmentMap,
    Map<RegionInfo, ServerName> primaryRSMap, List<RegionInfo> regions) {
    List<String> rackList = new ArrayList<>(rackToRegionServerMap.size());
    rackList.addAll(rackToRegionServerMap.keySet());
    int rackIndex = ThreadLocalRandom.current().nextInt(rackList.size());
    int maxRackSize = 0;
    for (Map.Entry<String, List<ServerName>> r : rackToRegionServerMap.entrySet()) {
      if (r.getValue().size() > maxRackSize) {
        maxRackSize = r.getValue().size();
      }
    }
    int numIterations = 0;
    // Initialize the current processing host index.
    int serverIndex = ThreadLocalRandom.current().nextInt(maxRackSize);
    for (RegionInfo regionInfo : regions) {
      List<ServerName> currentServerList;
      String rackName;
      while (true) {
        rackName = rackList.get(rackIndex);
        numIterations++;
        // Get the server list for the current rack
        currentServerList = rackToRegionServerMap.get(rackName);

        if (serverIndex >= currentServerList.size()) { // not enough machines in this rack
          if (numIterations % rackList.size() == 0) {
            if (++serverIndex >= maxRackSize) serverIndex = 0;
          }
          if (++rackIndex >= rackList.size()) {
            rackIndex = 0; // reset the rack index to 0
          }
        } else break;
      }

      // Get the region server currently being processed
      ServerName currentServer = currentServerList.get(serverIndex);

      // Place the current region with the current primary region server
      primaryRSMap.put(regionInfo, currentServer);
      if (assignmentMap != null) {
        List<RegionInfo> regionsForServer = assignmentMap.get(currentServer);
        if (regionsForServer == null) {
          regionsForServer = new ArrayList<>();
          assignmentMap.put(currentServer, regionsForServer);
        }
        regionsForServer.add(regionInfo);
      }

      // Set the next processing index
      if (numIterations % rackList.size() == 0) {
        ++serverIndex;
      }
      if (++rackIndex >= rackList.size()) {
        rackIndex = 0; // reset the rack index to 0
      }
    }
  }

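  /**
   * For each region in the given map, picks a secondary and tertiary region server to go with the
   * already chosen primary. Regions whose favored nodes cannot be placed are logged and left out
   * of the returned map.
   * @param primaryRSMap map of regions to their chosen primary region server
   * @return map of regions to an array of {secondary, tertiary} region servers
   */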
  public Map<RegionInfo, ServerName[]>
    placeSecondaryAndTertiaryRS(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();
    for (Map.Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Create the secondary and tertiary region server pair object.
        ServerName[] favoredNodes = getSecondaryAndTertiary(regionInfo, primaryRS);
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
            + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region " + regionInfo.getRegionNameAsString()
          + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }

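  /**
   * Picks the secondary and tertiary region servers for the given region and its primary, using
   * the single rack or multi rack strategy depending on how many racks are known.
   * @return an array of {secondary, tertiary}, or null when no placement is possible
   */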
  public ServerName[] getSecondaryAndTertiary(RegionInfo regionInfo, ServerName primaryRS)
    throws IOException {
    // Get the rack for the primary region server
    String primaryRack = getRackOfServer(primaryRS);

    ServerName[] favoredNodes;
    if (getTotalNumberOfRacks() == 1) {
      favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
    } else {
      favoredNodes = multiRackCase(primaryRS, primaryRack);
    }
    return favoredNodes;
  }

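  /**
   * Inverts the region-to-primary map: groups the regions by the region server chosen as their
   * primary.
   */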
  private Map<ServerName, Set<RegionInfo>>
    mapRSToPrimaries(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> primaryServerMap = new HashMap<>();
    for (Entry<RegionInfo, ServerName> e : primaryRSMap.entrySet()) {
      Set<RegionInfo> currentSet = primaryServerMap.get(e.getValue());
      if (currentSet == null) {
        currentSet = new HashSet<>();
      }
      currentSet.add(e.getKey());
      primaryServerMap.put(e.getValue(), currentSet);
    }
    return primaryServerMap;
  }

  /**
   * For regions that share the primary, avoid placing the secondary and tertiary on the same RS.
   * Used for generating new assignments for the primary/secondary/tertiary RegionServers.
   * @return the map of regions to the servers the region-files should be hosted on
   */
  public Map<RegionInfo, ServerName[]>
    placeSecondaryAndTertiaryWithRestrictions(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> serverToPrimaries = mapRSToPrimaries(primaryRSMap);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Get the rack for the primary region server
        String primaryRack = getRackOfServer(primaryRS);
        ServerName[] favoredNodes = null;
        if (getTotalNumberOfRacks() == 1) {
          // Single rack case: have to pick the secondary and tertiary
          // from the same rack
          favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
        } else {
          favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries, secondaryAndTertiaryMap,
            primaryRack, primaryRS, regionInfo);
        }
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
            + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region " + regionInfo.getRegionNameAsString()
          + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }

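  /**
   * Multi rack case with restrictions: picks the secondary and tertiary from a rack other than
   * the primary's, while trying to avoid servers that already host the secondary or tertiary of
   * another region sharing the same primary.
   */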
  private ServerName[] multiRackCaseWithRestrictions(
    Map<ServerName, Set<RegionInfo>> serverToPrimaries,
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap, String primaryRack, ServerName primaryRS,
    RegionInfo regionInfo) throws IOException {
    // Randomly choose the secondary and tertiary region servers
    // from another rack to place the secondary and tertiary.
    // Randomly choose one rack other than the primary rack.
    Set<String> rackSkipSet = new HashSet<>();
    rackSkipSet.add(primaryRack);
    String secondaryRack = getOneRandomRack(rackSkipSet);
    List<ServerName> serverList = getServersFromRack(secondaryRack);
    Set<ServerName> serverSet = new HashSet<>(serverList);
    ServerName[] favoredNodes;
    if (serverList.size() >= 2) {
      // Randomly pick up two servers from this secondary rack
      // Skip the secondary for the tertiary placement
      // Skip the servers that already hold secondaries/tertiaries of regions sharing this primary
      Set<RegionInfo> primaries = serverToPrimaries.get(primaryRS);
      Set<ServerName> skipServerSet = new HashSet<>();
      while (true) {
        ServerName[] secondaryAndTertiary = null;
        if (primaries.size() > 1) {
          // Check where the secondaries and tertiaries of the regions sharing this primary are
          for (RegionInfo primary : primaries) {
            secondaryAndTertiary = secondaryAndTertiaryMap.get(primary);
            if (secondaryAndTertiary != null) {
              if (getRackOfServer(secondaryAndTertiary[0]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[0]);
              }
              if (getRackOfServer(secondaryAndTertiary[1]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[1]);
              }
            }
          }
        }
        if (skipServerSet.size() + 2 <= serverSet.size()) break;
        skipServerSet.clear();
        rackSkipSet.add(secondaryRack);
        // we used all racks
        if (rackSkipSet.size() == getTotalNumberOfRacks()) {
          // remove the last two added and break
          skipServerSet.remove(secondaryAndTertiary[0]);
          skipServerSet.remove(secondaryAndTertiary[1]);
          break;
        }
        secondaryRack = getOneRandomRack(rackSkipSet);
        serverList = getServersFromRack(secondaryRack);
        serverSet = new HashSet<>(serverList);
      }

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet);
      skipServerSet.add(secondaryRS);
      // Place the tertiary RS
      ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary and tertiary" + " region server for region "
          + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
    } else {
      // Pick the secondary rs from this secondary rack
      // and pick the tertiary from another random rack
      favoredNodes = new ServerName[2];
      ServerName secondary = getOneRandomServer(secondaryRack);
      favoredNodes[0] = secondary;

      // Pick the tertiary
      if (getTotalNumberOfRacks() == 2) {
        // Pick the tertiary from the same rack of the primary RS
        Set<ServerName> serverSkipSet = new HashSet<>();
        serverSkipSet.add(primaryRS);
        favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
      } else {
        // Pick the tertiary from another rack
        rackSkipSet.add(secondaryRack);
        String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
        favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
      }
    }
    return favoredNodes;
  }

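  /**
   * Single rack case: picks the secondary and tertiary from the primary's rack while avoiding the
   * primary itself. Returns null when the rack has two or fewer servers.
   */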
  private ServerName[] singleRackCase(RegionInfo regionInfo, ServerName primaryRS,
    String primaryRack) throws IOException {
    // Single rack case: have to pick the secondary and tertiary
    // from the same rack
    List<ServerName> serverList = getServersFromRack(primaryRack);
    if ((serverList == null) || (serverList.size() <= 2)) {
      // Single region server case: cannot place the favored nodes
      // on any server;
      return null;
    } else {
      // Randomly select two region servers from the server list and make sure
      // they do not overlap with the primary region server;
      Set<ServerName> serverSkipSet = new HashSet<>();
      serverSkipSet.add(primaryRS);

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
      // Skip the secondary for the tertiary placement
      serverSkipSet.add(secondaryRS);
      ServerName tertiaryRS = getOneRandomServer(primaryRack, serverSkipSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary, tertiary favored node for region "
          + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      ServerName[] favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
      return favoredNodes;
    }
  }

  /**
   * Place secondary and tertiary nodes in a multi rack case. If there are only two racks, then we
   * try to place the secondary and tertiary on a different rack than the primary. But if the
   * other rack has only one region server, then we place primary and tertiary on one rack and
   * secondary on another. The aim is to distribute the three favored nodes on >= 2 racks.
   * TODO: see how we can use generateMissingFavoredNodeMultiRack API here
   * @param primaryRS   The primary favored node.
   * @param primaryRack The rack of the primary favored node.
   * @return Array containing secondary and tertiary favored nodes.
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private ServerName[] multiRackCase(ServerName primaryRS, String primaryRack) throws IOException {
    List<ServerName> favoredNodes = Lists.newArrayList(primaryRS);
    // Create the secondary and tertiary pair
    ServerName secondaryRS = generateMissingFavoredNodeMultiRack(favoredNodes);
    favoredNodes.add(secondaryRS);
    String secondaryRack = getRackOfServer(secondaryRS);

    ServerName tertiaryRS;
    if (primaryRack.equals(secondaryRack)) {
      tertiaryRS = generateMissingFavoredNode(favoredNodes);
    } else {
      // Try to place tertiary in secondary RS rack else place on primary rack.
      tertiaryRS = getOneRandomServer(secondaryRack, Sets.newHashSet(secondaryRS));
      if (tertiaryRS == null) {
        tertiaryRS = getOneRandomServer(primaryRack, Sets.newHashSet(primaryRS));
      }
      // We couldn't find anything in secondary rack, get any FN
      if (tertiaryRS == null) {
        tertiaryRS = generateMissingFavoredNode(Lists.newArrayList(primaryRS, secondaryRS));
      }
    }
    return new ServerName[] { secondaryRS, tertiaryRS };
  }

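  /**
   * Returns true when there are at least {@link #FAVORED_NODES_NUM} servers, i.e. enough servers
   * to pick three distinct favored nodes from.
   */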
  public boolean canPlaceFavoredNodes() {
    return (this.servers.size() >= FAVORED_NODES_NUM);
  }

  private int getTotalNumberOfRacks() {
    return this.uniqueRackList.size();
  }

  private List<ServerName> getServersFromRack(String rack) {
    return this.rackToRegionServerMap.get(rack);
  }

  /**
   * Gets a random server from the specified rack, skipping any server in the skip set.
   * @param rack          rack from which a server is needed
   * @param skipServerSet servers that should not be chosen
   */
  protected ServerName getOneRandomServer(String rack, Set<ServerName> skipServerSet) {
    // Is the rack valid? Do we recognize it?
    if (rack == null || getServersFromRack(rack) == null || getServersFromRack(rack).isEmpty()) {
      return null;
    }

    // Let's use a set so we can eliminate duplicates
    Set<StartcodeAgnosticServerName> serversToChooseFrom = Sets.newHashSet();
    for (ServerName sn : getServersFromRack(rack)) {
      serversToChooseFrom.add(StartcodeAgnosticServerName.valueOf(sn));
    }

    if (skipServerSet != null && skipServerSet.size() > 0) {
      for (ServerName sn : skipServerSet) {
        serversToChooseFrom.remove(StartcodeAgnosticServerName.valueOf(sn));
      }
      // Do we have any servers left to choose from?
      if (serversToChooseFrom.isEmpty()) {
        return null;
      }
    }

    ServerName randomServer = null;
    int randomIndex = ThreadLocalRandom.current().nextInt(serversToChooseFrom.size());
    int j = 0;
    for (StartcodeAgnosticServerName sn : serversToChooseFrom) {
      if (j == randomIndex) {
        randomServer = sn;
        break;
      }
      j++;
    }

    if (randomServer != null) {
      return ServerName.valueOf(randomServer.getAddress(), randomServer.getStartcode());
    } else {
      return null;
    }
  }

  private ServerName getOneRandomServer(String rack) throws IOException {
    return this.getOneRandomServer(rack, null);
  }

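  /**
   * Gets a random rack that is not in the given skip set.
   * @param skipRackSet racks that should not be chosen
   * @throws IOException when every known rack is in the skip set
   */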
  String getOneRandomRack(Set<String> skipRackSet) throws IOException {
    if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
      throw new IOException("Cannot randomly pick another random rack");
    }

    String randomRack;
    do {
      int randomIndex = ThreadLocalRandom.current().nextInt(this.uniqueRackList.size());
      randomRack = this.uniqueRackList.get(randomIndex);
    } while (skipRackSet.contains(randomRack));

    return randomRack;
  }

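  /** Returns the favored nodes as a semicolon-separated string of host:port addresses. */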
  public static String getFavoredNodesAsString(List<ServerName> nodes) {
    StringBuilder strBuf = new StringBuilder();
    int i = 0;
    for (ServerName node : nodes) {
      strBuf.append(node.getAddress());
      if (++i != nodes.size()) strBuf.append(";");
    }
    return strBuf.toString();
  }

  /*
   * Generates a missing favored node based on the input favored nodes. This helps to generate a
   * new FN when there are already 2 FN and we need a third one. For example, while generating a
   * new FN for split daughters after inheriting 2 FN from the parent. If the cluster has only one
   * rack it generates from the same rack. If the cluster has multiple racks, then it ensures the
   * new FN respects the rack constraints similar to HDFS. For example: if there are 3 FN, they
   * will be spread across 2 racks.
   */
  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, null);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, null);
    }
  }

  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, excludeNodes);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, excludeNodes);
    }
  }

  /*
   * Generates a FN for the single rack scenario, without picking one of the excluded nodes. Helps
   * when we would like to find a replacement node.
   */
  private ServerName generateMissingFavoredNodeSingleRack(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    ServerName newServer = null;
    Set<ServerName> excludeFNSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      excludeFNSet.addAll(excludeNodes);
    }
    if (favoredNodes.size() < FAVORED_NODES_NUM) {
      newServer = this.getOneRandomServer(this.uniqueRackList.get(0), excludeFNSet);
    }
    return newServer;
  }

  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes)
    throws IOException {
    return generateMissingFavoredNodeMultiRack(favoredNodes, null);
  }

  /*
   * Generates a missing FN based on the input favoredNodes and the nodes to be skipped. Looks at
   * the current layout of the favored nodes and the nodes to be excluded, and picks a random node
   * that is consistent with HDFS block placement. E.g. if the existing nodes are on one rack,
   * generate one from another rack. We exclude as much as possible so the random selection has
   * more chance to generate a node within a few iterations, ideally 1.
   */
  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes,
    List<ServerName> excludeNodes) throws IOException {
    Set<String> racks = Sets.newHashSet();
    Map<String, Set<ServerName>> rackToFNMapping = new HashMap<>();

    // Let's understand the current rack distribution of the FN
    for (ServerName sn : favoredNodes) {
      String rack = getRackOfServer(sn);
      racks.add(rack);

      Set<ServerName> serversInRack = rackToFNMapping.get(rack);
      if (serversInRack == null) {
        serversInRack = Sets.newHashSet();
        rackToFNMapping.put(rack, serversInRack);
      }
      serversInRack.add(sn);
    }

    // What racks should be skipped while getting a FN?
    Set<String> skipRackSet = Sets.newHashSet();

    /*
     * If both the FN are from the same rack, then we don't want to generate another FN on the same
     * rack. If that rack fails, the region would be unavailable.
     */
    if (racks.size() == 1 && favoredNodes.size() > 1) {
      skipRackSet.add(racks.iterator().next());
    }

    /*
     * If there are no free nodes on the existing racks, we should skip those racks too. We can
     * reduce the number of iterations for FN selection.
     */
    for (String rack : racks) {
      if (
        getServersFromRack(rack) != null
          && rackToFNMapping.get(rack).size() == getServersFromRack(rack).size()
      ) {
        skipRackSet.add(rack);
      }
    }

    Set<ServerName> favoredNodeSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      favoredNodeSet.addAll(excludeNodes);
    }

    /*
     * Let's get a random rack by excluding skipRackSet and generate a random FN from that rack.
     */
    int i = 0;
    Set<String> randomRacks = Sets.newHashSet();
    ServerName newServer = null;
    do {
      String randomRack = this.getOneRandomRack(skipRackSet);
      newServer = this.getOneRandomServer(randomRack, favoredNodeSet);
      randomRacks.add(randomRack);
      i++;
    } while ((i < MAX_ATTEMPTS_FN_GENERATION) && (newServer == null));

    if (newServer == null) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(String.format(
          "Unable to generate additional favored nodes for %s after "
            + "considering racks %s and skip rack %s with a unique rack list of %s and rack "
            + "to RS map of %s and RS to rack map of %s",
          StringUtils.join(favoredNodes, ","), randomRacks, skipRackSet, uniqueRackList,
          rackToRegionServerMap, regionServerToRackMap));
      }
      throw new IOException(
        "Unable to generate additional favored nodes for " + StringUtils.join(favoredNodes, ","));
    }
    return newServer;
  }

  /*
   * Generates favored nodes for a region. Chooses a random server as primary and then chooses the
   * secondary and tertiary FN so they are spread across two racks.
   */
  public List<ServerName> generateFavoredNodes(RegionInfo hri) throws IOException {
    List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
    ServerName primary = servers.get(ThreadLocalRandom.current().nextInt(servers.size()));
    favoredNodesForRegion.add(ServerName.valueOf(primary.getAddress(), ServerName.NON_STARTCODE));

    Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(1);
    primaryRSMap.put(hri, primary);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
      placeSecondaryAndTertiaryRS(primaryRSMap);
    ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(hri);
    if (secondaryAndTertiaryNodes != null && secondaryAndTertiaryNodes.length == 2) {
      for (ServerName sn : secondaryAndTertiaryNodes) {
        favoredNodesForRegion.add(ServerName.valueOf(sn.getAddress(), ServerName.NON_STARTCODE));
      }
      return favoredNodesForRegion;
    } else {
      throw new HBaseIOException("Unable to generate secondary and tertiary favored nodes.");
    }
  }

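  /**
   * Generates favored nodes for the given regions, spreading the primaries round-robin across the
   * racks via placePrimaryRSAsRoundRobin. Returns null when the region list is empty.
   * @param assignmentMap map that, when non-null, is filled with the chosen primary assignments
   * @param regions       the regions that need favored nodes
   */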
  public Map<RegionInfo, List<ServerName>> generateFavoredNodesRoundRobin(
    Map<ServerName, List<RegionInfo>> assignmentMap, List<RegionInfo> regions) throws IOException {
    if (regions.size() > 0) {
      if (canPlaceFavoredNodes()) {
        Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
        // Let's try to have an equal distribution for the primary favored nodes
        placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
        return generateFavoredNodes(primaryRSMap);
      } else {
        throw new HBaseIOException("Not enough nodes to generate favored nodes");
      }
    }
    return null;
  }

  /*
   * Generate favored nodes for a set of regions when we know where they are currently hosted.
   */
  private Map<RegionInfo, List<ServerName>>
    generateFavoredNodes(Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, List<ServerName>> generatedFavNodes = new HashMap<>();
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
      placeSecondaryAndTertiaryRS(primaryRSMap);

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
      RegionInfo region = entry.getKey();
      ServerName primarySN = entry.getValue();
      favoredNodesForRegion
        .add(ServerName.valueOf(primarySN.getHostname(), primarySN.getPort(), NON_STARTCODE));
      ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(region);
      if (secondaryAndTertiaryNodes != null) {
        favoredNodesForRegion.add(ServerName.valueOf(secondaryAndTertiaryNodes[0].getHostname(),
          secondaryAndTertiaryNodes[0].getPort(), NON_STARTCODE));
        favoredNodesForRegion.add(ServerName.valueOf(secondaryAndTertiaryNodes[1].getHostname(),
          secondaryAndTertiaryNodes[1].getPort(), NON_STARTCODE));
      }
      generatedFavNodes.put(region, favoredNodesForRegion);
    }
    return generatedFavNodes;
  }

  /*
   * Gets the rack of a server from the local mapping when present; this saves a lookup through
   * the RackManager.
   */
  private String getRackOfServer(ServerName sn) {
    if (this.regionServerToRackMap.containsKey(sn.getHostname())) {
      return this.regionServerToRackMap.get(sn.getHostname());
    } else {
      String rack = this.rackManager.getRack(sn);
      this.regionServerToRackMap.put(sn.getHostname(), rack);
      return rack;
    }
  }
}