1 /**
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 package org.apache.hadoop.hbase.master;
19 
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collection;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Set;
30 import java.util.TreeMap;
31 
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.conf.Configuration;
35 import org.apache.hadoop.hbase.HConstants;
36 import org.apache.hadoop.hbase.HRegionInfo;
37 import org.apache.hadoop.hbase.HTableDescriptor;
38 import org.apache.hadoop.hbase.MetaTableAccessor;
39 import org.apache.hadoop.hbase.RegionTransition;
40 import org.apache.hadoop.hbase.Server;
41 import org.apache.hadoop.hbase.ServerLoad;
42 import org.apache.hadoop.hbase.ServerName;
43 import org.apache.hadoop.hbase.TableName;
44 import org.apache.hadoop.hbase.TableStateManager;
45 import org.apache.hadoop.hbase.classification.InterfaceAudience;
46 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
47 import org.apache.hadoop.hbase.master.RegionState.State;
48 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
49 import org.apache.hadoop.hbase.util.Bytes;
50 import org.apache.hadoop.hbase.util.FSUtils;
51 import org.apache.hadoop.hbase.util.Pair;
52 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
53 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
54 import org.apache.zookeeper.KeeperException;
55 
56 import com.google.common.annotations.VisibleForTesting;
57 import com.google.common.base.Preconditions;
58 
/**
 * Region state accountant. It holds the states of all regions in memory.
 * In the normal scenario, it should match the meta table and the true region states.
 *
 * This map is used by AssignmentManager to track region states.
 */
65 @InterfaceAudience.Private
66 public class RegionStates {
  private static final Log LOG = LogFactory.getLog(RegionStates.class);

  /**
   * Regions currently in transition, keyed by encoded region name.
   */
  final HashMap<String, RegionState> regionsInTransition =
    new HashMap<String, RegionState>();

  /**
   * Region encoded name to state map.
   * All the regions should be in this map.
   */
  private final Map<String, RegionState> regionStates =
    new HashMap<String, RegionState>();

  /**
   * Holds the mapping of table -> (encoded region name -> region state).
   * Kept in step with {@link #regionStates} whenever a state is stored.
   */
  private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
      new HashMap<TableName, Map<String, RegionState>>();

  /**
   * Server to regions assignment map.
   * Contains the set of regions currently assigned to a given server.
   */
  private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
    new HashMap<ServerName, Set<HRegionInfo>>();

  /**
   * Maintains the mapping from the default replica of a region to the
   * replicas of that region registered so far. The set holds every region
   * info that was registered, which may include the default replica itself.
   */
  private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
    new HashMap<HRegionInfo, Set<HRegionInfo>>();

  /**
   * Region to server assignment map.
   * Contains the server a given region is currently assigned to.
   * Sorted, so that all regions of one table can be found with a range scan.
   */
  private final TreeMap<HRegionInfo, ServerName> regionAssignments =
    new TreeMap<HRegionInfo, ServerName>();

  /**
   * Encoded region name to server assignment map for re-assignment
   * purposes. Contains the server a given region was last known to be
   * assigned to, which has not completed log splitting, so the region is
   * not assignable.
   * If a region is currently assigned, the server info in this
   * map should be the same as that in regionAssignments.
   * However, the info in regionAssignments is cleared when the region
   * is offline, while the info in lastAssignments is cleared when
   * the region is closed or the server is dead and processed.
   */
  private final HashMap<String, ServerName> lastAssignments =
    new HashMap<String, ServerName>();

  /**
   * Encoded region name to server assignment map, used to clean up
   * serverHoldings when a region comes online on a new server.
   * When the region goes offline from the previous server, we clean up
   * regionAssignments so that it has the latest assignment map, but we
   * do not clean up serverHoldings to match the meta. This map is needed
   * to find the old server whose serverHoldings needs cleanup, given a
   * moved region.
   */
  private final HashMap<String, ServerName> oldAssignments =
    new HashMap<String, ServerName>();

  /**
   * Maps a host port pair string to the latest start code
   * of a region server which is known to be dead. It is dead
   * to us, but server manager may not know it yet.
   */
  private final HashMap<String, Long> deadServers =
    new HashMap<String, Long>();

  /**
   * Maps a dead server to the time when its log split was done.
   * Since log splitting is not ordered, we have to remember
   * all processed instances. The map is cleaned up based
   * on a configured time. By default, we assume a dead
   * server should be done with log splitting in two hours.
   */
  private final HashMap<ServerName, Long> processedServers =
    new HashMap<ServerName, Long>();
  // Wall-clock time (milliseconds) of the last cleanup pass over processedServers.
  private long lastProcessedServerCleanTime;

  // Collaborators handed in through the constructor.
  private final TableStateManager tableStateManager;
  private final RegionStateStore regionStateStore;
  private final ServerManager serverManager;
  private final Server server;

  // Configuration key: the maximum time (in milliseconds) to keep log split
  // info in the region states map.
  static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
  static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
160 
RegionStates(final Server master, final TableStateManager tableStateManager, final ServerManager serverManager, final RegionStateStore regionStateStore)161   RegionStates(final Server master, final TableStateManager tableStateManager,
162       final ServerManager serverManager, final RegionStateStore regionStateStore) {
163     this.tableStateManager = tableStateManager;
164     this.regionStateStore = regionStateStore;
165     this.serverManager = serverManager;
166     this.server = master;
167   }
168 
169   /**
170    * @return a copy of the region assignment map
171    */
getRegionAssignments()172   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
173     return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
174   }
175 
176   /**
177    * Return the replicas (including default) for the regions grouped by ServerName
178    * @param regions
179    * @return a pair containing the groupings as a map
180    */
getRegionAssignments( Collection<HRegionInfo> regions)181   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
182     Collection<HRegionInfo> regions) {
183     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
184     for (HRegionInfo region : regions) {
185       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
186       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
187       if (allReplicas != null) {
188         for (HRegionInfo hri : allReplicas) {
189           ServerName server = regionAssignments.get(hri);
190           if (server != null) {
191             List<HRegionInfo> regionsOnServer = map.get(server);
192             if (regionsOnServer == null) {
193               regionsOnServer = new ArrayList<HRegionInfo>(1);
194               map.put(server, regionsOnServer);
195             }
196             regionsOnServer.add(hri);
197           }
198         }
199       }
200     }
201     return map;
202   }
203 
getRegionServerOfRegion(HRegionInfo hri)204   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
205     return regionAssignments.get(hri);
206   }
207 
208   /**
209    * Get regions in transition and their states
210    */
211   @SuppressWarnings("unchecked")
getRegionsInTransition()212   public synchronized Map<String, RegionState> getRegionsInTransition() {
213     return (Map<String, RegionState>)regionsInTransition.clone();
214   }
215 
216   /**
217    * @return True if specified region in transition.
218    */
isRegionInTransition(final HRegionInfo hri)219   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
220     return regionsInTransition.containsKey(hri.getEncodedName());
221   }
222 
223   /**
224    * @return True if specified region in transition.
225    */
isRegionInTransition(final String encodedName)226   public synchronized boolean isRegionInTransition(final String encodedName) {
227     return regionsInTransition.containsKey(encodedName);
228   }
229 
230   /**
231    * @return True if any region in transition.
232    */
isRegionsInTransition()233   public synchronized boolean isRegionsInTransition() {
234     return !regionsInTransition.isEmpty();
235   }
236 
237   /**
238    * @return True if specified region assigned, and not in transition.
239    */
isRegionOnline(final HRegionInfo hri)240   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
241     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
242   }
243 
244   /**
245    * @return True if specified region offline/closed, but not in transition.
246    * If the region is not in the map, it is offline to us too.
247    */
isRegionOffline(final HRegionInfo hri)248   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
249     return getRegionState(hri) == null || (!isRegionInTransition(hri)
250       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
251   }
252 
253   /**
254    * @return True if specified region is in one of the specified states.
255    */
isRegionInState( final HRegionInfo hri, final State... states)256   public boolean isRegionInState(
257       final HRegionInfo hri, final State... states) {
258     return isRegionInState(hri.getEncodedName(), states);
259   }
260 
261   /**
262    * @return True if specified region is in one of the specified states.
263    */
isRegionInState( final String encodedName, final State... states)264   public boolean isRegionInState(
265       final String encodedName, final State... states) {
266     RegionState regionState = getRegionState(encodedName);
267     return isOneOfStates(regionState, states);
268   }
269 
270   /**
271    * Wait for the state map to be updated by assignment manager.
272    */
waitForUpdate( final long timeout)273   public synchronized void waitForUpdate(
274       final long timeout) throws InterruptedException {
275     this.wait(timeout);
276   }
277 
278   /**
279    * Get region transition state
280    */
getRegionTransitionState(final HRegionInfo hri)281   public RegionState getRegionTransitionState(final HRegionInfo hri) {
282     return getRegionTransitionState(hri.getEncodedName());
283   }
284 
285   /**
286    * Get region transition state
287    */
288   public synchronized RegionState
getRegionTransitionState(final String encodedName)289       getRegionTransitionState(final String encodedName) {
290     return regionsInTransition.get(encodedName);
291   }
292 
293   /**
294    * Add a list of regions to RegionStates. If a region is split
295    * and offline, its state will be SPLIT. Otherwise, its state will
296    * be OFFLINE. Region already in RegionStates will be skipped.
297    */
createRegionStates( final List<HRegionInfo> hris)298   public void createRegionStates(
299       final List<HRegionInfo> hris) {
300     for (HRegionInfo hri: hris) {
301       createRegionState(hri);
302     }
303   }
304 
305   /**
306    * Add a region to RegionStates. If the region is split
307    * and offline, its state will be SPLIT. Otherwise, its state will
308    * be OFFLINE. If it is already in RegionStates, this call has
309    * no effect, and the original state is returned.
310    */
createRegionState(final HRegionInfo hri)311   public RegionState createRegionState(final HRegionInfo hri) {
312     return createRegionState(hri, null, null, null);
313   }
314 
315   /**
316    * Add a region to RegionStates with the specified state.
317    * If the region is already in RegionStates, this call has
318    * no effect, and the original state is returned.
319    *
320    * @param hri the region info to create a state for
321    * @param newState the state to the region in set to
322    * @param serverName the server the region is transitioning on
323    * @param lastHost the last server that hosts the region
324    * @return the current state
325    */
createRegionState(final HRegionInfo hri, State newState, ServerName serverName, ServerName lastHost)326   public synchronized RegionState createRegionState(final HRegionInfo hri,
327       State newState, ServerName serverName, ServerName lastHost) {
328     if (newState == null || (newState == State.OPEN && serverName == null)) {
329       newState =  State.OFFLINE;
330     }
331     if (hri.isOffline() && hri.isSplit()) {
332       newState = State.SPLIT;
333       serverName = null;
334     }
335     String encodedName = hri.getEncodedName();
336     RegionState regionState = regionStates.get(encodedName);
337     if (regionState != null) {
338       LOG.warn("Tried to create a state for a region already in RegionStates, "
339         + "used existing: " + regionState + ", ignored new: " + newState);
340     } else {
341       regionState = new RegionState(hri, newState, serverName);
342       putRegionState(regionState);
343       if (newState == State.OPEN) {
344         if (!serverName.equals(lastHost)) {
345           LOG.warn("Open region's last host " + lastHost
346             + " should be the same as the current one " + serverName
347             + ", ignored the last and used the current one");
348           lastHost = serverName;
349         }
350         lastAssignments.put(encodedName, lastHost);
351         regionAssignments.put(hri, lastHost);
352       } else if (!regionState.isUnassignable()) {
353         regionsInTransition.put(encodedName, regionState);
354       }
355       if (lastHost != null && newState != State.SPLIT) {
356         addToServerHoldings(lastHost, hri);
357         if (newState != State.OPEN) {
358           oldAssignments.put(encodedName, lastHost);
359         }
360       }
361     }
362     return regionState;
363   }
364 
putRegionState(RegionState regionState)365   private RegionState putRegionState(RegionState regionState) {
366     HRegionInfo hri = regionState.getRegion();
367     String encodedName = hri.getEncodedName();
368     TableName table = hri.getTable();
369     RegionState oldState = regionStates.put(encodedName, regionState);
370     Map<String, RegionState> map = regionStatesTableIndex.get(table);
371     if (map == null) {
372       map = new HashMap<String, RegionState>();
373       regionStatesTableIndex.put(table, map);
374     }
375     map.put(encodedName, regionState);
376     return oldState;
377   }
378 
379   /**
380    * Update a region state. It will be put in transition if not already there.
381    */
updateRegionState( final HRegionInfo hri, final State state)382   public RegionState updateRegionState(
383       final HRegionInfo hri, final State state) {
384     RegionState regionState = getRegionState(hri.getEncodedName());
385     return updateRegionState(hri, state,
386       regionState == null ? null : regionState.getServerName());
387   }
388 
389   /**
390    * Update a region state. It will be put in transition if not already there.
391    *
392    * If we can't find the region info based on the region name in
393    * the transition, log a warning and return null.
394    */
updateRegionState( final RegionTransition transition, final State state)395   public RegionState updateRegionState(
396       final RegionTransition transition, final State state) {
397     byte [] regionName = transition.getRegionName();
398     HRegionInfo regionInfo = getRegionInfo(regionName);
399     if (regionInfo == null) {
400       String prettyRegionName = HRegionInfo.prettyPrint(
401         HRegionInfo.encodeRegionName(regionName));
402       LOG.warn("Failed to find region " + prettyRegionName
403         + " in updating its state to " + state
404         + " based on region transition " + transition);
405       return null;
406     }
407     return updateRegionState(regionInfo, state,
408       transition.getServerName());
409   }
410 
411   /**
412    * Transition a region state to OPEN from OPENING/PENDING_OPEN
413    */
transitionOpenFromPendingOpenOrOpeningOnServer( final RegionTransition transition, final RegionState fromState, final ServerName sn)414   public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
415       final RegionTransition transition, final RegionState fromState, final ServerName sn) {
416     if(fromState.isPendingOpenOrOpeningOnServer(sn)){
417       return updateRegionState(transition, State.OPEN);
418     }
419     return null;
420   }
421 
422   /**
423    * Update a region state. It will be put in transition if not already there.
424    */
updateRegionState( final HRegionInfo hri, final State state, final ServerName serverName)425   public RegionState updateRegionState(
426       final HRegionInfo hri, final State state, final ServerName serverName) {
427     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
428   }
429 
regionOnline(final HRegionInfo hri, final ServerName serverName)430   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
431     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
432   }
433 
  /**
   * A region is online, won't be in transition any more.
   * We can't confirm it is really online on specified region server
   * because it hasn't been put in region server's online region list yet.
   * <p>
   * If the server manager does not consider {@code serverName} online, the
   * call only logs a warning and changes nothing.
   *
   * @param hri the region that came online
   * @param serverName the server reported to host the region
   * @param openSeqNum passed on unchanged to the state update; the
   *   two-argument overload supplies {@link HConstants#NO_SEQNUM}
   */
  public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
    String encodedName = hri.getEncodedName();
    if (!serverManager.isServerOnline(serverName)) {
      // This is possible if the region server dies before master gets a
      // chance to handle ZK event in time. At this time, if the dead server
      // is already processed by SSH, we should ignore this event.
      // If not processed yet, ignore and let SSH deal with it.
      LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
      return;
    }
    // The state update happens before, and outside of, the locked block below.
    updateRegionState(hri, State.OPEN, serverName, openSeqNum);

    synchronized (this) {
      regionsInTransition.remove(encodedName);
      ServerName oldServerName = regionAssignments.put(hri, serverName);
      // Only touch the holdings when the assignment actually changed.
      if (!serverName.equals(oldServerName)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
        }
        addToServerHoldings(serverName, hri);
        addToReplicaMapping(hri);
        if (oldServerName == null) {
          // No live assignment was replaced; fall back to the server
          // remembered when the region went offline, if any.
          oldServerName = oldAssignments.remove(encodedName);
        }
        if (oldServerName != null
            && !oldServerName.equals(serverName)
            && serverHoldings.containsKey(oldServerName)) {
          // The region moved: drop it from the previous server's holdings.
          LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
          removeFromServerHoldings(oldServerName, hri);
        }
      }
    }
  }
472 
addToServerHoldings(ServerName serverName, HRegionInfo hri)473   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
474     Set<HRegionInfo> regions = serverHoldings.get(serverName);
475     if (regions == null) {
476       regions = new HashSet<HRegionInfo>();
477       serverHoldings.put(serverName, regions);
478     }
479     regions.add(hri);
480   }
481 
addToReplicaMapping(HRegionInfo hri)482   private void addToReplicaMapping(HRegionInfo hri) {
483     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
484     Set<HRegionInfo> replicas =
485         defaultReplicaToOtherReplicas.get(defaultReplica);
486     if (replicas == null) {
487       replicas = new HashSet<HRegionInfo>();
488       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
489     }
490     replicas.add(hri);
491   }
492 
removeFromServerHoldings(ServerName serverName, HRegionInfo hri)493   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
494     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
495     oldRegions.remove(hri);
496     if (oldRegions.isEmpty()) {
497       serverHoldings.remove(serverName);
498     }
499   }
500 
removeFromReplicaMapping(HRegionInfo hri)501   private void removeFromReplicaMapping(HRegionInfo hri) {
502     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
503     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
504     if (replicas != null) {
505       replicas.remove(hri);
506       if (replicas.isEmpty()) {
507         defaultReplicaToOtherReplicas.remove(defaultReplica);
508       }
509     }
510   }
511 
512   /**
513    * A dead server's wals have been split so that all the regions
514    * used to be open on it can be safely assigned now. Mark them assignable.
515    */
logSplit(final ServerName serverName)516   public synchronized void logSplit(final ServerName serverName) {
517     for (Iterator<Map.Entry<String, ServerName>> it
518         = lastAssignments.entrySet().iterator(); it.hasNext();) {
519       Map.Entry<String, ServerName> e = it.next();
520       if (e.getValue().equals(serverName)) {
521         it.remove();
522       }
523     }
524     long now = System.currentTimeMillis();
525     if (LOG.isDebugEnabled()) {
526       LOG.debug("Adding to log splitting servers " + serverName);
527     }
528     processedServers.put(serverName, Long.valueOf(now));
529     Configuration conf = server.getConfiguration();
530     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
531     // Doesn't have to be very accurate about the clean up time
532     if (now > lastProcessedServerCleanTime + obsoleteTime) {
533       lastProcessedServerCleanTime = now;
534       long cutoff = now - obsoleteTime;
535       for (Iterator<Map.Entry<ServerName, Long>> it
536           = processedServers.entrySet().iterator(); it.hasNext();) {
537         Map.Entry<ServerName, Long> e = it.next();
538         if (e.getValue().longValue() < cutoff) {
539           if (LOG.isDebugEnabled()) {
540             LOG.debug("Removed from log splitting servers " + e.getKey());
541           }
542           it.remove();
543         }
544       }
545     }
546   }
547 
548   /**
549    * Log split is done for a given region, so it is assignable now.
550    */
logSplit(final HRegionInfo region)551   public void logSplit(final HRegionInfo region) {
552     clearLastAssignment(region);
553   }
554 
clearLastAssignment(final HRegionInfo region)555   public synchronized void clearLastAssignment(final HRegionInfo region) {
556     lastAssignments.remove(region.getEncodedName());
557   }
558 
559   /**
560    * A region is offline, won't be in transition any more.
561    */
regionOffline(final HRegionInfo hri)562   public void regionOffline(final HRegionInfo hri) {
563     regionOffline(hri, null);
564   }
565 
  /**
   * A region is offline, won't be in transition any more. Its state
   * should be the specified expected state, which can only be
   * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
   * <p>
   * A region currently in SPLITTING_NEW or MERGING_NEW state is deleted
   * from this object altogether instead of being marked offline.
   *
   * @param hri the region that went offline
   * @param expectedState the state to record, or null for OFFLINE
   * @throws IllegalArgumentException if {@code expectedState} is neither
   *   null nor an unassignable state
   */
  public void regionOffline(
      final HRegionInfo hri, final State expectedState) {
    Preconditions.checkArgument(expectedState == null
      || RegionState.isUnassignable(expectedState),
        "Offlined region should not be " + expectedState);
    if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
      // Remove it from all region maps
      deleteRegion(hri);
      return;
    }
    State newState =
      expectedState == null ? State.OFFLINE : expectedState;
    // The state update happens before, and outside of, the locked block below.
    updateRegionState(hri, newState);
    String encodedName = hri.getEncodedName();
    synchronized (this) {
      regionsInTransition.remove(encodedName);
      ServerName oldServerName = regionAssignments.remove(hri);
      if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
        if (newState == State.MERGED || newState == State.SPLIT
            || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
              ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
          // Offline the region only if it's merged/split, it is the meta region,
          // or the table is disabled/disabling.
          // Otherwise, offline it from this server only when it is online on a different server.
          LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
          removeFromServerHoldings(oldServerName, hri);
          removeFromReplicaMapping(hri);
        } else {
          // Need to remember it so that we can offline it from this
          // server when it is online on a different server.
          oldAssignments.put(encodedName, oldServerName);
        }
      }
    }
  }
605 
  /**
   * A server is offline, all regions on it are dead.
   * <p>
   * Regions held by the server that are online, or that are splitting or
   * merging, are collected under the lock and marked offline after the
   * lock is released. Threads waiting on this object are notified.
   *
   * @param watcher the ZooKeeper watcher used to delete the znodes of
   *   splitting/merging regions
   * @param sn the server that went offline
   * @return the regions in transition on {@code sn} that were found in a
   *   pending-open, opening, failed-close or offline state
   */
  public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
    // Offline all regions on this server not already in transition.
    List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
    Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
    // Offline regions outside the loop and the synchronized block to avoid
    // a ConcurrentModificationException, and to avoid a deadlock in case
    // meta is unassigned while RegionStates is blocked.
    Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
    synchronized (this) {
      Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
      if (assignedRegions == null) {
        // Nothing recorded for this server; use an empty set so the loops below are no-ops.
        assignedRegions = new HashSet<HRegionInfo>();
      }

      for (HRegionInfo region : assignedRegions) {
        // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
        if (isRegionOnline(region)) {
          regionsToOffline.add(region);
        } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
          LOG.debug("Offline splitting/merging region " + getRegionState(region));
          try {
            // Delete the ZNode if exists
            ZKAssign.deleteNodeFailSilent(watcher, region);
            regionsToOffline.add(region);
          } catch (KeeperException ke) {
            server.abort("Unexpected ZK exception deleting node " + region, ke);
          }
        }
      }

      for (RegionState state : regionsInTransition.values()) {
        HRegionInfo hri = state.getRegion();
        if (assignedRegions.contains(hri)) {
          // Region is open on this region server, but in transition.
          // This region must be moving away from this server, or splitting/merging.
          // SSH will handle it, either skip assigning, or re-assign.
          LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
        } else if (sn.equals(state.getServerName())) {
          // Region is in transition on this region server, and this
          // region is not open on this server. So the region must be
          // moving to this server from another one (i.e. opening or
          // pending open on this server, was open on another one).
          // Offline state is also kind of pending open if the region is in
          // transition. The region could be in failed_close state too if we have
          // tried several times to open it while this region server is not reachable.
          if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
            LOG.info("Found region in " + state +
              " to be reassigned by ServerCrashProcedure for " + sn);
            rits.add(hri);
          } else if(state.isSplittingNew()) {
            // Checked against hbase:meta after the lock is released.
            regionsToCleanIfNoMetaEntry.add(state.getRegion());
          } else {
            LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
          }
        }
      }
      // Wake up any thread blocked waiting on this object.
      this.notifyAll();
    }

    for (HRegionInfo hri : regionsToOffline) {
      regionOffline(hri);
    }

    cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
    return rits;
  }
675 
676   /**
677    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
678    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
679    */
cleanIfNoMetaEntry(Set<HRegionInfo> hris)680   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
681     if (hris.isEmpty()) return;
682     for (HRegionInfo hri: hris) {
683       try {
684         // This is RPC to meta table. It is done while we have a synchronize on
685         // regionstates. No progress will be made if meta is not available at this time.
686         // This is a cleanup task. Not critical.
687         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
688             null) {
689           regionOffline(hri);
690           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
691         }
692       } catch (IOException e) {
693         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
694       }
695     }
696   }
697 
698   /**
699    * Gets the online regions of the specified table.
700    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
701    * Only returns <em>online</em> regions.  If a region on this table has been
702    * closed during a disable, etc., it will be included in the returned list.
703    * So, the returned list may not necessarily be ALL regions in this table, its
704    * all the ONLINE regions in the table.
705    * @param tableName
706    * @return Online regions from <code>tableName</code>
707    */
getRegionsOfTable(TableName tableName)708   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
709     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
710     // boundary needs to have table's name but regionID 0 so that it is sorted
711     // before all table's regions.
712     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
713     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
714       if(!hri.getTable().equals(tableName)) break;
715       tableRegions.add(hri);
716     }
717     return tableRegions;
718   }
719 
720   /**
721    * Gets current state of all regions of the table.
722    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
723    * Method guaranteed to return keys for all states
724    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
725    *
726    * @param tableName
727    * @return Online regions from <code>tableName</code>
728    */
729   public synchronized Map<RegionState.State, List<HRegionInfo>>
getRegionByStateOfTable(TableName tableName)730   getRegionByStateOfTable(TableName tableName) {
731     Map<RegionState.State, List<HRegionInfo>> tableRegions =
732         new HashMap<State, List<HRegionInfo>>();
733     for (State state : State.values()) {
734       tableRegions.put(state, new ArrayList<HRegionInfo>());
735     }
736     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
737     if (indexMap == null)
738       return tableRegions;
739     for (RegionState regionState : indexMap.values()) {
740       tableRegions.get(regionState.getState()).add(regionState.getRegion());
741     }
742     return tableRegions;
743   }
744 
745   /**
746    * Wait on region to clear regions-in-transition.
747    * <p>
748    * If the region isn't in transition, returns immediately.  Otherwise, method
749    * blocks until the region is out of transition.
750    */
waitOnRegionToClearRegionsInTransition( final HRegionInfo hri)751   public synchronized void waitOnRegionToClearRegionsInTransition(
752       final HRegionInfo hri) throws InterruptedException {
753     if (!isRegionInTransition(hri)) return;
754 
755     while(!server.isStopped() && isRegionInTransition(hri)) {
756       RegionState rs = getRegionState(hri);
757       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
758       waitForUpdate(100);
759     }
760 
761     if (server.isStopped()) {
762       LOG.info("Giving up wait on region in " +
763         "transition because stoppable.isStopped is set");
764     }
765   }
766 
767   /**
768    * A table is deleted. Remove its regions from all internal maps.
769    * We loop through all regions assuming we don't delete tables too much.
770    */
tableDeleted(final TableName tableName)771   public void tableDeleted(final TableName tableName) {
772     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
773     synchronized (this) {
774       for (RegionState state: regionStates.values()) {
775         HRegionInfo region = state.getRegion();
776         if (region.getTable().equals(tableName)) {
777           regionsToDelete.add(region);
778         }
779       }
780     }
781     for (HRegionInfo region: regionsToDelete) {
782       deleteRegion(region);
783     }
784   }
785 
786   /**
787    * Get a copy of all regions assigned to a server
788    */
getServerRegions(ServerName serverName)789   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
790     Set<HRegionInfo> regions = serverHoldings.get(serverName);
791     if (regions == null) return null;
792     return new HashSet<HRegionInfo>(regions);
793   }
794 
795   /**
796    * Remove a region from all state maps.
797    */
798   @VisibleForTesting
deleteRegion(final HRegionInfo hri)799   public synchronized void deleteRegion(final HRegionInfo hri) {
800     String encodedName = hri.getEncodedName();
801     regionsInTransition.remove(encodedName);
802     regionStates.remove(encodedName);
803     TableName table = hri.getTable();
804     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
805     indexMap.remove(encodedName);
806     if (indexMap.size() == 0)
807       regionStatesTableIndex.remove(table);
808     lastAssignments.remove(encodedName);
809     ServerName sn = regionAssignments.remove(hri);
810     if (sn != null) {
811       Set<HRegionInfo> regions = serverHoldings.get(sn);
812       regions.remove(hri);
813     }
814   }
815 
816   /**
817    * Checking if a region was assigned to a server which is not online now.
818    * If so, we should hold re-assign this region till SSH has split its wals.
819    * Once logs are split, the last assignment of this region will be reset,
820    * which means a null last assignment server is ok for re-assigning.
821    *
822    * A region server could be dead but we don't know it yet. We may
823    * think it's online falsely. Therefore if a server is online, we still
824    * need to confirm it reachable and having the expected start code.
825    */
wasRegionOnDeadServer(final String encodedName)826   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
827     ServerName server = lastAssignments.get(encodedName);
828     return isServerDeadAndNotProcessed(server);
829   }
830 
isServerDeadAndNotProcessed(ServerName server)831   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
832     if (server == null) return false;
833     if (serverManager.isServerOnline(server)) {
834       String hostAndPort = server.getHostAndPort();
835       long startCode = server.getStartcode();
836       Long deadCode = deadServers.get(hostAndPort);
837       if (deadCode == null || startCode > deadCode.longValue()) {
838         if (serverManager.isServerReachable(server)) {
839           return false;
840         }
841         // The size of deadServers won't grow unbounded.
842         deadServers.put(hostAndPort, Long.valueOf(startCode));
843       }
844       // Watch out! If the server is not dead, the region could
845       // remain unassigned. That's why ServerManager#isServerReachable
846       // should use some retry.
847       //
848       // We cache this info since it is very unlikely for that
849       // instance to come back up later on. We don't want to expire
850       // the server since we prefer to let it die naturally.
851       LOG.warn("Couldn't reach online server " + server);
852     }
853     // Now, we know it's dead. Check if it's processed
854     return !processedServers.containsKey(server);
855   }
856 
857  /**
858    * Get the last region server a region was on for purpose of re-assignment,
859    * i.e. should the re-assignment be held back till log split is done?
860    */
getLastRegionServerOfRegion(final String encodedName)861   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
862     return lastAssignments.get(encodedName);
863   }
864 
setLastRegionServerOfRegions( final ServerName serverName, final List<HRegionInfo> regionInfos)865   synchronized void setLastRegionServerOfRegions(
866       final ServerName serverName, final List<HRegionInfo> regionInfos) {
867     for (HRegionInfo hri: regionInfos) {
868       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
869     }
870   }
871 
setLastRegionServerOfRegion( final ServerName serverName, final String encodedName)872   synchronized void setLastRegionServerOfRegion(
873       final ServerName serverName, final String encodedName) {
874     lastAssignments.put(encodedName, serverName);
875   }
876 
splitRegion(HRegionInfo p, HRegionInfo a, HRegionInfo b, ServerName sn)877   void splitRegion(HRegionInfo p,
878       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
879 
880     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
881     synchronized (this) {
882       // After PONR, split is considered to be done.
883       // Update server holdings to be aligned with the meta.
884       Set<HRegionInfo> regions = serverHoldings.get(sn);
885       if (regions == null) {
886         throw new IllegalStateException(sn + " should host some regions");
887       }
888       regions.remove(p);
889       regions.add(a);
890       regions.add(b);
891     }
892   }
893 
mergeRegions(HRegionInfo p, HRegionInfo a, HRegionInfo b, ServerName sn)894   void mergeRegions(HRegionInfo p,
895       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
896     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
897     synchronized (this) {
898       // After PONR, merge is considered to be done.
899       // Update server holdings to be aligned with the meta.
900       Set<HRegionInfo> regions = serverHoldings.get(sn);
901       if (regions == null) {
902         throw new IllegalStateException(sn + " should host some regions");
903       }
904       regions.remove(a);
905       regions.remove(b);
906       regions.add(p);
907     }
908   }
909 
getRegionReplication(HRegionInfo r)910   private int getRegionReplication(HRegionInfo r) throws IOException {
911     if (tableStateManager != null) {
912       HTableDescriptor htd = ((MasterServices)server).getTableDescriptors().get(r.getTable());
913       if (htd != null) {
914         return htd.getRegionReplication();
915       }
916     }
917     return 1;
918   }
919 
920   /**
921    * At cluster clean re/start, mark all user regions closed except those of tables
922    * that are excluded, such as disabled/disabling/enabling tables. All user regions
923    * and their previous locations are returned.
924    */
closeAllUserRegions(Set<TableName> excludedTables)925   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
926     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
927     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
928     for(RegionState state: regionStates.values()) {
929       HRegionInfo hri = state.getRegion();
930       if (state.isSplit() || hri.isSplit()) {
931         continue;
932       }
933       TableName tableName = hri.getTable();
934       if (!TableName.META_TABLE_NAME.equals(tableName)
935           && (noExcludeTables || !excludedTables.contains(tableName))) {
936         toBeClosed.add(hri);
937       }
938     }
939     Map<HRegionInfo, ServerName> allUserRegions =
940       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
941     for (HRegionInfo hri: toBeClosed) {
942       RegionState regionState = updateRegionState(hri, State.CLOSED);
943       allUserRegions.put(hri, regionState.getServerName());
944     }
945     return allUserRegions;
946   }
947 
948   /**
949    * Compute the average load across all region servers.
950    * Currently, this uses a very naive computation - just uses the number of
951    * regions being served, ignoring stats about number of requests.
952    * @return the average load
953    */
getAverageLoad()954   protected synchronized double getAverageLoad() {
955     int numServers = 0, totalLoad = 0;
956     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
957       Set<HRegionInfo> regions = e.getValue();
958       ServerName serverName = e.getKey();
959       int regionCount = regions.size();
960       if (serverManager.isServerOnline(serverName)) {
961         totalLoad += regionCount;
962         numServers++;
963       }
964     }
965     if (numServers > 1) {
966       // The master region server holds only a couple regions.
967       // Don't consider this server in calculating the average load
968       // if there are other region servers to avoid possible confusion.
969       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
970       if (hris != null) {
971         totalLoad -= hris.size();
972         numServers--;
973       }
974     }
975     return numServers == 0 ? 0.0 :
976       (double)totalLoad / (double)numServers;
977   }
978 
979   /**
980    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
981    * Can't let out original since it can change and at least the load balancer
982    * wants to iterate this exported list.  We need to synchronize on regions
983    * since all access to this.servers is under a lock on this.regions.
984    *
985    * @return A clone of current assignments by table.
986    */
987   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
getAssignmentsByTable()988       getAssignmentsByTable() {
989     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
990       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
991     synchronized (this) {
992       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
993         Map<ServerName, List<HRegionInfo>> svrToRegions =
994           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
995         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
996           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
997         }
998         result.put(TableName.valueOf("ensemble"), svrToRegions);
999       } else {
1000         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1001           for (HRegionInfo hri: e.getValue()) {
1002             if (hri.isMetaRegion()) continue;
1003             TableName tablename = hri.getTable();
1004             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
1005             if (svrToRegions == null) {
1006               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1007               result.put(tablename, svrToRegions);
1008             }
1009             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1010             if (regions == null) {
1011               regions = new ArrayList<HRegionInfo>();
1012               svrToRegions.put(e.getKey(), regions);
1013             }
1014             regions.add(hri);
1015           }
1016         }
1017       }
1018     }
1019 
1020     Map<ServerName, ServerLoad>
1021       onlineSvrs = serverManager.getOnlineServers();
1022     // Take care of servers w/o assignments, and remove servers in draining mode
1023     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1024     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1025       for (ServerName svr: onlineSvrs.keySet()) {
1026         if (!map.containsKey(svr)) {
1027           map.put(svr, new ArrayList<HRegionInfo>());
1028         }
1029       }
1030       map.keySet().removeAll(drainingServers);
1031     }
1032     return result;
1033   }
1034 
getRegionState(final HRegionInfo hri)1035   protected RegionState getRegionState(final HRegionInfo hri) {
1036     return getRegionState(hri.getEncodedName());
1037   }
1038 
1039   /**
1040    * Returns a clone of region assignments per server
1041    * @return a Map of ServerName to a List of HRegionInfo's
1042    */
getRegionAssignmentsByServer()1043   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1044     Map<ServerName, List<HRegionInfo>> regionsByServer =
1045         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1046     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1047       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1048     }
1049     return regionsByServer;
1050   }
1051 
getRegionState(final String encodedName)1052   protected synchronized RegionState getRegionState(final String encodedName) {
1053     return regionStates.get(encodedName);
1054   }
1055 
1056   /**
1057    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
1058    * @param  regionName
1059    * @return HRegionInfo for the region
1060    */
1061   @SuppressWarnings("deprecation")
getRegionInfo(final byte [] regionName)1062   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1063     String encodedName = HRegionInfo.encodeRegionName(regionName);
1064     RegionState regionState = getRegionState(encodedName);
1065     if (regionState != null) {
1066       return regionState.getRegion();
1067     }
1068 
1069     try {
1070       Pair<HRegionInfo, ServerName> p =
1071         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1072       HRegionInfo hri = p == null ? null : p.getFirst();
1073       if (hri != null) {
1074         createRegionState(hri);
1075       }
1076       return hri;
1077     } catch (IOException e) {
1078       server.abort("Aborting because error occoured while reading "
1079         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1080       return null;
1081     }
1082   }
1083 
isOneOfStates(RegionState regionState, State... states)1084   static boolean isOneOfStates(RegionState regionState, State... states) {
1085     State s = regionState != null ? regionState.getState() : null;
1086     for (State state: states) {
1087       if (s == state) return true;
1088     }
1089     return false;
1090   }
1091 
  /**
   * Update a region state. The new state is always put into the regions-in-transition map.
   * <p>
   * A new {@link RegionState} stamped with the current time is built. If it differs from
   * the currently recorded state, it is handed to the region state store before any
   * in-memory map is changed. The in-memory maps and the last-assignment bookkeeping are
   * then updated while holding the lock on this object, and waiters are notified.
   * @param hri region whose state changes
   * @param state the new state
   * @param serverName server associated with the new state, may be null
   * @param openSeqNum forwarded to the region state store when the state is persisted
   * @return the newly created region state
   */
  private RegionState updateRegionState(final HRegionInfo hri,
      final State state, final ServerName serverName, long openSeqNum) {
    if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
      LOG.warn("Failed to open/close " + hri.getShortNameToLog()
        + " on " + serverName + ", set to " + state);
    }

    String encodedName = hri.getEncodedName();
    RegionState regionState = new RegionState(
      hri, state, System.currentTimeMillis(), serverName);
    RegionState oldState = getRegionState(encodedName);
    if (!regionState.equals(oldState)) {
      LOG.info("Transition " + oldState + " to " + regionState);
      // Persist region state before updating in-memory info, if needed.
      // This call sits outside the synchronized block below, so this method itself does
      // not take the lock on this object while the store is updated.
      regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
    }

    synchronized (this) {
      regionsInTransition.put(encodedName, regionState);
      putRegionState(regionState);

      // For these states, region should be properly closed.
      // There should be no log splitting issue.
      if ((state == State.CLOSED || state == State.MERGED
          || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
        ServerName last = lastAssignments.get(encodedName);
        // NOTE(review): assumes lastAssignments never maps a key to null; a null value
        // would make last.equals(...) throw here -- confirm against the callers that put.
        if (last.equals(serverName)) {
          // Closed on the server it was last assigned to: forget the last assignment.
          lastAssignments.remove(encodedName);
        } else {
          LOG.warn(encodedName + " moved to " + state + " on "
            + serverName + ", expected " + last);
        }
      }

      // Once a region is opened, record its last assignment right away.
      if (serverName != null && state == State.OPEN) {
        ServerName last = lastAssignments.get(encodedName);
        if (!serverName.equals(last)) {
          lastAssignments.put(encodedName, serverName);
          // Only warn: the new assignment is recorded even when the previous host is
          // dead and not yet processed.
          if (last != null && isServerDeadAndNotProcessed(last)) {
            LOG.warn(encodedName + " moved to " + serverName
              + ", while it's previous host " + last
              + " is dead but not processed yet");
          }
        }
      }

      // notify the change to any thread waiting on this object
      this.notifyAll();
    }
    return regionState;
  }
1147 }
1148