1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 package org.apache.hadoop.hbase.master; 19 20 import java.io.IOException; 21 import java.util.ArrayList; 22 import java.util.Collection; 23 import java.util.Collections; 24 import java.util.HashMap; 25 import java.util.HashSet; 26 import java.util.Iterator; 27 import java.util.List; 28 import java.util.Map; 29 import java.util.Set; 30 import java.util.TreeMap; 31 32 import org.apache.commons.logging.Log; 33 import org.apache.commons.logging.LogFactory; 34 import org.apache.hadoop.conf.Configuration; 35 import org.apache.hadoop.hbase.HConstants; 36 import org.apache.hadoop.hbase.HRegionInfo; 37 import org.apache.hadoop.hbase.HTableDescriptor; 38 import org.apache.hadoop.hbase.MetaTableAccessor; 39 import org.apache.hadoop.hbase.RegionTransition; 40 import org.apache.hadoop.hbase.Server; 41 import org.apache.hadoop.hbase.ServerLoad; 42 import org.apache.hadoop.hbase.ServerName; 43 import org.apache.hadoop.hbase.TableName; 44 import org.apache.hadoop.hbase.TableStateManager; 45 import org.apache.hadoop.hbase.classification.InterfaceAudience; 46 import org.apache.hadoop.hbase.client.RegionReplicaUtil; 47 import 
org.apache.hadoop.hbase.master.RegionState.State; 48 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos; 49 import org.apache.hadoop.hbase.util.Bytes; 50 import org.apache.hadoop.hbase.util.FSUtils; 51 import org.apache.hadoop.hbase.util.Pair; 52 import org.apache.hadoop.hbase.zookeeper.ZKAssign; 53 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; 54 import org.apache.zookeeper.KeeperException; 55 56 import com.google.common.annotations.VisibleForTesting; 57 import com.google.common.base.Preconditions; 58 59 /** 60 * Region state accountant. It holds the states of all regions in the memory. 61 * In normal scenario, it should match the meta table and the true region states. 62 * 63 * This map is used by AssignmentManager to track region states. 64 */ 65 @InterfaceAudience.Private 66 public class RegionStates { 67 private static final Log LOG = LogFactory.getLog(RegionStates.class); 68 69 /** 70 * Regions currently in transition. 71 */ 72 final HashMap<String, RegionState> regionsInTransition = 73 new HashMap<String, RegionState>(); 74 75 /** 76 * Region encoded name to state map. 77 * All the regions should be in this map. 78 */ 79 private final Map<String, RegionState> regionStates = 80 new HashMap<String, RegionState>(); 81 82 /** 83 * Holds mapping of table -> region state 84 */ 85 private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex = 86 new HashMap<TableName, Map<String, RegionState>>(); 87 88 /** 89 * Server to regions assignment map. 90 * Contains the set of regions currently assigned to a given server. 91 */ 92 private final Map<ServerName, Set<HRegionInfo>> serverHoldings = 93 new HashMap<ServerName, Set<HRegionInfo>>(); 94 95 /** 96 * Maintains the mapping from the default region to the replica regions. 97 */ 98 private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas = 99 new HashMap<HRegionInfo, Set<HRegionInfo>>(); 100 101 /** 102 * Region to server assignment map. 
103 * Contains the server a given region is currently assigned to. 104 */ 105 private final TreeMap<HRegionInfo, ServerName> regionAssignments = 106 new TreeMap<HRegionInfo, ServerName>(); 107 108 /** 109 * Encoded region name to server assignment map for re-assignment 110 * purpose. Contains the server a given region is last known assigned 111 * to, which has not completed log splitting, so not assignable. 112 * If a region is currently assigned, this server info in this 113 * map should be the same as that in regionAssignments. 114 * However the info in regionAssignments is cleared when the region 115 * is offline while the info in lastAssignments is cleared when 116 * the region is closed or the server is dead and processed. 117 */ 118 private final HashMap<String, ServerName> lastAssignments = 119 new HashMap<String, ServerName>(); 120 121 /** 122 * Encoded region name to server assignment map for the 123 * purpose to clean up serverHoldings when a region is online 124 * on a new server. When the region is offline from the previous 125 * server, we cleaned up regionAssignments so that it has the 126 * latest assignment map. But we didn't clean up serverHoldings 127 * to match the meta. We need this map to find out the old server 128 * whose serverHoldings needs cleanup, given a moved region. 129 */ 130 private final HashMap<String, ServerName> oldAssignments = 131 new HashMap<String, ServerName>(); 132 133 /** 134 * Map a host port pair string to the latest start code 135 * of a region server which is known to be dead. It is dead 136 * to us, but server manager may not know it yet. 137 */ 138 private final HashMap<String, Long> deadServers = 139 new HashMap<String, Long>(); 140 141 /** 142 * Map a dead servers to the time when log split is done. 143 * Since log splitting is not ordered, we have to remember 144 * all processed instances. The map is cleaned up based 145 * on a configured time. 
By default, we assume a dead 146 * server should be done with log splitting in two hours. 147 */ 148 private final HashMap<ServerName, Long> processedServers = 149 new HashMap<ServerName, Long>(); 150 private long lastProcessedServerCleanTime; 151 152 private final TableStateManager tableStateManager; 153 private final RegionStateStore regionStateStore; 154 private final ServerManager serverManager; 155 private final Server server; 156 157 // The maximum time to keep a log split info in region states map 158 static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime"; 159 static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours 160 RegionStates(final Server master, final TableStateManager tableStateManager, final ServerManager serverManager, final RegionStateStore regionStateStore)161 RegionStates(final Server master, final TableStateManager tableStateManager, 162 final ServerManager serverManager, final RegionStateStore regionStateStore) { 163 this.tableStateManager = tableStateManager; 164 this.regionStateStore = regionStateStore; 165 this.serverManager = serverManager; 166 this.server = master; 167 } 168 169 /** 170 * @return a copy of the region assignment map 171 */ getRegionAssignments()172 public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() { 173 return new TreeMap<HRegionInfo, ServerName>(regionAssignments); 174 } 175 176 /** 177 * Return the replicas (including default) for the regions grouped by ServerName 178 * @param regions 179 * @return a pair containing the groupings as a map 180 */ getRegionAssignments( Collection<HRegionInfo> regions)181 synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments( 182 Collection<HRegionInfo> regions) { 183 Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>(); 184 for (HRegionInfo region : regions) { 185 HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region); 186 Set<HRegionInfo> allReplicas = 
defaultReplicaToOtherReplicas.get(defaultReplica); 187 if (allReplicas != null) { 188 for (HRegionInfo hri : allReplicas) { 189 ServerName server = regionAssignments.get(hri); 190 if (server != null) { 191 List<HRegionInfo> regionsOnServer = map.get(server); 192 if (regionsOnServer == null) { 193 regionsOnServer = new ArrayList<HRegionInfo>(1); 194 map.put(server, regionsOnServer); 195 } 196 regionsOnServer.add(hri); 197 } 198 } 199 } 200 } 201 return map; 202 } 203 getRegionServerOfRegion(HRegionInfo hri)204 public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) { 205 return regionAssignments.get(hri); 206 } 207 208 /** 209 * Get regions in transition and their states 210 */ 211 @SuppressWarnings("unchecked") getRegionsInTransition()212 public synchronized Map<String, RegionState> getRegionsInTransition() { 213 return (Map<String, RegionState>)regionsInTransition.clone(); 214 } 215 216 /** 217 * @return True if specified region in transition. 218 */ isRegionInTransition(final HRegionInfo hri)219 public synchronized boolean isRegionInTransition(final HRegionInfo hri) { 220 return regionsInTransition.containsKey(hri.getEncodedName()); 221 } 222 223 /** 224 * @return True if specified region in transition. 225 */ isRegionInTransition(final String encodedName)226 public synchronized boolean isRegionInTransition(final String encodedName) { 227 return regionsInTransition.containsKey(encodedName); 228 } 229 230 /** 231 * @return True if any region in transition. 232 */ isRegionsInTransition()233 public synchronized boolean isRegionsInTransition() { 234 return !regionsInTransition.isEmpty(); 235 } 236 237 /** 238 * @return True if specified region assigned, and not in transition. 
239 */ isRegionOnline(final HRegionInfo hri)240 public synchronized boolean isRegionOnline(final HRegionInfo hri) { 241 return !isRegionInTransition(hri) && regionAssignments.containsKey(hri); 242 } 243 244 /** 245 * @return True if specified region offline/closed, but not in transition. 246 * If the region is not in the map, it is offline to us too. 247 */ isRegionOffline(final HRegionInfo hri)248 public synchronized boolean isRegionOffline(final HRegionInfo hri) { 249 return getRegionState(hri) == null || (!isRegionInTransition(hri) 250 && isRegionInState(hri, State.OFFLINE, State.CLOSED)); 251 } 252 253 /** 254 * @return True if specified region is in one of the specified states. 255 */ isRegionInState( final HRegionInfo hri, final State... states)256 public boolean isRegionInState( 257 final HRegionInfo hri, final State... states) { 258 return isRegionInState(hri.getEncodedName(), states); 259 } 260 261 /** 262 * @return True if specified region is in one of the specified states. 263 */ isRegionInState( final String encodedName, final State... states)264 public boolean isRegionInState( 265 final String encodedName, final State... states) { 266 RegionState regionState = getRegionState(encodedName); 267 return isOneOfStates(regionState, states); 268 } 269 270 /** 271 * Wait for the state map to be updated by assignment manager. 
272 */ waitForUpdate( final long timeout)273 public synchronized void waitForUpdate( 274 final long timeout) throws InterruptedException { 275 this.wait(timeout); 276 } 277 278 /** 279 * Get region transition state 280 */ getRegionTransitionState(final HRegionInfo hri)281 public RegionState getRegionTransitionState(final HRegionInfo hri) { 282 return getRegionTransitionState(hri.getEncodedName()); 283 } 284 285 /** 286 * Get region transition state 287 */ 288 public synchronized RegionState getRegionTransitionState(final String encodedName)289 getRegionTransitionState(final String encodedName) { 290 return regionsInTransition.get(encodedName); 291 } 292 293 /** 294 * Add a list of regions to RegionStates. If a region is split 295 * and offline, its state will be SPLIT. Otherwise, its state will 296 * be OFFLINE. Region already in RegionStates will be skipped. 297 */ createRegionStates( final List<HRegionInfo> hris)298 public void createRegionStates( 299 final List<HRegionInfo> hris) { 300 for (HRegionInfo hri: hris) { 301 createRegionState(hri); 302 } 303 } 304 305 /** 306 * Add a region to RegionStates. If the region is split 307 * and offline, its state will be SPLIT. Otherwise, its state will 308 * be OFFLINE. If it is already in RegionStates, this call has 309 * no effect, and the original state is returned. 310 */ createRegionState(final HRegionInfo hri)311 public RegionState createRegionState(final HRegionInfo hri) { 312 return createRegionState(hri, null, null, null); 313 } 314 315 /** 316 * Add a region to RegionStates with the specified state. 317 * If the region is already in RegionStates, this call has 318 * no effect, and the original state is returned. 
319 * 320 * @param hri the region info to create a state for 321 * @param newState the state to the region in set to 322 * @param serverName the server the region is transitioning on 323 * @param lastHost the last server that hosts the region 324 * @return the current state 325 */ createRegionState(final HRegionInfo hri, State newState, ServerName serverName, ServerName lastHost)326 public synchronized RegionState createRegionState(final HRegionInfo hri, 327 State newState, ServerName serverName, ServerName lastHost) { 328 if (newState == null || (newState == State.OPEN && serverName == null)) { 329 newState = State.OFFLINE; 330 } 331 if (hri.isOffline() && hri.isSplit()) { 332 newState = State.SPLIT; 333 serverName = null; 334 } 335 String encodedName = hri.getEncodedName(); 336 RegionState regionState = regionStates.get(encodedName); 337 if (regionState != null) { 338 LOG.warn("Tried to create a state for a region already in RegionStates, " 339 + "used existing: " + regionState + ", ignored new: " + newState); 340 } else { 341 regionState = new RegionState(hri, newState, serverName); 342 putRegionState(regionState); 343 if (newState == State.OPEN) { 344 if (!serverName.equals(lastHost)) { 345 LOG.warn("Open region's last host " + lastHost 346 + " should be the same as the current one " + serverName 347 + ", ignored the last and used the current one"); 348 lastHost = serverName; 349 } 350 lastAssignments.put(encodedName, lastHost); 351 regionAssignments.put(hri, lastHost); 352 } else if (!regionState.isUnassignable()) { 353 regionsInTransition.put(encodedName, regionState); 354 } 355 if (lastHost != null && newState != State.SPLIT) { 356 addToServerHoldings(lastHost, hri); 357 if (newState != State.OPEN) { 358 oldAssignments.put(encodedName, lastHost); 359 } 360 } 361 } 362 return regionState; 363 } 364 putRegionState(RegionState regionState)365 private RegionState putRegionState(RegionState regionState) { 366 HRegionInfo hri = regionState.getRegion(); 367 String 
encodedName = hri.getEncodedName(); 368 TableName table = hri.getTable(); 369 RegionState oldState = regionStates.put(encodedName, regionState); 370 Map<String, RegionState> map = regionStatesTableIndex.get(table); 371 if (map == null) { 372 map = new HashMap<String, RegionState>(); 373 regionStatesTableIndex.put(table, map); 374 } 375 map.put(encodedName, regionState); 376 return oldState; 377 } 378 379 /** 380 * Update a region state. It will be put in transition if not already there. 381 */ updateRegionState( final HRegionInfo hri, final State state)382 public RegionState updateRegionState( 383 final HRegionInfo hri, final State state) { 384 RegionState regionState = getRegionState(hri.getEncodedName()); 385 return updateRegionState(hri, state, 386 regionState == null ? null : regionState.getServerName()); 387 } 388 389 /** 390 * Update a region state. It will be put in transition if not already there. 391 * 392 * If we can't find the region info based on the region name in 393 * the transition, log a warning and return null. 
*/
  public RegionState updateRegionState(
      final RegionTransition transition, final State state) {
    byte[] regionName = transition.getRegionName();
    HRegionInfo regionInfo = getRegionInfo(regionName);
    if (regionInfo == null) {
      String prettyRegionName = HRegionInfo.prettyPrint(
        HRegionInfo.encodeRegionName(regionName));
      LOG.warn("Failed to find region " + prettyRegionName
        + " in updating its state to " + state
        + " based on region transition " + transition);
      return null;
    }
    return updateRegionState(regionInfo, state, transition.getServerName());
  }

  /**
   * Transition a region state to OPEN from OPENING/PENDING_OPEN
   */
  public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
      final RegionTransition transition, final RegionState fromState, final ServerName sn) {
    if (fromState.isPendingOpenOrOpeningOnServer(sn)) {
      return updateRegionState(transition, State.OPEN);
    }
    return null;
  }

  /**
   * Update a region state. It will be put in transition if not already there.
   */
  public RegionState updateRegionState(
      final HRegionInfo hri, final State state, final ServerName serverName) {
    return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
  }

  public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
    regionOnline(hri, serverName, HConstants.NO_SEQNUM);
  }

  /**
   * A region is online, won't be in transition any more.
   * We can't confirm it is really online on specified region server
   * because it hasn't been put in region server's online region list yet.
   */
  public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
    String encodedName = hri.getEncodedName();
    if (!serverManager.isServerOnline(serverName)) {
      // This is possible if the region server dies before master gets a
      // chance to handle ZK event in time. At this time, if the dead server
      // is already processed by SSH, we should ignore this event.
      // If not processed yet, ignore and let SSH deal with it.
      LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
      return;
    }
    updateRegionState(hri, State.OPEN, serverName, openSeqNum);

    synchronized (this) {
      regionsInTransition.remove(encodedName);
      ServerName oldServerName = regionAssignments.put(hri, serverName);
      if (serverName.equals(oldServerName)) {
        // Already recorded on this server; nothing to clean up.
        return;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
      }
      addToServerHoldings(serverName, hri);
      addToReplicaMapping(hri);
      if (oldServerName == null) {
        oldServerName = oldAssignments.remove(encodedName);
      }
      if (oldServerName != null
          && !oldServerName.equals(serverName)
          && serverHoldings.containsKey(oldServerName)) {
        LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
        removeFromServerHoldings(oldServerName, hri);
      }
    }
  }

  /**
   * Add the region to the set of regions held by the given server.
   */
  private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
    Set<HRegionInfo> regions = serverHoldings.get(serverName);
    if (regions == null) {
      regions = new HashSet<HRegionInfo>();
      serverHoldings.put(serverName, regions);
    }
    regions.add(hri);
  }

addToReplicaMapping(HRegionInfo hri)482 private void addToReplicaMapping(HRegionInfo hri) { 483 HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri); 484 Set<HRegionInfo> replicas = 485 defaultReplicaToOtherReplicas.get(defaultReplica); 486 if (replicas == null) { 487 replicas = new HashSet<HRegionInfo>(); 488 defaultReplicaToOtherReplicas.put(defaultReplica, replicas); 489 } 490 replicas.add(hri); 491 } 492 removeFromServerHoldings(ServerName serverName, HRegionInfo hri)493 private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) { 494 Set<HRegionInfo> oldRegions = serverHoldings.get(serverName); 495 oldRegions.remove(hri); 496 if (oldRegions.isEmpty()) { 497 serverHoldings.remove(serverName); 498 } 499 } 500 removeFromReplicaMapping(HRegionInfo hri)501 private void removeFromReplicaMapping(HRegionInfo hri) { 502 HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri); 503 Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica); 504 if (replicas != null) { 505 replicas.remove(hri); 506 if (replicas.isEmpty()) { 507 defaultReplicaToOtherReplicas.remove(defaultReplica); 508 } 509 } 510 } 511 512 /** 513 * A dead server's wals have been split so that all the regions 514 * used to be open on it can be safely assigned now. Mark them assignable. 
515 */ logSplit(final ServerName serverName)516 public synchronized void logSplit(final ServerName serverName) { 517 for (Iterator<Map.Entry<String, ServerName>> it 518 = lastAssignments.entrySet().iterator(); it.hasNext();) { 519 Map.Entry<String, ServerName> e = it.next(); 520 if (e.getValue().equals(serverName)) { 521 it.remove(); 522 } 523 } 524 long now = System.currentTimeMillis(); 525 if (LOG.isDebugEnabled()) { 526 LOG.debug("Adding to log splitting servers " + serverName); 527 } 528 processedServers.put(serverName, Long.valueOf(now)); 529 Configuration conf = server.getConfiguration(); 530 long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME); 531 // Doesn't have to be very accurate about the clean up time 532 if (now > lastProcessedServerCleanTime + obsoleteTime) { 533 lastProcessedServerCleanTime = now; 534 long cutoff = now - obsoleteTime; 535 for (Iterator<Map.Entry<ServerName, Long>> it 536 = processedServers.entrySet().iterator(); it.hasNext();) { 537 Map.Entry<ServerName, Long> e = it.next(); 538 if (e.getValue().longValue() < cutoff) { 539 if (LOG.isDebugEnabled()) { 540 LOG.debug("Removed from log splitting servers " + e.getKey()); 541 } 542 it.remove(); 543 } 544 } 545 } 546 } 547 548 /** 549 * Log split is done for a given region, so it is assignable now. 550 */ logSplit(final HRegionInfo region)551 public void logSplit(final HRegionInfo region) { 552 clearLastAssignment(region); 553 } 554 clearLastAssignment(final HRegionInfo region)555 public synchronized void clearLastAssignment(final HRegionInfo region) { 556 lastAssignments.remove(region.getEncodedName()); 557 } 558 559 /** 560 * A region is offline, won't be in transition any more. 561 */ regionOffline(final HRegionInfo hri)562 public void regionOffline(final HRegionInfo hri) { 563 regionOffline(hri, null); 564 } 565 566 /** 567 * A region is offline, won't be in transition any more. 
Its state 568 * should be the specified expected state, which can only be 569 * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew. 570 */ regionOffline( final HRegionInfo hri, final State expectedState)571 public void regionOffline( 572 final HRegionInfo hri, final State expectedState) { 573 Preconditions.checkArgument(expectedState == null 574 || RegionState.isUnassignable(expectedState), 575 "Offlined region should not be " + expectedState); 576 if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) { 577 // Remove it from all region maps 578 deleteRegion(hri); 579 return; 580 } 581 State newState = 582 expectedState == null ? State.OFFLINE : expectedState; 583 updateRegionState(hri, newState); 584 String encodedName = hri.getEncodedName(); 585 synchronized (this) { 586 regionsInTransition.remove(encodedName); 587 ServerName oldServerName = regionAssignments.remove(hri); 588 if (oldServerName != null && serverHoldings.containsKey(oldServerName)) { 589 if (newState == State.MERGED || newState == State.SPLIT 590 || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(), 591 ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { 592 // Offline the region only if it's merged/split, or the table is disabled/disabling. 593 // Otherwise, offline it from this server only when it is online on a different server. 594 LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName); 595 removeFromServerHoldings(oldServerName, hri); 596 removeFromReplicaMapping(hri); 597 } else { 598 // Need to remember it so that we can offline it from this 599 // server when it is online on a different server. 600 oldAssignments.put(encodedName, oldServerName); 601 } 602 } 603 } 604 } 605 606 /** 607 * A server is offline, all regions on it are dead. 
608 */ serverOffline(final ZooKeeperWatcher watcher, final ServerName sn)609 public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) { 610 // Offline all regions on this server not already in transition. 611 List<HRegionInfo> rits = new ArrayList<HRegionInfo>(); 612 Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>(); 613 // Offline regions outside the loop and synchronized block to avoid 614 // ConcurrentModificationException and deadlock in case of meta anassigned, 615 // but RegionState a blocked. 616 Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>(); 617 synchronized (this) { 618 Set<HRegionInfo> assignedRegions = serverHoldings.get(sn); 619 if (assignedRegions == null) { 620 assignedRegions = new HashSet<HRegionInfo>(); 621 } 622 623 for (HRegionInfo region : assignedRegions) { 624 // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE 625 if (isRegionOnline(region)) { 626 regionsToOffline.add(region); 627 } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) { 628 LOG.debug("Offline splitting/merging region " + getRegionState(region)); 629 try { 630 // Delete the ZNode if exists 631 ZKAssign.deleteNodeFailSilent(watcher, region); 632 regionsToOffline.add(region); 633 } catch (KeeperException ke) { 634 server.abort("Unexpected ZK exception deleting node " + region, ke); 635 } 636 } 637 } 638 639 for (RegionState state : regionsInTransition.values()) { 640 HRegionInfo hri = state.getRegion(); 641 if (assignedRegions.contains(hri)) { 642 // Region is open on this region server, but in transition. 643 // This region must be moving away from this server, or splitting/merging. 644 // SSH will handle it, either skip assigning, or re-assign. 
645 LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn); 646 } else if (sn.equals(state.getServerName())) { 647 // Region is in transition on this region server, and this 648 // region is not open on this server. So the region must be 649 // moving to this server from another one (i.e. opening or 650 // pending open on this server, was open on another one. 651 // Offline state is also kind of pending open if the region is in 652 // transition. The region could be in failed_close state too if we have 653 // tried several times to open it while this region server is not reachable) 654 if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) { 655 LOG.info("Found region in " + state + 656 " to be reassigned by ServerCrashProcedure for " + sn); 657 rits.add(hri); 658 } else if(state.isSplittingNew()) { 659 regionsToCleanIfNoMetaEntry.add(state.getRegion()); 660 } else { 661 LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state); 662 } 663 } 664 } 665 this.notifyAll(); 666 } 667 668 for (HRegionInfo hri : regionsToOffline) { 669 regionOffline(hri); 670 } 671 672 cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry); 673 return rits; 674 } 675 676 /** 677 * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held. 678 * @param hris The hris to check if empty in hbase:meta and if so, clean them up. 679 */ cleanIfNoMetaEntry(Set<HRegionInfo> hris)680 private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) { 681 if (hris.isEmpty()) return; 682 for (HRegionInfo hri: hris) { 683 try { 684 // This is RPC to meta table. It is done while we have a synchronize on 685 // regionstates. No progress will be made if meta is not available at this time. 686 // This is a cleanup task. Not critical. 
687 if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) == 688 null) { 689 regionOffline(hri); 690 FSUtils.deleteRegionDir(server.getConfiguration(), hri); 691 } 692 } catch (IOException e) { 693 LOG.warn("Got exception while deleting " + hri + " directories from file system.", e); 694 } 695 } 696 } 697 698 /** 699 * Gets the online regions of the specified table. 700 * This method looks at the in-memory state. It does not go to <code>hbase:meta</code>. 701 * Only returns <em>online</em> regions. If a region on this table has been 702 * closed during a disable, etc., it will be included in the returned list. 703 * So, the returned list may not necessarily be ALL regions in this table, its 704 * all the ONLINE regions in the table. 705 * @param tableName 706 * @return Online regions from <code>tableName</code> 707 */ getRegionsOfTable(TableName tableName)708 public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) { 709 List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>(); 710 // boundary needs to have table's name but regionID 0 so that it is sorted 711 // before all table's regions. 712 HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L); 713 for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) { 714 if(!hri.getTable().equals(tableName)) break; 715 tableRegions.add(hri); 716 } 717 return tableRegions; 718 } 719 720 /** 721 * Gets current state of all regions of the table. 722 * This method looks at the in-memory state. It does not go to <code>hbase:meta</code>. 
723 * Method guaranteed to return keys for all states 724 * in {@link org.apache.hadoop.hbase.master.RegionState.State} 725 * 726 * @param tableName 727 * @return Online regions from <code>tableName</code> 728 */ 729 public synchronized Map<RegionState.State, List<HRegionInfo>> getRegionByStateOfTable(TableName tableName)730 getRegionByStateOfTable(TableName tableName) { 731 Map<RegionState.State, List<HRegionInfo>> tableRegions = 732 new HashMap<State, List<HRegionInfo>>(); 733 for (State state : State.values()) { 734 tableRegions.put(state, new ArrayList<HRegionInfo>()); 735 } 736 Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName); 737 if (indexMap == null) 738 return tableRegions; 739 for (RegionState regionState : indexMap.values()) { 740 tableRegions.get(regionState.getState()).add(regionState.getRegion()); 741 } 742 return tableRegions; 743 } 744 745 /** 746 * Wait on region to clear regions-in-transition. 747 * <p> 748 * If the region isn't in transition, returns immediately. Otherwise, method 749 * blocks until the region is out of transition. 750 */ waitOnRegionToClearRegionsInTransition( final HRegionInfo hri)751 public synchronized void waitOnRegionToClearRegionsInTransition( 752 final HRegionInfo hri) throws InterruptedException { 753 if (!isRegionInTransition(hri)) return; 754 755 while(!server.isStopped() && isRegionInTransition(hri)) { 756 RegionState rs = getRegionState(hri); 757 LOG.info("Waiting on " + rs + " to clear regions-in-transition"); 758 waitForUpdate(100); 759 } 760 761 if (server.isStopped()) { 762 LOG.info("Giving up wait on region in " + 763 "transition because stoppable.isStopped is set"); 764 } 765 } 766 767 /** 768 * A table is deleted. Remove its regions from all internal maps. 769 * We loop through all regions assuming we don't delete tables too much. 
770 */ tableDeleted(final TableName tableName)771 public void tableDeleted(final TableName tableName) { 772 Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>(); 773 synchronized (this) { 774 for (RegionState state: regionStates.values()) { 775 HRegionInfo region = state.getRegion(); 776 if (region.getTable().equals(tableName)) { 777 regionsToDelete.add(region); 778 } 779 } 780 } 781 for (HRegionInfo region: regionsToDelete) { 782 deleteRegion(region); 783 } 784 } 785 786 /** 787 * Get a copy of all regions assigned to a server 788 */ getServerRegions(ServerName serverName)789 public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) { 790 Set<HRegionInfo> regions = serverHoldings.get(serverName); 791 if (regions == null) return null; 792 return new HashSet<HRegionInfo>(regions); 793 } 794 795 /** 796 * Remove a region from all state maps. 797 */ 798 @VisibleForTesting deleteRegion(final HRegionInfo hri)799 public synchronized void deleteRegion(final HRegionInfo hri) { 800 String encodedName = hri.getEncodedName(); 801 regionsInTransition.remove(encodedName); 802 regionStates.remove(encodedName); 803 TableName table = hri.getTable(); 804 Map<String, RegionState> indexMap = regionStatesTableIndex.get(table); 805 indexMap.remove(encodedName); 806 if (indexMap.size() == 0) 807 regionStatesTableIndex.remove(table); 808 lastAssignments.remove(encodedName); 809 ServerName sn = regionAssignments.remove(hri); 810 if (sn != null) { 811 Set<HRegionInfo> regions = serverHoldings.get(sn); 812 regions.remove(hri); 813 } 814 } 815 816 /** 817 * Checking if a region was assigned to a server which is not online now. 818 * If so, we should hold re-assign this region till SSH has split its wals. 819 * Once logs are split, the last assignment of this region will be reset, 820 * which means a null last assignment server is ok for re-assigning. 821 * 822 * A region server could be dead but we don't know it yet. We may 823 * think it's online falsely. 
   * Therefore if a server is online, we still
   * need to confirm it reachable and having the expected start code.
   */
  synchronized boolean wasRegionOnDeadServer(final String encodedName) {
    // NOTE: local 'server' is the region's last-known host,
    // shadowing the master Server field of this class.
    ServerName server = lastAssignments.get(encodedName);
    return isServerDeadAndNotProcessed(server);
  }

  /**
   * Decides whether the given server should be treated as dead with its WALs
   * not yet processed. An "online" but unreachable server is remembered in
   * {@code deadServers} (host:port -> start code) so the reachability RPC is
   * not repeated for the same instance.
   *
   * @param server last known host of a region; null returns false
   * @return true if the server is considered dead and not yet processed
   */
  synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
    if (server == null) return false;
    if (serverManager.isServerOnline(server)) {
      String hostAndPort = server.getHostAndPort();
      long startCode = server.getStartcode();
      Long deadCode = deadServers.get(hostAndPort);
      // Only probe when this exact instance (start code) hasn't already
      // been recorded as dead.
      if (deadCode == null || startCode > deadCode.longValue()) {
        if (serverManager.isServerReachable(server)) {
          return false;
        }
        // The size of deadServers won't grow unbounded.
        deadServers.put(hostAndPort, Long.valueOf(startCode));
      }
      // Watch out! If the server is not dead, the region could
      // remain unassigned. That's why ServerManager#isServerReachable
      // should use some retry.
      //
      // We cache this info since it is very unlikely for that
      // instance to come back up later on. We don't want to expire
      // the server since we prefer to let it die naturally.
      LOG.warn("Couldn't reach online server " + server);
    }
    // Now, we know it's dead. Check if it's processed
    return !processedServers.containsKey(server);
  }

  /**
   * Get the last region server a region was on for purpose of re-assignment,
   * i.e. should the re-assignment be held back till log split is done?
   */
  synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
    return lastAssignments.get(encodedName);
  }

  /**
   * Records {@code serverName} as the last host of each of the given regions.
   */
  synchronized void setLastRegionServerOfRegions(
      final ServerName serverName, final List<HRegionInfo> regionInfos) {
    for (HRegionInfo hri: regionInfos) {
      setLastRegionServerOfRegion(serverName, hri.getEncodedName());
    }
  }

  /**
   * Records {@code serverName} as the last host of the region with the given
   * encoded name.
   */
  synchronized void setLastRegionServerOfRegion(
      final ServerName serverName, final String encodedName) {
    lastAssignments.put(encodedName, serverName);
  }

  /**
   * Persists the split of parent {@code p} into daughters {@code a}/{@code b}
   * hosted on {@code sn}, then updates in-memory server holdings to match.
   *
   * @throws IOException if persisting the split to the region state store fails
   */
  void splitRegion(HRegionInfo p,
      HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {

    // Persist first; the in-memory update below happens only after the
    // store call succeeds.
    regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
    synchronized (this) {
      // After PONR, split is considered to be done.
      // Update server holdings to be aligned with the meta.
      Set<HRegionInfo> regions = serverHoldings.get(sn);
      if (regions == null) {
        throw new IllegalStateException(sn + " should host some regions");
      }
      regions.remove(p);
      regions.add(a);
      regions.add(b);
    }
  }

  /**
   * Persists the merge of {@code a} and {@code b} into {@code p} hosted on
   * {@code sn}, then updates in-memory server holdings to match.
   *
   * @throws IOException if persisting the merge to the region state store fails
   */
  void mergeRegions(HRegionInfo p,
      HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
    regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
    synchronized (this) {
      // After PONR, merge is considered to be done.
      // Update server holdings to be aligned with the meta.
      Set<HRegionInfo> regions = serverHoldings.get(sn);
      if (regions == null) {
        throw new IllegalStateException(sn + " should host some regions");
      }
      regions.remove(a);
      regions.remove(b);
      regions.add(p);
    }
  }

  /**
   * Looks up the configured region replication for {@code r}'s table;
   * defaults to 1 when no descriptor is available.
   */
  private int getRegionReplication(HRegionInfo r) throws IOException {
    // NOTE(review): guards on tableStateManager but fetches descriptors via the
    // server handle -- presumably both are non-null once the master is
    // initialized; confirm the intended invariant.
    if (tableStateManager != null) {
      HTableDescriptor htd = ((MasterServices)server).getTableDescriptors().get(r.getTable());
      if (htd != null) {
        return htd.getRegionReplication();
      }
    }
    return 1;
  }

  /**
   * At cluster clean re/start, mark all user regions closed except those of tables
   * that are excluded, such as disabled/disabling/enabling tables. All user regions
   * and their previous locations are returned.
   *
   * @param excludedTables tables to leave untouched; may be null or empty
   * @return map of each closed region to the server it was last assigned to
   */
  synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
    boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
    Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
    for(RegionState state: regionStates.values()) {
      HRegionInfo hri = state.getRegion();
      // Split parents are no longer assignable; skip them.
      if (state.isSplit() || hri.isSplit()) {
        continue;
      }
      TableName tableName = hri.getTable();
      // Meta is never a user region; excluded tables are left as-is.
      if (!TableName.META_TABLE_NAME.equals(tableName)
          && (noExcludeTables || !excludedTables.contains(tableName))) {
        toBeClosed.add(hri);
      }
    }
    Map<HRegionInfo, ServerName> allUserRegions =
        new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
    for (HRegionInfo hri: toBeClosed) {
      RegionState regionState = updateRegionState(hri, State.CLOSED);
      allUserRegions.put(hri, regionState.getServerName());
    }
    return allUserRegions;
  }

  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
   * regions being served, ignoring stats about number of requests.
   * @return the average load
   */
  protected synchronized double getAverageLoad() {
    int numServers = 0, totalLoad = 0;
    // Only servers currently online contribute to the average.
    for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
      Set<HRegionInfo> regions = e.getValue();
      ServerName serverName = e.getKey();
      int regionCount = regions.size();
      if (serverManager.isServerOnline(serverName)) {
        totalLoad += regionCount;
        numServers++;
      }
    }
    if (numServers > 1) {
      // The master region server holds only a couple regions.
      // Don't consider this server in calculating the average load
      // if there are other region servers to avoid possible confusion.
      // NOTE(review): assumes the master's holdings were counted in the loop
      // above (i.e. master is online) -- confirm.
      Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
      if (hris != null) {
        totalLoad -= hris.size();
        numServers--;
      }
    }
    return numServers == 0 ? 0.0 :
      (double)totalLoad / (double)numServers;
  }

  /**
   * This is an EXPENSIVE clone. Cloning though is the safest thing to do.
   * Can't let out original since it can change and at least the load balancer
   * wants to iterate this exported list. We need to synchronize on regions
   * since all access to this.servers is under a lock on this.regions.
   *
   * @return A clone of current assignments by table.
   */
  protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
      getAssignmentsByTable() {
    Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
        new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
    // Snapshot the holdings under the lock; the augmentation further below
    // runs outside it, so the online-server set may change in between.
    synchronized (this) {
      if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
        // By-table balancing disabled: lump all regions under one
        // pseudo-table named "ensemble".
        Map<ServerName, List<HRegionInfo>> svrToRegions =
            new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
        for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
          svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
        }
        result.put(TableName.valueOf("ensemble"), svrToRegions);
      } else {
        // One inner server->regions map per table; meta regions are skipped.
        for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
          for (HRegionInfo hri: e.getValue()) {
            if (hri.isMetaRegion()) continue;
            TableName tablename = hri.getTable();
            Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
            if (svrToRegions == null) {
              svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
              result.put(tablename, svrToRegions);
            }
            List<HRegionInfo> regions = svrToRegions.get(e.getKey());
            if (regions == null) {
              regions = new ArrayList<HRegionInfo>();
              svrToRegions.put(e.getKey(), regions);
            }
            regions.add(hri);
          }
        }
      }
    }

    Map<ServerName, ServerLoad>
      onlineSvrs = serverManager.getOnlineServers();
    // Take care of servers w/o assignments, and remove servers in draining mode
    List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
    for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
      for (ServerName svr: onlineSvrs.keySet()) {
        if (!map.containsKey(svr)) {
          map.put(svr, new ArrayList<HRegionInfo>());
        }
      }
      map.keySet().removeAll(drainingServers);
    }
    return result;
  }
getRegionState(final HRegionInfo hri)1035 protected RegionState getRegionState(final HRegionInfo hri) { 1036 return getRegionState(hri.getEncodedName()); 1037 } 1038 1039 /** 1040 * Returns a clone of region assignments per server 1041 * @return a Map of ServerName to a List of HRegionInfo's 1042 */ getRegionAssignmentsByServer()1043 protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() { 1044 Map<ServerName, List<HRegionInfo>> regionsByServer = 1045 new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size()); 1046 for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) { 1047 regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue())); 1048 } 1049 return regionsByServer; 1050 } 1051 getRegionState(final String encodedName)1052 protected synchronized RegionState getRegionState(final String encodedName) { 1053 return regionStates.get(encodedName); 1054 } 1055 1056 /** 1057 * Get the HRegionInfo from cache, if not there, from the hbase:meta table 1058 * @param regionName 1059 * @return HRegionInfo for the region 1060 */ 1061 @SuppressWarnings("deprecation") getRegionInfo(final byte [] regionName)1062 protected HRegionInfo getRegionInfo(final byte [] regionName) { 1063 String encodedName = HRegionInfo.encodeRegionName(regionName); 1064 RegionState regionState = getRegionState(encodedName); 1065 if (regionState != null) { 1066 return regionState.getRegion(); 1067 } 1068 1069 try { 1070 Pair<HRegionInfo, ServerName> p = 1071 MetaTableAccessor.getRegion(server.getConnection(), regionName); 1072 HRegionInfo hri = p == null ? null : p.getFirst(); 1073 if (hri != null) { 1074 createRegionState(hri); 1075 } 1076 return hri; 1077 } catch (IOException e) { 1078 server.abort("Aborting because error occoured while reading " 1079 + Bytes.toStringBinary(regionName) + " from hbase:meta", e); 1080 return null; 1081 } 1082 } 1083 isOneOfStates(RegionState regionState, State... 
  /**
   * Returns true if the given state (null-safe) equals any of the
   * supplied states.
   */
  static boolean isOneOfStates(RegionState regionState, State... states) {
    State s = regionState != null ? regionState.getState() : null;
    for (State state: states) {
      if (s == state) return true;
    }
    return false;
  }

  /**
   * Update a region state. It will be put in transition if not already there.
   *
   * @param hri region being transitioned
   * @param state new state for the region
   * @param serverName server associated with the transition (may be null)
   * @param openSeqNum open sequence number to persist with the state
   * @return the new RegionState that was recorded
   */
  private RegionState updateRegionState(final HRegionInfo hri,
      final State state, final ServerName serverName, long openSeqNum) {
    if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
      LOG.warn("Failed to open/close " + hri.getShortNameToLog()
        + " on " + serverName + ", set to " + state);
    }

    String encodedName = hri.getEncodedName();
    RegionState regionState = new RegionState(
      hri, state, System.currentTimeMillis(), serverName);
    RegionState oldState = getRegionState(encodedName);
    if (!regionState.equals(oldState)) {
      LOG.info("Transition " + oldState + " to " + regionState);
      // Persist region state before updating in-memory info, if needed
      regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
    }

    synchronized (this) {
      regionsInTransition.put(encodedName, regionState);
      putRegionState(regionState);

      // For these states, region should be properly closed.
      // There should be no log splitting issue.
      if ((state == State.CLOSED || state == State.MERGED
          || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
        ServerName last = lastAssignments.get(encodedName);
        if (last.equals(serverName)) {
          lastAssignments.remove(encodedName);
        } else {
          LOG.warn(encodedName + " moved to " + state + " on "
            + serverName + ", expected " + last);
        }
      }

      // Once a region is opened, record its last assignment right away.
      if (serverName != null && state == State.OPEN) {
        ServerName last = lastAssignments.get(encodedName);
        if (!serverName.equals(last)) {
          lastAssignments.put(encodedName, serverName);
          if (last != null && isServerDeadAndNotProcessed(last)) {
            LOG.warn(encodedName + " moved to " + serverName
              + ", while it's previous host " + last
              + " is dead but not processed yet");
          }
        }
      }

      // notify the change; wakes waiters such as
      // waitOnRegionToClearRegionsInTransition
      this.notifyAll();
    }
    return regionState;
  }
}