1 /**
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 package org.apache.hadoop.hbase.zookeeper;
20 
21 import java.util.List;
22 
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.apache.hadoop.hbase.classification.InterfaceAudience;
26 import org.apache.hadoop.hbase.HConstants;
27 import org.apache.hadoop.hbase.HRegionInfo;
28 import org.apache.hadoop.hbase.RegionTransition;
29 import org.apache.hadoop.hbase.ServerName;
30 import org.apache.hadoop.hbase.exceptions.DeserializationException;
31 import org.apache.hadoop.hbase.executor.EventType;
32 import org.apache.zookeeper.AsyncCallback;
33 import org.apache.zookeeper.KeeperException;
34 import org.apache.zookeeper.KeeperException.Code;
35 import org.apache.zookeeper.data.Stat;
36 
37 // We should not be importing this Type here, nor a RegionTransition, etc.  This class should be
38 // about zk and bytes only.
39 
40 /**
41  * Utility class for doing region assignment in ZooKeeper.  This class extends
42  * stuff done in {@link ZKUtil} to cover specific assignment operations.
43  * <p>
44  * Contains only static methods and constants.
45  * <p>
46  * Used by both the Master and RegionServer.
47  * <p>
48  * All valid transitions outlined below:
49  * <p>
50  * <b>MASTER</b>
51  * <ol>
52  *   <li>
53  *     Master creates an unassigned node as OFFLINE.
54  *     - Cluster startup and table enabling.
55  *   </li>
56  *   <li>
57  *     Master forces an existing unassigned node to OFFLINE.
58  *     - RegionServer failure.
59  *     - Allows transitions from all states to OFFLINE.
60  *   </li>
61  *   <li>
 *     Master deletes an unassigned node that was in an OPENED state.
 *     - Normal region transitions.  Besides cluster startup, no other deletions
 *     of unassigned nodes are allowed.
65  *   </li>
66  *   <li>
67  *     Master deletes all unassigned nodes regardless of state.
68  *     - Cluster startup before any assignment happens.
69  *   </li>
70  * </ol>
71  * <p>
72  * <b>REGIONSERVER</b>
73  * <ol>
74  *   <li>
75  *     RegionServer creates an unassigned node as CLOSING.
76  *     - All region closes will do this in response to a CLOSE RPC from Master.
77  *     - A node can never be transitioned to CLOSING, only created.
78  *   </li>
79  *   <li>
80  *     RegionServer transitions an unassigned node from CLOSING to CLOSED.
81  *     - Normal region closes.  CAS operation.
82  *   </li>
83  *   <li>
84  *     RegionServer transitions an unassigned node from OFFLINE to OPENING.
85  *     - All region opens will do this in response to an OPEN RPC from the Master.
86  *     - Normal region opens.  CAS operation.
87  *   </li>
88  *   <li>
89  *     RegionServer transitions an unassigned node from OPENING to OPENED.
90  *     - Normal region opens.  CAS operation.
91  *   </li>
92  * </ol>
93  */
94 @InterfaceAudience.Private
95 public class ZKAssign {
96   private static final Log LOG = LogFactory.getLog(ZKAssign.class);
97 
98   /**
99    * Gets the full path node name for the unassigned node for the specified
100    * region.
101    * @param zkw zk reference
102    * @param regionName region name
103    * @return full path node name
104    */
getNodeName(ZooKeeperWatcher zkw, String regionName)105   public static String getNodeName(ZooKeeperWatcher zkw, String regionName) {
106     return ZKUtil.joinZNode(zkw.assignmentZNode, regionName);
107   }
108 
109   /**
110    * Gets the region name from the full path node name of an unassigned node.
111    * @param path full zk path
112    * @return region name
113    */
getRegionName(ZooKeeperWatcher zkw, String path)114   public static String getRegionName(ZooKeeperWatcher zkw, String path) {
115     return path.substring(zkw.assignmentZNode.length()+1);
116   }
117 
118   // Master methods
119 
120   /**
121    * Creates a new unassigned node in the OFFLINE state for the specified region.
122    *
123    * <p>Does not transition nodes from other states.  If a node already exists
124    * for this region, a {@link org.apache.zookeeper.KeeperException.NodeExistsException}
125    * will be thrown.
126    *
127    * <p>Sets a watcher on the unassigned region node if the method is successful.
128    *
129    * <p>This method should only be used during cluster startup and the enabling
130    * of a table.
131    *
132    * @param zkw zk reference
133    * @param region region to be created as offline
134    * @param serverName server transition will happen on
135    * @throws KeeperException if unexpected zookeeper exception
136    * @throws KeeperException.NodeExistsException if node already exists
137    */
createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName)138   public static void createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region,
139       ServerName serverName)
140   throws KeeperException, KeeperException.NodeExistsException {
141     createNodeOffline(zkw, region, serverName, EventType.M_ZK_REGION_OFFLINE);
142   }
143 
createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, final EventType event)144   public static void createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region,
145       ServerName serverName, final EventType event)
146   throws KeeperException, KeeperException.NodeExistsException {
147     LOG.debug(zkw.prefix("Creating unassigned node " +
148       region.getEncodedName() + " in OFFLINE state"));
149     RegionTransition rt =
150       RegionTransition.createRegionTransition(event, region.getRegionName(), serverName);
151     String node = getNodeName(zkw, region.getEncodedName());
152     ZKUtil.createAndWatch(zkw, node, rt.toByteArray());
153   }
154 
155   /**
156    * Creates an unassigned node in the OFFLINE state for the specified region.
157    * <p>
158    * Runs asynchronously.  Depends on no pre-existing znode.
159    *
160    * <p>Sets a watcher on the unassigned region node.
161    *
162    * @param zkw zk reference
163    * @param region region to be created as offline
164    * @param serverName server transition will happen on
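   * @param cb callback handed to the asynchronous create
   * @param ctx context object handed to the asynchronous create along with the callback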
167    * @throws KeeperException if unexpected zookeeper exception
168    * @throws KeeperException.NodeExistsException if node already exists
169    */
asyncCreateNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, final AsyncCallback.StringCallback cb, final Object ctx)170   public static void asyncCreateNodeOffline(ZooKeeperWatcher zkw,
171       HRegionInfo region, ServerName serverName,
172       final AsyncCallback.StringCallback cb, final Object ctx)
173   throws KeeperException {
174     LOG.debug(zkw.prefix("Async create of unassigned node " +
175       region.getEncodedName() + " with OFFLINE state"));
176     RegionTransition rt =
177       RegionTransition.createRegionTransition(
178           EventType.M_ZK_REGION_OFFLINE, region.getRegionName(), serverName);
179     String node = getNodeName(zkw, region.getEncodedName());
180     ZKUtil.asyncCreate(zkw, node, rt.toByteArray(), cb, ctx);
181   }
182 
183   /**
184    * Creates or force updates an unassigned node to the OFFLINE state for the
185    * specified region.
186    * <p>
   * Attempts to create the node but if it exists will force it to transition to
   * an OFFLINE state.
189    *
190    * <p>Sets a watcher on the unassigned region node if the method is
191    * successful.
192    *
193    * <p>This method should be used when assigning a region.
194    *
195    * @param zkw zk reference
196    * @param region region to be created as offline
197    * @param serverName server transition will happen on
198    * @return the version of the znode created in OFFLINE state, -1 if
199    *         unsuccessful.
200    * @throws KeeperException if unexpected zookeeper exception
201    * @throws KeeperException.NodeExistsException if node already exists
202    */
createOrForceNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName)203   public static int createOrForceNodeOffline(ZooKeeperWatcher zkw,
204       HRegionInfo region, ServerName serverName) throws KeeperException {
205     LOG.debug(zkw.prefix("Creating (or updating) unassigned node " +
206       region.getEncodedName() + " with OFFLINE state"));
207     RegionTransition rt = RegionTransition.createRegionTransition(EventType.M_ZK_REGION_OFFLINE,
208       region.getRegionName(), serverName, HConstants.EMPTY_BYTE_ARRAY);
209     byte [] data = rt.toByteArray();
210     String node = getNodeName(zkw, region.getEncodedName());
211     zkw.sync(node);
212     int version = ZKUtil.checkExists(zkw, node);
213     if (version == -1) {
214       return ZKUtil.createAndWatch(zkw, node, data);
215     } else {
216       boolean setData = false;
217       try {
218         setData = ZKUtil.setData(zkw, node, data, version);
219         // Setdata throws KeeperException which aborts the Master. So we are
220         // catching it here.
221         // If just before setting the znode to OFFLINE if the RS has made any
222         // change to the
223         // znode state then we need to return -1.
224       } catch (KeeperException kpe) {
225         LOG.info("Version mismatch while setting the node to OFFLINE state.");
226         return -1;
227       }
228       if (!setData) {
229         return -1;
230       } else {
231         // We successfully forced to OFFLINE, reset watch and handle if
232         // the state changed in between our set and the watch
233         byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
234         rt = getRegionTransition(bytes);
235         if (rt.getEventType() != EventType.M_ZK_REGION_OFFLINE) {
236           // state changed, need to process
237           return -1;
238         }
239       }
240     }
241     return version + 1;
242   }
243 
244   /**
245    * Deletes an existing unassigned node that is in the OPENED state for the
246    * specified region.
247    *
248    * <p>If a node does not already exist for this region, a
249    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
250    *
251    * <p>No watcher is set whether this succeeds or not.
252    *
253    * <p>Returns false if the node was not in the proper state but did exist.
254    *
255    * <p>This method is used during normal region transitions when a region
256    * finishes successfully opening.  This is the Master acknowledging completion
257    * of the specified regions transition.
258    *
259    * @param zkw zk reference
260    * @param encodedRegionName opened region to be deleted from zk
261    * @param sn the expected region transition target server name
262    * @throws KeeperException if unexpected zookeeper exception
263    * @throws KeeperException.NoNodeException if node does not exist
264    */
deleteOpenedNode(ZooKeeperWatcher zkw, String encodedRegionName, ServerName sn)265   public static boolean deleteOpenedNode(ZooKeeperWatcher zkw,
266       String encodedRegionName, ServerName sn)
267   throws KeeperException, KeeperException.NoNodeException {
268     return deleteNode(zkw, encodedRegionName,
269       EventType.RS_ZK_REGION_OPENED, sn);
270   }
271 
272   /**
273    * Deletes an existing unassigned node that is in the OFFLINE state for the
274    * specified region.
275    *
276    * <p>If a node does not already exist for this region, a
277    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
278    *
279    * <p>No watcher is set whether this succeeds or not.
280    *
281    * <p>Returns false if the node was not in the proper state but did exist.
282    *
283    * <p>This method is used during master failover when the regions on an RS
284    * that has died are all set to OFFLINE before being processed.
285    *
286    * @param zkw zk reference
287    * @param encodedRegionName closed region to be deleted from zk
288    * @param sn the expected region transition target server name
289    * @throws KeeperException if unexpected zookeeper exception
290    * @throws KeeperException.NoNodeException if node does not exist
291    */
deleteOfflineNode(ZooKeeperWatcher zkw, String encodedRegionName, ServerName sn)292   public static boolean deleteOfflineNode(ZooKeeperWatcher zkw,
293       String encodedRegionName, ServerName sn)
294   throws KeeperException, KeeperException.NoNodeException {
295     return deleteNode(zkw, encodedRegionName,
296       EventType.M_ZK_REGION_OFFLINE, sn);
297   }
298 
299   /**
300    * Deletes an existing unassigned node that is in the CLOSED state for the
301    * specified region.
302    *
303    * <p>If a node does not already exist for this region, a
304    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
305    *
306    * <p>No watcher is set whether this succeeds or not.
307    *
308    * <p>Returns false if the node was not in the proper state but did exist.
309    *
310    * <p>This method is used during table disables when a region finishes
311    * successfully closing.  This is the Master acknowledging completion
312    * of the specified regions transition to being closed.
313    *
314    * @param zkw zk reference
315    * @param encodedRegionName closed region to be deleted from zk
316    * @param sn the expected region transition target server name
317    * @throws KeeperException if unexpected zookeeper exception
318    * @throws KeeperException.NoNodeException if node does not exist
319    */
deleteClosedNode(ZooKeeperWatcher zkw, String encodedRegionName, ServerName sn)320   public static boolean deleteClosedNode(ZooKeeperWatcher zkw,
321       String encodedRegionName, ServerName sn)
322   throws KeeperException, KeeperException.NoNodeException {
323     return deleteNode(zkw, encodedRegionName,
324       EventType.RS_ZK_REGION_CLOSED, sn);
325   }
326 
327   /**
328    * Deletes an existing unassigned node that is in the CLOSING state for the
329    * specified region.
330    *
331    * <p>If a node does not already exist for this region, a
332    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
333    *
334    * <p>No watcher is set whether this succeeds or not.
335    *
336    * <p>Returns false if the node was not in the proper state but did exist.
337    *
338    * <p>This method is used during table disables when a region finishes
339    * successfully closing.  This is the Master acknowledging completion
340    * of the specified regions transition to being closed.
341    *
342    * @param zkw zk reference
343    * @param region closing region to be deleted from zk
344    * @param sn the expected region transition target server name
345    * @throws KeeperException if unexpected zookeeper exception
346    * @throws KeeperException.NoNodeException if node does not exist
347    */
deleteClosingNode(ZooKeeperWatcher zkw, HRegionInfo region, ServerName sn)348   public static boolean deleteClosingNode(ZooKeeperWatcher zkw,
349       HRegionInfo region, ServerName sn)
350   throws KeeperException, KeeperException.NoNodeException {
351     String encodedRegionName = region.getEncodedName();
352     return deleteNode(zkw, encodedRegionName,
353       EventType.M_ZK_REGION_CLOSING, sn);
354   }
355 
356   /**
357    * Deletes an existing unassigned node that is in the specified state for the
358    * specified region.
359    *
360    * <p>If a node does not already exist for this region, a
361    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
362    *
363    * <p>No watcher is set whether this succeeds or not.
364    *
365    * <p>Returns false if the node was not in the proper state but did exist.
366    *
367    * <p>This method is used when a region finishes opening/closing.
368    * The Master acknowledges completion
369    * of the specified regions transition to being closed/opened.
370    *
371    * @param zkw zk reference
372    * @param encodedRegionName region to be deleted from zk
373    * @param expectedState state region must be in for delete to complete
374    * @param sn the expected region transition target server name
375    * @throws KeeperException if unexpected zookeeper exception
376    * @throws KeeperException.NoNodeException if node does not exist
377    */
deleteNode(ZooKeeperWatcher zkw, String encodedRegionName, EventType expectedState, ServerName sn)378   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
379       EventType expectedState, ServerName sn)
380   throws KeeperException, KeeperException.NoNodeException {
381     return deleteNode(zkw, encodedRegionName, expectedState, sn, -1);
382   }
383 
384   /**
385    * Deletes an existing unassigned node that is in the specified state for the
386    * specified region.
387    *
388    * <p>If a node does not already exist for this region, a
389    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
390    *
391    * <p>No watcher is set whether this succeeds or not.
392    *
393    * <p>Returns false if the node was not in the proper state but did exist.
394    *
395    * <p>This method is used when a region finishes opening/closing.
396    * The Master acknowledges completion
397    * of the specified regions transition to being closed/opened.
398    *
399    * @param zkw zk reference
400    * @param encodedRegionName region to be deleted from zk
401    * @param expectedState state region must be in for delete to complete
402    * @param expectedVersion of the znode that is to be deleted.
403    *        If expectedVersion need not be compared while deleting the znode
404    *        pass -1
405    * @throws KeeperException if unexpected zookeeper exception
406    * @throws KeeperException.NoNodeException if node does not exist
407    */
deleteNode(ZooKeeperWatcher zkw, String encodedRegionName, EventType expectedState, int expectedVersion)408   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
409       EventType expectedState, int expectedVersion)
410   throws KeeperException, KeeperException.NoNodeException {
411     return deleteNode(zkw, encodedRegionName, expectedState, null, expectedVersion);
412   }
413 
414   /**
415    * Deletes an existing unassigned node that is in the specified state for the
416    * specified region.
417    *
418    * <p>If a node does not already exist for this region, a
419    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
420    *
421    * <p>No watcher is set whether this succeeds or not.
422    *
423    * <p>Returns false if the node was not in the proper state but did exist.
424    *
425    * <p>This method is used when a region finishes opening/closing.
426    * The Master acknowledges completion
427    * of the specified regions transition to being closed/opened.
428    *
429    * @param zkw zk reference
430    * @param encodedRegionName region to be deleted from zk
431    * @param expectedState state region must be in for delete to complete
432    * @param serverName the expected region transition target server name
433    * @param expectedVersion of the znode that is to be deleted.
434    *        If expectedVersion need not be compared while deleting the znode
435    *        pass -1
436    * @throws KeeperException if unexpected zookeeper exception
437    * @throws KeeperException.NoNodeException if node does not exist
438    */
deleteNode(ZooKeeperWatcher zkw, String encodedRegionName, EventType expectedState, ServerName serverName, int expectedVersion)439   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
440       EventType expectedState, ServerName serverName, int expectedVersion)
441   throws KeeperException, KeeperException.NoNodeException {
442     if (LOG.isTraceEnabled()) {
443       LOG.trace(zkw.prefix("Deleting existing unassigned " +
444         "node " + encodedRegionName + " in expected state " + expectedState));
445     }
446     String node = getNodeName(zkw, encodedRegionName);
447     zkw.sync(node);
448     Stat stat = new Stat();
449     byte [] bytes = ZKUtil.getDataNoWatch(zkw, node, stat);
450     if (bytes == null) {
451       // If it came back null, node does not exist.
452       throw KeeperException.create(Code.NONODE);
453     }
454     RegionTransition rt = getRegionTransition(bytes);
455     EventType et = rt.getEventType();
456     if (!et.equals(expectedState)) {
457       LOG.warn(zkw.prefix("Attempting to delete unassigned node " + encodedRegionName + " in " +
458         expectedState + " state but node is in " + et + " state"));
459       return false;
460     }
461     // Verify the server transition happens on is not changed
462     if (serverName != null && !rt.getServerName().equals(serverName)) {
463       LOG.warn(zkw.prefix("Attempting to delete unassigned node " + encodedRegionName
464         + " with target " + serverName + " but node has " + rt.getServerName()));
465       return false;
466     }
467     if (expectedVersion != -1
468         && stat.getVersion() != expectedVersion) {
469       LOG.warn("The node " + encodedRegionName + " we are trying to delete is not" +
470         " the expected one. Got a version mismatch");
471       return false;
472     }
473     if(!ZKUtil.deleteNode(zkw, node, stat.getVersion())) {
474       LOG.warn(zkw.prefix("Attempting to delete " +
475           "unassigned node " + encodedRegionName + " in " + expectedState +
476           " state but after verifying state, we got a version mismatch"));
477       return false;
478     }
479     LOG.debug(zkw.prefix("Deleted unassigned node " +
480         encodedRegionName + " in expected state " + expectedState));
481     return true;
482   }
483 
484   /**
485    * Deletes all unassigned nodes regardless of their state.
486    *
487    * <p>No watchers are set.
488    *
489    * <p>This method is used by the Master during cluster startup to clear out
490    * any existing state from other cluster runs.
491    *
492    * @param zkw zk reference
493    * @throws KeeperException if unexpected zookeeper exception
494    */
deleteAllNodes(ZooKeeperWatcher zkw)495   public static void deleteAllNodes(ZooKeeperWatcher zkw)
496   throws KeeperException {
497     LOG.debug(zkw.prefix("Deleting any existing unassigned nodes"));
498     ZKUtil.deleteChildrenRecursively(zkw, zkw.assignmentZNode);
499   }
500 
501   /**
502    * Creates a new unassigned node in the CLOSING state for the specified
503    * region.
504    *
505    * <p>Does not transition nodes from any states.  If a node already exists
506    * for this region, a {@link org.apache.zookeeper.KeeperException.NodeExistsException}
507    * will be thrown.
508    *
509    * <p>If creation is successful, returns the version number of the CLOSING
510    * node created.
511    *
512    * <p>Set a watch.
513    *
514    * <p>This method should only be used by a Master when initiating a
515    * close of a region before sending a close request to the region server.
516    *
517    * @param zkw zk reference
518    * @param region region to be created as closing
519    * @param serverName server transition will happen on
520    * @return version of node after transition, -1 if unsuccessful transition
521    * @throws KeeperException if unexpected zookeeper exception
522    * @throws KeeperException.NodeExistsException if node already exists
523    */
createNodeClosing(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName)524   public static int createNodeClosing(ZooKeeperWatcher zkw, HRegionInfo region,
525       ServerName serverName)
526   throws KeeperException, KeeperException.NodeExistsException {
527     LOG.debug(zkw.prefix("Creating unassigned node " +
528       region.getEncodedName() + " in a CLOSING state"));
529     RegionTransition rt = RegionTransition.createRegionTransition(EventType.M_ZK_REGION_CLOSING,
530       region.getRegionName(), serverName, HConstants.EMPTY_BYTE_ARRAY);
531     String node = getNodeName(zkw, region.getEncodedName());
532     return ZKUtil.createAndWatch(zkw, node, rt.toByteArray());
533   }
534 
535   // RegionServer methods
536 
537   /**
538    * Transitions an existing unassigned node for the specified region which is
539    * currently in the CLOSING state to be in the CLOSED state.
540    *
541    * <p>Does not transition nodes from other states.  If for some reason the
542    * node could not be transitioned, the method returns -1.  If the transition
543    * is successful, the version of the node after transition is returned.
544    *
   * <p>This method can fail and return -1 for three different reasons:
546    * <ul><li>Unassigned node for this region does not exist</li>
547    * <li>Unassigned node for this region is not in CLOSING state</li>
548    * <li>After verifying CLOSING state, update fails because of wrong version
549    * (someone else already transitioned the node)</li>
550    * </ul>
551    *
552    * <p>Does not set any watches.
553    *
554    * <p>This method should only be used by a RegionServer when initiating a
555    * close of a region after receiving a CLOSE RPC from the Master.
556    *
557    * @param zkw zk reference
558    * @param region region to be transitioned to closed
559    * @param serverName server transition happens on
560    * @return version of node after transition, -1 if unsuccessful transition
561    * @throws KeeperException if unexpected zookeeper exception
562    */
transitionNodeClosed(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, int expectedVersion)563   public static int transitionNodeClosed(ZooKeeperWatcher zkw,
564       HRegionInfo region, ServerName serverName, int expectedVersion)
565   throws KeeperException {
566     return transitionNode(zkw, region, serverName,
567         EventType.M_ZK_REGION_CLOSING,
568         EventType.RS_ZK_REGION_CLOSED, expectedVersion);
569   }
570 
571   /**
572    * Transitions an existing unassigned node for the specified region which is
573    * currently in the OFFLINE state to be in the OPENING state.
574    *
575    * <p>Does not transition nodes from other states.  If for some reason the
576    * node could not be transitioned, the method returns -1.  If the transition
577    * is successful, the version of the node written as OPENING is returned.
578    *
579    * <p>This method can fail and return -1 for three different reasons:
580    * <ul><li>Unassigned node for this region does not exist</li>
581    * <li>Unassigned node for this region is not in OFFLINE state</li>
582    * <li>After verifying OFFLINE state, update fails because of wrong version
583    * (someone else already transitioned the node)</li>
584    * </ul>
585    *
586    * <p>Does not set any watches.
587    *
588    * <p>This method should only be used by a RegionServer when initiating an
589    * open of a region after receiving an OPEN RPC from the Master.
590    *
591    * @param zkw zk reference
592    * @param region region to be transitioned to opening
593    * @param serverName server transition happens on
594    * @return version of node after transition, -1 if unsuccessful transition
595    * @throws KeeperException if unexpected zookeeper exception
596    */
transitionNodeOpening(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName)597   public static int transitionNodeOpening(ZooKeeperWatcher zkw,
598       HRegionInfo region, ServerName serverName)
599   throws KeeperException {
600     return transitionNodeOpening(zkw, region, serverName,
601       EventType.M_ZK_REGION_OFFLINE);
602   }
603 
transitionNodeOpening(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, final EventType beginState)604   public static int transitionNodeOpening(ZooKeeperWatcher zkw,
605       HRegionInfo region, ServerName serverName, final EventType beginState)
606   throws KeeperException {
607     return transitionNode(zkw, region, serverName, beginState,
608       EventType.RS_ZK_REGION_OPENING, -1);
609   }
610 
611   /**
612    * Confirm an existing unassigned node for the specified region which is
613    * currently in the OPENING state to be still in the OPENING state on
614    * the specified server.
615    *
616    * <p>If for some reason the check fails, the method returns -1. Otherwise,
617    * the version of the node (same as the expected version) is returned.
618    *
619    * <p>This method can fail and return -1 for three different reasons:
620    * <ul><li>Unassigned node for this region does not exist</li>
621    * <li>Unassigned node for this region is not in OPENING state</li>
   * <li>After verifying OPENING state, the server name or the version of the
   * node doesn't match</li>
624    * </ul>
625    *
626    * <p>Does not set any watches.
627    *
628    * <p>This method should only be used by a RegionServer when initiating an
629    * open of a region after receiving an OPEN RPC from the Master.
630    *
631    * @param zkw zk reference
632    * @param region region to be transitioned to opening
633    * @param serverName server transition happens on
634    * @return version of node after transition, -1 if unsuccessful transition
635    * @throws KeeperException if unexpected zookeeper exception
636    */
confirmNodeOpening(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, int expectedVersion)637   public static int confirmNodeOpening(ZooKeeperWatcher zkw,
638       HRegionInfo region, ServerName serverName, int expectedVersion)
639   throws KeeperException {
640 
641     String encoded = region.getEncodedName();
642     if(LOG.isDebugEnabled()) {
643       LOG.debug(zkw.prefix("Attempting to retransition opening state of node " +
644           HRegionInfo.prettyPrint(encoded)));
645     }
646 
647     String node = getNodeName(zkw, encoded);
648     zkw.sync(node);
649 
650     // Read existing data of the node
651     Stat stat = new Stat();
652     byte [] existingBytes = ZKUtil.getDataNoWatch(zkw, node, stat);
653     if (existingBytes == null) {
654       // Node no longer exists.  Return -1. It means unsuccessful transition.
655       return -1;
656     }
657     RegionTransition rt = getRegionTransition(existingBytes);
658 
659     // Verify it is the expected version
660     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
661       LOG.warn(zkw.prefix("Attempt to retransition the opening state of the " +
662           "unassigned node for " + encoded + " failed, " +
663           "the node existed but was version " + stat.getVersion() +
664           " not the expected version " + expectedVersion));
665       return -1;
666     }
667 
668     // Verify it is in expected state
669     EventType et = rt.getEventType();
670     if (!et.equals(EventType.RS_ZK_REGION_OPENING)) {
671       String existingServer = (rt.getServerName() == null)
672           ? "<unknown>" : rt.getServerName().toString();
673       LOG.warn(zkw.prefix("Attempt to retransition the opening state of the unassigned node for "
674           + encoded + " failed, the node existed but was in the state " + et +
675           " set by the server " + existingServer));
676       return -1;
677     }
678 
679     return expectedVersion;
680   }
681 
682   /**
683    * Transitions an existing unassigned node for the specified region which is
684    * currently in the OPENING state to be in the OPENED state.
685    *
686    * <p>Does not transition nodes from other states.  If for some reason the
687    * node could not be transitioned, the method returns -1.  If the transition
688    * is successful, the version of the node after transition is returned.
689    *
   * <p>This method can fail and return -1 for three different reasons:
691    * <ul><li>Unassigned node for this region does not exist</li>
692    * <li>Unassigned node for this region is not in OPENING state</li>
693    * <li>After verifying OPENING state, update fails because of wrong version
694    * (this should never actually happen since an RS only does this transition
695    * following a transition to OPENING.  if two RS are conflicting, one would
696    * fail the original transition to OPENING and not this transition)</li>
697    * </ul>
698    *
699    * <p>Does not set any watches.
700    *
701    * <p>This method should only be used by a RegionServer when completing the
702    * open of a region.
703    *
704    * @param zkw zk reference
705    * @param region region to be transitioned to opened
   * @param serverName server transition happens on
   * @param expectedVersion expected version of the znode before the transition, or -1
707    * @return version of node after transition, -1 if unsuccessful transition
708    * @throws KeeperException if unexpected zookeeper exception
709    */
transitionNodeOpened(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, int expectedVersion)710   public static int transitionNodeOpened(ZooKeeperWatcher zkw,
711       HRegionInfo region, ServerName serverName, int expectedVersion)
712   throws KeeperException {
713     return transitionNode(zkw, region, serverName,
714         EventType.RS_ZK_REGION_OPENING,
715         EventType.RS_ZK_REGION_OPENED, expectedVersion);
716   }
717 
718   /**
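   * Checks, without modifying it, that the unassigned node for the specified
   * region exists, is at the expected version (unless the expected version is
   * -1) and is in the M_ZK_REGION_CLOSING state.
   *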
720    * @param zkw zk reference
721    * @param region region to be closed
722    * @param expectedVersion expected version of the znode
723    * @return true if the znode exists, has the right version and the right state. False otherwise.
724    * @throws KeeperException
725    */
checkClosingState(ZooKeeperWatcher zkw, HRegionInfo region, int expectedVersion)726   public static boolean checkClosingState(ZooKeeperWatcher zkw, HRegionInfo region,
727                                           int expectedVersion) throws KeeperException {
728 
729     final String encoded = getNodeName(zkw, region.getEncodedName());
730     zkw.sync(encoded);
731 
732     // Read existing data of the node
733     Stat stat = new Stat();
734     byte[] existingBytes = ZKUtil.getDataNoWatch(zkw, encoded, stat);
735 
736     if (existingBytes == null) {
737       LOG.warn(zkw.prefix("Attempt to check the " +
738           "closing node for " + encoded +
739           ". The node does not exist"));
740       return false;
741     }
742 
743     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
744       LOG.warn(zkw.prefix("Attempt to check the " +
745           "closing node for " + encoded +
746           ". The node existed but was version " + stat.getVersion() +
747           " not the expected version " + expectedVersion));
748       return false;
749     }
750 
751     RegionTransition rt = getRegionTransition(existingBytes);
752 
753     if (!EventType.M_ZK_REGION_CLOSING.equals(rt.getEventType())) {
754       LOG.warn(zkw.prefix("Attempt to check the " +
755           "closing node for " + encoded +
756           ". The node existed but was in an unexpected state: " + rt.getEventType()));
757       return false;
758     }
759 
760     return true;
761   }
762 
763   /**
764    * Method that actually performs unassigned node transitions.
765    *
766    * <p>Attempts to transition the unassigned node for the specified region
767    * from the expected state to the state in the specified transition data.
768    *
769    * <p>Method first reads existing data and verifies it is in the expected
770    * state.  If the node does not exist or the node is not in the expected
771    * state, the method returns -1.  If the transition is successful, the
772    * version number of the node following the transition is returned.
773    *
774    * <p>If the read state is what is expected, it attempts to write the new
775    * state and data into the node.  When doing this, it includes the expected
776    * version (determined when the existing state was verified) to ensure that
777    * only one transition is successful.  If there is a version mismatch, the
778    * method returns -1.
779    *
   * <p>If the write is successful, no watch is set and the method returns the
   * new version of the node.
781    *
782    * @param zkw zk reference
   * @param region region whose unassigned node is to be transitioned
784    * @param serverName server transition happens on
785    * @param endState state to transition node to if all checks pass
786    * @param beginState state the node must currently be in to do transition
787    * @param expectedVersion expected version of data before modification, or -1
788    * @return version of node after transition, -1 if unsuccessful transition
789    * @throws KeeperException if unexpected zookeeper exception
790    */
transitionNode(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, EventType beginState, EventType endState, int expectedVersion)791   public static int transitionNode(ZooKeeperWatcher zkw, HRegionInfo region,
792       ServerName serverName, EventType beginState, EventType endState,
793       int expectedVersion)
794   throws KeeperException {
795     return transitionNode(zkw, region, serverName, beginState, endState, expectedVersion, null);
796   }
797 
798 
transitionNode(ZooKeeperWatcher zkw, HRegionInfo region, ServerName serverName, EventType beginState, EventType endState, int expectedVersion, final byte [] payload)799   public static int transitionNode(ZooKeeperWatcher zkw, HRegionInfo region,
800       ServerName serverName, EventType beginState, EventType endState,
801       int expectedVersion, final byte [] payload)
802   throws KeeperException {
803     String encoded = region.getEncodedName();
804     if(LOG.isDebugEnabled()) {
805       LOG.debug(zkw.prefix("Transitioning " + HRegionInfo.prettyPrint(encoded) +
806         " from " + beginState.toString() + " to " + endState.toString()));
807     }
808 
809     String node = getNodeName(zkw, encoded);
810     zkw.sync(node);
811 
812     // Read existing data of the node
813     Stat stat = new Stat();
814     byte [] existingBytes = ZKUtil.getDataNoWatch(zkw, node, stat);
815     if (existingBytes == null) {
816       // Node no longer exists.  Return -1. It means unsuccessful transition.
817       return -1;
818     }
819 
820     // Verify it is the expected version
821     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
822       LOG.warn(zkw.prefix("Attempt to transition the " +
823         "unassigned node for " + encoded +
824         " from " + beginState + " to " + endState + " failed, " +
825         "the node existed but was version " + stat.getVersion() +
826         " not the expected version " + expectedVersion));
827         return -1;
828     }
829 
830     if (beginState.equals(EventType.M_ZK_REGION_OFFLINE)
831         && endState.equals(EventType.RS_ZK_REGION_OPENING)
832         && expectedVersion == -1 && stat.getVersion() != 0) {
833       // the below check ensures that double assignment doesnot happen.
834       // When the node is created for the first time then the expected version
835       // that is passed will be -1 and the version in znode will be 0.
836       // In all other cases the version in znode will be > 0.
837       LOG.warn(zkw.prefix("Attempt to transition the " + "unassigned node for "
838           + encoded + " from " + beginState + " to " + endState + " failed, "
839           + "the node existed but was version " + stat.getVersion()
840           + " not the expected version " + expectedVersion));
841       return -1;
842     }
843 
844     RegionTransition rt = getRegionTransition(existingBytes);
845 
846     // Verify the server transition happens on is not changed
847     if (!rt.getServerName().equals(serverName)) {
848       LOG.warn(zkw.prefix("Attempt to transition the " +
849         "unassigned node for " + encoded +
850         " from " + beginState + " to " + endState + " failed, " +
851         "the server that tried to transition was " + serverName +
852         " not the expected " + rt.getServerName()));
853       return -1;
854     }
855 
856     // Verify it is in expected state
857     EventType et = rt.getEventType();
858     if (!et.equals(beginState)) {
859       String existingServer = (rt.getServerName() == null)
860         ? "<unknown>" : rt.getServerName().toString();
861       LOG.warn(zkw.prefix("Attempt to transition the unassigned node for " + encoded
862         + " from " + beginState + " to " + endState + " failed, the node existed but"
863         + " was in the state " + et + " set by the server " + existingServer));
864       return -1;
865     }
866 
867     // Write new data, ensuring data has not changed since we last read it
868     try {
869       rt = RegionTransition.createRegionTransition(
870           endState, region.getRegionName(), serverName, payload);
871       if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) {
872         LOG.warn(zkw.prefix("Attempt to transition the " +
873         "unassigned node for " + encoded +
874         " from " + beginState + " to " + endState + " failed, " +
875         "the node existed and was in the expected state but then when " +
876         "setting data we got a version mismatch"));
877         return -1;
878       }
879       if(LOG.isDebugEnabled()) {
880         LOG.debug(zkw.prefix("Transitioned node " + encoded +
881           " from " + beginState + " to " + endState));
882       }
883       return stat.getVersion() + 1;
884     } catch (KeeperException.NoNodeException nne) {
885       LOG.warn(zkw.prefix("Attempt to transition the " +
886         "unassigned node for " + encoded +
887         " from " + beginState + " to " + endState + " failed, " +
888         "the node existed and was in the expected state but then when " +
889         "setting data it no longer existed"));
890       return -1;
891     }
892   }
893 
getRegionTransition(final byte [] bytes)894   private static RegionTransition getRegionTransition(final byte [] bytes) throws KeeperException {
895     try {
896       return RegionTransition.parseFrom(bytes);
897     } catch (DeserializationException e) {
898       // Convert to a zk exception for now.  Otherwise have to change API
899       throw ZKUtil.convert(e);
900     }
901   }
902 
903   /**
904    * Gets the current data in the unassigned node for the specified region name
905    * or fully-qualified path.
906    *
907    * <p>Returns null if the region does not currently have a node.
908    *
909    * <p>Sets a watch on the node if the node exists.
910    *
911    * @param zkw zk reference
912    * @param pathOrRegionName fully-specified path or region name
913    * @return znode content
914    * @throws KeeperException if unexpected zookeeper exception
915    */
getData(ZooKeeperWatcher zkw, String pathOrRegionName)916   public static byte [] getData(ZooKeeperWatcher zkw,
917       String pathOrRegionName)
918   throws KeeperException {
919     String node = getPath(zkw, pathOrRegionName);
920     return ZKUtil.getDataAndWatch(zkw, node);
921   }
922 
923   /**
924    * Gets the current data in the unassigned node for the specified region name
925    * or fully-qualified path.
926    *
927    * <p>Returns null if the region does not currently have a node.
928    *
929    * <p>Sets a watch on the node if the node exists.
930    *
931    * @param zkw zk reference
932    * @param pathOrRegionName fully-specified path or region name
933    * @param stat object to populate the version.
934    * @return znode content
935    * @throws KeeperException if unexpected zookeeper exception
936    */
getDataAndWatch(ZooKeeperWatcher zkw, String pathOrRegionName, Stat stat)937   public static byte [] getDataAndWatch(ZooKeeperWatcher zkw,
938       String pathOrRegionName, Stat stat)
939   throws KeeperException {
940     String node = getPath(zkw, pathOrRegionName);
941     return ZKUtil.getDataAndWatch(zkw, node, stat);
942   }
943 
944   /**
945    * Gets the current data in the unassigned node for the specified region name
946    * or fully-qualified path.
947    *
948    * <p>Returns null if the region does not currently have a node.
949    *
950    * <p>Does not set a watch.
951    *
952    * @param zkw zk reference
953    * @param pathOrRegionName fully-specified path or region name
954    * @param stat object to store node info into on getData call
955    * @return znode content
956    * @throws KeeperException if unexpected zookeeper exception
957    */
getDataNoWatch(ZooKeeperWatcher zkw, String pathOrRegionName, Stat stat)958   public static byte [] getDataNoWatch(ZooKeeperWatcher zkw,
959       String pathOrRegionName, Stat stat)
960   throws KeeperException {
961     String node = getPath(zkw, pathOrRegionName);
962     return ZKUtil.getDataNoWatch(zkw, node, stat);
963   }
964 
965   /**
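   * @param zkw zk reference
   * @param pathOrRegionName fully-specified path or region name
   * @return Path to znode: the argument itself if it starts with "/", otherwise
   *   the unassigned node name for that region name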
969    */
getPath(final ZooKeeperWatcher zkw, final String pathOrRegionName)970   public static String getPath(final ZooKeeperWatcher zkw, final String pathOrRegionName) {
971     return pathOrRegionName.startsWith("/")? pathOrRegionName : getNodeName(zkw, pathOrRegionName);
972   }
973 
974   /**
975    * Get the version of the specified znode
976    * @param zkw zk reference
977    * @param region region's info
978    * @return the version of the znode, -1 if it doesn't exist
979    * @throws KeeperException
980    */
getVersion(ZooKeeperWatcher zkw, HRegionInfo region)981   public static int getVersion(ZooKeeperWatcher zkw, HRegionInfo region)
982     throws KeeperException {
983     String znode = getNodeName(zkw, region.getEncodedName());
984     return ZKUtil.checkExists(zkw, znode);
985   }
986 
987   /**
988    * Delete the assignment node regardless of its current state.
989    * <p>
990    * Fail silent even if the node does not exist at all.
   * @param watcher zk reference
   * @param regionInfo region whose assignment node is to be deleted
   * @throws KeeperException if unexpected zookeeper exception
994    */
deleteNodeFailSilent(ZooKeeperWatcher watcher, HRegionInfo regionInfo)995   public static void deleteNodeFailSilent(ZooKeeperWatcher watcher,
996       HRegionInfo regionInfo)
997   throws KeeperException {
998     String node = getNodeName(watcher, regionInfo.getEncodedName());
999     ZKUtil.deleteNodeFailSilent(watcher, node);
1000   }
1001 
1002   /**
   * Blocks until there are no nodes in regions in transition.
1004    * <p>
1005    * Used in testing only.
1006    * @param zkw zk reference
1007    * @throws KeeperException
1008    * @throws InterruptedException
1009    */
blockUntilNoRIT(ZooKeeperWatcher zkw)1010   public static void blockUntilNoRIT(ZooKeeperWatcher zkw)
1011   throws KeeperException, InterruptedException {
1012     while (ZKUtil.nodeHasChildren(zkw, zkw.assignmentZNode)) {
1013       List<String> znodes =
1014         ZKUtil.listChildrenAndWatchForNewChildren(zkw, zkw.assignmentZNode);
1015       if (znodes != null && !znodes.isEmpty()) {
1016         LOG.debug("Waiting on RIT: " + znodes);
1017       }
1018       Thread.sleep(100);
1019     }
1020   }
1021 
1022   /**
1023    * Blocks until there is at least one node in regions in transition.
1024    * <p>
1025    * Used in testing only.
1026    * @param zkw zk reference
1027    * @throws KeeperException
1028    * @throws InterruptedException
1029    */
blockUntilRIT(ZooKeeperWatcher zkw)1030   public static void blockUntilRIT(ZooKeeperWatcher zkw)
1031   throws KeeperException, InterruptedException {
1032     while (!ZKUtil.nodeHasChildren(zkw, zkw.assignmentZNode)) {
1033       List<String> znodes =
1034         ZKUtil.listChildrenAndWatchForNewChildren(zkw, zkw.assignmentZNode);
1035       if (znodes == null || znodes.isEmpty()) {
1036         LOG.debug("No RIT in ZK");
1037       }
1038       Thread.sleep(100);
1039     }
1040   }
1041 
1042   /**
1043    * Presume bytes are serialized unassigned data structure
   * @param znodeBytes serialized unassigned data read from a znode
   * @return String of the deserialized znode bytes, or an empty string if the
   *   bytes cannot be deserialized.
1046    */
toString(final byte[] znodeBytes)1047   static String toString(final byte[] znodeBytes) {
1048     // This method should not exist.  Used by ZKUtil stringifying RegionTransition.  Have the
1049     // method in here so RegionTransition does not leak into ZKUtil.
1050     try {
1051       RegionTransition rt = RegionTransition.parseFrom(znodeBytes);
1052       return rt.toString();
1053     } catch (DeserializationException e) {
1054       return "";
1055     }
1056   }
1057 }
1058