1 /**
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 
20 package org.apache.hadoop.hbase.regionserver;
21 
22 import java.io.FileNotFoundException;
23 import java.io.IOException;
24 import java.io.InterruptedIOException;
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.UUID;
30 
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.apache.hadoop.hbase.classification.InterfaceAudience;
34 import org.apache.hadoop.conf.Configuration;
35 import org.apache.hadoop.fs.FSDataInputStream;
36 import org.apache.hadoop.fs.FSDataOutputStream;
37 import org.apache.hadoop.fs.FileStatus;
38 import org.apache.hadoop.fs.FileSystem;
39 import org.apache.hadoop.fs.FileUtil;
40 import org.apache.hadoop.fs.Path;
41 import org.apache.hadoop.fs.permission.FsPermission;
42 import org.apache.hadoop.hbase.HColumnDescriptor;
43 import org.apache.hadoop.hbase.HConstants;
44 import org.apache.hadoop.hbase.HRegionInfo;
45 import org.apache.hadoop.hbase.HTableDescriptor;
46 import org.apache.hadoop.hbase.KeyValue;
47 import org.apache.hadoop.hbase.KeyValueUtil;
48 import org.apache.hadoop.hbase.backup.HFileArchiver;
49 import org.apache.hadoop.hbase.fs.HFileSystem;
50 import org.apache.hadoop.hbase.io.Reference;
51 import org.apache.hadoop.hbase.util.Bytes;
52 import org.apache.hadoop.hbase.util.FSHDFSUtils;
53 import org.apache.hadoop.hbase.util.FSUtils;
54 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
55 
56 /**
57  * View to an on-disk Region.
58  * Provides the set of methods necessary to interact with the on-disk region data.
59  */
60 @InterfaceAudience.Private
61 public class HRegionFileSystem {
62   private static final Log LOG = LogFactory.getLog(HRegionFileSystem.class);
63 
64   /** Name of the region info file that resides just under the region directory. */
65   public final static String REGION_INFO_FILE = ".regioninfo";
66 
67   /** Temporary subdirectory of the region directory used for merges. */
68   public static final String REGION_MERGES_DIR = ".merges";
69 
70   /** Temporary subdirectory of the region directory used for splits. */
71   public static final String REGION_SPLITS_DIR = ".splits";
72 
73   /** Temporary subdirectory of the region directory used for compaction output. */
74   private static final String REGION_TEMP_DIR = ".tmp";
75 
76   private final HRegionInfo regionInfo;
77   //regionInfo for interacting with FS (getting encodedName, etc)
78   private final HRegionInfo regionInfoForFs;
79   private final Configuration conf;
80   private final Path tableDir;
81   private final FileSystem fs;
82 
  /**
   * In order to handle NN connectivity hiccups, one needs to retry non-idempotent operations at
   * the client level.
   */
87   private final int hdfsClientRetriesNumber;
88   private final int baseSleepBeforeRetries;
89   private static final int DEFAULT_HDFS_CLIENT_RETRIES_NUMBER = 10;
90   private static final int DEFAULT_BASE_SLEEP_BEFORE_RETRIES = 1000;
91 
92   /**
93    * Create a view to the on-disk region
94    * @param conf the {@link Configuration} to use
95    * @param fs {@link FileSystem} that contains the region
96    * @param tableDir {@link Path} to where the table is being stored
97    * @param regionInfo {@link HRegionInfo} for region
98    */
HRegionFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo)99   HRegionFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir,
100       final HRegionInfo regionInfo) {
101     this.fs = fs;
102     this.conf = conf;
103     this.tableDir = tableDir;
104     this.regionInfo = regionInfo;
105     this.regionInfoForFs = ServerRegionReplicaUtil.getRegionInfoForFs(regionInfo);
106     this.hdfsClientRetriesNumber = conf.getInt("hdfs.client.retries.number",
107       DEFAULT_HDFS_CLIENT_RETRIES_NUMBER);
108     this.baseSleepBeforeRetries = conf.getInt("hdfs.client.sleep.before.retries",
109       DEFAULT_BASE_SLEEP_BEFORE_RETRIES);
110  }
111 
112   /** @return the underlying {@link FileSystem} */
getFileSystem()113   public FileSystem getFileSystem() {
114     return this.fs;
115   }
116 
117   /** @return the {@link HRegionInfo} that describe this on-disk region view */
getRegionInfo()118   public HRegionInfo getRegionInfo() {
119     return this.regionInfo;
120   }
121 
getRegionInfoForFS()122   public HRegionInfo getRegionInfoForFS() {
123     return this.regionInfoForFs;
124   }
125 
126   /** @return {@link Path} to the region's root directory. */
getTableDir()127   public Path getTableDir() {
128     return this.tableDir;
129   }
130 
131   /** @return {@link Path} to the region directory. */
getRegionDir()132   public Path getRegionDir() {
133     return new Path(this.tableDir, this.regionInfoForFs.getEncodedName());
134   }
135 
136   // ===========================================================================
137   //  Temp Helpers
138   // ===========================================================================
139   /** @return {@link Path} to the region's temp directory, used for file creations */
getTempDir()140   Path getTempDir() {
141     return new Path(getRegionDir(), REGION_TEMP_DIR);
142   }
143 
144   /**
145    * Clean up any temp detritus that may have been left around from previous operation attempts.
146    */
cleanupTempDir()147   void cleanupTempDir() throws IOException {
148     deleteDir(getTempDir());
149   }
150 
151   // ===========================================================================
152   //  Store/StoreFile Helpers
153   // ===========================================================================
154   /**
155    * Returns the directory path of the specified family
156    * @param familyName Column Family Name
157    * @return {@link Path} to the directory of the specified family
158    */
getStoreDir(final String familyName)159   public Path getStoreDir(final String familyName) {
160     return new Path(this.getRegionDir(), familyName);
161   }
162 
163   /**
164    * Create the store directory for the specified family name
165    * @param familyName Column Family Name
166    * @return {@link Path} to the directory of the specified family
167    * @throws IOException if the directory creation fails.
168    */
createStoreDir(final String familyName)169   Path createStoreDir(final String familyName) throws IOException {
170     Path storeDir = getStoreDir(familyName);
171     if(!fs.exists(storeDir) && !createDir(storeDir))
172       throw new IOException("Failed creating "+storeDir);
173     return storeDir;
174   }
175 
176   /**
177    * Returns the store files available for the family.
178    * This methods performs the filtering based on the valid store files.
179    * @param familyName Column Family Name
180    * @return a set of {@link StoreFileInfo} for the specified family.
181    */
getStoreFiles(final byte[] familyName)182   public Collection<StoreFileInfo> getStoreFiles(final byte[] familyName) throws IOException {
183     return getStoreFiles(Bytes.toString(familyName));
184   }
185 
getStoreFiles(final String familyName)186   public Collection<StoreFileInfo> getStoreFiles(final String familyName) throws IOException {
187     return getStoreFiles(familyName, true);
188   }
189 
190   /**
191    * Returns the store files available for the family.
192    * This methods performs the filtering based on the valid store files.
193    * @param familyName Column Family Name
194    * @return a set of {@link StoreFileInfo} for the specified family.
195    */
getStoreFiles(final String familyName, final boolean validate)196   public Collection<StoreFileInfo> getStoreFiles(final String familyName, final boolean validate)
197       throws IOException {
198     Path familyDir = getStoreDir(familyName);
199     FileStatus[] files = FSUtils.listStatus(this.fs, familyDir);
200     if (files == null) {
201       LOG.debug("No StoreFiles for: " + familyDir);
202       return null;
203     }
204 
205     ArrayList<StoreFileInfo> storeFiles = new ArrayList<StoreFileInfo>(files.length);
206     for (FileStatus status: files) {
207       if (validate && !StoreFileInfo.isValid(status)) {
208         LOG.warn("Invalid StoreFile: " + status.getPath());
209         continue;
210       }
211       StoreFileInfo info = ServerRegionReplicaUtil.getStoreFileInfo(conf, fs, regionInfo,
212         regionInfoForFs, familyName, status.getPath());
213       storeFiles.add(info);
214 
215     }
216     return storeFiles;
217   }
218 
219   /**
220    * Return Qualified Path of the specified family/file
221    *
222    * @param familyName Column Family Name
223    * @param fileName File Name
224    * @return The qualified Path for the specified family/file
225    */
getStoreFilePath(final String familyName, final String fileName)226   Path getStoreFilePath(final String familyName, final String fileName) {
227     Path familyDir = getStoreDir(familyName);
228     return new Path(familyDir, fileName).makeQualified(this.fs);
229   }
230 
231   /**
232    * Return the store file information of the specified family/file.
233    *
234    * @param familyName Column Family Name
235    * @param fileName File Name
236    * @return The {@link StoreFileInfo} for the specified family/file
237    */
getStoreFileInfo(final String familyName, final String fileName)238   StoreFileInfo getStoreFileInfo(final String familyName, final String fileName)
239       throws IOException {
240     Path familyDir = getStoreDir(familyName);
241     return ServerRegionReplicaUtil.getStoreFileInfo(conf, fs, regionInfo,
242       regionInfoForFs, familyName, new Path(familyDir, fileName));
243   }
244 
245   /**
246    * Returns true if the specified family has reference files
247    * @param familyName Column Family Name
248    * @return true if family contains reference files
249    * @throws IOException
250    */
hasReferences(final String familyName)251   public boolean hasReferences(final String familyName) throws IOException {
252     FileStatus[] files = FSUtils.listStatus(fs, getStoreDir(familyName),
253         new FSUtils.ReferenceFileFilter(fs));
254     return files != null && files.length > 0;
255   }
256 
257   /**
258    * Check whether region has Reference file
259    * @param htd table desciptor of the region
260    * @return true if region has reference file
261    * @throws IOException
262    */
hasReferences(final HTableDescriptor htd)263   public boolean hasReferences(final HTableDescriptor htd) throws IOException {
264     for (HColumnDescriptor family : htd.getFamilies()) {
265       if (hasReferences(family.getNameAsString())) {
266         return true;
267       }
268     }
269     return false;
270   }
271 
272   /**
273    * @return the set of families present on disk
274    * @throws IOException
275    */
getFamilies()276   public Collection<String> getFamilies() throws IOException {
277     FileStatus[] fds = FSUtils.listStatus(fs, getRegionDir(), new FSUtils.FamilyDirFilter(fs));
278     if (fds == null) return null;
279 
280     ArrayList<String> families = new ArrayList<String>(fds.length);
281     for (FileStatus status: fds) {
282       families.add(status.getPath().getName());
283     }
284 
285     return families;
286   }
287 
288   /**
289    * Remove the region family from disk, archiving the store files.
290    * @param familyName Column Family Name
291    * @throws IOException if an error occours during the archiving
292    */
deleteFamily(final String familyName)293   public void deleteFamily(final String familyName) throws IOException {
294     // archive family store files
295     HFileArchiver.archiveFamily(fs, conf, regionInfoForFs, tableDir, Bytes.toBytes(familyName));
296 
297     // delete the family folder
298     Path familyDir = getStoreDir(familyName);
299     if(fs.exists(familyDir) && !deleteDir(familyDir))
300       throw new IOException("Could not delete family " + familyName
301           + " from FileSystem for region " + regionInfoForFs.getRegionNameAsString() + "("
302           + regionInfoForFs.getEncodedName() + ")");
303   }
304 
305   /**
306    * Generate a unique file name, used by createTempName() and commitStoreFile()
307    * @param suffix extra information to append to the generated name
308    * @return Unique file name
309    */
generateUniqueName(final String suffix)310   private static String generateUniqueName(final String suffix) {
311     String name = UUID.randomUUID().toString().replaceAll("-", "");
312     if (suffix != null) name += suffix;
313     return name;
314   }
315 
316   /**
317    * Generate a unique temporary Path. Used in conjuction with commitStoreFile()
318    * to get a safer file creation.
319    * <code>
320    * Path file = fs.createTempName();
321    * ...StoreFile.Writer(file)...
322    * fs.commitStoreFile("family", file);
323    * </code>
324    *
325    * @return Unique {@link Path} of the temporary file
326    */
createTempName()327   public Path createTempName() {
328     return createTempName(null);
329   }
330 
331   /**
332    * Generate a unique temporary Path. Used in conjuction with commitStoreFile()
333    * to get a safer file creation.
334    * <code>
335    * Path file = fs.createTempName();
336    * ...StoreFile.Writer(file)...
337    * fs.commitStoreFile("family", file);
338    * </code>
339    *
340    * @param suffix extra information to append to the generated name
341    * @return Unique {@link Path} of the temporary file
342    */
createTempName(final String suffix)343   public Path createTempName(final String suffix) {
344     return new Path(getTempDir(), generateUniqueName(suffix));
345   }
346 
347   /**
348    * Move the file from a build/temp location to the main family store directory.
349    * @param familyName Family that will gain the file
350    * @param buildPath {@link Path} to the file to commit.
351    * @return The new {@link Path} of the committed file
352    * @throws IOException
353    */
commitStoreFile(final String familyName, final Path buildPath)354   public Path commitStoreFile(final String familyName, final Path buildPath) throws IOException {
355     return commitStoreFile(familyName, buildPath, -1, false);
356   }
357 
358   /**
359    * Move the file from a build/temp location to the main family store directory.
360    * @param familyName Family that will gain the file
361    * @param buildPath {@link Path} to the file to commit.
362    * @param seqNum Sequence Number to append to the file name (less then 0 if no sequence number)
363    * @param generateNewName False if you want to keep the buildPath name
364    * @return The new {@link Path} of the committed file
365    * @throws IOException
366    */
commitStoreFile(final String familyName, final Path buildPath, final long seqNum, final boolean generateNewName)367   private Path commitStoreFile(final String familyName, final Path buildPath,
368       final long seqNum, final boolean generateNewName) throws IOException {
369     Path storeDir = getStoreDir(familyName);
370     if(!fs.exists(storeDir) && !createDir(storeDir))
371       throw new IOException("Failed creating " + storeDir);
372 
373     String name = buildPath.getName();
374     if (generateNewName) {
375       name = generateUniqueName((seqNum < 0) ? null : "_SeqId_" + seqNum + "_");
376     }
377     Path dstPath = new Path(storeDir, name);
378     if (!fs.exists(buildPath)) {
379       throw new FileNotFoundException(buildPath.toString());
380     }
381     LOG.debug("Committing store file " + buildPath + " as " + dstPath);
382     // buildPath exists, therefore not doing an exists() check.
383     if (!rename(buildPath, dstPath)) {
384       throw new IOException("Failed rename of " + buildPath + " to " + dstPath);
385     }
386     return dstPath;
387   }
388 
389 
390   /**
391    * Moves multiple store files to the relative region's family store directory.
392    * @param storeFiles list of store files divided by family
393    * @throws IOException
394    */
commitStoreFiles(final Map<byte[], List<StoreFile>> storeFiles)395   void commitStoreFiles(final Map<byte[], List<StoreFile>> storeFiles) throws IOException {
396     for (Map.Entry<byte[], List<StoreFile>> es: storeFiles.entrySet()) {
397       String familyName = Bytes.toString(es.getKey());
398       for (StoreFile sf: es.getValue()) {
399         commitStoreFile(familyName, sf.getPath());
400       }
401     }
402   }
403 
404   /**
405    * Archives the specified store file from the specified family.
406    * @param familyName Family that contains the store files
407    * @param filePath {@link Path} to the store file to remove
408    * @throws IOException if the archiving fails
409    */
removeStoreFile(final String familyName, final Path filePath)410   public void removeStoreFile(final String familyName, final Path filePath)
411       throws IOException {
412     HFileArchiver.archiveStoreFile(this.conf, this.fs, this.regionInfoForFs,
413         this.tableDir, Bytes.toBytes(familyName), filePath);
414   }
415 
416   /**
417    * Closes and archives the specified store files from the specified family.
418    * @param familyName Family that contains the store files
419    * @param storeFiles set of store files to remove
420    * @throws IOException if the archiving fails
421    */
removeStoreFiles(final String familyName, final Collection<StoreFile> storeFiles)422   public void removeStoreFiles(final String familyName, final Collection<StoreFile> storeFiles)
423       throws IOException {
424     HFileArchiver.archiveStoreFiles(this.conf, this.fs, this.regionInfoForFs,
425         this.tableDir, Bytes.toBytes(familyName), storeFiles);
426   }
427 
428   /**
429    * Bulk load: Add a specified store file to the specified family.
430    * If the source file is on the same different file-system is moved from the
431    * source location to the destination location, otherwise is copied over.
432    *
433    * @param familyName Family that will gain the file
434    * @param srcPath {@link Path} to the file to import
435    * @param seqNum Bulk Load sequence number
436    * @return The destination {@link Path} of the bulk loaded file
437    * @throws IOException
438    */
bulkLoadStoreFile(final String familyName, Path srcPath, long seqNum)439   Path bulkLoadStoreFile(final String familyName, Path srcPath, long seqNum)
440       throws IOException {
441     // Copy the file if it's on another filesystem
442     FileSystem srcFs = srcPath.getFileSystem(conf);
443     FileSystem desFs = fs instanceof HFileSystem ? ((HFileSystem)fs).getBackingFs() : fs;
444 
445     // We can't compare FileSystem instances as equals() includes UGI instance
446     // as part of the comparison and won't work when doing SecureBulkLoad
447     // TODO deal with viewFS
448     if (!FSHDFSUtils.isSameHdfs(conf, srcFs, desFs)) {
449       LOG.info("Bulk-load file " + srcPath + " is on different filesystem than " +
450           "the destination store. Copying file over to destination filesystem.");
451       Path tmpPath = createTempName();
452       FileUtil.copy(srcFs, srcPath, fs, tmpPath, false, conf);
453       LOG.info("Copied " + srcPath + " to temporary path on destination filesystem: " + tmpPath);
454       srcPath = tmpPath;
455     }
456 
457     return commitStoreFile(familyName, srcPath, seqNum, true);
458   }
459 
460   // ===========================================================================
461   //  Splits Helpers
462   // ===========================================================================
463   /** @return {@link Path} to the temp directory used during split operations */
getSplitsDir()464   Path getSplitsDir() {
465     return new Path(getRegionDir(), REGION_SPLITS_DIR);
466   }
467 
getSplitsDir(final HRegionInfo hri)468   Path getSplitsDir(final HRegionInfo hri) {
469     return new Path(getSplitsDir(), hri.getEncodedName());
470   }
471 
472   /**
473    * Clean up any split detritus that may have been left around from previous split attempts.
474    */
cleanupSplitsDir()475   void cleanupSplitsDir() throws IOException {
476     deleteDir(getSplitsDir());
477   }
478 
479   /**
480    * Clean up any split detritus that may have been left around from previous
481    * split attempts.
482    * Call this method on initial region deploy.
483    * @throws IOException
484    */
cleanupAnySplitDetritus()485   void cleanupAnySplitDetritus() throws IOException {
486     Path splitdir = this.getSplitsDir();
487     if (!fs.exists(splitdir)) return;
488     // Look at the splitdir.  It could have the encoded names of the daughter
489     // regions we tried to make.  See if the daughter regions actually got made
490     // out under the tabledir.  If here under splitdir still, then the split did
491     // not complete.  Try and do cleanup.  This code WILL NOT catch the case
492     // where we successfully created daughter a but regionserver crashed during
493     // the creation of region b.  In this case, there'll be an orphan daughter
494     // dir in the filesystem.  TOOD: Fix.
495     FileStatus[] daughters = FSUtils.listStatus(fs, splitdir, new FSUtils.DirFilter(fs));
496     if (daughters != null) {
497       for (FileStatus daughter: daughters) {
498         Path daughterDir = new Path(getTableDir(), daughter.getPath().getName());
499         if (fs.exists(daughterDir) && !deleteDir(daughterDir)) {
500           throw new IOException("Failed delete of " + daughterDir);
501         }
502       }
503     }
504     cleanupSplitsDir();
505     LOG.info("Cleaned up old failed split transaction detritus: " + splitdir);
506   }
507 
508   /**
509    * Remove daughter region
510    * @param regionInfo daughter {@link HRegionInfo}
511    * @throws IOException
512    */
cleanupDaughterRegion(final HRegionInfo regionInfo)513   void cleanupDaughterRegion(final HRegionInfo regionInfo) throws IOException {
514     Path regionDir = new Path(this.tableDir, regionInfo.getEncodedName());
515     if (this.fs.exists(regionDir) && !deleteDir(regionDir)) {
516       throw new IOException("Failed delete of " + regionDir);
517     }
518   }
519 
520   /**
521    * Commit a daughter region, moving it from the split temporary directory
522    * to the proper location in the filesystem.
523    *
524    * @param regionInfo                 daughter {@link org.apache.hadoop.hbase.HRegionInfo}
525    * @throws IOException
526    */
commitDaughterRegion(final HRegionInfo regionInfo)527   Path commitDaughterRegion(final HRegionInfo regionInfo)
528       throws IOException {
529     Path regionDir = new Path(this.tableDir, regionInfo.getEncodedName());
530     Path daughterTmpDir = this.getSplitsDir(regionInfo);
531 
532     if (fs.exists(daughterTmpDir)) {
533 
534       // Write HRI to a file in case we need to recover hbase:meta
535       Path regionInfoFile = new Path(daughterTmpDir, REGION_INFO_FILE);
536       byte[] regionInfoContent = getRegionInfoFileContent(regionInfo);
537       writeRegionInfoFileContent(conf, fs, regionInfoFile, regionInfoContent);
538 
539       // Move the daughter temp dir to the table dir
540       if (!rename(daughterTmpDir, regionDir)) {
541         throw new IOException("Unable to rename " + daughterTmpDir + " to " + regionDir);
542       }
543     }
544 
545     return regionDir;
546   }
547 
548   /**
549    * Create the region splits directory.
550    */
createSplitsDir()551   void createSplitsDir() throws IOException {
552     Path splitdir = getSplitsDir();
553     if (fs.exists(splitdir)) {
554       LOG.info("The " + splitdir + " directory exists.  Hence deleting it to recreate it");
555       if (!deleteDir(splitdir)) {
556         throw new IOException("Failed deletion of " + splitdir
557             + " before creating them again.");
558       }
559     }
560     // splitDir doesn't exists now. No need to do an exists() call for it.
561     if (!createDir(splitdir)) {
562       throw new IOException("Failed create of " + splitdir);
563     }
564   }
565 
566   /**
567    * Write out a split reference. Package local so it doesnt leak out of
568    * regionserver.
569    * @param hri {@link HRegionInfo} of the destination
570    * @param familyName Column Family Name
571    * @param f File to split.
572    * @param splitRow Split Row
573    * @param top True if we are referring to the top half of the hfile.
574    * @return Path to created reference.
575    * @param splitPolicy
576    * @throws IOException
577    */
splitStoreFile(final HRegionInfo hri, final String familyName, final StoreFile f, final byte[] splitRow, final boolean top, RegionSplitPolicy splitPolicy)578   Path splitStoreFile(final HRegionInfo hri, final String familyName, final StoreFile f,
579       final byte[] splitRow, final boolean top, RegionSplitPolicy splitPolicy) throws IOException {
580 
581     if (splitPolicy == null || !splitPolicy.skipStoreFileRangeCheck(familyName)) {
582       // Check whether the split row lies in the range of the store file
583       // If it is outside the range, return directly.
584       try {
585         if (top) {
586           //check if larger than last key.
587           KeyValue splitKey = KeyValueUtil.createFirstOnRow(splitRow);
588           byte[] lastKey = f.getLastKey();
589           // If lastKey is null means storefile is empty.
590           if (lastKey == null) {
591             return null;
592           }
593           if (f.getComparator().compareFlatKey(splitKey.getBuffer(),
594             splitKey.getKeyOffset(), splitKey.getKeyLength(), lastKey, 0, lastKey.length) > 0) {
595             return null;
596           }
597         } else {
598           //check if smaller than first key
599           KeyValue splitKey = KeyValueUtil.createLastOnRow(splitRow);
600           byte[] firstKey = f.getFirstKey();
601           // If firstKey is null means storefile is empty.
602           if (firstKey == null) {
603             return null;
604           }
605           if (f.getComparator().compareFlatKey(splitKey.getBuffer(),
606             splitKey.getKeyOffset(), splitKey.getKeyLength(), firstKey, 0, firstKey.length) < 0) {
607             return null;
608           }
609         }
610       } finally {
611         f.closeReader(f.getCacheConf() != null ? f.getCacheConf().shouldEvictOnClose() : true);
612       }
613     }
614 
615     Path splitDir = new Path(getSplitsDir(hri), familyName);
616     // A reference to the bottom half of the hsf store file.
617     Reference r =
618       top ? Reference.createTopReference(splitRow): Reference.createBottomReference(splitRow);
619     // Add the referred-to regions name as a dot separated suffix.
620     // See REF_NAME_REGEX regex above.  The referred-to regions name is
621     // up in the path of the passed in <code>f</code> -- parentdir is family,
622     // then the directory above is the region name.
623     String parentRegionName = regionInfoForFs.getEncodedName();
624     // Write reference with same file id only with the other region name as
625     // suffix and into the new region location (under same family).
626     Path p = new Path(splitDir, f.getPath().getName() + "." + parentRegionName);
627     return r.write(fs, p);
628   }
629 
630   // ===========================================================================
631   //  Merge Helpers
632   // ===========================================================================
633   /** @return {@link Path} to the temp directory used during merge operations */
getMergesDir()634   Path getMergesDir() {
635     return new Path(getRegionDir(), REGION_MERGES_DIR);
636   }
637 
getMergesDir(final HRegionInfo hri)638   Path getMergesDir(final HRegionInfo hri) {
639     return new Path(getMergesDir(), hri.getEncodedName());
640   }
641 
642   /**
643    * Clean up any merge detritus that may have been left around from previous merge attempts.
644    */
cleanupMergesDir()645   void cleanupMergesDir() throws IOException {
646     deleteDir(getMergesDir());
647   }
648 
649   /**
650    * Remove merged region
651    * @param mergedRegion {@link HRegionInfo}
652    * @throws IOException
653    */
cleanupMergedRegion(final HRegionInfo mergedRegion)654   void cleanupMergedRegion(final HRegionInfo mergedRegion) throws IOException {
655     Path regionDir = new Path(this.tableDir, mergedRegion.getEncodedName());
656     if (this.fs.exists(regionDir) && !this.fs.delete(regionDir, true)) {
657       throw new IOException("Failed delete of " + regionDir);
658     }
659   }
660 
661   /**
662    * Create the region merges directory.
663    * @throws IOException If merges dir already exists or we fail to create it.
664    * @see HRegionFileSystem#cleanupMergesDir()
665    */
createMergesDir()666   void createMergesDir() throws IOException {
667     Path mergesdir = getMergesDir();
668     if (fs.exists(mergesdir)) {
669       LOG.info("The " + mergesdir
670           + " directory exists.  Hence deleting it to recreate it");
671       if (!fs.delete(mergesdir, true)) {
672         throw new IOException("Failed deletion of " + mergesdir
673             + " before creating them again.");
674       }
675     }
676     if (!fs.mkdirs(mergesdir))
677       throw new IOException("Failed create of " + mergesdir);
678   }
679 
680   /**
681    * Write out a merge reference under the given merges directory. Package local
682    * so it doesnt leak out of regionserver.
683    * @param mergedRegion {@link HRegionInfo} of the merged region
684    * @param familyName Column Family Name
685    * @param f File to create reference.
686    * @param mergedDir
687    * @return Path to created reference.
688    * @throws IOException
689    */
mergeStoreFile(final HRegionInfo mergedRegion, final String familyName, final StoreFile f, final Path mergedDir)690   Path mergeStoreFile(final HRegionInfo mergedRegion, final String familyName,
691       final StoreFile f, final Path mergedDir)
692       throws IOException {
693     Path referenceDir = new Path(new Path(mergedDir,
694         mergedRegion.getEncodedName()), familyName);
695     // A whole reference to the store file.
696     Reference r = Reference.createTopReference(regionInfoForFs.getStartKey());
697     // Add the referred-to regions name as a dot separated suffix.
698     // See REF_NAME_REGEX regex above. The referred-to regions name is
699     // up in the path of the passed in <code>f</code> -- parentdir is family,
700     // then the directory above is the region name.
701     String mergingRegionName = regionInfoForFs.getEncodedName();
702     // Write reference with same file id only with the other region name as
703     // suffix and into the new region location (under same family).
704     Path p = new Path(referenceDir, f.getPath().getName() + "."
705         + mergingRegionName);
706     return r.write(fs, p);
707   }
708 
709   /**
710    * Commit a merged region, moving it from the merges temporary directory to
711    * the proper location in the filesystem.
712    * @param mergedRegionInfo merged region {@link HRegionInfo}
713    * @throws IOException
714    */
commitMergedRegion(final HRegionInfo mergedRegionInfo)715   void commitMergedRegion(final HRegionInfo mergedRegionInfo) throws IOException {
716     Path regionDir = new Path(this.tableDir, mergedRegionInfo.getEncodedName());
717     Path mergedRegionTmpDir = this.getMergesDir(mergedRegionInfo);
718     // Move the tmp dir in the expected location
719     if (mergedRegionTmpDir != null && fs.exists(mergedRegionTmpDir)) {
720       if (!fs.rename(mergedRegionTmpDir, regionDir)) {
721         throw new IOException("Unable to rename " + mergedRegionTmpDir + " to "
722             + regionDir);
723       }
724     }
725   }
726 
727   // ===========================================================================
728   //  Create/Open/Delete Helpers
729   // ===========================================================================
730   /**
731    * Log the current state of the region
732    * @param LOG log to output information
733    * @throws IOException if an unexpected exception occurs
734    */
logFileSystemState(final Log LOG)735   void logFileSystemState(final Log LOG) throws IOException {
736     FSUtils.logFileSystemState(fs, this.getRegionDir(), LOG);
737   }
738 
739   /**
740    * @param hri
741    * @return Content of the file we write out to the filesystem under a region
742    * @throws IOException
743    */
getRegionInfoFileContent(final HRegionInfo hri)744   private static byte[] getRegionInfoFileContent(final HRegionInfo hri) throws IOException {
745     return hri.toDelimitedByteArray();
746   }
747 
748   /**
749    * Create a {@link HRegionInfo} from the serialized version on-disk.
750    * @param fs {@link FileSystem} that contains the Region Info file
751    * @param regionDir {@link Path} to the Region Directory that contains the Info file
752    * @return An {@link HRegionInfo} instance gotten from the Region Info file.
753    * @throws IOException if an error occurred during file open/read operation.
754    */
loadRegionInfoFileContent(final FileSystem fs, final Path regionDir)755   public static HRegionInfo loadRegionInfoFileContent(final FileSystem fs, final Path regionDir)
756       throws IOException {
757     FSDataInputStream in = fs.open(new Path(regionDir, REGION_INFO_FILE));
758     try {
759       return HRegionInfo.parseFrom(in);
760     } finally {
761       in.close();
762     }
763   }
764 
765   /**
766    * Write the .regioninfo file on-disk.
767    */
writeRegionInfoFileContent(final Configuration conf, final FileSystem fs, final Path regionInfoFile, final byte[] content)768   private static void writeRegionInfoFileContent(final Configuration conf, final FileSystem fs,
769       final Path regionInfoFile, final byte[] content) throws IOException {
770     // First check to get the permissions
771     FsPermission perms = FSUtils.getFilePermissions(fs, conf, HConstants.DATA_FILE_UMASK_KEY);
772     // Write the RegionInfo file content
773     FSDataOutputStream out = FSUtils.create(conf, fs, regionInfoFile, perms, null);
774     try {
775       out.write(content);
776     } finally {
777       out.close();
778     }
779   }
780 
781   /**
782    * Write out an info file under the stored region directory. Useful recovering mangled regions.
783    * If the regionInfo already exists on-disk, then we fast exit.
784    */
checkRegionInfoOnFilesystem()785   void checkRegionInfoOnFilesystem() throws IOException {
786     // Compose the content of the file so we can compare to length in filesystem. If not same,
787     // rewrite it (it may have been written in the old format using Writables instead of pb). The
788     // pb version is much shorter -- we write now w/o the toString version -- so checking length
789     // only should be sufficient. I don't want to read the file every time to check if it pb
790     // serialized.
791     byte[] content = getRegionInfoFileContent(regionInfoForFs);
792     try {
793       Path regionInfoFile = new Path(getRegionDir(), REGION_INFO_FILE);
794 
795       FileStatus status = fs.getFileStatus(regionInfoFile);
796       if (status != null && status.getLen() == content.length) {
797         // Then assume the content good and move on.
798         // NOTE: that the length is not sufficient to define the the content matches.
799         return;
800       }
801 
802       LOG.info("Rewriting .regioninfo file at: " + regionInfoFile);
803       if (!fs.delete(regionInfoFile, false)) {
804         throw new IOException("Unable to remove existing " + regionInfoFile);
805       }
806     } catch (FileNotFoundException e) {
807       LOG.warn(REGION_INFO_FILE + " file not found for region: " + regionInfoForFs.getEncodedName() +
808           " on table " + regionInfo.getTable());
809     }
810 
811     // Write HRI to a file in case we need to recover hbase:meta
812     writeRegionInfoOnFilesystem(content, true);
813   }
814 
815   /**
816    * Write out an info file under the region directory. Useful recovering mangled regions.
817    * @param useTempDir indicate whether or not using the region .tmp dir for a safer file creation.
818    */
writeRegionInfoOnFilesystem(boolean useTempDir)819   private void writeRegionInfoOnFilesystem(boolean useTempDir) throws IOException {
820     byte[] content = getRegionInfoFileContent(regionInfoForFs);
821     writeRegionInfoOnFilesystem(content, useTempDir);
822   }
823 
824   /**
825    * Write out an info file under the region directory. Useful recovering mangled regions.
826    * @param regionInfoContent serialized version of the {@link HRegionInfo}
827    * @param useTempDir indicate whether or not using the region .tmp dir for a safer file creation.
828    */
writeRegionInfoOnFilesystem(final byte[] regionInfoContent, final boolean useTempDir)829   private void writeRegionInfoOnFilesystem(final byte[] regionInfoContent,
830       final boolean useTempDir) throws IOException {
831     Path regionInfoFile = new Path(getRegionDir(), REGION_INFO_FILE);
832     if (useTempDir) {
833       // Create in tmpDir and then move into place in case we crash after
834       // create but before close. If we don't successfully close the file,
835       // subsequent region reopens will fail the below because create is
836       // registered in NN.
837 
838       // And then create the file
839       Path tmpPath = new Path(getTempDir(), REGION_INFO_FILE);
840 
841       // If datanode crashes or if the RS goes down just before the close is called while trying to
842       // close the created regioninfo file in the .tmp directory then on next
843       // creation we will be getting AlreadyCreatedException.
844       // Hence delete and create the file if exists.
845       if (FSUtils.isExists(fs, tmpPath)) {
846         FSUtils.delete(fs, tmpPath, true);
847       }
848 
849       // Write HRI to a file in case we need to recover hbase:meta
850       writeRegionInfoFileContent(conf, fs, tmpPath, regionInfoContent);
851 
852       // Move the created file to the original path
853       if (fs.exists(tmpPath) &&  !rename(tmpPath, regionInfoFile)) {
854         throw new IOException("Unable to rename " + tmpPath + " to " + regionInfoFile);
855       }
856     } else {
857       // Write HRI to a file in case we need to recover hbase:meta
858       writeRegionInfoFileContent(conf, fs, regionInfoFile, regionInfoContent);
859     }
860   }
861 
862   /**
863    * Create a new Region on file-system.
864    * @param conf the {@link Configuration} to use
865    * @param fs {@link FileSystem} from which to add the region
866    * @param tableDir {@link Path} to where the table is being stored
867    * @param regionInfo {@link HRegionInfo} for region to be added
868    * @throws IOException if the region creation fails due to a FileSystem exception.
869    */
createRegionOnFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo)870   public static HRegionFileSystem createRegionOnFileSystem(final Configuration conf,
871       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo) throws IOException {
872     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
873     Path regionDir = regionFs.getRegionDir();
874 
875     if (fs.exists(regionDir)) {
876       LOG.warn("Trying to create a region that already exists on disk: " + regionDir);
877       throw new IOException("The specified region already exists on disk: " + regionDir);
878     }
879 
880     // Create the region directory
881     if (!createDirOnFileSystem(fs, conf, regionDir)) {
882       LOG.warn("Unable to create the region directory: " + regionDir);
883       throw new IOException("Unable to create region directory: " + regionDir);
884     }
885 
886     // Write HRI to a file in case we need to recover hbase:meta
887     regionFs.writeRegionInfoOnFilesystem(false);
888     return regionFs;
889   }
890 
891   /**
892    * Open Region from file-system.
893    * @param conf the {@link Configuration} to use
894    * @param fs {@link FileSystem} from which to add the region
895    * @param tableDir {@link Path} to where the table is being stored
896    * @param regionInfo {@link HRegionInfo} for region to be added
897    * @param readOnly True if you don't want to edit the region data
898    * @throws IOException if the region creation fails due to a FileSystem exception.
899    */
openRegionFromFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo, boolean readOnly)900   public static HRegionFileSystem openRegionFromFileSystem(final Configuration conf,
901       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo, boolean readOnly)
902       throws IOException {
903     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
904     Path regionDir = regionFs.getRegionDir();
905 
906     if (!fs.exists(regionDir)) {
907       LOG.warn("Trying to open a region that do not exists on disk: " + regionDir);
908       throw new IOException("The specified region do not exists on disk: " + regionDir);
909     }
910 
911     if (!readOnly) {
912       // Cleanup temporary directories
913       regionFs.cleanupTempDir();
914       regionFs.cleanupSplitsDir();
915       regionFs.cleanupMergesDir();
916 
917       // if it doesn't exists, Write HRI to a file, in case we need to recover hbase:meta
918       regionFs.checkRegionInfoOnFilesystem();
919     }
920 
921     return regionFs;
922   }
923 
924   /**
925    * Remove the region from the table directory, archiving the region's hfiles.
926    * @param conf the {@link Configuration} to use
927    * @param fs {@link FileSystem} from which to remove the region
928    * @param tableDir {@link Path} to where the table is being stored
929    * @param regionInfo {@link HRegionInfo} for region to be deleted
930    * @throws IOException if the request cannot be completed
931    */
deleteRegionFromFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo)932   public static void deleteRegionFromFileSystem(final Configuration conf,
933       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo) throws IOException {
934     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
935     Path regionDir = regionFs.getRegionDir();
936 
937     if (!fs.exists(regionDir)) {
938       LOG.warn("Trying to delete a region that do not exists on disk: " + regionDir);
939       return;
940     }
941 
942     if (LOG.isDebugEnabled()) {
943       LOG.debug("DELETING region " + regionDir);
944     }
945 
946     // Archive region
947     Path rootDir = FSUtils.getRootDir(conf);
948     HFileArchiver.archiveRegion(fs, rootDir, tableDir, regionDir);
949 
950     // Delete empty region dir
951     if (!fs.delete(regionDir, true)) {
952       LOG.warn("Failed delete of " + regionDir);
953     }
954   }
955 
956   /**
957    * Creates a directory. Assumes the user has already checked for this directory existence.
958    * @param dir
959    * @return the result of fs.mkdirs(). In case underlying fs throws an IOException, it checks
960    *         whether the directory exists or not, and returns true if it exists.
961    * @throws IOException
962    */
createDir(Path dir)963   boolean createDir(Path dir) throws IOException {
964     int i = 0;
965     IOException lastIOE = null;
966     do {
967       try {
968         return fs.mkdirs(dir);
969       } catch (IOException ioe) {
970         lastIOE = ioe;
971         if (fs.exists(dir)) return true; // directory is present
972         try {
973           sleepBeforeRetry("Create Directory", i+1);
974         } catch (InterruptedException e) {
975           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
976         }
977       }
978     } while (++i <= hdfsClientRetriesNumber);
979     throw new IOException("Exception in createDir", lastIOE);
980   }
981 
982   /**
983    * Renames a directory. Assumes the user has already checked for this directory existence.
984    * @param srcpath
985    * @param dstPath
986    * @return true if rename is successful.
987    * @throws IOException
988    */
rename(Path srcpath, Path dstPath)989   boolean rename(Path srcpath, Path dstPath) throws IOException {
990     IOException lastIOE = null;
991     int i = 0;
992     do {
993       try {
994         return fs.rename(srcpath, dstPath);
995       } catch (IOException ioe) {
996         lastIOE = ioe;
997         if (!fs.exists(srcpath) && fs.exists(dstPath)) return true; // successful move
998         // dir is not there, retry after some time.
999         try {
1000           sleepBeforeRetry("Rename Directory", i+1);
1001         } catch (InterruptedException e) {
1002           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1003         }
1004       }
1005     } while (++i <= hdfsClientRetriesNumber);
1006 
1007     throw new IOException("Exception in rename", lastIOE);
1008   }
1009 
1010   /**
1011    * Deletes a directory. Assumes the user has already checked for this directory existence.
1012    * @param dir
1013    * @return true if the directory is deleted.
1014    * @throws IOException
1015    */
deleteDir(Path dir)1016   boolean deleteDir(Path dir) throws IOException {
1017     IOException lastIOE = null;
1018     int i = 0;
1019     do {
1020       try {
1021         return fs.delete(dir, true);
1022       } catch (IOException ioe) {
1023         lastIOE = ioe;
1024         if (!fs.exists(dir)) return true;
1025         // dir is there, retry deleting after some time.
1026         try {
1027           sleepBeforeRetry("Delete Directory", i+1);
1028         } catch (InterruptedException e) {
1029           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1030         }
1031       }
1032     } while (++i <= hdfsClientRetriesNumber);
1033 
1034     throw new IOException("Exception in DeleteDir", lastIOE);
1035   }
1036 
1037   /**
1038    * sleeping logic; handles the interrupt exception.
1039    */
sleepBeforeRetry(String msg, int sleepMultiplier)1040   private void sleepBeforeRetry(String msg, int sleepMultiplier) throws InterruptedException {
1041     sleepBeforeRetry(msg, sleepMultiplier, baseSleepBeforeRetries, hdfsClientRetriesNumber);
1042   }
1043 
1044   /**
1045    * Creates a directory for a filesystem and configuration object. Assumes the user has already
1046    * checked for this directory existence.
1047    * @param fs
1048    * @param conf
1049    * @param dir
1050    * @return the result of fs.mkdirs(). In case underlying fs throws an IOException, it checks
1051    *         whether the directory exists or not, and returns true if it exists.
1052    * @throws IOException
1053    */
createDirOnFileSystem(FileSystem fs, Configuration conf, Path dir)1054   private static boolean createDirOnFileSystem(FileSystem fs, Configuration conf, Path dir)
1055       throws IOException {
1056     int i = 0;
1057     IOException lastIOE = null;
1058     int hdfsClientRetriesNumber = conf.getInt("hdfs.client.retries.number",
1059       DEFAULT_HDFS_CLIENT_RETRIES_NUMBER);
1060     int baseSleepBeforeRetries = conf.getInt("hdfs.client.sleep.before.retries",
1061       DEFAULT_BASE_SLEEP_BEFORE_RETRIES);
1062     do {
1063       try {
1064         return fs.mkdirs(dir);
1065       } catch (IOException ioe) {
1066         lastIOE = ioe;
1067         if (fs.exists(dir)) return true; // directory is present
1068         try {
1069           sleepBeforeRetry("Create Directory", i+1, baseSleepBeforeRetries, hdfsClientRetriesNumber);
1070         } catch (InterruptedException e) {
1071           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1072         }
1073       }
1074     } while (++i <= hdfsClientRetriesNumber);
1075 
1076     throw new IOException("Exception in createDir", lastIOE);
1077   }
1078 
1079   /**
1080    * sleeping logic for static methods; handles the interrupt exception. Keeping a static version
1081    * for this to avoid re-looking for the integer values.
1082    */
sleepBeforeRetry(String msg, int sleepMultiplier, int baseSleepBeforeRetries, int hdfsClientRetriesNumber)1083   private static void sleepBeforeRetry(String msg, int sleepMultiplier, int baseSleepBeforeRetries,
1084       int hdfsClientRetriesNumber) throws InterruptedException {
1085     if (sleepMultiplier > hdfsClientRetriesNumber) {
1086       LOG.debug(msg + ", retries exhausted");
1087       return;
1088     }
1089     LOG.debug(msg + ", sleeping " + baseSleepBeforeRetries + " times " + sleepMultiplier);
1090     Thread.sleep((long)baseSleepBeforeRetries * sleepMultiplier);
1091   }
1092 }
1093