1 /**
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 package org.apache.hadoop.hdfs.util;
19 
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileNotFoundException;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.security.DigestInputStream;
28 import java.security.MessageDigest;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31 
32 import org.apache.commons.logging.Log;
33 import org.apache.commons.logging.LogFactory;
34 import org.apache.hadoop.io.IOUtils;
35 import org.apache.hadoop.io.MD5Hash;
36 import org.apache.hadoop.util.StringUtils;
37 
38 import com.google.common.base.Charsets;
39 
40 /**
41  * Static functions for dealing with files of the same format
42  * that the Unix "md5sum" utility writes.
43  */
44 public abstract class MD5FileUtils {
45   private static final Log LOG = LogFactory.getLog(
46       MD5FileUtils.class);
47 
48   public static final String MD5_SUFFIX = ".md5";
49   private static final Pattern LINE_REGEX =
50     Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
51 
52   /**
53    * Verify that the previously saved md5 for the given file matches
54    * expectedMd5.
55    * @throws IOException
56    */
verifySavedMD5(File dataFile, MD5Hash expectedMD5)57   public static void verifySavedMD5(File dataFile, MD5Hash expectedMD5)
58       throws IOException {
59     MD5Hash storedHash = readStoredMd5ForFile(dataFile);
60     // Check the hash itself
61     if (!expectedMD5.equals(storedHash)) {
62       throw new IOException(
63           "File " + dataFile + " did not match stored MD5 checksum " +
64           " (stored: " + storedHash + ", computed: " + expectedMD5);
65     }
66   }
67 
68   /**
69    * Read the md5 file stored alongside the given data file
70    * and match the md5 file content.
71    * @param dataFile the file containing data
72    * @return a matcher with two matched groups
73    *   where group(1) is the md5 string and group(2) is the data file path.
74    */
readStoredMd5(File md5File)75   private static Matcher readStoredMd5(File md5File) throws IOException {
76     BufferedReader reader =
77         new BufferedReader(new InputStreamReader(new FileInputStream(
78             md5File), Charsets.UTF_8));
79     String md5Line;
80     try {
81       md5Line = reader.readLine();
82       if (md5Line == null) { md5Line = ""; }
83       md5Line = md5Line.trim();
84     } catch (IOException ioe) {
85       throw new IOException("Error reading md5 file at " + md5File, ioe);
86     } finally {
87       IOUtils.cleanup(LOG, reader);
88     }
89 
90     Matcher matcher = LINE_REGEX.matcher(md5Line);
91     if (!matcher.matches()) {
92       throw new IOException("Invalid MD5 file " + md5File + ": the content \""
93           + md5Line + "\" does not match the expected pattern.");
94     }
95     return matcher;
96   }
97 
98   /**
99    * Read the md5 checksum stored alongside the given data file.
100    * @param dataFile the file containing data
101    * @return the checksum stored in dataFile.md5
102    */
readStoredMd5ForFile(File dataFile)103   public static MD5Hash readStoredMd5ForFile(File dataFile) throws IOException {
104     final File md5File = getDigestFileForFile(dataFile);
105     if (!md5File.exists()) {
106       return null;
107     }
108 
109     final Matcher matcher = readStoredMd5(md5File);
110     String storedHash = matcher.group(1);
111     File referencedFile = new File(matcher.group(2));
112 
113     // Sanity check: Make sure that the file referenced in the .md5 file at
114     // least has the same name as the file we expect
115     if (!referencedFile.getName().equals(dataFile.getName())) {
116       throw new IOException(
117           "MD5 file at " + md5File + " references file named " +
118           referencedFile.getName() + " but we expected it to reference " +
119           dataFile);
120     }
121     return new MD5Hash(storedHash);
122   }
123 
124   /**
125    * Read dataFile and compute its MD5 checksum.
126    */
computeMd5ForFile(File dataFile)127   public static MD5Hash computeMd5ForFile(File dataFile) throws IOException {
128     InputStream in = new FileInputStream(dataFile);
129     try {
130       MessageDigest digester = MD5Hash.getDigester();
131       DigestInputStream dis = new DigestInputStream(in, digester);
132       IOUtils.copyBytes(dis, new IOUtils.NullOutputStream(), 128*1024);
133 
134       return new MD5Hash(digester.digest());
135     } finally {
136       IOUtils.closeStream(in);
137     }
138   }
139 
140   /**
141    * Save the ".md5" file that lists the md5sum of another file.
142    * @param dataFile the original file whose md5 was computed
143    * @param digest the computed digest
144    * @throws IOException
145    */
saveMD5File(File dataFile, MD5Hash digest)146   public static void saveMD5File(File dataFile, MD5Hash digest)
147       throws IOException {
148     final String digestString = StringUtils.byteToHexString(digest.getDigest());
149     saveMD5File(dataFile, digestString);
150   }
151 
saveMD5File(File dataFile, String digestString)152   private static void saveMD5File(File dataFile, String digestString)
153       throws IOException {
154     File md5File = getDigestFileForFile(dataFile);
155     String md5Line = digestString + " *" + dataFile.getName() + "\n";
156 
157     AtomicFileOutputStream afos = new AtomicFileOutputStream(md5File);
158     afos.write(md5Line.getBytes(Charsets.UTF_8));
159     afos.close();
160 
161     if (LOG.isDebugEnabled()) {
162       LOG.debug("Saved MD5 " + digestString + " to " + md5File);
163     }
164   }
165 
renameMD5File(File oldDataFile, File newDataFile)166   public static void renameMD5File(File oldDataFile, File newDataFile)
167       throws IOException {
168     final File fromFile = getDigestFileForFile(oldDataFile);
169     if (!fromFile.exists()) {
170       throw new FileNotFoundException(fromFile + " does not exist.");
171     }
172 
173     final String digestString = readStoredMd5(fromFile).group(1);
174     saveMD5File(newDataFile, digestString);
175 
176     if (!fromFile.delete()) {
177       LOG.warn("deleting  " + fromFile.getAbsolutePath() + " FAILED");
178     }
179   }
180 
181   /**
182    * @return a reference to the file with .md5 suffix that will
183    * contain the md5 checksum for the given data file.
184    */
getDigestFileForFile(File file)185   public static File getDigestFileForFile(File file) {
186     return new File(file.getParentFile(), file.getName() + MD5_SUFFIX);
187   }
188 }
189