1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 package org.apache.hadoop.hdfs.tools; 19 20 import java.io.BufferedReader; 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.InputStreamReader; 24 import java.io.PrintStream; 25 import java.net.URI; 26 import java.net.URL; 27 import java.net.URLConnection; 28 import java.net.URLEncoder; 29 import java.security.PrivilegedExceptionAction; 30 31 import org.apache.hadoop.classification.InterfaceAudience; 32 import org.apache.hadoop.conf.Configuration; 33 import org.apache.hadoop.conf.Configured; 34 import org.apache.hadoop.fs.FileSystem; 35 import org.apache.hadoop.fs.Path; 36 import org.apache.hadoop.hdfs.DFSUtil; 37 import org.apache.hadoop.hdfs.DistributedFileSystem; 38 import org.apache.hadoop.hdfs.HAUtil; 39 import org.apache.hadoop.hdfs.HdfsConfiguration; 40 import org.apache.hadoop.hdfs.server.namenode.NamenodeFsck; 41 import org.apache.hadoop.hdfs.web.URLConnectionFactory; 42 import org.apache.hadoop.security.UserGroupInformation; 43 import org.apache.hadoop.security.authentication.client.AuthenticationException; 44 import org.apache.hadoop.util.StringUtils; 45 import org.apache.hadoop.util.Tool; 46 import org.apache.hadoop.util.ToolRunner; 47 48 /** 49 * This class provides rudimentary checking of DFS volumes for errors and 50 * sub-optimal conditions. 51 * <p>The tool scans all files and directories, starting from an indicated 52 * root path. The following abnormal conditions are detected and handled:</p> 53 * <ul> 54 * <li>files with blocks that are completely missing from all datanodes.<br/> 55 * In this case the tool can perform one of the following actions: 56 * <ul> 57 * <li>none ({@link org.apache.hadoop.hdfs.server.namenode.NamenodeFsck#FIXING_NONE})</li> 58 * <li>move corrupted files to /lost+found directory on DFS 59 * ({@link org.apache.hadoop.hdfs.server.namenode.NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as a 60 * block chains, representing longest consecutive series of valid blocks.</li> 61 * <li>delete corrupted files ({@link org.apache.hadoop.hdfs.server.namenode.NamenodeFsck#FIXING_DELETE})</li> 62 * </ul> 63 * </li> 64 * <li>detect files with under-replicated or over-replicated blocks</li> 65 * </ul> 66 * Additionally, the tool collects a detailed overall DFS statistics, and 67 * optionally can print detailed statistics on block locations and replication 68 * factors of each file. 69 * The tool also provides and option to filter open files during the scan. 70 * 71 */ 72 @InterfaceAudience.Private 73 public class DFSck extends Configured implements Tool { 74 static{ HdfsConfiguration.init()75 HdfsConfiguration.init(); 76 } 77 78 private static final String USAGE = "Usage: hdfs fsck <path> " 79 + "[-list-corruptfileblocks | " 80 + "[-move | -delete | -openforwrite] " 81 + "[-files [-blocks [-locations | -racks]]]] " 82 + "[-includeSnapshots] " 83 + "[-storagepolicies] [-blockId <blk_Id>]\n" 84 + "\t<path>\tstart checking from this path\n" 85 + "\t-move\tmove corrupted files to /lost+found\n" 86 + "\t-delete\tdelete corrupted files\n" 87 + "\t-files\tprint out files being checked\n" 88 + "\t-openforwrite\tprint out files opened for write\n" 89 + "\t-includeSnapshots\tinclude snapshot data if the given path" 90 + " indicates a snapshottable directory or there are " 91 + "snapshottable directories under it\n" 92 + "\t-list-corruptfileblocks\tprint out list of missing " 93 + "blocks and files they belong to\n" 94 + "\t-blocks\tprint out block report\n" 95 + "\t-locations\tprint out locations for every block\n" 96 + "\t-racks\tprint out network topology for data-node locations\n" 97 + "\t-storagepolicies\tprint out storage policy summary for the blocks\n" 98 + "\t-blockId\tprint out which file this blockId belongs to, locations" 99 + " (nodes, racks) of this block, and other diagnostics info" 100 + " (under replicated, corrupted or not, etc)\n\n" 101 + "Please Note:\n" 102 + "\t1. By default fsck ignores files opened for write, " 103 + "use -openforwrite to report such files. They are usually " 104 + " tagged CORRUPT or HEALTHY depending on their block " 105 + "allocation status\n" 106 + "\t2. Option -includeSnapshots should not be used for comparing stats," 107 + " should be used only for HEALTH check, as this may contain duplicates" 108 + " if the same file present in both original fs tree " 109 + "and inside snapshots."; 110 111 private final UserGroupInformation ugi; 112 private final PrintStream out; 113 private final URLConnectionFactory connectionFactory; 114 private final boolean isSpnegoEnabled; 115 116 /** 117 * Filesystem checker. 118 * @param conf current Configuration 119 */ DFSck(Configuration conf)120 public DFSck(Configuration conf) throws IOException { 121 this(conf, System.out); 122 } 123 DFSck(Configuration conf, PrintStream out)124 public DFSck(Configuration conf, PrintStream out) throws IOException { 125 super(conf); 126 this.ugi = UserGroupInformation.getCurrentUser(); 127 this.out = out; 128 this.connectionFactory = URLConnectionFactory 129 .newDefaultURLConnectionFactory(conf); 130 this.isSpnegoEnabled = UserGroupInformation.isSecurityEnabled(); 131 } 132 133 /** 134 * Print fsck usage information 135 */ printUsage(PrintStream out)136 static void printUsage(PrintStream out) { 137 out.println(USAGE + "\n"); 138 ToolRunner.printGenericCommandUsage(out); 139 } 140 @Override run(final String[] args)141 public int run(final String[] args) throws IOException { 142 if (args.length == 0) { 143 printUsage(System.err); 144 return -1; 145 } 146 147 try { 148 return UserGroupInformation.getCurrentUser().doAs( 149 new PrivilegedExceptionAction<Integer>() { 150 @Override 151 public Integer run() throws Exception { 152 return doWork(args); 153 } 154 }); 155 } catch (InterruptedException e) { 156 throw new IOException(e); 157 } 158 } 159 160 /* 161 * To get the list, we need to call iteratively until the server says 162 * there is no more left. 163 */ 164 private Integer listCorruptFileBlocks(String dir, String baseUrl) 165 throws IOException { 166 int errCode = -1; 167 int numCorrupt = 0; 168 int cookie = 0; 169 final String noCorruptLine = "has no CORRUPT files"; 170 final String noMoreCorruptLine = "has no more CORRUPT files"; 171 final String cookiePrefix = "Cookie:"; 172 boolean allDone = false; 173 while (!allDone) { 174 final StringBuffer url = new StringBuffer(baseUrl); 175 if (cookie > 0) { 176 url.append("&startblockafter=").append(String.valueOf(cookie)); 177 } 178 URL path = new URL(url.toString()); 179 URLConnection connection; 180 try { 181 connection = connectionFactory.openConnection(path, isSpnegoEnabled); 182 } catch (AuthenticationException e) { 183 throw new IOException(e); 184 } 185 InputStream stream = connection.getInputStream(); 186 BufferedReader input = new BufferedReader(new InputStreamReader( 187 stream, "UTF-8")); 188 try { 189 String line = null; 190 while ((line = input.readLine()) != null) { 191 if (line.startsWith(cookiePrefix)){ 192 try{ 193 cookie = Integer.parseInt(line.split("\t")[1]); 194 } catch (Exception e){ 195 allDone = true; 196 break; 197 } 198 continue; 199 } 200 if ((line.endsWith(noCorruptLine)) || 201 (line.endsWith(noMoreCorruptLine)) || 202 (line.endsWith(NamenodeFsck.NONEXISTENT_STATUS))) { 203 allDone = true; 204 break; 205 } 206 if ((line.isEmpty()) 207 || (line.startsWith("FSCK started by")) 208 || (line.startsWith("The filesystem under path"))) 209 continue; 210 numCorrupt++; 211 if (numCorrupt == 1) { 212 out.println("The list of corrupt files under path '" 213 + dir + "' are:"); 214 } 215 out.println(line); 216 } 217 } finally { 218 input.close(); 219 } 220 } 221 out.println("The filesystem under path '" + dir + "' has " 222 + numCorrupt + " CORRUPT files"); 223 if (numCorrupt == 0) 224 errCode = 0; 225 return errCode; 226 } 227 228 229 private Path getResolvedPath(String dir) throws IOException { 230 Configuration conf = getConf(); 231 Path dirPath = new Path(dir); 232 FileSystem fs = dirPath.getFileSystem(conf); 233 return fs.resolvePath(dirPath); 234 } 235 236 /** 237 * Derive the namenode http address from the current file system, 238 * either default or as set by "-fs" in the generic options. 239 * @return Returns http address or null if failure. 240 * @throws IOException if we can't determine the active NN address 241 */ 242 private URI getCurrentNamenodeAddress(Path target) throws IOException { 243 //String nnAddress = null; 244 Configuration conf = getConf(); 245 246 //get the filesystem object to verify it is an HDFS system 247 final FileSystem fs = target.getFileSystem(conf); 248 if (!(fs instanceof DistributedFileSystem)) { 249 System.err.println("FileSystem is " + fs.getUri()); 250 return null; 251 } 252 253 return DFSUtil.getInfoServer(HAUtil.getAddressOfActive(fs), conf, 254 DFSUtil.getHttpClientScheme(conf)); 255 } 256 257 private int doWork(final String[] args) throws IOException { 258 final StringBuilder url = new StringBuilder(); 259 260 url.append("/fsck?ugi=").append(ugi.getShortUserName()); 261 String dir = null; 262 boolean doListCorruptFileBlocks = false; 263 for (int idx = 0; idx < args.length; idx++) { 264 if (args[idx].equals("-move")) { url.append("&move=1"); } 265 else if (args[idx].equals("-delete")) { url.append("&delete=1"); } 266 else if (args[idx].equals("-files")) { url.append("&files=1"); } 267 else if (args[idx].equals("-openforwrite")) { url.append("&openforwrite=1"); } 268 else if (args[idx].equals("-blocks")) { url.append("&blocks=1"); } 269 else if (args[idx].equals("-locations")) { url.append("&locations=1"); } 270 else if (args[idx].equals("-racks")) { url.append("&racks=1"); } 271 else if (args[idx].equals("-storagepolicies")) { url.append("&storagepolicies=1"); } 272 else if (args[idx].equals("-list-corruptfileblocks")) { 273 url.append("&listcorruptfileblocks=1"); 274 doListCorruptFileBlocks = true; 275 } else if (args[idx].equals("-includeSnapshots")) { 276 url.append("&includeSnapshots=1"); 277 } else if (args[idx].equals("-blockId")) { 278 StringBuilder sb = new StringBuilder(); 279 idx++; 280 while(idx < args.length && !args[idx].startsWith("-")){ 281 sb.append(args[idx]); 282 sb.append(" "); 283 idx++; 284 } 285 url.append("&blockId=").append(URLEncoder.encode(sb.toString(), "UTF-8")); 286 } else if (!args[idx].startsWith("-")) { 287 if (null == dir) { 288 dir = args[idx]; 289 } else { 290 System.err.println("fsck: can only operate on one path at a time '" 291 + args[idx] + "'"); 292 printUsage(System.err); 293 return -1; 294 } 295 296 } else { 297 System.err.println("fsck: Illegal option '" + args[idx] + "'"); 298 printUsage(System.err); 299 return -1; 300 } 301 } 302 if (null == dir) { 303 dir = "/"; 304 } 305 306 Path dirpath = null; 307 URI namenodeAddress = null; 308 try { 309 dirpath = getResolvedPath(dir); 310 namenodeAddress = getCurrentNamenodeAddress(dirpath); 311 } catch (IOException ioe) { 312 System.err.println("FileSystem is inaccessible due to:\n" 313 + StringUtils.stringifyException(ioe)); 314 } 315 316 if (namenodeAddress == null) { 317 //Error message already output in {@link #getCurrentNamenodeAddress()} 318 System.err.println("DFSck exiting."); 319 return 0; 320 } 321 322 url.insert(0, namenodeAddress.toString()); 323 url.append("&path=").append(URLEncoder.encode( 324 Path.getPathWithoutSchemeAndAuthority(dirpath).toString(), "UTF-8")); 325 System.err.println("Connecting to namenode via " + url.toString()); 326 327 if (doListCorruptFileBlocks) { 328 return listCorruptFileBlocks(dir, url.toString()); 329 } 330 URL path = new URL(url.toString()); 331 URLConnection connection; 332 try { 333 connection = connectionFactory.openConnection(path, isSpnegoEnabled); 334 } catch (AuthenticationException e) { 335 throw new IOException(e); 336 } 337 InputStream stream = connection.getInputStream(); 338 BufferedReader input = new BufferedReader(new InputStreamReader( 339 stream, "UTF-8")); 340 String line = null; 341 String lastLine = null; 342 int errCode = -1; 343 try { 344 while ((line = input.readLine()) != null) { 345 out.println(line); 346 lastLine = line; 347 } 348 } finally { 349 input.close(); 350 } 351 if (lastLine.endsWith(NamenodeFsck.HEALTHY_STATUS)) { 352 errCode = 0; 353 } else if (lastLine.endsWith(NamenodeFsck.CORRUPT_STATUS)) { 354 errCode = 1; 355 } else if (lastLine.endsWith(NamenodeFsck.NONEXISTENT_STATUS)) { 356 errCode = 0; 357 } else if (lastLine.contains("Incorrect blockId format:")) { 358 errCode = 0; 359 } else if (lastLine.endsWith(NamenodeFsck.DECOMMISSIONED_STATUS)) { 360 errCode = 2; 361 } else if (lastLine.endsWith(NamenodeFsck.DECOMMISSIONING_STATUS)) { 362 errCode = 3; 363 } 364 return errCode; 365 } 366 367 public static void main(String[] args) throws Exception { 368 // -files option is also used by GenericOptionsParser 369 // Make sure that is not the first argument for fsck 370 int res = -1; 371 if ((args.length == 0) || ("-files".equals(args[0]))) { 372 printUsage(System.err); 373 ToolRunner.printGenericCommandUsage(System.err); 374 } else if (DFSUtil.parseHelpArgument(args, USAGE, System.out, true)) { 375 res = 0; 376 } else { 377 res = ToolRunner.run(new DFSck(new HdfsConfiguration()), args); 378 } 379 System.exit(res); 380 } 381 } 382