1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 package org.apache.hadoop.mapreduce; 20 21 import java.io.IOException; 22 23 import org.apache.hadoop.classification.InterfaceAudience; 24 import org.apache.hadoop.classification.InterfaceStability; 25 /** 26 * <code>OutputCommitter</code> describes the commit of task output for a 27 * Map-Reduce job. 28 * 29 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of 30 * the job to:<p> 31 * <ol> 32 * <li> 33 * Setup the job during initialization. For example, create the temporary 34 * output directory for the job during the initialization of the job. 35 * </li> 36 * <li> 37 * Cleanup the job after the job completion. For example, remove the 38 * temporary output directory after the job completion. 39 * </li> 40 * <li> 41 * Setup the task temporary output. 42 * </li> 43 * <li> 44 * Check whether a task needs a commit. This is to avoid the commit 45 * procedure if a task does not need commit. 46 * </li> 47 * <li> 48 * Commit of the task output. 49 * </li> 50 * <li> 51 * Discard the task commit. 52 * </li> 53 * </ol> 54 * The methods in this class can be called from several different processes and 55 * from several different contexts. It is important to know which process and 56 * which context each is called from. Each method should be marked accordingly 57 * in its documentation. It is also important to note that not all methods are 58 * guaranteed to be called once and only once. If a method is not guaranteed to 59 * have this property the output committer needs to handle this appropriately. 60 * Also note it will only be in rare situations where they may be called 61 * multiple times for the same task. 62 * 63 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 64 * @see JobContext 65 * @see TaskAttemptContext 66 */ 67 @InterfaceAudience.Public 68 @InterfaceStability.Stable 69 public abstract class OutputCommitter { 70 /** 71 * For the framework to setup the job output during initialization. This is 72 * called from the application master process for the entire job. This will be 73 * called multiple times, once per job attempt. 74 * 75 * @param jobContext Context of the job whose output is being written. 76 * @throws IOException if temporary output could not be created 77 */ setupJob(JobContext jobContext)78 public abstract void setupJob(JobContext jobContext) throws IOException; 79 80 /** 81 * For cleaning up the job's output after job completion. This is called 82 * from the application master process for the entire job. This may be called 83 * multiple times. 84 * 85 * @param jobContext Context of the job whose output is being written. 86 * @throws IOException 87 * @deprecated Use {@link #commitJob(JobContext)} and 88 * {@link #abortJob(JobContext, JobStatus.State)} instead. 89 */ 90 @Deprecated cleanupJob(JobContext jobContext)91 public void cleanupJob(JobContext jobContext) throws IOException { } 92 93 /** 94 * For committing job's output after successful job completion. Note that this 95 * is invoked for jobs with final runstate as SUCCESSFUL. This is called 96 * from the application master process for the entire job. This is guaranteed 97 * to only be called once. If it throws an exception the entire job will 98 * fail. 99 * 100 * @param jobContext Context of the job whose output is being written. 101 * @throws IOException 102 */ commitJob(JobContext jobContext)103 public void commitJob(JobContext jobContext) throws IOException { 104 cleanupJob(jobContext); 105 } 106 107 108 /** 109 * For aborting an unsuccessful job's output. Note that this is invoked for 110 * jobs with final runstate as {@link JobStatus.State#FAILED} or 111 * {@link JobStatus.State#KILLED}. This is called from the application 112 * master process for the entire job. This may be called multiple times. 113 * 114 * @param jobContext Context of the job whose output is being written. 115 * @param state final runstate of the job 116 * @throws IOException 117 */ abortJob(JobContext jobContext, JobStatus.State state)118 public void abortJob(JobContext jobContext, JobStatus.State state) 119 throws IOException { 120 cleanupJob(jobContext); 121 } 122 123 /** 124 * Sets up output for the task. This is called from each individual task's 125 * process that will output to HDFS, and it is called just for that task. This 126 * may be called multiple times for the same task, but for different task 127 * attempts. 128 * 129 * @param taskContext Context of the task whose output is being written. 130 * @throws IOException 131 */ setupTask(TaskAttemptContext taskContext)132 public abstract void setupTask(TaskAttemptContext taskContext) 133 throws IOException; 134 135 /** 136 * Check whether task needs a commit. This is called from each individual 137 * task's process that will output to HDFS, and it is called just for that 138 * task. 139 * 140 * @param taskContext 141 * @return true/false 142 * @throws IOException 143 */ needsTaskCommit(TaskAttemptContext taskContext)144 public abstract boolean needsTaskCommit(TaskAttemptContext taskContext) 145 throws IOException; 146 147 /** 148 * To promote the task's temporary output to final output location. 149 * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this 150 * task is the task that the AM determines finished first, this method 151 * is called to commit an individual task's output. This is to mark 152 * that tasks output as complete, as {@link #commitJob(JobContext)} will 153 * also be called later on if the entire job finished successfully. This 154 * is called from a task's process. This may be called multiple times for the 155 * same task, but different task attempts. It should be very rare for this to 156 * be called multiple times and requires odd networking failures to make this 157 * happen. In the future the Hadoop framework may eliminate this race. 158 * 159 * @param taskContext Context of the task whose output is being written. 160 * @throws IOException if commit is not successful. 161 */ commitTask(TaskAttemptContext taskContext)162 public abstract void commitTask(TaskAttemptContext taskContext) 163 throws IOException; 164 165 /** 166 * Discard the task output. This is called from a task's process to clean 167 * up a single task's output that can not yet been committed. This may be 168 * called multiple times for the same task, but for different task attempts. 169 * 170 * @param taskContext 171 * @throws IOException 172 */ abortTask(TaskAttemptContext taskContext)173 public abstract void abortTask(TaskAttemptContext taskContext) 174 throws IOException; 175 176 /** 177 * Is task output recovery supported for restarting jobs? 178 * 179 * If task output recovery is supported, job restart can be done more 180 * efficiently. 181 * 182 * @return <code>true</code> if task output recovery is supported, 183 * <code>false</code> otherwise 184 * @see #recoverTask(TaskAttemptContext) 185 * @deprecated Use {@link #isRecoverySupported(JobContext)} instead. 186 */ 187 @Deprecated isRecoverySupported()188 public boolean isRecoverySupported() { 189 return false; 190 } 191 192 /** 193 * Is task output recovery supported for restarting jobs? 194 * 195 * If task output recovery is supported, job restart can be done more 196 * efficiently. 197 * 198 * @param jobContext 199 * Context of the job whose output is being written. 200 * @return <code>true</code> if task output recovery is supported, 201 * <code>false</code> otherwise 202 * @throws IOException 203 * @see #recoverTask(TaskAttemptContext) 204 */ isRecoverySupported(JobContext jobContext)205 public boolean isRecoverySupported(JobContext jobContext) throws IOException { 206 return isRecoverySupported(); 207 } 208 209 /** 210 * Recover the task output. 211 * 212 * The retry-count for the job will be passed via the 213 * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in 214 * {@link TaskAttemptContext#getConfiguration()} for the 215 * <code>OutputCommitter</code>. This is called from the application master 216 * process, but it is called individually for each task. 217 * 218 * If an exception is thrown the task will be attempted again. 219 * 220 * This may be called multiple times for the same task. But from different 221 * application attempts. 222 * 223 * @param taskContext Context of the task whose output is being recovered 224 * @throws IOException 225 */ recoverTask(TaskAttemptContext taskContext)226 public void recoverTask(TaskAttemptContext taskContext) 227 throws IOException 228 {} 229 } 230