/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
25 /**
26  * <code>OutputCommitter</code> describes the commit of task output for a
27  * Map-Reduce job.
28  *
29  * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of
30  * the job to:<p>
31  * <ol>
32  *   <li>
33  *   Setup the job during initialization. For example, create the temporary
34  *   output directory for the job during the initialization of the job.
35  *   </li>
36  *   <li>
37  *   Cleanup the job after the job completion. For example, remove the
38  *   temporary output directory after the job completion.
39  *   </li>
40  *   <li>
41  *   Setup the task temporary output.
42  *   </li>
43  *   <li>
44  *   Check whether a task needs a commit. This is to avoid the commit
45  *   procedure if a task does not need commit.
46  *   </li>
47  *   <li>
48  *   Commit of the task output.
49  *   </li>
50  *   <li>
51  *   Discard the task commit.
52  *   </li>
53  * </ol>
54  * The methods in this class can be called from several different processes and
55  * from several different contexts.  It is important to know which process and
56  * which context each is called from.  Each method should be marked accordingly
57  * in its documentation.  It is also important to note that not all methods are
58  * guaranteed to be called once and only once.  If a method is not guaranteed to
59  * have this property the output committer needs to handle this appropriately.
60  * Also note it will only be in rare situations where they may be called
61  * multiple times for the same task.
62  *
63  * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
64  * @see JobContext
65  * @see TaskAttemptContext
66  */
67 @InterfaceAudience.Public
68 @InterfaceStability.Stable
69 public abstract class OutputCommitter {
70   /**
71    * For the framework to setup the job output during initialization.  This is
72    * called from the application master process for the entire job. This will be
73    * called multiple times, once per job attempt.
74    *
75    * @param jobContext Context of the job whose output is being written.
76    * @throws IOException if temporary output could not be created
77    */
setupJob(JobContext jobContext)78   public abstract void setupJob(JobContext jobContext) throws IOException;
79 
80   /**
81    * For cleaning up the job's output after job completion.  This is called
82    * from the application master process for the entire job. This may be called
83    * multiple times.
84    *
85    * @param jobContext Context of the job whose output is being written.
86    * @throws IOException
87    * @deprecated Use {@link #commitJob(JobContext)} and
88    *                 {@link #abortJob(JobContext, JobStatus.State)} instead.
89    */
90   @Deprecated
cleanupJob(JobContext jobContext)91   public void cleanupJob(JobContext jobContext) throws IOException { }
92 
93   /**
94    * For committing job's output after successful job completion. Note that this
95    * is invoked for jobs with final runstate as SUCCESSFUL.  This is called
96    * from the application master process for the entire job. This is guaranteed
97    * to only be called once.  If it throws an exception the entire job will
98    * fail.
99    *
100    * @param jobContext Context of the job whose output is being written.
101    * @throws IOException
102    */
commitJob(JobContext jobContext)103   public void commitJob(JobContext jobContext) throws IOException {
104     cleanupJob(jobContext);
105   }
106 
107 
108   /**
109    * For aborting an unsuccessful job's output. Note that this is invoked for
110    * jobs with final runstate as {@link JobStatus.State#FAILED} or
111    * {@link JobStatus.State#KILLED}.  This is called from the application
112    * master process for the entire job. This may be called multiple times.
113    *
114    * @param jobContext Context of the job whose output is being written.
115    * @param state final runstate of the job
116    * @throws IOException
117    */
abortJob(JobContext jobContext, JobStatus.State state)118   public void abortJob(JobContext jobContext, JobStatus.State state)
119   throws IOException {
120     cleanupJob(jobContext);
121   }
122 
123   /**
124    * Sets up output for the task.  This is called from each individual task's
125    * process that will output to HDFS, and it is called just for that task. This
126    * may be called multiple times for the same task, but for different task
127    * attempts.
128    *
129    * @param taskContext Context of the task whose output is being written.
130    * @throws IOException
131    */
setupTask(TaskAttemptContext taskContext)132   public abstract void setupTask(TaskAttemptContext taskContext)
133   throws IOException;
134 
135   /**
136    * Check whether task needs a commit.  This is called from each individual
137    * task's process that will output to HDFS, and it is called just for that
138    * task.
139    *
140    * @param taskContext
141    * @return true/false
142    * @throws IOException
143    */
needsTaskCommit(TaskAttemptContext taskContext)144   public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
145   throws IOException;
146 
147   /**
148    * To promote the task's temporary output to final output location.
149    * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
150    * task is the task that the AM determines finished first, this method
151    * is called to commit an individual task's output.  This is to mark
152    * that tasks output as complete, as {@link #commitJob(JobContext)} will
153    * also be called later on if the entire job finished successfully. This
154    * is called from a task's process. This may be called multiple times for the
155    * same task, but different task attempts.  It should be very rare for this to
156    * be called multiple times and requires odd networking failures to make this
157    * happen. In the future the Hadoop framework may eliminate this race.
158    *
159    * @param taskContext Context of the task whose output is being written.
160    * @throws IOException if commit is not successful.
161    */
commitTask(TaskAttemptContext taskContext)162   public abstract void commitTask(TaskAttemptContext taskContext)
163   throws IOException;
164 
165   /**
166    * Discard the task output. This is called from a task's process to clean
167    * up a single task's output that can not yet been committed. This may be
168    * called multiple times for the same task, but for different task attempts.
169    *
170    * @param taskContext
171    * @throws IOException
172    */
abortTask(TaskAttemptContext taskContext)173   public abstract void abortTask(TaskAttemptContext taskContext)
174   throws IOException;
175 
176   /**
177    * Is task output recovery supported for restarting jobs?
178    *
179    * If task output recovery is supported, job restart can be done more
180    * efficiently.
181    *
182    * @return <code>true</code> if task output recovery is supported,
183    *         <code>false</code> otherwise
184    * @see #recoverTask(TaskAttemptContext)
185    * @deprecated Use {@link #isRecoverySupported(JobContext)} instead.
186    */
187   @Deprecated
isRecoverySupported()188   public boolean isRecoverySupported() {
189     return false;
190   }
191 
192   /**
193    * Is task output recovery supported for restarting jobs?
194    *
195    * If task output recovery is supported, job restart can be done more
196    * efficiently.
197    *
198    * @param jobContext
199    *          Context of the job whose output is being written.
200    * @return <code>true</code> if task output recovery is supported,
201    *         <code>false</code> otherwise
202    * @throws IOException
203    * @see #recoverTask(TaskAttemptContext)
204    */
isRecoverySupported(JobContext jobContext)205   public boolean isRecoverySupported(JobContext jobContext) throws IOException {
206     return isRecoverySupported();
207   }
208 
209   /**
210    * Recover the task output.
211    *
212    * The retry-count for the job will be passed via the
213    * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in
214    * {@link TaskAttemptContext#getConfiguration()} for the
215    * <code>OutputCommitter</code>.  This is called from the application master
216    * process, but it is called individually for each task.
217    *
218    * If an exception is thrown the task will be attempted again.
219    *
220    * This may be called multiple times for the same task.  But from different
221    * application attempts.
222    *
223    * @param taskContext Context of the task whose output is being recovered
224    * @throws IOException
225    */
recoverTask(TaskAttemptContext taskContext)226   public void recoverTask(TaskAttemptContext taskContext)
227   throws IOException
228   {}
229 }
230