001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapred;
020    
021    import java.io.IOException;
022    import java.net.URI;
023    
024    import org.apache.commons.logging.Log;
025    import org.apache.commons.logging.LogFactory;
026    import org.apache.hadoop.classification.InterfaceAudience;
027    import org.apache.hadoop.classification.InterfaceStability;
028    import org.apache.hadoop.fs.FileStatus;
029    import org.apache.hadoop.fs.FileSystem;
030    import org.apache.hadoop.fs.Path;
031    
032    /** An {@link OutputCommitter} that commits files specified 
033     * in job output directory i.e. ${mapreduce.output.fileoutputformat.outputdir}. 
034     **/
035    @InterfaceAudience.Public
036    @InterfaceStability.Stable
037    public class FileOutputCommitter extends OutputCommitter {
038    
039      public static final Log LOG = LogFactory.getLog(
040          "org.apache.hadoop.mapred.FileOutputCommitter");
041      
042      /**
043       * Temporary directory name 
044       */
045      public static final String TEMP_DIR_NAME = "_temporary";
046      public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";
047      static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER = 
048        "mapreduce.fileoutputcommitter.marksuccessfuljobs";
049    
050      public void setupJob(JobContext context) throws IOException {
051        JobConf conf = context.getJobConf();
052        Path outputPath = FileOutputFormat.getOutputPath(conf);
053        if (outputPath != null) {
054          Path tmpDir = 
055              new Path(outputPath, getJobAttemptBaseDirName(context) + 
056                  Path.SEPARATOR + FileOutputCommitter.TEMP_DIR_NAME);
057          FileSystem fileSys = tmpDir.getFileSystem(conf);
058          if (!fileSys.mkdirs(tmpDir)) {
059            LOG.error("Mkdirs failed to create " + tmpDir.toString());
060          }
061        }
062      }
063    
064      // True if the job requires output.dir marked on successful job.
065      // Note that by default it is set to true.
066      private boolean shouldMarkOutputDir(JobConf conf) {
067        return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true);
068      }
069      
070      public void commitJob(JobContext context) throws IOException {
071        //delete the task temp directory from the current jobtempdir
072        JobConf conf = context.getJobConf();
073        Path outputPath = FileOutputFormat.getOutputPath(conf);
074        if (outputPath != null) {
075          FileSystem outputFileSystem = outputPath.getFileSystem(conf);
076          Path tmpDir = new Path(outputPath, getJobAttemptBaseDirName(context) +
077              Path.SEPARATOR + FileOutputCommitter.TEMP_DIR_NAME);
078          FileSystem fileSys = tmpDir.getFileSystem(context.getConfiguration());
079          if (fileSys.exists(tmpDir)) {
080            fileSys.delete(tmpDir, true);
081          } else {
082            LOG.warn("Task temp dir could not be deleted " + tmpDir);
083          }
084    
085          //move the job output to final place
086          Path jobOutputPath = 
087              new Path(outputPath, getJobAttemptBaseDirName(context));
088          moveJobOutputs(outputFileSystem, 
089              jobOutputPath, outputPath, jobOutputPath);
090    
091          // delete the _temporary folder in the output folder
092          cleanupJob(context);
093          // check if the output-dir marking is required
094          if (shouldMarkOutputDir(context.getJobConf())) {
095            // create a _success file in the output folder
096            markOutputDirSuccessful(context);
097          }
098        }
099      }
100      
101      // Create a _success file in the job's output folder
102      private void markOutputDirSuccessful(JobContext context) throws IOException {
103        JobConf conf = context.getJobConf();
104        // get the o/p path
105        Path outputPath = FileOutputFormat.getOutputPath(conf);
106        if (outputPath != null) {
107          // get the filesys
108          FileSystem fileSys = outputPath.getFileSystem(conf);
109          // create a file in the output folder to mark the job completion
110          Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
111          fileSys.create(filePath).close();
112        }
113      }
114    
115      private void moveJobOutputs(FileSystem fs, final Path origJobOutputPath,
116          Path finalOutputDir, Path jobOutput) throws IOException {
117        LOG.debug("Told to move job output from " + jobOutput
118            + " to " + finalOutputDir + 
119            " and orig job output path is " + origJobOutputPath);  
120        if (fs.isFile(jobOutput)) {
121          Path finalOutputPath = 
122              getFinalPath(fs, finalOutputDir, jobOutput, origJobOutputPath);
123          if (!fs.rename(jobOutput, finalOutputPath)) {
124            if (!fs.delete(finalOutputPath, true)) {
125              throw new IOException("Failed to delete earlier output of job");
126            }
127            if (!fs.rename(jobOutput, finalOutputPath)) {
128              throw new IOException("Failed to save output of job");
129            }
130          }
131          LOG.debug("Moved job output file from " + jobOutput + " to " + 
132              finalOutputPath);
133        } else if (fs.getFileStatus(jobOutput).isDirectory()) {
134          LOG.debug("Job output file " + jobOutput + " is a dir");      
135          FileStatus[] paths = fs.listStatus(jobOutput);
136          Path finalOutputPath = 
137              getFinalPath(fs, finalOutputDir, jobOutput, origJobOutputPath);
138          fs.mkdirs(finalOutputPath);
139          LOG.debug("Creating dirs along job output path " + finalOutputPath);
140          if (paths != null) {
141            for (FileStatus path : paths) {
142              moveJobOutputs(fs, origJobOutputPath, finalOutputDir, path.getPath());
143            }
144          }
145        }
146      }
147      
148      @Override
149      @Deprecated
150      public void cleanupJob(JobContext context) throws IOException {
151        JobConf conf = context.getJobConf();
152        // do the clean up of temporary directory
153        Path outputPath = FileOutputFormat.getOutputPath(conf);
154        if (outputPath != null) {
155          Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
156          FileSystem fileSys = tmpDir.getFileSystem(conf);
157          context.getProgressible().progress();
158          if (fileSys.exists(tmpDir)) {
159            fileSys.delete(tmpDir, true);
160          } else {
161            LOG.warn("Output Path is Null in cleanup");
162          }
163        }
164      }
165    
166      @Override
167      public void abortJob(JobContext context, int runState) 
168      throws IOException {
169        // simply delete the _temporary dir from the o/p folder of the job
170        cleanupJob(context);
171      }
172      
173      public void setupTask(TaskAttemptContext context) throws IOException {
174        // FileOutputCommitter's setupTask doesn't do anything. Because the
175        // temporary task directory is created on demand when the 
176        // task is writing.
177      }
178                      
179      public void commitTask(TaskAttemptContext context) 
180      throws IOException {
181        Path taskOutputPath = getTempTaskOutputPath(context);
182        TaskAttemptID attemptId = context.getTaskAttemptID();
183        JobConf job = context.getJobConf();
184        if (taskOutputPath != null) {
185          FileSystem fs = taskOutputPath.getFileSystem(job);
186          context.getProgressible().progress();
187          if (fs.exists(taskOutputPath)) {
188            // Move the task outputs to the current job attempt output dir
189            JobConf conf = context.getJobConf();
190            Path outputPath = FileOutputFormat.getOutputPath(conf);
191            FileSystem outputFileSystem = outputPath.getFileSystem(conf);
192            Path jobOutputPath = new Path(outputPath, getJobTempDirName(context));
193            moveTaskOutputs(context, outputFileSystem, jobOutputPath, 
194                taskOutputPath);
195    
196            // Delete the temporary task-specific output directory
197            if (!fs.delete(taskOutputPath, true)) {
198              LOG.info("Failed to delete the temporary output" + 
199              " directory of task: " + attemptId + " - " + taskOutputPath);
200            }
201            LOG.info("Saved output of task '" + attemptId + "' to " + 
202                     jobOutputPath);
203          }
204        }
205      }
206                      
207      private void moveTaskOutputs(TaskAttemptContext context,
208                                   FileSystem fs,
209                                   Path jobOutputDir,
210                                   Path taskOutput) 
211      throws IOException {
212        TaskAttemptID attemptId = context.getTaskAttemptID();
213        context.getProgressible().progress();
214        LOG.debug("Told to move taskoutput from " + taskOutput
215            + " to " + jobOutputDir);    
216        if (fs.isFile(taskOutput)) {
217          Path finalOutputPath = getFinalPath(fs, jobOutputDir, taskOutput, 
218                                              getTempTaskOutputPath(context));
219          if (!fs.rename(taskOutput, finalOutputPath)) {
220            if (!fs.delete(finalOutputPath, true)) {
221              throw new IOException("Failed to delete earlier output of task: " + 
222                                     attemptId);
223            }
224            if (!fs.rename(taskOutput, finalOutputPath)) {
225              throw new IOException("Failed to save output of task: " + 
226                              attemptId);
227            }
228          }
229          LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
230        } else if(fs.getFileStatus(taskOutput).isDirectory()) {
231          LOG.debug("Taskoutput " + taskOutput + " is a dir");
232          FileStatus[] paths = fs.listStatus(taskOutput);
233          Path finalOutputPath = getFinalPath(fs, jobOutputDir, taskOutput, 
234                      getTempTaskOutputPath(context));
235          fs.mkdirs(finalOutputPath);
236          LOG.debug("Creating dirs along path " + finalOutputPath);
237          if (paths != null) {
238            for (FileStatus path : paths) {
239              moveTaskOutputs(context, fs, jobOutputDir, path.getPath());
240            }
241          }
242        }
243      }
244    
245      public void abortTask(TaskAttemptContext context) throws IOException {
246        Path taskOutputPath =  getTempTaskOutputPath(context);
247        if (taskOutputPath != null) {
248          FileSystem fs = taskOutputPath.getFileSystem(context.getJobConf());
249          context.getProgressible().progress();
250          fs.delete(taskOutputPath, true);
251        }
252      }
253    
254      @SuppressWarnings("deprecation")
255      private Path getFinalPath(FileSystem fs, Path jobOutputDir, Path taskOutput, 
256                                Path taskOutputPath) throws IOException {
257        URI taskOutputUri = taskOutput.makeQualified(fs).toUri();
258        URI taskOutputPathUri = taskOutputPath.makeQualified(fs).toUri();
259        URI relativePath = taskOutputPathUri.relativize(taskOutputUri);
260        if (taskOutputUri == relativePath) { 
261          //taskOutputPath is not a parent of taskOutput
262          throw new IOException("Can not get the relative path: base = " + 
263              taskOutputPathUri + " child = " + taskOutputUri);
264        }
265        if (relativePath.getPath().length() > 0) {
266          return new Path(jobOutputDir, relativePath.getPath());
267        } else {
268          return jobOutputDir;
269        }
270      }
271    
272      public boolean needsTaskCommit(TaskAttemptContext context) 
273      throws IOException {
274        Path taskOutputPath = getTempTaskOutputPath(context);
275        if (taskOutputPath != null) {
276          context.getProgressible().progress();
277          // Get the file-system for the task output directory
278          FileSystem fs = taskOutputPath.getFileSystem(context.getJobConf());
279          // since task output path is created on demand, 
280          // if it exists, task needs a commit
281          if (fs.exists(taskOutputPath)) {
282            return true;
283          }
284        }
285        return false;
286      }
287    
288      Path getTempTaskOutputPath(TaskAttemptContext taskContext) 
289          throws IOException {
290        JobConf conf = taskContext.getJobConf();
291        Path outputPath = FileOutputFormat.getOutputPath(conf);
292        if (outputPath != null) {
293          Path p = new Path(outputPath,
294                         (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
295                          "_" + taskContext.getTaskAttemptID().toString()));
296          FileSystem fs = p.getFileSystem(conf);
297          return p.makeQualified(fs);
298        }
299        return null;
300      }
301      
302      Path getWorkPath(TaskAttemptContext taskContext, Path basePath) 
303      throws IOException {
304        // ${mapred.out.dir}/_temporary
305        Path jobTmpDir = new Path(basePath, FileOutputCommitter.TEMP_DIR_NAME);
306        FileSystem fs = jobTmpDir.getFileSystem(taskContext.getJobConf());
307        if (!fs.exists(jobTmpDir)) {
308          throw new IOException("The temporary job-output directory " + 
309              jobTmpDir.toString() + " doesn't exist!"); 
310        }
311        // ${mapred.out.dir}/_temporary/_${taskid}
312        String taskid = taskContext.getTaskAttemptID().toString();
313        Path taskTmpDir = new Path(jobTmpDir, "_" + taskid);
314        if (!fs.mkdirs(taskTmpDir)) {
315          throw new IOException("Mkdirs failed to create " 
316              + taskTmpDir.toString());
317        }
318        return taskTmpDir;
319      }
320      
321      @Override
322      public boolean isRecoverySupported() {
323        return true;
324      }
325      
326      @Override
327      public void recoverTask(TaskAttemptContext context)
328          throws IOException {
329        Path outputPath = FileOutputFormat.getOutputPath(context.getJobConf());
330        context.progress();
331        Path jobOutputPath = new Path(outputPath, getJobTempDirName(context));
332        int previousAttempt =         
333            context.getConfiguration().getInt(
334                MRConstants.APPLICATION_ATTEMPT_ID, 0) - 1;
335        if (previousAttempt < 0) {
336          LOG.warn("Cannot recover task output for first attempt...");
337          return;
338        }
339    
340        FileSystem outputFileSystem = 
341            outputPath.getFileSystem(context.getJobConf());
342        Path pathToRecover = 
343            new Path(outputPath, getJobAttemptBaseDirName(previousAttempt));
344        if (outputFileSystem.exists(pathToRecover)) {
345          // Move the task outputs to their final place
346          LOG.debug("Trying to recover task from " + pathToRecover
347              + " into " + jobOutputPath);
348          moveJobOutputs(outputFileSystem, 
349              pathToRecover, jobOutputPath, pathToRecover);
350          LOG.info("Saved output of job to " + jobOutputPath);
351        }
352      }
353    
354      protected static String getJobAttemptBaseDirName(JobContext context) {
355        int appAttemptId = 
356            context.getJobConf().getInt(
357                MRConstants.APPLICATION_ATTEMPT_ID, 0);
358        return getJobAttemptBaseDirName(appAttemptId);
359      }
360    
361      protected static String getJobTempDirName(TaskAttemptContext context) {
362        int appAttemptId = 
363            context.getJobConf().getInt(
364                MRConstants.APPLICATION_ATTEMPT_ID, 0);
365        return getJobAttemptBaseDirName(appAttemptId);
366      }
367    
368      protected static String getJobAttemptBaseDirName(int appAttemptId) {
369        return FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR + 
370          + appAttemptId;
371      }
372    
373      protected static String getTaskAttemptBaseDirName(
374          TaskAttemptContext context) {
375        return getJobTempDirName(context) + Path.SEPARATOR + 
376          FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
377          "_" + context.getTaskAttemptID().toString();
378      }
379    }