/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.URI;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** An {@link OutputCommitter} that commits files specified
 * in job output directory i.e. ${mapreduce.output.fileoutputformat.outputdir}.
 *
 * <p>Directory layout used by this class, relative to the job output
 * directory:
 * <ul>
 *   <li><code>_temporary/_&lt;taskAttemptId&gt;</code> - work directory of a
 *       task attempt (see {@link #getTempTaskOutputPath});</li>
 *   <li><code>_temporary/&lt;appAttemptId&gt;</code> - committed task output of
 *       one application attempt (see {@link #getJobAttemptBaseDirName(int)});
 *       </li>
 *   <li><code>_SUCCESS</code> - optional marker written by
 *       {@link #commitJob}.</li>
 * </ul>
 **/
@InterfaceAudience.Public
@InterfaceStability.Stable
public class FileOutputCommitter extends OutputCommitter {

  public static final Log LOG = LogFactory.getLog(
      "org.apache.hadoop.mapred.FileOutputCommitter");

  /**
   * Temporary directory name
   */
  public static final String TEMP_DIR_NAME = "_temporary";
  /** Name of the empty marker file created when a job commits. */
  public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";
  /** Configuration key that switches the marker file on or off. */
  static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
    "mapreduce.fileoutputcommitter.marksuccessfuljobs";

  /**
   * Creates the directory
   * <code>&lt;output&gt;/_temporary/&lt;appAttemptId&gt;/_temporary</code>.
   * Does nothing when the job has no output path. A failed
   * <code>mkdirs</code> is only logged, not raised.
   *
   * @param context the job being set up
   * @throws IOException if the file system cannot be reached
   */
  public void setupJob(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
      Path tmpDir =
          new Path(outputPath, getJobAttemptBaseDirName(context) +
              Path.SEPARATOR + FileOutputCommitter.TEMP_DIR_NAME);
      FileSystem fileSys = tmpDir.getFileSystem(conf);
      if (!fileSys.mkdirs(tmpDir)) {
        LOG.error("Mkdirs failed to create " + tmpDir.toString());
      }
    }
  }

  // True if the job requires output.dir marked on successful job.
  // Note that by default it is set to true.
  private boolean shouldMarkOutputDir(JobConf conf) {
    return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true);
  }

  /**
   * Promotes the output of the current application attempt into the job
   * output directory, removes the <code>_temporary</code> tree and, when
   * enabled in the configuration, writes the {@link #SUCCEEDED_FILE_NAME}
   * marker. Does nothing when the job has no output path.
   *
   * @param context the job being committed
   * @throws IOException if moving or deleting output fails
   */
  public void commitJob(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
      FileSystem outputFileSystem = outputPath.getFileSystem(conf);

      // delete the task temp directory from the current jobtempdir so that
      // it is not promoted together with the real output below
      Path tmpDir = new Path(outputPath, getJobAttemptBaseDirName(context) +
          Path.SEPARATOR + FileOutputCommitter.TEMP_DIR_NAME);
      FileSystem fileSys = tmpDir.getFileSystem(context.getConfiguration());
      // Warn both when the directory is missing and when the delete itself
      // reports failure (the latter used to be silently ignored).
      if (!fileSys.exists(tmpDir) || !fileSys.delete(tmpDir, true)) {
        LOG.warn("Task temp dir could not be deleted " + tmpDir);
      }

      // move the job output to final place
      Path jobOutputPath =
          new Path(outputPath, getJobAttemptBaseDirName(context));
      moveJobOutputs(outputFileSystem,
          jobOutputPath, outputPath, jobOutputPath);

      // delete the _temporary folder in the output folder
      cleanupJob(context);
      // check if the output-dir marking is required
      if (shouldMarkOutputDir(context.getJobConf())) {
        // create a _success file in the output folder
        markOutputDirSuccessful(context);
      }
    }
  }

  // Create a _success file in the job's output folder
  private void markOutputDirSuccessful(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();
    // get the o/p path
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
      // get the filesys
      FileSystem fileSys = outputPath.getFileSystem(conf);
      // create a file in the output folder to mark the job completion
      Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
      fileSys.create(filePath).close();
    }
  }

  /**
   * Recursively moves <code>jobOutput</code> into
   * <code>finalOutputDir</code>, keeping its position relative to
   * <code>origJobOutputPath</code>.
   *
   * @param fs file system holding both source and destination
   * @param origJobOutputPath root of the tree being moved; used to compute
   *        relative paths
   * @param finalOutputDir destination root
   * @param jobOutput file or directory currently being moved
   * @throws IOException if a file can be neither renamed nor, after deleting
   *         an existing destination, renamed again
   */
  private void moveJobOutputs(FileSystem fs, final Path origJobOutputPath,
      Path finalOutputDir, Path jobOutput) throws IOException {
    LOG.debug("Told to move job output from " + jobOutput
        + " to " + finalOutputDir +
        " and orig job output path is " + origJobOutputPath);
    if (fs.isFile(jobOutput)) {
      Path finalOutputPath =
          getFinalPath(fs, finalOutputDir, jobOutput, origJobOutputPath);
      if (!fs.rename(jobOutput, finalOutputPath)) {
        // The rename failed: remove whatever is at the destination and
        // try exactly once more.
        if (!fs.delete(finalOutputPath, true)) {
          throw new IOException("Failed to delete earlier output of job");
        }
        if (!fs.rename(jobOutput, finalOutputPath)) {
          throw new IOException("Failed to save output of job");
        }
      }
      LOG.debug("Moved job output file from " + jobOutput + " to " +
          finalOutputPath);
    } else if (fs.getFileStatus(jobOutput).isDirectory()) {
      LOG.debug("Job output file " + jobOutput + " is a dir");
      FileStatus[] paths = fs.listStatus(jobOutput);
      Path finalOutputPath =
          getFinalPath(fs, finalOutputDir, jobOutput, origJobOutputPath);
      fs.mkdirs(finalOutputPath);
      LOG.debug("Creating dirs along job output path " + finalOutputPath);
      if (paths != null) {
        for (FileStatus path : paths) {
          moveJobOutputs(fs, origJobOutputPath, finalOutputDir, path.getPath());
        }
      }
    }
  }

  /**
   * Deletes <code>&lt;output&gt;/_temporary</code> if it exists. Logs a
   * warning when the job has no output path.
   *
   * @param context the job being cleaned up
   * @throws IOException if the file system cannot be reached or the delete
   *         raises an error
   */
  @Override
  @Deprecated
  public void cleanupJob(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();
    // do the clean up of temporary directory
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
      Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
      FileSystem fileSys = tmpDir.getFileSystem(conf);
      context.getProgressible().progress();
      if (fileSys.exists(tmpDir)) {
        fileSys.delete(tmpDir, true);
      }
    } else {
      // This warning describes a missing output path, so it belongs to the
      // null check rather than to the "temp dir does not exist" case.
      LOG.warn("Output Path is Null in cleanup");
    }
  }

  /**
   * Aborts the job by removing its <code>_temporary</code> directory.
   *
   * @param context the job being aborted
   * @param runState final run state of the job; not used here
   * @throws IOException if the clean up fails
   */
  @Override
  public void abortJob(JobContext context, int runState)
  throws IOException {
    // simply delete the _temporary dir from the o/p folder of the job
    cleanupJob(context);
  }

  public void setupTask(TaskAttemptContext context) throws IOException {
    // FileOutputCommitter's setupTask doesn't do anything. Because the
    // temporary task directory is created on demand when the
    // task is writing.
  }

  /**
   * Moves everything under the task attempt's temporary directory into the
   * directory of the current application attempt and then removes the
   * temporary directory. Does nothing when the job has no output path or
   * the task produced no output.
   *
   * @param context the task attempt being committed
   * @throws IOException if a task output file cannot be moved
   */
  public void commitTask(TaskAttemptContext context)
  throws IOException {
    Path taskOutputPath = getTempTaskOutputPath(context);
    TaskAttemptID attemptId = context.getTaskAttemptID();
    JobConf job = context.getJobConf();
    if (taskOutputPath != null) {
      FileSystem fs = taskOutputPath.getFileSystem(job);
      context.getProgressible().progress();
      if (fs.exists(taskOutputPath)) {
        // Move the task outputs to the current job attempt output dir
        Path outputPath = FileOutputFormat.getOutputPath(job);
        FileSystem outputFileSystem = outputPath.getFileSystem(job);
        Path jobOutputPath = new Path(outputPath, getJobTempDirName(context));
        moveTaskOutputs(context, outputFileSystem, jobOutputPath,
            taskOutputPath);

        // Delete the temporary task-specific output directory
        if (!fs.delete(taskOutputPath, true)) {
          LOG.info("Failed to delete the temporary output" +
              " directory of task: " + attemptId + " - " + taskOutputPath);
        }
        LOG.info("Saved output of task '" + attemptId + "' to " +
            jobOutputPath);
      }
    }
  }

  /**
   * Recursively moves <code>taskOutput</code> into
   * <code>jobOutputDir</code>, keeping its position relative to the task
   * attempt's temporary directory.
   *
   * @param context the task attempt whose output is moved
   * @param fs file system holding both source and destination
   * @param jobOutputDir destination root
   * @param taskOutput file or directory currently being moved
   * @throws IOException if a file can be neither renamed nor, after deleting
   *         an existing destination, renamed again
   */
  private void moveTaskOutputs(TaskAttemptContext context,
                               FileSystem fs,
                               Path jobOutputDir,
                               Path taskOutput)
  throws IOException {
    TaskAttemptID attemptId = context.getTaskAttemptID();
    context.getProgressible().progress();
    LOG.debug("Told to move taskoutput from " + taskOutput
        + " to " + jobOutputDir);
    if (fs.isFile(taskOutput)) {
      Path finalOutputPath = getFinalPath(fs, jobOutputDir, taskOutput,
                                          getTempTaskOutputPath(context));
      if (!fs.rename(taskOutput, finalOutputPath)) {
        // The rename failed: remove whatever is at the destination and
        // try exactly once more.
        if (!fs.delete(finalOutputPath, true)) {
          throw new IOException("Failed to delete earlier output of task: " +
                                 attemptId);
        }
        if (!fs.rename(taskOutput, finalOutputPath)) {
          throw new IOException("Failed to save output of task: " +
              attemptId);
        }
      }
      LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDirectory()) {
      LOG.debug("Taskoutput " + taskOutput + " is a dir");
      FileStatus[] paths = fs.listStatus(taskOutput);
      Path finalOutputPath = getFinalPath(fs, jobOutputDir, taskOutput,
          getTempTaskOutputPath(context));
      fs.mkdirs(finalOutputPath);
      LOG.debug("Creating dirs along path " + finalOutputPath);
      if (paths != null) {
        for (FileStatus path : paths) {
          moveTaskOutputs(context, fs, jobOutputDir, path.getPath());
        }
      }
    }
  }

  /**
   * Discards the output of a task attempt by deleting its temporary
   * directory. Does nothing when the job has no output path.
   *
   * @param context the task attempt being aborted
   * @throws IOException if the delete raises an error
   */
  public void abortTask(TaskAttemptContext context) throws IOException {
    Path taskOutputPath = getTempTaskOutputPath(context);
    if (taskOutputPath != null) {
      FileSystem fs = taskOutputPath.getFileSystem(context.getJobConf());
      context.getProgressible().progress();
      fs.delete(taskOutputPath, true);
    }
  }

  /**
   * Maps <code>taskOutput</code>, which must live under
   * <code>taskOutputPath</code>, to the corresponding location under
   * <code>jobOutputDir</code>.
   *
   * @param fs file system used to qualify both paths
   * @param jobOutputDir destination root
   * @param taskOutput path to translate
   * @param taskOutputPath source root that <code>taskOutput</code> is
   *        relative to
   * @return <code>jobOutputDir</code> itself when both paths are the same,
   *         otherwise <code>jobOutputDir</code> plus the relative part
   * @throws IOException if <code>taskOutput</code> is not located under
   *         <code>taskOutputPath</code>
   */
  @SuppressWarnings("deprecation")
  private Path getFinalPath(FileSystem fs, Path jobOutputDir, Path taskOutput,
                            Path taskOutputPath) throws IOException {
    URI taskOutputUri = taskOutput.makeQualified(fs).toUri();
    URI taskOutputPathUri = taskOutputPath.makeQualified(fs).toUri();
    URI relativePath = taskOutputPathUri.relativize(taskOutputUri);
    // Identity comparison is intentional: URI.relativize is documented to
    // return the given URI itself when it cannot be relativized.
    if (taskOutputUri == relativePath) {
      //taskOutputPath is not a parent of taskOutput
      throw new IOException("Can not get the relative path: base = " +
          taskOutputPathUri + " child = " + taskOutputUri);
    }
    if (relativePath.getPath().length() > 0) {
      return new Path(jobOutputDir, relativePath.getPath());
    } else {
      return jobOutputDir;
    }
  }

  /**
   * Tells whether a task attempt has output to commit.
   *
   * @param context the task attempt to inspect
   * @return <code>true</code> if the attempt's temporary output directory
   *         exists, <code>false</code> otherwise or when the job has no
   *         output path
   * @throws IOException if the file system cannot be reached
   */
  public boolean needsTaskCommit(TaskAttemptContext context)
  throws IOException {
    Path taskOutputPath = getTempTaskOutputPath(context);
    if (taskOutputPath != null) {
      context.getProgressible().progress();
      // Get the file-system for the task output directory
      FileSystem fs = taskOutputPath.getFileSystem(context.getJobConf());
      // since task output path is created on demand,
      // if it exists, task needs a commit
      if (fs.exists(taskOutputPath)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Computes the qualified path
   * <code>&lt;output&gt;/_temporary/_&lt;taskAttemptId&gt;</code>.
   *
   * @param taskContext the task attempt
   * @return the temporary output path of the attempt, or <code>null</code>
   *         when the job has no output path
   * @throws IOException if the file system cannot be obtained
   */
  Path getTempTaskOutputPath(TaskAttemptContext taskContext)
      throws IOException {
    JobConf conf = taskContext.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
      Path p = new Path(outputPath,
                     (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
                      "_" + taskContext.getTaskAttemptID().toString()));
      FileSystem fs = p.getFileSystem(conf);
      return p.makeQualified(fs);
    }
    return null;
  }

  /**
   * Creates (if necessary) and returns the work directory
   * <code>&lt;basePath&gt;/_temporary/_&lt;taskAttemptId&gt;</code>.
   *
   * @param taskContext the task attempt
   * @param basePath the job output directory
   * @return the work directory of the task attempt
   * @throws IOException if <code>&lt;basePath&gt;/_temporary</code> does not
   *         exist or the work directory cannot be created
   */
  Path getWorkPath(TaskAttemptContext taskContext, Path basePath)
  throws IOException {
    // ${mapred.out.dir}/_temporary
    Path jobTmpDir = new Path(basePath, FileOutputCommitter.TEMP_DIR_NAME);
    FileSystem fs = jobTmpDir.getFileSystem(taskContext.getJobConf());
    if (!fs.exists(jobTmpDir)) {
      throw new IOException("The temporary job-output directory " +
          jobTmpDir.toString() + " doesn't exist!");
    }
    // ${mapred.out.dir}/_temporary/_${taskid}
    String taskid = taskContext.getTaskAttemptID().toString();
    Path taskTmpDir = new Path(jobTmpDir, "_" + taskid);
    if (!fs.mkdirs(taskTmpDir)) {
      throw new IOException("Mkdirs failed to create "
          + taskTmpDir.toString());
    }
    return taskTmpDir;
  }

  @Override
  public boolean isRecoverySupported() {
    return true;
  }

  /**
   * Moves the output stored under the previous application attempt's
   * directory into the directory of the current application attempt.
   * Does nothing when the job has no output path, when this is the first
   * application attempt, or when the previous attempt left no directory.
   *
   * @param context the task attempt being recovered
   * @throws IOException if moving the recovered output fails
   */
  @Override
  public void recoverTask(TaskAttemptContext context)
      throws IOException {
    Path outputPath = FileOutputFormat.getOutputPath(context.getJobConf());
    context.progress();
    if (outputPath == null) {
      // Nothing to recover without an output directory; the other methods
      // of this class skip their work in the same way.
      return;
    }
    Path jobOutputPath = new Path(outputPath, getJobTempDirName(context));
    int previousAttempt =
        context.getConfiguration().getInt(
            MRConstants.APPLICATION_ATTEMPT_ID, 0) - 1;
    if (previousAttempt < 0) {
      LOG.warn("Cannot recover task output for first attempt...");
      return;
    }

    FileSystem outputFileSystem =
        outputPath.getFileSystem(context.getJobConf());
    Path pathToRecover =
        new Path(outputPath, getJobAttemptBaseDirName(previousAttempt));
    if (outputFileSystem.exists(pathToRecover)) {
      // Move the task outputs to their final place
      LOG.debug("Trying to recover task from " + pathToRecover
          + " into " + jobOutputPath);
      moveJobOutputs(outputFileSystem,
          pathToRecover, jobOutputPath, pathToRecover);
      LOG.info("Saved output of job to " + jobOutputPath);
    }
  }

  /**
   * @param context the job
   * @return <code>_temporary/&lt;appAttemptId&gt;</code>, with the attempt id
   *         read from the job configuration (0 when unset)
   */
  protected static String getJobAttemptBaseDirName(JobContext context) {
    int appAttemptId =
        context.getJobConf().getInt(
            MRConstants.APPLICATION_ATTEMPT_ID, 0);
    return getJobAttemptBaseDirName(appAttemptId);
  }

  /**
   * @param context the task attempt
   * @return <code>_temporary/&lt;appAttemptId&gt;</code>, with the attempt id
   *         read from the job configuration (0 when unset)
   */
  protected static String getJobTempDirName(TaskAttemptContext context) {
    int appAttemptId =
        context.getJobConf().getInt(
            MRConstants.APPLICATION_ATTEMPT_ID, 0);
    return getJobAttemptBaseDirName(appAttemptId);
  }

  /**
   * @param appAttemptId the application attempt id
   * @return <code>_temporary/&lt;appAttemptId&gt;</code>
   */
  protected static String getJobAttemptBaseDirName(int appAttemptId) {
    return FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
      appAttemptId;
  }

  /**
   * @param context the task attempt
   * @return
   *   <code>_temporary/&lt;appAttemptId&gt;/_temporary/_&lt;taskAttemptId&gt;</code>
   */
  protected static String getTaskAttemptBaseDirName(
      TaskAttemptContext context) {
    return getJobTempDirName(context) + Path.SEPARATOR +
      FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
      "_" + context.getTaskAttemptID().toString();
  }
}