/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.pipes;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.Parser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

/**
 * The main entry point and job submitter. It can be used to launch Pipes
 * jobs either from the command line or programmatically through the API.
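 * <p>
 * For example, an API-based submission might look like the following
 * sketch; the executable URI and input/output paths are illustrative:
 * <pre>{@code
 * JobConf conf = new JobConf();
 * conf.setJobName("pipes-wordcount");
 * FileInputFormat.setInputPaths(conf, new Path("/user/me/input"));
 * FileOutputFormat.setOutputPath(conf, new Path("/user/me/output"));
 * Submitter.setExecutable(conf, "hdfs:///apps/bin/wordcount");
 * Submitter.setIsJavaRecordReader(conf, true);
 * Submitter.setIsJavaRecordWriter(conf, true);
 * RunningJob job = Submitter.runJob(conf); // blocks until completion
 * }</pre>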
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Submitter extends Configured implements Tool {

  protected static final Log LOG = LogFactory.getLog(Submitter.class);
  public static final String PRESERVE_COMMANDFILE =
      "mapreduce.pipes.commandfile.preserve";
  public static final String EXECUTABLE = "mapreduce.pipes.executable";
  public static final String INTERPRETOR =
      "mapreduce.pipes.executable.interpretor";
  public static final String IS_JAVA_MAP = "mapreduce.pipes.isjavamapper";
  public static final String IS_JAVA_RR = "mapreduce.pipes.isjavarecordreader";
  public static final String IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter";
  public static final String IS_JAVA_REDUCE = "mapreduce.pipes.isjavareducer";
  public static final String PARTITIONER = "mapreduce.pipes.partitioner";
  public static final String INPUT_FORMAT = "mapreduce.pipes.inputformat";
  public static final String PORT = "mapreduce.pipes.command.port";

  public Submitter() {
    this(new Configuration());
  }

  public Submitter(Configuration conf) {
    setConf(conf);
  }

  /**
   * Get the URI of the application's executable.
   * @param conf the configuration to check
   * @return the URI where the application's executable is located
   */
  public static String getExecutable(JobConf conf) {
    return conf.get(Submitter.EXECUTABLE);
  }

  /**
   * Set the URI for the application's executable. Normally this is an
   * hdfs: location.
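   * If the URI uses the {@code <path>#<linkname>} fragment form, the
   * executable is symlinked into the task's working directory under the
   * link name; for example (the path here is purely illustrative):
   * <pre>{@code
   * Submitter.setExecutable(conf, "hdfs:///apps/bin/wordcount#wordcount");
   * }</pre>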
   * @param conf the configuration to modify
   * @param executable The URI of the application's executable.
   */
  public static void setExecutable(JobConf conf, String executable) {
    conf.set(Submitter.EXECUTABLE, executable);
  }

  /**
   * Set whether the job is using a Java RecordReader.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaRecordReader(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_RR, value);
  }

  /**
   * Check whether the job is using a Java RecordReader.
   * @param conf the configuration to check
   * @return is it a Java RecordReader?
   */
  public static boolean getIsJavaRecordReader(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_RR, false);
  }

  /**
   * Set whether the Mapper is written in Java.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaMapper(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_MAP, value);
  }

  /**
   * Check whether the job is using a Java Mapper.
   * @param conf the configuration to check
   * @return is it a Java Mapper?
   */
  public static boolean getIsJavaMapper(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_MAP, false);
  }

  /**
   * Set whether the Reducer is written in Java.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaReducer(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_REDUCE, value);
  }

  /**
   * Check whether the job is using a Java Reducer.
   * @param conf the configuration to check
   * @return is it a Java Reducer?
   */
  public static boolean getIsJavaReducer(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_REDUCE, false);
  }

  /**
   * Set whether the job will use a Java RecordWriter.
   * @param conf the configuration to modify
   * @param value the new value to set
   */
  public static void setIsJavaRecordWriter(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_RW, value);
  }

  /**
   * Will the reduce use a Java RecordWriter?
   * @param conf the configuration to check
   * @return true, if the output of the job will be written by Java
   */
  public static boolean getIsJavaRecordWriter(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_RW, false);
  }

  /**
   * Set the configuration, if it doesn't already have a value for the given
   * key.
   * @param conf the configuration to modify
   * @param key the key to set
   * @param value the new "default" value to set
   */
  private static void setIfUnset(JobConf conf, String key, String value) {
    if (conf.get(key) == null) {
      conf.set(key, value);
    }
  }

  /**
   * Save away the user's original partitioner before we override it.
   * @param conf the configuration to modify
   * @param cls the user's partitioner class
   */
  static void setJavaPartitioner(JobConf conf, Class<?> cls) {
    conf.set(Submitter.PARTITIONER, cls.getName());
  }

  /**
   * Get the user's original partitioner.
   * @param conf the configuration to look in
   * @return the class that the user submitted
   */
  static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
    return conf.getClass(Submitter.PARTITIONER,
                         HashPartitioner.class,
                         Partitioner.class);
  }

  /**
   * Does the user want to keep the command file for debugging? If this is
   * true, pipes will write a copy of the command data to a file in the
   * task directory named "downlink.data", which may be used to run the C++
   * program under the debugger. You probably also want to set
   * JobConf.setKeepFailedTaskFiles(true) to keep the entire directory from
   * being deleted.
   * To run using the data file, set the environment variable
   * "mapreduce.pipes.commandfile" to point to the file.
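   * A typical debugging setup might look like this sketch, which simply
   * combines the two settings described above:
   * <pre>{@code
   * Submitter.setKeepCommandFile(conf, true);
   * conf.setKeepFailedTaskFiles(true); // keep the task directory as well
   * }</pre>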
   * @param conf the configuration to check
   * @return will the framework save the command file?
   */
  public static boolean getKeepCommandFile(JobConf conf) {
    return conf.getBoolean(Submitter.PRESERVE_COMMANDFILE, false);
  }

  /**
   * Set whether to keep the command file for debugging.
   * @param conf the configuration to modify
   * @param keep the new value
   */
  public static void setKeepCommandFile(JobConf conf, boolean keep) {
    conf.setBoolean(Submitter.PRESERVE_COMMANDFILE, keep);
  }

  /**
   * Submit a job to the map/reduce cluster. All of the necessary modifications
   * to the job to run under pipes are made to the configuration.
   * @param conf the job to submit to the cluster (MODIFIED)
   * @throws IOException
   * @deprecated Use {@link Submitter#runJob(JobConf)}
   */
  @Deprecated
  public static RunningJob submitJob(JobConf conf) throws IOException {
    return runJob(conf);
  }

  /**
   * Submit a job to the map/reduce cluster. All of the necessary modifications
   * to the job to run under pipes are made to the configuration.
   * @param conf the job to submit to the cluster (MODIFIED)
   * @throws IOException
   */
  public static RunningJob runJob(JobConf conf) throws IOException {
    setupPipesJob(conf);
    return JobClient.runJob(conf);
  }

  /**
   * Submit a job to the Map-Reduce framework.
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running-job.
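   * <p>
   * Unlike {@link #runJob(JobConf)}, this call does not block until the job
   * finishes; a caller might poll the handle instead, as in this sketch:
   * <pre>{@code
   * RunningJob running = Submitter.jobSubmit(conf);
   * while (!running.isComplete()) {
   *   Thread.sleep(5000);
   * }
   * }</pre>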
   * @param conf the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running-job.
   * @throws IOException
   */
  public static RunningJob jobSubmit(JobConf conf) throws IOException {
    setupPipesJob(conf);
    return new JobClient(conf).submitJob(conf);
  }

  private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
      conf.setMapRunnerClass(PipesMapRunner.class);
      // Save the user's partitioner and hook in ours.
      setJavaPartitioner(conf, conf.getPartitionerClass());
      conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
      conf.setReducerClass(PipesReducer.class);
      if (!getIsJavaRecordWriter(conf)) {
        conf.setOutputFormat(NullOutputFormat.class);
      }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
      conf.setClass(Submitter.INPUT_FORMAT,
                    conf.getInputFormat().getClass(), InputFormat.class);
      conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
      throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
      DistributedCache.createSymlink(conf);
      // set default gdb commands for map and reduce task
      String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
      setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
      setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
      fileCache = new URI[1];
    } else {
      URI[] tmp = new URI[fileCache.length + 1];
      System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
      fileCache = tmp;
    }
    try {
      fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
      IOException ie =
          new IOException("Problem parsing executable URI " + exec);
      ie.initCause(e);
      throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
  }

  /**
   * A command line parser for the CLI-based Pipes job submitter.
   */
  static class CommandLineParser {
    private Options options = new Options();

    void addOption(String longName, boolean required, String description,
                   String paramName) {
      Option option = OptionBuilder.withArgName(paramName).hasArgs(1)
          .withDescription(description).isRequired(required).create(longName);
      options.addOption(option);
    }

    void addArgument(String name, boolean required, String description) {
      Option option = OptionBuilder.withArgName(name).hasArgs(1)
          .withDescription(description).isRequired(required).create();
      options.addOption(option);
    }

    Parser createParser() {
      Parser result = new BasicParser();
      return result;
    }

    void printUsage() {
      // The CLI package should do this for us, but I can't figure out how
      // to make it print something reasonable.
      System.out.println("bin/hadoop pipes");
      System.out.println("  [-input <path>] // Input directory");
      System.out.println("  [-output <path>] // Output directory");
      System.out.println("  [-jar <jar file>] // jar filename");
      System.out.println("  [-inputformat <class>] // InputFormat class");
      System.out.println("  [-map <class>] // Java Map class");
      System.out.println("  [-partitioner <class>] // Java Partitioner");
      System.out.println("  [-reduce <class>] // Java Reduce class");
      System.out.println("  [-writer <class>] // Java RecordWriter");
      System.out.println("  [-program <executable>] // executable URI");
      System.out.println("  [-reduces <num>] // number of reduces");
      System.out.println("  [-lazyOutput <true/false>] // create output lazily");
      System.out.println();
      GenericOptionsParser.printGenericCommandUsage(System.out);
    }
  }

  private static <InterfaceType>
  Class<? extends InterfaceType> getClass(CommandLine cl, String key,
                                          JobConf conf,
                                          Class<InterfaceType> cls
                                          ) throws ClassNotFoundException {
    return conf.getClassByName(cl.getOptionValue(key)).asSubclass(cls);
  }
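  /**
   * Parse the given command line and submit the Pipes job. A typical
   * invocation (all paths here are illustrative) might look like:
   * <pre>{@code
   * bin/hadoop pipes -input /user/me/input -output /user/me/output \
   *     -program hdfs:///apps/bin/wordcount#wordcount
   * }</pre>
   * @param args the command line arguments to parse
   * @return 0 on success, 1 on a usage or parse error
   */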
  @Override
  public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
      cli.printUsage();
      return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");

    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat",
                  "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner",
                  "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "path");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
        "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.",
        "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily",
                  "boolean");
    Parser parser = cli.createParser();
    try {

      GenericOptionsParser genericParser =
          new GenericOptionsParser(getConf(), args);
      CommandLine results =
          parser.parse(cli.options, genericParser.getRemainingArgs());

      JobConf job = new JobConf(getConf());

      if (results.hasOption("input")) {
        FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
      }
      if (results.hasOption("output")) {
        FileOutputFormat.setOutputPath(job,
            new Path(results.getOptionValue("output")));
      }
      if (results.hasOption("jar")) {
        job.setJar(results.getOptionValue("jar"));
      }
      if (results.hasOption("inputformat")) {
        setIsJavaRecordReader(job, true);
        job.setInputFormat(getClass(results, "inputformat", job,
                                    InputFormat.class));
      }
      if (results.hasOption("javareader")) {
        setIsJavaRecordReader(job, true);
      }
      if (results.hasOption("map")) {
        setIsJavaMapper(job, true);
        job.setMapperClass(getClass(results, "map", job, Mapper.class));
      }
      if (results.hasOption("partitioner")) {
        job.setPartitionerClass(getClass(results, "partitioner", job,
                                         Partitioner.class));
      }
      if (results.hasOption("reduce")) {
        setIsJavaReducer(job, true);
        job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
      }
      if (results.hasOption("reduces")) {
        job.setNumReduceTasks(Integer.parseInt(
            results.getOptionValue("reduces")));
      }
      if (results.hasOption("writer")) {
        setIsJavaRecordWriter(job, true);
        job.setOutputFormat(getClass(results, "writer", job,
                                     OutputFormat.class));
      }

      if (results.hasOption("lazyOutput")) {
        if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
          LazyOutputFormat.setOutputFormatClass(job,
              job.getOutputFormat().getClass());
        }
      }

      if (results.hasOption("program")) {
        setExecutable(job, results.getOptionValue("program"));
      }
      if (results.hasOption("jobconf")) {
        LOG.warn("-jobconf option is deprecated, please use -D instead.");
        String options = results.getOptionValue("jobconf");
        StringTokenizer tokenizer = new StringTokenizer(options, ",");
        while (tokenizer.hasMoreTokens()) {
          String keyVal = tokenizer.nextToken().trim();
          // split on the first '=' only, so values may themselves contain '='
          String[] keyValSplit = keyVal.split("=", 2);
          if (keyValSplit.length == 2) {
            job.set(keyValSplit[0], keyValSplit[1]);
          } else {
            LOG.warn("Skipping malformed -jobconf entry: " + keyVal);
          }
        }
      }
      // if they gave us a jar file, include it into the class path
      String jarFile = job.getJar();
      if (jarFile != null) {
        final URL[] urls = new URL[]{ FileSystem.getLocal(job).
            pathToFile(new Path(jarFile)).toURI().toURL() };
        // FindBugs complains that creating a URLClassLoader should be
        // in a doPrivileged() block.
        ClassLoader loader =
            AccessController.doPrivileged(
                new PrivilegedAction<ClassLoader>() {
                  public ClassLoader run() {
                    return new URLClassLoader(urls);
                  }
                }
            );
        job.setClassLoader(loader);
      }

      runJob(job);
      return 0;
    } catch (ParseException pe) {
      LOG.info("Error: " + pe);
      cli.printUsage();
      return 1;
    }

  }

  /**
   * Submit a pipes job based on the command line arguments.
   * @param args the command line arguments
   * @throws Exception on any failure during job submission
   */
  public static void main(String[] args) throws Exception {
    int exitCode = new Submitter().run(args);
    System.exit(exitCode);
  }

}