/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.pipes;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLClassLoader;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.StringTokenizer;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.Parser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

/**
 * The main entry point and job submitter for Pipes jobs. It can be used
 * either as a command-line tool or through its API to launch Pipes jobs.
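 *
 * <p>A minimal API-based submission might look like the following sketch;
 * the input, output, and executable paths are illustrative placeholders:
 * <pre>{@code
 * JobConf conf = new JobConf();
 * FileInputFormat.setInputPaths(conf, new Path("in"));
 * FileOutputFormat.setOutputPath(conf, new Path("out"));
 * Submitter.setExecutable(conf, "hdfs://namenode:8020/user/me/bin/wordcount");
 * Submitter.setIsJavaRecordReader(conf, true);
 * Submitter.setIsJavaRecordWriter(conf, true);
 * RunningJob job = Submitter.runJob(conf);
 * }</pre>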
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Submitter extends Configured implements Tool {

  protected static final Log LOG = LogFactory.getLog(Submitter.class);
  public static final String PRESERVE_COMMANDFILE =
    "mapreduce.pipes.commandfile.preserve";
  public static final String EXECUTABLE = "mapreduce.pipes.executable";
  public static final String INTERPRETOR =
    "mapreduce.pipes.executable.interpretor";
  public static final String IS_JAVA_MAP = "mapreduce.pipes.isjavamapper";
  public static final String IS_JAVA_RR = "mapreduce.pipes.isjavarecordreader";
  public static final String IS_JAVA_RW = "mapreduce.pipes.isjavarecordwriter";
  public static final String IS_JAVA_REDUCE = "mapreduce.pipes.isjavareducer";
  public static final String PARTITIONER = "mapreduce.pipes.partitioner";
  public static final String INPUT_FORMAT = "mapreduce.pipes.inputformat";
  public static final String PORT = "mapreduce.pipes.command.port";

  public Submitter() {
    this(new Configuration());
  }

  public Submitter(Configuration conf) {
    setConf(conf);
  }

  /**
   * Get the URI of the application's executable.
   * @param conf the configuration to check
   * @return the URI where the application's executable is located
   */
  public static String getExecutable(JobConf conf) {
    return conf.get(Submitter.EXECUTABLE);
  }

  /**
   * Set the URI for the application's executable. Normally this is an hdfs:
   * location.
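   *
   * <p>For example (the path and the optional #symlink suffix after it are
   * illustrative):
   * <pre>{@code
   * Submitter.setExecutable(conf,
   *     "hdfs://namenode:8020/user/me/bin/wordcount#wordcount");
   * }</pre>
   *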
   * @param conf the configuration to modify
   * @param executable the URI of the application's executable
   */
  public static void setExecutable(JobConf conf, String executable) {
    conf.set(Submitter.EXECUTABLE, executable);
  }

  /**
   * Set whether the job is using a Java RecordReader.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaRecordReader(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_RR, value);
  }

  /**
   * Check whether the job is using a Java RecordReader.
   * @param conf the configuration to check
   * @return is it a Java RecordReader?
   */
  public static boolean getIsJavaRecordReader(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_RR, false);
  }

  /**
   * Set whether the Mapper is written in Java.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaMapper(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_MAP, value);
  }

  /**
   * Check whether the job is using a Java Mapper.
   * @param conf the configuration to check
   * @return is it a Java Mapper?
   */
  public static boolean getIsJavaMapper(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_MAP, false);
  }

  /**
   * Set whether the Reducer is written in Java.
   * @param conf the configuration to modify
   * @param value the new value
   */
  public static void setIsJavaReducer(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_REDUCE, value);
  }

  /**
   * Check whether the job is using a Java Reducer.
   * @param conf the configuration to check
   * @return is it a Java Reducer?
   */
  public static boolean getIsJavaReducer(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_REDUCE, false);
  }

  /**
   * Set whether the job will use a Java RecordWriter.
   * @param conf the configuration to modify
   * @param value the new value to set
   */
  public static void setIsJavaRecordWriter(JobConf conf, boolean value) {
    conf.setBoolean(Submitter.IS_JAVA_RW, value);
  }

  /**
   * Will the reduce use a Java RecordWriter?
   * @param conf the configuration to check
   * @return true, if the output of the job will be written by Java
   */
  public static boolean getIsJavaRecordWriter(JobConf conf) {
    return conf.getBoolean(Submitter.IS_JAVA_RW, false);
  }

  /**
   * Set the configuration, if it doesn't already have a value for the given
   * key.
   * @param conf the configuration to modify
   * @param key the key to set
   * @param value the new "default" value to set
   */
  private static void setIfUnset(JobConf conf, String key, String value) {
    if (conf.get(key) == null) {
      conf.set(key, value);
    }
  }

  /**
   * Save away the user's original partitioner before we override it.
   * @param conf the configuration to modify
   * @param cls the user's partitioner class
   */
  static void setJavaPartitioner(JobConf conf, Class<?> cls) {
    conf.set(Submitter.PARTITIONER, cls.getName());
  }

  /**
   * Get the user's original partitioner.
   * @param conf the configuration to look in
   * @return the class that the user submitted
   */
  static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
    return conf.getClass(Submitter.PARTITIONER,
                         HashPartitioner.class,
                         Partitioner.class);
  }

  /**
   * Does the user want to keep the command file for debugging? If this is
   * true, pipes will write a copy of the command data to a file in the
   * task directory named "downlink.data", which may be used to run the C++
   * program under the debugger. You probably also want to set
   * JobConf.setKeepFailedTaskFiles(true) to keep the entire directory from
   * being deleted.
   * To run using the data file, set the environment variable
   * "mapreduce.pipes.commandfile" to point to the file.
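   *
   * <p>For example (a sketch):
   * <pre>{@code
   * JobConf conf = new JobConf();
   * Submitter.setKeepCommandFile(conf, true);
   * conf.setKeepFailedTaskFiles(true);
   * }</pre>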
   * @param conf the configuration to check
   * @return will the framework save the command file?
   */
  public static boolean getKeepCommandFile(JobConf conf) {
    return conf.getBoolean(Submitter.PRESERVE_COMMANDFILE, false);
  }

  /**
   * Set whether to keep the command file for debugging.
   * @param conf the configuration to modify
   * @param keep the new value
   */
  public static void setKeepCommandFile(JobConf conf, boolean keep) {
    conf.setBoolean(Submitter.PRESERVE_COMMANDFILE, keep);
  }

  /**
   * Submit a job to the map/reduce cluster. All of the necessary modifications
   * to the job to run under pipes are made to the configuration.
   * @param conf the job to submit to the cluster (MODIFIED)
   * @throws IOException
   * @deprecated Use {@link Submitter#runJob(JobConf)}
   */
  @Deprecated
  public static RunningJob submitJob(JobConf conf) throws IOException {
    return runJob(conf);
  }

  /**
   * Submit a job to the map/reduce cluster. All of the necessary modifications
   * to the job to run under pipes are made to the configuration.
   * @param conf the job to submit to the cluster (MODIFIED)
   * @throws IOException
   */
  public static RunningJob runJob(JobConf conf) throws IOException {
    setupPipesJob(conf);
    return JobClient.runJob(conf);
  }

  /**
   * Submit a job to the Map-Reduce framework.
   * This returns a handle to the {@link RunningJob} which can be used to track
   * the running job.
   *
   * @param conf the job configuration.
   * @return a handle to the {@link RunningJob} which can be used to track the
   *         running job.
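   *
   * <p>For example, a non-blocking submission might poll like this sketch:
   * <pre>{@code
   * RunningJob running = Submitter.jobSubmit(conf);
   * while (!running.isComplete()) {
   *   Thread.sleep(1000);
   * }
   * }</pre>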
   * @throws IOException
   */
  public static RunningJob jobSubmit(JobConf conf) throws IOException {
    setupPipesJob(conf);
    return new JobClient(conf).submitJob(conf);
  }

  private static void setupPipesJob(JobConf conf) throws IOException {
    if (!getIsJavaMapper(conf)) {
      conf.setMapRunnerClass(PipesMapRunner.class);
      // Save the user's partitioner and hook in ours.
      setJavaPartitioner(conf, conf.getPartitionerClass());
      conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
      conf.setReducerClass(PipesReducer.class);
      if (!getIsJavaRecordWriter(conf)) {
        conf.setOutputFormat(NullOutputFormat.class);
      }
    }
    // default the map output and job output types to Text
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
      conf.setClass(Submitter.INPUT_FORMAT,
                    conf.getInputFormat().getClass(), InputFormat.class);
      conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
      throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
      DistributedCache.createSymlink(conf);
      // set default gdb commands for map and reduce task
      String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
      setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
      setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
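    // Prepend the executable's URI to the distributed cache file list so the
    // framework ships it to each task; slot 0 is reserved for the executable.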
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
      fileCache = new URI[1];
    } else {
      URI[] tmp = new URI[fileCache.length + 1];
      System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
      fileCache = tmp;
    }
    try {
      fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
      IOException ie =
          new IOException("Problem parsing executable URI " + exec);
      ie.initCause(e);
      throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
  }

  /**
   * A command line parser for the CLI-based Pipes job submitter.
   */
  static class CommandLineParser {
    private Options options = new Options();

    void addOption(String longName, boolean required, String description,
                   String paramName) {
      Option option = OptionBuilder.withArgName(paramName).hasArgs(1)
          .withDescription(description).isRequired(required).create(longName);
      options.addOption(option);
    }

    void addArgument(String name, boolean required, String description) {
      Option option = OptionBuilder.withArgName(name).hasArgs(1)
          .withDescription(description).isRequired(required).create();
      options.addOption(option);
    }

    Parser createParser() {
      return new BasicParser();
    }

    void printUsage() {
      // The CLI package should do this for us, but I can't figure out how
      // to make it print something reasonable.
      System.out.println("bin/hadoop pipes");
      System.out.println("  [-input <path>] // Input directory");
      System.out.println("  [-output <path>] // Output directory");
      System.out.println("  [-jar <jar file>] // jar filename");
      System.out.println("  [-inputformat <class>] // InputFormat class");
      System.out.println("  [-map <class>] // Java Map class");
      System.out.println("  [-partitioner <class>] // Java Partitioner");
      System.out.println("  [-reduce <class>] // Java Reduce class");
      System.out.println("  [-writer <class>] // Java RecordWriter");
      System.out.println("  [-program <executable>] // executable URI");
      System.out.println("  [-reduces <num>] // number of reduces");
      System.out.println("  [-lazyOutput <true/false>] // create output lazily");
      System.out.println();
      GenericOptionsParser.printGenericCommandUsage(System.out);
    }
  }

  private static <InterfaceType>
  Class<? extends InterfaceType> getClass(CommandLine cl, String key,
                                          JobConf conf,
                                          Class<InterfaceType> cls
                                         ) throws ClassNotFoundException {
    return conf.getClassByName(cl.getOptionValue(key)).asSubclass(cls);
  }

  @Override
  public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
      cli.printUsage();
      return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");

    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat",
                  "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner",
                  "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable",
                  "executable");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
        "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.",
        "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily",
                  "boolean");
    Parser parser = cli.createParser();
    try {

      GenericOptionsParser genericParser =
          new GenericOptionsParser(getConf(), args);
      CommandLine results =
          parser.parse(cli.options, genericParser.getRemainingArgs());

      JobConf job = new JobConf(getConf());

      if (results.hasOption("input")) {
        FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
      }
      if (results.hasOption("output")) {
        FileOutputFormat.setOutputPath(job,
            new Path(results.getOptionValue("output")));
      }
      if (results.hasOption("jar")) {
        job.setJar(results.getOptionValue("jar"));
      }
      if (results.hasOption("inputformat")) {
        setIsJavaRecordReader(job, true);
        job.setInputFormat(getClass(results, "inputformat", job,
                                    InputFormat.class));
      }
      if (results.hasOption("javareader")) {
        setIsJavaRecordReader(job, true);
      }
      if (results.hasOption("map")) {
        setIsJavaMapper(job, true);
        job.setMapperClass(getClass(results, "map", job, Mapper.class));
      }
      if (results.hasOption("partitioner")) {
        job.setPartitionerClass(getClass(results, "partitioner", job,
                                         Partitioner.class));
      }
      if (results.hasOption("reduce")) {
        setIsJavaReducer(job, true);
        job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
      }
      if (results.hasOption("reduces")) {
        job.setNumReduceTasks(
            Integer.parseInt(results.getOptionValue("reduces")));
      }
      if (results.hasOption("writer")) {
        setIsJavaRecordWriter(job, true);
        job.setOutputFormat(getClass(results, "writer", job,
                                     OutputFormat.class));
      }

      if (results.hasOption("lazyOutput")) {
        if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
          LazyOutputFormat.setOutputFormatClass(job,
              job.getOutputFormat().getClass());
        }
      }

      if (results.hasOption("program")) {
        setExecutable(job, results.getOptionValue("program"));
      }
      if (results.hasOption("jobconf")) {
        LOG.warn("-jobconf option is deprecated, please use -D instead.");
        String options = results.getOptionValue("jobconf");
        StringTokenizer tokenizer = new StringTokenizer(options, ",");
        while (tokenizer.hasMoreTokens()) {
          String keyVal = tokenizer.nextToken().trim();
          // split on the first '=' only, so values may themselves contain '='
          String[] keyValSplit = keyVal.split("=", 2);
          job.set(keyValSplit[0], keyValSplit[1]);
        }
      }
      // if they gave us a jar file, include it into the class path
      String jarFile = job.getJar();
      if (jarFile != null) {
        final URL[] urls = new URL[]{ FileSystem.getLocal(job).
            pathToFile(new Path(jarFile)).toURI().toURL() };
        // FindBugs complains that creating a URLClassLoader should be
        // in a doPrivileged() block.
        ClassLoader loader =
            AccessController.doPrivileged(
                new PrivilegedAction<ClassLoader>() {
                  public ClassLoader run() {
                    return new URLClassLoader(urls);
                  }
                });
        job.setClassLoader(loader);
      }

      runJob(job);
      return 0;
    } catch (ParseException pe) {
      LOG.error("Error: " + pe);
      cli.printUsage();
      return 1;
    }
  }

  /**
   * Submit a pipes job based on the command line arguments.
   * @param args the command line arguments for the pipes job
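   *
   * <p>For example, an invocation might look like this sketch, where the
   * paths are illustrative placeholders:
   * <pre>
   * bin/hadoop pipes -input in -output out \
   *     -program hdfs://namenode:8020/user/me/bin/wordcount
   * </pre>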
   */
  public static void main(String[] args) throws Exception {
    int exitCode = new Submitter().run(args);
    System.exit(exitCode);
  }

}