001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.mapred.lib;
019    
020    import org.apache.hadoop.classification.InterfaceAudience;
021    import org.apache.hadoop.classification.InterfaceStability;
022    import org.apache.hadoop.fs.FileSystem;
023    import org.apache.hadoop.mapred.*;
024    import org.apache.hadoop.util.Progressable;
025    
026    import java.io.IOException;
027    import java.util.*;
028    
029    /**
030     * The MultipleOutputs class simplifies writting to additional outputs other
031     * than the job default output via the <code>OutputCollector</code> passed to
032     * the <code>map()</code> and <code>reduce()</code> methods of the
033     * <code>Mapper</code> and <code>Reducer</code> implementations.
034     * <p/>
035     * Each additional output, or named output, may be configured with its own
036     * <code>OutputFormat</code>, with its own key class and with its own value
037     * class.
038     * <p/>
039     * A named output can be a single file or a multi file. The later is refered as
040     * a multi named output.
041     * <p/>
042     * A multi named output is an unbound set of files all sharing the same
043     * <code>OutputFormat</code>, key class and value class configuration.
044     * <p/>
045     * When named outputs are used within a <code>Mapper</code> implementation,
046     * key/values written to a name output are not part of the reduce phase, only
047     * key/values written to the job <code>OutputCollector</code> are part of the
048     * reduce phase.
049     * <p/>
050     * MultipleOutputs supports counters, by default the are disabled. The counters
051     * group is the {@link MultipleOutputs} class name.
052     * </p>
053     * The names of the counters are the same as the named outputs. For multi
054     * named outputs the name of the counter is the concatenation of the named
055     * output, and underscore '_' and the multiname.
056     * <p/>
057     * Job configuration usage pattern is:
058     * <pre>
059     *
060     * JobConf conf = new JobConf();
061     *
062     * conf.setInputPath(inDir);
063     * FileOutputFormat.setOutputPath(conf, outDir);
064     *
065     * conf.setMapperClass(MOMap.class);
066     * conf.setReducerClass(MOReduce.class);
067     * ...
068     *
069     * // Defines additional single text based output 'text' for the job
070     * MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
071     * LongWritable.class, Text.class);
072     *
073     * // Defines additional multi sequencefile based output 'sequence' for the
074     * // job
075     * MultipleOutputs.addMultiNamedOutput(conf, "seq",
076     *   SequenceFileOutputFormat.class,
077     *   LongWritable.class, Text.class);
078     * ...
079     *
080     * JobClient jc = new JobClient();
081     * RunningJob job = jc.submitJob(conf);
082     *
083     * ...
084     * </pre>
085     * <p/>
086     * Job configuration usage pattern is:
087     * <pre>
088     *
089     * public class MOReduce implements
090     *   Reducer&lt;WritableComparable, Writable&gt; {
091     * private MultipleOutputs mos;
092     *
093     * public void configure(JobConf conf) {
094     * ...
095     * mos = new MultipleOutputs(conf);
096     * }
097     *
098     * public void reduce(WritableComparable key, Iterator&lt;Writable&gt; values,
099     * OutputCollector output, Reporter reporter)
100     * throws IOException {
101     * ...
102     * mos.getCollector("text", reporter).collect(key, new Text("Hello"));
103     * mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
104     * mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
105     * ...
106     * }
107     *
108     * public void close() throws IOException {
109     * mos.close();
110     * ...
111     * }
112     *
113     * }
114     * </pre>
115     * @deprecated Use 
116     * {@link org.apache.hadoop.mapreduce.lib.output.MultipleOutputs} instead
117     */
118    @Deprecated
119    @InterfaceAudience.Public
120    @InterfaceStability.Stable
121    public class MultipleOutputs {
122    
123      private static final String NAMED_OUTPUTS = "mo.namedOutputs";
124    
125      private static final String MO_PREFIX = "mo.namedOutput.";
126    
127      private static final String FORMAT = ".format";
128      private static final String KEY = ".key";
129      private static final String VALUE = ".value";
130      private static final String MULTI = ".multi";
131    
132      private static final String COUNTERS_ENABLED = "mo.counters";
133    
134      /**
135       * Counters group used by the counters of MultipleOutputs.
136       */
137      private static final String COUNTERS_GROUP = MultipleOutputs.class.getName();
138    
139      /**
140       * Checks if a named output is alreadyDefined or not.
141       *
142       * @param conf           job conf
143       * @param namedOutput    named output names
144       * @param alreadyDefined whether the existence/non-existence of
145       *                       the named output is to be checked
146       * @throws IllegalArgumentException if the output name is alreadyDefined or
147       *                                  not depending on the value of the
148       *                                  'alreadyDefined' parameter
149       */
150      private static void checkNamedOutput(JobConf conf, String namedOutput,
151                                           boolean alreadyDefined) {
152        List<String> definedChannels = getNamedOutputsList(conf);
153        if (alreadyDefined && definedChannels.contains(namedOutput)) {
154          throw new IllegalArgumentException("Named output '" + namedOutput +
155            "' already alreadyDefined");
156        } else if (!alreadyDefined && !definedChannels.contains(namedOutput)) {
157          throw new IllegalArgumentException("Named output '" + namedOutput +
158            "' not defined");
159        }
160      }
161    
162      /**
163       * Checks if a named output name is valid token.
164       *
165       * @param namedOutput named output Name
166       * @throws IllegalArgumentException if the output name is not valid.
167       */
168      private static void checkTokenName(String namedOutput) {
169        if (namedOutput == null || namedOutput.length() == 0) {
170          throw new IllegalArgumentException(
171            "Name cannot be NULL or emtpy");
172        }
173        for (char ch : namedOutput.toCharArray()) {
174          if ((ch >= 'A') && (ch <= 'Z')) {
175            continue;
176          }
177          if ((ch >= 'a') && (ch <= 'z')) {
178            continue;
179          }
180          if ((ch >= '0') && (ch <= '9')) {
181            continue;
182          }
183          throw new IllegalArgumentException(
184            "Name cannot be have a '" + ch + "' char");
185        }
186      }
187    
188      /**
189       * Checks if a named output name is valid.
190       *
191       * @param namedOutput named output Name
192       * @throws IllegalArgumentException if the output name is not valid.
193       */
194      private static void checkNamedOutputName(String namedOutput) {
195        checkTokenName(namedOutput);
196        // name cannot be the name used for the default output
197        if (namedOutput.equals("part")) {
198          throw new IllegalArgumentException(
199            "Named output name cannot be 'part'");
200        }
201      }
202    
203      /**
204       * Returns list of channel names.
205       *
206       * @param conf job conf
207       * @return List of channel Names
208       */
209      public static List<String> getNamedOutputsList(JobConf conf) {
210        List<String> names = new ArrayList<String>();
211        StringTokenizer st = new StringTokenizer(conf.get(NAMED_OUTPUTS, ""), " ");
212        while (st.hasMoreTokens()) {
213          names.add(st.nextToken());
214        }
215        return names;
216      }
217    
218    
219      /**
220       * Returns if a named output is multiple.
221       *
222       * @param conf        job conf
223       * @param namedOutput named output
224       * @return <code>true</code> if the name output is multi, <code>false</code>
225       *         if it is single. If the name output is not defined it returns
226       *         <code>false</code>
227       */
228      public static boolean isMultiNamedOutput(JobConf conf, String namedOutput) {
229        checkNamedOutput(conf, namedOutput, false);
230        return conf.getBoolean(MO_PREFIX + namedOutput + MULTI, false);
231      }
232    
233      /**
234       * Returns the named output OutputFormat.
235       *
236       * @param conf        job conf
237       * @param namedOutput named output
238       * @return namedOutput OutputFormat
239       */
240      public static Class<? extends OutputFormat> getNamedOutputFormatClass(
241        JobConf conf, String namedOutput) {
242        checkNamedOutput(conf, namedOutput, false);
243        return conf.getClass(MO_PREFIX + namedOutput + FORMAT, null,
244          OutputFormat.class);
245      }
246    
247      /**
248       * Returns the key class for a named output.
249       *
250       * @param conf        job conf
251       * @param namedOutput named output
252       * @return class for the named output key
253       */
254      public static Class<?> getNamedOutputKeyClass(JobConf conf,
255                                                    String namedOutput) {
256        checkNamedOutput(conf, namedOutput, false);
257        return conf.getClass(MO_PREFIX + namedOutput + KEY, null,
258            Object.class);
259      }
260    
261      /**
262       * Returns the value class for a named output.
263       *
264       * @param conf        job conf
265       * @param namedOutput named output
266       * @return class of named output value
267       */
268      public static Class<?> getNamedOutputValueClass(JobConf conf,
269                                                      String namedOutput) {
270        checkNamedOutput(conf, namedOutput, false);
271        return conf.getClass(MO_PREFIX + namedOutput + VALUE, null,
272          Object.class);
273      }
274    
275      /**
276       * Adds a named output for the job.
277       * <p/>
278       *
279       * @param conf              job conf to add the named output
280       * @param namedOutput       named output name, it has to be a word, letters
281       *                          and numbers only, cannot be the word 'part' as
282       *                          that is reserved for the
283       *                          default output.
284       * @param outputFormatClass OutputFormat class.
285       * @param keyClass          key class
286       * @param valueClass        value class
287       */
288      public static void addNamedOutput(JobConf conf, String namedOutput,
289                                    Class<? extends OutputFormat> outputFormatClass,
290                                    Class<?> keyClass, Class<?> valueClass) {
291        addNamedOutput(conf, namedOutput, false, outputFormatClass, keyClass,
292          valueClass);
293      }
294    
295      /**
296       * Adds a multi named output for the job.
297       * <p/>
298       *
299       * @param conf              job conf to add the named output
300       * @param namedOutput       named output name, it has to be a word, letters
301       *                          and numbers only, cannot be the word 'part' as
302       *                          that is reserved for the
303       *                          default output.
304       * @param outputFormatClass OutputFormat class.
305       * @param keyClass          key class
306       * @param valueClass        value class
307       */
308      public static void addMultiNamedOutput(JobConf conf, String namedOutput,
309                                   Class<? extends OutputFormat> outputFormatClass,
310                                   Class<?> keyClass, Class<?> valueClass) {
311        addNamedOutput(conf, namedOutput, true, outputFormatClass, keyClass,
312          valueClass);
313      }
314    
315      /**
316       * Adds a named output for the job.
317       * <p/>
318       *
319       * @param conf              job conf to add the named output
320       * @param namedOutput       named output name, it has to be a word, letters
321       *                          and numbers only, cannot be the word 'part' as
322       *                          that is reserved for the
323       *                          default output.
324       * @param multi             indicates if the named output is multi
325       * @param outputFormatClass OutputFormat class.
326       * @param keyClass          key class
327       * @param valueClass        value class
328       */
329      private static void addNamedOutput(JobConf conf, String namedOutput,
330                                   boolean multi,
331                                   Class<? extends OutputFormat> outputFormatClass,
332                                   Class<?> keyClass, Class<?> valueClass) {
333        checkNamedOutputName(namedOutput);
334        checkNamedOutput(conf, namedOutput, true);
335        conf.set(NAMED_OUTPUTS, conf.get(NAMED_OUTPUTS, "") + " " + namedOutput);
336        conf.setClass(MO_PREFIX + namedOutput + FORMAT, outputFormatClass,
337          OutputFormat.class);
338        conf.setClass(MO_PREFIX + namedOutput + KEY, keyClass, Object.class);
339        conf.setClass(MO_PREFIX + namedOutput + VALUE, valueClass, Object.class);
340        conf.setBoolean(MO_PREFIX + namedOutput + MULTI, multi);
341      }
342    
343      /**
344       * Enables or disables counters for the named outputs.
345       * <p/>
346       * By default these counters are disabled.
347       * <p/>
348       * MultipleOutputs supports counters, by default the are disabled.
349       * The counters group is the {@link MultipleOutputs} class name.
350       * </p>
351       * The names of the counters are the same as the named outputs. For multi
352       * named outputs the name of the counter is the concatenation of the named
353       * output, and underscore '_' and the multiname.
354       *
355       * @param conf    job conf to enableadd the named output.
356       * @param enabled indicates if the counters will be enabled or not.
357       */
358      public static void setCountersEnabled(JobConf conf, boolean enabled) {
359        conf.setBoolean(COUNTERS_ENABLED, enabled);
360      }
361    
362      /**
363       * Returns if the counters for the named outputs are enabled or not.
364       * <p/>
365       * By default these counters are disabled.
366       * <p/>
367       * MultipleOutputs supports counters, by default the are disabled.
368       * The counters group is the {@link MultipleOutputs} class name.
369       * </p>
370       * The names of the counters are the same as the named outputs. For multi
371       * named outputs the name of the counter is the concatenation of the named
372       * output, and underscore '_' and the multiname.
373       *
374       *
375       * @param conf    job conf to enableadd the named output.
376       * @return TRUE if the counters are enabled, FALSE if they are disabled.
377       */
378      public static boolean getCountersEnabled(JobConf conf) {
379        return conf.getBoolean(COUNTERS_ENABLED, false);
380      }
381    
382      // instance code, to be used from Mapper/Reducer code
383    
384      private JobConf conf;
385      private OutputFormat outputFormat;
386      private Set<String> namedOutputs;
387      private Map<String, RecordWriter> recordWriters;
388      private boolean countersEnabled;
389    
390      /**
391       * Creates and initializes multiple named outputs support, it should be
392       * instantiated in the Mapper/Reducer configure method.
393       *
394       * @param job the job configuration object
395       */
396      public MultipleOutputs(JobConf job) {
397        this.conf = job;
398        outputFormat = new InternalFileOutputFormat();
399        namedOutputs = Collections.unmodifiableSet(
400          new HashSet<String>(MultipleOutputs.getNamedOutputsList(job)));
401        recordWriters = new HashMap<String, RecordWriter>();
402        countersEnabled = getCountersEnabled(job);
403      }
404    
405      /**
406       * Returns iterator with the defined name outputs.
407       *
408       * @return iterator with the defined named outputs
409       */
410      public Iterator<String> getNamedOutputs() {
411        return namedOutputs.iterator();
412      }
413    
414    
415      // by being synchronized MultipleOutputTask can be use with a
416      // MultithreaderMapRunner.
417      private synchronized RecordWriter getRecordWriter(String namedOutput,
418                                                        String baseFileName,
419                                                        final Reporter reporter)
420        throws IOException {
421        RecordWriter writer = recordWriters.get(baseFileName);
422        if (writer == null) {
423          if (countersEnabled && reporter == null) {
424            throw new IllegalArgumentException(
425              "Counters are enabled, Reporter cannot be NULL");
426          }
427          JobConf jobConf = new JobConf(conf);
428          jobConf.set(InternalFileOutputFormat.CONFIG_NAMED_OUTPUT, namedOutput);
429          FileSystem fs = FileSystem.get(conf);
430          writer =
431            outputFormat.getRecordWriter(fs, jobConf, baseFileName, reporter);
432    
433          if (countersEnabled) {
434            if (reporter == null) {
435              throw new IllegalArgumentException(
436                "Counters are enabled, Reporter cannot be NULL");
437            }
438            writer = new RecordWriterWithCounter(writer, baseFileName, reporter);
439          }
440    
441          recordWriters.put(baseFileName, writer);
442        }
443        return writer;
444      }
445    
446      private static class RecordWriterWithCounter implements RecordWriter {
447        private RecordWriter writer;
448        private String counterName;
449        private Reporter reporter;
450    
451        public RecordWriterWithCounter(RecordWriter writer, String counterName,
452                                       Reporter reporter) {
453          this.writer = writer;
454          this.counterName = counterName;
455          this.reporter = reporter;
456        }
457    
458        @SuppressWarnings({"unchecked"})
459        public void write(Object key, Object value) throws IOException {
460          reporter.incrCounter(COUNTERS_GROUP, counterName, 1);
461          writer.write(key, value);
462        }
463    
464        public void close(Reporter reporter) throws IOException {
465          writer.close(reporter);
466        }
467      }
468    
469      /**
470       * Gets the output collector for a named output.
471       * <p/>
472       *
473       * @param namedOutput the named output name
474       * @param reporter    the reporter
475       * @return the output collector for the given named output
476       * @throws IOException thrown if output collector could not be created
477       */
478      @SuppressWarnings({"unchecked"})
479      public OutputCollector getCollector(String namedOutput, Reporter reporter)
480        throws IOException {
481        return getCollector(namedOutput, null, reporter);
482      }
483    
484      /**
485       * Gets the output collector for a multi named output.
486       * <p/>
487       *
488       * @param namedOutput the named output name
489       * @param multiName   the multi name part
490       * @param reporter    the reporter
491       * @return the output collector for the given named output
492       * @throws IOException thrown if output collector could not be created
493       */
494      @SuppressWarnings({"unchecked"})
495      public OutputCollector getCollector(String namedOutput, String multiName,
496                                          Reporter reporter)
497        throws IOException {
498    
499        checkNamedOutputName(namedOutput);
500        if (!namedOutputs.contains(namedOutput)) {
501          throw new IllegalArgumentException("Undefined named output '" +
502            namedOutput + "'");
503        }
504        boolean multi = isMultiNamedOutput(conf, namedOutput);
505    
506        if (!multi && multiName != null) {
507          throw new IllegalArgumentException("Name output '" + namedOutput +
508            "' has not been defined as multi");
509        }
510        if (multi) {
511          checkTokenName(multiName);
512        }
513    
514        String baseFileName = (multi) ? namedOutput + "_" + multiName : namedOutput;
515    
516        final RecordWriter writer =
517          getRecordWriter(namedOutput, baseFileName, reporter);
518    
519        return new OutputCollector() {
520    
521          @SuppressWarnings({"unchecked"})
522          public void collect(Object key, Object value) throws IOException {
523            writer.write(key, value);
524          }
525    
526        };
527      }
528    
529      /**
530       * Closes all the opened named outputs.
531       * <p/>
532       * If overriden subclasses must invoke <code>super.close()</code> at the
533       * end of their <code>close()</code>
534       *
535       * @throws java.io.IOException thrown if any of the MultipleOutput files
536       *                             could not be closed properly.
537       */
538      public void close() throws IOException {
539        for (RecordWriter writer : recordWriters.values()) {
540          writer.close(null);
541        }
542      }
543    
544      private static class InternalFileOutputFormat extends
545        FileOutputFormat<Object, Object> {
546    
547        public static final String CONFIG_NAMED_OUTPUT = "mo.config.namedOutput";
548    
549        @SuppressWarnings({"unchecked"})
550        public RecordWriter<Object, Object> getRecordWriter(
551          FileSystem fs, JobConf job, String baseFileName, Progressable progress)
552          throws IOException {
553    
554          String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null);
555          String fileName = getUniqueName(job, baseFileName);
556    
557          // The following trick leverages the instantiation of a record writer via
558          // the job conf thus supporting arbitrary output formats.
559          JobConf outputConf = new JobConf(job);
560          outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput));
561          outputConf.setOutputKeyClass(getNamedOutputKeyClass(job, nameOutput));
562          outputConf.setOutputValueClass(getNamedOutputValueClass(job, nameOutput));
563          OutputFormat outputFormat = outputConf.getOutputFormat();
564          return outputFormat.getRecordWriter(fs, outputConf, fileName, progress);
565        }
566      }
567    
568    }