001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapred.lib;
020    
021    import java.io.IOException;
022    import java.util.ArrayList;
023    import java.util.Iterator;
024    import java.util.List;
025    
026    import org.apache.commons.logging.Log;
027    import org.apache.commons.logging.LogFactory;
028    import org.apache.hadoop.classification.InterfaceAudience;
029    import org.apache.hadoop.classification.InterfaceStability;
030    import org.apache.hadoop.io.Text;
031    import org.apache.hadoop.mapred.JobConf;
032    import org.apache.hadoop.mapred.Mapper;
033    import org.apache.hadoop.mapred.OutputCollector;
034    import org.apache.hadoop.mapred.Reducer;
035    import org.apache.hadoop.mapred.Reporter;
036    import org.apache.hadoop.mapred.TextInputFormat;
037    import org.apache.hadoop.mapreduce.lib.fieldsel.*;
038    
039    /**
040     * This class implements a mapper/reducer class that can be used to perform
041     * field selections in a manner similar to unix cut. The input data is treated
042     * as fields separated by a user specified separator (the default value is
043     * "\t"). The user can specify a list of fields that form the map output keys,
044     * and a list of fields that form the map output values. If the inputformat is
045     * TextInputFormat, the mapper will ignore the key to the map function. and the
046     * fields are from the value only. Otherwise, the fields are the union of those
047     * from the key and those from the value.
048     * 
049     * The field separator is under attribute "mapreduce.fieldsel.data.field.separator"
050     * 
051     * The map output field list spec is under attribute 
052     * "mapreduce.fieldsel.map.output.key.value.fields.spec".
053     * The value is expected to be like "keyFieldsSpec:valueFieldsSpec"
054     * key/valueFieldsSpec are comma (,) separated field spec: fieldSpec,fieldSpec,fieldSpec ...
055     * Each field spec can be a simple number (e.g. 5) specifying a specific field, or a range
056     * (like 2-5) to specify a range of fields, or an open range (like 3-) specifying all 
057     * the fields starting from field 3. The open range field spec applies value fields only.
058     * They have no effect on the key fields.
059     * 
060     * Here is an example: "4,3,0,1:6,5,1-3,7-". It specifies to use fields 4,3,0 and 1 for keys,
061     * and use fields 6,5,1,2,3,7 and above for values.
062     * 
063     * The reduce output field list spec is under attribute 
064     * "mapreduce.fieldsel.reduce.output.key.value.fields.spec".
065     * 
066     * The reducer extracts output key/value pairs in a similar manner, except that
067     * the key is never ignored.
068     * @deprecated Use {@link FieldSelectionMapper} and 
069     * {@link FieldSelectionReducer} instead
070     */
071    @Deprecated
072    @InterfaceAudience.Public
073    @InterfaceStability.Stable
074    public class FieldSelectionMapReduce<K, V>
075        implements Mapper<K, V, Text, Text>, Reducer<Text, Text, Text, Text> {
076    
077      private String mapOutputKeyValueSpec;
078    
079      private boolean ignoreInputKey;
080    
081      private String fieldSeparator = "\t";
082    
083      private List<Integer> mapOutputKeyFieldList = new ArrayList<Integer>();
084    
085      private List<Integer> mapOutputValueFieldList = new ArrayList<Integer>();
086    
087      private int allMapValueFieldsFrom = -1;
088    
089      private String reduceOutputKeyValueSpec;
090    
091      private List<Integer> reduceOutputKeyFieldList = new ArrayList<Integer>();
092    
093      private List<Integer> reduceOutputValueFieldList = new ArrayList<Integer>();
094    
095      private int allReduceValueFieldsFrom = -1;
096    
097    
098      public static final Log LOG = LogFactory.getLog("FieldSelectionMapReduce");
099    
100      private String specToString() {
101        StringBuffer sb = new StringBuffer();
102        sb.append("fieldSeparator: ").append(fieldSeparator).append("\n");
103    
104        sb.append("mapOutputKeyValueSpec: ").append(mapOutputKeyValueSpec).append(
105            "\n");
106        sb.append("reduceOutputKeyValueSpec: ").append(reduceOutputKeyValueSpec)
107            .append("\n");
108    
109        sb.append("allMapValueFieldsFrom: ").append(allMapValueFieldsFrom).append(
110            "\n");
111    
112        sb.append("allReduceValueFieldsFrom: ").append(allReduceValueFieldsFrom)
113            .append("\n");
114    
115        int i = 0;
116    
117        sb.append("mapOutputKeyFieldList.length: ").append(
118            mapOutputKeyFieldList.size()).append("\n");
119        for (i = 0; i < mapOutputKeyFieldList.size(); i++) {
120          sb.append("\t").append(mapOutputKeyFieldList.get(i)).append("\n");
121        }
122        sb.append("mapOutputValueFieldList.length: ").append(
123            mapOutputValueFieldList.size()).append("\n");
124        for (i = 0; i < mapOutputValueFieldList.size(); i++) {
125          sb.append("\t").append(mapOutputValueFieldList.get(i)).append("\n");
126        }
127    
128        sb.append("reduceOutputKeyFieldList.length: ").append(
129            reduceOutputKeyFieldList.size()).append("\n");
130        for (i = 0; i < reduceOutputKeyFieldList.size(); i++) {
131          sb.append("\t").append(reduceOutputKeyFieldList.get(i)).append("\n");
132        }
133        sb.append("reduceOutputValueFieldList.length: ").append(
134            reduceOutputValueFieldList.size()).append("\n");
135        for (i = 0; i < reduceOutputValueFieldList.size(); i++) {
136          sb.append("\t").append(reduceOutputValueFieldList.get(i)).append("\n");
137        }
138        return sb.toString();
139      }
140    
141      /**
142       * The identify function. Input key/value pair is written directly to output.
143       */
144      public void map(K key, V val,
145          OutputCollector<Text, Text> output, Reporter reporter) 
146          throws IOException {
147        FieldSelectionHelper helper = new FieldSelectionHelper(
148          FieldSelectionHelper.emptyText, FieldSelectionHelper.emptyText);
149        helper.extractOutputKeyValue(key.toString(), val.toString(),
150          fieldSeparator, mapOutputKeyFieldList, mapOutputValueFieldList,
151          allMapValueFieldsFrom, ignoreInputKey, true);
152        output.collect(helper.getKey(), helper.getValue());
153      }
154    
155      private void parseOutputKeyValueSpec() {
156        allMapValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
157          mapOutputKeyValueSpec, mapOutputKeyFieldList, mapOutputValueFieldList);
158        
159        allReduceValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
160          reduceOutputKeyValueSpec, reduceOutputKeyFieldList,
161          reduceOutputValueFieldList);
162      }
163    
164      public void configure(JobConf job) {
165        this.fieldSeparator = job.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR,
166            "\t");
167        this.mapOutputKeyValueSpec = job.get(
168            FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
169        this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
170            job.getInputFormat().getClass().getCanonicalName());
171        this.reduceOutputKeyValueSpec = job.get(
172            FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:");
173        parseOutputKeyValueSpec();
174        LOG.info(specToString());
175      }
176    
177      public void close() throws IOException {
178        // TODO Auto-generated method stub
179    
180      }
181    
182      public void reduce(Text key, Iterator<Text> values,
183                         OutputCollector<Text, Text> output, Reporter reporter)
184        throws IOException {
185        String keyStr = key.toString() + this.fieldSeparator;
186        while (values.hasNext()) {
187            FieldSelectionHelper helper = new FieldSelectionHelper();
188            helper.extractOutputKeyValue(keyStr, values.next().toString(),
189              fieldSeparator, reduceOutputKeyFieldList,
190              reduceOutputValueFieldList, allReduceValueFieldsFrom, false, false);
191          output.collect(helper.getKey(), helper.getValue());
192        }
193      }
194    }