/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.lib;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.fieldsel.*;

/**
 * This class implements a mapper/reducer class that can be used to perform
 * field selections in a manner similar to unix cut. The input data is treated
 * as fields separated by a user specified separator (the default value is
 * "\t"). The user can specify a list of fields that form the map output keys,
 * and a list of fields that form the map output values. If the inputformat is
 * TextInputFormat, the mapper will ignore the key to the map function, and the
 * fields are from the value only. Otherwise, the fields are the union of those
 * from the key and those from the value.
 *
 * The field separator is under attribute "mapreduce.fieldsel.data.field.separator"
 *
 * The map output field list spec is under attribute
 * "mapreduce.fieldsel.map.output.key.value.fields.spec".
 * The value is expected to be like "keyFieldsSpec:valueFieldsSpec"
 * key/valueFieldsSpec are comma (,) separated field spec: fieldSpec,fieldSpec,fieldSpec ...
 * Each field spec can be a simple number (e.g. 5) specifying a specific field, or a range
 * (like 2-5) to specify a range of fields, or an open range (like 3-) specifying all
 * the fields starting from field 3. The open range field spec applies to value fields only.
 * They have no effect on the key fields.
 *
 * Here is an example: "4,3,0,1:6,5,1-3,7-". It specifies to use fields 4,3,0 and 1 for keys,
 * and use fields 6,5,1,2,3,7 and above for values.
 *
 * The reduce output field list spec is under attribute
 * "mapreduce.fieldsel.reduce.output.key.value.fields.spec".
 *
 * The reducer extracts output key/value pairs in a similar manner, except that
 * the key is never ignored.
 *
 * @deprecated Use {@link FieldSelectionMapper} and
 * {@link FieldSelectionReducer} instead
 */
@Deprecated
@InterfaceAudience.Public
@InterfaceStability.Stable
public class FieldSelectionMapReduce<K, V>
    implements Mapper<K, V, Text, Text>, Reducer<Text, Text, Text, Text> {

  /** Raw map-side spec string as read from the job configuration. */
  private String mapOutputKeyValueSpec;

  /** True when the job's input format is exactly TextInputFormat. */
  private boolean ignoreInputKey;

  /** Separator used to split input into fields. */
  private String fieldSeparator = "\t";

  /** Field indexes selected for the map output key. */
  private List<Integer> mapOutputKeyFieldList = new ArrayList<Integer>();

  /** Field indexes selected for the map output value. */
  private List<Integer> mapOutputValueFieldList = new ArrayList<Integer>();

  /** Result of parsing the map-side spec; -1 until configured. */
  private int allMapValueFieldsFrom = -1;

  /** Raw reduce-side spec string as read from the job configuration. */
  private String reduceOutputKeyValueSpec;

  /** Field indexes selected for the reduce output key. */
  private List<Integer> reduceOutputKeyFieldList = new ArrayList<Integer>();

  /** Field indexes selected for the reduce output value. */
  private List<Integer> reduceOutputValueFieldList = new ArrayList<Integer>();

  /** Result of parsing the reduce-side spec; -1 until configured. */
  private int allReduceValueFieldsFrom = -1;

  public static final Log LOG = LogFactory.getLog("FieldSelectionMapReduce");

  /**
   * Renders the current configuration (separator, raw specs, parsed field
   * lists) as a multi-line string for logging.
   *
   * @return a human-readable dump of this instance's field-selection settings
   */
  private String specToString() {
    // Method-local buffer: no synchronization needed, so StringBuilder.
    StringBuilder sb = new StringBuilder();
    sb.append("fieldSeparator: ").append(fieldSeparator).append("\n");

    sb.append("mapOutputKeyValueSpec: ").append(mapOutputKeyValueSpec)
        .append("\n");
    sb.append("reduceOutputKeyValueSpec: ").append(reduceOutputKeyValueSpec)
        .append("\n");

    sb.append("allMapValueFieldsFrom: ").append(allMapValueFieldsFrom)
        .append("\n");
    sb.append("allReduceValueFieldsFrom: ").append(allReduceValueFieldsFrom)
        .append("\n");

    appendFieldList(sb, "mapOutputKeyFieldList", mapOutputKeyFieldList);
    appendFieldList(sb, "mapOutputValueFieldList", mapOutputValueFieldList);
    appendFieldList(sb, "reduceOutputKeyFieldList", reduceOutputKeyFieldList);
    appendFieldList(sb, "reduceOutputValueFieldList",
        reduceOutputValueFieldList);
    return sb.toString();
  }

  /**
   * Appends "&lt;name&gt;.length: &lt;size&gt;" followed by one tab-indented
   * line per entry of the given list.
   *
   * @param sb     buffer to append to
   * @param name   label printed before ".length: "
   * @param fields field indexes to list
   */
  private static void appendFieldList(StringBuilder sb, String name,
      List<Integer> fields) {
    sb.append(name).append(".length: ").append(fields.size()).append("\n");
    for (Integer field : fields) {
      sb.append("\t").append(field).append("\n");
    }
  }

  /**
   * Selects the configured key and value fields from the input pair and emits
   * the resulting key/value to the collector.
   *
   * @param key      input key; its string form is passed to the helper
   *                 together with the {@code ignoreInputKey} flag
   * @param val      input value; its string form supplies the fields
   * @param output   collector that receives the selected key/value pair
   * @param reporter not used by this method
   * @throws IOException if the collector fails
   */
  @Override
  public void map(K key, V val,
      OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    FieldSelectionHelper helper = new FieldSelectionHelper(
        FieldSelectionHelper.emptyText, FieldSelectionHelper.emptyText);
    // NOTE(review): the trailing 'true' here vs. 'false' in reduce() presumably
    // marks the map-side call; confirm against FieldSelectionHelper.
    helper.extractOutputKeyValue(key.toString(), val.toString(),
        fieldSeparator, mapOutputKeyFieldList, mapOutputValueFieldList,
        allMapValueFieldsFrom, ignoreInputKey, true);
    output.collect(helper.getKey(), helper.getValue());
  }

  /**
   * Parses the map-side and reduce-side spec strings into the field lists and
   * the two {@code all*ValueFieldsFrom} values.
   */
  private void parseOutputKeyValueSpec() {
    // The lists are long-lived instance state handed to the helper to fill.
    // Start from empty lists so that a repeated configure() cannot leave
    // entries from an earlier spec mixed in with the new ones.
    mapOutputKeyFieldList.clear();
    mapOutputValueFieldList.clear();
    reduceOutputKeyFieldList.clear();
    reduceOutputValueFieldList.clear();

    allMapValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
        mapOutputKeyValueSpec, mapOutputKeyFieldList, mapOutputValueFieldList);

    allReduceValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
        reduceOutputKeyValueSpec, reduceOutputKeyFieldList,
        reduceOutputValueFieldList);
  }

  /**
   * Reads the field separator and the map/reduce field specs from the job,
   * parses them, and logs the resulting configuration.
   *
   * The separator defaults to "\t" and both specs default to "0-:". The input
   * key is ignored by {@link #map} only when the job's input format class is
   * exactly {@link TextInputFormat} (compared by canonical class name).
   *
   * @param job job configuration to read the settings from
   */
  @Override
  public void configure(JobConf job) {
    this.fieldSeparator = job.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR,
        "\t");
    this.mapOutputKeyValueSpec = job.get(
        FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
    this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
        job.getInputFormat().getClass().getCanonicalName());
    this.reduceOutputKeyValueSpec = job.get(
        FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:");
    parseOutputKeyValueSpec();
    LOG.info(specToString());
  }

  /**
   * Does nothing; this class holds only strings, lists and ints and has no
   * resources to release.
   *
   * @throws IOException never thrown by this implementation
   */
  @Override
  public void close() throws IOException {
    // Intentionally empty.
  }

  /**
   * For every value of the given key, selects the configured reduce output
   * key and value fields and emits the resulting pair.
   *
   * @param key      reduce input key
   * @param values   values grouped under {@code key}
   * @param output   collector that receives one pair per input value
   * @param reporter not used by this method
   * @throws IOException if the collector fails
   */
  @Override
  public void reduce(Text key, Iterator<Text> values,
      OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    // The separator is appended to the key once, outside the loop, before it
    // is handed to the helper together with each value.
    String keyStr = key.toString() + this.fieldSeparator;
    while (values.hasNext()) {
      FieldSelectionHelper helper = new FieldSelectionHelper();
      helper.extractOutputKeyValue(keyStr, values.next().toString(),
          fieldSeparator, reduceOutputKeyFieldList,
          reduceOutputValueFieldList, allReduceValueFieldsFrom, false, false);
      output.collect(helper.getKey(), helper.getValue());
    }
  }
}