/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.regex.PatternSyntaxException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A class that allows a map/red job to work on a sample of sequence files.
 * The sample is decided by the filter class set by the job.
 * <p>
 * This old-API class is a thin wrapper: each nested filter delegates to its
 * counterpart in the new-API
 * {@link org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter},
 * and both APIs share the same configuration key, so jobs configured through
 * either API behave identically.
 * @deprecated Use
 * {@link org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter}
 * instead
 */
@Deprecated
@InterfaceAudience.Public
@InterfaceStability.Stable
public class SequenceFileInputFilter<K, V>
    extends SequenceFileInputFormat<K, V> {

  // Reuse the new-API key so a filter class set through either API is seen
  // by both. (Modifier order normalized from "final private static".)
  private static final String FILTER_CLASS = org.apache.hadoop.mapreduce.lib.
      input.SequenceFileInputFilter.FILTER_CLASS;

  public SequenceFileInputFilter() {
  }

  /** Create a record reader for the given split
   * @param split file split
   * @param job job configuration
   * @param reporter reporter who sends report to task tracker
   * @return RecordReader
   */
  @Override
  public RecordReader<K, V> getRecordReader(InputSplit split,
      JobConf job, Reporter reporter)
      throws IOException {

    reporter.setStatus(split.toString());

    return new FilterRecordReader<K, V>(job, (FileSplit) split);
  }

  /** set the filter class
   *
   * @param conf application configuration
   * @param filterClass filter class
   */
  public static void setFilterClass(Configuration conf, Class<?> filterClass) {
    // Class<?> instead of the raw type Class: source- and binary-compatible
    // for existing callers, but no longer emits a raw-type warning.
    conf.set(FILTER_CLASS, filterClass.getName());
  }

  /**
   * filter interface
   */
  public interface Filter extends
      org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.Filter {
  }

  /**
   * base class for Filters
   */
  public static abstract class FilterBase extends org.apache.hadoop.mapreduce.
      lib.input.SequenceFileInputFilter.FilterBase
      implements Filter {
  }

  /** Records filter by matching key to regex
   */
  public static class RegexFilter extends FilterBase {
    // Delegate doing the actual regex matching; assigned once in the
    // constructor. Kept package-private for compatibility within the package.
    final org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
        RegexFilter rf;

    /** Define the filtering regex and stores it in conf
     * @param conf where the regex is set
     * @param regex regex used as a filter
     * @throws PatternSyntaxException if the regex is invalid
     */
    public static void setPattern(Configuration conf, String regex)
        throws PatternSyntaxException {
      org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
          RegexFilter.setPattern(conf, regex);
    }

    public RegexFilter() {
      rf = new org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
          RegexFilter();
    }

    /** configure the Filter by checking the configuration
     */
    @Override
    public void setConf(Configuration conf) {
      rf.setConf(conf);
    }

    /** Filtering method
     * If key matches the regex, return true; otherwise return false
     * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
     */
    @Override
    public boolean accept(Object key) {
      return rf.accept(key);
    }
  }

  /** This class returns a percentage of records
   * The percentage is determined by a filtering frequency <i>f</i> using
   * the criteria record# % f == 0.
   * For example, if the frequency is 10, one out of 10 records is returned.
   */
  public static class PercentFilter extends FilterBase {
    // Delegate implementing the record# % frequency test; assigned once in
    // the constructor.
    final org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
        PercentFilter pf;

    /** set the frequency and stores it in conf
     * @param conf configuration
     * @param frequency filtering frequencey
     */
    public static void setFrequency(Configuration conf, int frequency) {
      org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
          PercentFilter.setFrequency(conf, frequency);
    }

    public PercentFilter() {
      pf = new org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
          PercentFilter();
    }

    /** configure the filter by checking the configuration
     *
     * @param conf configuration
     */
    @Override
    public void setConf(Configuration conf) {
      pf.setConf(conf);
    }

    /** Filtering method
     * If record# % frequency==0, return true; otherwise return false
     * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
     */
    @Override
    public boolean accept(Object key) {
      return pf.accept(key);
    }
  }

  /** This class returns a set of records by examing the MD5 digest of its
   * key against a filtering frequency <i>f</i>. The filtering criteria is
   * MD5(key) % f == 0.
   */
  public static class MD5Filter extends FilterBase {
    public static final int MD5_LEN = org.apache.hadoop.mapreduce.lib.
        input.SequenceFileInputFilter.MD5Filter.MD5_LEN;

    // Delegate implementing the MD5(key) % frequency test; assigned once in
    // the constructor.
    final org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.MD5Filter mf;

    /** set the filtering frequency in configuration
     *
     * @param conf configuration
     * @param frequency filtering frequency
     */
    public static void setFrequency(Configuration conf, int frequency) {
      org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.MD5Filter.
          setFrequency(conf, frequency);
    }

    public MD5Filter() {
      mf = new org.apache.hadoop.mapreduce.lib.input.
          SequenceFileInputFilter.MD5Filter();
    }

    /** configure the filter according to configuration
     *
     * @param conf configuration
     */
    @Override
    public void setConf(Configuration conf) {
      mf.setConf(conf);
    }

    /** Filtering method
     * If MD5(key) % frequency==0, return true; otherwise return false
     * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
     */
    @Override
    public boolean accept(Object key) {
      return mf.accept(key);
    }
  }

  /**
   * Record reader that skips any record whose key is rejected by the
   * configured {@link Filter} (defaulting to {@link PercentFilter}).
   */
  private static class FilterRecordReader<K, V>
      extends SequenceFileRecordReader<K, V> {

    private final Filter filter;

    public FilterRecordReader(Configuration conf, FileSplit split)
        throws IOException {
      super(conf, split);
      // instantiate filter
      filter = (Filter)ReflectionUtils.newInstance(
          conf.getClass(FILTER_CLASS, PercentFilter.class),
          conf);
    }

    /** Advance to the next record whose key the filter accepts.
     * @return true if an accepted record was read into key/value,
     *         false at end of split
     */
    @Override
    public synchronized boolean next(K key, V value) throws IOException {
      while (next(key)) {
        if (filter.accept(key)) {
          getCurrentValue(value);
          return true;
        }
      }

      return false;
    }
  }
}