001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018     
019    package org.apache.hadoop.mapred;
020    
021    import java.io.IOException;
022    import java.util.regex.PatternSyntaxException;
023    
024    import org.apache.hadoop.classification.InterfaceAudience;
025    import org.apache.hadoop.classification.InterfaceStability;
026    import org.apache.hadoop.conf.Configuration;
027    import org.apache.hadoop.util.ReflectionUtils;
028    
029    /**
030     * A class that allows a map/red job to work on a sample of sequence files.
031     * The sample is decided by the filter class set by the job.
032     * @deprecated Use 
033     * {@link org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter}
034     * instead
035     */
036    @Deprecated
037    @InterfaceAudience.Public
038    @InterfaceStability.Stable
039    public class SequenceFileInputFilter<K, V>
040      extends SequenceFileInputFormat<K, V> {
041      
042      final private static String FILTER_CLASS = org.apache.hadoop.mapreduce.lib.
043          input.SequenceFileInputFilter.FILTER_CLASS;
044    
045      public SequenceFileInputFilter() {
046      }
047        
048      /** Create a record reader for the given split
049       * @param split file split
050       * @param job job configuration
051       * @param reporter reporter who sends report to task tracker
052       * @return RecordReader
053       */
054      public RecordReader<K, V> getRecordReader(InputSplit split,
055                                          JobConf job, Reporter reporter)
056        throws IOException {
057            
058        reporter.setStatus(split.toString());
059            
060        return new FilterRecordReader<K, V>(job, (FileSplit) split);
061      }
062    
063    
064      /** set the filter class
065       * 
066       * @param conf application configuration
067       * @param filterClass filter class
068       */
069      public static void setFilterClass(Configuration conf, Class filterClass) {
070        conf.set(FILTER_CLASS, filterClass.getName());
071      }
072    
073             
074      /**
075       * filter interface
076       */
077      public interface Filter extends 
078          org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.Filter {
079      }
080        
081      /**
082       * base class for Filters
083       */
084      public static abstract class FilterBase extends org.apache.hadoop.mapreduce.
085          lib.input.SequenceFileInputFilter.FilterBase
086          implements Filter {
087      }
088        
089      /** Records filter by matching key to regex
090       */
091      public static class RegexFilter extends FilterBase {
092        org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
093          RegexFilter rf;
094        public static void setPattern(Configuration conf, String regex)
095            throws PatternSyntaxException {
096          org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
097            RegexFilter.setPattern(conf, regex);
098        }
099            
100        public RegexFilter() { 
101          rf = new org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
102                 RegexFilter();
103        }
104            
105        /** configure the Filter by checking the configuration
106         */
107        public void setConf(Configuration conf) {
108          rf.setConf(conf);
109        }
110    
111    
112        /** Filtering method
113         * If key matches the regex, return true; otherwise return false
114         * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
115         */
116        public boolean accept(Object key) {
117          return rf.accept(key);
118        }
119      }
120    
121      /** This class returns a percentage of records
122       * The percentage is determined by a filtering frequency <i>f</i> using
123       * the criteria record# % f == 0.
124       * For example, if the frequency is 10, one out of 10 records is returned.
125       */
126      public static class PercentFilter extends FilterBase {
127        org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
128                  PercentFilter pf;
129        /** set the frequency and stores it in conf
130         * @param conf configuration
131         * @param frequency filtering frequencey
132         */
133        public static void setFrequency(Configuration conf, int frequency) {
134           org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
135                  PercentFilter.setFrequency(conf, frequency);
136        }
137                    
138        public PercentFilter() { 
139          pf = new org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.
140            PercentFilter();
141        }
142                    
143        /** configure the filter by checking the configuration
144         * 
145         * @param conf configuration
146         */
147        public void setConf(Configuration conf) {
148          pf.setConf(conf);
149        }
150    
151        /** Filtering method
152         * If record# % frequency==0, return true; otherwise return false
153         * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
154         */
155        public boolean accept(Object key) {
156          return pf.accept(key);
157        }
158      }
159    
160      /** This class returns a set of records by examing the MD5 digest of its
161       * key against a filtering frequency <i>f</i>. The filtering criteria is
162       * MD5(key) % f == 0.
163       */
164      public static class MD5Filter extends FilterBase {
165        public static final int MD5_LEN = org.apache.hadoop.mapreduce.lib.
166          input.SequenceFileInputFilter.MD5Filter.MD5_LEN;
167        org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.MD5Filter mf;
168        /** set the filtering frequency in configuration
169         * 
170         * @param conf configuration
171         * @param frequency filtering frequency
172         */
173        public static void setFrequency(Configuration conf, int frequency) {
174          org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFilter.MD5Filter.
175            setFrequency(conf, frequency);
176        }
177            
178        public MD5Filter() { 
179          mf = new org.apache.hadoop.mapreduce.lib.input.
180            SequenceFileInputFilter.MD5Filter();
181        }
182            
183        /** configure the filter according to configuration
184         * 
185         * @param conf configuration
186         */
187        public void setConf(Configuration conf) {
188          mf.setConf(conf);
189        }
190    
191        /** Filtering method
192         * If MD5(key) % frequency==0, return true; otherwise return false
193         * @see org.apache.hadoop.mapred.SequenceFileInputFilter.Filter#accept(Object)
194         */
195        public boolean accept(Object key) {
196          return mf.accept(key);
197        }
198      }
199        
200      private static class FilterRecordReader<K, V>
201        extends SequenceFileRecordReader<K, V> {
202        
203        private Filter filter;
204            
205        public FilterRecordReader(Configuration conf, FileSplit split)
206          throws IOException {
207          super(conf, split);
208          // instantiate filter
209          filter = (Filter)ReflectionUtils.newInstance(
210                                                       conf.getClass(FILTER_CLASS, PercentFilter.class), 
211                                                       conf);
212        }
213            
214        public synchronized boolean next(K key, V value) throws IOException {
215          while (next(key)) {
216            if (filter.accept(key)) {
217              getCurrentValue(value);
218              return true;
219            }
220          }
221                
222          return false;
223        }
224      }
225    }