001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapred.join;
020    
021    import java.io.CharArrayReader;
022    import java.io.IOException;
023    import java.io.StreamTokenizer;
024    import java.lang.reflect.Constructor;
025    import java.lang.reflect.InvocationTargetException;
026    import java.util.ArrayList;
027    import java.util.HashMap;
028    import java.util.Iterator;
029    import java.util.LinkedList;
030    import java.util.List;
031    import java.util.ListIterator;
032    import java.util.Map;
033    import java.util.Stack;
034    
035    import org.apache.hadoop.classification.InterfaceAudience;
036    import org.apache.hadoop.classification.InterfaceStability;
037    import org.apache.hadoop.io.WritableComparator;
038    import org.apache.hadoop.mapred.FileInputFormat;
039    import org.apache.hadoop.mapred.InputFormat;
040    import org.apache.hadoop.mapred.InputSplit;
041    import org.apache.hadoop.mapred.JobConf;
042    import org.apache.hadoop.mapred.RecordReader;
043    import org.apache.hadoop.mapred.Reporter;
044    import org.apache.hadoop.util.ReflectionUtils;
045    
046    /**
047     * Very simple shift-reduce parser for join expressions.
048     *
049     * This should be sufficient for the user extension permitted now, but ought to
050     * be replaced with a parser generator if more complex grammars are supported.
051     * In particular, this "shift-reduce" parser has no states. Each set
052     * of formals requires a different internal node type, which is responsible for
053     * interpreting the list of tokens it receives. This is sufficient for the
054     * current grammar, but it has several annoying properties that might inhibit
055     * extension. In particular, parenthesis are always function calls; an
056     * algebraic or filter grammar would not only require a node type, but must
057     * also work around the internals of this parser.
058     *
059     * For most other cases, adding classes to the hierarchy- particularly by
060     * extending JoinRecordReader and MultiFilterRecordReader- is fairly
061     * straightforward. One need only override the relevant method(s) (usually only
062     * {@link CompositeRecordReader#combine}) and include a property to map its
063     * value to an identifier in the parser.
064     * @deprecated Use {@link org.apache.hadoop.mapreduce.lib.join.Parser} instead
065     */
066    @Deprecated
067    @InterfaceAudience.Public
068    @InterfaceStability.Evolving
069    public class Parser {
  /**
   * Token types produced by the join-expression lexer: a composite
   * InputFormat node (CIF), identifiers, punctuation (comma and
   * parentheses), quoted strings, and numbers.
   */
  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public enum TType { CIF, IDENT, COMMA, LPAREN, RPAREN, QUOT, NUM, }
073    
074      /**
075       * Tagged-union type for tokens from the join expression.
076       * @see Parser.TType
077       */
078      @InterfaceAudience.Public
079      @InterfaceStability.Evolving
080      public static class Token {
081    
082        private TType type;
083    
084        Token(TType type) {
085          this.type = type;
086        }
087    
088        public TType getType() { return type; }
089        public Node getNode() throws IOException {
090          throw new IOException("Expected nodetype");
091        }
092        public double getNum() throws IOException {
093          throw new IOException("Expected numtype");
094        }
095        public String getStr() throws IOException {
096          throw new IOException("Expected strtype");
097        }
098      }
099    
100      @InterfaceAudience.Public
101      @InterfaceStability.Evolving
102      public static class NumToken extends Token {
103        private double num;
104        public NumToken(double num) {
105          super(TType.NUM);
106          this.num = num;
107        }
108        public double getNum() { return num; }
109      }
110    
111      @InterfaceAudience.Public
112      @InterfaceStability.Evolving
113      public static class NodeToken extends Token {
114        private Node node;
115        NodeToken(Node node) {
116          super(TType.CIF);
117          this.node = node;
118        }
119        public Node getNode() {
120          return node;
121        }
122      }
123    
124      @InterfaceAudience.Public
125      @InterfaceStability.Evolving
126      public static class StrToken extends Token {
127        private String str;
128        public StrToken(TType type, String str) {
129          super(type);
130          this.str = str;
131        }
132        public String getStr() {
133          return str;
134        }
135      }
136    
137      /**
138       * Simple lexer wrapping a StreamTokenizer.
139       * This encapsulates the creation of tagged-union Tokens and initializes the
140       * SteamTokenizer.
141       */
142      private static class Lexer {
143    
144        private StreamTokenizer tok;
145    
146        Lexer(String s) {
147          tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
148          tok.quoteChar('"');
149          tok.parseNumbers();
150          tok.ordinaryChar(',');
151          tok.ordinaryChar('(');
152          tok.ordinaryChar(')');
153          tok.wordChars('$','$');
154          tok.wordChars('_','_');
155        }
156    
157        Token next() throws IOException {
158          int type = tok.nextToken();
159          switch (type) {
160            case StreamTokenizer.TT_EOF:
161            case StreamTokenizer.TT_EOL:
162              return null;
163            case StreamTokenizer.TT_NUMBER:
164              return new NumToken(tok.nval);
165            case StreamTokenizer.TT_WORD:
166              return new StrToken(TType.IDENT, tok.sval);
167            case '"':
168              return new StrToken(TType.QUOT, tok.sval);
169            default:
170              switch (type) {
171                case ',':
172                  return new Token(TType.COMMA);
173                case '(':
174                  return new Token(TType.LPAREN);
175                case ')':
176                  return new Token(TType.RPAREN);
177                default:
178                  throw new IOException("Unexpected: " + type);
179              }
180          }
181        }
182      }
183    
184      @InterfaceAudience.Public
185      @InterfaceStability.Evolving
186      public abstract static class Node implements ComposableInputFormat {
187        /**
188         * Return the node type registered for the particular identifier.
189         * By default, this is a CNode for any composite node and a WNode
190         * for "wrapped" nodes. User nodes will likely be composite
191         * nodes.
192         * @see #addIdentifier(java.lang.String, java.lang.Class[], java.lang.Class, java.lang.Class)
193         * @see CompositeInputFormat#setFormat(org.apache.hadoop.mapred.JobConf)
194         */
195        static Node forIdent(String ident) throws IOException {
196          try {
197            if (!nodeCstrMap.containsKey(ident)) {
198              throw new IOException("No nodetype for " + ident);
199            }
200            return nodeCstrMap.get(ident).newInstance(ident);
201          } catch (IllegalAccessException e) {
202            throw (IOException)new IOException().initCause(e);
203          } catch (InstantiationException e) {
204            throw (IOException)new IOException().initCause(e);
205          } catch (InvocationTargetException e) {
206            throw (IOException)new IOException().initCause(e);
207          }
208        }
209    
210        private static final Class<?>[] ncstrSig = { String.class };
211        private static final
212            Map<String,Constructor<? extends Node>> nodeCstrMap =
213            new HashMap<String,Constructor<? extends Node>>();
214        protected static final
215            Map<String,Constructor<? extends ComposableRecordReader>> rrCstrMap =
216            new HashMap<String,Constructor<? extends ComposableRecordReader>>();
217    
218        /**
219         * For a given identifier, add a mapping to the nodetype for the parse
220         * tree and to the ComposableRecordReader to be created, including the
221         * formals required to invoke the constructor.
222         * The nodetype and constructor signature should be filled in from the
223         * child node.
224         */
225        protected static void addIdentifier(String ident, Class<?>[] mcstrSig,
226                                  Class<? extends Node> nodetype,
227                                  Class<? extends ComposableRecordReader> cl)
228            throws NoSuchMethodException {
229          Constructor<? extends Node> ncstr =
230            nodetype.getDeclaredConstructor(ncstrSig);
231          ncstr.setAccessible(true);
232          nodeCstrMap.put(ident, ncstr);
233          Constructor<? extends ComposableRecordReader> mcstr =
234            cl.getDeclaredConstructor(mcstrSig);
235          mcstr.setAccessible(true);
236          rrCstrMap.put(ident, mcstr);
237        }
238    
239        // inst
240        protected int id = -1;
241        protected String ident;
242        protected Class<? extends WritableComparator> cmpcl;
243    
244        protected Node(String ident) {
245          this.ident = ident;
246        }
247    
248        protected void setID(int id) {
249          this.id = id;
250        }
251    
252        protected void setKeyComparator(Class<? extends WritableComparator> cmpcl) {
253          this.cmpcl = cmpcl;
254        }
255        abstract void parse(List<Token> args, JobConf job) throws IOException;
256      }
257    
  /**
   * Nodetype in the parse tree for &quot;wrapped&quot; InputFormats.
   * Its actuals are an InputFormat class name followed by a quoted input
   * directory for that format.
   */
  static class WNode extends Node {
    // Formals of the registered ComposableRecordReader constructor:
    // (id, wrapped RecordReader, key comparator class).
    private static final Class<?>[] cstrSig =
      { Integer.TYPE, RecordReader.class, Class.class };

    static void addIdentifier(String ident,
                              Class<? extends ComposableRecordReader> cl)
        throws NoSuchMethodException {
      Node.addIdentifier(ident, cstrSig, WNode.class, cl);
    }

    // Input directory for the wrapped format; set by parse.
    private String indir;
    // The wrapped InputFormat instance; set by parse.
    private InputFormat inf;

    public WNode(String ident) {
      super(ident);
    }

    /**
     * Let the first actual define the InputFormat and the second define
     * the <tt>mapred.input.dir</tt> property.
     */
    public void parse(List<Token> ll, JobConf job) throws IOException {
      StringBuilder sb = new StringBuilder();
      Iterator<Token> i = ll.iterator();
      // Concatenate string tokens up to the first comma into the class
      // name, then instantiate the InputFormat. getStr() throws for
      // non-string tokens, rejecting malformed input.
      while (i.hasNext()) {
        Token t = i.next();
        if (TType.COMMA.equals(t.getType())) {
          try {
                inf = (InputFormat)ReflectionUtils.newInstance(
                                job.getClassByName(sb.toString()),
                job);
          } catch (ClassNotFoundException e) {
            throw (IOException)new IOException().initCause(e);
          } catch (IllegalArgumentException e) {
            throw (IOException)new IOException().initCause(e);
          }
          break;
        }
        sb.append(t.getStr());
      }
      // The iterator resumes just past the comma; a quoted directory must
      // follow. If the loop exhausted without a comma, this also fails.
      if (!i.hasNext()) {
        throw new IOException("Parse error");
      }
      Token t = i.next();
      if (!TType.QUOT.equals(t.getType())) {
        throw new IOException("Expected quoted string");
      }
      indir = t.getStr();
      // no check for ll.isEmpty() to permit extension
    }

    // Clone the job so the wrapped format sees only this node's input paths.
    private JobConf getConf(JobConf job) {
      JobConf conf = new JobConf(job);
      FileInputFormat.setInputPaths(conf, indir);
      conf.setClassLoader(job.getClassLoader());
      return conf;
    }

    /** Delegate split generation to the wrapped InputFormat. */
    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
      return inf.getSplits(getConf(job), numSplits);
    }

    /**
     * Wrap the underlying format's RecordReader in the
     * ComposableRecordReader registered for this node's identifier.
     */
    public ComposableRecordReader getRecordReader(
        InputSplit split, JobConf job, Reporter reporter) throws IOException {
      try {
        if (!rrCstrMap.containsKey(ident)) {
          throw new IOException("No RecordReader for " + ident);
        }
        return rrCstrMap.get(ident).newInstance(id,
            inf.getRecordReader(split, getConf(job), reporter), cmpcl);
      } catch (IllegalAccessException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InstantiationException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InvocationTargetException e) {
        throw (IOException)new IOException().initCause(e);
      }
    }

    // NOTE(review): assumes parse ran successfully — inf is dereferenced
    // without a null check.
    public String toString() {
      return ident + "(" + inf.getClass().getName() + ",\"" + indir + "\")";
    }
  }
345    
  /**
   * Internal nodetype for &quot;composite&quot; InputFormats.
   * Its actuals are a comma-separated list of child nodes.
   */
  static class CNode extends Node {

    // Formals of the registered CompositeRecordReader constructor:
    // (id, job, capacity, key comparator class).
    private static final Class<?>[] cstrSig =
      { Integer.TYPE, JobConf.class, Integer.TYPE, Class.class };

    static void addIdentifier(String ident,
                              Class<? extends ComposableRecordReader> cl)
        throws NoSuchMethodException {
      Node.addIdentifier(ident, cstrSig, CNode.class, cl);
    }

    // inst
    private ArrayList<Node> kids = new ArrayList<Node>();

    public CNode(String ident) {
      super(ident);
    }

    // Propagate the key comparator to every child as well.
    public void setKeyComparator(Class<? extends WritableComparator> cmpcl) {
      super.setKeyComparator(cmpcl);
      for (Node n : kids) {
        n.setKeyComparator(cmpcl);
      }
    }

    /**
     * Combine InputSplits from child InputFormats into a
     * {@link CompositeInputSplit}.
     * Every child must yield the same number of splits; composite split i
     * pairs split i from each child.
     */
    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
      InputSplit[][] splits = new InputSplit[kids.size()][];
      for (int i = 0; i < kids.size(); ++i) {
        final InputSplit[] tmp = kids.get(i).getSplits(job, numSplits);
        if (null == tmp) {
          throw new IOException("Error gathering splits from child RReader");
        }
        if (i > 0 && splits[i-1].length != tmp.length) {
          throw new IOException("Inconsistent split cardinality from child " +
              i + " (" + splits[i-1].length + "/" + tmp.length + ")");
        }
        splits[i] = tmp;
      }
      // NOTE(review): splits[0] assumes at least one child — confirm empty
      // composites cannot reach here.
      final int size = splits[0].length;
      CompositeInputSplit[] ret = new CompositeInputSplit[size];
      for (int i = 0; i < size; ++i) {
        ret[i] = new CompositeInputSplit(splits.length);
        for (int j = 0; j < splits.length; ++j) {
          ret[i].add(splits[j][i]);
        }
      }
      return ret;
    }

    /**
     * Build the CompositeRecordReader registered for this identifier and
     * populate it with one reader per child, matched to the corresponding
     * entry of the CompositeInputSplit.
     */
    @SuppressWarnings("unchecked") // child types unknowable
    public ComposableRecordReader getRecordReader(
        InputSplit split, JobConf job, Reporter reporter) throws IOException {
      if (!(split instanceof CompositeInputSplit)) {
        throw new IOException("Invalid split type:" +
                              split.getClass().getName());
      }
      final CompositeInputSplit spl = (CompositeInputSplit)split;
      final int capacity = kids.size();
      CompositeRecordReader ret = null;
      try {
        if (!rrCstrMap.containsKey(ident)) {
          throw new IOException("No RecordReader for " + ident);
        }
        ret = (CompositeRecordReader)
          rrCstrMap.get(ident).newInstance(id, job, capacity, cmpcl);
      } catch (IllegalAccessException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InstantiationException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InvocationTargetException e) {
        throw (IOException)new IOException().initCause(e);
      }
      for (int i = 0; i < capacity; ++i) {
        ret.add(kids.get(i).getRecordReader(spl.get(i), job, reporter));
      }
      return (ComposableRecordReader)ret;
    }

    /**
     * Parse a list of comma-separated nodes.
     */
    public void parse(List<Token> args, JobConf job) throws IOException {
      ListIterator<Token> i = args.listIterator();
      while (i.hasNext()) {
        Token t = i.next();
        // Node tokens sit at the even positions of args (commas at the odd
        // ones), so previousIndex() >> 1 is this child's ordinal.
        t.getNode().setID(i.previousIndex() >> 1);
        kids.add(t.getNode());
        if (i.hasNext() && !TType.COMMA.equals(i.next().getType())) {
          throw new IOException("Expected ','");
        }
      }
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(ident + "(");
      for (Node n : kids) {
        sb.append(n.toString() + ",");
      }
      // Replace the trailing comma with the closing paren.
      // NOTE(review): with zero kids this overwrites the '(' instead —
      // presumably unreachable after a successful parse; verify.
      sb.setCharAt(sb.length() - 1, ')');
      return sb.toString();
    }
  }
457    
458      private static Token reduce(Stack<Token> st, JobConf job) throws IOException {
459        LinkedList<Token> args = new LinkedList<Token>();
460        while (!st.isEmpty() && !TType.LPAREN.equals(st.peek().getType())) {
461          args.addFirst(st.pop());
462        }
463        if (st.isEmpty()) {
464          throw new IOException("Unmatched ')'");
465        }
466        st.pop();
467        if (st.isEmpty() || !TType.IDENT.equals(st.peek().getType())) {
468          throw new IOException("Identifier expected");
469        }
470        Node n = Node.forIdent(st.pop().getStr());
471        n.parse(args, job);
472        return new NodeToken(n);
473      }
474    
475      /**
476       * Given an expression and an optional comparator, build a tree of
477       * InputFormats using the comparator to sort keys.
478       */
479      static Node parse(String expr, JobConf job) throws IOException {
480        if (null == expr) {
481          throw new IOException("Expression is null");
482        }
483        Class<? extends WritableComparator> cmpcl =
484          job.getClass("mapred.join.keycomparator", null, WritableComparator.class);
485        Lexer lex = new Lexer(expr);
486        Stack<Token> st = new Stack<Token>();
487        Token tok;
488        while ((tok = lex.next()) != null) {
489          if (TType.RPAREN.equals(tok.getType())) {
490            st.push(reduce(st, job));
491          } else {
492            st.push(tok);
493          }
494        }
495        if (st.size() == 1 && TType.CIF.equals(st.peek().getType())) {
496          Node ret = st.pop().getNode();
497          if (cmpcl != null) {
498            ret.setKeyComparator(cmpcl);
499          }
500          return ret;
501        }
502        throw new IOException("Missing ')'");
503      }
504    
505    }