/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred.join;

import java.io.CharArrayReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Stack;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Very simple shift-reduce parser for join expressions.
 *
 * This should be sufficient for the user extension permitted now, but ought to
 * be replaced with a parser generator if more complex grammars are supported.
 * In particular, this "shift-reduce" parser has no states. Each set
 * of formals requires a different internal node type, which is responsible for
 * interpreting the list of tokens it receives. This is sufficient for the
 * current grammar, but it has several annoying properties that might inhibit
 * extension. In particular, parentheses are always function calls; an
 * algebraic or filter grammar would not only require a node type, but must
 * also work around the internals of this parser.
 *
 * For most other cases, adding classes to the hierarchy, particularly by
 * extending JoinRecordReader and MultiFilterRecordReader, is fairly
 * straightforward. One need only override the relevant method(s) (usually only
 * {@link CompositeRecordReader#combine}) and include a property to map its
 * value to an identifier in the parser. An illustrative extension sketch
 * appears in a comment at the end of this file.
 * @deprecated Use {@link org.apache.hadoop.mapreduce.lib.join.Parser} instead
 */
@Deprecated
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class Parser {
  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public enum TType { CIF, IDENT, COMMA, LPAREN, RPAREN, QUOT, NUM }

  /**
   * Tagged-union type for tokens from the join expression.
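   *
   * <p>A minimal sketch of the intended dispatch pattern (illustrative only;
   * {@code lex} is a hypothetical Lexer, and calling an accessor for the
   * wrong variant throws IOException):
   * <pre>{@code
   *   Token t = lex.next();
   *   switch (t.getType()) {
   *     case IDENT: String s = t.getStr();  break; // StrToken payload
   *     case NUM:   double d = t.getNum();  break; // NumToken payload
   *     case CIF:   Node n   = t.getNode(); break; // NodeToken payload
   *     default:    break; // punctuation tokens carry no payload
   *   }
   * }</pre>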
   * @see Parser.TType
   */
  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public static class Token {

    private TType type;

    Token(TType type) {
      this.type = type;
    }

    public TType getType() { return type; }

    public Node getNode() throws IOException {
      throw new IOException("Expected nodetype");
    }

    public double getNum() throws IOException {
      throw new IOException("Expected numtype");
    }

    public String getStr() throws IOException {
      throw new IOException("Expected strtype");
    }
  }

  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public static class NumToken extends Token {
    private double num;
    public NumToken(double num) {
      super(TType.NUM);
      this.num = num;
    }
    public double getNum() { return num; }
  }

  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public static class NodeToken extends Token {
    private Node node;
    NodeToken(Node node) {
      super(TType.CIF);
      this.node = node;
    }
    public Node getNode() {
      return node;
    }
  }

  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public static class StrToken extends Token {
    private String str;
    public StrToken(TType type, String str) {
      super(type);
      this.str = str;
    }
    public String getStr() {
      return str;
    }
  }

  /**
   * Simple lexer wrapping a StreamTokenizer.
   * This encapsulates the creation of tagged-union Tokens and initializes the
   * StreamTokenizer.
   */
  private static class Lexer {

    private StreamTokenizer tok;

    Lexer(String s) {
      tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
      tok.quoteChar('"');
      tok.parseNumbers();
      tok.ordinaryChar(',');
      tok.ordinaryChar('(');
      tok.ordinaryChar(')');
      tok.wordChars('$', '$');
      tok.wordChars('_', '_');
    }

    Token next() throws IOException {
      int type = tok.nextToken();
      switch (type) {
      case StreamTokenizer.TT_EOF:
      case StreamTokenizer.TT_EOL:
        return null;
      case StreamTokenizer.TT_NUMBER:
        return new NumToken(tok.nval);
      case StreamTokenizer.TT_WORD:
        return new StrToken(TType.IDENT, tok.sval);
      case '"':
        return new StrToken(TType.QUOT, tok.sval);
      case ',':
        return new Token(TType.COMMA);
      case '(':
        return new Token(TType.LPAREN);
      case ')':
        return new Token(TType.RPAREN);
      default:
        throw new IOException("Unexpected: " + type);
      }
    }
  }

  @InterfaceAudience.Public
  @InterfaceStability.Evolving
  public abstract static class Node implements ComposableInputFormat {
    /**
     * Return the node type registered for the particular identifier.
     * By default, this is a CNode for any composite node and a WNode
     * for "wrapped" nodes. User nodes will likely be composite
     * nodes.
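     *
     * <p>A hedged registration sketch (the identifier "myjoin" and the
     * reader class are hypothetical; see {@link CNode#addIdentifier}):
     * <pre>{@code
     *   CNode.addIdentifier("myjoin", MyJoinRecordReader.class);
     *   Node n = Node.forIdent("myjoin"); // yields a CNode for "myjoin"
     * }</pre>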
     * @see #addIdentifier(java.lang.String, java.lang.Class[], java.lang.Class, java.lang.Class)
     * @see CompositeInputFormat#setFormat(org.apache.hadoop.mapred.JobConf)
     */
    static Node forIdent(String ident) throws IOException {
      try {
        if (!nodeCstrMap.containsKey(ident)) {
          throw new IOException("No nodetype for " + ident);
        }
        return nodeCstrMap.get(ident).newInstance(ident);
      } catch (IllegalAccessException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InstantiationException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InvocationTargetException e) {
        throw (IOException)new IOException().initCause(e);
      }
    }

    private static final Class<?>[] ncstrSig = { String.class };
    private static final
        Map<String,Constructor<? extends Node>> nodeCstrMap =
        new HashMap<String,Constructor<? extends Node>>();
    protected static final
        Map<String,Constructor<? extends ComposableRecordReader>> rrCstrMap =
        new HashMap<String,Constructor<? extends ComposableRecordReader>>();

    /**
     * For a given identifier, add a mapping to the nodetype for the parse
     * tree and to the ComposableRecordReader to be created, including the
     * formals required to invoke the constructor.
     * The nodetype and constructor signature should be filled in from the
     * child node.
     */
    protected static void addIdentifier(String ident, Class<?>[] mcstrSig,
                              Class<? extends Node> nodetype,
                              Class<? extends ComposableRecordReader> cl)
        throws NoSuchMethodException {
      Constructor<? extends Node> ncstr =
        nodetype.getDeclaredConstructor(ncstrSig);
      ncstr.setAccessible(true);
      nodeCstrMap.put(ident, ncstr);
      Constructor<? extends ComposableRecordReader> mcstr =
        cl.getDeclaredConstructor(mcstrSig);
      mcstr.setAccessible(true);
      rrCstrMap.put(ident, mcstr);
    }

    // inst
    protected int id = -1;
    protected String ident;
    protected Class<? extends WritableComparator> cmpcl;

    protected Node(String ident) {
      this.ident = ident;
    }

    protected void setID(int id) {
      this.id = id;
    }

    protected void setKeyComparator(Class<? extends WritableComparator> cmpcl) {
      this.cmpcl = cmpcl;
    }

    abstract void parse(List<Token> args, JobConf job) throws IOException;
  }

  /**
   * Nodetype in the parse tree for "wrapped" InputFormats.
   */
  static class WNode extends Node {
    private static final Class<?>[] cstrSig =
      { Integer.TYPE, RecordReader.class, Class.class };

    static void addIdentifier(String ident,
                              Class<? extends ComposableRecordReader> cl)
        throws NoSuchMethodException {
      Node.addIdentifier(ident, cstrSig, WNode.class, cl);
    }

    private String indir;
    private InputFormat inf;

    public WNode(String ident) {
      super(ident);
    }

    /**
     * Let the first actual define the InputFormat and the second define
     * the <tt>mapred.input.dir</tt> property.
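     *
     * <p>For example, the actual list from the (illustrative) expression
     * fragment
     * <pre>{@code
     *   tbl(org.apache.hadoop.mapred.SequenceFileInputFormat,"/data/in1")
     * }</pre>
     * binds <tt>inf</tt> to SequenceFileInputFormat and <tt>indir</tt> to
     * "/data/in1".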
     */
    public void parse(List<Token> ll, JobConf job) throws IOException {
      StringBuilder sb = new StringBuilder();
      Iterator<Token> i = ll.iterator();
      while (i.hasNext()) {
        Token t = i.next();
        if (TType.COMMA.equals(t.getType())) {
          try {
            inf = (InputFormat)ReflectionUtils.newInstance(
                job.getClassByName(sb.toString()),
                job);
          } catch (ClassNotFoundException e) {
            throw (IOException)new IOException().initCause(e);
          } catch (IllegalArgumentException e) {
            throw (IOException)new IOException().initCause(e);
          }
          break;
        }
        sb.append(t.getStr());
      }
      if (!i.hasNext()) {
        throw new IOException("Parse error");
      }
      Token t = i.next();
      if (!TType.QUOT.equals(t.getType())) {
        throw new IOException("Expected quoted string");
      }
      indir = t.getStr();
      // no check for ll.isEmpty() to permit extension
    }

    private JobConf getConf(JobConf job) {
      JobConf conf = new JobConf(job);
      FileInputFormat.setInputPaths(conf, indir);
      conf.setClassLoader(job.getClassLoader());
      return conf;
    }

    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
      return inf.getSplits(getConf(job), numSplits);
    }

    public ComposableRecordReader getRecordReader(
        InputSplit split, JobConf job, Reporter reporter) throws IOException {
      try {
        if (!rrCstrMap.containsKey(ident)) {
          throw new IOException("No RecordReader for " + ident);
        }
        return rrCstrMap.get(ident).newInstance(id,
            inf.getRecordReader(split, getConf(job), reporter), cmpcl);
      } catch (IllegalAccessException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InstantiationException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InvocationTargetException e) {
        throw (IOException)new IOException().initCause(e);
      }
    }

    public String toString() {
      return ident + "(" + inf.getClass().getName() + ",\"" + indir + "\")";
    }
  }

  /**
   * Internal nodetype for "composite" InputFormats.
   */
  static class CNode extends Node {

    private static final Class<?>[] cstrSig =
      { Integer.TYPE, JobConf.class, Integer.TYPE, Class.class };

    static void addIdentifier(String ident,
                              Class<? extends ComposableRecordReader> cl)
        throws NoSuchMethodException {
      Node.addIdentifier(ident, cstrSig, CNode.class, cl);
    }

    // inst
    private ArrayList<Node> kids = new ArrayList<Node>();

    public CNode(String ident) {
      super(ident);
    }

    public void setKeyComparator(Class<? extends WritableComparator> cmpcl) {
      super.setKeyComparator(cmpcl);
      for (Node n : kids) {
        n.setKeyComparator(cmpcl);
      }
    }

    /**
     * Combine InputSplits from child InputFormats into a
     * {@link CompositeInputSplit}.
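     *
     * <p>Every child must return the same number of splits; the i-th split
     * of each child is combined into the i-th CompositeInputSplit. For
     * example, children returning splits [a0, a1] and [b0, b1] yield
     * [(a0, b0), (a1, b1)], while mismatched counts raise an IOException.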
     */
    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
      InputSplit[][] splits = new InputSplit[kids.size()][];
      for (int i = 0; i < kids.size(); ++i) {
        final InputSplit[] tmp = kids.get(i).getSplits(job, numSplits);
        if (null == tmp) {
          throw new IOException("Error gathering splits from child RReader");
        }
        if (i > 0 && splits[i-1].length != tmp.length) {
          throw new IOException("Inconsistent split cardinality from child " +
              i + " (" + splits[i-1].length + "/" + tmp.length + ")");
        }
        splits[i] = tmp;
      }
      final int size = splits[0].length;
      CompositeInputSplit[] ret = new CompositeInputSplit[size];
      for (int i = 0; i < size; ++i) {
        ret[i] = new CompositeInputSplit(splits.length);
        for (int j = 0; j < splits.length; ++j) {
          ret[i].add(splits[j][i]);
        }
      }
      return ret;
    }

    @SuppressWarnings("unchecked") // child types unknowable
    public ComposableRecordReader getRecordReader(
        InputSplit split, JobConf job, Reporter reporter) throws IOException {
      if (!(split instanceof CompositeInputSplit)) {
        throw new IOException("Invalid split type:" +
            split.getClass().getName());
      }
      final CompositeInputSplit spl = (CompositeInputSplit)split;
      final int capacity = kids.size();
      CompositeRecordReader ret = null;
      try {
        if (!rrCstrMap.containsKey(ident)) {
          throw new IOException("No RecordReader for " + ident);
        }
        ret = (CompositeRecordReader)
          rrCstrMap.get(ident).newInstance(id, job, capacity, cmpcl);
      } catch (IllegalAccessException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InstantiationException e) {
        throw (IOException)new IOException().initCause(e);
      } catch (InvocationTargetException e) {
        throw (IOException)new IOException().initCause(e);
      }
      for (int i = 0; i < capacity; ++i) {
        ret.add(kids.get(i).getRecordReader(spl.get(i), job, reporter));
      }
      return (ComposableRecordReader)ret;
    }

    /**
     * Parse a list of comma-separated nodes.
     */
    public void parse(List<Token> args, JobConf job) throws IOException {
      ListIterator<Token> i = args.listIterator();
      while (i.hasNext()) {
        Token t = i.next();
        // Nodes and commas alternate, so a node's child id is half its
        // position in the token list.
        t.getNode().setID(i.previousIndex() >> 1);
        kids.add(t.getNode());
        if (i.hasNext() && !TType.COMMA.equals(i.next().getType())) {
          throw new IOException("Expected ','");
        }
      }
    }

    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(ident + "(");
      for (Node n : kids) {
        sb.append(n.toString() + ",");
      }
      sb.setCharAt(sb.length() - 1, ')');
      return sb.toString();
    }
  }

  private static Token reduce(Stack<Token> st, JobConf job) throws IOException {
    // Pop arguments back to the matching '(', then apply the identifier
    // beneath it to produce a single node token.
    LinkedList<Token> args = new LinkedList<Token>();
    while (!st.isEmpty() && !TType.LPAREN.equals(st.peek().getType())) {
      args.addFirst(st.pop());
    }
    if (st.isEmpty()) {
      throw new IOException("Unmatched ')'");
    }
    st.pop();
    if (st.isEmpty() || !TType.IDENT.equals(st.peek().getType())) {
      throw new IOException("Identifier expected");
    }
    Node n = Node.forIdent(st.pop().getStr());
    n.parse(args, job);
    return new NodeToken(n);
  }

  /**
   * Given an expression and an optional comparator, build a tree of
   * InputFormats using the comparator to sort keys.
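   *
   * <p>A hedged driver-side sketch (paths are hypothetical; {@code compose}
   * is the public entry point on CompositeInputFormat):
   * <pre>{@code
   *   jobConf.setInputFormat(CompositeInputFormat.class);
   *   jobConf.set("mapred.join.expr", CompositeInputFormat.compose(
   *       "inner", SequenceFileInputFormat.class,
   *       new Path("/data/a"), new Path("/data/b")));
   * }</pre>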
   */
  static Node parse(String expr, JobConf job) throws IOException {
    if (null == expr) {
      throw new IOException("Expression is null");
    }
    Class<? extends WritableComparator> cmpcl = job.getClass(
        "mapred.join.keycomparator", null, WritableComparator.class);
    Lexer lex = new Lexer(expr);
    Stack<Token> st = new Stack<Token>();
    Token tok;
    while ((tok = lex.next()) != null) {
      if (TType.RPAREN.equals(tok.getType())) {
        st.push(reduce(st, job));
      } else {
        st.push(tok);
      }
    }
    if (st.size() == 1 && TType.CIF.equals(st.peek().getType())) {
      Node ret = st.pop().getNode();
      if (cmpcl != null) {
        ret.setKeyComparator(cmpcl);
      }
      return ret;
    }
    throw new IOException("Missing ')'");
  }

}
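// A hedged, end-to-end extension sketch, per the class javadoc above. All
// names below are hypothetical. A user-defined join overrides
// CompositeRecordReader#combine and maps an identifier to its reader class
// (registration is package-level here, via CNode.addIdentifier):
//
//   public class MyJoinRecordReader<K extends WritableComparable>
//       extends JoinRecordReader<K> {
//     public MyJoinRecordReader(int id, JobConf conf, int capacity,
//         Class<? extends WritableComparator> cmpcl) throws IOException {
//       super(id, conf, capacity, cmpcl);
//     }
//     protected boolean combine(Object[] srcs, TupleWritable dst) {
//       return true; // emit every joined tuple; real code would filter/merge
//     }
//   }
//   // Parser.CNode.addIdentifier("myjoin", MyJoinRecordReader.class);
//   // ...after which "myjoin(tbl(...),tbl(...))" is a valid expression.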