001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.mapred.join; 020 021 import java.io.IOException; 022 import java.util.ArrayList; 023 import java.util.Map; 024 import java.util.regex.Matcher; 025 import java.util.regex.Pattern; 026 027 import org.apache.hadoop.classification.InterfaceAudience; 028 import org.apache.hadoop.classification.InterfaceStability; 029 import org.apache.hadoop.fs.Path; 030 import org.apache.hadoop.io.WritableComparable; 031 import org.apache.hadoop.mapred.InputFormat; 032 import org.apache.hadoop.mapred.InputSplit; 033 import org.apache.hadoop.mapred.JobConf; 034 import org.apache.hadoop.mapred.Reporter; 035 036 /** 037 * An InputFormat capable of performing joins over a set of data sources sorted 038 * and partitioned the same way. 039 * @see #setFormat 040 * 041 * A user may define new join types by setting the property 042 * <tt>mapred.join.define.<ident></tt> to a classname. In the expression 043 * <tt>mapred.join.expr</tt>, the identifier will be assumed to be a 044 * ComposableRecordReader. 045 * <tt>mapred.join.keycomparator</tt> can be a classname used to compare keys 046 * in the join. 047 * @see JoinRecordReader 048 * @see MultiFilterRecordReader 049 * @deprecated Use 050 * {@link org.apache.hadoop.mapreduce.lib.join.CompositeInputFormat} instead 051 */ 052 @Deprecated 053 @InterfaceAudience.Public 054 @InterfaceStability.Stable 055 public class CompositeInputFormat<K extends WritableComparable> 056 implements ComposableInputFormat<K,TupleWritable> { 057 058 // expression parse tree to which IF requests are proxied 059 private Parser.Node root; 060 061 public CompositeInputFormat() { } 062 063 064 /** 065 * Interpret a given string as a composite expression. 066 * {@code 067 * func ::= <ident>([<func>,]*<func>) 068 * func ::= tbl(<class>,"<path>") 069 * class ::= @see java.lang.Class#forName(java.lang.String) 070 * path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) 071 * } 072 * Reads expression from the <tt>mapred.join.expr</tt> property and 073 * user-supplied join types from <tt>mapred.join.define.<ident></tt> 074 * types. Paths supplied to <tt>tbl</tt> are given as input paths to the 075 * InputFormat class listed. 076 * @see #compose(java.lang.String, java.lang.Class, java.lang.String...) 077 */ 078 public void setFormat(JobConf job) throws IOException { 079 addDefaults(); 080 addUserIdentifiers(job); 081 root = Parser.parse(job.get("mapred.join.expr", null), job); 082 } 083 084 /** 085 * Adds the default set of identifiers to the parser. 086 */ 087 protected void addDefaults() { 088 try { 089 Parser.CNode.addIdentifier("inner", InnerJoinRecordReader.class); 090 Parser.CNode.addIdentifier("outer", OuterJoinRecordReader.class); 091 Parser.CNode.addIdentifier("override", OverrideRecordReader.class); 092 Parser.WNode.addIdentifier("tbl", WrappedRecordReader.class); 093 } catch (NoSuchMethodException e) { 094 throw new RuntimeException("FATAL: Failed to init defaults", e); 095 } 096 } 097 098 /** 099 * Inform the parser of user-defined types. 100 */ 101 private void addUserIdentifiers(JobConf job) throws IOException { 102 Pattern x = Pattern.compile("^mapred\\.join\\.define\\.(\\w+)$"); 103 for (Map.Entry<String,String> kv : job) { 104 Matcher m = x.matcher(kv.getKey()); 105 if (m.matches()) { 106 try { 107 Parser.CNode.addIdentifier(m.group(1), 108 job.getClass(m.group(0), null, ComposableRecordReader.class)); 109 } catch (NoSuchMethodException e) { 110 throw (IOException)new IOException( 111 "Invalid define for " + m.group(1)).initCause(e); 112 } 113 } 114 } 115 } 116 117 /** 118 * Build a CompositeInputSplit from the child InputFormats by assigning the 119 * ith split from each child to the ith composite split. 120 */ 121 public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { 122 setFormat(job); 123 job.setLong("mapred.min.split.size", Long.MAX_VALUE); 124 return root.getSplits(job, numSplits); 125 } 126 127 /** 128 * Construct a CompositeRecordReader for the children of this InputFormat 129 * as defined in the init expression. 130 * The outermost join need only be composable, not necessarily a composite. 131 * Mandating TupleWritable isn't strictly correct. 132 */ 133 @SuppressWarnings("unchecked") // child types unknown 134 public ComposableRecordReader<K,TupleWritable> getRecordReader( 135 InputSplit split, JobConf job, Reporter reporter) throws IOException { 136 setFormat(job); 137 return root.getRecordReader(split, job, reporter); 138 } 139 140 /** 141 * Convenience method for constructing composite formats. 142 * Given InputFormat class (inf), path (p) return: 143 * {@code tbl(<inf>, <p>) } 144 */ 145 public static String compose(Class<? extends InputFormat> inf, String path) { 146 return compose(inf.getName().intern(), path, new StringBuffer()).toString(); 147 } 148 149 /** 150 * Convenience method for constructing composite formats. 151 * Given operation (op), Object class (inf), set of paths (p) return: 152 * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } 153 */ 154 public static String compose(String op, Class<? extends InputFormat> inf, 155 String... path) { 156 final String infname = inf.getName(); 157 StringBuffer ret = new StringBuffer(op + '('); 158 for (String p : path) { 159 compose(infname, p, ret); 160 ret.append(','); 161 } 162 ret.setCharAt(ret.length() - 1, ')'); 163 return ret.toString(); 164 } 165 166 /** 167 * Convenience method for constructing composite formats. 168 * Given operation (op), Object class (inf), set of paths (p) return: 169 * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } 170 */ 171 public static String compose(String op, Class<? extends InputFormat> inf, 172 Path... path) { 173 ArrayList<String> tmp = new ArrayList<String>(path.length); 174 for (Path p : path) { 175 tmp.add(p.toString()); 176 } 177 return compose(op, inf, tmp.toArray(new String[0])); 178 } 179 180 private static StringBuffer compose(String inf, String path, 181 StringBuffer sb) { 182 sb.append("tbl(" + inf + ",\""); 183 sb.append(path); 184 sb.append("\")"); 185 return sb; 186 } 187 188 }