1 | |
package datafu.pig.bags; |
2 | |
|
3 | |
import java.io.IOException; |
4 | |
import java.util.HashSet; |
5 | |
|
6 | |
import org.apache.pig.EvalFunc; |
7 | |
import org.apache.pig.backend.executionengine.ExecException; |
8 | |
import org.apache.pig.data.BagFactory; |
9 | |
import org.apache.pig.data.DataBag; |
10 | |
import org.apache.pig.data.DataType; |
11 | |
import org.apache.pig.data.Tuple; |
12 | |
import org.apache.pig.impl.logicalLayer.FrontendException; |
13 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
|
23 | |
|
24 | |
|
25 | |
|
26 | |
|
27 | |
|
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | 1 | public class DistinctBy extends EvalFunc<DataBag> |
43 | |
{ |
44 | |
private final static String delimiter = "-"; |
45 | 1069 | private HashSet<Integer> fields = new HashSet<Integer>(); |
46 | |
|
47 | |
public DistinctBy(String... fields) |
48 | 1069 | { |
49 | 2138 | for(String field : fields) { |
50 | 1069 | this.fields.add(Integer.parseInt(field)); |
51 | |
} |
52 | 1069 | } |
53 | |
|
54 | |
|
55 | |
@Override |
56 | |
public DataBag exec(Tuple input) throws IOException |
57 | |
{ |
58 | 1 | if (input.size() != 1) { |
59 | 0 | throw new RuntimeException("Expected input to have only a single field"); |
60 | |
} |
61 | 1 | if (input.getType(0) != DataType.BAG) { |
62 | 0 | throw new RuntimeException("Expected a BAG as input"); |
63 | |
} |
64 | |
|
65 | 1 | HashSet<String> seen = new HashSet<String>(); |
66 | |
|
67 | 1 | DataBag inputBag = (DataBag)input.get(0); |
68 | 1 | DataBag outputBag = BagFactory.getInstance().newDefaultBag(); |
69 | 1 | for (Tuple t : inputBag) { |
70 | 8 | String distinctString = getDistinctString(t, this.fields); |
71 | 8 | if (!seen.contains(distinctString)) { |
72 | 6 | outputBag.add(t); |
73 | 6 | seen.add(distinctString); |
74 | |
} |
75 | 8 | } |
76 | 1 | return outputBag; |
77 | |
} |
78 | |
|
79 | |
@Override |
80 | |
public Schema outputSchema(Schema input) |
81 | |
{ |
82 | |
try { |
83 | 206 | if (input.size() != 1) |
84 | |
{ |
85 | 0 | throw new RuntimeException("Expected input to have only a single field"); |
86 | |
} |
87 | |
|
88 | 206 | Schema.FieldSchema inputFieldSchema = input.getField(0); |
89 | |
|
90 | 206 | if (inputFieldSchema.type != DataType.BAG) |
91 | |
{ |
92 | 0 | throw new RuntimeException("Expected a BAG as input"); |
93 | |
} |
94 | |
|
95 | 206 | Schema inputBagSchema = inputFieldSchema.schema; |
96 | |
|
97 | 206 | if (inputBagSchema.getField(0).type != DataType.TUPLE) |
98 | |
{ |
99 | 0 | throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", |
100 | |
DataType.findTypeName(inputBagSchema.getField(0).type))); |
101 | |
} |
102 | |
|
103 | 206 | Schema inputTupleSchema = inputBagSchema.getField(0).schema; |
104 | |
|
105 | 206 | Schema outputTupleSchema = inputTupleSchema.clone(); |
106 | |
|
107 | 206 | return new Schema(new Schema.FieldSchema( |
108 | |
getSchemaName(this.getClass().getName().toLowerCase(), input), |
109 | |
outputTupleSchema, |
110 | |
DataType.BAG)); |
111 | |
} |
112 | 0 | catch (CloneNotSupportedException e) { |
113 | 0 | throw new RuntimeException(e); |
114 | |
} |
115 | 0 | catch (FrontendException e) { |
116 | 0 | throw new RuntimeException(e); |
117 | |
} |
118 | |
} |
119 | |
|
120 | |
private String getDistinctString(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException { |
121 | 8 | String[] tokens = t.toDelimitedString(delimiter).split(delimiter); |
122 | 8 | StringBuffer buffer = new StringBuffer(); |
123 | 32 | for(int i=0; i<tokens.length; i++) { |
124 | 24 | if (distinctFieldPositions.contains(i)) { |
125 | 8 | buffer.append(tokens[i]); |
126 | 8 | buffer.append(delimiter); |
127 | |
} |
128 | |
} |
129 | 8 | buffer.substring(0, buffer.length() - delimiter.length()); |
130 | 8 | return buffer.toString(); |
131 | |
} |
132 | |
|
133 | |
} |