1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package datafu.pig.bags; |
18 | |
|
19 | |
import java.io.IOException; |
20 | |
|
21 | |
import org.apache.pig.EvalFunc; |
22 | |
import org.apache.pig.data.BagFactory; |
23 | |
import org.apache.pig.data.DataBag; |
24 | |
import org.apache.pig.data.DataType; |
25 | |
import org.apache.pig.data.Tuple; |
26 | |
import org.apache.pig.data.TupleFactory; |
27 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
28 | |
import org.apache.pig.impl.util.WrappedIOException; |
29 | |
import org.apache.pig.tools.pigstats.PigStatusReporter; |
30 | |
|
31 | |
import com.google.common.collect.ImmutableList; |
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | 4081 | public class UnorderedPairs extends EvalFunc<DataBag> |
52 | |
{ |
53 | 1 | private static final BagFactory bagFactory = BagFactory.getInstance(); |
54 | 1 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); |
55 | |
|
56 | |
@Override |
57 | |
public DataBag exec(Tuple input) throws IOException |
58 | |
{ |
59 | 3 | PigStatusReporter reporter = PigStatusReporter.getInstance(); |
60 | |
|
61 | |
try { |
62 | 3 | DataBag inputBag = (DataBag) input.get(0); |
63 | 3 | DataBag outputBag = bagFactory.newDefaultBag(); |
64 | 3 | long i=0, j, cnt=0; |
65 | |
|
66 | 3 | if (inputBag != null) |
67 | |
{ |
68 | 3 | for (Tuple elem1 : inputBag) { |
69 | 15 | j = 0; |
70 | 15 | for (Tuple elem2 : inputBag) { |
71 | 75 | if (j > i) { |
72 | 30 | outputBag.add(tupleFactory.newTuple(ImmutableList.of(elem1, elem2))); |
73 | 30 | cnt++; |
74 | |
} |
75 | 75 | j++; |
76 | |
|
77 | 75 | if (reporter != null) |
78 | 75 | reporter.progress(); |
79 | |
|
80 | 75 | if (cnt % 1000000 == 0) { |
81 | 3 | outputBag.spill(); |
82 | 3 | cnt = 0; |
83 | |
} |
84 | |
} |
85 | 15 | i++; |
86 | |
} |
87 | |
} |
88 | |
|
89 | 3 | return outputBag; |
90 | |
} |
91 | 0 | catch (Exception e) { |
92 | 0 | throw WrappedIOException.wrap("Caught exception processing input of " + this.getClass().getName(), e); |
93 | |
} |
94 | |
} |
95 | |
|
96 | |
@Override |
97 | |
public Schema outputSchema(Schema input) |
98 | |
{ |
99 | |
try { |
100 | 2128 | if (input.size() != 1) |
101 | |
{ |
102 | 0 | throw new RuntimeException("Expected input to have only a single field"); |
103 | |
} |
104 | |
|
105 | 2128 | Schema.FieldSchema inputFieldSchema = input.getField(0); |
106 | |
|
107 | 2128 | if (inputFieldSchema.type != DataType.BAG) |
108 | |
{ |
109 | 0 | throw new RuntimeException("Expected a BAG as input"); |
110 | |
} |
111 | |
|
112 | 2128 | Schema inputBagSchema = inputFieldSchema.schema; |
113 | |
|
114 | 2128 | if (inputBagSchema.getField(0).type != DataType.TUPLE) |
115 | |
{ |
116 | 0 | throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", |
117 | |
DataType.findTypeName(inputBagSchema.getField(0).type))); |
118 | |
} |
119 | |
|
120 | 2128 | Schema ouputTupleSchema = new Schema(); |
121 | 2128 | ouputTupleSchema.add(new Schema.FieldSchema("elem1", inputBagSchema.getField(0).schema.clone(), DataType.TUPLE)); |
122 | 2128 | ouputTupleSchema.add(new Schema.FieldSchema("elem2", inputBagSchema.getField(0).schema.clone(), DataType.TUPLE)); |
123 | 2128 | return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), |
124 | |
ouputTupleSchema, |
125 | |
DataType.BAG)); |
126 | |
} |
127 | 0 | catch (Exception e) { |
128 | 0 | return null; |
129 | |
} |
130 | |
} |
131 | |
} |
132 | |
|