1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package datafu.pig.stats; |
18 | |
|
19 | |
import java.io.IOException; |
20 | |
import java.util.ArrayList; |
21 | |
|
22 | |
import org.apache.pig.EvalFunc; |
23 | |
import org.apache.pig.backend.executionengine.ExecException; |
24 | |
import org.apache.pig.data.BagFactory; |
25 | |
import org.apache.pig.data.DataBag; |
26 | |
import org.apache.pig.data.DataType; |
27 | |
import org.apache.pig.data.Tuple; |
28 | |
import org.apache.pig.data.TupleFactory; |
29 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
30 | |
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; |
31 | |
|
32 | |
import com.google.common.collect.ImmutableList; |
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | 3 | public class MarkovPairs extends EvalFunc<DataBag> |
47 | |
{ |
48 | 1 | private static final BagFactory bagFactory = BagFactory.getInstance(); |
49 | 1 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); |
50 | |
|
51 | |
private static long lookahead_steps; |
52 | |
|
53 | 1620 | private final int SPILL_THRESHOLD = 1000000; |
54 | |
|
55 | |
public MarkovPairs() |
56 | 1080 | { |
57 | 1080 | MarkovPairs.lookahead_steps = 1; |
58 | 1080 | } |
59 | |
|
60 | |
public MarkovPairs(String lookahead_steps) |
61 | 540 | { |
62 | 540 | MarkovPairs.lookahead_steps = Integer.valueOf(lookahead_steps); |
63 | 540 | } |
64 | |
|
65 | |
|
66 | |
private void generatePairs(ArrayList<Tuple> input, int start, int end, DataBag outputBag) |
67 | |
throws ExecException |
68 | |
{ |
69 | 3 | int count = 0; |
70 | 17 | for (int i = start; (i + 1)<= end; i++) { |
71 | 14 | Tuple elem1 = input.get(i); |
72 | |
|
73 | |
lookahead: |
74 | 33 | for (int j = i+1; j <= i + lookahead_steps; j++) |
75 | |
{ |
76 | 21 | if (j > end) break lookahead; |
77 | 19 | Tuple elem2 = input.get(j); |
78 | 19 | if (count >= SPILL_THRESHOLD) { |
79 | 0 | outputBag.spill(); |
80 | 0 | count = 0; |
81 | |
} |
82 | 19 | outputBag.add(tupleFactory.newTuple(ImmutableList.of(elem1, elem2))); |
83 | 19 | count ++; |
84 | |
} |
85 | |
} |
86 | 3 | } |
87 | |
|
88 | |
@Override |
89 | |
public DataBag exec(Tuple input) |
90 | |
throws IOException |
91 | |
{ |
92 | |
|
93 | |
|
94 | 3 | DataBag inputBag = (DataBag) input.get(0); |
95 | 3 | ArrayList<Tuple> inputData = new ArrayList<Tuple>(); |
96 | |
|
97 | 3 | for (Tuple tuple : inputBag) { |
98 | 17 | inputData.add(tuple); |
99 | |
} |
100 | |
|
101 | 3 | int inputSize = inputData.size(); |
102 | |
|
103 | |
try { |
104 | 3 | DataBag outputBag = bagFactory.newDefaultBag(); |
105 | |
|
106 | 3 | int startPos = 0; |
107 | |
|
108 | 3 | int stopPos = inputSize - 1; |
109 | 3 | generatePairs(inputData, startPos, stopPos, outputBag); |
110 | |
|
111 | |
|
112 | 3 | startPos = stopPos + 1; |
113 | 3 | return outputBag; |
114 | |
} |
115 | 0 | catch (Exception e) { |
116 | 0 | throw new IOException(e); |
117 | |
} |
118 | |
} |
119 | |
|
120 | |
|
121 | |
@Override |
122 | |
public Schema outputSchema(Schema input) |
123 | |
{ |
124 | |
try { |
125 | 222 | Schema tupleSchema = new Schema(); |
126 | |
|
127 | 222 | FieldSchema fieldSchema = input.getField(0); |
128 | |
|
129 | 222 | if (fieldSchema.type != DataType.BAG) |
130 | |
{ |
131 | 0 | throw new RuntimeException(String.format("Expected input schema to be BAG, but instead found %s", |
132 | |
DataType.findTypeName(fieldSchema.type))); |
133 | |
} |
134 | |
|
135 | 222 | FieldSchema fieldSchema2 = fieldSchema.schema.getField(0); |
136 | |
|
137 | 222 | tupleSchema.add(new Schema.FieldSchema("elem1", fieldSchema2.schema)); |
138 | 222 | tupleSchema.add(new Schema.FieldSchema("elem2", fieldSchema2.schema)); |
139 | 222 | return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), |
140 | |
tupleSchema, |
141 | |
DataType.BAG)); |
142 | |
} |
143 | 0 | catch (Exception e) { |
144 | 0 | return null; |
145 | |
} |
146 | |
} |
147 | |
|
148 | |
} |