1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
package datafu.pig.sessions; |
17 | |
|
18 | |
import java.io.IOException; |
19 | |
import java.util.UUID; |
20 | |
|
21 | |
import org.apache.pig.Accumulator; |
22 | |
import org.apache.pig.EvalFunc; |
23 | |
import org.apache.pig.data.BagFactory; |
24 | |
import org.apache.pig.data.DataBag; |
25 | |
import org.apache.pig.data.DataType; |
26 | |
import org.apache.pig.data.Tuple; |
27 | |
import org.apache.pig.data.TupleFactory; |
28 | |
import org.apache.pig.impl.logicalLayer.FrontendException; |
29 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
30 | |
import org.joda.time.DateTime; |
31 | |
import org.joda.time.Period; |
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | 3 | public class Sessionize extends EvalFunc<DataBag> implements Accumulator<DataBag> |
64 | |
{ |
65 | |
private final long millis; |
66 | |
|
67 | |
private DataBag outputBag; |
68 | |
private DateTime last_date; |
69 | |
private String id; |
70 | |
|
71 | |
public Sessionize(String timeSpec) |
72 | 345 | { |
73 | 345 | Period p = new Period("PT" + timeSpec.toUpperCase()); |
74 | 345 | this.millis = p.toStandardSeconds().getSeconds() * 1000; |
75 | |
|
76 | 345 | cleanup(); |
77 | 345 | } |
78 | |
|
79 | |
@Override |
80 | |
public DataBag exec(Tuple input) throws IOException |
81 | |
{ |
82 | 0 | accumulate(input); |
83 | 0 | DataBag outputBag = getValue(); |
84 | 0 | cleanup(); |
85 | |
|
86 | 0 | return outputBag; |
87 | |
} |
88 | |
|
89 | |
@Override |
90 | |
public void accumulate(Tuple input) throws IOException |
91 | |
{ |
92 | 3 | for (Tuple t : (DataBag) input.get(0)) { |
93 | 17 | String timeString = (String)t.get(0); |
94 | 17 | DateTime date = new DateTime(timeString); |
95 | |
|
96 | 17 | if (this.last_date == null) |
97 | 3 | this.last_date = date; |
98 | 14 | else if (date.isAfter(this.last_date.plus(this.millis))) |
99 | 5 | this.id = UUID.randomUUID().toString(); |
100 | 9 | else if (date.isBefore(last_date)) |
101 | 0 | throw new IOException(String.format("input time series is not sorted (%s < %s)", date, last_date)); |
102 | |
|
103 | 17 | Tuple t_new = TupleFactory.getInstance().newTuple(t.getAll()); |
104 | 17 | t_new.append(this.id); |
105 | 17 | outputBag.add(t_new); |
106 | |
|
107 | 17 | this.last_date = date; |
108 | 17 | } |
109 | 3 | } |
110 | |
|
111 | |
@Override |
112 | |
public DataBag getValue() |
113 | |
{ |
114 | 3 | return outputBag; |
115 | |
} |
116 | |
|
117 | |
@Override |
118 | |
public void cleanup() |
119 | |
{ |
120 | 351 | this.last_date = null; |
121 | 351 | this.outputBag = BagFactory.getInstance().newDefaultBag(); |
122 | 351 | this.id = UUID.randomUUID().toString(); |
123 | 351 | } |
124 | |
|
125 | |
@Override |
126 | |
public Schema outputSchema(Schema input) |
127 | |
{ |
128 | |
try { |
129 | 181 | Schema.FieldSchema inputFieldSchema = input.getField(0); |
130 | |
|
131 | 181 | if (inputFieldSchema.type != DataType.BAG) |
132 | |
{ |
133 | 0 | throw new RuntimeException("Expected a BAG as input"); |
134 | |
} |
135 | |
|
136 | 181 | Schema inputBagSchema = inputFieldSchema.schema; |
137 | |
|
138 | 181 | if (inputBagSchema.getField(0).type != DataType.TUPLE) |
139 | |
{ |
140 | 0 | throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", |
141 | |
DataType.findTypeName(inputBagSchema.getField(0).type))); |
142 | |
} |
143 | |
|
144 | 181 | Schema inputTupleSchema = inputBagSchema.getField(0).schema; |
145 | |
|
146 | 181 | if (inputTupleSchema.getField(0).type != DataType.CHARARRAY) |
147 | |
{ |
148 | 0 | throw new RuntimeException(String.format("Expected first element of tuple to be a CHARARRAY, but instead found %s", |
149 | |
DataType.findTypeName(inputTupleSchema.getField(0).type))); |
150 | |
} |
151 | |
|
152 | 181 | Schema outputTupleSchema = inputTupleSchema.clone(); |
153 | 181 | outputTupleSchema.add(new Schema.FieldSchema("session_id", DataType.CHARARRAY)); |
154 | |
|
155 | 181 | return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass() |
156 | |
.getName() |
157 | |
.toLowerCase(), input), |
158 | |
outputTupleSchema, |
159 | |
DataType.BAG)); |
160 | |
} |
161 | 0 | catch (CloneNotSupportedException e) { |
162 | 0 | throw new RuntimeException(e); |
163 | |
} |
164 | 0 | catch (FrontendException e) { |
165 | 0 | throw new RuntimeException(e); |
166 | |
} |
167 | |
} |
168 | |
} |