1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package datafu.pig.bags; |
18 | |
|
19 | |
import java.io.IOException; |
20 | |
|
21 | |
import org.apache.pig.EvalFunc; |
22 | |
import org.apache.pig.data.BagFactory; |
23 | |
import org.apache.pig.data.DataBag; |
24 | |
import org.apache.pig.data.DataType; |
25 | |
import org.apache.pig.data.Tuple; |
26 | |
import org.apache.pig.data.TupleFactory; |
27 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | 3 | public class BagSplit extends EvalFunc<DataBag> |
52 | |
{ |
53 | 1 | private static final BagFactory bagFactory = BagFactory.getInstance(); |
54 | 1 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); |
55 | |
|
56 | |
private final boolean appendBagNum; |
57 | |
|
58 | |
public BagSplit() |
59 | 4011 | { |
60 | 4011 | this.appendBagNum = false; |
61 | 4011 | } |
62 | |
|
63 | |
public BagSplit(String appendBagNum) |
64 | 2012 | { |
65 | 2012 | this.appendBagNum = Boolean.parseBoolean(appendBagNum); |
66 | 2012 | } |
67 | |
|
68 | |
@Override |
69 | |
public DataBag exec(Tuple arg0) throws IOException |
70 | |
{ |
71 | 3 | DataBag outputBag = bagFactory.newDefaultBag(); |
72 | |
|
73 | 3 | Integer maxSize = (Integer)arg0.get(0); |
74 | |
|
75 | 3 | Object o = arg0.get(1); |
76 | 3 | if (!(o instanceof DataBag)) |
77 | 0 | throw new RuntimeException("parameter must be a databag"); |
78 | |
|
79 | 3 | DataBag inputBag = (DataBag)o; |
80 | |
|
81 | 3 | DataBag currentBag = null; |
82 | |
|
83 | 3 | int count = 0; |
84 | 3 | int numBags = 0; |
85 | 3 | for (Tuple tuple : inputBag) |
86 | |
{ |
87 | 29 | if (currentBag == null) |
88 | |
{ |
89 | 7 | currentBag = bagFactory.newDefaultBag(); |
90 | |
} |
91 | |
|
92 | 29 | currentBag.add(tuple); |
93 | 29 | count++; |
94 | |
|
95 | 29 | if (count >= maxSize) |
96 | |
{ |
97 | 4 | Tuple newTuple = tupleFactory.newTuple(); |
98 | 4 | newTuple.append(currentBag); |
99 | |
|
100 | 4 | if (this.appendBagNum) |
101 | |
{ |
102 | 1 | newTuple.append(numBags); |
103 | |
} |
104 | |
|
105 | 4 | numBags++; |
106 | |
|
107 | 4 | outputBag.add(newTuple); |
108 | |
|
109 | 4 | count = 0; |
110 | 4 | currentBag = null; |
111 | 29 | } |
112 | |
} |
113 | |
|
114 | 3 | if (currentBag != null) |
115 | |
{ |
116 | 3 | Tuple newTuple = tupleFactory.newTuple(); |
117 | 3 | newTuple.append(currentBag); |
118 | 3 | if (this.appendBagNum) |
119 | |
{ |
120 | 1 | newTuple.append(numBags); |
121 | |
} |
122 | 3 | outputBag.add(newTuple); |
123 | |
} |
124 | |
|
125 | 3 | return outputBag; |
126 | |
} |
127 | |
|
128 | |
@Override |
129 | |
public Schema outputSchema(Schema input) |
130 | |
{ |
131 | |
try { |
132 | 3200 | if (input.getField(0).type != DataType.INTEGER) |
133 | |
{ |
134 | 0 | throw new RuntimeException("Expected first argument to be an INTEGER"); |
135 | |
} |
136 | |
|
137 | 3200 | if (input.getField(1).type != DataType.BAG) |
138 | |
{ |
139 | 0 | throw new RuntimeException("Expected second argument to be a BAG"); |
140 | |
} |
141 | |
|
142 | 3200 | Schema tupleSchema = new Schema(); |
143 | 3200 | tupleSchema.add(new Schema.FieldSchema("data", input.getField(1).schema.clone(), DataType.BAG)); |
144 | |
|
145 | 3200 | if (this.appendBagNum) |
146 | |
{ |
147 | 1069 | tupleSchema.add(new Schema.FieldSchema("index", DataType.INTEGER)); |
148 | |
} |
149 | |
|
150 | 3200 | return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), |
151 | |
tupleSchema, DataType.BAG)); |
152 | |
} |
153 | 0 | catch (Exception e) { |
154 | 0 | throw new RuntimeException(e); |
155 | |
} |
156 | |
} |
157 | |
} |