Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
SetUnion |
|
| 9.0;9 |
1 | /* | |
2 | * Copyright 2010 LinkedIn, Inc | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not | |
5 | * use this file except in compliance with the License. You may obtain a copy of | |
6 | * the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
13 | * License for the specific language governing permissions and limitations under | |
14 | * the License. | |
15 | */ | |
16 | ||
17 | package datafu.pig.bags.sets; | |
18 | ||
19 | import java.io.IOException; | |
20 | import java.util.HashSet; | |
21 | import java.util.Set; | |
22 | ||
23 | import org.apache.pig.data.BagFactory; | |
24 | import org.apache.pig.data.DataBag; | |
25 | import org.apache.pig.data.Tuple; | |
26 | import org.apache.pig.data.TupleFactory; | |
27 | ||
28 | /** | |
29 | * Computes the set union of two or more bags. Duplicates are eliminated. | |
30 | * <p> | |
31 | * Example: | |
32 | * <pre> | |
33 | * {@code | |
34 | * define SetUnion datafu.pig.bags.sets.SetUnion(); | |
35 | * | |
36 | * -- input: | |
37 | * -- ({(2,20),(3,30),(4,40)},{(1,10),(2,20),(4,40),(8,80)}) | |
38 | * input = LOAD 'input' AS (B1:bag{T:tuple(val1:int,val2:int)},B2:bag{T:tuple(val1:int,val2:int)}); | |
39 | * | |
40 | * -- output: | |
41 | * -- ({(2,20),(3,30),(4,40),(1,10),(8,80)}) | |
42 | * output = FOREACH input GENERATE SetUnion(B1,B2); | |
43 | * } | |
44 | * </pre> | |
45 | */ | |
46 | 214 | public class SetUnion extends SetOperationsBase |
47 | { | |
48 | 1 | private static final BagFactory bagFactory = BagFactory.getInstance(); |
49 | 1 | private static final TupleFactory tupleFactory = TupleFactory.getInstance(); |
50 | ||
51 | @Override | |
52 | public DataBag exec(Tuple input) throws IOException | |
53 | { | |
54 | 4 | Set<Object> seen = new HashSet<Object>(); |
55 | 4 | DataBag outputBag = bagFactory.newDefaultBag(); |
56 | ||
57 | try { | |
58 | 12 | for (int i=0; i < input.size(); i++) { |
59 | 8 | Object o = input.get(i); |
60 | 8 | if (!(o instanceof DataBag)) |
61 | 0 | throw new RuntimeException("parameters must be databags"); |
62 | ||
63 | 8 | DataBag inputBag = (DataBag) o; |
64 | 8 | for (Tuple elem : inputBag) { |
65 | 55 | if (!seen.contains(elem)) { |
66 | 38 | outputBag.add(elem); |
67 | 38 | seen.add(elem); |
68 | } | |
69 | } | |
70 | } | |
71 | ||
72 | 4 | return outputBag; |
73 | } | |
74 | 0 | catch (Exception e) { |
75 | 0 | throw new IOException(e); |
76 | } | |
77 | } | |
78 | } | |
79 | ||
80 |