Coverage Report - datafu.pig.bags.sets.SetIntersect
 
Classes in this File Line Coverage Branch Coverage Complexity
SetIntersect
96%
32/33
93%
15/16
3.4
SetIntersect$pair
100%
6/6
N/A
3.4
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.bags.sets;
 18  
 
 19  
 import java.io.IOException;
 20  
 import java.util.Iterator;
 21  
 import java.util.PriorityQueue;
 22  
 
 23  
 import org.apache.pig.data.BagFactory;
 24  
 import org.apache.pig.data.DataBag;
 25  
 import org.apache.pig.data.Tuple;
 26  
 import org.apache.pig.data.TupleFactory;
 27  
 
 28  
 /**
 29  
  * Computes the set intersection of two or more bags.  Duplicates are eliminated. <b>The input bags must be sorted.</b>
 30  
  * <p>
 31  
  * Example:
 32  
  * <pre>
 33  
  * {@code
 34  
  * define SetIntersect datafu.pig.bags.sets.SetIntersect();
 35  
  * 
 36  
  * -- input:
 37  
  * -- ({(1,10),(2,20),(3,30),(4,40)},{(2,20),(4,40),(8,80)})
 38  
  * input = LOAD 'input' AS (B1:bag{T:tuple(val1:int,val2:int)},B2:bag{T:tuple(val1:int,val2:int)});
 39  
  *
 40  
  * input = FOREACH input {
 41  
  *   B1 = ORDER B1 BY val1 ASC, val2 ASC;
 42  
  *   B2 = ORDER B2 BY val1 ASC, val2 ASC;
 43  
  *
 44  
  *   -- output:
 45  
  *   -- ({(2,20),(4,40)})
 46  
  *   GENERATE SetIntersect(B1,B2);
 47  
  * }
 48  
  * }</pre>
 49  
  */
 50  426
 public class SetIntersect extends SetOperationsBase
 51  
 {
 52  1
   private static final BagFactory bagFactory = BagFactory.getInstance();
 53  1
   private static final TupleFactory tupleFactory = TupleFactory.getInstance();
 54  
 
 55  18
   class pair implements Comparable<pair>
 56  
   {
 57  
     final Iterator<Tuple> it;
 58  
     Tuple data;
 59  
 
 60  
     pair(Iterator<Tuple> it)
 61  6
     {
 62  6
       this.it = it;
 63  6
       this.data = it.next();
 64  6
     }
 65  
     
 66  
     @Override
 67  
     public int compareTo(pair o)
 68  
     {
 69  18
       return this.data.compareTo(o.data);
 70  
     }
 71  
   }
 72  
 
 73  
   private PriorityQueue<pair> load_bags(Tuple input) throws IOException
 74  
   {
 75  3
     PriorityQueue<pair> pq = new PriorityQueue<pair>(input.size());
 76  
 
 77  9
     for (int i=0; i < input.size(); i++) {
 78  6
       Object o = input.get(i);
 79  6
       if (!(o instanceof DataBag))
 80  0
         throw new RuntimeException("parameters must be databags");
 81  6
       DataBag inputBag = (DataBag) o;
 82  6
       pq.add(new pair(inputBag.iterator()));
 83  
     }
 84  3
     return pq;
 85  
   }
 86  
 
 87  
   public boolean all_equal(PriorityQueue<pair> pq)
 88  
   {
 89  14
     Object o = pq.peek().data;
 90  14
     for (pair p : pq) {
 91  28
       if (!o.equals(p.data))
 92  10
         return false;
 93  
     }
 94  4
     return true;
 95  
   }
 96  
   
 97  
   @Override
 98  
   public DataBag exec(Tuple input) throws IOException
 99  
   {
 100  3
     DataBag outputBag = bagFactory.newDefaultBag();
 101  3
     PriorityQueue<pair> pq = load_bags(input);
 102  3
     Tuple last_data = null;
 103  
     
 104  
     while (true) {
 105  18
       if (pq.peek().data.compareTo(last_data) != 0 && all_equal(pq)) {
 106  4
         last_data = pq.peek().data;
 107  4
         outputBag.add(last_data);
 108  
       }
 109  
             
 110  18
       pair p = pq.poll();
 111  18
       if (!p.it.hasNext())
 112  2
         break;
 113  16
       Tuple nextData = p.it.next();
 114  
       // algorithm assumes data is in order
 115  16
       if (p.data.compareTo(nextData) > 0)
 116  
       {
 117  1
         throw new RuntimeException("Out of order!");
 118  
       }
 119  15
       p.data = nextData;
 120  15
       pq.offer(p);
 121  15
     }
 122  
 
 123  2
     return outputBag;
 124  
   }
 125  
 }
 126  
 
 127