Coverage Report - datafu.pig.bags.UnorderedPairs
 
Classes in this File Line Coverage Branch Coverage Complexity
UnorderedPairs
81%
31/38
72%
13/18
9.5
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.bags;
 18  
 
 19  
 import java.io.IOException;
 20  
 
 21  
 import org.apache.pig.EvalFunc;
 22  
 import org.apache.pig.data.BagFactory;
 23  
 import org.apache.pig.data.DataBag;
 24  
 import org.apache.pig.data.DataType;
 25  
 import org.apache.pig.data.Tuple;
 26  
 import org.apache.pig.data.TupleFactory;
 27  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 28  
 import org.apache.pig.impl.util.WrappedIOException;
 29  
 import org.apache.pig.tools.pigstats.PigStatusReporter;
 30  
 
 31  
 import com.google.common.collect.ImmutableList;
 32  
 
 33  
 /**
 34  
  * Generates pairs of all items in a bag.
 35  
  * <p>
 36  
  * Example:
 37  
  * <pre>
 38  
  * {@code
 39  
  * define UnorderedPairs datafu.pig.bags.UnorderedPairs();
 40  
  * 
 41  
  * -- input:
 42  
  * -- ({(1),(2),(3),(4)})
 43  
  * input = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
 44  
  * 
 45  
  * -- output:
 46  
  * -- ({((1),(2)),((1),(3)),((1),(4)),((2),(3)),((2),(4)),((3),(4))})
 47  
  * output = FOREACH input GENERATE UnorderedPairs(B);
 48  
  * } 
 49  
  * </pre>
 50  
  */
 51  4081
 public class UnorderedPairs extends EvalFunc<DataBag>
 52  
 {
 53  1
   private static final BagFactory bagFactory = BagFactory.getInstance();
 54  1
   private static final TupleFactory tupleFactory = TupleFactory.getInstance();
 55  
 
 56  
   @Override
 57  
   public DataBag exec(Tuple input) throws IOException
 58  
   {
 59  3
     PigStatusReporter reporter = PigStatusReporter.getInstance();
 60  
 
 61  
     try {
 62  3
       DataBag inputBag = (DataBag) input.get(0);
 63  3
       DataBag outputBag = bagFactory.newDefaultBag();
 64  3
       long i=0, j, cnt=0;
 65  
 
 66  3
       if (inputBag != null)
 67  
       {
 68  3
         for (Tuple elem1 : inputBag) {
 69  15
           j = 0; 
 70  15
           for (Tuple elem2 : inputBag) {
 71  75
             if (j > i) {
 72  30
               outputBag.add(tupleFactory.newTuple(ImmutableList.of(elem1, elem2)));
 73  30
               cnt++;
 74  
             }
 75  75
             j++;
 76  
   
 77  75
             if (reporter != null)
 78  75
               reporter.progress();
 79  
   
 80  75
             if (cnt % 1000000 == 0) {
 81  3
               outputBag.spill();
 82  3
               cnt = 0;
 83  
             }
 84  
           }
 85  15
           i++;
 86  
         }
 87  
       }
 88  
       
 89  3
       return outputBag;
 90  
     }
 91  0
     catch (Exception e) {
 92  0
       throw WrappedIOException.wrap("Caught exception processing input of " + this.getClass().getName(), e);
 93  
     }
 94  
   }
 95  
 
 96  
   @Override
 97  
   public Schema outputSchema(Schema input)
 98  
   {
 99  
     try {
 100  2128
       if (input.size() != 1)
 101  
       {
 102  0
         throw new RuntimeException("Expected input to have only a single field");
 103  
       }
 104  
       
 105  2128
       Schema.FieldSchema inputFieldSchema = input.getField(0);
 106  
 
 107  2128
       if (inputFieldSchema.type != DataType.BAG)
 108  
       {
 109  0
         throw new RuntimeException("Expected a BAG as input");
 110  
       }
 111  
       
 112  2128
       Schema inputBagSchema = inputFieldSchema.schema;
 113  
 
 114  2128
       if (inputBagSchema.getField(0).type != DataType.TUPLE)
 115  
       {
 116  0
         throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
 117  
                                                  DataType.findTypeName(inputBagSchema.getField(0).type)));
 118  
       }      
 119  
       
 120  2128
       Schema ouputTupleSchema = new Schema();
 121  2128
       ouputTupleSchema.add(new Schema.FieldSchema("elem1", inputBagSchema.getField(0).schema.clone(), DataType.TUPLE));
 122  2128
       ouputTupleSchema.add(new Schema.FieldSchema("elem2", inputBagSchema.getField(0).schema.clone(), DataType.TUPLE));
 123  2128
       return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
 124  
                                                ouputTupleSchema, 
 125  
                                                DataType.BAG));
 126  
     }
 127  0
     catch (Exception e) {
 128  0
       return null;
 129  
     }
 130  
   }
 131  
 }
 132