Coverage Report - datafu.pig.bags.BagSplit
 
Classes in this File Line Coverage Branch Coverage Complexity
BagSplit
90%
46/51
80%
16/20
5
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.bags;
 18  
 
 19  
 import java.io.IOException;
 20  
 
 21  
 import org.apache.pig.EvalFunc;
 22  
 import org.apache.pig.data.BagFactory;
 23  
 import org.apache.pig.data.DataBag;
 24  
 import org.apache.pig.data.DataType;
 25  
 import org.apache.pig.data.Tuple;
 26  
 import org.apache.pig.data.TupleFactory;
 27  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 28  
 
 29  
 /**
 30  
  * Splits a bag of tuples into a bag of bags, where the inner bags collectively contain
 31  
  * the tuples from the original bag.  This can be used to split a bag into a set of smaller bags.
 32  
  * <p>
 33  
  * Example:
 34  
  * <pre>
 35  
  * {@code
 36  
  * define BagSplit datafu.pig.bags.BagSplit();
 37  
  * 
 38  
  * -- input:
 39  
  * -- ({(1),(2),(3),(4),(5),(6),(7)})
 40  
  * -- ({(1),(2),(3),(4),(5)})
 41  
  * -- ({(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11)})
 42  
  * input = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
 43  
  * 
 44  
  * -- output:
 45  
  * -- ({{(1),(2),(3),(4),(5)},{(6),(7)}})
  * -- ({{(1),(2),(3),(4),(5)}})
 46  
  * -- ({{(1),(2),(3),(4),(5)},{(6),(7),(8),(9),(10)},{(11)}})
 47  
  * output = FOREACH input GENERATE BagSplit(5,B);
 48  
  * }
 49  
  * </pre>
 50  
  */
 51  3
 public class BagSplit extends EvalFunc<DataBag>
 52  
 {
 53  1
   private static final BagFactory bagFactory = BagFactory.getInstance();
 54  1
   private static final TupleFactory tupleFactory = TupleFactory.getInstance();
 55  
     
 56  
   private final boolean appendBagNum;
 57  
   
 58  
   public BagSplit()
 59  4011
   {    
 60  4011
     this.appendBagNum = false;
 61  4011
   }
 62  
   
 63  
   public BagSplit(String appendBagNum)
 64  2012
   {
 65  2012
     this.appendBagNum = Boolean.parseBoolean(appendBagNum);
 66  2012
   }
 67  
   
 68  
   @Override
 69  
   public DataBag exec(Tuple arg0) throws IOException
 70  
   { 
 71  3
     DataBag outputBag = bagFactory.newDefaultBag();
 72  
     
 73  3
     Integer maxSize = (Integer)arg0.get(0);
 74  
     
 75  3
     Object o = arg0.get(1);
 76  3
     if (!(o instanceof DataBag))
 77  0
       throw new RuntimeException("parameter must be a databag");
 78  
     
 79  3
     DataBag inputBag = (DataBag)o;
 80  
     
 81  3
     DataBag currentBag = null;
 82  
     
 83  3
     int count = 0;
 84  3
     int numBags = 0;
 85  3
     for (Tuple tuple : inputBag)
 86  
     {
 87  29
       if (currentBag == null)
 88  
       {
 89  7
         currentBag = bagFactory.newDefaultBag();
 90  
       }
 91  
       
 92  29
       currentBag.add(tuple);
 93  29
       count++;
 94  
       
 95  29
       if (count >= maxSize)
 96  
       {
 97  4
         Tuple newTuple = tupleFactory.newTuple();
 98  4
         newTuple.append(currentBag);
 99  
         
 100  4
         if (this.appendBagNum)
 101  
         {
 102  1
           newTuple.append(numBags);
 103  
         }
 104  
         
 105  4
         numBags++;
 106  
         
 107  4
         outputBag.add(newTuple);
 108  
         
 109  4
         count = 0;
 110  4
         currentBag = null;
 111  29
       }
 112  
     }
 113  
     
 114  3
     if (currentBag != null)
 115  
     {
 116  3
       Tuple newTuple = tupleFactory.newTuple();
 117  3
       newTuple.append(currentBag);
 118  3
       if (this.appendBagNum)
 119  
       {
 120  1
         newTuple.append(numBags);
 121  
       }
 122  3
       outputBag.add(newTuple);
 123  
     }
 124  
         
 125  3
     return outputBag;
 126  
   }
 127  
 
 128  
   @Override
 129  
   public Schema outputSchema(Schema input)
 130  
   {
 131  
     try {
 132  3200
       if (input.getField(0).type != DataType.INTEGER)
 133  
       {
 134  0
         throw new RuntimeException("Expected first argument to be an INTEGER");
 135  
       }
 136  
       
 137  3200
       if (input.getField(1).type != DataType.BAG)
 138  
       {
 139  0
         throw new RuntimeException("Expected second argument to be a BAG");
 140  
       }
 141  
       
 142  3200
       Schema tupleSchema = new Schema();
 143  3200
       tupleSchema.add(new Schema.FieldSchema("data", input.getField(1).schema.clone(), DataType.BAG));
 144  
       
 145  3200
       if (this.appendBagNum)
 146  
       {
 147  1069
         tupleSchema.add(new Schema.FieldSchema("index", DataType.INTEGER));
 148  
       }
 149  
       
 150  3200
       return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
 151  
                                                tupleSchema, DataType.BAG));
 152  
     }
 153  0
     catch (Exception e) {
 154  0
       throw new RuntimeException(e);
 155  
     }
 156  
   }
 157  
 }