Coverage Report - datafu.pig.stats.MarkovPairs
 
Classes in this File Line Coverage Branch Coverage Complexity
MarkovPairs
84%
38/45
83%
10/12
3.4
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.stats;
 18  
 
 19  
 import java.io.IOException;
 20  
 import java.util.ArrayList;
 21  
 
 22  
 import org.apache.pig.EvalFunc;
 23  
 import org.apache.pig.backend.executionengine.ExecException;
 24  
 import org.apache.pig.data.BagFactory;
 25  
 import org.apache.pig.data.DataBag;
 26  
 import org.apache.pig.data.DataType;
 27  
 import org.apache.pig.data.Tuple;
 28  
 import org.apache.pig.data.TupleFactory;
 29  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 30  
 import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 31  
 
 32  
 import com.google.common.collect.ImmutableList;
 33  
 
 34  
 
 35  
 /**
 36  
  * Accepts a bag of tuples, with user supplied ordering, and generates pairs that can be used for
 37  
  * a Markov chain analysis. For example, if you had {(1), (4), (7)}, using the default lookahead of 1, you
 38  
  * get the pairs {
 39  
  *                ((1),(4)),
 40  
  *                ((4),(7))}
 41  
  * A lookahead factor tells the UDF how many steps into the future to include. So, for a,b,c with a lookahead
 42  
  * of 2, a would be paired with both b and c.
 43  
  * The results are returned in the same order as supplied by the caller.
 44  
 */
 45  
 
 46  3
 public class MarkovPairs extends EvalFunc<DataBag>
 47  
 {
 48  1
   private static final BagFactory bagFactory = BagFactory.getInstance();
 49  1
   private static final TupleFactory tupleFactory = TupleFactory.getInstance();
 50  
 
 51  
   private static long lookahead_steps;
 52  
 
 53  1620
   private final int SPILL_THRESHOLD = 1000000;
 54  
 
 55  
   public MarkovPairs()
 56  1080
   {   
 57  1080
       MarkovPairs.lookahead_steps = 1;
 58  1080
   }
 59  
   
 60  
   public MarkovPairs(String lookahead_steps)
 61  540
   {   
 62  540
       MarkovPairs.lookahead_steps = Integer.valueOf(lookahead_steps);
 63  540
   }
 64  
 
 65  
   /* start and end are inclusive. This forms transition pairs */
 66  
   private void generatePairs(ArrayList<Tuple> input, int start, int end, DataBag outputBag)
 67  
       throws ExecException
 68  
   {
 69  3
     int count = 0;
 70  17
     for (int i = start; (i + 1)<= end; i++) {
 71  14
       Tuple elem1 = input.get(i);
 72  
       
 73  
       lookahead:
 74  33
       for (int j = i+1; j <= i + lookahead_steps; j++)
 75  
       {
 76  21
         if (j > end) break lookahead;
 77  19
         Tuple elem2 = input.get(j);        
 78  19
         if (count >= SPILL_THRESHOLD) {
 79  0
           outputBag.spill();
 80  0
           count = 0;
 81  
         }
 82  19
         outputBag.add(tupleFactory.newTuple(ImmutableList.of(elem1, elem2)));
 83  19
         count ++;
 84  
       }
 85  
     }
 86  3
   }
 87  
 
 88  
   @Override
 89  
   public DataBag exec(Tuple input)
 90  
       throws IOException
 91  
   {
 92  
     // Arguments arrive wrapped in a tuple; in our case a single bag (the ordered views) is passed, embedded in a length-one tuple.
 93  
     
 94  3
     DataBag inputBag = (DataBag) input.get(0);         
 95  3
     ArrayList<Tuple> inputData = new ArrayList<Tuple>();
 96  
 
 97  3
     for (Tuple tuple : inputBag) {
 98  17
       inputData.add(tuple);
 99  
     }
 100  
 
 101  3
     int inputSize = inputData.size();
 102  
 
 103  
     try {
 104  3
       DataBag outputBag = bagFactory.newDefaultBag();
 105  
 
 106  3
       int startPos = 0;
 107  
 
 108  3
       int stopPos = inputSize - 1;
 109  3
       generatePairs(inputData, startPos, stopPos, outputBag);
 110  
 
 111  
       // set startPos for the next bucket (note: the method returns immediately afterwards, so this value is never read)
 112  3
       startPos = stopPos + 1;
 113  3
       return outputBag;
 114  
     }
 115  0
     catch (Exception e) {
 116  0
       throw new IOException(e);
 117  
     }
 118  
   }
 119  
 
 120  
 
 121  
   @Override
 122  
   public Schema outputSchema(Schema input)
 123  
   {
 124  
     try {
 125  222
       Schema tupleSchema = new Schema();
 126  
                  
 127  222
       FieldSchema fieldSchema = input.getField(0);
 128  
       
 129  222
       if (fieldSchema.type != DataType.BAG)
 130  
       {
 131  0
         throw new RuntimeException(String.format("Expected input schema to be BAG, but instead found %s",
 132  
                                                  DataType.findTypeName(fieldSchema.type)));
 133  
       }
 134  
       
 135  222
       FieldSchema fieldSchema2 = fieldSchema.schema.getField(0);
 136  
       
 137  222
       tupleSchema.add(new Schema.FieldSchema("elem1", fieldSchema2.schema));
 138  222
       tupleSchema.add(new Schema.FieldSchema("elem2", fieldSchema2.schema));
 139  222
       return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
 140  
                                                tupleSchema,
 141  
                                                DataType.BAG));
 142  
     }
 143  0
     catch (Exception e) {
 144  0
       return null;
 145  
     }
 146  
   }
 147  
   
 148  
 }