Coverage Report - datafu.pig.sessions.Sessionize
 
Classes in this File Line Coverage Branch Coverage Complexity
Sessionize
73%
34/46
71%
10/14
3.667
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
 package datafu.pig.sessions;
 17  
 
 18  
 import java.io.IOException;
 19  
 import java.util.UUID;
 20  
 
 21  
 import org.apache.pig.Accumulator;
 22  
 import org.apache.pig.EvalFunc;
 23  
 import org.apache.pig.data.BagFactory;
 24  
 import org.apache.pig.data.DataBag;
 25  
 import org.apache.pig.data.DataType;
 26  
 import org.apache.pig.data.Tuple;
 27  
 import org.apache.pig.data.TupleFactory;
 28  
 import org.apache.pig.impl.logicalLayer.FrontendException;
 29  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 30  
 import org.joda.time.DateTime;
 31  
 import org.joda.time.Period;
 32  
 
 33  
 /**
 34  
  * Sessionizes an input stream.
 35  
  *
 36  
  * This UDF takes a constructor argument which is the session timeout (an idle
 37  
  * period of this amount indicates that a new session has started) and assumes
 38  
  * the first element of the input bag is an ISO8601 timestamp. The input bag
 39  
  * must be sorted by this timestamp. It returns the input bag with a new field,
 40  
  * session_id, that is a GUID indicating the session of the request.
 41  
  *
 42  
  * Example:
 43  
  * <pre>
 44  
  * {@code
 45  
  * 
 46  
  * %declare TIME_WINDOW  30m
 47  
  * 
 48  
  * define Sessionize datafu.pig.sessions.Sessionize('$TIME_WINDOW');
 49  
  *
 50  
  * -- sessionize the visit stream
 51  
  * VIEWS = group VIEWS by member_id;
 52  
  * SESSIONS = foreach VIEWS {
 53  
  *   VISITS = order VIEWS by visit_date;
 54  
  *   generate FLATTEN(Sessionize(VISITS)) as (visit_date,member_id,url,session_id); 
 55  
  * }
 56  
  *
 57  
  * -- count the number of sessions hitting the url
 58  
  * ROLLUP = group SESSIONS by url;
 59  
  * RESULT = foreach ROLLUP generate group as url, COUNT(SESSIONS) as session_cnt;
 60  
  * }
 61  
  * </pre>
 62  
  */
 63  3
 public class Sessionize extends EvalFunc<DataBag> implements Accumulator<DataBag>
 64  
 {
 65  
   private final long millis;
 66  
 
 67  
   private DataBag outputBag;
 68  
   private DateTime last_date;
 69  
   private String id;
 70  
 
 71  
   public Sessionize(String timeSpec)
 72  345
   {
 73  345
     Period p = new Period("PT" + timeSpec.toUpperCase());
 74  345
     this.millis = p.toStandardSeconds().getSeconds() * 1000;
 75  
 
 76  345
     cleanup();
 77  345
   }
 78  
 
 79  
   @Override
 80  
   public DataBag exec(Tuple input) throws IOException
 81  
   {
 82  0
     accumulate(input);
 83  0
     DataBag outputBag = getValue();
 84  0
     cleanup();
 85  
 
 86  0
     return outputBag;
 87  
   }
 88  
 
 89  
   @Override
 90  
   public void accumulate(Tuple input) throws IOException
 91  
   {
 92  3
     for (Tuple t : (DataBag) input.get(0)) {
 93  17
       String timeString = (String)t.get(0);
 94  17
       DateTime date = new DateTime(timeString);
 95  
 
 96  17
       if (this.last_date == null)
 97  3
         this.last_date = date;
 98  14
       else if (date.isAfter(this.last_date.plus(this.millis)))
 99  5
         this.id = UUID.randomUUID().toString();
 100  9
       else if (date.isBefore(last_date))
 101  0
         throw new IOException(String.format("input time series is not sorted (%s < %s)", date, last_date));
 102  
 
 103  17
       Tuple t_new = TupleFactory.getInstance().newTuple(t.getAll());
 104  17
       t_new.append(this.id);
 105  17
       outputBag.add(t_new);
 106  
       
 107  17
       this.last_date = date;
 108  17
     }
 109  3
   }
 110  
 
 111  
   @Override
 112  
   public DataBag getValue()
 113  
   {
 114  3
     return outputBag;
 115  
   }
 116  
 
 117  
   @Override
 118  
   public void cleanup()
 119  
   {
 120  351
     this.last_date = null;
 121  351
     this.outputBag = BagFactory.getInstance().newDefaultBag();
 122  351
     this.id = UUID.randomUUID().toString();
 123  351
   }
 124  
 
 125  
   @Override
 126  
   public Schema outputSchema(Schema input)
 127  
   {
 128  
     try {
 129  181
       Schema.FieldSchema inputFieldSchema = input.getField(0);
 130  
 
 131  181
       if (inputFieldSchema.type != DataType.BAG)
 132  
       {
 133  0
         throw new RuntimeException("Expected a BAG as input");
 134  
       }
 135  
       
 136  181
       Schema inputBagSchema = inputFieldSchema.schema;
 137  
       
 138  181
       if (inputBagSchema.getField(0).type != DataType.TUPLE)
 139  
       {
 140  0
         throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
 141  
                                                  DataType.findTypeName(inputBagSchema.getField(0).type)));
 142  
       }
 143  
       
 144  181
       Schema inputTupleSchema = inputBagSchema.getField(0).schema;
 145  
       
 146  181
       if (inputTupleSchema.getField(0).type != DataType.CHARARRAY)
 147  
       {
 148  0
         throw new RuntimeException(String.format("Expected first element of tuple to be a CHARARRAY, but instead found %s",
 149  
                                                  DataType.findTypeName(inputTupleSchema.getField(0).type)));
 150  
       }
 151  
       
 152  181
       Schema outputTupleSchema = inputTupleSchema.clone();
 153  181
       outputTupleSchema.add(new Schema.FieldSchema("session_id", DataType.CHARARRAY));      
 154  
       
 155  181
       return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
 156  
                                                              .getName()
 157  
                                                              .toLowerCase(), input),
 158  
                                            outputTupleSchema,
 159  
                                            DataType.BAG));
 160  
     }
 161  0
     catch (CloneNotSupportedException e) {
 162  0
       throw new RuntimeException(e);
 163  
     }
 164  0
     catch (FrontendException e) {
 165  0
       throw new RuntimeException(e);
 166  
     }
 167  
   }
 168  
 }