Coverage Report - datafu.pig.date.TimeCount
 
Classes in this File Line Coverage Branch Coverage Complexity
TimeCount
94%
17/18
87%
7/8
3.5
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.date;
 18  
 
 19  
 import java.io.IOException;
 20  
 
 21  
 import org.apache.pig.data.DataBag;
 22  
 import org.apache.pig.data.Tuple;
 23  
 import org.joda.time.DateTime;
 24  
 import org.joda.time.Period;
 25  
 
 26  
 import datafu.pig.util.SimpleEvalFunc;
 27  
 
 28  
 /**
 29  
  * Performs a count of events, ignoring events which occur within the
 30  
  * same time window.  For events to occur within separate time windows they
 31  
  * must be separated by at least the specified time span. 
 32  
  * <p>
 33  
  * This is useful for tasks such as counting the number of page views per user since it:
 34  
  *  a) prevents reloads and go-backs from overcounting actual views
 35  
  *  b) captures the notion that views across multiple sessions are more meaningful
 36  
  * <p>
 37  
  * Input <b>must</b> be sorted in ascending order by time for this UDF to work.
 38  
  * <p>
 39  
  * Example:
 40  
  * <pre>
 41  
  * {@code
 42  
  * 
 43  
  * %declare TIME_WINDOW  10m
 44  
  * 
 45  
  * define TimeCount datafu.pig.date.TimeCount('$TIME_WINDOW');
 46  
  * 
 47  
  * views = LOAD 'views' as (user_id:int, page_id:int, time:chararray);
 48  
  * views_grouped = GROUP views by (user_id, page_id);
 49  
  * view_counts = FOREACH views_grouped { 
 50  
  *   views = order views by time;
 51  
  *   generate group.user_id as user_id, 
 52  
  *            group.page_id as page_id, 
 53  
  *            TimeCount(views.(time)) as count; }
 54  
  * }
 55  
  * </pre>
 56  
  * 
 57  
  */
 58  
 public class TimeCount extends SimpleEvalFunc<Long>
 59  
 {
 60  
   private final long millis;
 61  
 
 62  
   public TimeCount(String timeSpec)
 63  138
   {
 64  138
     Period p = new Period("PT" + timeSpec.toUpperCase());
 65  138
     this.millis = p.toStandardSeconds().getSeconds() * 1000;
 66  138
   }
 67  
 
 68  
   public Long call(DataBag bag) throws IOException
 69  
   {
 70  3
     DateTime last_date = null;
 71  3
     long sum = 0;
 72  
     
 73  3
     for (Tuple t : bag) {
 74  17
       DateTime date = new DateTime(t.get(0));
 75  
 
 76  17
       if (last_date == null) {
 77  3
         last_date = date;
 78  3
         sum = 1;
 79  14
       } else if (date.isAfter(last_date.plus(this.millis)))
 80  5
         sum += 1;
 81  9
       else if (date.isBefore(last_date))
 82  0
         throw new IOException("input time series is not sorted");
 83  
 
 84  17
       last_date = date;
 85  17
     }
 86  
 
 87  3
     return sum;
 88  
   }
 89  
 }