Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
TimeCount |
|
| 3.5;3.5 |
1 | /* | |
2 | * Copyright 2010 LinkedIn, Inc | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not | |
5 | * use this file except in compliance with the License. You may obtain a copy of | |
6 | * the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
13 | * License for the specific language governing permissions and limitations under | |
14 | * the License. | |
15 | */ | |
16 | ||
17 | package datafu.pig.date; | |
18 | ||
19 | import java.io.IOException; | |
20 | ||
21 | import org.apache.pig.data.DataBag; | |
22 | import org.apache.pig.data.Tuple; | |
23 | import org.joda.time.DateTime; | |
24 | import org.joda.time.Period; | |
25 | ||
26 | import datafu.pig.util.SimpleEvalFunc; | |
27 | ||
28 | /** | |
29 | * Performs a count of events, ignoring events which occur within the | |
30 | * same time window. For events to occur within separate time windows they | |
31 | * must be separated by at least the specified time span. | |
32 | * <p> | |
33 | * This is useful for tasks such as counting the number of page views per user since it: | |
34 | * a) prevents reloads and go-backs from overcounting actual views | |
35 | * b) captures the notion that views across multiple sessions are more meaningful | |
36 | * <p> | |
37 | * Input <b>must</b> be sorted ascendingly by time for this UDF to work. | |
38 | * <p> | |
39 | * Example: | |
40 | * <pre> | |
41 | * {@code | |
42 | * | |
43 | * %declare TIME_WINDOW 10m | |
44 | * | |
45 | * define TimeCount datafu.pig.date.TimeCount('$TIME_WINDOW'); | |
46 | * | |
47 | * views = LOAD 'views' as (user_id:int, page_id:int, time:chararray); | |
48 | * views_grouped = GROUP views by (user_id, page_id); | |
49 | * view_counts = FOREACH views_grouped { | |
50 | * views = order views by time; | |
51 | * generate group.user_id as user_id, | |
52 | * group.page_id as page_id, | |
53 | * TimeCount(views.(time)) as count; } | |
54 | * } | |
55 | * </pre> | |
56 | * | |
57 | */ | |
58 | public class TimeCount extends SimpleEvalFunc<Long> | |
59 | { | |
60 | private final long millis; | |
61 | ||
62 | public TimeCount(String timeSpec) | |
63 | 138 | { |
64 | 138 | Period p = new Period("PT" + timeSpec.toUpperCase()); |
65 | 138 | this.millis = p.toStandardSeconds().getSeconds() * 1000; |
66 | 138 | } |
67 | ||
68 | public Long call(DataBag bag) throws IOException | |
69 | { | |
70 | 3 | DateTime last_date = null; |
71 | 3 | long sum = 0; |
72 | ||
73 | 3 | for (Tuple t : bag) { |
74 | 17 | DateTime date = new DateTime(t.get(0)); |
75 | ||
76 | 17 | if (last_date == null) { |
77 | 3 | last_date = date; |
78 | 3 | sum = 1; |
79 | 14 | } else if (date.isAfter(last_date.plus(this.millis))) |
80 | 5 | sum += 1; |
81 | 9 | else if (date.isBefore(last_date)) |
82 | 0 | throw new IOException("input time series is not sorted"); |
83 | ||
84 | 17 | last_date = date; |
85 | 17 | } |
86 | ||
87 | 3 | return sum; |
88 | } | |
89 | } |