Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Quantile |
|
| 3.2;3.2 | ||||
Quantile$Pair |
|
| 3.2;3.2 |
1 | /* | |
2 | * Copyright 2010 LinkedIn, Inc | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not | |
5 | * use this file except in compliance with the License. You may obtain a copy of | |
6 | * the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |
12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
13 | * License for the specific language governing permissions and limitations under | |
14 | * the License. | |
15 | */ | |
16 | ||
17 | package datafu.pig.stats; | |
18 | ||
19 | import java.io.IOException; | |
20 | import java.util.HashMap; | |
21 | import java.util.List; | |
22 | import java.util.Map; | |
23 | ||
24 | import org.apache.pig.data.DataBag; | |
25 | import org.apache.pig.data.DataType; | |
26 | import org.apache.pig.data.Tuple; | |
27 | import org.apache.pig.data.TupleFactory; | |
28 | import org.apache.pig.impl.logicalLayer.schema.Schema; | |
29 | ||
30 | import datafu.pig.util.SimpleEvalFunc; | |
31 | ||
32 | /** | |
33 | * Computes <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a> | |
34 | * for a <b>sorted</b> input bag, using type R-2 estimation. | |
35 | * | |
36 | * <p> | |
37 | * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is | |
38 | * done (e.g., group by 'day') if the data is too large. That is, this isn't distributed quantiles. | |
39 | * </p> | |
40 | * | |
41 | * <p> | |
42 | * Note that unlike datafu's StreamingQuantile algorithm, this implementation gives | |
43 | * <b>exact</b> quantiles. However, it requires the input bag to be sorted. Quantile must spill to | |
44 | * disk when the input data is too large to fit in memory, which will contribute to longer runtimes. | |
45 | * Because StreamingQuantile implements accumulate it can be much more efficient than Quantile for | |
46 | * large input bags which do not fit well in memory. | |
47 | * </p> | |
48 | * | |
49 | * <p>The constructor takes a single integer argument that specifies the number of evenly-spaced | |
50 | * quantiles to compute, e.g.,</p> | |
51 | * | |
52 | * <ul> | |
53 | * <li>Quantile('3') yields the min, the median, and the max | |
54 | * <li>Quantile('5') yields the min, the 25th, 50th, 75th percentiles, and the max | |
55 | * <li>Quantile('101') yields the min, the max, and all 99 percentiles. | |
56 | * </ul> | |
57 | * | |
58 | * <p>Alternatively the constructor can take the explicit list of quantiles to compute, e.g.</p> | |
59 | * | |
60 | * <ul> | |
61 | * <li>Quantile('0.0','0.5','1.0') yields the min, the median, and the max | |
62 | * <li>Quantile('0.0','0.25','0.5','0.75','1.0') yields the min, the 25th, 50th, 75th percentiles, and the max | |
63 | * </ul> | |
64 | * | |
65 | * <p>The list of quantiles need not span the entire range from 0.0 to 1.0, nor do they need to be evenly spaced, e.g.</p> | |
66 | * | |
67 | * <ul> | |
68 | * <li>Quantile('0.5','0.90','0.95','0.99') yields the median, the 90th, 95th, and the 99th percentiles | |
69 | * <li>Quantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles | |
70 | * </ul> | |
71 | * | |
72 | * <p> | |
73 | * Example: | |
74 | * <pre> | |
75 | * {@code | |
76 | * | |
77 | * define Quantile datafu.pig.stats.Quantile('0.0','0.5','1.0'); | |
78 | ||
79 | * -- input: 9,10,2,3,5,8,1,4,6,7 | |
80 | * input = LOAD 'input' AS (val:int); | |
81 | * | |
82 | * grouped = GROUP input ALL; | |
83 | * | |
84 | * -- produces: (1,5.5,10) | |
85 | * quantiles = FOREACH grouped { | |
86 | * sorted = ORDER input BY val; | |
87 | * GENERATE Quantile(sorted); | |
88 | * } | |
89 | * }</pre></p> | |
90 | * | |
91 | * @see Median | |
92 | * @see StreamingQuantile | |
93 | */ | |
94 | public class Quantile extends SimpleEvalFunc<Tuple> | |
95 | { | |
96 | List<Double> quantiles; | |
97 | ||
98 | private static class Pair<T1,T2> | |
99 | { | |
100 | public T1 first; | |
101 | public T2 second; | |
102 | ||
103 | 36 | public Pair(T1 first, T2 second) { |
104 | 36 | this.first = first; |
105 | 36 | this.second = second; |
106 | 36 | } |
107 | } | |
108 | ||
109 | public Quantile(String... k) | |
110 | 2708 | { |
111 | 2708 | this.quantiles = QuantileUtil.getQuantilesFromParams(k); |
112 | 2708 | } |
113 | ||
114 | private static Pair<Long, Long> getIndexes(double k, long N) | |
115 | { | |
116 | 36 | double h = N*k + 0.5; |
117 | 36 | long i1 = Math.min(Math.max(1, (long)Math.ceil(h - 0.5)), N); |
118 | 36 | long i2 = Math.min(Math.max(1, (long)Math.floor(h + 0.5)), N); |
119 | ||
120 | 36 | return new Pair<Long, Long>(i1, i2); |
121 | } | |
122 | ||
123 | public Tuple call(DataBag bag) throws IOException | |
124 | { | |
125 | 4 | if (bag == null || bag.size() == 0) |
126 | 0 | return null; |
127 | ||
128 | 4 | Map<Long, Double> d = new HashMap<Long, Double>(); |
129 | 4 | long N = bag.size(), max_id = 1; |
130 | ||
131 | 4 | for (double k : this.quantiles) { |
132 | 18 | Pair<Long, Long> idx = getIndexes(k, N); |
133 | ||
134 | 18 | d.put(idx.first, null); |
135 | 18 | d.put(idx.second, null); |
136 | 18 | max_id = Math.max(max_id, idx.second); |
137 | 18 | } |
138 | ||
139 | 4 | long i = 1; |
140 | 4 | for (Tuple t : bag) { |
141 | 99899 | if (i > max_id) |
142 | 2 | break; |
143 | ||
144 | 99897 | if (d.containsKey(i)) { |
145 | 21 | Object o = t.get(0); |
146 | 21 | if (!(o instanceof Number)) |
147 | 0 | throw new IllegalStateException("bag must have numerical values (and be non-null)"); |
148 | 21 | d.put(i, ((Number) o).doubleValue()); |
149 | } | |
150 | 99897 | i++; |
151 | } | |
152 | ||
153 | 4 | Tuple t = TupleFactory.getInstance().newTuple(this.quantiles.size()); |
154 | 4 | int j = 0; |
155 | 4 | for (double k : this.quantiles) { |
156 | 18 | Pair<Long, Long> p = getIndexes(k, N); |
157 | 18 | double quantile = (d.get(p.first) + d.get(p.second)) / 2; |
158 | 18 | t.set(j, quantile); |
159 | 18 | j++; |
160 | 18 | } |
161 | 4 | return t; |
162 | } | |
163 | ||
164 | @Override | |
165 | public Schema outputSchema(Schema input) | |
166 | { | |
167 | 1388 | Schema tupleSchema = new Schema(); |
168 | 1388 | for (Double x : this.quantiles) |
169 | 6092 | tupleSchema.add(new Schema.FieldSchema("quantile_" + x.toString().replace(".", "_"), DataType.DOUBLE)); |
170 | 1388 | return tupleSchema; |
171 | } | |
172 | } | |
173 |