Coverage Report - datafu.pig.stats.Quantile
 
Classes in this File Line Coverage Branch Coverage Complexity
Quantile
95%
38/40
83%
15/18
3.2
Quantile$Pair
100%
4/4
N/A
3.2
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.stats;
 18  
 
 19  
 import java.io.IOException;
 20  
 import java.util.HashMap;
 21  
 import java.util.List;
 22  
 import java.util.Map;
 23  
 
 24  
 import org.apache.pig.data.DataBag;
 25  
 import org.apache.pig.data.DataType;
 26  
 import org.apache.pig.data.Tuple;
 27  
 import org.apache.pig.data.TupleFactory;
 28  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 29  
 
 30  
 import datafu.pig.util.SimpleEvalFunc;
 31  
 
 32  
 /**
 33  
  * Computes <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a> 
 34  
  * for a <b>sorted</b> input bag, using type R-2 estimation.
 35  
  *
 36  
  * <p>
 37  
  * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is 
 38  
  * done (e.g., group by 'day') if the data is too large.  That is, this isn't distributed quantiles.
 39  
  * </p>
 40  
  * 
 41  
  * <p>
 42  
  * Note that unlike datafu's StreamingQuantile algorithm, this implementation gives
 43  
  * <b>exact</b> quantiles.  However, it requires the input bag to be sorted.  Quantile must spill to 
 44  
  * disk when the input data is too large to fit in memory, which will contribute to longer runtimes. 
 45  
  * Because StreamingQuantile implements accumulate it can be much more efficient than Quantile for 
 46  
  * large input bags which do not fit well in memory.
 47  
  * </p>
 48  
  * 
 49  
  * <p>The constructor takes a single integer argument that specifies the number of evenly-spaced 
 50  
  * quantiles to compute, e.g.,</p>
 51  
  * 
 52  
  * <ul>
 53  
  *   <li>Quantile('3') yields the min, the median, and the max
 54  
  *   <li>Quantile('5') yields the min, the 25th, 50th, 75th percentiles, and the max
 55  
  *   <li>Quantile('101') yields the min, the max, and all 99 percentiles.
 56  
  * </ul>
 57  
  * 
 58  
  * <p>Alternatively the constructor can take the explicit list of quantiles to compute, e.g.</p>
 59  
  *
 60  
  * <ul>
 61  
  *   <li>Quantile('0.0','0.5','1.0') yields the min, the median, and the max
 62  
  *   <li>Quantile('0.0','0.25','0.5','0.75','1.0') yields the min, the 25th, 50th, 75th percentiles, and the max
 63  
  * </ul>
 64  
  *
 65  
  * <p>The list of quantiles need not span the entire range from 0.0 to 1.0, nor do they need to be evenly spaced, e.g.</p>
 66  
  * 
 67  
  * <ul>
 68  
  *   <li>Quantile('0.5','0.90','0.95','0.99') yields the median, the 90th, 95th, and the 99th percentiles
 69  
  *   <li>Quantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles
 70  
  * </ul>
 71  
  * 
 72  
  * <p>
 73  
  * Example:
 74  
  * <pre>
 75  
  * {@code
 76  
  *
 77  
  * define Quantile datafu.pig.stats.Quantile('0.0','0.5','1.0');
 78  
 
 79  
  * -- input: 9,10,2,3,5,8,1,4,6,7
 80  
  * input = LOAD 'input' AS (val:int);
 81  
  *
 82  
  * grouped = GROUP input ALL;
 83  
  *
 84  
  * -- produces: (1,5.5,10)
 85  
  * quantiles = FOREACH grouped {
 86  
  *   sorted = ORDER input BY val;
 87  
  *   GENERATE Quantile(sorted);
 88  
  * }
 89  
  * }</pre></p>
 90  
  *
 91  
  * @see Median
 92  
  * @see StreamingQuantile
 93  
  */
 94  
 public class Quantile extends SimpleEvalFunc<Tuple>
 95  
 {
 96  
   List<Double> quantiles;
 97  
 
 98  
   private static class Pair<T1,T2>
 99  
   {
 100  
     public T1 first;
 101  
     public T2 second;
 102  
 
 103  36
     public Pair(T1 first, T2 second) {
 104  36
       this.first = first;
 105  36
       this.second = second;
 106  36
     }
 107  
   }
 108  
 
 109  
   public Quantile(String... k)
 110  2708
   {
 111  2708
     this.quantiles = QuantileUtil.getQuantilesFromParams(k);
 112  2708
   }
 113  
 
 114  
   private static Pair<Long, Long> getIndexes(double k, long N)
 115  
   {
 116  36
     double h = N*k + 0.5;
 117  36
     long i1 = Math.min(Math.max(1, (long)Math.ceil(h - 0.5)), N);
 118  36
     long i2 = Math.min(Math.max(1, (long)Math.floor(h + 0.5)), N);
 119  
 
 120  36
     return new Pair<Long, Long>(i1, i2);
 121  
   }
 122  
   
 123  
   public Tuple call(DataBag bag) throws IOException
 124  
   {
 125  4
     if (bag == null || bag.size() == 0)
 126  0
       return null;
 127  
 
 128  4
     Map<Long, Double> d = new HashMap<Long, Double>();
 129  4
     long N = bag.size(), max_id = 1;
 130  
     
 131  4
     for (double k : this.quantiles) {
 132  18
       Pair<Long, Long> idx = getIndexes(k, N);
 133  
 
 134  18
       d.put(idx.first, null);
 135  18
       d.put(idx.second, null);
 136  18
       max_id = Math.max(max_id, idx.second);
 137  18
     }
 138  
 
 139  4
     long i = 1;
 140  4
     for (Tuple t : bag) {
 141  99899
       if (i > max_id)
 142  2
         break;
 143  
 
 144  99897
       if (d.containsKey(i)) {
 145  21
         Object o = t.get(0);
 146  21
         if (!(o instanceof Number))
 147  0
           throw new IllegalStateException("bag must have numerical values (and be non-null)");
 148  21
         d.put(i, ((Number) o).doubleValue());
 149  
       }
 150  99897
       i++;
 151  
     }
 152  
 
 153  4
     Tuple t = TupleFactory.getInstance().newTuple(this.quantiles.size());
 154  4
     int j = 0;
 155  4
     for (double k : this.quantiles) {
 156  18
       Pair<Long, Long> p = getIndexes(k, N);
 157  18
       double quantile = (d.get(p.first) + d.get(p.second)) / 2;
 158  18
       t.set(j, quantile);
 159  18
       j++;
 160  18
     }
 161  4
     return t;
 162  
   }
 163  
 
 164  
   @Override
 165  
   public Schema outputSchema(Schema input)
 166  
   {
 167  1388
     Schema tupleSchema = new Schema();
 168  1388
     for (Double x : this.quantiles)
 169  6092
       tupleSchema.add(new Schema.FieldSchema("quantile_" + x.toString().replace(".", "_"), DataType.DOUBLE));
 170  1388
     return tupleSchema;
 171  
   }
 172  
 }
 173