Coverage Report - datafu.pig.stats.StreamingQuantile
 
Classes in this File Line Coverage Branch Coverage Complexity
StreamingQuantile
84%
54/64
80%
21/26
3.765
StreamingQuantile$QuantileEstimator
71%
55/77
59%
32/54
3.765
 
 1  
 /**
 2  
  * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 3  
  *
 4  
  * Cloudera, Inc. licenses this file to you under the Apache License,
 5  
  * Version 2.0 (the "License"). You may not use this file except in
 6  
  * compliance with the License. You may obtain a copy of the License at
 7  
  *
 8  
  *     http://www.apache.org/licenses/LICENSE-2.0
 9  
  *
 10  
  * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 11  
  * CONDITIONS OF ANY KIND, either express or implied. See the License for
 12  
  * the specific language governing permissions and limitations under the
 13  
  * License.
 14  
  */
 15  
 package datafu.pig.stats;
 16  
 
 17  
 import java.io.IOException;
 18  
 import java.util.ArrayList;
 19  
 import java.util.Collections;
 20  
 import java.util.HashMap;
 21  
 import java.util.List;
 22  
 
 23  
 import org.apache.pig.Accumulator;
 24  
 import org.apache.pig.data.DataBag;
 25  
 import org.apache.pig.data.DataType;
 26  
 import org.apache.pig.data.Tuple;
 27  
 import org.apache.pig.data.TupleFactory;
 28  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 29  
 
 30  
 import com.google.common.collect.Lists;
 31  
 
 32  
 import datafu.pig.util.SimpleEvalFunc;
 33  
 
 34  
 /**
 35  
  * Computes approximate {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>} 
 36  
  * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
 37  
  * 
 38  
  * <p>
 39  
  * The algorithm is described here:
 40  
  * {@link <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>}
 41  
  * </p>
 42  
  * 
 43  
  * <p>
 44  
  * The implementation is based on the one in Sawzall, available here:
 45  
  * {@link <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>}
 46  
  * </p>
 47  
  * 
 48  
  * <p>
 49  
  * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is 
 50  
  * done (e.g., group by 'day') if the data is too large.  That is, this isn't distributed quantiles.
 51  
  * </p>
 52  
  * 
 53  
  * <p>
 54  
  * Note that unlike datafu's standard Quantile algorithm, the Munro-Paterson algorithm gives
 55  
  * <b>approximate</b> quantiles and does not require the input bag to be sorted.  Because it implements
 56  
  * accumulate, StreamingQuantile can be much more efficient than Quantile for large amounts of data which
 57  
  * do not fit in memory.  Quantile must spill to disk when the input data is too large to fit in memory, 
 58  
  * which will contribute to longer runtimes.
 59  
  * </p>
 60  
  * 
 61  
  * <p>The constructor takes a single integer argument that specifies the number of evenly-spaced 
 62  
  * quantiles to compute, e.g.,</p>
 63  
  * 
 64  
  * <ul>
 65  
  *   <li>StreamingQuantile('3') yields the min, the median, and the max
 66  
  *   <li>StreamingQuantile('5') yields the min, the 25th, 50th, 75th percentiles, and the max
 67  
  *   <li>StreamingQuantile('101') yields the min, the max, and all 99 percentiles.
 68  
  * </ul>
 69  
  * 
 70  
  * <p>Alternatively the constructor can take the explicit list of quantiles to compute, e.g.</p>
 71  
  *
 72  
  * <ul>
 73  
  *   <li>StreamingQuantile('0.0','0.5','1.0') yields the min, the median, and the max
 74  
  *   <li>StreamingQuantile('0.0','0.25','0.5','0.75','1.0') yields the min, the 25th, 50th, 75th percentiles, and the max
 75  
  * </ul>
 76  
  *
 77  
  * <p>The list of quantiles need not span the entire range from 0.0 to 1.0, nor do they need to be evenly spaced, e.g.</p>
 78  
  * 
 79  
  * <ul>
 80  
  *   <li>StreamingQuantile('0.5','0.90','0.95','0.99') yields the median, the 90th, 95th, and the 99th percentiles
 81  
  *   <li>StreamingQuantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles
 82  
  * </ul>
 83  
  *
 84  
  * <p>Be aware when specifying the list of quantiles in this way that more quantiles may be computed internally than are actually returned.
 85  
  * The GCD of the quantiles is found and this determines the number of evenly spaced quantiles to compute.  The requested quantiles
 86  
  * are then returned from this set.  For instance:</p>
 87  
  * 
 88  
  * <ul>
 89  
  *   <li>If the quantiles 0.2 and 0.6 are requested then the quantiles 0.0, 0.2, 0.4, 0.6, 0.8, and 1.0 are computed 
 90  
  *       because 0.2 is the GCD of 0.2, 0.6, and 1.0.</li>  
 91  
  *   <li>If 0.2 and 0.7 are requested then the quantiles 0.0, 0.1, 0.2, ... , 0.9, 1.0 are computed because 0.1 is the 
 92  
  *       GCD of 0.2, 0.7, and 1.0.</li>
 93  
  *   <li>If 0.999 is requested the quantiles 0.0, 0.001, 0.002, ... , 0.998, 0.999, 1.0 are computed because 0.001 is
 94  
  *       the GCD of 0.999 and 1.0.</li> 
 95  
  *
 96  
  * </ul>
 97  
  * 
 98  
  * <p>The error on the approximation goes down as the number of buckets computed goes up.</p>
 99  
  * 
 100  
  * <p>
 101  
  * Example:
 102  
  * <pre>
 103  
  * {@code
 104  
  *
 105  
  * define Quantile datafu.pig.stats.StreamingQuantile('5');
 106  
 
 107  
  * -- input: 9,10,2,3,5,8,1,4,6,7
 108  
  * input = LOAD 'input' AS (val:int);
 109  
  *
 110  
  * grouped = GROUP input ALL;
 111  
  *
 112  
  * -- produces: (1.0,3.0,5.0,8.0,10.0)
 113  
  * quantiles = FOREACH grouped generate Quantile(input);
 114  
  * }
 115  
  * </pre></p>
 116  
  *
 117  
  * @see StreamingMedian
 118  
  * @see Quantile
 119  
  */
 120  5
 public class StreamingQuantile extends SimpleEvalFunc<Tuple> implements Accumulator<Tuple> {
 121  
 
 122  
   private final int numQuantiles;
 123  
   private final QuantileEstimator estimator;
 124  
   private List<Double> quantiles;
 125  
  
 126  
   public StreamingQuantile(String... k)
 127  2395
   {
 128  2395
     this.quantiles = QuantileUtil.getQuantilesFromParams(k);
 129  2395
     this.numQuantiles = getNumQuantiles(this.quantiles);
 130  2395
     this.estimator = new QuantileEstimator(this.numQuantiles);
 131  2395
   }
 132  
   
 133  
   private static int getNumQuantiles(List<Double> quantiles)
 134  
   {
 135  2395
     quantiles = new ArrayList<Double>(quantiles);
 136  2395
     Collections.sort(quantiles);
 137  2395
     int start = 0;
 138  2395
     int end = quantiles.size()-1;
 139  3265
     while (quantiles.get(start) == 0.0) start++;
 140  3788
     while (quantiles.get(end) == 1.0) end--;
 141  2395
     double gcd = 1.0;
 142  9663
     for (int i=end; i>=start; i--)
 143  
     {
 144  7268
       gcd = gcd(gcd,quantiles.get(i));
 145  
     }
 146  2395
     int numQuantiles = (int)(1/gcd) + 1;
 147  2395
     return numQuantiles;
 148  
   }
 149  
   
 150  
   private static double gcd(double a, double b)
 151  
   {
 152  7268
     if (round(a) == 0.0)
 153  
     {
 154  0
       throw new IllegalArgumentException("Quantiles are smaller than the allowed precision");
 155  
     }
 156  7268
     if (round(b) == 0.0)
 157  
     {
 158  0
       throw new IllegalArgumentException("Quantiles are smaller than the allowed precision");
 159  
     }
 160  26330
     while (round(b) != 0.0)
 161  
     {
 162  19062
       double t = b;
 163  19062
       b = a % b;
 164  19062
       a = t;
 165  19062
     }
 166  7268
     return round(a);
 167  
   }
 168  
   
 169  
   private static double round(double d)
 170  
   {
 171  58270
     return Math.round(d*100000.0)/100000.0;
 172  
   }
 173  
 
 174  
   @Override
 175  
   public void accumulate(Tuple b) throws IOException
 176  
   {
 177  10
     DataBag bag = (DataBag) b.get(0);
 178  10
     if (bag == null || bag.size() == 0)
 179  0
       return;
 180  
 
 181  10
     for (Tuple t : bag) {
 182  101032
       Object o = t.get(0);
 183  101032
       if (!(o instanceof Number)) {
 184  0
         throw new IllegalStateException("bag must have numerical values (and be non-null)");
 185  
       }
 186  101032
       estimator.add(((Number) o).doubleValue());
 187  101032
     }
 188  10
   }
 189  
 
 190  
   @Override
 191  
   public void cleanup()
 192  
   {
 193  10
     estimator.clear();
 194  10
   }
 195  
 
 196  
   @Override
 197  
   public Tuple getValue()
 198  
   {
 199  5
     Tuple t = TupleFactory.getInstance().newTuple(this.quantiles.size());
 200  
     try {
 201  5
       HashMap<Double,Double> quantileValues = new HashMap<Double,Double>(this.quantiles.size());
 202  5
       double quantileKey = 0.0;
 203  5
       for (double quantileValue : estimator.getQuantiles()) {
 204  10115
         quantileValues.put(round(quantileKey), quantileValue);
 205  10115
         quantileKey += 1.0/(this.numQuantiles-1);
 206  
       }
 207  5
       int j = 0;
 208  5
       for (double d : this.quantiles)
 209  
       {
 210  21
         Double quantileValue = quantileValues.get(round(d));
 211  21
         t.set(j, quantileValue);
 212  21
         j++;
 213  21
       }
 214  0
     } catch (IOException e) {
 215  0
       return null;
 216  5
     }
 217  5
     return t;
 218  
   }
 219  
 
 220  
   public Tuple call(DataBag b) throws IOException
 221  
   {
 222  0
     accumulate(TupleFactory.getInstance().newTuple(b));
 223  0
     Tuple ret = getValue();
 224  0
     cleanup();
 225  0
     return ret;
 226  
   }
 227  
 
 228  
   @Override
 229  
   public Schema outputSchema(Schema input)
 230  
   {
 231  1240
     Schema tupleSchema = new Schema();
 232  2289760
     for (int i = 0; i < numQuantiles; i++) {
 233  2288520
       tupleSchema.add(new Schema.FieldSchema("quantile_" + i, DataType.DOUBLE));
 234  
     }
 235  1240
     return tupleSchema;
 236  
   }
 237  
 
 238  
   static class QuantileEstimator
 239  
   {
 240  
     private static final long MAX_TOT_ELEMS = 1024L * 1024L * 1024L * 1024L;
 241  
 
 242  2395
     private final List<List<Double>> buffer = Lists.newArrayList();
 243  
     private final int numQuantiles;
 244  
     private final int maxElementsPerBuffer;
 245  
     private int totalElements;
 246  
     private double min;
 247  
     private double max;
 248  
     
 249  
     public QuantileEstimator(int numQuantiles)
 250  2395
     {
 251  2395
       this.numQuantiles = numQuantiles;
 252  2395
       this.maxElementsPerBuffer = computeMaxElementsPerBuffer();
 253  2395
     }
 254  
     
 255  
     private int computeMaxElementsPerBuffer()
 256  
     {
 257  2395
       double epsilon = 1.0 / (numQuantiles - 1.0);
 258  2395
       int b = 2;
 259  75731
       while ((b - 2) * (0x1L << (b - 2)) + 0.5 <= epsilon * MAX_TOT_ELEMS) {
 260  73336
         ++b;
 261  
       }
 262  2395
       return (int) (MAX_TOT_ELEMS / (0x1L << (b - 1)));
 263  
     }
 264  
     
 265  
     private void ensureBuffer(int level)
 266  
     {
 267  202074
       while (buffer.size() < level + 1) {
 268  10
         buffer.add(null);
 269  
       }
 270  202064
       if (buffer.get(level) == null) {
 271  10
         buffer.set(level, Lists.<Double>newArrayList());
 272  
       }
 273  202064
     }
 274  
     
 275  
     private void collapse(List<Double> a, List<Double> b, List<Double> out)
 276  
     {
 277  0
       int indexA = 0, indexB = 0, count = 0;
 278  0
       Double smaller = null;
 279  0
       while (indexA < maxElementsPerBuffer || indexB < maxElementsPerBuffer) {
 280  0
         if (indexA >= maxElementsPerBuffer ||
 281  
             (indexB < maxElementsPerBuffer && a.get(indexA) >= b.get(indexB))) {
 282  0
           smaller = b.get(indexB++);
 283  
         } else {
 284  0
           smaller = a.get(indexA++);
 285  
         }
 286  
         
 287  0
         if (count++ % 2 == 0) {
 288  0
           out.add(smaller);
 289  
         }
 290  
       }
 291  0
       a.clear();
 292  0
       b.clear();
 293  0
     }
 294  
     
 295  
     private void recursiveCollapse(List<Double> buf, int level)
 296  
     {
 297  0
       ensureBuffer(level + 1);
 298  
       
 299  
       List<Double> merged;
 300  0
       if (buffer.get(level + 1).isEmpty()) {
 301  0
         merged = buffer.get(level + 1);
 302  
       } else {
 303  0
         merged = Lists.newArrayListWithCapacity(maxElementsPerBuffer);
 304  
       }
 305  
       
 306  0
       collapse(buffer.get(level), buf, merged);
 307  0
       if (buffer.get(level + 1) != merged) {
 308  0
         recursiveCollapse(merged, level + 1);
 309  
       }
 310  0
     }
 311  
     
 312  
     public void add(double elem)
 313  
     {
 314  101032
       if (totalElements == 0 || elem < min) {
 315  101004
         min = elem;
 316  
       }
 317  101032
       if (totalElements == 0 || max < elem) {
 318  23
         max = elem;
 319  
       }
 320  
       
 321  101032
       if (totalElements > 0 && totalElements % (2 * maxElementsPerBuffer) == 0) {
 322  0
         Collections.sort(buffer.get(0));
 323  0
         Collections.sort(buffer.get(1));
 324  0
         recursiveCollapse(buffer.get(0), 1);
 325  
       }
 326  
       
 327  101032
       ensureBuffer(0);
 328  101032
       ensureBuffer(1);
 329  101032
       int index = buffer.get(0).size() < maxElementsPerBuffer ? 0 : 1;
 330  101032
       buffer.get(index).add(elem);
 331  101032
       totalElements++;
 332  101032
     }
 333  
 
 334  
     public void clear()
 335  
     {
 336  10
       buffer.clear();
 337  10
       totalElements = 0;
 338  10
     }
 339  
 
 340  
     public List<Double> getQuantiles()
 341  
     {
 342  5
       List<Double> quantiles = Lists.newArrayList();
 343  5
       quantiles.add(min);
 344  
       
 345  5
       if (buffer.get(0) != null) {
 346  5
         Collections.sort(buffer.get(0));
 347  
       }
 348  5
       if (buffer.get(1) != null) {
 349  5
         Collections.sort(buffer.get(1));
 350  
       }
 351  
       
 352  5
       int[] index = new int[buffer.size()];
 353  5
       long S = 0;
 354  10110
       for (int i = 1; i <= numQuantiles - 2; i++) {
 355  10105
         long targetS = (long) Math.ceil(i * (totalElements / (numQuantiles - 1.0)));
 356  
         
 357  
         while (true) {
 358  111103
           double smallest = max;
 359  111103
           int minBufferId = -1;
 360  333309
           for (int j = 0; j < buffer.size(); j++) {
 361  222206
             if (buffer.get(j) != null && index[j] < buffer.get(j).size()) {
 362  149014
               if (!(smallest < buffer.get(j).get(index[j]))) {
 363  149014
                 smallest = buffer.get(j).get(index[j]);
 364  149014
                 minBufferId = j;
 365  
               }
 366  
             }
 367  
           }
 368  
           
 369  111103
           long incrementS = minBufferId <= 1 ? 1L : (0x1L << (minBufferId - 1));
 370  111103
           if (S + incrementS >= targetS) {
 371  10105
             quantiles.add(smallest);
 372  10105
             break;
 373  
           } else {
 374  100998
             index[minBufferId]++;
 375  100998
             S += incrementS;
 376  
           }
 377  100998
         }
 378  
       }
 379  
       
 380  5
       quantiles.add(max);
 381  5
       return quantiles;
 382  
     }
 383  
   }
 384  
 }