Coverage Report - datafu.pig.stats.WilsonBinConf
 
Classes in this File Line Coverage Branch Coverage Complexity
WilsonBinConf
79%
27/34
60%
12/20
4.6
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.pig.stats;
 18  
 
 19  
 import java.io.IOException;
 20  
 
 21  
 import org.apache.commons.math.MathException;
 22  
 import org.apache.commons.math.distribution.NormalDistribution;
 23  
 import org.apache.commons.math.distribution.NormalDistributionImpl;
 24  
 import org.apache.pig.data.DataType;
 25  
 import org.apache.pig.data.Tuple;
 26  
 import org.apache.pig.data.TupleFactory;
 27  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 28  
 
 29  
 import com.google.common.collect.ImmutableList;
 30  
 
 31  
 import datafu.pig.util.SimpleEvalFunc;
 32  
 
 33  
 /**
 34  
  * Computes the <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>.
 35  
  * <p>
 36  
  * Constructor requires the significance level (alpha) parameter, and the UDF
 37  
  * parameters are the number of positive (success) outcomes and the total
 38  
  * number of observations. The UDF returns the (lower,upper) confidence
 39  
  * interval. 
 40  
  * <p>
 41  
  * Example:
 42  
  * <pre>
 43  
  * {@code
 44  
  * -- the Wilsonian binomial proportion confidence interval for scoring
 45  
  * %declare WILSON_ALPHA 0.10
 46  
  *
 47  
  * define WilsonBinConf      datafu.pig.stats.WilsonBinConf('$WILSON_ALPHA'); 
 48  
  *
 49  
  * bar = FOREACH foo GENERATE WilsonBinConf(successes, totals).lower as score;
 50  
  * quux = ORDER bar BY score DESC;
 51  
  * top = LIMIT quux 10;
 52  
  * }
 53  
  * </pre></p>
 54  
  */
 55  
 public class WilsonBinConf extends SimpleEvalFunc<Tuple>
 56  
 {
 57  1
   private static TupleFactory tupleFactory = TupleFactory.getInstance();
 58  
   private final double alpha;
 59  
 
 60  
   public WilsonBinConf(double alpha)
 61  26
   {
 62  26
     this.alpha = alpha;
 63  26
   }
 64  
 
 65  
   public WilsonBinConf(String alpha)
 66  
   {
 67  26
     this(Double.parseDouble(alpha));
 68  26
   }
 69  
 
 70  
   public Tuple call(Number x, Number n) throws IOException
 71  
   {
 72  7
     if (x == null || n == null)
 73  0
       return null;
 74  7
     return binconf(x.longValue(), n.longValue());
 75  
   }
 76  
   
 77  
   /**
 78  
    * @param x The number of positive (success) outcomes
 79  
    * @param n The number of observations
 80  
    * @return The (lower,upper) confidence interval
 81  
    */
 82  
   public Tuple binconf(Long x, Long n) throws IOException
 83  
   {
 84  7
     NormalDistribution normalDist = new NormalDistributionImpl();
 85  
 
 86  7
     if (x == null || n == null)
 87  0
       return null;
 88  7
     if (x < 0 || n < 0)
 89  0
       throw new IllegalArgumentException("non-negative values expected");
 90  7
     if (x > n)
 91  0
       throw new IllegalArgumentException("invariant violation: number of successes > number of obs");
 92  7
     if (n == 0)
 93  0
       return tupleFactory.newTuple(ImmutableList.of(Double.valueOf(0), Double.valueOf(0)));
 94  
 
 95  
     try {
 96  7
       double zcrit = -1.0 * normalDist.inverseCumulativeProbability(alpha/2);
 97  7
       double z2 = zcrit * zcrit;
 98  7
       double p = x/(double)n;
 99  
 
 100  7
       double a = p + z2/2/n;
 101  7
       double b = zcrit * Math.sqrt((p * (1 - p) + z2/4/n)/n);
 102  7
       double c = (1 + z2/n);
 103  
 
 104  7
       double lower = (a - b) / c;
 105  7
       double upper = (a + b) / c;
 106  
 
 107  
       // Add corrections for when x is very close to n.  This improves the estimates.
 108  
       // For more info on wilson binomial confidence interval, see paper:
 109  
       // L.D. Brown, T.T. Cai and A. DasGupta, Interval estimation for a binomial proportion (with discussion), 
 110  
       //   _Statistical Science,_*16*:101-133, 2001. 
 111  
       // http://www-stat.wharton.upenn.edu/~tcai/paper/Binomial-StatSci.pdf
 112  
       
 113  7
       if (x == 1)
 114  2
         lower = -Math.log(1 - alpha)/n;
 115  7
       if (x == (n - 1))
 116  2
         upper = 1 + Math.log(1 - alpha)/n;
 117  
 
 118  7
       return tupleFactory.newTuple(ImmutableList.of(lower, upper));
 119  
     }
 120  0
     catch (MathException e) {
 121  0
       throw new IOException("math error", e);
 122  
     }
 123  
   }
 124  
 
 125  
   @Override
 126  
   public Schema outputSchema(Schema input)
 127  
   {
 128  14
     return new Schema(ImmutableList.of(
 129  
             new Schema.FieldSchema("lower", DataType.DOUBLE),
 130  
             new Schema.FieldSchema("upper", DataType.DOUBLE)));
 131  
   }
 132  
 }