Coverage Report - datafu.pig.bags.DistinctBy
 
Classes in this File Line Coverage Branch Coverage Complexity
DistinctBy
79%
34/43
75%
15/20
6
 
 1  
 package datafu.pig.bags;
 2  
 
 3  
 import java.io.IOException;
 4  
 import java.util.HashSet;
 5  
 
 6  
 import org.apache.pig.EvalFunc;
 7  
 import org.apache.pig.backend.executionengine.ExecException;
 8  
 import org.apache.pig.data.BagFactory;
 9  
 import org.apache.pig.data.DataBag;
 10  
 import org.apache.pig.data.DataType;
 11  
 import org.apache.pig.data.Tuple;
 12  
 import org.apache.pig.impl.logicalLayer.FrontendException;
 13  
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 14  
 
 15  
 /**
 16  
  * Get distinct elements in a bag by a given set of field positions.
 17  
  * The input and output schemas will be identical.  
 18  
  * 
 19  
  * The first tuple containing each distinct combination of these fields will be taken.
 20  
  * 
 21  
  * This operation is order preserving.  If both A and B appear in the output,
 22  
  * and A appears before B in the input, then A will appear before B in the output.
 23  
  * 
 24  
  * Example:
 25  
  * <pre>
 26  
  * {@code
 27  
  * define DistinctBy datafu.pig.bags.DistinctBy('0');
 28  
  * 
 29  
  * -- input:
 30  
  * -- ({(a, 1),(a,1),(b, 2),(b,22),(c, 3),(d, 4)})
 31  
  * input = LOAD 'input' AS (B: bag {T: tuple(alpha:CHARARRAY, numeric:INT)});
 32  
  * 
 33  
  * output = FOREACH input GENERATE DistinctBy(B);
 34  
  * 
 35  
  * -- output:
 36  
  * -- ({(a,1),(b,2),(c,3),(d,4)})
 37  
  * } 
 38  
  * </pre>
 39  
  * 
 40  
  * @param map Any number of strings specifying field positions
 41  
  */
 42  1
 public class DistinctBy extends EvalFunc<DataBag>
 43  
 {
 44  
   private final static String delimiter = "-";
 45  1069
   private HashSet<Integer> fields = new HashSet<Integer>();
 46  
   
 47  
   public DistinctBy(String... fields)
 48  1069
   {
 49  2138
     for(String field : fields) {
 50  1069
       this.fields.add(Integer.parseInt(field));
 51  
     }   
 52  1069
   }
 53  
 
 54  
 
 55  
   @Override
 56  
   public DataBag exec(Tuple input) throws IOException
 57  
   {
 58  1
     if (input.size() != 1) {
 59  0
       throw new RuntimeException("Expected input to have only a single field");
 60  
     }    
 61  1
     if (input.getType(0) != DataType.BAG) {
 62  0
       throw new RuntimeException("Expected a BAG as input");
 63  
     }
 64  
     // new hash to record things that have already been seen
 65  1
     HashSet<String> seen = new HashSet<String>();    
 66  
 
 67  1
     DataBag inputBag = (DataBag)input.get(0);
 68  1
     DataBag outputBag = BagFactory.getInstance().newDefaultBag();
 69  1
     for (Tuple t : inputBag) {
 70  8
       String distinctString = getDistinctString(t, this.fields);
 71  8
       if (!seen.contains(distinctString)) {
 72  6
         outputBag.add(t);
 73  6
         seen.add(distinctString);
 74  
       }
 75  8
     }
 76  1
     return outputBag;    
 77  
   }
 78  
   
 79  
   @Override
 80  
   public Schema outputSchema(Schema input)
 81  
   {
 82  
     try {
 83  206
       if (input.size() != 1)
 84  
       {
 85  0
         throw new RuntimeException("Expected input to have only a single field");
 86  
       }
 87  
       
 88  206
       Schema.FieldSchema inputFieldSchema = input.getField(0);
 89  
 
 90  206
       if (inputFieldSchema.type != DataType.BAG)
 91  
       {
 92  0
         throw new RuntimeException("Expected a BAG as input");
 93  
       }
 94  
       
 95  206
       Schema inputBagSchema = inputFieldSchema.schema;
 96  
 
 97  206
       if (inputBagSchema.getField(0).type != DataType.TUPLE)
 98  
       {
 99  0
         throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
 100  
                                                  DataType.findTypeName(inputBagSchema.getField(0).type)));
 101  
       }
 102  
       
 103  206
       Schema inputTupleSchema = inputBagSchema.getField(0).schema;
 104  
       
 105  206
       Schema outputTupleSchema = inputTupleSchema.clone();     
 106  
       
 107  206
       return new Schema(new Schema.FieldSchema(
 108  
             getSchemaName(this.getClass().getName().toLowerCase(), input),
 109  
             outputTupleSchema, 
 110  
             DataType.BAG));
 111  
     }
 112  0
     catch (CloneNotSupportedException e) {
 113  0
       throw new RuntimeException(e);
 114  
     }
 115  0
     catch (FrontendException e) {
 116  0
       throw new RuntimeException(e);
 117  
     }
 118  
   }
 119  
   
 120  
   private String getDistinctString(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException {
 121  8
     String[] tokens = t.toDelimitedString(delimiter).split(delimiter);
 122  8
     StringBuffer buffer = new StringBuffer();
 123  32
     for(int i=0; i<tokens.length; i++) {
 124  24
       if (distinctFieldPositions.contains(i)) {
 125  8
         buffer.append(tokens[i]);
 126  8
         buffer.append(delimiter);
 127  
       }
 128  
     }
 129  8
     buffer.substring(0, buffer.length() - delimiter.length());
 130  8
     return buffer.toString();
 131  
   }
 132  
 
 133  
 }