Coverage Report - datafu.pig.linkanalysis.PageRank

Classes in this File    Line Coverage    Branch Coverage    Complexity
PageRank                73% (102/138)    59% (32/54)        5.7
PageRank$1              100% (3/3)       N/A                5.7
 
   1          /*
   2           * Copyright 2010 LinkedIn, Inc
   3           *
   4           * Licensed under the Apache License, Version 2.0 (the "License"); you may not
   5           * use this file except in compliance with the License. You may obtain a copy of
   6           * the License at
   7           *
   8           * http://www.apache.org/licenses/LICENSE-2.0
   9           *
  10           * Unless required by applicable law or agreed to in writing, software
  11           * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12           * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13           * License for the specific language governing permissions and limitations under
  14           * the License.
  15           */
  16
  17          package datafu.pig.linkanalysis;
  18
  19          import it.unimi.dsi.fastutil.ints.Int2IntMap;
  20
  21          import java.io.IOException;
  22          import java.util.ArrayList;
  23          import java.util.HashMap;
  24          import java.util.List;
  25          import java.util.Map;
  26
  27          import org.apache.pig.Accumulator;
  28          import org.apache.pig.EvalFunc;
  29          import org.apache.pig.data.BagFactory;
  30          import org.apache.pig.data.DataBag;
  31          import org.apache.pig.data.DataType;
  32          import org.apache.pig.data.Tuple;
  33          import org.apache.pig.data.TupleFactory;
  34          import org.apache.pig.impl.logicalLayer.FrontendException;
  35          import org.apache.pig.impl.logicalLayer.schema.Schema;
  36
  37          import datafu.linkanalysis.PageRank.ProgressIndicator;
  38
  39
  40          /**
  41           * A UDF which implements <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>.
  42           * Each graph is stored in memory while running the algorithm, with edges optionally
  43           * spilled to disk to conserve memory.  This can be used to distribute the execution of PageRank on a large number of
  44           * reasonably sized graphs.  It does not distribute execution of PageRank on a single graph.  Each graph is identified
  45           * by an integer valued topic ID.
  46           * <p>
  47           * Example:
  48           * <pre>
  49           * {@code
  50           *
  51           * topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE);
  52           *
  53           * topic_edges_grouped = GROUP topic_edges by (topic, source);
  54           * topic_edges_grouped = FOREACH topic_edges_grouped GENERATE
  55           *    group.topic as topic,
  56           *    group.source as source,
  57           *    topic_edges.(dest,weight) as edges;
  58           *
  59           * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic;
  60           *
  61           * topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE
  62           *    group as topic,
  63           *    FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank);
  64           *
  65           * topic_ranks = FOREACH topic_ranks GENERATE
  66           *    topic, source, rank;
  67           *
  68           * }
  69           * </pre>
  70           */
  71    4229  public class PageRank extends EvalFunc<DataBag> implements Accumulator<DataBag>
  72          {
  73     791    private final datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank();
  74
  75     791    private int maxNodesAndEdges = 100000000;
  76     791    private int maxEdgesInMemory = 30000000;
  77     791    private double tolerance = 1e-16;
  78     791    private int maxIters = 150;
  79     791    private boolean useEdgeDiskStorage = false;
  80     791    private boolean enableDanglingNodeHandling = false;
  81     791    private boolean aborted = false;
  82
  83     791    TupleFactory tupleFactory = TupleFactory.getInstance();
  84     791    BagFactory bagFactory = BagFactory.getInstance();
  85
  86            public PageRank()
  87       0    {
  88       0      initialize();
  89       0    }
  90
  91            public PageRank(String... parameters)
  92     791    {
  93     791      if (parameters.length % 2 != 0)
  94              {
  95       0        throw new RuntimeException("Invalid parameters list");
  96              }
  97
  98    1582      for (int i=0; i<parameters.length; i+=2)
  99              {
 100     791        String parameterName = parameters[i];
 101     791        String value = parameters[i+1];
 102     791        if (parameterName.equals("max_nodes_and_edges"))
 103                {
 104       0          maxNodesAndEdges = Integer.parseInt(value);
 105                }
 106     791        else if (parameterName.equals("max_edges_in_memory"))
 107                {
 108       0          maxEdgesInMemory = Integer.parseInt(value);
 109                }
 110     791        else if (parameterName.equals("tolerance"))
 111                {
 112       0          tolerance = Double.parseDouble(value);
 113                }
 114     791        else if (parameterName.equals("max_iters"))
 115                {
 116       0          maxIters = Integer.parseInt(value);
 117                }
 118     791        else if (parameterName.equals("spill_to_edge_disk_storage"))
 119                {
 120       0          useEdgeDiskStorage = Boolean.parseBoolean(value);
 121                }
 122     791        else if (parameterName.equals("dangling_nodes"))
 123                {
 124     791          enableDanglingNodeHandling = Boolean.parseBoolean(value);
 125                }
 126              }
 127
 128     791      initialize();
 129     791    }
 130
 131            private void initialize()
 132            {
 133     791      long heapSize = Runtime.getRuntime().totalMemory();
 134     791      long heapMaxSize = Runtime.getRuntime().maxMemory();
 135     791      long heapFreeSize = Runtime.getRuntime().freeMemory();
 136          //    System.out.println(String.format("Heap size: %d, Max heap size: %d, Heap free size: %d", heapSize, heapMaxSize, heapFreeSize));
 137
 138     791      if (useEdgeDiskStorage)
 139              {
 140       0        this.graph.enableEdgeDiskCaching();
 141              }
 142              else
 143              {
 144     791        this.graph.disableEdgeDiskCaching();
 145              }
 146
 147     791      if (enableDanglingNodeHandling)
 148              {
 149     791        this.graph.enableDanglingNodeHandling();
 150              }
 151              else
 152              {
 153       0        this.graph.disableDanglingNodeHandling();
 154              }
 155
 156     791      this.graph.setEdgeCachingThreshold(maxEdgesInMemory);
 157     791    }
 158
 159            @Override
 160            public void accumulate(Tuple t) throws IOException
 161            {
 162       1      if (aborted)
 163              {
 164       0        return;
 165              }
 166
 167       1      DataBag bag = (DataBag) t.get(0);
 168       1      if (bag == null || bag.size() == 0)
 169       0        return;
 170
 171       1      for (Tuple sourceTuple : bag)
 172              {
 173      10        Integer sourceId = (Integer)sourceTuple.get(0);
 174      10        DataBag edges = (DataBag)sourceTuple.get(1);
 175
 176      10        ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>();
 177
 178      10        for (Tuple edgeTuple : edges)
 179                {
 180      17          Integer destId = (Integer)edgeTuple.get(0);
 181      17          Double weight = (Double)edgeTuple.get(1);
 182      17          HashMap<String,Object> edgeMap = new HashMap<String, Object>();
 183      17          edgeMap.put("dest",destId);
 184      17          edgeMap.put("weight",weight);
 185      17          edgesMapList.add(edgeMap);
 186      17        }
 187
 188      10        graph.addEdges(sourceId, edgesMapList);
 189
 190      10        if (graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges)
 191                {
 192       0          System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges));
 193       0          aborted = true;
 194                }
 195
 196      10        reporter.progress();
 197      10      }
 198       1    }
 199
 200            @Override
 201            public DataBag getValue()
 202            {
 203       1      if (aborted)
 204              {
 205       0        return null;
 206              }
 207
 208       1      System.out.println(String.format("Nodes: %d, Edges: %d", graph.nodeCount(), graph.edgeCount()));
 209
 210       1      ProgressIndicator progressIndicator = getProgressIndicator();
 211       1      System.out.println("Finished loading graph.");
 212       1      long startTime = System.nanoTime();
 213       1      System.out.println("Initializing.");
 214              try
 215              {
 216       1        graph.init(progressIndicator);
 217              }
 218       0      catch (IOException e)
 219              {
 220       0        e.printStackTrace();
 221       0        return null;
 222       1      }
 223       1      System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/1.0e6));
 224
 225              float totalDiff;
 226       1      int iter = 0;
 227
 228       1      System.out.println("Beginning iterations");
 229       1      startTime = System.nanoTime();
 230              do
 231              {
 232                // TODO log percentage complete every 5 minutes
 233                try
 234                {
 235     150          totalDiff = graph.nextIteration(progressIndicator);
 236                }
 237       0        catch (IOException e)
 238                {
 239       0          e.printStackTrace();
 240       0          return null;
 241     150        }
 242     150        iter++;
 243     150      } while(iter < maxIters && totalDiff > tolerance);
 244       1      System.out.println(String.format("Done, %d iterations took %f ms", iter, (System.nanoTime() - startTime)/1.0e6));
 245
 246       1      DataBag output = bagFactory.newDefaultBag();
 247
 248       1      for (Int2IntMap.Entry node : graph.getNodeIds())
 249              {
 250      11        int nodeId = node.getIntKey();
 251      11        float rank = graph.getNodeRank(nodeId);
 252      11        List<Object> nodeData = new ArrayList<Object>(2);
 253      11        nodeData.add(nodeId);
 254      11        nodeData.add(rank);
 255      11        output.add(tupleFactory.newTuple(nodeData));
 256      11      }
 257
 258       1      return output;
 259            }
 260
 261            @Override
 262            public void cleanup()
 263            {
 264              try
 265              {
 266       2        aborted = false;
 267       2        this.graph.clear();
 268              }
 269       0      catch (IOException e)
 270              {
 271       0        e.printStackTrace();
 272       2      }
 273       2    }
 274
 275            @Override
 276            public DataBag exec(Tuple input) throws IOException
 277            {
 278              try
 279              {
 280       0        accumulate(input);
 281
 282       0        return getValue();
 283              }
 284              finally
 285              {
 286       0        cleanup();
 287              }
 288            }
 289
 290            private ProgressIndicator getProgressIndicator()
 291            {
 292       1      return new ProgressIndicator()
 293       1          {
 294                    @Override
 295                    public void progress()
 296                    {
 297    4228            reporter.progress();
 298    4228          }
 299                  };
 300            }
 301
 302            @Override
 303            public Schema outputSchema(Schema input)
 304            {
 305              try
 306              {
 307     404        Schema.FieldSchema inputFieldSchema = input.getField(0);
 308
 309     404        if (inputFieldSchema.type != DataType.BAG)
 310                {
 311       0          throw new RuntimeException("Expected a BAG as input");
 312                }
 313
 314     404        Schema inputBagSchema = inputFieldSchema.schema;
 315
 316     404        if (inputBagSchema.getField(0).type != DataType.TUPLE)
 317                {
 318       0          throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
 319                                                     DataType.findTypeName(inputBagSchema.getField(0).type)));
 320                }
 321
 322     404        Schema inputTupleSchema = inputBagSchema.getField(0).schema;
 323
 324     404        if (inputTupleSchema.getField(0).type != DataType.INTEGER)
 325                {
 326       0          throw new RuntimeException(String.format("Expected source to be an INTEGER, but instead found %s",
 327                                                     DataType.findTypeName(inputTupleSchema.getField(0).type)));
 328                }
 329
 330     404        if (inputTupleSchema.getField(1).type != DataType.BAG)
 331                {
 332       0          throw new RuntimeException(String.format("Expected edges to be represented with a BAG"));
 333                }
 334
 335     404        Schema.FieldSchema edgesFieldSchema = inputTupleSchema.getField(1);
 336
 337     404        if (edgesFieldSchema.schema.getField(0).type != DataType.TUPLE)
 338                {
 339       0          throw new RuntimeException(String.format("Expected edges field to contain a TUPLE, but instead found %s",
 340                                                     DataType.findTypeName(edgesFieldSchema.schema.getField(0).type)));
 341                }
 342
 343     404        Schema edgesTupleSchema = edgesFieldSchema.schema.getField(0).schema;
 344
 345     404        if (edgesTupleSchema.getField(0).type != DataType.INTEGER)
 346                {
 347       0          throw new RuntimeException(String.format("Expected destination edge ID to be an INTEGER, but instead found %s",
 348                                                     DataType.findTypeName(edgesTupleSchema.getField(0).type)));
 349                }
 350
 351     404        if (edgesTupleSchema.getField(1).type != DataType.DOUBLE)
 352                {
 353       0          throw new RuntimeException(String.format("Expected destination edge weight to be a DOUBLE, but instead found %s",
 354                                                     DataType.findTypeName(edgesTupleSchema.getField(1).type)));
 355                }
 356
 357     404        Schema tupleSchema = new Schema();
 358     404        tupleSchema.add(new Schema.FieldSchema("node",DataType.INTEGER));
 359     404        tupleSchema.add(new Schema.FieldSchema("rank",DataType.FLOAT));
 360
 361     404        return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
 362                                                                            .getName()
 363                                                                            .toLowerCase(), input),
 364                                                          tupleSchema,
 365                                                          DataType.BAG));
 366              }
 367       0      catch (FrontendException e)
 368              {
 369       0        throw new RuntimeException(e);
 370              }
 371            }
 372          }
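
Usage note: the varargs constructor (source lines 91-129) reads its configuration as a flat list of name/value string pairs, which is how Pig passes constructor arguments through DEFINE. The sketch below is illustrative only and not part of the covered source: the jar name and the parameter values are assumptions, while the parameter names (max_nodes_and_edges, max_edges_in_memory, tolerance, max_iters, spill_to_edge_disk_storage, dangling_nodes) come from the if/else chain above.

    REGISTER datafu.jar;  -- assumed artifact name; substitute the actual DataFu jar

    -- dangling_nodes toggles the graph's dangling-node handling;
    -- max_iters and tolerance bound the iteration loop in getValue();
    -- spill_to_edge_disk_storage enables edge disk caching once max_edges_in_memory edges are buffered.
    DEFINE PageRank datafu.pig.linkanalysis.PageRank('dangling_nodes','true','max_iters','100','tolerance','1e-10');

    topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE
       group as topic,
       FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank);

Because parameters are consumed in pairs, an odd-length list throws "Invalid parameters list"; a name that matches none of the recognized keys is silently ignored.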