Coverage Report - datafu.linkanalysis.PageRank
 
Classes in this File Line Coverage Branch Coverage Complexity
PageRank
96%
151/157
93%
41/44
2
PageRank$1
100%
4/4
N/A
2
PageRank$ProgressIndicator
N/A
N/A
2
 
 1  
 /*
 2  
  * Copyright 2010 LinkedIn, Inc
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5  
  * use this file except in compliance with the License. You may obtain a copy of
 6  
  * the License at
 7  
  * 
 8  
  * http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12  
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13  
  * License for the specific language governing permissions and limitations under
 14  
  * the License.
 15  
  */
 16  
  
 17  
 package datafu.linkanalysis;
 18  
 
 19  
 import it.unimi.dsi.fastutil.floats.FloatArrayList;
 20  
 import it.unimi.dsi.fastutil.ints.Int2IntMap;
 21  
 import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
 22  
 import it.unimi.dsi.fastutil.ints.IntArrayList;
 23  
 
 24  
 import java.io.BufferedInputStream;
 25  
 import java.io.BufferedOutputStream;
 26  
 import java.io.DataInputStream;
 27  
 import java.io.DataOutputStream;
 28  
 import java.io.File;
 29  
 import java.io.FileInputStream;
 30  
 import java.io.FileOutputStream;
 31  
 import java.io.IOException;
 32  
 import java.util.ArrayList;
 33  
 import java.util.Iterator;
 34  
 import java.util.Map;
 35  
 
 36  
 import com.google.common.collect.AbstractIterator;
 37  
 
 38  
 /**
 39  
  * An implementation of {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}.
 40  
  * This implementation is not distributed.  It is intended for graphs of a reasonable size which can be processed
 41  
  * on a single machine.  Nodes are stored in memory.  Edges are stored in memory and can optionally be spilled to
 42  
  * disk once a certain limit is reached.  
 43  
  */
 44  795
 public class PageRank
 45  
 {    
 46  
   private float totalRankChange;
 47  
   private long edgeCount;
 48  
   private long nodeCount;
 49  
   
 50  
   // the damping factor
 51  1
   private static float ALPHA = 0.85f;
 52  
   
 53  
   // edge weights (which are doubles) are multiplied by this value so they can be stored as integers internally
 54  1
   private static float EDGE_WEIGHT_MULTIPLIER = 100000;
 55  
     
 56  795
   private final Int2IntOpenHashMap nodeIndices = new Int2IntOpenHashMap();
 57  795
   private final FloatArrayList nodeData = new FloatArrayList(); // rank, total weight, contribution, (repeat)
 58  
   
 59  795
   private final IntArrayList danglingNodes = new IntArrayList();
 60  
   
 61  795
   private final IntArrayList edges = new IntArrayList(); // source, dest node count... dest id, weight pos, (repeat)
 62  
   
 63  795
   private boolean shouldHandleDanglingNodes = false;
 64  795
   private boolean shouldCacheEdgesOnDisk = false;
 65  
   private long edgeCachingThreshold;
 66  
   
 67  
   private File edgesFile;
 68  
   private DataOutputStream edgeDataOutputStream;
 69  
   private boolean usingEdgeDiskCache;
 70  
   
 71  795
   public interface ProgressIndicator
 72  
   {
 73  
     void progress();
 74  
   }
 75  
   
 76  
   public void clear() throws IOException
 77  
   {
 78  2
     this.edgeCount = 0;
 79  2
     this.nodeCount = 0;
 80  2
     this.totalRankChange = 0.0f;
 81  
     
 82  2
     this.nodeIndices.clear();
 83  2
     this.nodeData.clear();
 84  2
     this.edges.clear();
 85  2
     this.danglingNodes.clear();
 86  
     
 87  2
     if (edgeDataOutputStream != null)
 88  
     {
 89  0
       this.edgeDataOutputStream.close();
 90  0
       this.edgeDataOutputStream = null;
 91  
     }
 92  
     
 93  2
     this.usingEdgeDiskCache = false;
 94  2
     this.edgesFile = null;
 95  2
   }
 96  
   
 97  
   /**
 98  
    * Gets whether disk is being used to cache edges.
 99  
    * @return True if the edges are cached on disk.
 100  
    */
 101  
   public boolean isUsingEdgeDiskCache()
 102  
   {
 103  1
     return usingEdgeDiskCache;
 104  
   }
 105  
   
 106  
   /**
 107  
    * Enable disk caching of edges once there are too many (disabled by default).
 108  
    */
 109  
   public void enableEdgeDiskCaching()
 110  
   {
 111  2
     shouldCacheEdgesOnDisk = true;
 112  2
   }
 113  
   
 114  
   /**
 115  
    * Disable disk caching of edges once there are too many (disabled by default).
 116  
    */
 117  
   public void disableEdgeDiskCaching()
 118  
   {
 119  791
     shouldCacheEdgesOnDisk = false;
 120  791
   }
 121  
   
 122  
   /**
 123  
    * Gets whether edge disk caching is enabled.
 124  
    * @return True if edge disk caching is enabled.
 125  
    */
 126  
   public boolean isEdgeDiskCachingEnabled()
 127  
   {
 128  0
     return shouldCacheEdgesOnDisk;
 129  
   }
 130  
   
 131  
   /**
 132  
    * Gets the number of edges past which they will be cached on disk instead of in memory.
 133  
    * Edge disk caching must be enabled for this to have any effect.
 134  
    * @return Edge count past which caching occurs
 135  
    */
 136  
   public long getEdgeCachingThreshold()
 137  
   {
 138  0
     return edgeCachingThreshold;
 139  
   }
 140  
 
 141  
   /**
 142  
    * Set the number of edges past which they will be cached on disk instead of in memory.
 143  
    * Edge disk caching must be enabled for this to have any effect.
 144  
    * @param count Edge count past which caching occurs
 145  
    */
 146  
   public void setEdgeCachingThreshold(long count)
 147  
   {
 148  793
     edgeCachingThreshold = count;
 149  793
   }
 150  
   
 151  
   /**
 152  
    * Enables dangling node handling (disabled by default).
 153  
    */
 154  
   public void enableDanglingNodeHandling()
 155  
   {
 156  795
     shouldHandleDanglingNodes = true;
 157  795
   }
 158  
   
 159  
   /**
 160  
    * Disables dangling node handling (disabled by default).
 161  
    */
 162  
   public void disableDanglingNodeHandling()
 163  
   {
 164  0
     shouldHandleDanglingNodes = false;
 165  0
   }
 166  
   
 167  
   public long nodeCount()
 168  
   {
 169  11
     return this.nodeCount;
 170  
   }
 171  
   
 172  
   public long edgeCount()
 173  
   {
 174  11
     return this.edgeCount;
 175  
   }
 176  
 
 177  
   public Int2IntMap.FastEntrySet getNodeIds()
 178  
   {
 179  1
     return this.nodeIndices.int2IntEntrySet();
 180  
   }
 181  
   
 182  
   public float getNodeRank(int nodeId)
 183  
   {
 184  33
     int nodeIndex = this.nodeIndices.get(nodeId);
 185  33
     return nodeData.get(nodeIndex);
 186  
   }
 187  
   
 188  
   public float getTotalRankChange()
 189  
   {
 190  750
     return this.totalRankChange;
 191  
   }
 192  
   
 193  
   private void maybeCreateNode(int nodeId)
 194  
   {
 195  
     // create from node if it doesn't already exist
 196  200081
     if (!nodeIndices.containsKey(nodeId))
 197  
     {      
 198  100035
       int index = this.nodeData.size();
 199  
       
 200  100035
       this.nodeData.add(0.0f); // rank
 201  100035
       this.nodeData.add(0.0f); // total weight
 202  100035
       this.nodeData.add(0.0f); // contribution
 203  
       
 204  100035
       this.nodeIndices.put(nodeId, index);
 205  
       
 206  100035
       this.nodeCount++;
 207  
     }
 208  200081
   }
 209  
   
 210  
   public void addEdges(Integer sourceId, ArrayList<Map<String,Object>> sourceEdges) throws IOException
 211  
   {
 212  100030
     int source = sourceId.intValue();
 213  
    
 214  100030
     maybeCreateNode(source);
 215  
     
 216  100030
     if (this.shouldCacheEdgesOnDisk && !usingEdgeDiskCache && (sourceEdges.size() + this.edgeCount) >= this.edgeCachingThreshold)
 217  
     {
 218  2
       writeEdgesToDisk();
 219  
     }
 220  
     
 221  
     // store the source node id itself
 222  100030
     appendEdgeData(source);
 223  
     
 224  
     // store how many outgoing edges this node has
 225  100030
     appendEdgeData(sourceEdges.size());
 226  
     
 227  
     // store the outgoing edges
 228  100030
     for (Map<String,Object> edge : sourceEdges)
 229  
     {
 230  100051
       int dest = ((Integer)edge.get("dest")).intValue();
 231  100051
       float weight = ((Double)edge.get("weight")).floatValue();
 232  
             
 233  100051
       maybeCreateNode(dest);
 234  
       
 235  100051
       appendEdgeData(dest);
 236  
       
 237  
       // location of weight in weights array
 238  100051
       appendEdgeData(Math.max(1, (int)(weight * EDGE_WEIGHT_MULTIPLIER)));
 239  
       
 240  100051
       this.edgeCount++;
 241  100051
     }
 242  100030
   }
 243  
   
 244  
   private void appendEdgeData(int data) throws IOException
 245  
   {
 246  400162
     if (this.edgeDataOutputStream != null)
 247  
     {
 248  200024
       this.edgeDataOutputStream.writeInt(data);
 249  
     }
 250  
     else
 251  
     {
 252  200138
       this.edges.add(data);
 253  
     }
 254  400162
   }
 255  
     
 256  
  /**
   * Prepares the graph for iteration.  Flushes and closes any pending edge
   * cache writes, seeds every node with an equal share of the total rank,
   * accumulates each node's total outgoing edge weight, and (when enabled)
   * identifies dangling nodes.  Must be called after all edges are added and
   * before the first iteration.
   * @param progressIndicator callback invoked once per node and per edge
   * @throws IOException if reading the edge disk cache fails
   */
  public void init(ProgressIndicator progressIndicator) throws IOException
  {
    // switch from write mode to read mode on the disk cache, if in use
    if (this.edgeDataOutputStream != null)
    {
      this.edgeDataOutputStream.close();
      this.edgeDataOutputStream = null;
    }
    
    // initialize all nodes to an equal share of the total rank (1.0)
    float nodeRank = 1.0f / this.nodeCount;        
    for (int j=0; j<this.nodeData.size(); j+=3)
    {
      nodeData.set(j, nodeRank);      
      progressIndicator.progress();
    }      
    
    Iterator<Integer> edgeData = getEdgeData();
    
    // edge record layout: source id, edge count, then (dest id, weight) pairs
    while(edgeData.hasNext())
    {
      int sourceId = edgeData.next();
      int nodeEdgeCount = edgeData.next();
      
      while (nodeEdgeCount-- > 0)
      {
        // skip the destination node id
        edgeData.next();
        
        // weight is stored as a scaled integer; widened to float here
        float weight = edgeData.next();
                
        int nodeIndex = this.nodeIndices.get(sourceId);
        
        // accumulate the source node's total outgoing weight (slot +1)
        float totalWeight = this.nodeData.getFloat(nodeIndex+1); 
        totalWeight += weight;
        this.nodeData.set(nodeIndex+1, totalWeight);
        
        progressIndicator.progress();
      }
    }
    
    // if handling dangling nodes, get a list of them by finding those nodes with no outgoing
    // edges (i.e. total outgoing edge weight is 0.0)
    if (shouldHandleDanglingNodes)
    {
      for (Map.Entry<Integer,Integer> e : nodeIndices.entrySet())
      {
        int nodeId = e.getKey();
        int nodeIndex = e.getValue();
        float totalWeight = nodeData.getFloat(nodeIndex+1);
        if (totalWeight == 0.0f)
        {
          danglingNodes.add(nodeId);
        }
      }
    }
  }
 312  
   
 313  
   public float nextIteration(ProgressIndicator progressIndicator) throws IOException
 314  
   {
 315  750
     distribute(progressIndicator);
 316  750
     commit(progressIndicator);
 317  
     
 318  750
     return getTotalRankChange();
 319  
   }
 320  
   
 321  
  /**
   * Distribution phase of an iteration: pushes each node's current rank
   * along its outgoing edges in proportion to edge weight, accumulating it
   * into each destination node's contribution slot.  When dangling node
   * handling is enabled, the rank held by nodes with no outgoing edges is
   * spread evenly over all nodes.  The damping factor is applied later, in
   * {@link #commit}.
   * @param progressIndicator callback invoked once per edge
   * @throws IOException if reading the edge disk cache fails
   */
  public void distribute(ProgressIndicator progressIndicator) throws IOException
  {    
    Iterator<Integer> edgeData = getEdgeData();
    
    // edge record layout: source id, edge count, then (dest id, weight) pairs
    while(edgeData.hasNext())
    {
      int sourceId = edgeData.next();
      int nodeEdgeCount = edgeData.next();
      
      while (nodeEdgeCount-- > 0)
      {
        int toId = edgeData.next();
        float weight = edgeData.next();
                
        int fromNodeIndex = this.nodeIndices.get(sourceId);
        int toNodeIndex = this.nodeIndices.get(toId);
        
        // share of the source's rank proportional to this edge's weight;
        // weights are scaled integers but the scale cancels in the ratio
        float contributionChange = weight * this.nodeData.getFloat(fromNodeIndex) / this.nodeData.getFloat(fromNodeIndex+1);
        
        // accumulate into the destination's contribution slot (+2)
        float currentContribution = this.nodeData.getFloat(toNodeIndex+2);
        this.nodeData.set(toNodeIndex+2, currentContribution + contributionChange);
        
        progressIndicator.progress();
      }      
    }
    
    if (shouldHandleDanglingNodes)
    {
      // get the rank from each of the dangling nodes
      float totalRank = 0.0f;
      for (int nodeId : danglingNodes)
      {
        int nodeIndex = nodeIndices.get(nodeId);
        float rank = nodeData.get(nodeIndex);
        totalRank += rank;
      }
      
      // distribute the dangling node ranks to all the nodes in the graph
      // note: the alpha factor is applied in the commit stage
      float contributionIncrease = totalRank / this.nodeCount;
      for (int i=2; i<nodeData.size(); i += 3)
      {
        float contribution = nodeData.getFloat(i);
        contribution += contributionIncrease;
        nodeData.set(i, contribution);
      }
    }
  }
 369  
   
 370  
   public void commit(ProgressIndicator progressIndicator)
 371  
   {
 372  750
     this.totalRankChange = 0.0f;
 373  
     
 374  750
     for (int id : nodeIndices.keySet())
 375  
     {
 376  15005250
       int nodeIndex = this.nodeIndices.get(id);
 377  
       
 378  15005250
       float alpha = datafu.linkanalysis.PageRank.ALPHA;
 379  15005250
       float newRank = (1.0f - alpha)/nodeCount + alpha * this.nodeData.get(nodeIndex+2);
 380  
       
 381  15005250
       this.nodeData.set(nodeIndex+2, 0.0f);
 382  
       
 383  15005250
       float lastRankDiff = newRank - this.nodeData.get(nodeIndex);
 384  
       
 385  15005250
       this.nodeData.set(nodeIndex, newRank);
 386  
       
 387  15005250
       this.totalRankChange += Math.abs(lastRankDiff);
 388  
       
 389  15005250
       progressIndicator.progress();
 390  15005250
     }
 391  750
   }
 392  
   
 393  
   private void writeEdgesToDisk() throws IOException
 394  
   { 
 395  2
     this.edgesFile = File.createTempFile("fastgraph", null);
 396  
     
 397  2
     FileOutputStream outStream = new FileOutputStream(this.edgesFile);
 398  2
     BufferedOutputStream bufferedStream = new BufferedOutputStream(outStream);
 399  2
     this.edgeDataOutputStream = new DataOutputStream(bufferedStream);
 400  
     
 401  2
     for (int edgeData : edges)
 402  
     {
 403  30
       this.edgeDataOutputStream.writeInt(edgeData);
 404  
     }
 405  
     
 406  2
     this.edges.clear();
 407  2
     usingEdgeDiskCache = true;
 408  2
   }
 409  
   
 410  
   private Iterator<Integer> getEdgeData() throws IOException
 411  
   {
 412  755
     if (!usingEdgeDiskCache)
 413  
     {
 414  453
       return this.edges.iterator();
 415  
     }
 416  
     else
 417  
     {
 418  302
       FileInputStream fileInputStream = new FileInputStream(this.edgesFile);
 419  302
       BufferedInputStream inputStream = new BufferedInputStream(fileInputStream);
 420  302
       final DataInputStream dataInputStream = new DataInputStream(inputStream);
 421  
       
 422  302
       return new AbstractIterator<Integer>() {
 423  
         
 424  
         @Override
 425  
         protected Integer computeNext()
 426  
         {
 427  
           try
 428  
           {
 429  30208456
             return dataInputStream.readInt();
 430  
           }
 431  302
           catch (IOException e)
 432  
           {
 433  302
             return endOfData();
 434  
           }
 435  
         }
 436  
         
 437  
       };
 438  
     }
 439  
   }
 440  
 }
 441