1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package datafu.linkanalysis; |
18 | |
|
19 | |
import it.unimi.dsi.fastutil.floats.FloatArrayList; |
20 | |
import it.unimi.dsi.fastutil.ints.Int2IntMap; |
21 | |
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; |
22 | |
import it.unimi.dsi.fastutil.ints.IntArrayList; |
23 | |
|
24 | |
import java.io.BufferedInputStream; |
25 | |
import java.io.BufferedOutputStream; |
26 | |
import java.io.DataInputStream; |
27 | |
import java.io.DataOutputStream; |
28 | |
import java.io.File; |
29 | |
import java.io.FileInputStream; |
30 | |
import java.io.FileOutputStream; |
31 | |
import java.io.IOException; |
32 | |
import java.util.ArrayList; |
33 | |
import java.util.Iterator; |
34 | |
import java.util.Map; |
35 | |
|
36 | |
import com.google.common.collect.AbstractIterator; |
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | 795 | public class PageRank |
45 | |
{ |
46 | |
private float totalRankChange; |
47 | |
private long edgeCount; |
48 | |
private long nodeCount; |
49 | |
|
50 | |
|
51 | 1 | private static float ALPHA = 0.85f; |
52 | |
|
53 | |
|
54 | 1 | private static float EDGE_WEIGHT_MULTIPLIER = 100000; |
55 | |
|
56 | 795 | private final Int2IntOpenHashMap nodeIndices = new Int2IntOpenHashMap(); |
57 | 795 | private final FloatArrayList nodeData = new FloatArrayList(); |
58 | |
|
59 | 795 | private final IntArrayList danglingNodes = new IntArrayList(); |
60 | |
|
61 | 795 | private final IntArrayList edges = new IntArrayList(); |
62 | |
|
63 | 795 | private boolean shouldHandleDanglingNodes = false; |
64 | 795 | private boolean shouldCacheEdgesOnDisk = false; |
65 | |
private long edgeCachingThreshold; |
66 | |
|
67 | |
private File edgesFile; |
68 | |
private DataOutputStream edgeDataOutputStream; |
69 | |
private boolean usingEdgeDiskCache; |
70 | |
|
71 | 795 | public interface ProgressIndicator |
72 | |
{ |
73 | |
void progress(); |
74 | |
} |
75 | |
|
76 | |
public void clear() throws IOException |
77 | |
{ |
78 | 2 | this.edgeCount = 0; |
79 | 2 | this.nodeCount = 0; |
80 | 2 | this.totalRankChange = 0.0f; |
81 | |
|
82 | 2 | this.nodeIndices.clear(); |
83 | 2 | this.nodeData.clear(); |
84 | 2 | this.edges.clear(); |
85 | 2 | this.danglingNodes.clear(); |
86 | |
|
87 | 2 | if (edgeDataOutputStream != null) |
88 | |
{ |
89 | 0 | this.edgeDataOutputStream.close(); |
90 | 0 | this.edgeDataOutputStream = null; |
91 | |
} |
92 | |
|
93 | 2 | this.usingEdgeDiskCache = false; |
94 | 2 | this.edgesFile = null; |
95 | 2 | } |
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
|
101 | |
public boolean isUsingEdgeDiskCache() |
102 | |
{ |
103 | 1 | return usingEdgeDiskCache; |
104 | |
} |
105 | |
|
106 | |
|
107 | |
|
108 | |
|
109 | |
public void enableEdgeDiskCaching() |
110 | |
{ |
111 | 2 | shouldCacheEdgesOnDisk = true; |
112 | 2 | } |
113 | |
|
114 | |
|
115 | |
|
116 | |
|
117 | |
public void disableEdgeDiskCaching() |
118 | |
{ |
119 | 791 | shouldCacheEdgesOnDisk = false; |
120 | 791 | } |
121 | |
|
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | |
public boolean isEdgeDiskCachingEnabled() |
127 | |
{ |
128 | 0 | return shouldCacheEdgesOnDisk; |
129 | |
} |
130 | |
|
131 | |
|
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
public long getEdgeCachingThreshold() |
137 | |
{ |
138 | 0 | return edgeCachingThreshold; |
139 | |
} |
140 | |
|
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
public void setEdgeCachingThreshold(long count) |
147 | |
{ |
148 | 793 | edgeCachingThreshold = count; |
149 | 793 | } |
150 | |
|
151 | |
|
152 | |
|
153 | |
|
154 | |
public void enableDanglingNodeHandling() |
155 | |
{ |
156 | 795 | shouldHandleDanglingNodes = true; |
157 | 795 | } |
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
public void disableDanglingNodeHandling() |
163 | |
{ |
164 | 0 | shouldHandleDanglingNodes = false; |
165 | 0 | } |
166 | |
|
167 | |
public long nodeCount() |
168 | |
{ |
169 | 11 | return this.nodeCount; |
170 | |
} |
171 | |
|
172 | |
public long edgeCount() |
173 | |
{ |
174 | 11 | return this.edgeCount; |
175 | |
} |
176 | |
|
177 | |
public Int2IntMap.FastEntrySet getNodeIds() |
178 | |
{ |
179 | 1 | return this.nodeIndices.int2IntEntrySet(); |
180 | |
} |
181 | |
|
182 | |
public float getNodeRank(int nodeId) |
183 | |
{ |
184 | 33 | int nodeIndex = this.nodeIndices.get(nodeId); |
185 | 33 | return nodeData.get(nodeIndex); |
186 | |
} |
187 | |
|
188 | |
public float getTotalRankChange() |
189 | |
{ |
190 | 750 | return this.totalRankChange; |
191 | |
} |
192 | |
|
193 | |
private void maybeCreateNode(int nodeId) |
194 | |
{ |
195 | |
|
196 | 200081 | if (!nodeIndices.containsKey(nodeId)) |
197 | |
{ |
198 | 100035 | int index = this.nodeData.size(); |
199 | |
|
200 | 100035 | this.nodeData.add(0.0f); |
201 | 100035 | this.nodeData.add(0.0f); |
202 | 100035 | this.nodeData.add(0.0f); |
203 | |
|
204 | 100035 | this.nodeIndices.put(nodeId, index); |
205 | |
|
206 | 100035 | this.nodeCount++; |
207 | |
} |
208 | 200081 | } |
209 | |
|
210 | |
public void addEdges(Integer sourceId, ArrayList<Map<String,Object>> sourceEdges) throws IOException |
211 | |
{ |
212 | 100030 | int source = sourceId.intValue(); |
213 | |
|
214 | 100030 | maybeCreateNode(source); |
215 | |
|
216 | 100030 | if (this.shouldCacheEdgesOnDisk && !usingEdgeDiskCache && (sourceEdges.size() + this.edgeCount) >= this.edgeCachingThreshold) |
217 | |
{ |
218 | 2 | writeEdgesToDisk(); |
219 | |
} |
220 | |
|
221 | |
|
222 | 100030 | appendEdgeData(source); |
223 | |
|
224 | |
|
225 | 100030 | appendEdgeData(sourceEdges.size()); |
226 | |
|
227 | |
|
228 | 100030 | for (Map<String,Object> edge : sourceEdges) |
229 | |
{ |
230 | 100051 | int dest = ((Integer)edge.get("dest")).intValue(); |
231 | 100051 | float weight = ((Double)edge.get("weight")).floatValue(); |
232 | |
|
233 | 100051 | maybeCreateNode(dest); |
234 | |
|
235 | 100051 | appendEdgeData(dest); |
236 | |
|
237 | |
|
238 | 100051 | appendEdgeData(Math.max(1, (int)(weight * EDGE_WEIGHT_MULTIPLIER))); |
239 | |
|
240 | 100051 | this.edgeCount++; |
241 | 100051 | } |
242 | 100030 | } |
243 | |
|
244 | |
private void appendEdgeData(int data) throws IOException |
245 | |
{ |
246 | 400162 | if (this.edgeDataOutputStream != null) |
247 | |
{ |
248 | 200024 | this.edgeDataOutputStream.writeInt(data); |
249 | |
} |
250 | |
else |
251 | |
{ |
252 | 200138 | this.edges.add(data); |
253 | |
} |
254 | 400162 | } |
255 | |
|
256 | |
public void init(ProgressIndicator progressIndicator) throws IOException |
257 | |
{ |
258 | 5 | if (this.edgeDataOutputStream != null) |
259 | |
{ |
260 | 2 | this.edgeDataOutputStream.close(); |
261 | 2 | this.edgeDataOutputStream = null; |
262 | |
} |
263 | |
|
264 | |
|
265 | 5 | float nodeRank = 1.0f / this.nodeCount; |
266 | 100040 | for (int j=0; j<this.nodeData.size(); j+=3) |
267 | |
{ |
268 | 100035 | nodeData.set(j, nodeRank); |
269 | 100035 | progressIndicator.progress(); |
270 | |
} |
271 | |
|
272 | 5 | Iterator<Integer> edgeData = getEdgeData(); |
273 | |
|
274 | 100035 | while(edgeData.hasNext()) |
275 | |
{ |
276 | 100030 | int sourceId = edgeData.next(); |
277 | 100030 | int nodeEdgeCount = edgeData.next(); |
278 | |
|
279 | 200081 | while (nodeEdgeCount-- > 0) |
280 | |
{ |
281 | |
|
282 | 100051 | edgeData.next(); |
283 | |
|
284 | 100051 | float weight = edgeData.next(); |
285 | |
|
286 | 100051 | int nodeIndex = this.nodeIndices.get(sourceId); |
287 | |
|
288 | 100051 | float totalWeight = this.nodeData.getFloat(nodeIndex+1); |
289 | 100051 | totalWeight += weight; |
290 | 100051 | this.nodeData.set(nodeIndex+1, totalWeight); |
291 | |
|
292 | 100051 | progressIndicator.progress(); |
293 | 100051 | } |
294 | 100030 | } |
295 | |
|
296 | |
|
297 | |
|
298 | 5 | if (shouldHandleDanglingNodes) |
299 | |
{ |
300 | 5 | for (Map.Entry<Integer,Integer> e : nodeIndices.entrySet()) |
301 | |
{ |
302 | 100035 | int nodeId = e.getKey(); |
303 | 100035 | int nodeIndex = e.getValue(); |
304 | 100035 | float totalWeight = nodeData.getFloat(nodeIndex+1); |
305 | 100035 | if (totalWeight == 0.0f) |
306 | |
{ |
307 | 5 | danglingNodes.add(nodeId); |
308 | |
} |
309 | 100035 | } |
310 | |
} |
311 | 5 | } |
312 | |
|
313 | |
public float nextIteration(ProgressIndicator progressIndicator) throws IOException |
314 | |
{ |
315 | 750 | distribute(progressIndicator); |
316 | 750 | commit(progressIndicator); |
317 | |
|
318 | 750 | return getTotalRankChange(); |
319 | |
} |
320 | |
|
321 | |
public void distribute(ProgressIndicator progressIndicator) throws IOException |
322 | |
{ |
323 | 750 | Iterator<Integer> edgeData = getEdgeData(); |
324 | |
|
325 | 15005250 | while(edgeData.hasNext()) |
326 | |
{ |
327 | 15004500 | int sourceId = edgeData.next(); |
328 | 15004500 | int nodeEdgeCount = edgeData.next(); |
329 | |
|
330 | 30012150 | while (nodeEdgeCount-- > 0) |
331 | |
{ |
332 | 15007650 | int toId = edgeData.next(); |
333 | 15007650 | float weight = edgeData.next(); |
334 | |
|
335 | 15007650 | int fromNodeIndex = this.nodeIndices.get(sourceId); |
336 | 15007650 | int toNodeIndex = this.nodeIndices.get(toId); |
337 | |
|
338 | 15007650 | float contributionChange = weight * this.nodeData.getFloat(fromNodeIndex) / this.nodeData.getFloat(fromNodeIndex+1); |
339 | |
|
340 | 15007650 | float currentContribution = this.nodeData.getFloat(toNodeIndex+2); |
341 | 15007650 | this.nodeData.set(toNodeIndex+2, currentContribution + contributionChange); |
342 | |
|
343 | 15007650 | progressIndicator.progress(); |
344 | 15007650 | } |
345 | 15004500 | } |
346 | |
|
347 | 750 | if (shouldHandleDanglingNodes) |
348 | |
{ |
349 | |
|
350 | 750 | float totalRank = 0.0f; |
351 | 750 | for (int nodeId : danglingNodes) |
352 | |
{ |
353 | 750 | int nodeIndex = nodeIndices.get(nodeId); |
354 | 750 | float rank = nodeData.get(nodeIndex); |
355 | 750 | totalRank += rank; |
356 | 750 | } |
357 | |
|
358 | |
|
359 | |
|
360 | 750 | float contributionIncrease = totalRank / this.nodeCount; |
361 | 15006000 | for (int i=2; i<nodeData.size(); i += 3) |
362 | |
{ |
363 | 15005250 | float contribution = nodeData.getFloat(i); |
364 | 15005250 | contribution += contributionIncrease; |
365 | 15005250 | nodeData.set(i, contribution); |
366 | |
} |
367 | |
} |
368 | 750 | } |
369 | |
|
370 | |
public void commit(ProgressIndicator progressIndicator) |
371 | |
{ |
372 | 750 | this.totalRankChange = 0.0f; |
373 | |
|
374 | 750 | for (int id : nodeIndices.keySet()) |
375 | |
{ |
376 | 15005250 | int nodeIndex = this.nodeIndices.get(id); |
377 | |
|
378 | 15005250 | float alpha = datafu.linkanalysis.PageRank.ALPHA; |
379 | 15005250 | float newRank = (1.0f - alpha)/nodeCount + alpha * this.nodeData.get(nodeIndex+2); |
380 | |
|
381 | 15005250 | this.nodeData.set(nodeIndex+2, 0.0f); |
382 | |
|
383 | 15005250 | float lastRankDiff = newRank - this.nodeData.get(nodeIndex); |
384 | |
|
385 | 15005250 | this.nodeData.set(nodeIndex, newRank); |
386 | |
|
387 | 15005250 | this.totalRankChange += Math.abs(lastRankDiff); |
388 | |
|
389 | 15005250 | progressIndicator.progress(); |
390 | 15005250 | } |
391 | 750 | } |
392 | |
|
393 | |
private void writeEdgesToDisk() throws IOException |
394 | |
{ |
395 | 2 | this.edgesFile = File.createTempFile("fastgraph", null); |
396 | |
|
397 | 2 | FileOutputStream outStream = new FileOutputStream(this.edgesFile); |
398 | 2 | BufferedOutputStream bufferedStream = new BufferedOutputStream(outStream); |
399 | 2 | this.edgeDataOutputStream = new DataOutputStream(bufferedStream); |
400 | |
|
401 | 2 | for (int edgeData : edges) |
402 | |
{ |
403 | 30 | this.edgeDataOutputStream.writeInt(edgeData); |
404 | |
} |
405 | |
|
406 | 2 | this.edges.clear(); |
407 | 2 | usingEdgeDiskCache = true; |
408 | 2 | } |
409 | |
|
410 | |
private Iterator<Integer> getEdgeData() throws IOException |
411 | |
{ |
412 | 755 | if (!usingEdgeDiskCache) |
413 | |
{ |
414 | 453 | return this.edges.iterator(); |
415 | |
} |
416 | |
else |
417 | |
{ |
418 | 302 | FileInputStream fileInputStream = new FileInputStream(this.edgesFile); |
419 | 302 | BufferedInputStream inputStream = new BufferedInputStream(fileInputStream); |
420 | 302 | final DataInputStream dataInputStream = new DataInputStream(inputStream); |
421 | |
|
422 | 302 | return new AbstractIterator<Integer>() { |
423 | |
|
424 | |
@Override |
425 | |
protected Integer computeNext() |
426 | |
{ |
427 | |
try |
428 | |
{ |
429 | 30208456 | return dataInputStream.readInt(); |
430 | |
} |
431 | 302 | catch (IOException e) |
432 | |
{ |
433 | 302 | return endOfData(); |
434 | |
} |
435 | |
} |
436 | |
|
437 | |
}; |
438 | |
} |
439 | |
} |
440 | |
} |
441 | |
|