1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package datafu.pig.linkanalysis; |
18 | |
|
19 | |
import it.unimi.dsi.fastutil.ints.Int2IntMap; |
20 | |
|
21 | |
import java.io.IOException; |
22 | |
import java.util.ArrayList; |
23 | |
import java.util.HashMap; |
24 | |
import java.util.List; |
25 | |
import java.util.Map; |
26 | |
|
27 | |
import org.apache.pig.Accumulator; |
28 | |
import org.apache.pig.EvalFunc; |
29 | |
import org.apache.pig.data.BagFactory; |
30 | |
import org.apache.pig.data.DataBag; |
31 | |
import org.apache.pig.data.DataType; |
32 | |
import org.apache.pig.data.Tuple; |
33 | |
import org.apache.pig.data.TupleFactory; |
34 | |
import org.apache.pig.impl.logicalLayer.FrontendException; |
35 | |
import org.apache.pig.impl.logicalLayer.schema.Schema; |
36 | |
|
37 | |
import datafu.linkanalysis.PageRank.ProgressIndicator; |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | 4229 | public class PageRank extends EvalFunc<DataBag> implements Accumulator<DataBag> |
72 | |
{ |
73 | 791 | private final datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank(); |
74 | |
|
75 | 791 | private int maxNodesAndEdges = 100000000; |
76 | 791 | private int maxEdgesInMemory = 30000000; |
77 | 791 | private double tolerance = 1e-16; |
78 | 791 | private int maxIters = 150; |
79 | 791 | private boolean useEdgeDiskStorage = false; |
80 | 791 | private boolean enableDanglingNodeHandling = false; |
81 | 791 | private boolean aborted = false; |
82 | |
|
83 | 791 | TupleFactory tupleFactory = TupleFactory.getInstance(); |
84 | 791 | BagFactory bagFactory = BagFactory.getInstance(); |
85 | |
|
86 | |
public PageRank() |
87 | 0 | { |
88 | 0 | initialize(); |
89 | 0 | } |
90 | |
|
91 | |
public PageRank(String... parameters) |
92 | 791 | { |
93 | 791 | if (parameters.length % 2 != 0) |
94 | |
{ |
95 | 0 | throw new RuntimeException("Invalid parameters list"); |
96 | |
} |
97 | |
|
98 | 1582 | for (int i=0; i<parameters.length; i+=2) |
99 | |
{ |
100 | 791 | String parameterName = parameters[i]; |
101 | 791 | String value = parameters[i+1]; |
102 | 791 | if (parameterName.equals("max_nodes_and_edges")) |
103 | |
{ |
104 | 0 | maxNodesAndEdges = Integer.parseInt(value); |
105 | |
} |
106 | 791 | else if (parameterName.equals("max_edges_in_memory")) |
107 | |
{ |
108 | 0 | maxEdgesInMemory = Integer.parseInt(value); |
109 | |
} |
110 | 791 | else if (parameterName.equals("tolerance")) |
111 | |
{ |
112 | 0 | tolerance = Double.parseDouble(value); |
113 | |
} |
114 | 791 | else if (parameterName.equals("max_iters")) |
115 | |
{ |
116 | 0 | maxIters = Integer.parseInt(value); |
117 | |
} |
118 | 791 | else if (parameterName.equals("spill_to_edge_disk_storage")) |
119 | |
{ |
120 | 0 | useEdgeDiskStorage = Boolean.parseBoolean(value); |
121 | |
} |
122 | 791 | else if (parameterName.equals("dangling_nodes")) |
123 | |
{ |
124 | 791 | enableDanglingNodeHandling = Boolean.parseBoolean(value); |
125 | |
} |
126 | |
} |
127 | |
|
128 | 791 | initialize(); |
129 | 791 | } |
130 | |
|
131 | |
private void initialize() |
132 | |
{ |
133 | 791 | long heapSize = Runtime.getRuntime().totalMemory(); |
134 | 791 | long heapMaxSize = Runtime.getRuntime().maxMemory(); |
135 | 791 | long heapFreeSize = Runtime.getRuntime().freeMemory(); |
136 | |
|
137 | |
|
138 | 791 | if (useEdgeDiskStorage) |
139 | |
{ |
140 | 0 | this.graph.enableEdgeDiskCaching(); |
141 | |
} |
142 | |
else |
143 | |
{ |
144 | 791 | this.graph.disableEdgeDiskCaching(); |
145 | |
} |
146 | |
|
147 | 791 | if (enableDanglingNodeHandling) |
148 | |
{ |
149 | 791 | this.graph.enableDanglingNodeHandling(); |
150 | |
} |
151 | |
else |
152 | |
{ |
153 | 0 | this.graph.disableDanglingNodeHandling(); |
154 | |
} |
155 | |
|
156 | 791 | this.graph.setEdgeCachingThreshold(maxEdgesInMemory); |
157 | 791 | } |
158 | |
|
159 | |
@Override |
160 | |
public void accumulate(Tuple t) throws IOException |
161 | |
{ |
162 | 1 | if (aborted) |
163 | |
{ |
164 | 0 | return; |
165 | |
} |
166 | |
|
167 | 1 | DataBag bag = (DataBag) t.get(0); |
168 | 1 | if (bag == null || bag.size() == 0) |
169 | 0 | return; |
170 | |
|
171 | 1 | for (Tuple sourceTuple : bag) |
172 | |
{ |
173 | 10 | Integer sourceId = (Integer)sourceTuple.get(0); |
174 | 10 | DataBag edges = (DataBag)sourceTuple.get(1); |
175 | |
|
176 | 10 | ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>(); |
177 | |
|
178 | 10 | for (Tuple edgeTuple : edges) |
179 | |
{ |
180 | 17 | Integer destId = (Integer)edgeTuple.get(0); |
181 | 17 | Double weight = (Double)edgeTuple.get(1); |
182 | 17 | HashMap<String,Object> edgeMap = new HashMap<String, Object>(); |
183 | 17 | edgeMap.put("dest",destId); |
184 | 17 | edgeMap.put("weight",weight); |
185 | 17 | edgesMapList.add(edgeMap); |
186 | 17 | } |
187 | |
|
188 | 10 | graph.addEdges(sourceId, edgesMapList); |
189 | |
|
190 | 10 | if (graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges) |
191 | |
{ |
192 | 0 | System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges)); |
193 | 0 | aborted = true; |
194 | |
} |
195 | |
|
196 | 10 | reporter.progress(); |
197 | 10 | } |
198 | 1 | } |
199 | |
|
200 | |
@Override |
201 | |
public DataBag getValue() |
202 | |
{ |
203 | 1 | if (aborted) |
204 | |
{ |
205 | 0 | return null; |
206 | |
} |
207 | |
|
208 | 1 | System.out.println(String.format("Nodes: %d, Edges: %d", graph.nodeCount(), graph.edgeCount())); |
209 | |
|
210 | 1 | ProgressIndicator progressIndicator = getProgressIndicator(); |
211 | 1 | System.out.println("Finished loading graph."); |
212 | 1 | long startTime = System.nanoTime(); |
213 | 1 | System.out.println("Initializing."); |
214 | |
try |
215 | |
{ |
216 | 1 | graph.init(progressIndicator); |
217 | |
} |
218 | 0 | catch (IOException e) |
219 | |
{ |
220 | 0 | e.printStackTrace(); |
221 | 0 | return null; |
222 | 1 | } |
223 | 1 | System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6)); |
224 | |
|
225 | |
float totalDiff; |
226 | 1 | int iter = 0; |
227 | |
|
228 | 1 | System.out.println("Beginning iterations"); |
229 | 1 | startTime = System.nanoTime(); |
230 | |
do |
231 | |
{ |
232 | |
|
233 | |
try |
234 | |
{ |
235 | 150 | totalDiff = graph.nextIteration(progressIndicator); |
236 | |
} |
237 | 0 | catch (IOException e) |
238 | |
{ |
239 | 0 | e.printStackTrace(); |
240 | 0 | return null; |
241 | 150 | } |
242 | 150 | iter++; |
243 | 150 | } while(iter < maxIters && totalDiff > tolerance); |
244 | 1 | System.out.println(String.format("Done, %d iterations took %f ms", iter, (System.nanoTime() - startTime)/10.0e6)); |
245 | |
|
246 | 1 | DataBag output = bagFactory.newDefaultBag(); |
247 | |
|
248 | 1 | for (Int2IntMap.Entry node : graph.getNodeIds()) |
249 | |
{ |
250 | 11 | int nodeId = node.getIntKey(); |
251 | 11 | float rank = graph.getNodeRank(nodeId); |
252 | 11 | List nodeData = new ArrayList(2); |
253 | 11 | nodeData.add(nodeId); |
254 | 11 | nodeData.add(rank); |
255 | 11 | output.add(tupleFactory.newTuple(nodeData)); |
256 | 11 | } |
257 | |
|
258 | 1 | return output; |
259 | |
} |
260 | |
|
261 | |
@Override |
262 | |
public void cleanup() |
263 | |
{ |
264 | |
try |
265 | |
{ |
266 | 2 | aborted = false; |
267 | 2 | this.graph.clear(); |
268 | |
} |
269 | 0 | catch (IOException e) |
270 | |
{ |
271 | 0 | e.printStackTrace(); |
272 | 2 | } |
273 | 2 | } |
274 | |
|
275 | |
@Override |
276 | |
public DataBag exec(Tuple input) throws IOException |
277 | |
{ |
278 | |
try |
279 | |
{ |
280 | 0 | accumulate(input); |
281 | |
|
282 | 0 | return getValue(); |
283 | |
} |
284 | |
finally |
285 | |
{ |
286 | 0 | cleanup(); |
287 | |
} |
288 | |
} |
289 | |
|
290 | |
private ProgressIndicator getProgressIndicator() |
291 | |
{ |
292 | 1 | return new ProgressIndicator() |
293 | 1 | { |
294 | |
@Override |
295 | |
public void progress() |
296 | |
{ |
297 | 4228 | reporter.progress(); |
298 | 4228 | } |
299 | |
}; |
300 | |
} |
301 | |
|
302 | |
@Override |
303 | |
public Schema outputSchema(Schema input) |
304 | |
{ |
305 | |
try |
306 | |
{ |
307 | 404 | Schema.FieldSchema inputFieldSchema = input.getField(0); |
308 | |
|
309 | 404 | if (inputFieldSchema.type != DataType.BAG) |
310 | |
{ |
311 | 0 | throw new RuntimeException("Expected a BAG as input"); |
312 | |
} |
313 | |
|
314 | 404 | Schema inputBagSchema = inputFieldSchema.schema; |
315 | |
|
316 | 404 | if (inputBagSchema.getField(0).type != DataType.TUPLE) |
317 | |
{ |
318 | 0 | throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", |
319 | |
DataType.findTypeName(inputBagSchema.getField(0).type))); |
320 | |
} |
321 | |
|
322 | 404 | Schema inputTupleSchema = inputBagSchema.getField(0).schema; |
323 | |
|
324 | 404 | if (inputTupleSchema.getField(0).type != DataType.INTEGER) |
325 | |
{ |
326 | 0 | throw new RuntimeException(String.format("Expected source to be an INTEGER, but instead found %s", |
327 | |
DataType.findTypeName(inputTupleSchema.getField(0).type))); |
328 | |
} |
329 | |
|
330 | 404 | if (inputTupleSchema.getField(1).type != DataType.BAG) |
331 | |
{ |
332 | 0 | throw new RuntimeException(String.format("Expected edges to be represented with a BAG")); |
333 | |
} |
334 | |
|
335 | 404 | Schema.FieldSchema edgesFieldSchema = inputTupleSchema.getField(1); |
336 | |
|
337 | 404 | if (edgesFieldSchema.schema.getField(0).type != DataType.TUPLE) |
338 | |
{ |
339 | 0 | throw new RuntimeException(String.format("Expected edges field to contain a TUPLE, but instead found %s", |
340 | |
DataType.findTypeName(edgesFieldSchema.schema.getField(0).type))); |
341 | |
} |
342 | |
|
343 | 404 | Schema edgesTupleSchema = edgesFieldSchema.schema.getField(0).schema; |
344 | |
|
345 | 404 | if (edgesTupleSchema.getField(0).type != DataType.INTEGER) |
346 | |
{ |
347 | 0 | throw new RuntimeException(String.format("Expected destination edge ID to an INTEGER, but instead found %s", |
348 | |
DataType.findTypeName(edgesFieldSchema.schema.getField(0).type))); |
349 | |
} |
350 | |
|
351 | 404 | if (edgesTupleSchema.getField(1).type != DataType.DOUBLE) |
352 | |
{ |
353 | 0 | throw new RuntimeException(String.format("Expected destination edge weight to a DOUBLE, but instead found %s", |
354 | |
DataType.findTypeName(edgesFieldSchema.schema.getField(1).type))); |
355 | |
} |
356 | |
|
357 | 404 | Schema tupleSchema = new Schema(); |
358 | 404 | tupleSchema.add(new Schema.FieldSchema("node",DataType.INTEGER)); |
359 | 404 | tupleSchema.add(new Schema.FieldSchema("rank",DataType.FLOAT)); |
360 | |
|
361 | 404 | return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass() |
362 | |
.getName() |
363 | |
.toLowerCase(), input), |
364 | |
tupleSchema, |
365 | |
DataType.BAG)); |
366 | |
} |
367 | 0 | catch (FrontendException e) |
368 | |
{ |
369 | 0 | throw new RuntimeException(e); |
370 | |
} |
371 | |
} |
372 | |
} |