|
22 | 22 | @contextlib.contextmanager |
23 | 23 | def tqdm_joblib(tqdm_object): |
24 | 24 | # Taken from https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/41815007 |
25 | | - """Context manager to patch joblib to report into tqdm progress bar given as argument""" |
| 25 | + """Context manager to patch joblib to report into tqdm progress bar given as argument |
| 26 | +
|
| 27 | + Parameters |
| 28 | + ---------- |
| 29 | + tqdm_object : Object |
| 30 | + The tqdm progress bar that joblib reports its progress into |
| 31 | + """ |
26 | 32 |
|
27 | 33 | def tqdm_print_progress(self): |
28 | 34 | if self.n_completed_tasks > tqdm_object.n: |
@@ -273,11 +279,36 @@ def tim( |
273 | 279 | temp_dir="temp_dir", |
274 | 280 | out_pattern=re.compile("Selected k SeedSet: (.+?) \\n"), |
275 | 281 | ): |
276 | | - """ Runs TIM (the oracle function). |
277 | | - Input: df -- the graph to process |
278 | | - num_inf -- the k the we are looking for |
279 | | - epsilon -- hyperparameter |
280 | | - Output: T -- The k highest influencers |
| 282 | + """ Run the Offline IM algorithm, TIM |
| 283 | +
|
| 284 | + Parameters |
| 285 | + ---------- |
| 286 | + df : pandas.DataFrame |
| 287 | + The graph we run the TIM on, in the form of a DataFrame. A row represents one |
| 288 | + edge in the graph, with columns being named "source", "target", "probab". |
| 289 | + "probab" column contains the activation probability. |
| 290 | + num_nodes : int |
| 291 | + Number of nodes to pass into TIM. |
| 292 | + num_edges : int |
| 293 | + Number of edges to pass into TIM. |
| 294 | + num_inf : int |
| 295 | + Number of seed nodes to find. |
| 296 | + epsilon : float |
| 297 | + A hyperparameter for TIM. Refer to the paper for more details. [1] |
| 298 | + temp_dir : str, optional |
| 299 | + A temporary directory to run TIM in. Default: "temp_dir" |
| 300 | + out_pattern : re.Pattern, optional |
| 301 | + Regex pattern that gets the TIM results out of its output. |
| 302 | + Default: re.compile("Selected k SeedSet: (.+?) \\n"), |
| 303 | +
|
| 304 | + Returns |
| 305 | + ------- |
| 306 | + seeds : list |
| 307 | + The seed nodes found by TIM that maximize influence |
| 308 | +
|
| 309 | + .. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi. |
| 310 | + "Influence maximization: Near-optimal time complexity meets practical efficiency." |
| 311 | + Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014. |
281 | 312 | """ |
282 | 313 | if not os.path.exists(temp_dir): |
283 | 314 | os.makedirs(temp_dir) |
@@ -317,15 +348,44 @@ def tim_parallel( |
317 | 348 | num_edges, |
318 | 349 | num_inf, |
319 | 350 | epsilon, |
320 | | - tim_file="./tim", |
| 351 | + tim_file="tim", |
321 | 352 | temp_dir="temp_dir", |
322 | 353 | out_pattern=re.compile("Selected k SeedSet: (.+?) \\n"), |
323 | 354 | ): |
324 | | - """ Runs TIM (the oracle function). |
325 | | - Input: df -- the graph to process |
326 | | - num_inf -- the k the we are looking for |
327 | | - epsilon -- hyperparameter |
328 | | - Output: T -- The k highest influencers |
| 355 | + """ Run the Offline IM algorithm, TIM, in parallel |
| 356 | +
|
| 357 | + Parameters |
| 358 | + ---------- |
| 359 | + df : pandas.DataFrame |
| 360 | + The graph we run the TIM on, in the form of a DataFrame. A row represents one |
| 361 | + edge in the graph, with columns being named "source", "target", "probab". |
| 362 | + "probab" column contains the activation probability. |
| 363 | + num_nodes : int |
| 364 | + Number of nodes to pass into TIM. |
| 365 | + num_edges : int |
| 366 | + Number of edges to pass into TIM. |
| 367 | + num_inf : int |
| 368 | + Number of seed nodes to find. |
| 369 | + epsilon : float |
| 370 | + A hyperparameter for TIM. Refer to the paper for more details. [1] |
| 371 | + temp_dir : str, optional |
| 372 | + A temporary directory to run TIM in. Default: "temp_dir" |
| 373 | + tim_file : str, optional |
| 374 | + Path to the TIM executable to use. This parameter exists because parallel |
| 375 | + processing requires separate copies of the TIM executable, so that the |
| 376 | + processes do not contend for a single file. Default: "tim" |
| 377 | + out_pattern : re.Pattern, optional |
| 378 | + Regex pattern that gets the TIM results out of its output. |
| 379 | + Default: re.compile("Selected k SeedSet: (.+?) \\n"), |
| 380 | +
|
| 381 | + Returns |
| 382 | + ------- |
| 383 | + seeds : list |
| 384 | + The seed nodes found by TIM that maximize influence |
| 385 | +
|
| 386 | + .. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi. |
| 387 | + "Influence maximization: Near-optimal time complexity meets practical efficiency." |
| 388 | + Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014. |
329 | 389 | """ |
330 | 390 | if not os.path.exists(temp_dir): |
331 | 391 | os.makedirs(temp_dir) |
@@ -360,6 +420,40 @@ def tim_parallel( |
360 | 420 |
|
361 | 421 |
|
362 | 422 | def tim_t(df_edges, nodes, times, num_seeds=5, num_repeats_reward=20, epsilon=0.4): |
| 423 | + """ Run the Offline IM algorithm, TIM, on every time step in a network |
| 424 | +
|
| 425 | + Parameters |
| 426 | + ---------- |
| 427 | + df_edges : pandas.DataFrame |
| 428 | + The graph we run the TIM on, in the form of a DataFrame. A row represents one |
| 429 | + edge in the graph, with columns being named "source", "target", "probab" and |
| 430 | + "day". "probab" column contains the activation probability and "day" should |
| 431 | + correspond to the days specified in times. |
| 432 | + nodes : pandas.Series, list |
| 433 | + A sorted list of all unique node ids in the graph. |
| 434 | + times : pd.Series, list |
| 435 | + A list representing the times that we want to run the algorithm on. Useful |
| 436 | + if we don't want to run TIM on every single time step in the graph. |
| 437 | + num_seeds : int, optional |
| 438 | + Number of seed nodes to find. Default: 5 |
| 439 | + num_repeats_reward : int, optional |
| 440 | + Number of times we will try propagating the obtained seed nodes using the IC |
| 441 | + model to get the reward. The reward is then averaged over the runs. Default: 20 |
| 442 | + epsilon : float, optional |
| 443 | + A hyperparameter for TIM. Refer to the paper for more details. [1] Default: 0.4 |
| 444 | +
|
| 445 | + Returns |
| 446 | + ------- |
| 447 | + results : pd.DataFrame |
| 448 | + A dataframe with the following columns |
| 449 | + * time, representing the time step of the result |
| 450 | + * reward, an average reward obtained over num_repeats_reward runs |
| 451 | + * selected, a list of selected seed nodes |
| 452 | +
|
| 453 | + .. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi. |
| 454 | + "Influence maximization: Near-optimal time complexity meets practical efficiency." |
| 455 | + Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014. |
| 456 | + """ |
363 | 457 | # TIM wants the max node ID () |
364 | 458 | num_nodes = nodes[-1] + 1 |
365 | 459 | results = [] |
@@ -392,6 +486,43 @@ def tim_t_parallel( |
392 | 486 | epsilon=0.4, |
393 | 487 | process_id=1, |
394 | 488 | ): |
| 489 | + """ Run the Offline IM algorithm, TIM, on every time step in a network in parallel |
| 490 | +
|
| 491 | + Parameters |
| 492 | + ---------- |
| 493 | + df_edges : pandas.DataFrame |
| 494 | + The graph we run the TIM on, in the form of a DataFrame. A row represents one |
| 495 | + edge in the graph, with columns being named "source", "target", "probab" and |
| 496 | + "day". "probab" column contains the activation probability and "day" should |
| 497 | + correspond to the days specified in times. |
| 498 | + nodes : pandas.Series, list |
| 499 | + A sorted list of all unique node ids in the graph. |
| 500 | + times : pd.Series, list |
| 501 | + A list representing the times that we want to run the algorithm on. Useful |
| 502 | + if we don't want to run TIM on every single time step in the graph. |
| 503 | + num_seeds : int, optional |
| 504 | + Number of seed nodes to find. Default: 5 |
| 505 | + num_repeats_reward : int, optional |
| 506 | + Number of times we will try propagating the obtained seed nodes using the IC |
| 507 | + model to get the reward. The reward is then averaged over the runs. Default: 20 |
| 508 | + epsilon : float, optional |
| 509 | + A hyperparameter for TIM. Refer to the paper for more details. [1] Default: 0.4 |
| 510 | + process_id : int or str, optional |
| 511 | + An identifier used in distinguishing the temporary TIM executable from others. |
| 512 | + Default: 1 |
| 513 | +
|
| 514 | + Returns |
| 515 | + ------- |
| 516 | + results : pd.DataFrame |
| 517 | + A dataframe with the following columns |
| 518 | + * time, representing the time step of the result |
| 519 | + * reward, an average reward obtained over num_repeats_reward runs |
| 520 | + * selected, a list of selected seed nodes |
| 521 | +
|
| 522 | + .. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi. |
| 523 | + "Influence maximization: Near-optimal time complexity meets practical efficiency." |
| 524 | + Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014. |
| 525 | + """ |
395 | 526 | tim_name = "tim_t_" + str(process_id) |
396 | 527 | temp_dir_name = tim_name + "_dir" |
397 | 528 | shutil.copyfile("tim", tim_name) |
|
0 commit comments