Skip to content

Commit 2e0de3a

Browse files
committed
Add numpy docstrings to the rest of the functions in helpers.py
1 parent 72f2f0f commit 2e0de3a

1 file changed

Lines changed: 143 additions & 12 deletions

File tree

helpers.py

Lines changed: 143 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,13 @@
2222
@contextlib.contextmanager
2323
def tqdm_joblib(tqdm_object):
2424
# Taken from https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/41815007
25-
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
25+
"""Context manager to patch joblib to report into tqdm progress bar given as argument
26+
27+
Parameters
28+
----------
29+
tqdm_object : Object
30+
The tqdm progress bar that joblib reports its progress into
31+
"""
2632

2733
def tqdm_print_progress(self):
2834
if self.n_completed_tasks > tqdm_object.n:
@@ -273,11 +279,36 @@ def tim(
273279
temp_dir="temp_dir",
274280
out_pattern=re.compile("Selected k SeedSet: (.+?) \\n"),
275281
):
276-
""" Runs TIM (the oracle function).
277-
Input: df -- the graph to process
278-
num_inf -- the k the we are looking for
279-
epsilon -- hyperparameter
280-
Output: T -- The k highest influencers
282+
""" Run the Offline IM algorithm, TIM
283+
284+
Parameters
285+
----------
286+
df : pandas.DataFrame
287+
The graph we run the TIM on, in the form of a DataFrame. A row represents one
288+
edge in the graph, with columns being named "source", "target", "probab".
289+
"probab" column contains the activation probability.
290+
num_nodes : int
291+
Number of nodes to pass into TIM.
292+
num_edges : int
293+
Number of edges to pass into TIM.
294+
num_inf : int
295+
Number of seed nodes to find.
296+
epsilon : float
297+
A hyperparameter for TIM. Refer to the paper for more details. [1]
298+
temp_dir : str, optional
299+
A temporary directory to run TIM in. Default: "temp_dir"
300+
out_pattern : re.Pattern, optional
301+
Regex pattern that gets the TIM results out of its output.
302+
Default: re.compile("Selected k SeedSet: (.+?) \\n")
303+
304+
Returns
305+
-------
306+
seeds : list
307+
A set of seed nodes that maximizes influence found by TIM
308+
309+
.. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi.
310+
"Influence maximization: Near-optimal time complexity meets practical efficiency."
311+
Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014.
281312
"""
282313
if not os.path.exists(temp_dir):
283314
os.makedirs(temp_dir)
@@ -317,15 +348,44 @@ def tim_parallel(
317348
num_edges,
318349
num_inf,
319350
epsilon,
320-
tim_file="./tim",
351+
tim_file="tim",
321352
temp_dir="temp_dir",
322353
out_pattern=re.compile("Selected k SeedSet: (.+?) \\n"),
323354
):
324-
""" Runs TIM (the oracle function).
325-
Input: df -- the graph to process
326-
num_inf -- the k the we are looking for
327-
epsilon -- hyperparameter
328-
Output: T -- The k highest influencers
355+
""" Run the Offline IM algorithm, TIM, in parallel
356+
357+
Parameters
358+
----------
359+
df : pandas.DataFrame
360+
The graph we run the TIM on, in the form of a DataFrame. A row represents one
361+
edge in the graph, with columns being named "source", "target", "probab".
362+
"probab" column contains the activation probability.
363+
num_nodes : int
364+
Number of nodes to pass into TIM.
365+
num_edges : int
366+
Number of edges to pass into TIM.
367+
num_inf : int
368+
Number of seed nodes to find.
369+
epsilon : float
370+
A hyperparameter for TIM. Refer to the paper for more details. [1]
371+
temp_dir : str, optional
372+
A temporary directory to run TIM in. Default: "temp_dir"
373+
tim_file : str, optional
374+
A path to the TIM executable that we are going to use. This parameter
375+
is added due to the parallel processing requiring creating more TIM files
376+
to not hog it. Default: "tim"
377+
out_pattern : re.Pattern, optional
378+
Regex pattern that gets the TIM results out of its output.
379+
Default: re.compile("Selected k SeedSet: (.+?) \\n")
380+
381+
Returns
382+
-------
383+
seeds : list
384+
A set of seed nodes that maximizes influence found by TIM
385+
386+
.. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi.
387+
"Influence maximization: Near-optimal time complexity meets practical efficiency."
388+
Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014.
329389
"""
330390
if not os.path.exists(temp_dir):
331391
os.makedirs(temp_dir)
@@ -360,6 +420,40 @@ def tim_parallel(
360420

361421

362422
def tim_t(df_edges, nodes, times, num_seeds=5, num_repeats_reward=20, epsilon=0.4):
423+
""" Run the Offline IM algorithm, TIM, on every time step in a network
424+
425+
Parameters
426+
----------
427+
df_edges : pandas.DataFrame
428+
The graph we run the TIM on, in the form of a DataFrame. A row represents one
429+
edge in the graph, with columns being named "source", "target", "probab" and
430+
"day". "probab" column contains the activation probability and "day" should
431+
correspond to the days specified in times.
432+
nodes : pandas.Series, list
433+
A sorted list of all unique node ids in the graph.
434+
times : pd.Series, list
435+
A list representing the times that we want to run the algorithm on. Is useful
436+
if we don't want to run TIM on every single time step in the graph.
437+
num_seeds : int, optional
438+
Number of seed nodes to find. Default: 5
439+
num_repeats_reward : int, optional
440+
Number of times we will try propagating the obtained seed nodes using the IC
441+
model to get the reward. The reward is then averaged over the runs. Default: 20
442+
epsilon : float, optional
443+
A hyperparameter for TIM. Refer to the paper for more details. [1] Default: 0.4
444+
445+
Returns
446+
-------
447+
results : pd.DataFrame
448+
A dataframe with the following columns
449+
* time, representing the time step of the result
450+
* reward, an average reward obtained over num_repeats_reward runs
451+
* selected, a list of selected seed nodes
452+
453+
.. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi.
454+
"Influence maximization: Near-optimal time complexity meets practical efficiency."
455+
Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014.
456+
"""
363457
# TIM wants the max node ID + 1 (nodes is sorted, so the last entry is the max)
364458
num_nodes = nodes[-1] + 1
365459
results = []
@@ -392,6 +486,43 @@ def tim_t_parallel(
392486
epsilon=0.4,
393487
process_id=1,
394488
):
489+
""" Run the Offline IM algorithm, TIM, on every time step in a network in parallel
490+
491+
Parameters
492+
----------
493+
df_edges : pandas.DataFrame
494+
The graph we run the TIM on, in the form of a DataFrame. A row represents one
495+
edge in the graph, with columns being named "source", "target", "probab" and
496+
"day". "probab" column contains the activation probability and "day" should
497+
correspond to the days specified in times.
498+
nodes : pandas.Series, list
499+
A sorted list of all unique node ids in the graph.
500+
times : pd.Series, list
501+
A list representing the times that we want to run the algorithm on. Is useful
502+
if we don't want to run TIM on every single time step in the graph.
503+
num_seeds : int, optional
504+
Number of seed nodes to find. Default: 5
505+
num_repeats_reward : int, optional
506+
Number of times we will try propagating the obtained seed nodes using the IC
507+
model to get the reward. The reward is then averaged over the runs. Default: 20
508+
epsilon : float, optional
509+
A hyperparameter for TIM. Refer to the paper for more details. [1] Default: 0.4
510+
process_id : int or str, optional
511+
An identifier used in distinguishing the temporary TIM executable from others.
512+
Default: 1
513+
514+
Returns
515+
-------
516+
results : pd.DataFrame
517+
A dataframe with the following columns
518+
* time, representing the time step of the result
519+
* reward, an average reward obtained over num_repeats_reward runs
520+
* selected, a list of selected seed nodes
521+
522+
.. [1] Tang, Youze, Xiaokui Xiao, and Yanchen Shi.
523+
"Influence maximization: Near-optimal time complexity meets practical efficiency."
524+
Proceedings of the 2014 ACM SIGMOD international conference on Management of data. 2014.
525+
"""
395526
tim_name = "tim_t_" + str(process_id)
396527
temp_dir_name = tim_name + "_dir"
397528
shutil.copyfile("tim", tim_name)

0 commit comments

Comments
 (0)