Skip to content

Commit 72f2f0f

Browse files
committed
Add numpy docstring to IC and Parallel functions in helpers.py
1 parent a14a751 commit 72f2f0f

1 file changed

Lines changed: 199 additions & 53 deletions

File tree

helpers.py

Lines changed: 199 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,201 @@
1414
import traceback
1515
from subprocess import Popen, PIPE
1616

17+
# --------------------------------------------------------------------------------------
18+
# Parallel processing-related functions
19+
# --------------------------------------------------------------------------------------
20+
21+
22+
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    # Taken from https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/41815007
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""

    def patched_print_progress(self):
        # Advance the bar by however many tasks finished since the last report.
        newly_completed = self.n_completed_tasks - tqdm_object.n
        if newly_completed > 0:
            tqdm_object.update(n=newly_completed)

    saved_print_progress = joblib.parallel.Parallel.print_progress
    joblib.parallel.Parallel.print_progress = patched_print_progress

    try:
        yield tqdm_object
    finally:
        # Always restore the original method and close the bar, even on error.
        joblib.parallel.Parallel.print_progress = saved_print_progress
        tqdm_object.close()
40+
41+
42+
def run_algorithm(setup_dict):
    """Run an algorithm and return its result together with its metadata.

    This is supposed to be used in conjunction with joblib.Parallel to run
    different IM algorithms at the same time.

    Parameters
    ----------
    setup_dict : dict
        A dictionary containing the following keys:

        * "function" : the function to run
        * "algo_name" : name of the algorithm represented by the function
        * "args" : positional arguments for that function
        * "kwargs" : keyword arguments for that function

    Returns
    -------
    result_dict : dict
        A dictionary containing the following keys (an empty dict is
        returned instead if the function raised an exception):

        * "result" : value returned by the function
        * "algo_name" : name of the algorithm represented by the function
        * "kwargs" : the keyword arguments the function was called with

    Examples
    --------
    >>> setup_array = []
    ... setup_array.append(
    ...     {
    ...         "algo_name": "timlinucb",
    ...         "function": timlinucb_parallel_t,
    ...         "args": [DATASET, DATASET_FEATS, DATASET_TIMES, DATASET_NODES],
    ...         "kwargs": {
    ...             "num_seeds": NUM_SEEDS_TO_FIND,
    ...             "num_repeats_oim": OPTIMAL_NUM_REPEATS_OIM_TLU,
    ...             "num_repeats_oim_reward": OPTIMAL_NUM_REPEATS_REW_TLU,
    ...             "sigma": OPTIMAL_SIGMA_TLU,
    ...             "c": OPTIMAL_C_TLU,
    ...             "epsilon": OPTIMAL_EPS_TLU,
    ...         },
    ...     }
    ... )
    ... results_array = joblib.Parallel(n_jobs=len(setup_array))(
    ...     joblib.delayed(run_algorithm)(setup_dict) for setup_dict in setup_array
    ... )
    """
    try:
        result_dict = {
            "result": setup_dict.get("function")(
                *setup_dict.get("args"), **setup_dict.get("kwargs")
            ),
            "algo_name": setup_dict.get("algo_name"),
            "kwargs": setup_dict.get("kwargs"),
        }
    except Exception as e:
        # Best-effort: report the failure but don't kill sibling parallel jobs.
        print(e)
        print(setup_dict)
        traceback.print_exc()
        return {}
    return result_dict
104+
105+
106+
def _run_timlinucb_parallel(setup_dict):
107+
""" Run IMLinUCB in parallel
108+
109+
This is a helper function used by timlinucb_parallel_oim from timlinucb.py to
110+
run multiple IMLinUCB instances at the same time.
111+
112+
Parameters
113+
----------
114+
setup_dict : dict
115+
A dictionary containing the following keys:
116+
* "function" : a function that we have to run
117+
* "time" : time t of the current OIM execution
118+
* "args" : arguments for that functions
119+
* "kwargs" : keyword arguments for that functions
120+
121+
Returns
122+
-------
123+
result_dict : dict
124+
A dictionary containing the following:
125+
* "results" : results generated by IMLinUCB
126+
* "time" : time t of the current OIM execution
127+
128+
129+
"""
130+
result = setup_dict["function"](*setup_dict["args"], **setup_dict["kwargs"])
131+
result["time"] = setup_dict["time"]
132+
return result
133+
134+
135+
# --------------------------------------------------------------------------------------
136+
# IC-related functions
137+
# --------------------------------------------------------------------------------------
138+
17139

18140
def get_avg_reward(df, seeds, num_repeats):
    """Simulate IC propagation and return the mean number of influenced nodes.

    Parameters
    ----------
    df : pandas.DataFrame
        The graph we run the IC on, in the form of a DataFrame. A row represents
        one edge in the graph, with columns being named "source", "target",
        "probab". "probab" column contains the activation probability.
    seeds : list, pandas.Series
        A list of the nodes to start propagating from.
    num_repeats : int
        Specifies how many times we want to simulate the propagation with IC.

    Returns
    -------
    avg_reward : float
        Number showing how many nodes were influenced on average
    """
    rewards = [run_ic_nodes(df, seeds).shape[0] for _ in range(num_repeats)]
    return np.average(rewards)
23163

24164

25165
def get_stats_reward(df, seeds, num_repeats):
    """Simulate IC propagation; return mean and std of influenced-node counts.

    Parameters
    ----------
    df : pandas.DataFrame
        The graph we run the IC on, in the form of a DataFrame. A row represents
        one edge in the graph, with columns being named "source", "target",
        "probab". "probab" column contains the activation probability.
    seeds : list, pandas.Series
        A list of the nodes to start propagating from.
    num_repeats : int
        Specifies how many times we want to simulate the propagation with IC.

    Returns
    -------
    avg_reward : float
        Number showing how many nodes were influenced on average
    std_reward : float
        Standard deviation of avg_reward
    """
    rewards = [run_ic_nodes(df, seeds).shape[0] for _ in range(num_repeats)]
    return np.average(rewards), np.std(rewards)
30190

31191

32192
def run_ic_eff(df_graph, seed_nodes):
33-
""" Runs independent cascade model.
34-
Input: df_g -- a dataframe representing the graph (with the probabilities)
35-
S -- initial set of vertices
36-
tracking -- whether we want to check for active/observed nodes
37-
Output: T -- resulted influenced set of vertices (including S)
193+
""" Simulate the influence propagation using the IC model
194+
195+
Parameters
196+
----------
197+
df_graph : pandas.DataFrame
198+
The graph we run the IC on, in the form of a DataFrame. A row represents one
199+
edge in the graph, with columns being named "source", "target", "probab".
200+
"probab" column contains the activation probability.
201+
seed_nodes : list, pandas.Series
202+
A list of the nodes to start propagating from.
203+
204+
Returns
205+
-------
206+
results : tuple
207+
A tuple of the following numpy arrays
208+
- Affected nodes
209+
- Activated edges
210+
- Observed edges
211+
38212
"""
39213
affected_nodes = deepcopy(seed_nodes) # copy already selected nodes
40214
activated_edges = []
@@ -55,11 +229,21 @@ def run_ic_eff(df_graph, seed_nodes):
55229

56230

57231
def run_ic_nodes(df_graph, seed_nodes):
58-
""" Runs independent cascade model.
59-
Input: df_g -- a dataframe representing the graph (with the probabilities)
60-
S -- initial set of vertices
61-
tracking -- whether we want to check for active/observed nodes
62-
Output: T -- resulted influenced set of vertices (including S)
232+
""" Simulate the influence propagation using the IC model
233+
234+
Parameters
235+
----------
236+
df_graph : pandas.DataFrame
237+
The graph we run the IC on, in the form of a DataFrame. A row represents one
238+
edge in the graph, with columns being named "source", "target", "probab".
239+
"probab" column contains the activation probability.
240+
seed_nodes : list, pandas.Series
241+
A list of the nodes to start propagating from.
242+
243+
Returns
244+
-------
245+
affected_nodes : numpy.array
246+
Nodes influenced by propagating the seed nodes.
63247
"""
64248
affected_nodes = deepcopy(seed_nodes) # copy already selected nodes
65249
df_graph["activated"] = df_graph["probab"].apply(lambda x: random.random() <= x)
@@ -75,6 +259,11 @@ def run_ic_nodes(df_graph, seed_nodes):
75259
return np.array(affected_nodes)
76260

77261

262+
# --------------------------------------------------------------------------------------
263+
# TIM-related functions
264+
# --------------------------------------------------------------------------------------
265+
266+
78267
def tim(
79268
df,
80269
num_nodes,
@@ -233,46 +422,3 @@ def tim_t_parallel(
233422
}
234423
)
235424
return pd.DataFrame(results)
236-
237-
238-
@contextlib.contextmanager
239-
def tqdm_joblib(tqdm_object):
240-
# Taken from https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution/41815007
241-
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
242-
243-
def tqdm_print_progress(self):
244-
if self.n_completed_tasks > tqdm_object.n:
245-
n_completed = self.n_completed_tasks - tqdm_object.n
246-
tqdm_object.update(n=n_completed)
247-
248-
original_print_progress = joblib.parallel.Parallel.print_progress
249-
joblib.parallel.Parallel.print_progress = tqdm_print_progress
250-
251-
try:
252-
yield tqdm_object
253-
finally:
254-
joblib.parallel.Parallel.print_progress = original_print_progress
255-
tqdm_object.close()
256-
257-
258-
def run_algorithm(setup_dict):
259-
try:
260-
result_dict = {
261-
"result": setup_dict.get("function")(
262-
*setup_dict.get("args"), **setup_dict.get("kwargs")
263-
),
264-
"algo_name": setup_dict.get("algo_name"),
265-
"kwargs": setup_dict.get("kwargs"),
266-
}
267-
except Exception as e:
268-
print(e)
269-
print(setup_dict)
270-
traceback.print_exc()
271-
return {}
272-
return result_dict
273-
274-
275-
def _run_timlinucb_parallel(setup_dict):
276-
result = setup_dict["function"](*setup_dict["args"], **setup_dict["kwargs"])
277-
result["time"] = setup_dict["time"]
278-
return result

0 commit comments

Comments
 (0)