Convenience function for prop data added

Jacobluke- · Jacobluke- · commit 336522c366d9 · 2023-07-14T16:49:16.000+08:00
Add convenience function for unpaired proportional plot.
diff --git a/dabest/__init__.py b/dabest/__init__.py
@@ -1,4 +1,4 @@
-from ._api import load
+from ._api import load, prop_dataset
 from ._stats_tools import effsize as effsize
 from ._classes import TwoGroupsEffectSize, PermutationTest
 
diff --git a/dabest/_api.py b/dabest/_api.py
@@ -1,7 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/API/load.ipynb.
 
 # %% auto 0
-__all__ = ['load']
+__all__ = ['load', 'prop_dataset']
 
 # %% ../nbs/API/load.ipynb 4
 def load(data, idx=None, x=None, y=None, paired=None, id_col=None,
@@ -77,3 +77,54 @@ def load(data, idx=None, x=None, y=None, paired=None, id_col=None,
     return Dabest(data, idx, x, y, paired, id_col, ci, resamples, random_seed, proportional, delta2, experiment, experiment_label, x1_level, mini_meta)
 
 
+
+# %% ../nbs/API/load.ipynb 5
+import numpy as np
+from typing import Union, Optional
+
+def prop_dataset(group:Union[list, tuple, np.ndarray, dict], #Accepts lists, tuples, or numpy ndarrays of numeric types.
+                 group_names: Optional[list] = None):
+    '''
+    Convenience function to generate a dataframe of binary (0/1) data from counts.
+
+    `group` gives `[number of 0s, number of 1s]` per group, either as a dict
+    `{name: [n0, n1]}` or as a flat sequence `[n0, n1, n0, n1, ...]` paired in
+    order with `group_names` (required then; for a dict it is optional but, if
+    given, must contain exactly the dict keys).
+
+    Returns a `pandas.DataFrame` with one column per group (0s first, then 1s)
+    and an `ID` column numbered from 1. Raises `ValueError` if the input is empty
+    or malformed, a count is not a non-negative integer, or group totals differ.
+    '''
+    import pandas as pd
+
+    if isinstance(group, dict):
+        if group_names is None:
+            group_names = list(group.keys())
+        elif set(group_names) != set(group.keys()):
+            raise ValueError('group_names must be the same as the keys of the dict.')
+        # Every value must be a sequence holding exactly the two counts.
+        if not all(isinstance(group[name], (list, tuple, np.ndarray)) for name in group_names):
+            raise ValueError('group must be a dict of lists, tuples, or numpy ndarrays of numeric types.')
+        if not all(len(group[name]) == 2 for name in group_names):
+            raise ValueError('Each parent key should have only two elements.')
+        group_val = group
+    else:
+        if group_names is None:
+            raise ValueError('group_names must be provided if group is not a dict.')
+        if len(group) != 2 * len(group_names):
+            raise ValueError('The length of group must be two times of the length of group_names.')
+        group_val = {name: [group[2*i], group[2*i+1]] for i, name in enumerate(group_names)}
+
+    # Validate before indexing group_names[0] (empty input) or expanding the counts.
+    if len(group_names) == 0:
+        raise ValueError('At least one group must be provided.')
+    if not all(isinstance(n, (int, np.integer)) and n >= 0 for pair in group_val.values() for n in pair):
+        raise ValueError('The sample sizes must be non-negative integers.')
+    total = sum(group_val[group_names[0]])
+    if not all(sum(pair) == total for pair in group_val.values()):
+        raise ValueError('The sum of values under each key must be the same.')
+    # Build every column first and create the frame once instead of repeated pd.concat.
+    final_df = pd.DataFrame({name: [0] * int(n0) + [1] * int(n1) for name, (n0, n1) in group_val.items()})
+    final_df['ID'] = pd.Series(range(1, total + 1))
+    return final_df
diff --git a/nbs/01-getting_started.ipynb b/nbs/01-getting_started.ipynb
@@ -35,7 +35,7 @@
     "* [scipy 1.9.3](https://www.scipy.org)\n",
     "* [matplotlib 3.5.1](https://www.matplotlib.org)\n",
     "* [pandas 1.5.0](https://pandas.pydata.org)\n",
-    "* [seaborn 0.11.2](https://seaborn.pydata.org)\n",
+    "* [seaborn 0.12.2](https://seaborn.pydata.org)\n",
     "* [lqrt 0.3](https://github.com/alyakin314/lqrt)\n",
     "\n",
     "To obtain these package dependencies easily, it is highly recommended to download the [Anaconda](https://www.continuum.io/downloads) distribution of Python.\n"
diff --git a/nbs/API/load.ipynb b/nbs/API/load.ipynb
@@ -123,6 +123,64 @@
     "\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "import numpy as np\n",
+    "from typing import Union, Optional\n",
+    "\n",
+    "def prop_dataset(group:Union[list, tuple, np.ndarray, dict], #Accepts lists, tuples, or numpy ndarrays of numeric types.\n",
+    "                 group_names: Optional[list] = None):\n",
+    "    '''\n",
+    "    Convenience function to generate a dataframe of binary (0/1) data from counts.\n",
+    "\n",
+    "    `group` gives `[number of 0s, number of 1s]` per group, either as a dict\n",
+    "    `{name: [n0, n1]}` or as a flat sequence `[n0, n1, n0, n1, ...]` paired in\n",
+    "    order with `group_names` (required then; for a dict it is optional but, if\n",
+    "    given, must contain exactly the dict keys).\n",
+    "\n",
+    "    Returns a `pandas.DataFrame` with one column per group (0s first, then 1s)\n",
+    "    and an `ID` column numbered from 1. Raises `ValueError` if the input is empty\n",
+    "    or malformed, a count is not a non-negative integer, or group totals differ.\n",
+    "    '''\n",
+    "    import pandas as pd\n",
+    "\n",
+    "    if isinstance(group, dict):\n",
+    "        if group_names is None:\n",
+    "            group_names = list(group.keys())\n",
+    "        elif set(group_names) != set(group.keys()):\n",
+    "            raise ValueError('group_names must be the same as the keys of the dict.')\n",
+    "        # Every value must be a sequence holding exactly the two counts.\n",
+    "        if not all(isinstance(group[name], (list, tuple, np.ndarray)) for name in group_names):\n",
+    "            raise ValueError('group must be a dict of lists, tuples, or numpy ndarrays of numeric types.')\n",
+    "        if not all(len(group[name]) == 2 for name in group_names):\n",
+    "            raise ValueError('Each parent key should have only two elements.')\n",
+    "        group_val = group\n",
+    "    else:\n",
+    "        if group_names is None:\n",
+    "            raise ValueError('group_names must be provided if group is not a dict.')\n",
+    "        if len(group) != 2 * len(group_names):\n",
+    "            raise ValueError('The length of group must be two times of the length of group_names.')\n",
+    "        group_val = {name: [group[2*i], group[2*i+1]] for i, name in enumerate(group_names)}\n",
+    "\n",
+    "    # Validate before indexing group_names[0] (empty input) or expanding the counts.\n",
+    "    if len(group_names) == 0:\n",
+    "        raise ValueError('At least one group must be provided.')\n",
+    "    if not all(isinstance(n, (int, np.integer)) and n >= 0 for pair in group_val.values() for n in pair):\n",
+    "        raise ValueError('The sample sizes must be non-negative integers.')\n",
+    "    total = sum(group_val[group_names[0]])\n",
+    "    if not all(sum(pair) == total for pair in group_val.values()):\n",
+    "        raise ValueError('The sum of values under each key must be the same.')\n",
+    "    # Build every column first and create the frame once instead of repeated pd.concat.\n",
+    "    final_df = pd.DataFrame({name: [0] * int(n0) + [1] * int(n1) for name, (n0, n1) in group_val.items()})\n",
+    "    final_df['ID'] = pd.Series(range(1, total + 1))\n",
+    "    return final_df"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/nbs/tutorials/03-proportion_plot.ipynb b/nbs/tutorials/03-proportion_plot.ipynb
@@ -262,6 +262,120 @@
     "df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "7070baac",
+   "metadata": {},
+   "source": [
+    "### Convenience Function to Create a Dataset for Unpaired Proportional Plots"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa0a822c",
+   "metadata": {},
+   "source": [
+    "In DABEST v2023.3.29, we incorporated feedback from biologists who may not have tables of 0's and 1's on hand, and added a convenience function that generates the binary dataset from sample sizes. Users can create a pandas.DataFrame by passing the number of 0's and 1's in each group, plus the group names (optional if the sample sizes are provided in a dict)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4da428be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>a</th>\n",
+       "      <th>b</th>\n",
+       "      <th>ID</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   a  b  ID\n",
+       "0  0  0   1\n",
+       "1  0  0   2\n",
+       "2  0  1   3\n",
+       "3  1  1   4\n",
+       "4  1  1   5"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_size_1 = {'a':[3, 4], 'b':[2, 5]}  # {group: [number of 0s, number of 1s]}\n",
+    "sample_size_2 = [3, 4, 2, 5]  # the same counts as a flat list, paired in order with `names`\n",
+    "names = ['a', 'b']\n",
+    "sample_df_1 = dabest.prop_dataset(sample_size_1)\n",
+    "sample_df_2 = dabest.prop_dataset(sample_size_2, names)\n",
+    "print(sample_df_1.equals(sample_df_2))  # all(df1 == df2) only iterates column labels, so it is always True\n",
+    "sample_df_1.head()"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from ._api import load`
	`1`	`+from ._api import load, prop_dataset`
`2`	`2`	`from ._stats_tools import effsize as effsize`
`3`	`3`	`from ._classes import TwoGroupsEffectSize, PermutationTest`
`4`	`4`