Skip to content

Commit 336522c

Browse files
committed
Convenience function for prop data added
Add convenience function for unpaired proportional plot.
1 parent f3ee4e3 commit 336522c

5 files changed

Lines changed: 226 additions & 3 deletions

File tree

dabest/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from ._api import load
1+
from ._api import load, prop_dataset
22
from ._stats_tools import effsize as effsize
33
from ._classes import TwoGroupsEffectSize, PermutationTest
44

dabest/_api.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/API/load.ipynb.
22

33
# %% auto 0
4-
__all__ = ['load']
4+
__all__ = ['load', 'prop_dataset']
55

66
# %% ../nbs/API/load.ipynb 4
77
def load(data, idx=None, x=None, y=None, paired=None, id_col=None,
@@ -77,3 +77,54 @@ def load(data, idx=None, x=None, y=None, paired=None, id_col=None,
7777
return Dabest(data, idx, x, y, paired, id_col, ci, resamples, random_seed, proportional, delta2, experiment, experiment_label, x1_level, mini_meta)
7878

7979

80+
81+
# %% ../nbs/API/load.ipynb 5
82+
import numpy as np
83+
from typing import Union, Optional
84+
85+
def prop_dataset(group: Union[list, tuple, np.ndarray, dict],
                 group_names: Optional[list] = None):
    '''
    Build a DataFrame of binary (0/1) observations from per-group counts.

    Parameters
    ----------
    group : list, tuple, numpy.ndarray or dict
        Either a dict mapping each group name to a two-element sequence
        ``[n_zeros, n_ones]``, or a flat sequence
        ``[n_zeros_1, n_ones_1, n_zeros_2, n_ones_2, ...]`` holding one
        pair of counts per entry of `group_names`.
    group_names : list, optional
        Names of the groups. Required when `group` is not a dict; when
        `group` is a dict it must contain exactly the dict's keys.

    Returns
    -------
    pandas.DataFrame
        One column per group containing ``n_zeros`` 0's followed by
        ``n_ones`` 1's, plus an ``ID`` column numbered from 1.

    Raises
    ------
    ValueError
        If the names and counts are inconsistent, no group is given, a
        count is negative, or the groups do not all have the same total.
    TypeError
        If a count is not an integer.
    '''
    import pandas as pd

    if isinstance(group, dict):
        # If group_names is not provided, use the keys of the dict.
        if group_names is None:
            group_names = list(group.keys())
        elif set(group_names) != set(group.keys()):
            raise ValueError('group_names must be the same as the keys of the dict.')
        # Every value must be a sequence holding exactly two counts.
        if not all(isinstance(group[name], (list, tuple, np.ndarray)) for name in group_names):
            raise ValueError('group must be a dict of lists, tuples, or numpy ndarrays of numeric types.')
        if not all(len(group[name]) == 2 for name in group_names):
            raise ValueError('Each parent key should have only two elements.')
        group_val = group
    else:
        if group_names is None:
            raise ValueError('group_names must be provided if group is not a dict.')
        if len(group) != 2 * len(group_names):
            raise ValueError('The length of group must be two times of the length of group_names.')
        # Duplicate names would silently overwrite each other below and
        # drop a pair of counts, so reject them up front.
        if len(set(group_names)) != len(group_names):
            raise ValueError('group_names must not contain duplicates.')
        group_val = {name: [group[2 * i], group[2 * i + 1]]
                     for i, name in enumerate(group_names)}

    if not group_val:
        raise ValueError('At least one group must be provided.')

    # Validate the counts once and normalise them to plain Python ints so
    # the list arithmetic below behaves the same for numpy integer input.
    counts = {}
    for name, pair in group_val.items():
        clean = []
        for count in pair:
            if not isinstance(count, (int, np.integer)):
                raise TypeError('The counts under each key must be integers.')
            if count < 0:
                raise ValueError('The counts under each key must be non-negative.')
            clean.append(int(count))
        counts[name] = clean

    # All groups share the ID column, so they must have the same size.
    totals = [n_zeros + n_ones for n_zeros, n_ones in counts.values()]
    n_rows = totals[0]
    if any(total != n_rows for total in totals):
        raise ValueError('The sum of values under each key must be the same.')

    # Build every column first and create the frame in one step rather
    # than concatenating a new frame per group.
    final_df = pd.DataFrame({name: [0] * n_zeros + [1] * n_ones
                             for name, (n_zeros, n_ones) in counts.items()})
    final_df['ID'] = pd.Series(range(1, n_rows + 1))

    return final_df

nbs/01-getting_started.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
"* [scipy 1.9.3](https://www.scipy.org)\n",
3636
"* [matplotlib 3.5.1](https://www.matplotlib.org)\n",
3737
"* [pandas 1.5.0](https://pandas.pydata.org)\n",
38-
"* [seaborn 0.11.2](https://seaborn.pydata.org)\n",
38+
"* [seaborn 0.12.2](https://seaborn.pydata.org)\n",
3939
"* [lqrt 0.3](https://github.com/alyakin314/lqrt)\n",
4040
"\n",
4141
"To obtain these package dependencies easily, it is highly recommended to download the [Anaconda](https://www.continuum.io/downloads) distribution of Python.\n"

nbs/API/load.ipynb

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,64 @@
123123
"\n"
124124
]
125125
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": null,
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"#| export\n",
133+
"import numpy as np\n",
134+
"from typing import Union, Optional\n",
135+
"\n",
136+
"def prop_dataset(group:Union[list, tuple, np.ndarray, dict], #Accepts lists, tuples, or numpy ndarrays of numeric types.\n",
137+
" group_names: Optional[list] = None):\n",
138+
" '''\n",
139+
" Convenient function to generate a dataframe of binary data.\n",
140+
" '''\n",
141+
" import pandas as pd\n",
142+
"\n",
143+
" if isinstance(group, dict):\n",
144+
" # If group_names is not provided, use the keys of the dict as group_names\n",
145+
" if group_names is None:\n",
146+
" group_names = list(group.keys())\n",
147+
" elif not set(group_names) == set(group.keys()):\n",
148+
" # Check if the group_names provided is the same as the keys of the dict\n",
149+
" raise ValueError('group_names must be the same as the keys of the dict.')\n",
150+
" # Check if the values in the dict are numeric\n",
151+
" if not all([isinstance(group[name], (list, tuple, np.ndarray)) for name in group_names]):\n",
152+
" raise ValueError('group must be a dict of lists, tuples, or numpy ndarrays of numeric types.')\n",
153+
" # Check if the values in the dict only have two elements under each parent key\n",
154+
" if not all([len(group[name]) == 2 for name in group_names]):\n",
155+
" raise ValueError('Each parent key should have only two elements.')\n",
156+
" group_val = group\n",
157+
"\n",
158+
" else:\n",
159+
" if group_names is None:\n",
160+
" raise ValueError('group_names must be provided if group is not a dict.')\n",
161+
" # Check if the length of group is two times of the length of group_names\n",
162+
" if not len(group) == 2 * len(group_names):\n",
163+
" raise ValueError('The length of group must be two times of the length of group_names.')\n",
164+
" group_val = {group_names[i]: [group[i*2], group[i*2+1]] for i in range(len(group_names))}\n",
165+
"\n",
166+
" # Check if the sum of values in group_val under each key are the same\n",
167+
" if not all([sum(group_val[name]) == sum(group_val[group_names[0]]) for name in group_val.keys()]):\n",
168+
" raise ValueError('The sum of values under each key must be the same.')\n",
169+
" \n",
170+
" id_col = pd.Series(range(1, sum(group_val[group_names[0]])+1))\n",
171+
" \n",
172+
" final_df = pd.DataFrame()\n",
173+
"\n",
174+
" for name in group_val.keys():\n",
175+
" col = np.repeat(0, group_val[name][0]).tolist() + np.repeat(1, group_val[name][1]).tolist()\n",
176+
" df = pd.DataFrame({name:col})\n",
177+
" final_df = pd.concat([final_df, df], axis=1)\n",
178+
"\n",
179+
" final_df['ID'] = id_col\n",
180+
"\n",
181+
" return final_df"
182+
]
183+
},
126184
{
127185
"cell_type": "markdown",
128186
"metadata": {},

nbs/tutorials/03-proportion_plot.ipynb

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,120 @@
262262
"df.head()"
263263
]
264264
},
265+
{
266+
"cell_type": "markdown",
267+
"id": "7070baac",
268+
"metadata": {},
269+
"source": [
270+
"### Convenient Funtion to Create Dataset for Unpaired Proportional Plot"
271+
]
272+
},
273+
{
274+
"cell_type": "markdown",
275+
"id": "aa0a822c",
276+
"metadata": {},
277+
"source": [
278+
"In DABEST v2023.3.29, we absorbed some advice from biologists who will not have tables of 0's and 1's to hand. So a convenient function to generate the binary dataset according to the sample sizes is provided. Users can generate a pandas.DataFrame with the sample sizes of each element in the groups and the group names (optional if the sample sizes are provided in a dict)."
279+
]
280+
},
281+
{
282+
"cell_type": "code",
283+
"execution_count": null,
284+
"id": "4da428be",
285+
"metadata": {},
286+
"outputs": [
287+
{
288+
"name": "stdout",
289+
"output_type": "stream",
290+
"text": [
291+
"True\n"
292+
]
293+
},
294+
{
295+
"data": {
296+
"text/html": [
297+
"<div>\n",
298+
"<style scoped>\n",
299+
" .dataframe tbody tr th:only-of-type {\n",
300+
" vertical-align: middle;\n",
301+
" }\n",
302+
"\n",
303+
" .dataframe tbody tr th {\n",
304+
" vertical-align: top;\n",
305+
" }\n",
306+
"\n",
307+
" .dataframe thead th {\n",
308+
" text-align: right;\n",
309+
" }\n",
310+
"</style>\n",
311+
"<table border=\"1\" class=\"dataframe\">\n",
312+
" <thead>\n",
313+
" <tr style=\"text-align: right;\">\n",
314+
" <th></th>\n",
315+
" <th>a</th>\n",
316+
" <th>b</th>\n",
317+
" <th>ID</th>\n",
318+
" </tr>\n",
319+
" </thead>\n",
320+
" <tbody>\n",
321+
" <tr>\n",
322+
" <th>0</th>\n",
323+
" <td>0</td>\n",
324+
" <td>0</td>\n",
325+
" <td>1</td>\n",
326+
" </tr>\n",
327+
" <tr>\n",
328+
" <th>1</th>\n",
329+
" <td>0</td>\n",
330+
" <td>0</td>\n",
331+
" <td>2</td>\n",
332+
" </tr>\n",
333+
" <tr>\n",
334+
" <th>2</th>\n",
335+
" <td>0</td>\n",
336+
" <td>1</td>\n",
337+
" <td>3</td>\n",
338+
" </tr>\n",
339+
" <tr>\n",
340+
" <th>3</th>\n",
341+
" <td>1</td>\n",
342+
" <td>1</td>\n",
343+
" <td>4</td>\n",
344+
" </tr>\n",
345+
" <tr>\n",
346+
" <th>4</th>\n",
347+
" <td>1</td>\n",
348+
" <td>1</td>\n",
349+
" <td>5</td>\n",
350+
" </tr>\n",
351+
" </tbody>\n",
352+
"</table>\n",
353+
"</div>"
354+
],
355+
"text/plain": [
356+
" a b ID\n",
357+
"0 0 0 1\n",
358+
"1 0 0 2\n",
359+
"2 0 1 3\n",
360+
"3 1 1 4\n",
361+
"4 1 1 5"
362+
]
363+
},
364+
"execution_count": null,
365+
"metadata": {},
366+
"output_type": "execute_result"
367+
}
368+
],
369+
"source": [
370+
"sample_size_1 = {'a':[3, 4], 'b':[2, 5]}\n",
371+
"sample_size_2 = [3, 4, 2, 5]\n",
372+
"names = ['a', 'b']\n",
373+
"sample_df_1 = dabest.prop_dataset(sample_size_1)\n",
374+
"sample_df_2 = dabest.prop_dataset(sample_size_2, names)\n",
375+
"print(all(sample_df_1 == sample_df_2))\n",
376+
"sample_df_1.head()"
377+
]
378+
},
265379
{
266380
"attachments": {},
267381
"cell_type": "markdown",

0 commit comments

Comments
 (0)