|
212 | 212 | " \"\"\"\n", |
213 | 213 | "\n", |
214 | 214 | " rng = RandomState(PCG64(random_seed))\n", |
215 | | - " x1_len = len(x1)\n", |
216 | | - " x2_len = len(x2)\n", |
217 | | - " x3_len = len(x3)\n", |
218 | | - " x4_len = len(x4)\n", |
219 | | - " out_delta_g = np.repeat(np.nan, resamples)\n", |
220 | | - " deltadelta = np.repeat(np.nan, resamples)\n", |
221 | | - "\n", |
222 | | - " n_a1_b1, n_a2_b1, n_a1_b2, n_a2_b2 = x1_len, x2_len, x3_len, x4_len\n", |
223 | | - " s_a1_b1, s_a2_b1, s_a1_b2, s_a2_b2 = np.std(x1), np.std(x2), np.std(x3), np.std(x4)\n", |
224 | | - "\n", |
225 | | - " sd_numerator = (\n", |
226 | | - " (n_a2_b1 - 1) * s_a2_b1**2\n", |
227 | | - " + (n_a1_b1 - 1) * s_a1_b1**2\n", |
228 | | - " + (n_a2_b2 - 1) * s_a2_b2**2\n", |
229 | | - " + (n_a1_b2 - 1) * s_a1_b2**2\n", |
230 | | - " )\n", |
231 | | - " sd_denominator = (n_a2_b1 - 1) + (n_a1_b1 - 1) + (n_a2_b2 - 1) + (n_a1_b2 - 1)\n", |
| 215 | + "\n", |
| 216 | + " x1, x2, x3, x4 = map(np.asarray, [x1, x2, x3, x4])\n", |
| 217 | + "\n", |
| 218 | + " # Calculating pooled sample standard deviation\n", |
| 219 | + " stds = [np.std(x) for x in [x1, x2, x3, x4]]\n", |
| 220 | + " ns = [len(x) for x in [x1, x2, x3, x4]]\n", |
| 221 | + "\n", |
| 222 | + " sd_numerator = sum((n - 1) * s**2 for n, s in zip(ns, stds))\n", |
| 223 | + " sd_denominator = sum(n - 1 for n in ns)\n", |
| 224 | + "\n", |
| 225 | + " # Avoid division by zero\n", |
| 226 | + " if sd_denominator == 0:\n", |
| 227 | + " raise ValueError(\"Insufficient data to compute pooled standard deviation.\")\n", |
| 228 | + "\n", |
232 | 229 | " pooled_sample_sd = np.sqrt(sd_numerator / sd_denominator)\n", |
233 | 230 | "\n", |
234 | | - " for i in range(int(resamples)):\n", |
| 231 | + " # Ensure pooled_sample_sd is not NaN or zero (to avoid division by zero later)\n", |
| 232 | + " if np.isnan(pooled_sample_sd) or pooled_sample_sd == 0:\n", |
| 233 | + " raise ValueError(\"Pooled sample standard deviation is NaN or zero.\")\n", |
| 234 | + "\n", |
| 235 | + " out_delta_g = np.empty(resamples)\n", |
| 236 | + " deltadelta = np.empty(resamples)\n", |
| 237 | + "\n", |
| 238 | + " # Bootstrapping\n", |
| 239 | + " for i in range(resamples):\n", |
| 240 | + " # Paired or unpaired resampling\n", |
235 | 241 | " if is_paired:\n", |
236 | | - " if (x1_len != x2_len) or (x3_len != x4_len):\n", |
237 | | - " raise ValueError(\"The two arrays do not have the same length.\")\n", |
238 | | - " df_paired_1 = pd.DataFrame(\n", |
239 | | - " {\n", |
240 | | - " \"value\": np.concatenate([x1, x3]),\n", |
241 | | - " \"array_id\": np.repeat([\"x1\", \"x3\"], [x1_len, x3_len]),\n", |
242 | | - " }\n", |
243 | | - " )\n", |
244 | | - " df_paired_2 = pd.DataFrame(\n", |
245 | | - " {\n", |
246 | | - " \"value\": np.concatenate([x2, x4]),\n", |
247 | | - " \"array_id\": np.repeat([\"x2\", \"x4\"], [x1_len, x3_len]),\n", |
248 | | - " }\n", |
249 | | - " )\n", |
250 | | - " x_sample_index = rng.choice(\n", |
251 | | - " len(df_paired_1), len(df_paired_1), replace=True\n", |
252 | | - " )\n", |
253 | | - " x_sample_1 = df_paired_1.loc[x_sample_index]\n", |
254 | | - " x_sample_2 = df_paired_2.loc[x_sample_index]\n", |
255 | | - " x1_sample = x_sample_1[x_sample_1[\"array_id\"] == \"x1\"][\"value\"]\n", |
256 | | - " x2_sample = x_sample_2[x_sample_2[\"array_id\"] == \"x2\"][\"value\"]\n", |
257 | | - " x3_sample = x_sample_1[x_sample_1[\"array_id\"] == \"x3\"][\"value\"]\n", |
258 | | - " x4_sample = x_sample_2[x_sample_2[\"array_id\"] == \"x4\"][\"value\"]\n", |
| 242 | + " if len(x1) != len(x2) or len(x3) != len(x4):\n", |
| 243 | + " raise ValueError(\"Each control group must have the same length as its corresponding test group in paired analysis.\")\n", |
| 244 | + " indices_1 = rng.choice(len(x1), len(x1), replace=True)\n", |
| 245 | + " indices_2 = rng.choice(len(x3), len(x3), replace=True)\n", |
| 246 | + "\n", |
| 247 | + " x1_sample, x2_sample = x1[indices_1], x2[indices_1]\n", |
| 248 | + " x3_sample, x4_sample = x3[indices_2], x4[indices_2]\n", |
259 | 249 | " else:\n", |
260 | | - " df = pd.DataFrame(\n", |
261 | | - " {\n", |
262 | | - " \"value\": np.concatenate([x1, x2, x3, x4]),\n", |
263 | | - " \"array_id\": np.repeat(\n", |
264 | | - " [\"x1\", \"x2\", \"x3\", \"x4\"], [x1_len, x2_len, x3_len, x4_len]\n", |
265 | | - " ),\n", |
266 | | - " }\n", |
267 | | - " )\n", |
268 | | - " x_sample_index = rng.choice(len(df), len(df), replace=True)\n", |
269 | | - " x_sample = df.loc[x_sample_index]\n", |
270 | | - " x1_sample = x_sample[x_sample[\"array_id\"] == \"x1\"][\"value\"]\n", |
271 | | - " x2_sample = x_sample[x_sample[\"array_id\"] == \"x2\"][\"value\"]\n", |
272 | | - " x3_sample = x_sample[x_sample[\"array_id\"] == \"x3\"][\"value\"]\n", |
273 | | - " x4_sample = x_sample[x_sample[\"array_id\"] == \"x4\"][\"value\"]\n", |
| 250 | + " x1_sample = rng.choice(x1, len(x1), replace=True)\n", |
| 251 | + " x2_sample = rng.choice(x2, len(x2), replace=True)\n", |
| 252 | + " x3_sample = rng.choice(x3, len(x3), replace=True)\n", |
| 253 | + " x4_sample = rng.choice(x4, len(x4), replace=True)\n", |
274 | 254 | "\n", |
| 255 | + " # Calculating deltas\n", |
275 | 256 | " delta_1 = np.mean(x2_sample) - np.mean(x1_sample)\n", |
276 | 257 | " delta_2 = np.mean(x4_sample) - np.mean(x3_sample)\n", |
277 | 258 | " delta_delta = delta_2 - delta_1\n", |
| 259 | + "\n", |
278 | 260 | " deltadelta[i] = delta_delta\n", |
279 | 261 | " out_delta_g[i] = delta_delta / pooled_sample_sd\n", |
280 | | - " delta_g = (\n", |
281 | | - " (np.mean(x4) - np.mean(x3)) - (np.mean(x2) - np.mean(x1))\n", |
282 | | - " ) / pooled_sample_sd\n", |
| 262 | + "\n", |
| 263 | + " # Empirical delta_g calculation\n", |
| 264 | + " delta_g = ((np.mean(x4) - np.mean(x3)) - (np.mean(x2) - np.mean(x1))) / pooled_sample_sd\n", |
| 265 | + "\n", |
283 | 266 | " return out_delta_g, delta_g, deltadelta\n", |
284 | 267 | "\n", |
285 | 268 | "\n", |
|
0 commit comments