Skip to content

Commit a983c2c

Browse files
committed: iterate over ALL counts for every region
1 parent: 72493f2
commit: a983c2c

1 file changed

Lines changed: 8 additions & 10 deletions

File tree

ALLCools/count_matrix/dataset.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def _determine_datasets(regions, quantifiers, chrom_size_path):
9898
"do not have index in its fourth column, adding it automatically. "
9999
"If this is not desired, add a fourth column containing UNIQUE IDs to the BED file.",
100100
)
101-
region_bed_df[name] = [f"{name}_{i}" for i in range(region_bed_df.shape[0])]
101+
region_bed_df[name] = (f"{name}_{i}" for i in range(region_bed_df.shape[0]))
102102
# check if name is unique()
103103
if region_bed_df.iloc[:, 3].duplicated().sum() > 0:
104104
raise ValueError(f"Region IDs in {region_path} (fourth column) are not unique.")
@@ -219,7 +219,6 @@ def _count_single_zarr(
219219
count_ds = _count_single_region_set(
220220
allc_table=allc_table, region_config=region_config, obs_dim=obs_dim, region_dim=region_dim
221221
)
222-
223222
# deal with count quantifiers
224223
count_mc_types = []
225224
for quant in region_config["quant"]:
@@ -326,8 +325,7 @@ def generate_dataset(
326325
subprocess.run(["cp", "-f", chrom_size_path, f"{output_path}/chrom_sizes.txt"], check=True)
327326
rgs = {}
328327
for region_dim, region_config in datasets.items():
329-
regiongroup = root.create_group(region_dim)
330-
rgs[region_dim] = regiongroup
328+
rgs[region_dim] = root.create_group(region_dim)
331329
# save region coords to the ds
332330
bed = pd.read_csv(f"{tmpdir}/{region_dim}.regions.csv", index_col=0)
333331
bed.columns = [f"{region_dim}_chrom", f"{region_dim}_start", f"{region_dim}_end"]
@@ -343,7 +341,7 @@ def generate_dataset(
343341
if ds.coords[k].dtype == "O":
344342
ds.coords[k] = ds.coords[k].astype(str)
345343
ds.to_zarr(f"{output_path}/{region_dim}", mode="w", consolidated=False)
346-
dsobs = regiongroup.empty(
344+
dsobs = rgs[region_dim].empty(
347345
name=obs_dim, shape=allc_table.index.size, chunks=(chunk_size), dtype=f"<U{max_length}"
348346
)
349347
dsobs.attrs["_ARRAY_DIMENSIONS"] = [obs_dim]
@@ -353,22 +351,22 @@ def generate_dataset(
353351
count_mc_types += quant.mc_types
354352
count_mc_types = list(set(count_mc_types))
355353
if len(count_mc_types) > 0:
356-
DA = regiongroup.empty(
354+
DA = rgs[region_dim].empty(
357355
name=f"{region_dim}_da",
358356
shape=(n_sample, region_size, len(count_mc_types), 2),
359357
chunks=(chunk_size, region_size, len(count_mc_types), 2),
360358
dtype="uint32",
361359
)
362360
DA.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim, "mc_type", "count_type"]
363-
count = regiongroup.array(name="count_type", data=(["mc", "cov"]), dtype="<U3")
361+
count = rgs[region_dim].array(name="count_type", data=(["mc", "cov"]), dtype="<U3")
364362
count.attrs["_ARRAY_DIMENSIONS"] = ["count_type"]
365-
mc = regiongroup.array(name="mc_type", data=count_mc_types, dtype="<U3")
363+
mc = rgs[region_dim].array(name="mc_type", data=count_mc_types, dtype="<U3")
366364
mc.attrs["_ARRAY_DIMENSIONS"] = ["mc_type"]
367365
# deal with hypo-score, hyper-score quantifiers
368366
for quant in region_config["quant"]:
369367
if quant.quant_type == "hypo-score":
370368
for mc_type in quant.mc_types:
371-
hypo = regiongroup.empty(
369+
hypo = rgs[region_dim].empty(
372370
name=f"{region_dim}_da_{mc_type}-hypo-score",
373371
shape=(allc_table.size, region_size),
374372
chunks=(chunk_size, region_size),
@@ -377,7 +375,7 @@ def generate_dataset(
377375
hypo.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim]
378376
elif quant.quant_type == "hyper-score":
379377
for mc_type in quant.mc_types:
380-
hyper = regiongroup.empty(
378+
hyper = rgs[region_dim].empty(
381379
name=f"{region_dim}_da_{mc_type}-hyper-score",
382380
shape=(allc_table.size, region_size),
383381
chunks=(chunk_size, region_size),

0 commit comments

Comments (0)