-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
75 lines (57 loc) · 2.33 KB
/
Copy pathutils.py
File metadata and controls
75 lines (57 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import pandas as pd
# Public API of this module: the names exported by `from <module> import *`.
__all__ = [
"note_drop_null",
"keep_first_with_max_serial",
"note_sort",
"concatenate_notes",
"format_notes",
"extract_suvr_values",
"replace_missing",
"check_qc",
]
def note_drop_null(df_note, col_list):
    """Return the rows of *df_note* that are non-null in every listed column.

    Args:
        df_note: DataFrame to filter.
        col_list: Iterable of column labels; a row is kept only when all of
            these columns hold a non-null value.

    Returns:
        The filtered DataFrame. When *col_list* is empty the input object
        itself is returned unchanged.
    """
    remaining = df_note
    for column in col_list:
        # Filter one column at a time; each pass narrows the previous result.
        has_value = remaining[column].notna()
        remaining = remaining[has_value]
    return remaining
def keep_first_with_max_serial(df, groupby_col, transform_col, method=max):
    """Keep the rows whose *transform_col* equals their group's aggregate.

    Rows are grouped by *groupby_col*; *method* is applied to *transform_col*
    within each group, and only rows whose own value equals that per-group
    result are returned. With the default (maximum) this keeps the row(s)
    carrying the largest value in each group.

    NOTE: despite the function name, ties are not broken -- every row that
    matches the group aggregate is kept, not just the first one.

    Args:
        df: DataFrame to filter.
        groupby_col: Column label to group by.
        transform_col: Column label whose values are aggregated and compared.
        method: Aggregation passed to ``GroupBy.transform``; either a pandas
            aggregation name such as ``"max"`` or a callable.

    Returns:
        A filtered DataFrame containing only the matching rows.
    """
    # Passing the Python builtins directly to GroupBy.transform is deprecated
    # in recent pandas (FutureWarning) and, once used as-is, stops skipping
    # NaN the way the pandas aggregation does. Translate them to the
    # equivalent pandas names so the behaviour is stable across versions.
    for builtin_func, alias in ((max, "max"), (min, "min"), (sum, "sum")):
        if method is builtin_func:
            method = alias
            break

    group_aggregate = df.groupby([groupby_col])[transform_col].transform(method)
    return df[group_aggregate == df[transform_col]]
def note_sort(df_note, col_name):
    """Return *df_note* sorted in ascending order by *col_name*.

    Args:
        df_note: DataFrame to sort.
        col_name: A column label or list of labels, forwarded as the ``by``
            argument of ``DataFrame.sort_values``.

    Returns:
        The sorted DataFrame produced by ``sort_values``.
    """
    ordered = df_note.sort_values(by=col_name)
    return ordered
def concatenate_notes(df_note, col_name_text, col_name_concat):
    """Merge the text of rows sharing a key into one space-joined row.

    Every value in *col_name_text* is converted to ``str``; the values of all
    rows with the same *col_name_concat* key are joined with a single space
    (in their current row order), and only the first row of each key is kept.

    The input frame is not modified: the work is done on a copy, so callers
    holding a reference to *df_note* (or a slice of another frame) do not see
    their text column overwritten.

    Args:
        df_note: DataFrame holding one text fragment per row.
        col_name_text: Label of the column containing the text fragments.
        col_name_concat: Label (or labels) identifying rows that belong
            together.

    Returns:
        A new DataFrame with one row per key, whose *col_name_text* holds the
        joined text.
    """
    # Copy first: assigning into the argument would silently rewrite the
    # caller's data even though a different (de-duplicated) frame is returned.
    merged = df_note.copy()
    merged[col_name_text] = merged[col_name_text].astype(str)
    merged[col_name_text] = merged.groupby(col_name_concat)[col_name_text].transform(
        lambda fragments: " ".join(fragments)
    )
    return merged.drop_duplicates(subset=col_name_concat, keep="first")
def format_notes(notes):
    """Run the note-cleaning steps on *notes* and return the result.

    The steps, applied in order, are:

    1. ``note_drop_null`` on the ``NOTEID`` and ``NOTETXT`` columns.
    2. ``note_sort`` by ``PATIENTID``, ``NOTEID``, ``NOTE_CSN_ID``, ``LINENBR``.
    3. ``concatenate_notes`` joining ``NOTETXT`` per ``NOTE_CSN_ID``.
    4. ``keep_first_with_max_serial`` keeping, per ``NOTEID``, the rows whose
       ``NOTE_CSN_ID`` equals the group maximum.

    Args:
        notes: DataFrame containing the columns named above.

    Returns:
        The DataFrame produced by the final step.
    """
    return (
        notes.pipe(note_drop_null, ["NOTEID", "NOTETXT"])
        .pipe(note_sort, ["PATIENTID", "NOTEID", "NOTE_CSN_ID", "LINENBR"])
        .pipe(concatenate_notes, "NOTETXT", "NOTE_CSN_ID")
        .pipe(keep_first_with_max_serial, "NOTEID", "NOTE_CSN_ID", method="max")
    )
def extract_suvr_values(df, column_name):
    """Parse ``value [mean (lower - upper)]`` strings into float columns.

    The text in *column_name* is searched for a number (integer or decimal)
    followed by a bracketed decimal and a parenthesised ``low - high`` pair of
    decimals. Four columns are written to *df*, named after *column_name* with
    the suffixes ``_value``, ``_mean``, ``_lower_range`` and ``_upper_range``.
    Rows where the pattern is not found receive NaN in all four.

    Args:
        df: DataFrame to read from and add columns to (modified in place).
        column_name: Label of the string column to parse.

    Returns:
        The same *df* object, with the four new columns added.
    """
    pattern = r"((\d+)|(\d+\.\d+))\s*\[(\d+\.\d+)\s*\((\d+\.\d+)\s*-\s*(\d+\.\d+)\)\]"
    extracted = df[column_name].str.extract(pattern)

    # Capture-group positions: 0 is the whole leading number (groups 1 and 2
    # are its integer/decimal alternatives and are not used directly).
    group_for_column = {
        f"{column_name}_value": 0,
        f"{column_name}_mean": 3,
        f"{column_name}_lower_range": 4,
        f"{column_name}_upper_range": 5,
    }
    for new_column, group_index in group_for_column.items():
        df[new_column] = extracted[group_index].astype(float)
    return df
def replace_missing(value):
    """Map the "missing" placeholders to ``None``; pass anything else through.

    The placeholders are an empty list, the list ``["missing"]`` and the
    string ``"missing"``.

    Args:
        value: Any value.

    Returns:
        ``None`` when *value* equals one of the placeholders, otherwise
        *value* unchanged.
    """
    placeholders = ([], ["missing"], "missing")
    is_placeholder = any(value == marker for marker in placeholders)
    return None if is_placeholder else value
def check_qc(total_score_column, component_columns, qc_column_name, df):
    """Flag whether each row's component scores add up to its total score.

    For every row the values in *component_columns* are summed (nulls are
    skipped by ``DataFrame.sum``) and compared for equality with
    *total_score_column*. The outcome is stored in *qc_column_name*:

    * ``True`` / ``False`` -- the sum does / does not equal the total;
    * ``None`` -- every component in the row is null, so there is nothing to
      check.

    Args:
        total_score_column: Label of the column holding the reported total.
        component_columns: List of labels of the component score columns.
        qc_column_name: Label of the result column to create or overwrite.
        df: DataFrame to check (modified in place).

    Returns:
        The same *df* object with *qc_column_name* added.
    """
    components = df[component_columns]
    # Kept as standalone Series rather than scratch columns on ``df`` so that
    # existing columns with the same names are never overwritten or dropped.
    has_components = components.notna().any(axis=1)
    sum_matches_total = components.sum(axis=1) == df[total_score_column]

    # Built positionally (no row-wise apply) so an empty frame simply yields
    # an empty column instead of failing on assignment.
    df[qc_column_name] = [
        matches if present else None
        for present, matches in zip(has_components, sum_matches_total)
    ]
    return df