Skip to content

Commit 1f1aa42

Browse files
authored
Use utf-8 to open eBird codes file (#831)
* Use utf-8 to open eBird codes file
* .
1 parent c2c52fb commit 1f1aa42

1 file changed

Lines changed: 28 additions & 86 deletions

File tree

birdnet_analyzer/analyze/utils.py

Lines changed: 28 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
import birdnet_analyzer.config as cfg
1212
from birdnet_analyzer import audio, model, utils
1313

14-
RAVEN_TABLE_HEADER = "Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tCommon Name\tSpecies Code\tConfidence\tBegin Path\tFile Offset (s)\n" # noqa: E501
14+
RAVEN_TABLE_HEADER = (
15+
"Selection\tView\tChannel\tBegin Time (s)\tEnd Time (s)\tLow Freq (Hz)\tHigh Freq (Hz)\tCommon Name\tSpecies Code\tConfidence\tBegin Path\tFile Offset (s)\n"
16+
)
1517
KALEIDOSCOPE_HEADER = "INDIR,FOLDER,IN FILE,OFFSET,DURATION,scientific_name,common_name,confidence,lat,lon,week,overlap,sensitivity\n"
1618
CSV_HEADER = "Start (s),End (s),Scientific name,Common name,Confidence,File\n"
1719
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
@@ -53,13 +55,11 @@ def load_codes():
5355
Returns:
5456
A dictionary containing the eBird codes.
5557
"""
56-
with open(os.path.join(SCRIPT_DIR, cfg.CODES_FILE)) as cfile:
58+
with open(os.path.join(SCRIPT_DIR, cfg.CODES_FILE), encoding="utf-8") as cfile:
5759
return json.load(cfile)
5860

5961

60-
def generate_raven_table(
61-
timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str
62-
):
62+
def generate_raven_table(timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str):
6363
"""
6464
Generates a Raven selection table from the given timestamps and prediction results.
6565
@@ -90,11 +90,7 @@ def generate_raven_table(
9090

9191
for c in result[timestamp]:
9292
selection_id += 1
93-
label = (
94-
cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])]
95-
if cfg.TRANSLATED_LABELS
96-
else c[0]
97-
)
93+
label = cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])] if cfg.TRANSLATED_LABELS else c[0]
9894
code = cfg.CODES[c[0]] if c[0] in cfg.CODES else c[0]
9995
lbl = label if cfg.USE_PERCH else label.split("_", 1)[-1]
10096
rstring += f"{selection_id}\tSpectrogram 1\t1\t{start}\t{end}\t{low_freq}\t{high_freq}\t{lbl}\t{code}\t{c[1]:.4f}\t{afile_path}\t{start}\n"
@@ -131,11 +127,7 @@ def generate_audacity(timestamps: list[str], result: dict[str, list], result_pat
131127
rstring = ""
132128

133129
for c in result[timestamp]:
134-
label = (
135-
cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])]
136-
if cfg.TRANSLATED_LABELS
137-
else c[0]
138-
)
130+
label = cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])] if cfg.TRANSLATED_LABELS else c[0]
139131
ts = timestamp.replace("-", "\t")
140132
lbl = label if cfg.USE_PERCH else label.replace("_", ", ")
141133
rstring += f"{ts}\t{lbl}\t{c[1]:.4f}\n"
@@ -146,9 +138,7 @@ def generate_audacity(timestamps: list[str], result: dict[str, list], result_pat
146138
utils.save_result_file(result_path, out_string)
147139

148140

149-
def generate_kaleidoscope(
150-
timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str
151-
):
141+
def generate_kaleidoscope(timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str):
152142
"""
153143
Generates a Kaleidoscope-compatible CSV string from the given timestamps and results, and saves it to a file.
154144
@@ -172,11 +162,7 @@ def generate_kaleidoscope(
172162
start, end = timestamp.split("-", 1)
173163

174164
for c in result[timestamp]:
175-
label = (
176-
cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])]
177-
if cfg.TRANSLATED_LABELS
178-
else c[0]
179-
)
165+
label = cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])] if cfg.TRANSLATED_LABELS else c[0]
180166

181167
if cfg.USE_PERCH:
182168
common = scientific = label
@@ -206,9 +192,7 @@ def generate_kaleidoscope(
206192
utils.save_result_file(result_path, out_string)
207193

208194

209-
def generate_csv(
210-
timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str
211-
):
195+
def generate_csv(timestamps: list[str], result: dict[str, list], afile_path: str, result_path: str):
212196
"""
213197
Generates a CSV file from the given timestamps and results.
214198
@@ -240,11 +224,7 @@ def generate_csv(
240224

241225
for c in result[timestamp]:
242226
start, end = timestamp.split("-", 1)
243-
label = (
244-
cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])]
245-
if cfg.TRANSLATED_LABELS
246-
else c[0]
247-
)
227+
label = cfg.TRANSLATED_LABELS[cfg.LABELS.index(c[0])] if cfg.TRANSLATED_LABELS else c[0]
248228

249229
if cfg.USE_PERCH:
250230
common = scientific = label
@@ -265,9 +245,7 @@ def generate_csv(
265245
utils.save_result_file(result_path, out_string)
266246

267247

268-
def save_result_files(
269-
r: dict[str, list], result_files: dict[str, str], afile_path: str
270-
):
248+
def save_result_files(r: dict[str, list], result_files: dict[str, str], afile_path: str):
271249
"""
272250
Saves the result files in various formats based on the provided configuration.
273251
@@ -298,9 +276,7 @@ def save_result_files(
298276
# generate_rtable(timestamps, r, afile_path, result_files["r"])
299277

300278
if "kaleidoscope" in cfg.RESULT_TYPES:
301-
generate_kaleidoscope(
302-
timestamps, r_merged, afile_path, result_files["kaleidoscope"]
303-
)
279+
generate_kaleidoscope(timestamps, r_merged, afile_path, result_files["kaleidoscope"])
304280

305281
if "csv" in cfg.RESULT_TYPES:
306282
generate_csv(timestamps, r_merged, afile_path, result_files["csv"])
@@ -321,9 +297,7 @@ def combine_raven_tables(saved_results: list[str]):
321297
time_offset = 0
322298
audiofiles = []
323299

324-
with open(
325-
os.path.join(cfg.OUTPUT_PATH, cfg.OUTPUT_RAVEN_FILENAME), "w", encoding="utf-8"
326-
) as f:
300+
with open(os.path.join(cfg.OUTPUT_PATH, cfg.OUTPUT_RAVEN_FILENAME), "w", encoding="utf-8") as f:
327301
f.write(RAVEN_TABLE_HEADER)
328302

329303
for rfile in saved_results:
@@ -350,10 +324,7 @@ def combine_raven_tables(saved_results: list[str]):
350324

351325
# Is species code and common name == 'nocall'?
352326
# If so, that's a dummy line and we can skip it
353-
if (
354-
line.split("\t")[7] == "nocall"
355-
and line.split("\t")[8] == "nocall"
356-
):
327+
if line.split("\t")[7] == "nocall" and line.split("\t")[8] == "nocall":
357328
continue
358329

359330
# adjust selection id
@@ -435,9 +406,7 @@ def combine_csv_files(saved_results: list[str]):
435406
print(f"Error: Cannot combine results from {rfile}.\n", flush=True)
436407
utils.write_error_log(ex)
437408

438-
with open(
439-
os.path.join(cfg.OUTPUT_PATH, cfg.OUTPUT_CSV_FILENAME), "w", encoding="utf-8"
440-
) as f:
409+
with open(os.path.join(cfg.OUTPUT_PATH, cfg.OUTPUT_CSV_FILENAME), "w", encoding="utf-8") as f:
441410
f.write(out_string)
442411

443412

@@ -465,9 +434,7 @@ def combine_results(saved_results: Sequence[dict[str, str] | str]):
465434
combine_csv_files([f["csv"] for f in saved_results if isinstance(f, dict)])
466435

467436

468-
def merge_consecutive_detections(
469-
results: dict[str, list], max_consecutive: int | None = None
470-
):
437+
def merge_consecutive_detections(results: dict[str, list], max_consecutive: int | None = None):
471438
"""Merges consecutive detections of the same species.
472439
Uses the mean of the top-3 highest scoring predictions as
473440
confidence score for the merged detection.
@@ -513,9 +480,7 @@ def merge_consecutive_detections(
513480
merged_scores = [timestamps[i][1], timestamps[i + 1][1]]
514481
timestamps.pop(i)
515482

516-
while i < len(timestamps) - 1 and float(next_end) >= float(
517-
timestamps[i + 1][0].split("-", 1)[0]
518-
):
483+
while i < len(timestamps) - 1 and float(next_end) >= float(timestamps[i + 1][0].split("-", 1)[0]):
519484
if max_consecutive and len(merged_scores) >= max_consecutive:
520485
break
521486
merged_scores.append(timestamps[i + 1][1])
@@ -576,9 +541,7 @@ def get_raw_audio_from_file(fpath: str, offset, duration):
576541
)
577542

578543
# Split into raw audio chunks
579-
return audio.split_signal(
580-
sig, rate, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN
581-
)
544+
return audio.split_signal(sig, rate, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
582545

583546

584547
def iterate_audio_chunks(fpath: str, embeddings: bool = False):
@@ -604,9 +567,7 @@ def iterate_audio_chunks(fpath: str, embeddings: bool = False):
604567
break
605568

606569
for chunk_index, chunk in enumerate(chunks):
607-
t_start = start + (
608-
chunk_index * (cfg.SIG_LENGTH - cfg.SIG_OVERLAP) * cfg.AUDIO_SPEED
609-
)
570+
t_start = start + (chunk_index * (cfg.SIG_LENGTH - cfg.SIG_OVERLAP) * cfg.AUDIO_SPEED)
610571
end = min(t_start + cfg.SIG_LENGTH * cfg.AUDIO_SPEED, fileLengthSeconds)
611572

612573
# Add to batch
@@ -649,9 +610,7 @@ def predict(samples):
649610

650611
# Logits or sigmoid activations?
651612
if cfg.APPLY_SIGMOID and not cfg.USE_PERCH:
652-
prediction = model.flat_sigmoid(
653-
np.array(prediction), sensitivity=-1, bias=cfg.SIGMOID_SENSITIVITY
654-
)
613+
prediction = model.flat_sigmoid(np.array(prediction), sensitivity=-1, bias=cfg.SIGMOID_SENSITIVITY)
655614

656615
return prediction
657616

@@ -671,32 +630,20 @@ def get_result_file_names(fpath: str):
671630

672631
rpath = fpath.replace(cfg.INPUT_PATH, "")
673632

674-
rpath = (
675-
(rpath[1:] if rpath[0] in ["/", "\\"] else rpath)
676-
if rpath
677-
else os.path.basename(fpath)
678-
)
633+
rpath = (rpath[1:] if rpath[0] in ["/", "\\"] else rpath) if rpath else os.path.basename(fpath)
679634

680635
file_shorthand = rpath.rsplit(".", 1)[0]
681636

682637
if "table" in cfg.RESULT_TYPES:
683-
result_names["table"] = os.path.join(
684-
cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.selection.table.txt"
685-
)
638+
result_names["table"] = os.path.join(cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.selection.table.txt")
686639
if "audacity" in cfg.RESULT_TYPES:
687-
result_names["audacity"] = os.path.join(
688-
cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.txt"
689-
)
640+
result_names["audacity"] = os.path.join(cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.txt")
690641
# if "r" in cfg.RESULT_TYPES:
691642
# result_names["r"] = os.path.join(cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.r.csv")
692643
if "kaleidoscope" in cfg.RESULT_TYPES:
693-
result_names["kaleidoscope"] = os.path.join(
694-
cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.kaleidoscope.csv"
695-
)
644+
result_names["kaleidoscope"] = os.path.join(cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.kaleidoscope.csv")
696645
if "csv" in cfg.RESULT_TYPES:
697-
result_names["csv"] = os.path.join(
698-
cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.csv"
699-
)
646+
result_names["csv"] = os.path.join(cfg.OUTPUT_PATH, file_shorthand + ".BirdNET.results.csv")
700647

701648
return result_names
702649

@@ -720,9 +667,7 @@ def analyze_file(item) -> dict[str, str] | None:
720667

721668
result_file_names = get_result_file_names(fpath)
722669

723-
if cfg.SKIP_EXISTING_RESULTS and all(
724-
os.path.exists(f) for f in result_file_names.values()
725-
):
670+
if cfg.SKIP_EXISTING_RESULTS and all(os.path.exists(f) for f in result_file_names.values()):
726671
print(f"Skipping {fpath} as it has already been analyzed", flush=True)
727672
return None # or return path to combine later? TODO
728673

@@ -741,10 +686,7 @@ def analyze_file(item) -> dict[str, str] | None:
741686

742687
# Assign scores to labels
743688
p_labels = [
744-
p
745-
for p in zip(cfg.LABELS, pred, strict=True)
746-
if (cfg.TOP_N or p[1] >= cfg.MIN_CONFIDENCE)
747-
and (not cfg.SPECIES_LIST or p[0] in cfg.SPECIES_LIST)
689+
p for p in zip(cfg.LABELS, pred, strict=True) if (cfg.TOP_N or p[1] >= cfg.MIN_CONFIDENCE) and (not cfg.SPECIES_LIST or p[0] in cfg.SPECIES_LIST)
748690
]
749691

750692
# Sort by score

0 commit comments

Comments (0)