Skip to content

Commit fa187eb

Browse files
streamline chunk generation for embeddings and analyze (#727)
* streamline chunk generation for embeddings and analyze * - * - * - * timestamps for embeddings are now stored with respect to the audio speed --------- Co-authored-by: Max Mauermann <max-mauermann@web.de>
1 parent 928bf9f commit fa187eb

7 files changed

Lines changed: 85 additions & 128 deletions

File tree

birdnet_analyzer/analyze/utils.py

Lines changed: 65 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ def combine_csv_files(saved_results: list[str]):
396396
f.write(out_string)
397397

398398

399-
def combine_results(saved_results: Sequence[dict[str, str]| None]):
399+
def combine_results(saved_results: Sequence[dict[str, str] | None]):
400400
"""
401401
Combines various types of result files based on the configuration settings.
402402
This function checks the types of results specified in the configuration
@@ -522,6 +522,56 @@ def get_raw_audio_from_file(fpath: str, offset, duration):
522522
return audio.split_signal(sig, rate, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
523523

524524

525+
def iterate_audio_chunks(fpath: str, embeddings: bool = False):
526+
"""Iterates over audio chunks from a file.
527+
528+
Args:
529+
fpath: Path to the audio file.
530+
embeddings: If True, yield model embeddings instead of class predictions.
531+
532+
Yields:
533+
Tuples of (start_seconds, end_seconds, prediction-or-embedding) per chunk.
534+
"""
535+
fileLengthSeconds = audio.get_audio_file_length(fpath)
536+
start, end = 0, cfg.SIG_LENGTH * cfg.AUDIO_SPEED
537+
duration = int(cfg.FILE_SPLITTING_DURATION / cfg.AUDIO_SPEED)
538+
539+
while start < fileLengthSeconds and not np.isclose(start, fileLengthSeconds):
540+
chunks = get_raw_audio_from_file(fpath, start, duration)
541+
samples = []
542+
timestamps = []
543+
544+
if not chunks:
545+
break
546+
547+
for chunk_index, chunk in enumerate(chunks):
548+
# Add to batch
549+
samples.append(chunk)
550+
timestamps.append([round(start, 1), round(end, 1)])
551+
552+
# Advance start and end
553+
start += (cfg.SIG_LENGTH - cfg.SIG_OVERLAP) * cfg.AUDIO_SPEED
554+
end = min(start + cfg.SIG_LENGTH * cfg.AUDIO_SPEED, fileLengthSeconds)
555+
556+
# Check if batch is full or last chunk
557+
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
558+
continue
559+
560+
# Predict
561+
p = model.embeddings(samples) if embeddings else predict(samples)
562+
563+
# Add to results
564+
for i in range(len(samples)):
565+
# Get timestamp
566+
s_start, s_end = timestamps[i]
567+
568+
yield s_start, s_end, p[i]
569+
570+
# Clear batch
571+
samples = []
572+
timestamps = []
573+
574+
525575
def predict(samples):
526576
"""Predicts the classes for the given samples.
527577
@@ -600,76 +650,31 @@ def analyze_file(item) -> dict[str, str] | None:
600650

601651
# Start time
602652
start_time = datetime.datetime.now()
603-
duration = int(cfg.FILE_SPLITTING_DURATION / cfg.AUDIO_SPEED)
604-
start, end = 0, cfg.SIG_LENGTH * cfg.AUDIO_SPEED
605653
results = {}
606654

607655
# Status
608656
print(f"Analyzing {fpath}", flush=True)
609657

610-
try:
611-
fileLengthSeconds = audio.get_audio_file_length(fpath)
612-
except Exception as ex:
613-
# Write error log
614-
print(f"Error: Cannot analyze audio file {fpath}. File corrupt?\n", flush=True)
615-
utils.write_error_log(ex)
616-
617-
return None
618-
619658
# Process each chunk
620659
try:
621-
while start < fileLengthSeconds and not np.isclose(start, fileLengthSeconds):
622-
chunks = get_raw_audio_from_file(fpath, start, duration)
623-
samples = []
624-
timestamps = []
625-
626-
for chunk_index, chunk in enumerate(chunks):
627-
# Add to batch
628-
samples.append(chunk)
629-
timestamps.append([round(start, 1), round(end, 1)])
630-
631-
# Advance start and end
632-
start += (cfg.SIG_LENGTH - cfg.SIG_OVERLAP) * cfg.AUDIO_SPEED
633-
end = min(start + cfg.SIG_LENGTH * cfg.AUDIO_SPEED, fileLengthSeconds)
634-
635-
# Check if batch is full or last chunk
636-
if len(samples) < cfg.BATCH_SIZE and chunk_index < len(chunks) - 1:
637-
continue
638-
639-
# Predict
640-
p = predict(samples)
641-
642-
# Add to results
643-
for i in range(len(samples)):
644-
# Get timestamp
645-
s_start, s_end = timestamps[i]
646-
647-
# Get prediction
648-
pred = p[i]
649-
650-
if not cfg.LABELS:
651-
cfg.LABELS = [f"Species-{i}_Species-{i}" for i in range(len(pred))]
652-
653-
# Assign scores to labels
654-
p_labels = [
655-
p
656-
for p in zip(cfg.LABELS, pred, strict=True)
657-
if (cfg.TOP_N or p[1] >= cfg.MIN_CONFIDENCE) and (not cfg.SPECIES_LIST or p[0] in cfg.SPECIES_LIST)
658-
]
660+
for s_start, s_end, pred in iterate_audio_chunks(fpath):
661+
if not cfg.LABELS:
662+
cfg.LABELS = [f"Species-{i}_Species-{i}" for i in range(len(pred))]
659663

660-
# Sort by score
661-
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)
664+
# Assign scores to labels
665+
p_labels = [
666+
p for p in zip(cfg.LABELS, pred, strict=True) if (cfg.TOP_N or p[1] >= cfg.MIN_CONFIDENCE) and (not cfg.SPECIES_LIST or p[0] in cfg.SPECIES_LIST)
667+
]
662668

663-
if cfg.TOP_N:
664-
p_sorted = p_sorted[: cfg.TOP_N]
669+
# Sort by score
670+
p_sorted = sorted(p_labels, key=operator.itemgetter(1), reverse=True)
665671

666-
# TODO: hier schon top n oder min conf raussortieren
667-
# Store top 5 results and advance indices
668-
results[str(s_start) + "-" + str(s_end)] = p_sorted
672+
if cfg.TOP_N:
673+
p_sorted = p_sorted[: cfg.TOP_N]
669674

670-
# Clear batch
671-
samples = []
672-
timestamps = []
675+
# TODO: filter by top n or min confidence already at this point
676+
# Store sorted results for this timestamp and advance indices
677+
results[str(s_start) + "-" + str(s_end)] = p_sorted
673678

674679
except Exception as ex:
675680
# Write error log

birdnet_analyzer/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ def search_parser():
456456
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=parents)
457457
parser.add_argument("-q", "--queryfile", help="Path to the query file.")
458458
parser.add_argument("-o", "--output", help="Path to the output folder.")
459-
parser.add_argument("--n_results", default=10, help="Number of results to return.")
459+
parser.add_argument("--n_results", default=10, type=int, help="Number of results to return.")
460460

461461
# TODO: use choice argument.
462462
parser.add_argument(

birdnet_analyzer/embeddings/utils.py

Lines changed: 12 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from tqdm import tqdm
1313

1414
import birdnet_analyzer.config as cfg
15-
from birdnet_analyzer import audio, model, utils
16-
from birdnet_analyzer.analyze.utils import get_raw_audio_from_file
15+
from birdnet_analyzer import utils
16+
from birdnet_analyzer.analyze.utils import iterate_audio_chunks
1717
from birdnet_analyzer.embeddings.core import get_database
1818

1919
DATASET_NAME: str = "birdnet_analyzer_dataset"
@@ -30,18 +30,6 @@ def analyze_file(item, db: sqlite_usearch_impl.SQLiteUsearchDB):
3030
fpath: str = item[0]
3131
cfg.set_config(item[1])
3232

33-
offset = 0
34-
duration = cfg.FILE_SPLITTING_DURATION
35-
36-
try:
37-
fileLengthSeconds = int(audio.get_audio_file_length(fpath))
38-
except Exception as ex:
39-
# Write error log
40-
print(f"Error: Cannot analyze audio file {fpath}. File corrupt?\n", flush=True)
41-
utils.write_error_log(ex)
42-
43-
return
44-
4533
# Start time
4634
start_time = datetime.datetime.now()
4735

@@ -52,53 +40,17 @@ def analyze_file(item, db: sqlite_usearch_impl.SQLiteUsearchDB):
5240

5341
# Process each chunk
5442
try:
55-
while offset < fileLengthSeconds:
56-
chunks = get_raw_audio_from_file(fpath, offset, duration)
57-
start, end = offset, cfg.SIG_LENGTH + offset
58-
samples = []
59-
timestamps = []
60-
61-
for c in range(len(chunks)):
62-
# Add to batch
63-
samples.append(chunks[c])
64-
timestamps.append([start, end])
43+
for s_start, s_end, embeddings in iterate_audio_chunks(fpath, embeddings=True):
44+
# Check if embedding already exists
45+
existing_embedding = db.get_embeddings_by_source(DATASET_NAME, source_id, np.array([s_start, s_end]))
6546

66-
# Advance start and end
67-
start += cfg.SIG_LENGTH - cfg.SIG_OVERLAP
68-
end = start + cfg.SIG_LENGTH
47+
if existing_embedding.size == 0:
48+
# Store embeddings
49+
embeddings_source = hoplite.EmbeddingSource(DATASET_NAME, source_id, np.array([s_start, s_end]))
6950

70-
# Check if batch is full or last chunk
71-
if len(samples) < cfg.BATCH_SIZE and c < len(chunks) - 1:
72-
continue
73-
74-
# Prepare sample and pass through model
75-
data = np.array(samples, dtype="float32")
76-
e = model.embeddings(data)
77-
78-
# Add to results
79-
for i in range(len(samples)):
80-
# Get timestamp
81-
s_start, s_end = timestamps[i]
82-
83-
# Check if embedding already exists
84-
existing_embedding = db.get_embeddings_by_source(DATASET_NAME, source_id, np.array([s_start, s_end]))
85-
86-
if existing_embedding.size == 0:
87-
# Get prediction
88-
embeddings = e[i]
89-
90-
# Store embeddings
91-
embeddings_source = hoplite.EmbeddingSource(DATASET_NAME, source_id, np.array([s_start, s_end]))
92-
93-
# Insert into database
94-
db.insert_embedding(embeddings, embeddings_source)
95-
db.commit()
96-
97-
# Reset batch
98-
samples = []
99-
timestamps = []
100-
101-
offset = offset + duration
51+
# Insert into database
52+
db.insert_embedding(embeddings, embeddings_source)
53+
db.commit()
10254

10355
except Exception as ex:
10456
# Write error log
@@ -162,6 +114,7 @@ def create_file_output(output_path: str, db: sqlite_usearch_impl.SQLiteUsearchDB
162114
with open(target_path, "w") as f:
163115
f.write(",".join(map(str, embedding.tolist())))
164116

117+
165118
def run(audio_input, database, overlap, audio_speed, fmin, fmax, threads, batchsize, file_output):
166119
### Make sure to comment out appropriately if you are not using args. ###
167120

birdnet_analyzer/gui/embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,8 @@ def render_results(results, page, db_path, exports):
459459
index = i + page * PAGE_SIZE
460460
embedding_source = db.get_embedding_source(r.embedding_id)
461461
file = embedding_source.source_id
462-
offset = embedding_source.offsets[0] * settings["AUDIO_SPEED"]
463-
duration = 3 * settings["AUDIO_SPEED"]
462+
offset = embedding_source.offsets[0]
463+
duration = cfg.SIG_LENGTH * settings["AUDIO_SPEED"]
464464
spec = utils.spectrogram_from_file(
465465
file,
466466
offset=offset,

birdnet_analyzer/model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1201,12 +1201,14 @@ def embeddings(sample):
12011201

12021202
load_model(False)
12031203

1204+
sample = np.array(sample, dtype="float32")
1205+
12041206
# Reshape input tensor
12051207
INTERPRETER.resize_tensor_input(INPUT_LAYER_INDEX, [len(sample), *sample[0].shape])
12061208
INTERPRETER.allocate_tensors()
12071209

12081210
# Extract feature embeddings
1209-
INTERPRETER.set_tensor(INPUT_LAYER_INDEX, np.array(sample, dtype="float32"))
1211+
INTERPRETER.set_tensor(INPUT_LAYER_INDEX, sample)
12101212
INTERPRETER.invoke()
12111213

12121214
return INTERPRETER.get_tensor(OUTPUT_LAYER_INDEX)

birdnet_analyzer/search/core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def search(
6565
file = embedding_source.source_id
6666
filebasename = os.path.basename(file)
6767
filebasename = os.path.splitext(filebasename)[0]
68-
offset = embedding_source.offsets[0] * audio_speed
68+
offset = embedding_source.offsets[0]
6969
duration = cfg.SIG_LENGTH * audio_speed
7070
sig, rate = audio.open_audio_file(file, offset=offset, duration=duration, sample_rate=None)
7171
result_path = os.path.join(output, f"{r.sort_score:.5f}_{filebasename}_{offset}_{offset + duration}.wav")

birdnet_analyzer/search/utils.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,7 @@ def get_query_embedding(queryfile_path):
4949
else:
5050
sig_splits = audio.split_signal(sig, rate, cfg.SIG_LENGTH, cfg.SIG_OVERLAP, cfg.SIG_MINLEN)
5151

52-
samples = sig_splits
53-
data = np.array(samples, dtype="float32")
54-
55-
return model.embeddings(data)
52+
return model.embeddings(sig_splits)
5653

5754

5855
def get_search_results(

0 commit comments

Comments
 (0)