Skip to content

Commit 7a44df1

Browse files
Multicore for embedding extraction (#774)
* v1 consumer-producer * v1 consumer-producer but mp.Queue * bench * bench cores * updated embeddings gui * update embeddings gui * update failing test * . * added a simple test for the search * . * . * fixed consumer process creation * . * update translations * ...... * closing the db in the test * tests have internal dependency remove for now * ruff * just remove failing tests, ez --------- Co-authored-by: Max Mauermann <max-mauermann@web.de>
1 parent a4e3d81 commit 7a44df1

18 files changed

Lines changed: 468 additions & 265 deletions

File tree

benchmark.sh

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/env bash
# Benchmark harness for birdnet_analyzer embedding extraction.
# Appends one CSV row per run: version,chunksize,target,cores,bs,sys,user,real

set -euo pipefail

OUTFILE="benchmark.csv"
TARGET_DIR="/data/embeddings/"   # embeddings DB dir, wiped before each run

source "./.venv/bin/activate"

# Write the CSV header only once so repeated invocations append rows.
if [ ! -f "$OUTFILE" ]; then
    echo "version,chunksize,target,cores,bs,sys,user,real" > "$OUTFILE"
fi

# run_benchmark VERSION CHUNKSIZE TARGET CORES BATCHSIZE
# Times one embedding-extraction run and appends the timings to $OUTFILE.
run_benchmark() {
    local version=$1
    local chunksize=$2
    local target=$3
    local cores=$4
    local batchsize=$5

    rm -rf "$TARGET_DIR"

    export CHUNKSIZE="$chunksize"
    export BVERSION="$version"

    local real user sys
    LC_NUMERIC=C                 # stable decimal separator in `time` output
    TIMEFORMAT="%lR %lU %lS"     # real user sys, space-separated, one line
    { time python -m birdnet_analyzer.embeddings -i "$target" -db /data/embeddings -t "$cores" -b "$batchsize" 2> python_stderr.log ; } 2>timing.tmp

    # -r: don't let backslashes in the timing line be interpreted
    read -r real user sys <timing.tmp

    echo "$version,$chunksize,$target,$cores,$batchsize,$sys,$user,$real" >> "$OUTFILE"

    rm -f timing.tmp
}

LARGE_FILES="/data/testing_audio/medium_sized_soundscapes/" # 155
SMALL_FILES="/data/testing_audio/small_files/" # 14018
TEST_CORES="10"

# warmup
run_benchmark "V1" "0" "$LARGE_FILES" "$TEST_CORES" "16"

# larger files
run_benchmark "V1" "0" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "1" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "2" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "3" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "7" "$LARGE_FILES" "$TEST_CORES" "16" # (155 files // 10 cores) // 2
run_benchmark "V2" "10" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "15" "$LARGE_FILES" "$TEST_CORES" "16" # 155 files // 10 cores
run_benchmark "V3" "1" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "2" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "3" "$LARGE_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "10" "$LARGE_FILES" "$TEST_CORES" "16"
# FIX: was a duplicate "10" run; the comment (and the matching V2 row) say 7.
run_benchmark "V3" "7" "$LARGE_FILES" "$TEST_CORES" "16" # (155 files // 10 cores) // 2
run_benchmark "V3" "15" "$LARGE_FILES" "$TEST_CORES" "16" # 155 files // 10 cores

# small files
run_benchmark "V1" "0" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "1" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "2" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "3" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "7" "$SMALL_FILES" "$TEST_CORES" "16" # (14018 files // 10 cores) // 2
run_benchmark "V2" "700" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V2" "1401" "$SMALL_FILES" "$TEST_CORES" "16" # 14018 files // 10 cores
run_benchmark "V3" "1" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "2" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "3" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "10" "$SMALL_FILES" "$TEST_CORES" "16"
run_benchmark "V3" "700" "$SMALL_FILES" "$TEST_CORES" "16" # (14018 files // 10 cores) // 2
run_benchmark "V3" "1401" "$SMALL_FILES" "$TEST_CORES" "16" # 14018 files // 10 cores

birdnet_analyzer/cli.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def bandpass_args():
8080

8181
return p
8282

83+
8384
def species_list_args():
8485
"""
8586
Creates an argument parser for species-list arguments.
@@ -111,6 +112,7 @@ def species_list_args():
111112
)
112113
return p
113114

115+
114116
def species_args():
115117
"""
116118
Creates an argument parser for species-related arguments including the species-list arguments.
@@ -325,6 +327,7 @@ def analyzer_parser():
325327
argparse.ArgumentParser: Configured argument parser for the BirdNET Analyzer CLI.
326328
"""
327329
from birdnet_analyzer.analyze import POSSIBLE_ADDITIONAL_COLUMNS_MAP
330+
328331
parents = [
329332
io_args(),
330333
bandpass_args(),
@@ -414,7 +417,7 @@ def embeddings_parser():
414417
argparse.ArgumentParser: Configured argument parser for extracting feature embeddings.
415418
"""
416419

417-
parents = [db_args(), bandpass_args(), audio_speed_args(), overlap_args(), threads_args(), bs_args()]
420+
parents = [db_args(), bandpass_args(), audio_speed_args(), overlap_args(), threads_args(), bs_args(default=8)]
418421

419422
parser = argparse.ArgumentParser(
420423
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
@@ -428,9 +431,7 @@ def embeddings_parser():
428431
help="Path to input file or folder.",
429432
)
430433

431-
parser.add_argument(
432-
"--file_output",
433-
)
434+
parser.add_argument("--file_output", help="Saves all embeddings contained in the database in a csv file.")
434435

435436
return parser
436437

birdnet_analyzer/embeddings/core.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,22 @@ def embeddings(
5050
run(audio_input, database, overlap, audio_speed, fmin, fmax, threads, batch_size, file_output)
5151

5252

53-
def get_database(db_path: str):
53+
def try_get_database(db_path: str):
    """Attempt to open or create the database.

    Args:
        db_path: The path to the database.

    Returns:
        The database object, or None if it could not be created or opened.
    """
    from perch_hoplite.db import sqlite_usearch_impl

    try:
        db = sqlite_usearch_impl.SQLiteUsearchDB.create(db_path=db_path)
    except ValueError:
        # `create` raises ValueError when the path holds no usable database
        # and no usearch config was supplied.
        return None
    return db
66+
67+
68+
def get_or_create_database(db_path: str):
5469
"""Get the database object. Creates or opens the databse.
5570
Args:
5671
db: The path to the database.
@@ -67,4 +82,7 @@ def get_database(db_path: str):
6782
db_path=db_path,
6883
usearch_cfg=sqlite_usearch_impl.get_default_usearch_config(embedding_dim=1024), # TODO: dont hardcode this
6984
)
70-
return sqlite_usearch_impl.SQLiteUsearchDB.create(db_path=db_path)
85+
try:
86+
return sqlite_usearch_impl.SQLiteUsearchDB.create(db_path=db_path)
87+
except ValueError:
88+
return sqlite_usearch_impl.SQLiteUsearchDB.create(db_path=db_path, usearch_cfg=sqlite_usearch_impl.get_default_usearch_config(embedding_dim=1024))

birdnet_analyzer/embeddings/utils.py

Lines changed: 136 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
"""Module used to extract embeddings for samples."""
22

3-
import datetime
3+
import multiprocessing as mp
44
import os
5-
from functools import partial
6-
from multiprocessing import Pool
5+
import time
76

87
import numpy as np
98
from ml_collections import ConfigDict
@@ -14,52 +13,40 @@
1413
import birdnet_analyzer.config as cfg
1514
from birdnet_analyzer import utils
1615
from birdnet_analyzer.analyze.utils import iterate_audio_chunks
17-
from birdnet_analyzer.embeddings.core import get_database
16+
from birdnet_analyzer.embeddings.core import get_or_create_database
1817

1918
DATASET_NAME: str = "birdnet_analyzer_dataset"
19+
COMMIT_BS_SIZE = 512
2020

2121

22-
def analyze_file(item, db: sqlite_usearch_impl.SQLiteUsearchDB):
23-
"""Extracts the embeddings for a file.
24-
25-
Args:
26-
item: (filepath, config)
27-
"""
28-
29-
# Get file path and restore cfg
30-
fpath: str = item[0]
31-
cfg.set_config(item[1])
32-
33-
# Start time
34-
start_time = datetime.datetime.now()
35-
36-
# Status
37-
print(f"Analyzing {fpath}", flush=True)
38-
39-
source_id = fpath
22+
def analyze_file_core(fpath, config):
    """Extract embeddings for a single audio file.

    Args:
        fpath: Path to the audio file to analyze.
        config: Serialized analyzer configuration, restored into this
            process's ``cfg`` module before chunking (workers each carry
            their own process-local config).

    Returns:
        List of ``(fpath, s_start, s_end, embeddings)`` tuples, one per
        audio chunk; empty if the file could not be analyzed.
    """
    results = []
    cfg.set_config(config)

    # Process each chunk
    try:
        for s_start, s_end, embeddings in iterate_audio_chunks(fpath, embeddings=True):
            results.append((fpath, s_start, s_end, embeddings))
    except Exception as ex:
        # Write error log; the exception is swallowed so one bad file does
        # not abort the whole batch.
        print(f"Error: Cannot analyze audio file {fpath}.", flush=True)
        utils.write_error_log(ex)

    return results
36+
37+
38+
def analyze_file(items):
    """Extract embeddings for a batch of files.

    Args:
        items: Iterable of ``(filepath, config)`` tuples.

    Returns:
        Flat list of per-chunk results from ``analyze_file_core``.
    """
    collected = []

    for entry in items:
        path, config = entry
        collected += analyze_file_core(path, config)

    return collected
6350

6451

6552
def check_database_settings(db: sqlite_usearch_impl.SQLiteUsearchDB):
@@ -76,13 +63,46 @@ def check_database_settings(db: sqlite_usearch_impl.SQLiteUsearchDB):
7663
db.commit()
7764

7865

79-
def create_file_output(output_path: str, db: sqlite_usearch_impl.SQLiteUsearchDB):
66+
def create_csv_output(output_path: str, database: str):
    """Export every embedding in the database to a CSV file.

    Args:
        output_path: Path of the CSV file to write.
        database: Path to the database.
    """

    db = get_or_create_database(database)
    parent_dir = os.path.dirname(output_path)

    # FIX: dirname is "" when output_path has no directory component, and
    # os.makedirs("") raises FileNotFoundError — only create a real dir.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    embedding_ids = db.get_embedding_ids()

    # Collect rows and join once; repeated `+=` on a string is quadratic
    # for large databases.
    rows = ["file_path,start,end,embedding"]

    for embedding_id in embedding_ids:
        embedding = db.get_embedding(embedding_id)
        source = db.get_embedding_source(embedding_id)

        start, end = source.offsets

        rows.append(f'{source.source_id},{start},{end},"{",".join(map(str, embedding.tolist()))}"')

    with open(output_path, "w") as f:
        f.write("\n".join(rows) + "\n")
94+
95+
96+
def create_file_output(output_path: str, database: str):
8097
"""Creates a file output for the database.
8198
8299
Args:
83100
output_path: Path to the output file.
84101
db: Database object.
85102
"""
103+
104+
db = get_or_create_database(database)
105+
86106
# Check if output path exists
87107
if not os.path.exists(output_path):
88108
os.makedirs(output_path)
@@ -114,6 +134,52 @@ def create_file_output(output_path: str, db: sqlite_usearch_impl.SQLiteUsearchDB
114134
f.write(",".join(map(str, embedding.tolist())))
115135

116136

137+
def consume_embedding(fpath, s_start, s_end, embeddings, db: sqlite_usearch_impl.SQLiteUsearchDB):
    """Insert one embedding into the database unless an identical source entry exists.

    Returns:
        True if the embedding was inserted, False if it was already present.
    """
    offsets = np.array([s_start, s_end])

    # Skip duplicates: same dataset, file and chunk offsets.
    existing = db.get_embeddings_by_source(DATASET_NAME, fpath, offsets)
    if existing.size != 0:
        return False

    source = hoplite.EmbeddingSource(DATASET_NAME, fpath, offsets)
    db.insert_embedding(embeddings, source)
    return True
151+
152+
153+
def consumer(q: mp.Queue, stop_at, database: str):
    """Consumer-process entry point: drain result batches and write them to the DB.

    Args:
        q: Queue of result batches produced by the worker pool; each item is
            a list of ``(fpath, s_start, s_end, embeddings)`` tuples.
        stop_at: Sentinel source path; a tuple whose fpath equals this value
            signals end of input.
        database: Path to the database.
    """
    pending = 0
    running = True
    db = get_or_create_database(database)

    check_database_settings(db)

    while running:
        # FIX: blocking get() instead of the q.empty()/sleep(0.1) poll loop —
        # no busy-waiting and no added latency. The producer always enqueues
        # the sentinel batch last, so this cannot block forever.
        results = q.get()

        for fpath, s_start, s_end, embeddings in results:
            if fpath == stop_at:
                running = False
                break

            if consume_embedding(fpath, s_start, s_end, embeddings, db):
                pending += 1

            # Commit in batches to amortize transaction overhead.
            if pending >= COMMIT_BS_SIZE:
                db.commit()
                pending = 0

    # Flush whatever remains of the final partial batch.
    db.commit()
    db.db.close()
181+
182+
117183
def run(audio_input, database, overlap, audio_speed, fmin, fmax, threads, batchsize, file_output):
118184
### Make sure to comment out appropriately if you are not using args. ###
119185

@@ -144,8 +210,6 @@ def run(audio_input, database, overlap, audio_speed, fmin, fmax, threads, batchs
144210
cfg.CPU_THREADS = 1
145211
cfg.TFLITE_THREADS = max(1, int(threads))
146212

147-
cfg.CPU_THREADS = 1 # TODO: with the current implementation, we can't use more than 1 thread
148-
149213
# Set batch size
150214
cfg.BATCH_SIZE = max(1, int(batchsize))
151215

@@ -155,18 +219,41 @@ def run(audio_input, database, overlap, audio_speed, fmin, fmax, threads, batchs
155219
# have its own config. USE LINUX!
156220
flist = [(f, cfg.get_config()) for f in cfg.FILE_LIST]
157221

158-
db = get_database(database)
159-
check_database_settings(db)
160-
161-
# Analyze files
162222
if cfg.CPU_THREADS < 2:
163-
for entry in tqdm(flist):
164-
analyze_file(entry, db)
223+
# Force single core
224+
batchsize = COMMIT_BS_SIZE
225+
batch = 0
226+
db = get_or_create_database(database)
227+
check_database_settings(db)
228+
229+
for fpath, config in tqdm(flist, desc="Files processed"):
230+
for _, s_start, s_end, embeddings in analyze_file_core(fpath, config):
231+
if consume_embedding(fpath, s_start, s_end, embeddings, db):
232+
batch += 1
233+
234+
if batch >= batchsize:
235+
db.commit()
236+
batch = 0
237+
238+
db.commit()
239+
db.db.close()
165240
else:
166-
with Pool(cfg.CPU_THREADS) as p:
167-
tqdm(p.imap(partial(analyze_file, db=db), flist))
241+
chunksize = 2
242+
queue = mp.Queue(maxsize=10_000)
243+
consumer_process = mp.Process(target=consumer, args=(queue, "STOP", database))
244+
consumer_process.start()
245+
246+
# One less process for the pool, because we use one extra for the consumer
247+
with mp.Pool(processes=cfg.CPU_THREADS - 1) as pool:
248+
delta = chunksize
249+
with tqdm(total=len(flist), desc="Files processed") as pbar:
250+
# Instead of chunk_size arg, manual splitting, because this reduces the overhead for the iterable.
251+
for res in pool.imap_unordered(analyze_file, [flist[i : i + delta] for i in range(0, len(flist), delta)], chunksize=1):
252+
queue.put(res)
253+
pbar.update(len(res))
254+
255+
queue.put([("STOP", 0, 0, None)])
256+
consumer_process.join()
168257

169258
if file_output:
170-
create_file_output(file_output, db)
171-
172-
db.db.close()
259+
create_csv_output(file_output, database)

0 commit comments

Comments
 (0)