
Commit 8c75f1b

jshook and tlwillke authored
Catalog-driven dataset loader (#654)
- Introduce a simplified catalog-driven dataset loader and fully switch benchmarking to use it.
- Disable/remove legacy dataset loaders and legacy scrubbing behavior; dataset metadata now uses NO_SCRUB.
- Add transport-routed remote loading with S3 support, shared client reuse, and parallel base/query/gt downloads.
- Support local catalog auto-discovery, base_url indirection, and YAML-driven dataset configuration.
- Cache included remote catalogs locally so previously downloaded remote datasets remain usable offline, while preserving local catalog override precedence.
- Improve loader robustness with better error handling, safer logging redaction, and more reliable dataset metadata path resolution across working directories.
- Expand test coverage for remote catalog loading, local-vs-remote precedence, offline cached-catalog behavior, and Windows env-var handling.
- Refactor dataset catalog/layout conventions, including public/protected cache subdirectories under dataset_cache.
- Refresh loader and dataset documentation, including local/remote behavior, benchmarking paths, and catalog examples.
- Clean up supporting project configuration, including .gitignore, RAT excludes, GitHub Actions dataset secret handling, and related benchmark YAML organization.

---------

Co-authored-by: Ted Willke <ted.willke@gmail.com>
1 parent 6fa6278 commit 8c75f1b

41 files changed

Lines changed: 2925 additions & 682 deletions


.github/workflows/run-bench.yml

Lines changed: 14 additions & 2 deletions
```diff
@@ -126,6 +126,20 @@ jobs:
           ref: ${{ matrix.branch }}
           fetch-depth: 0

+      # ==========================================
+      # Decode and write the protected dataset catalog
+      #
+      # TO UPDATE THIS SECRET:
+      # 1. On your local machine, run:
+      #    base64 -i jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
+      # 2. Go to GitHub Repo -> Settings -> Secrets and variables -> Actions
+      # 3. Update the PROTECTED_CATALOG_YAML secret with the new Base64 string.
+      # ==========================================
+      - name: Inject Protected Catalog
+        run: |
+          mkdir -p jvector-examples/yaml-configs/dataset-catalogs
+          echo "${{ secrets.PROTECTED_CATALOG_YAML }}" | base64 -d > jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
+
       # Create a directory to store benchmark results
       - name: Create results directory
         run: mkdir -p benchmark_results
@@ -137,8 +151,6 @@ jobs:
       # Run the benchmark if jvector-examples exists
       - name: Run benchmark
         id: run-benchmark
-        env:
-          DATASET_HASH: ${{ secrets.DATASETS_KEYPATH }}
         run: |
           # Check if jvector-examples directory and AutoBenchYAML class exist
           if [ ! -d "jvector-examples" ]; then
```
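The workflow comment above documents a manual secret-update procedure. Here is a quick local sketch of the encode/decode round trip, using a stand-in catalog file; reading stdin with `base64` is portable across GNU and BSD, while the `-i` flag in the comment is the macOS input-file form:

```shell
# Stand-in catalog file for the round-trip check (contents are illustrative)
printf 'my-dataset:\n  base: my_base_vectors.fvecs\n' > protected-catalog.yaml

# Encode: this Base64 text is what goes into the PROTECTED_CATALOG_YAML secret...
base64 < protected-catalog.yaml > catalog.b64

# ...and decode, which is what the "Inject Protected Catalog" step does in CI
base64 -d < catalog.b64 > roundtrip.yaml
cmp protected-catalog.yaml roundtrip.yaml && echo "round-trip OK"
```

If you use the GitHub CLI, `gh secret set PROTECTED_CATALOG_YAML < catalog.b64` updates the secret without visiting the Settings page.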

.gitignore

Lines changed: 11 additions & 0 deletions
```diff
@@ -3,10 +3,19 @@ local/
 .mvn/wrapper/maven-wrapper.jar
 .java-version
 .bob/
+dataset_
+**/local_datasets/**

 ### Bench caches
 pq_cache/
 index_cache/
+dataset_cache/
+
+### Data catalogs
+jvector-examples/yaml-configs/dataset-catalogs/*.yaml
+jvector-examples/yaml-configs/dataset-catalogs/*.yml
+!jvector-examples/yaml-configs/dataset-catalogs/public-catalog.yaml
+jvector-examples/yaml-configs/dataset-catalogs/.catalog-cache/

 ### Logging (or whatever you use)
 logging/
@@ -49,3 +58,5 @@ hdf5/
 # JMH generated files
 dependency-reduced-pom.xml
 results.csv
+**/datasets/custom/**
+**/dataset_cache/**
```
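The catalog ignore rules above rely on gitignore ordering: the `*.yaml`/`*.yml` wildcards exclude all catalogs, and the later `!...public-catalog.yaml` negation re-includes only the public one. A throwaway repo makes it easy to verify a pattern stack like this (paths simplified for the sketch):

```shell
# Throwaway repo to check gitignore negation ordering
repo=$(mktemp -d); cd "$repo"; git init -q
mkdir -p catalogs
printf 'catalogs/*.yaml\n!catalogs/public-catalog.yaml\n' > .gitignore
touch catalogs/public-catalog.yaml catalogs/protected-catalog.yaml

# check-ignore prints only the paths that are ignored:
# the protected catalog is listed, the public one is not
git check-ignore catalogs/public-catalog.yaml catalogs/protected-catalog.yaml
```

Note that negation only works if no parent directory of the file is itself ignored, which is why the rules above exclude `*.yaml` files rather than the whole `dataset-catalogs/` directory.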

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -74,7 +74,7 @@ You may also use method-level filtering and patterns, e.g.,
 (The `failIfNoSpecifiedTests` option works around a quirk of surefire: it is happy to run `test` with submodules with empty test sets,
 but as soon as you supply a filter, it wants at least one match in every submodule.)

-You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `fvec` and `hdf5` directories.
+You can run `SiftSmall` and `Bench` directly to get an idea of what all is going on here. `Bench` will automatically download required datasets to the `dataset_cache` directory.
 The files used by `SiftSmall` can be found in the [siftsmall directory](./siftsmall) in the project root.

 To run either class, you can use the Maven exec-plugin via the following incantations:
```

docs/benchmarking.md

Lines changed: 42 additions & 55 deletions
````diff
@@ -4,21 +4,19 @@ JVector comes with a built-in benchmarking system in `jvector-examples/.../Bench

 To run a benchmark
 - Decide which dataset(s) you want to benchmark. A dataset consists of
-  - The vectors to be indexed, usually called the "base" or "target" vectors.
-  - The query vectors.
-  - The "ground truth" results which are used to compute accuracy metrics.
-  - The similarity metric which should have been used to compute the ground truth (dot product, cosine similarity or L2 distance).
-- Configure the parameter combinations for which you want to run the benchmark. This includes graph index parameters, quantization parameters and search parameters.
+  - The vectors to be indexed, usually called the "base" or "target" vectors
+  - The query vectors
+  - The "ground truth" results that are used to compute accuracy metrics
+  - The similarity metric used to compute the ground truth (dot product, cosine similarity or L2 distance)
+- Configure the parameter combinations for which you want to run the benchmark. This includes index construction parameters, quantization parameters and search parameters.

-JVector supports two types of datasets:
-- **Fvec/Ivec**: The dataset consists of three files, for example `base.fvec`, `queries.fvec` and `neighbors.ivec` containing the base vectors, query vectors, and ground truth. (`fvec` and `ivec` file formats are described [here](http://corpus-texmex.irisa.fr/))
-- **HDF5**: The dataset consists of a single HDF5 file with three datasets labelled `train`, `test` and `neighbors`, representing the base vectors, query vectors and the ground truth.
+JVector supports datasets in the fvecs/ivecs format. These consist of three files, for example `base.fvecs`, `queries.fvecs` and `neighbors.ivecs` containing the base vectors, query vectors, and ground truth. (`fvecs` and `ivecs` file formats are described [here](http://corpus-texmex.irisa.fr/))

 The general procedure for running benchmarks is outlined below. The following sections describe the process in more detail.
 - [Specify the dataset](#specifying-datasets) names to benchmark in `datasets.yml`.
   - Certain datasets will be downloaded automatically. If using a different dataset, make sure the dataset files are downloaded and made available (refer to the section on [Custom datasets](#custom-datasets)).
-- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets to be benchmarked. You can specify custom parameters for a specific dataset by creating a file called `<your-dataset-name>.yml` in the same folder.
-- Decide on the kind of measurements and logging you want and configure them in `run.yml`.
+- Adjust the benchmark parameters in `default.yml`. This will affect the parameters for all datasets benchmarked. You can specify custom parameters for a specific dataset by creating a file called `<your-dataset-name>.yml` in the `index-parameters` subfolder.
+- Decide on the kind of measurements and logging you want and configure them in `run-config.yml`.

 You can run the configured benchmark with maven:
 ```sh
````
````diff
@@ -31,31 +29,28 @@ The datasets you want to benchmark should be specified in `jvector-examples/yaml

 To benchmark a single dataset, comment out the entries corresponding to all other datasets. (Or provide command line arguments as described in [Running `bench` from the command line](#running-bench-from-the-command-line))

-Datasets are assumed to be Fvec/Ivec based unless the entry in the `datasets.yml` ends with `.hdf5`. In this case, `.hdf5` is not considered part of the "dataset name" referenced in other sections.
+Datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system.

-You'll notice that datasets are grouped into categories. The categories can be arbitrarily chosen for convenience and are not currently considered by the benchmarking system.
-
-For HDF5 files, the substrings `-angular`, `-euclidean` and `-dot` correspond to cosine similarity, L2 distance, and dot product similarity functions (these substrings ARE considered to be part of the "dataset name"). Currently, Fvec/Ivec datasets are implicitly assumed to use cosine similarity (changing this requires editing `DataSetLoaderMFD.java`).
+Dataset similarity functions are configured in `jvector-examples/yaml-configs/dataset-metadata.yml`.

 Example `datasets.yml`:

 ```yaml
 category0:
-  - my-fvec-dataset # fvec/ivec dataset, cosine similarity
-  - my-hdf5-dataset-angular.hdf5 # hdf5 dataset, cosine similarity
+  - my-dataset-a
+  - my-dataset-b
 some-other-category:
-  - a-huge-dataset-1024d-euclidean.hdf5 # hdf5 dataset, L2 similarity
-  - my-simple-dataset-dot.hdf5 # hdf5 dataset, dot product similarity
-  - some-dataset-euclidean # fvec/ivec dataset, cosine similarity (NOT L2 unless you change the code!)
+  - another-dataset-a
+  - another-dataset-b
 ```

 ## Setting benchmark parameters

 ### default.yml / \<dataset-name\>.yml

-`jvector-examples/yaml-configs/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets.
+`jvector-examples/yaml-configs/index-parameters/default.yml` specifies the default index construction and search parameters to be used by `bench` for all datasets.

-You can specify a custom set of parameters for any given dataset by creating a file called `<dataset-name>.yml`, with `<dataset-name>` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`, but without the `.hdf5` suffix for hdf5 datasets. The format of this file is exactly the same as `default.yml`.
+You can specify a custom set of parameters for any given dataset by creating a file called `<dataset-name>.yml`, with `<dataset-name>` replaced by the actual name of the dataset. This is the same as the identifier used in `datasets.yml`. The format of this file is exactly the same as `default.yml`.

 Refer to `default.yml` for a list of all options.

````
````diff
@@ -67,15 +62,15 @@ construction:
 ```
 will build and benchmark four graphs, one for each combination of M and ef in {(32, 100), (64, 100), (32, 200), (64, 200)}. This is particularly useful when running a grid search to identify the best performing parameters.

-### run.yml
+### run-config.yml

 This file contains configurations for
 - Specifying the measurements you want to report, like QPS, latency and recall
 - Specifying where to output these measurements, i.e. to the console, or to a file, or both.

 The configurations in this file are "run-level", meaning that they are shared across all the datasets being benchmarked.

-See `run.yml` for a full list of all options.
+See `run-config.yml` for a full list of all options.

 ## Running `bench` from the command line

````
````diff
@@ -86,45 +81,37 @@ mvn compile exec:exec@bench -pl jvector-examples -am

 To benchmark a subset of the datasets in `datasets.yml`, you can provide a space-separated list of regexes as arguments.
 ```sh
-# matches `glove-25-angular.hdf5`, `glove-50-angular.hdf5`, `nytimes-256-angular.hdf5` etc
+# matches `glove-25-angular`, `glove-50-angular`, `nytimes-256-angular` etc
 mvn compile exec:exec@bench -pl jvector-examples -am -DbenchArgs="glove nytimes"
 ```

 ## Custom Datasets

-### Custom Fvec/Ivec datasets
-
-Using fvec/ivec datasets requires them to be configured in `DataSetLoaderMFD.java`. Some datasets are already pre-configured; these will be downloaded and used automatically on running the benchmark.
-
-To use a custom dataset consisting of files `base.fvec`, `queries.fvec` and `neighbors.ivec`, do the following:
-- Ensure that you have three files:
-  - `base.fvec` containing N D-dimensional float vectors. These are used to build the index.
-  - `queries.fvec` containing Q D-dimensional float vectors. These are used for querying the built index.
-  - `neighbors.ivec` containing Q K-dimensional integer vectors, one for each query vector, representing the exact K-nearest neighbors for that query among the base vectors.
-  The files can be named however you like.
-- Save all three files somewhere in the `fvec` directory in the root of the `jvector` repo (if it doesn't exist, create it). It's recommended to create at least one sub-folder with the name of the dataset and copy or move all three files there.
-- Edit `DataSetLoaderMFD.java` to configure a new dataset and its associated files:
-  ```java
-  put("cust-ds", new MultiFileDatasource("cust-ds",
-      "cust-ds/base.fvec",
-      "cust-ds/query.fvec",
-      "cust-ds/neighbors.ivec"));
+Datasets are configured via YAML catalog files under `jvector-examples/yaml-configs/dataset-catalogs/`. The loader recursively discovers all `.yaml`/`.yml` files in that directory tree. See `jvector-examples/yaml-configs/dataset-catalogs/local-catalog.yaml` for the full format reference.
+
+To add a custom fvecs/ivecs dataset:
+
+1. Add a `.yaml` file to the YAML catalog directory, mapping your dataset name to its files:
+   ```yaml
+   _defaults:
+     cache_dir: ${DATASET_CACHE_DIR:-dataset_cache}
+
+   my-dataset:
+     base: my_base_vectors.fvecs
+     query: my_query_vectors.fvecs
+     gt: my_ground_truth.ivecs
+   ```
+2. Place your fvecs/ivecs files at the paths you specified in the YAML (or specify a `cache_dir` / `base_url` to fetch them from a remote source).
+3. Add the dataset's similarity function to `jvector-examples/yaml-configs/dataset-metadata.yml`:
+   ```yaml
+   my-dataset:
+     similarity_function: COSINE
+     load_behavior: NO_SCRUB
 ```
-The file paths are resolved relative to the `fvec` directory. `cust-ds` is the name of the dataset and can be changed to whatever is appropriate.
-- In `jvector-examples/yaml-configs/datasets.yml`, add an entry corresponding to your custom dataset. Comment out other datasets which you do not want to benchmark.
+4. Add the dataset name to `jvector-examples/yaml-configs/datasets.yml` so BenchYAML can find it:
 ```yaml
 custom:
-  - cust-ds
+  - my-dataset
 ```

-## Custom HDF5 datasets
-
-HDF5 datasets consist of a single file. The Hdf5Loader looks for three HDF5 datasets within the file, `train`, `test` and `neighbors`. These correspond to the base, query and neighbors vectors described above for fvec/ivec files.
-
-To use an HDF5 dataset, edit `jvector-examples/yaml-configs/datasets.yml` to add an entry like the following:
-```yaml
-category:
-  - <dataset-name>.hdf5
-```
-
-BenchYAML looks for hdf5 datasets with the name `<dataset-name>.hdf5` in the `hdf5` folder in the root of this repo. If the file doesn't exist, BenchYAML will attempt to automatically download the dataset from ann-benchmarks.com. If your dataset is not from ann-benchmarks.com, simply ensure that the dataset is available in the `hdf5` folder and edit `datasets.yml` accordingly.
+For remote datasets, use `base_url` to specify where files should be downloaded from. The `${VAR}` and `${VAR:-default}` syntax is supported for environment variable expansion. See the example config for details.
````
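The `${VAR}` / `${VAR:-default}` forms in the catalog mirror shell parameter expansion, which makes it easy to sanity-check the value a catalog entry will resolve to from a terminal; the snippet below demonstrates plain shell semantics with the variable name from the catalog example, not the JVector loader itself:

```shell
# Variable unset: the fallback after ':-' is used
unset DATASET_CACHE_DIR
echo "cache_dir = ${DATASET_CACHE_DIR:-dataset_cache}"
# prints: cache_dir = dataset_cache

# Variable set: the environment value wins over the default
DATASET_CACHE_DIR=/mnt/bench/cache
echo "cache_dir = ${DATASET_CACHE_DIR:-dataset_cache}"
# prints: cache_dir = /mnt/bench/cache
```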

jvector-examples/README.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -11,8 +11,8 @@ A simple benchmark for the sift dataset located in the [siftsmall](./siftsmall)
 Performs grid search across the `GraphIndexBuilder` parameter space to find
 the best tradeoffs between recall and throughput.

-This benchmark requires datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to hdf5 and fvec
-directories `hdf5` or `fvec` under the project root depending on the dataset format.
+This benchmark requires `fvecs` versions of datasets from [https://github.com/erikbern/ann-benchmarks](https://github.com/erikbern/ann-benchmarks/blob/main/README.md#data-sets) to be downloaded to the `dataset_cache`
+directory under the project root.

 You can use [`plot_output.py`](./plot_output.py) to graph the [pareto-optimal points](https://en.wikipedia.org/wiki/Pareto_efficiency) found by `Bench`.
```

jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java

Lines changed: 2 additions & 2 deletions
```diff
@@ -94,11 +94,11 @@ public static void main(String[] args) throws IOException {
             RunConfig runCfg = RunConfig.loadDefault();
             artifacts = RunArtifacts.open(runCfg, allConfigs);
         } catch (java.io.FileNotFoundException e) {
-            // Legacy yamlSchemaVersion "0" behavior: no run.yml
+            // Legacy yamlSchemaVersion "0" behavior: no run-config.yml
             // - logging disabled
             // - console shows compute selection
             // - compute selection comes from legacy search.benchmarks if present, else default
-            System.err.println("WARNING: run.yml not found. Falling back to deprecated legacy behavior: "
+            System.err.println("WARNING: run-config.yml not found. Falling back to deprecated legacy behavior: "
                     + "no logging, console mirrors computed benchmarks.");

             Map<String, List<String>> legacyBenchmarks = null;
```

jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java

Lines changed: 3 additions & 4 deletions
```diff
@@ -16,7 +16,7 @@

 package io.github.jbellis.jvector.example;

-import io.github.jbellis.jvector.example.benchmarks.datasets.DataSetLoaderMFD;
+import io.github.jbellis.jvector.example.benchmarks.datasets.DataSets;
 import io.github.jbellis.jvector.example.reporting.RunArtifacts;
 import io.github.jbellis.jvector.example.yaml.MultiConfig;
 import io.github.jbellis.jvector.example.yaml.RunConfig;
@@ -36,9 +36,8 @@ public static void main(String[] args) throws IOException {
         // Run-level policy config (benchmarks/console/logging + run metadata)
         RunConfig runCfg = RunConfig.loadDefault();

-        // Load dataset
-        var ds = new DataSetLoaderMFD().loadDataSet(datasetName)
-                .orElseThrow(() -> new RuntimeException("dataset " + datasetName + " not found"))
+        var ds = DataSets.loadDataSet(datasetName).orElseThrow(
+                () -> new RuntimeException("dataset " + datasetName + " not found"))
                 .getDataSet();

         // Run artifacts + selections (sys_info/dataset_info/experiments.csv)
```
