Skip to content

Commit 6fa6278

Browse files
authored
Make benchmark dataset scrubbing metadata-driven and unify HDF5/MFD load behavior (#653)
- Added metadata-controlled benchmark loader behavior via `load_behavior` with `LEGACY_SCRUB` and `NO_SCRUB`.
- Introduced `DataSetUtils.processDataSet(...)` as the central processing path and kept the old scrubbing path behind a deprecated compatibility method.
- Updated both `DataSetLoaderHDF5` and `DataSetLoaderMFD` to carry full `DataSetProperties` through loading instead of reducing metadata to only similarity.
- Removed HDF5 filename-based similarity inference and made curated dataset metadata authoritative.
- Expanded `dataset_metadata.yml` to include explicit `similarity_function` and `load_behavior` entries for curated HDF5 and MFD datasets.
- Added console reporting of the dataset similarity function so the effective metadata-supplied similarity is visible during indexing runs.
1 parent 18488b8 commit 6fa6278

17 files changed

Lines changed: 418 additions & 187 deletions

jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache,
236236
diagnostics.startMonitoring("testDirectory", workDirectory);
237237
diagnostics.startMonitoring("indexCache", Paths.get(indexCacheDir));
238238
diagnostics.capturePrePhaseSnapshot("Graph Build");
239+
System.out.printf("%s: Dataset similarity function is %s%n", ds.getName(), ds.getSimilarityFunction());
239240

240241
// Resolve build compressor (and label quant type) so we can record compute time
241242
VectorCompressor<?> buildCompressorObj = null;

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java

Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
package io.github.jbellis.jvector.example.benchmarks.datasets;
1818

19-
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
2019
import io.github.jbellis.jvector.vector.VectorizationProvider;
2120
import io.github.jbellis.jvector.vector.types.VectorFloat;
2221
import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
@@ -41,10 +40,9 @@
4140
/**
4241
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
4342
*
44-
* <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
45-
* {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
46-
* back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
47-
* If neither source provides a similarity function, an error is thrown.
43+
* <p>For curated benchmark datasets, properties are provided by
44+
* {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata
45+
* does not provide a similarity function, an error is thrown.
4846
*/
4947
public class DataSetLoaderHDF5 implements DataSetLoader {
5048
public static final Path HDF5_DIR = Path.of("hdf5");
@@ -57,19 +55,17 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
5755
*/
5856
public Optional<DataSetInfo> loadDataSet(String datasetName) {
5957
return maybeDownloadHdf5(datasetName).map(path -> {
60-
var props = getProperties(datasetName, path);
61-
var similarity = props.similarityFunction()
58+
var props = getProperties(datasetName);
59+
props.similarityFunction()
6260
.orElseThrow(() -> new IllegalArgumentException(
63-
"No similarity function found for HDF5 dataset: " + datasetName
64-
+ ". Either include -angular, -dot, or -euclidean in the filename,"
65-
+ " or add an entry in dataset_metadata.yml"));
66-
return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
61+
"No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName));
62+
return new DataSetInfo(props, () -> readHdf5Data(path, props));
6763
});
6864
}
6965

7066
/// Reads base vectors, query vectors, and ground truth from an HDF5 file
71-
/// and returns a scrubbed {@link DataSet}.
72-
private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
67+
/// and returns a {@link DataSet} using the configured dataset properties.
68+
private DataSet readHdf5Data(Path path, DataSetProperties props) {
7369
VectorFloat<?>[] baseVectors;
7470
VectorFloat<?>[] queryVectors;
7571
var gtSets = new ArrayList<List<Integer>>();
@@ -103,37 +99,19 @@ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunct
10399
}
104100
}
105101

106-
return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
102+
return DataSetUtils.processDataSet(
103+
path.getFileName().toString(),
104+
props,
105+
Arrays.asList(baseVectors),
106+
Arrays.asList(queryVectors),
107+
gtSets);
107108
}
108109

109-
/// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
110-
///
111-
/// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
112-
/// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
113-
/// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
114-
/// a minimal {@link DataSetProperties} with an empty similarity function is returned
115-
/// so that the caller can produce a clear error.
110+
/// Looks up dataset properties in {@code dataset_metadata.yml}.
116111
///
117112
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
118-
/// @param filename the resolved file path including the {@code .hdf5} extension
119-
/// @return the dataset properties
120-
private static DataSetProperties getProperties(String datasetName, Path filename) {
121-
String filenameStr = filename.toString();
122-
VectorSimilarityFunction inferred = null;
123-
if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
124-
inferred = VectorSimilarityFunction.COSINE;
125-
} else if (filenameStr.contains("-euclidean")) {
126-
inferred = VectorSimilarityFunction.EUCLIDEAN;
127-
}
128-
129-
// If filename inference succeeded, build properties with just the SF
130-
if (inferred != null) {
131-
return new DataSetProperties.PropertyMap(Map.of(
132-
DataSetProperties.KEY_NAME, datasetName,
133-
DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
134-
}
135-
136-
// Fall back to metadata YAML
113+
/// @return the dataset properties, or a minimal name-only property set if no entry exists
114+
private static DataSetProperties getProperties(String datasetName) {
137115
return metadata.getProperties(datasetName)
138116
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
139117
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
package io.github.jbellis.jvector.example.benchmarks.datasets;
1818

1919
import io.github.jbellis.jvector.example.util.SiftLoader;
20-
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
2120
import org.slf4j.Logger;
2221
import org.slf4j.LoggerFactory;
2322
import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
@@ -67,10 +66,10 @@ public Optional<DataSetInfo> loadDataSet(String fileName) {
6766
var props = metadata.getProperties(mfd.name)
6867
.orElseThrow(() -> new IllegalArgumentException(
6968
"No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
70-
var vsf = props.similarityFunction()
69+
props.similarityFunction()
7170
.orElseThrow(() -> new IllegalArgumentException(
7271
"No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
73-
return new DataSetInfo(props, () -> mfd.load(vsf));
72+
return new DataSetInfo(props, () -> mfd.load(props));
7473
});
7574
}
7675

@@ -204,15 +203,16 @@ public Iterable<Path> paths() {
204203
return List.of(basePath, queriesPath, groundTruthPath);
205204
}
206205

207-
/// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}.
206+
/// Reads the fvec/ivec files from disk and processes the dataset using the
207+
/// configured dataset properties.
208208
///
209-
/// @param similarityFunction the similarity function to associate with the dataset
210-
/// @return the loaded and scrubbed dataset
211-
public DataSet load(VectorSimilarityFunction similarityFunction) {
209+
/// @param props the dataset properties controlling similarity and load behavior
210+
/// @return the loaded dataset
211+
public DataSet load(DataSetProperties props) {
212212
var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
213213
var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
214214
var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
215-
return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors);
215+
return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors);
216216
}
217217

218218
public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public class DataSetMetadataReader {
5353
private final Map<String, Map<String, Object>> metadata;
5454

5555
private DataSetMetadataReader(Map<String, Map<String, Object>> metadata) {
56-
this.metadata = metadata;
56+
this.metadata = metadata != null ? metadata : Map.of();
5757
}
5858

5959
/// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}).
@@ -72,8 +72,7 @@ public static DataSetMetadataReader load() {
7272
@SuppressWarnings("unchecked")
7373
public static DataSetMetadataReader load(String file) {
7474
try (InputStream inputStream = new FileInputStream(file)) {
75-
Yaml yaml = new Yaml();
76-
Map<String, Map<String, Object>> data = yaml.load(inputStream);
75+
Map<String, Map<String, Object>> data = new Yaml().load(inputStream);
7776
return new DataSetMetadataReader(data);
7877
} catch (IOException e) {
7978
throw new RuntimeException("Failed to load dataset metadata from " + file, e);
@@ -82,22 +81,34 @@ public static DataSetMetadataReader load(String file) {
8281

8382
/// Looks up the {@link DataSetProperties} for a dataset by key.
8483
///
85-
/// The lookup tries the exact key first, then the key with {@code .hdf5} appended.
86-
/// The YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} with the dataset
87-
/// name injected. Properties not present in the YAML default to empty/false/zero.
84+
/// The lookup first tries the exact key. If that is not found, it also tries the
85+
/// corresponding key with or without the {@code .hdf5} suffix so that callers may
86+
/// use either form.
87+
///
88+
/// The matched YAML entry is wrapped in a {@link DataSetProperties.PropertyMap}
89+
/// with the requested dataset key injected as the dataset name when no explicit
90+
/// name is present. Properties not present in the YAML default to empty/false/zero.
8891
///
8992
/// @param datasetKey the dataset name or filename to look up
9093
/// @return the dataset properties if an entry exists, or empty if no entry is found
9194
public Optional<DataSetProperties> getProperties(String datasetKey) {
95+
return findEntry(datasetKey).map(entry -> {
96+
var props = new HashMap<>(entry);
97+
props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
98+
return new DataSetProperties.PropertyMap(props);
99+
});
100+
}
101+
102+
private Optional<Map<String, Object>> findEntry(String datasetKey) {
92103
Map<String, Object> entry = metadata.get(datasetKey);
93-
if (entry == null) {
94-
entry = metadata.get(datasetKey + ".hdf5");
104+
if (entry != null) {
105+
return Optional.of(entry);
95106
}
96-
if (entry == null) {
97-
return Optional.empty();
107+
108+
if (datasetKey.endsWith(".hdf5")) {
109+
return Optional.ofNullable(metadata.get(datasetKey.substring(0, datasetKey.length() - ".hdf5".length())));
98110
}
99-
var props = new HashMap<>(entry);
100-
props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
101-
return Optional.of(new DataSetProperties.PropertyMap(props));
111+
112+
return Optional.ofNullable(metadata.get(datasetKey + ".hdf5"));
102113
}
103114
}

jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@ public interface DataSetProperties {
5151
/// Canonical key for whether the dataset is free of duplicate vectors ({@link Boolean}).
5252
String KEY_IS_DUPLICATE_VECTOR_FREE = "is_duplicate_vector_free";
5353

54+
/// Canonical key for how benchmark loaders should treat the dataset at load time.
55+
String KEY_LOAD_BEHAVIOR = "load_behavior";
56+
57+
/**
58+
* Controls benchmark-loader behavior for this dataset.
59+
*
60+
* <p>LEGACY_SCRUB preserves current behavior (to be deprecated).
61+
* NO_SCRUB loads the dataset exactly as provided, without load-time scrubbing
62+
* or ground-truth remapping.
63+
*/
64+
enum LoadBehavior {
65+
LEGACY_SCRUB,
66+
NO_SCRUB
67+
}
68+
5469
/**
5570
* Returns the similarity function for this dataset.
5671
*
@@ -97,6 +112,18 @@ public interface DataSetProperties {
97112
*/
98113
public boolean isDuplicateVectorFree();
99114

115+
/**
116+
* Returns how benchmark loaders should treat this dataset at load time.
117+
*
118+
* <p>This is a loader policy, not a statement of dataset quality.
119+
* The default preserves legacy behavior.
120+
*
121+
* @return the benchmark loader behavior for this dataset
122+
*/
123+
default LoadBehavior loadBehavior() {
124+
return LoadBehavior.LEGACY_SCRUB;
125+
}
126+
100127
/**
101128
* A convenience method to capture the notion of a valid dataset.
102129
* As any additional qualifiers are added to this data carrier, this method should be updated accordingly.
@@ -222,5 +249,17 @@ public boolean isZeroVectorFree() {
222249
public boolean isDuplicateVectorFree() {
223250
return Boolean.TRUE.equals(properties.get(KEY_IS_DUPLICATE_VECTOR_FREE));
224251
}
252+
253+
@Override
254+
public LoadBehavior loadBehavior() {
255+
var value = properties.get(KEY_LOAD_BEHAVIOR);
256+
if (value instanceof LoadBehavior) {
257+
return (LoadBehavior) value;
258+
}
259+
if (value instanceof String) {
260+
return LoadBehavior.valueOf((String) value);
261+
}
262+
return LoadBehavior.LEGACY_SCRUB;
263+
}
225264
}
226265
}

0 commit comments

Comments
 (0)