Skip to content

Commit f977752

Browse files
committed
Add blocked bloom filter for approximate matching
1 parent ee50e30 commit f977752

26 files changed

Lines changed: 808 additions & 29 deletions

Src/FastData.Generator.CPlusPlus/CPlusPlusCodeGenerator.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ protected override void AppendFooter<T>(StringBuilder sb, GeneratorConfig<T> gen
6969
SingleValueContext<TKey, TValue> x => new SingleValueCode<TKey, TValue>(x, Shared),
7070
RangeContext<TKey, TValue> x => new RangeCode<TKey, TValue>(x),
7171
BitSetContext<TKey, TValue> x => new BitSetCode<TKey, TValue>(x, Shared),
72+
BloomFilterContext<TKey, TValue> x => new BloomFilterCode<TKey, TValue>(x),
7273
ArrayContext<TKey, TValue> x => new ArrayCode<TKey, TValue>(x, Shared),
7374
BinarySearchContext<TKey, TValue> x => new BinarySearchCode<TKey, TValue>(x, Shared),
7475
ConditionalContext<TKey, TValue> x => new ConditionalCode<TKey, TValue>(x, Shared),
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
using Genbox.FastData.Generator.CPlusPlus.Internal.Framework;
2+
using Genbox.FastData.Generator.Enums;
3+
using Genbox.FastData.Generator.Extensions;
4+
using Genbox.FastData.Generators.Contexts;
5+
6+
namespace Genbox.FastData.Generator.CPlusPlus.Internal.Generators;
7+
8+
/// <summary>
/// C++ output writer for a bloom-filter structure. It emits a <c>bloom</c> table of 64-bit words taken from
/// <c>ctx.BitSet</c>, the hash source, and a public <c>contains</c> method that tests two bits of one table word.
/// </summary>
internal sealed class BloomFilterCode<TKey, TValue>(BloomFilterContext<TKey, TValue> ctx) : CPlusPlusOutputWriter<TKey>
9+
{
10+
// Builds the C++ snippet as one raw interpolated string; everything inside the $$""" ... """ literal is emitted text.
// Emitted lookup: hash the key, select the word at the index expression returned by
// GetModFunction("hash", ctx.BitSet.Length), derive two bit positions from hash bits 0-5 and 8-13,
// and return true only when both bits are set in that word.
// NOTE(review): the two bit positions must match whatever scheme filled ctx.BitSet - confirm against the structure builder.
public override string Generate() =>
11+
$$"""
12+
{{GetFieldModifier(true)}}std::array<uint64_t, {{ctx.BitSet.Length.ToStringInvariant()}}> bloom = {
13+
{{FormatColumns(ctx.BitSet, ToValueLabel)}}
14+
};
15+
16+
{{HashSource}}
17+
18+
public:
19+
{{MethodAttribute}}
20+
{{GetMethodModifier(true)}}bool contains(const {{KeyTypeName}} {{InputKeyName}}){{PostMethodModifier}} {
21+
{{GetMethodHeader(MethodType.Contains)}}
22+
23+
const {{HashSizeType}} hash = get_hash({{LookupKeyName}});
24+
const {{ArraySizeType}} index = {{GetModFunction("hash", (ulong)ctx.BitSet.Length)}};
25+
const uint32_t shift1 = static_cast<uint32_t>(hash) & 63u;
26+
const uint32_t shift2 = static_cast<uint32_t>(hash >> 8) & 63u;
27+
const uint64_t mask = (1ULL << shift1) | (1ULL << shift2);
28+
return (bloom[index] & mask) == mask;
29+
}
30+
""";
31+
}

Src/FastData.Generator.CSharp/CSharpCodeGenerator.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ protected override void AppendFooter<T>(StringBuilder sb, GeneratorConfig<T> gen
7979
SingleValueContext<TKey, TValue> x => new SingleValueCode<TKey, TValue>(x, _cfg, Shared),
8080
RangeContext<TKey, TValue> x => new RangeCode<TKey, TValue>(x, _cfg),
8181
BitSetContext<TKey, TValue> x => new BitSetCode<TKey, TValue>(x, _cfg, Shared),
82+
BloomFilterContext<TKey, TValue> x => new BloomFilterCode<TKey, TValue>(x, _cfg),
8283
ArrayContext<TKey, TValue> x => new ArrayCode<TKey, TValue>(x, _cfg, Shared),
8384
BinarySearchContext<TKey, TValue> x => new BinarySearchCode<TKey, TValue>(x, _cfg, Shared),
8485
ConditionalContext<TKey, TValue> x => new ConditionalCode<TKey, TValue>(x, _cfg, Shared),
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
using Genbox.FastData.Generator.CSharp.Internal.Framework;
2+
using Genbox.FastData.Generator.Enums;
3+
using Genbox.FastData.Generators.Contexts;
4+
5+
namespace Genbox.FastData.Generator.CSharp.Internal.Generators;
6+
7+
/// <summary>
/// C# output writer for a bloom-filter structure. It emits a <c>_bloom</c> array of <c>ulong</c> words taken from
/// <c>ctx.BitSet</c>, the hash source, and a <c>Contains</c> method that tests two bits of one array element.
/// </summary>
internal sealed class BloomFilterCode<TKey, TValue>(BloomFilterContext<TKey, TValue> ctx, CSharpCodeGeneratorConfig cfg) : CSharpOutputWriter<TKey>(cfg)
8+
{
9+
// Builds the C# snippet as one raw interpolated string; everything inside the $$""" ... """ literal is emitted text.
// Emitted lookup: hash the key, select the element at the index expression returned by
// GetModFunction("hash", ctx.BitSet.Length), derive two bit positions from hash bits 0-5 and 8-13
// (cast to int in the emitted code before being used as shift counts), and return true only when
// both bits are set in that element.
// NOTE(review): the two bit positions must match whatever scheme filled ctx.BitSet - confirm against the structure builder.
public override string Generate() =>
10+
$$"""
11+
{{FieldModifier}}ulong[] _bloom = new ulong[] {
12+
{{FormatColumns(ctx.BitSet, ToValueLabel)}}
13+
};
14+
15+
{{HashSource}}
16+
17+
{{MethodAttribute}}
18+
{{MethodModifier}}bool Contains({{KeyTypeName}} {{InputKeyName}})
19+
{
20+
{{GetMethodHeader(MethodType.Contains)}}
21+
22+
ulong hash = Hash({{LookupKeyName}});
23+
{{ArraySizeType}} index = {{GetModFunction("hash", (ulong)ctx.BitSet.Length)}};
24+
uint shift1 = (uint)hash & 63u;
25+
uint shift2 = (uint)(hash >> 8) & 63u;
26+
ulong mask = (1UL << (int)shift1) | (1UL << (int)shift2);
27+
return (_bloom[index] & mask) == mask;
28+
}
29+
""";
30+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
using Genbox.FastData.Generator.Enums;
2+
using Genbox.FastData.Generator.Extensions;
3+
using Genbox.FastData.Generator.Rust.Internal.Framework;
4+
using Genbox.FastData.Generators.Contexts;
5+
6+
namespace Genbox.FastData.Generator.Rust.Internal.Generators;
7+
8+
/// <summary>
/// Rust output writer for a bloom-filter structure. It emits a <c>BLOOM</c> array of <c>u64</c> words taken from
/// <c>ctx.BitSet</c>, the hash source, and a <c>contains</c> function that tests two bits of one array element.
/// </summary>
internal sealed class BloomFilterCode<TKey, TValue>(BloomFilterContext<TKey, TValue> ctx) : RustOutputWriter<TKey>
9+
{
10+
// Builds the Rust snippet as one raw interpolated string; everything inside the $$""" ... """ literal is emitted text.
// The key parameter type comes from GetKeyTypeName, called with true when TKey is not a primitive type.
// Emitted lookup: call Self::get_hash inside an unsafe block, select the element at the index expression returned by
// GetModFunction("hash", ctx.BitSet.Length), derive two bit positions from hash bits 0-5 and 8-13,
// and evaluate to true only when both bits are set in that element.
// NOTE(review): the two bit positions must match whatever scheme filled ctx.BitSet - confirm against the structure builder.
public override string Generate() =>
11+
$$"""
12+
{{FieldModifier}}BLOOM: [u64; {{ctx.BitSet.Length.ToStringInvariant()}}] = [
13+
{{FormatColumns(ctx.BitSet, ToValueLabel)}}
14+
];
15+
16+
{{HashSource}}
17+
18+
{{MethodAttribute}}
19+
{{MethodModifier}}fn contains({{InputKeyName}}: {{GetKeyTypeName(!typeof(TKey).IsPrimitive)}}) -> bool {
20+
{{GetMethodHeader(MethodType.Contains)}}
21+
22+
let hash = unsafe { Self::get_hash({{LookupKeyName}}) };
23+
let index = {{GetModFunction("hash", (ulong)ctx.BitSet.Length)}};
24+
let shift1 = (hash as u32) & 63;
25+
let shift2 = ((hash >> 8) as u32) & 63;
26+
let mask = (1u64 << shift1) | (1u64 << shift2);
27+
(Self::BLOOM[index as usize] & mask) == mask
28+
}
29+
""";
30+
}

Src/FastData.Generator.Rust/RustCodeGenerator.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ protected override void AppendFooter<T>(StringBuilder sb, GeneratorConfig<T> gen
6464
SingleValueContext<TKey, TValue> x => new SingleValueCode<TKey, TValue>(x, Shared),
6565
RangeContext<TKey, TValue> x => new RangeCode<TKey, TValue>(x),
6666
BitSetContext<TKey, TValue> x => new BitSetCode<TKey, TValue>(x, Shared),
67+
BloomFilterContext<TKey, TValue> x => new BloomFilterCode<TKey, TValue>(x),
6768
ArrayContext<TKey, TValue> x => new ArrayCode<TKey, TValue>(x, Shared),
6869
BinarySearchContext<TKey, TValue> x => new BinarySearchCode<TKey, TValue>(x, Shared),
6970
ConditionalContext<TKey, TValue> x => new ConditionalCode<TKey, TValue>(x, Shared),

Src/FastData.InternalShared/Helpers/TestHelper.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ private static GeneratorSpec GenerateInternal<TKey, TValue>(Func<string, ICodeGe
248248
return Generate(state, new BitSetStructure<TKey, TValue>((NumericKeyProperties<TKey>)props, keyType));
249249
if (vector.Type == typeof(HashTableCompactStructure<,>))
250250
return Generate(state, new HashTableCompactStructure<TKey, TValue>(GetHashData(keySpan, keyType, generator.Encoding), keyType));
251+
if (vector.Type == typeof(BloomFilterStructure<,>))
252+
return Generate(state, new BloomFilterStructure<TKey, TValue>(GetHashData(keySpan, keyType, generator.Encoding)));
251253

252254
throw new InvalidOperationException("Unsupported structure type: " + vector.Type.Name);
253255
}

Src/FastData.InternalShared/Helpers/TestVectorHelper.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ public static IEnumerable<ITestVector> GetTestVectors()
7878

7979
foreach (ITestVector testVector in GenerateTestVectors(GetDataOfSize(100), null,
8080
typeof(ArrayStructure<,>),
81+
typeof(BloomFilterStructure<,>),
8182
typeof(BinarySearchStructure<,>),
8283
typeof(ConditionalStructure<,>),
8384
typeof(HashTableStructure<,>),
@@ -96,6 +97,7 @@ public static IEnumerable<ITestVector> GetTestVectors()
9697
// Strings with characters that are not in the ASCII range
9798
foreach (ITestVector testVector in GenerateTestVectors([["æ", "à", "ä", "ö", "ü", "ß", "é", "è", "ê", "ç", "ñ", "ø", "å"]], "non_ascii",
9899
typeof(ArrayStructure<,>),
100+
typeof(BloomFilterStructure<,>),
99101
typeof(BinarySearchStructure<,>),
100102
typeof(ConditionalStructure<,>),
101103
typeof(HashTableCompactStructure<,>),
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// This file is auto-generated. Do not edit manually.
2+
// Structure: BloomFilter
3+
#pragma once
4+
#include <array>
5+
#include <cstring>
6+
#include <cstdint>
7+
#include <limits>
8+
#include <string_view>
9+
10+
/// Approximate-membership filter for 100 int32 keys in [min_key, max_key].
///
/// `contains` may return true for a key that was never stored (false
/// positive); a `false` result means the key's probe bits are not set in the
/// table. All members are static, so the class is used without an instance.
class BloomFilterStructure_Int32_100 final {
    /// Filter table: 23 buckets of 64 bits each. A key probes exactly one
    /// bucket (hash % 23) and up to two bit positions inside it.
    static constexpr std::array<uint64_t, 23> bloom = {
        70369021001761ull, 140738042003523ull, 281476084007045ull, 562952168014089ull, 1125904336028177ull,
        2251808672056353ull, 4503617344112705ull, 9007234688225409ull, 18014400656974081ull, 36028801313948161ull,
        72057602627896321ull, 144115205255792641ull, 288230410511585281ull, 576460821023170561ull, 1152921642046341121ull,
        2305843284092682241ull, 4611686568185364481ull, 9223373136370728961ull, 2199031906305ull, 4398063812611ull,
        8796127625221ull, 17592255250441ull, 35184510500881ull
    };

    /// Identity hash: the key value itself, widened to 64 bits.
    static constexpr uint64_t get_hash(const int32_t value) noexcept
    {
        return static_cast<uint64_t>(value);
    }

public:
    /// Returns true when both probe bits for `key` are set in its bucket.
    ///
    /// @param key value to test.
    /// @return false if `key` is definitely not stored, true if it may be.
    [[nodiscard]]
    static constexpr bool contains(const int32_t key) noexcept {
        // 4294967168u == 0xFFFFFF80: any bit above bit 6 means the key is
        // negative or >= 128, which is outside [min_key, max_key].
        if ((static_cast<uint32_t>(key) & 4294967168u) != 0)
            return false;

        const uint64_t hash = get_hash(key);
        const std::size_t index = hash % 23;

        // Two bit positions taken from hash bits 0-5 and bits 8-13.
        const uint32_t shift1 = static_cast<uint32_t>(hash) & 63u;
        const uint32_t shift2 = static_cast<uint32_t>(hash >> 8) & 63u;
        const uint64_t mask = (1ULL << shift1) | (1ULL << shift2);
        return (bloom[index] & mask) == mask;
    }

    static constexpr std::size_t item_count = 100;
    static constexpr int32_t min_key = 0;
    static constexpr int32_t max_key = 99;
};
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// This file is auto-generated. Do not edit manually.
2+
// Structure: BloomFilter
3+
#pragma once
4+
#include <array>
5+
#include <cstring>
6+
#include <cstdint>
7+
#include <limits>
8+
#include <string_view>
9+
10+
/// Approximate-membership filter for 100 float keys in [min_key, max_key].
///
/// `contains` may return true for a key that was never stored (false
/// positive); a `false` result means the key is out of range, is NaN, or its
/// probe bits are not set in the table. All members are static.
///
/// NOTE: this header is marked auto-generated; the NaN-rejecting range check
/// below should also be applied where the generator emits that check.
/// NOTE(review): every table word is 1, i.e. only bit 0 is ever set. That
/// suggests both probe positions are 0 for all stored keys, so the filter
/// accepts any in-range value whose hash bits 0-5 and 8-13 are zero -
/// confirm whether the hash should be mixed before the bit positions are taken.
class BloomFilterStructure_Single_100 final {
    /// Filter table: 23 buckets of 64 bits each.
    static constexpr std::array<uint64_t, 23> bloom = {
        1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull,
        1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull, 1ull,
        1ull, 1ull, 1ull
    };

    /// Hashes a float by its IEEE-754 bit pattern, after normalisation.
    ///
    /// The condition below holds for +0.0, -0.0 and every NaN pattern; for
    /// those, only the exponent bits are kept, so both zeros hash to 0 and
    /// every NaN hashes to 0x7F800000. All other values hash to their raw bits.
    static uint64_t get_hash(const float value) noexcept
    {
        uint32_t bits = 0;
        std::memcpy(&bits, &value, sizeof(bits));
        if (((bits - 1) & ~0x80000000u) >= 0x7F800000u)
            bits &= 0x7F800000u;
        return bits;
    }

public:
    /// Returns true when both probe bits for `key` are set in its bucket.
    ///
    /// Declared constexpr, but `get_hash` is not, so only the out-of-range
    /// early exit can be evaluated in a constant expression.
    ///
    /// @param key value to test.
    /// @return false if `key` is definitely not stored, true if it may be.
    [[nodiscard]]
    static constexpr bool contains(const float key) noexcept {
        // Negated conjunction rather than `key < min || key > max`: every
        // ordered comparison with NaN is false, so this form rejects NaN
        // instead of letting it through to the table probe.
        if (!(key >= 0.0f && key <= 99.0f))
            return false;

        const uint64_t hash = get_hash(key);
        const size_t index = hash % 23;

        // Two bit positions taken from hash bits 0-5 and bits 8-13.
        const uint32_t shift1 = static_cast<uint32_t>(hash) & 63u;
        const uint32_t shift2 = static_cast<uint32_t>(hash >> 8) & 63u;
        const uint64_t mask = (1ULL << shift1) | (1ULL << shift2);
        return (bloom[index] & mask) == mask;
    }

    static constexpr size_t item_count = 100;
    static constexpr float min_key = 0.0f;
    static constexpr float max_key = 99.0f;
};

0 commit comments

Comments
 (0)