Skip to content

Commit a81ba33

Browse files
committed
Refactor encoding and default to UTF8 for generators
1 parent 30062a1 commit a81ba33

73 files changed

Lines changed: 499 additions & 420 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

Src/FastData.Benchmarks/Benchmarks/GPerfAnalyzerBenchmarks.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ public GPerfAnalyzerBenchmarks()
1818
Random rng = new Random(42);
1919
_data = Enumerable.Range(1, 100).Select(_ => TestHelper.GenerateRandomString(rng, 50)).ToArray();
2020

21-
StringKeyProperties props = KeyAnalyzer.GetStringProperties(_data, false, false);
21+
StringKeyProperties props = KeyAnalyzer.GetStringProperties(_data, false, false, GeneratorEncoding.ASCII);
2222
_analyzer = new GPerfAnalyzer(_data.Length, props, new GPerfAnalyzerConfig(), new Simulator(_data.Length, GeneratorEncoding.UTF16), NullLogger<GPerfAnalyzer>.Instance);
2323
}
2424

Src/FastData.Benchmarks/Benchmarks/KeyAnalyzerBenchmarks.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
using Genbox.FastData.Enums;
12
using Genbox.FastData.Internal.Analysis;
23
using Genbox.FastData.InternalShared.Helpers;
34

@@ -15,5 +16,5 @@ public KeyAnalyzerBenchmarks()
1516
}
1617

1718
[Benchmark]
18-
public object GetStringProperties() => KeyAnalyzer.GetStringProperties(_data, true, false);
19+
public object GetStringProperties() => KeyAnalyzer.GetStringProperties(_data, true, false, GeneratorEncoding.UTF16);
1920
}

Src/FastData.Benchmarks/Benchmarks/SegmentGeneratorsBenchmarks.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
using Genbox.FastData.Enums;
12
using Genbox.FastData.Internal.Analysis;
23
using Genbox.FastData.Internal.Analysis.Properties;
34
using Genbox.FastData.Internal.Analysis.SegmentGenerators;
@@ -14,7 +15,7 @@ public class SegmentGeneratorsBenchmarks
1415
private readonly OffsetGenerator _ofGen = new OffsetGenerator();
1516

1617
//We generate 100 strings with lengths 8 through 107 to cover as many cases as possible
17-
private readonly StringKeyProperties _props = KeyAnalyzer.GetStringProperties(Enumerable.Range(8, 100).Select(x => TestHelper.GenerateRandomString(Random.Shared, x)).ToArray(), false, false);
18+
private readonly StringKeyProperties _props = KeyAnalyzer.GetStringProperties(Enumerable.Range(8, 100).Select(x => TestHelper.GenerateRandomString(Random.Shared, x)).ToArray(), false, false, GeneratorEncoding.UTF16);
1819

1920
[Benchmark]
2021
public object BruteForceGenerator() => _bfGen.Generate(_props).ToArray();

Src/FastData.Cli.Tests/CommandOutputs/cpp_-s HashTable_Files_Strings.input.verified.txt

Lines changed: 15 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
{
22
Item1:
33
// This file is auto-generated. Do not edit manually.
4-
// Structure: HashTable
4+
// Structure: HashTablePerfect
55
#pragma once
66
#include <array>
77
#include <cstring>
@@ -11,54 +11,44 @@
1111

1212
class MyData final {
1313
struct e {
14-
std::u16string_view key;
15-
int8_t next;
14+
std::string_view key;
1615
uint64_t hash_code;
1716

18-
e(const std::u16string_view key, const int8_t next, const uint64_t hash_code)
19-
: key(key), next(next), hash_code(hash_code) {}
20-
};
21-
22-
static constexpr std::array<int8_t, 2> buckets = {
23-
2, 0
24-
};
2517

18+
constexpr e(const std::string_view key, const uint64_t hash_code) noexcept
19+
: key(key), hash_code(hash_code) {}
20+
};
2621
inline static const std::array<e, 2> entries = {
27-
e(u"test1", -1, 114), e(u"test2", 0, 114)
22+
e(u8"test2", 0), e(u8"test1", 1)
2823
};
2924

30-
static constexpr uint64_t get_hash(const std::u16string_view value) noexcept
25+
static constexpr uint64_t get_hash(const std::string_view value) noexcept
3126
{
3227
uint64_t hash = 352654597;
3328

34-
for (char32_t ch : value)
29+
for (unsigned char ch : value)
3530
hash = (((hash << 5) | (hash >> 27)) + hash) ^ static_cast<uint32_t>(ch);
3631

3732
return 352654597 + (hash * 1566083941);
3833
}
3934

4035
public:
4136
[[nodiscard]]
42-
static constexpr bool contains(const std::u16string_view key) noexcept {
37+
static constexpr bool contains(const std::string_view key) noexcept {
4338
if (key.length() != 5u)
4439
return false;
40+
uint64_t first = 0;
41+
std::memcpy(&first, key.data(), 5);
4542

43+
if ((first & 878514576011ULL) != 0)
44+
return false;
4645

4746

4847
const uint64_t hash = get_hash(key);
4948
const size_t index = hash % 2;
50-
int8_t i = static_cast<int8_t>(buckets[index] - 1);
51-
52-
while (i >= 0) {
53-
const auto& entry = entries[i];
54-
55-
if (entry.hash_code == hash && entry.key == key)
56-
return true;
57-
58-
i = entry.next;
59-
}
49+
const auto& entry = entries[index];
6050

61-
return false;
51+
return hash == entry.hash_code && key == entry.key;
6252
}
6353

6454
static constexpr size_t item_count = 2;

Src/FastData.Cli.Tests/CommandOutputs/cpp_Files_Strings.input.verified.txt

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,16 @@
1212
class MyData final {
1313
public:
1414
[[nodiscard]]
15-
static constexpr bool contains(const std::u16string_view key) noexcept {
15+
static constexpr bool contains(const std::string_view key) noexcept {
1616
if (key.length() != 5u)
1717
return false;
18+
uint64_t first = 0;
19+
std::memcpy(&first, key.data(), 5);
1820

21+
if ((first & 878514576011ULL) != 0)
22+
return false;
1923

20-
if (key == u"test1" || key == u"test2")
24+
if (key == u8"test1" || key == u8"test2")
2125
return true;
2226

2327
return false;

Src/FastData.Generator.CPlusPlus.Benchmarks/Program.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
using System.Globalization;
22
using System.Text;
3+
using Genbox.FastData.Enums;
34
using Genbox.FastData.Generator.CPlusPlus.Internal.Framework;
45
using Genbox.FastData.Generator.CPlusPlus.TestHarness;
56
using Genbox.FastData.Generator.Framework;
@@ -52,7 +53,7 @@ private static void Main()
5253
private static string PrintQueries(ITestData data, string identifier)
5354
{
5455
CPlusPlusLanguageDef langDef = new CPlusPlusLanguageDef();
55-
TypeMap map = new TypeMap(langDef.TypeDefinitions, langDef.Encoding);
56+
TypeMap map = new TypeMap(langDef.TypeDefinitions, GeneratorEncoding.UTF8);
5657

5758
StringBuilder sb = new StringBuilder();
5859

Src/FastData.Generator.CPlusPlus.TestHarness/CPlusPlusTestHarness.cs

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
using Genbox.FastData.Generator.Extensions;
44
using Genbox.FastData.Generator.Framework;
55
using Genbox.FastData.Generator.Helpers;
6-
using Genbox.FastData.Generators;
76
using Genbox.FastData.Generators.Abstracts;
87
using Genbox.FastData.InternalShared;
98
using Genbox.FastData.InternalShared.TestClasses;
@@ -25,7 +24,7 @@ public CPlusPlusTestHarness() : base("CPlusPlus")
2524

2625
public override ICodeGenerator CreateGenerator(string id) => CPlusPlusCodeGenerator.Create(new CPlusPlusCodeGeneratorConfig(id));
2726

28-
public override ITestRenderer CreateRenderer(GeneratorSpec spec) => new CPlusPlusRenderer(spec);
27+
public override ITestRenderer CreateRenderer(GeneratorSpec spec) => new CPlusPlusRenderer();
2928

3029
public override string RenderContainsProgram<T>(GeneratorSpec spec, ITestRenderer renderer, T[] present, T[] notPresent)
3130
{
@@ -80,27 +79,24 @@ int main()
8079
""";
8180
}
8281

82+
public override int Run(string fileId, string source)
83+
{
84+
string executable = _compiler.Compile(fileId, source);
85+
return RunProcess(executable).ExitCode;
86+
}
87+
8388
private sealed class CPlusPlusRenderer : ITestRenderer
8489
{
8590
private readonly TypeMap _map;
8691

87-
public CPlusPlusRenderer(GeneratorSpec spec)
92+
public CPlusPlusRenderer()
8893
{
8994
CPlusPlusLanguageDef langDef = new CPlusPlusLanguageDef();
90-
Encoding = langDef.Encoding;
91-
_map = new TypeMap(langDef.TypeDefinitions, Encoding);
95+
_map = new TypeMap(langDef.TypeDefinitions, GeneratorEncoding.UTF8);
9296
}
9397

94-
public GeneratorEncoding Encoding { get; }
95-
9698
public string ToValueLabel<T>(T value) => _map.ToValueLabel(value);
9799

98100
public string GetTypeName(Type type) => _map.GetTypeName(type);
99101
}
100-
101-
public override int Run(string fileId, string source)
102-
{
103-
string executable = _compiler.Compile(fileId, source);
104-
return RunProcess(executable).ExitCode;
105-
}
106102
}

Src/FastData.Generator.CPlusPlus/CPlusPlusCodeGenerator.cs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,13 @@ private CPlusPlusCodeGenerator(CPlusPlusCodeGeneratorConfig cfg, ILanguageDef la
1818
public static CPlusPlusCodeGenerator Create(CPlusPlusCodeGeneratorConfig userCfg)
1919
{
2020
CPlusPlusLanguageDef langDef = new CPlusPlusLanguageDef();
21-
TypeMap map = new TypeMap(langDef.TypeDefinitions, langDef.Encoding);
21+
TypeMap map = new TypeMap(langDef.TypeDefinitions, GeneratorEncoding.UTF8);
2222

2323
return new CPlusPlusCodeGenerator(userCfg, langDef, new CPlusPlusConstantsDef(), new CPlusPlusEarlyExitDef(map, userCfg.GeneratorOptions), new CPlusPlusHashDef(), map);
2424
}
2525

26+
public override GeneratorEncoding Encoding => GeneratorEncoding.UTF8;
27+
2628
public override string Generate<TKey, TValue>(GeneratorConfig<TKey> genCfg, IContext<TValue> context)
2729
{
2830
//C++ generator does not support chars outside ASCII

Src/FastData.Generator.CPlusPlus/Internal/Framework/CPlusPlusEarlyExitDef.cs

Lines changed: 42 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -61,11 +61,35 @@ protected override string GetValueBitMaskEarlyExit<T>(MethodType methodType, ulo
6161
""";
6262
}
6363

64-
protected override string GetLengthEarlyExit(MethodType methodType, uint min, uint max, uint minByte, uint maxByte) =>
65-
$"""
66-
if ({(min.Equals(max) ? $"key.length() != {map.ToValueLabel(max)}" : $"const size_t len = key.length(); len < {map.ToValueLabel(min)} || len > {map.ToValueLabel(max)}")})
67-
{RenderExit(methodType)}
68-
""";
64+
protected override string GetLengthEarlyExit(MethodType methodType, uint min, uint max, uint minByte, uint maxByte, GeneratorEncoding encoding)
65+
{
66+
uint minLen;
67+
uint maxLen;
68+
69+
switch (encoding)
70+
{
71+
case GeneratorEncoding.ASCII:
72+
case GeneratorEncoding.UTF8:
73+
minLen = minByte;
74+
maxLen = maxByte;
75+
break;
76+
case GeneratorEncoding.UTF32:
77+
minLen = minByte / 4;
78+
maxLen = maxByte / 4;
79+
break;
80+
case GeneratorEncoding.UTF16:
81+
minLen = min;
82+
maxLen = max;
83+
break;
84+
default:
85+
throw new InvalidOperationException("Unsupported encoding: " + encoding);
86+
}
87+
88+
return $"""
89+
if ({(minLen.Equals(maxLen) ? $"key.length() != {map.ToValueLabel(maxLen)}" : $"const size_t len = key.length(); len < {map.ToValueLabel(minLen)} || len > {map.ToValueLabel(maxLen)}")})
90+
{RenderExit(methodType)}
91+
""";
92+
}
6993

7094
protected override string GetStringBitMaskEarlyExit(MethodType methodType, ulong mask, int byteCount, bool ignoreCase, GeneratorEncoding encoding)
7195
{
@@ -78,12 +102,12 @@ protected override string GetStringBitMaskEarlyExit(MethodType methodType, ulong
78102
if (!ignoreCase)
79103
{
80104
return $"""
81-
uint64_t first = 0;
82-
std::memcpy(&first, key.data(), {byteCount});
105+
uint64_t first = 0;
106+
std::memcpy(&first, key.data(), {byteCount});
83107
84-
if ((first & {mask.ToStringInvariant()}ULL) != 0)
85-
{RenderExit(methodType)}
86-
""";
108+
if ((first & {mask.ToStringInvariant()}ULL) != 0)
109+
{RenderExit(methodType)}
110+
""";
87111
}
88112

89113
StringBuilder sb = new StringBuilder();
@@ -92,17 +116,17 @@ protected override string GetStringBitMaskEarlyExit(MethodType methodType, ulong
92116
for (int i = 0; i < byteCount; i++)
93117
{
94118
sb.Append($"""
95-
uint32_t c{i} = static_cast<uint32_t>(key[{i}]);
96-
c{i} = to_lower_ascii(c{i});
97-
first |= static_cast<uint64_t>(c{i}) << {i * 8};
98-
""");
119+
uint32_t c{i} = static_cast<uint32_t>(key[{i}]);
120+
c{i} = to_lower_ascii(c{i});
121+
first |= static_cast<uint64_t>(c{i}) << {i * 8};
122+
""");
99123
}
100124

101125
sb.Append($"""
102126
103-
if ((first & {mask.ToStringInvariant()}ULL) != 0)
104-
{RenderExit(methodType)}
105-
""");
127+
if ((first & {mask.ToStringInvariant()}ULL) != 0)
128+
{RenderExit(methodType)}
129+
""");
106130
return sb.ToString();
107131
}
108132

@@ -137,4 +161,4 @@ private static string RenderExit(MethodType methodType) => methodType == MethodT
137161
}
138162
"""
139163
: "return false;";
140-
}
164+
}

Src/FastData.Generator.CPlusPlus/Internal/Framework/CPlusPlusHashDef.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ private static string GetHash(KeyType keyType, HashInfo info)
2525
return """
2626
uint64_t hash = 352654597;
2727
28-
for (char32_t ch : value)
28+
for (unsigned char ch : value)
2929
hash = (((hash << 5) | (hash >> 27)) + hash) ^ static_cast<uint32_t>(ch);
3030
3131
return 352654597 + (hash * 1566083941);

0 commit comments

Comments
 (0)