Skip to content

Commit cd28633

Browse files
committed
Add more robust deduplication
1 parent a81ba33 commit cd28633

5 files changed

Lines changed: 221 additions & 33 deletions

File tree

Src/FastData.Tests/FastDataGeneratorTests.cs

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,82 @@ namespace Genbox.FastData.Tests;
99

1010
public class FastDataGeneratorTests
1111
{
12-
[Fact]
13-
public void Generate_ThrowOnDuplicate()
12+
[Theory]
13+
[InlineData(DeduplicationMode.HashSetThrowOnDup)]
14+
[InlineData(DeduplicationMode.SortThrowOnDup)]
15+
public void Generate_ThrowOnDuplicate(DeduplicationMode mode)
1416
{
1517
FastDataConfig config = new FastDataConfig();
18+
config.DeduplicationMode = mode;
19+
1620
Assert.Throws<InvalidOperationException>(() => FastDataGenerator.Generate(["item", "item"], config, new DummyGenerator()));
21+
Assert.Throws<InvalidOperationException>(() => FastDataGenerator.Generate([1, 2, 2], config, new DummyGenerator()));
22+
}
23+
24+
// Verifies that the non-throwing modes (Disabled, HashSet, Sort) accept input containing duplicates.
// The test passes as long as neither Generate call raises; the generated output is not inspected.
[Theory]
25+
[InlineData(DeduplicationMode.Disabled)]
26+
[InlineData(DeduplicationMode.HashSet)]
27+
[InlineData(DeduplicationMode.Sort)]
28+
public void Generate_NoThrowOnDuplicates(DeduplicationMode mode)
29+
{
30+
FastDataConfig config = new FastDataConfig();
31+
config.DeduplicationMode = mode;
32+
33+
// String keys with one duplicate.
FastDataGenerator.Generate(["item", "item"], config, new DummyGenerator());
34+
// Numeric keys with one duplicate.
FastDataGenerator.Generate([1, 2, 2], config, new DummyGenerator());
35+
}
36+
37+
// Verifies that duplicate string keys are removed together with their values.
// StructureType.Array is requested and the captured context is asserted to be an ArrayContext,
// whose Keys/Values are then compared against the expected order for each mode.
[Theory]
38+
[InlineData(DeduplicationMode.HashSet)]
39+
[InlineData(DeduplicationMode.Sort)]
40+
public void GenerateKeyed_StringDeduplication_RemovesDuplicates(DeduplicationMode mode)
41+
{
42+
// "b" appears twice; both occurrences carry the same value "vb".
string[] keys = ["b", "a", "b", "c"];
43+
string[] values = ["vb", "va", "vb", "vc"];
44+
45+
FastDataConfig config = new FastDataConfig(StructureType.Array);
46+
config.DeduplicationMode = mode;
47+
48+
ContextCaptureGenerator generator = new ContextCaptureGenerator();
49+
FastDataGenerator.GenerateKeyed(keys, values, config, generator);
50+
51+
ArrayContext<string, string> ctx = Assert.IsType<ArrayContext<string, string>>(generator.Context);
52+
if (mode == DeduplicationMode.HashSet)
53+
{
54+
// HashSet mode: the surviving keys stay in input order.
Assert.True(ctx.Keys.Span.SequenceEqual(["b", "a", "c"]));
55+
Assert.True(ctx.Values.Span.SequenceEqual(["vb", "va", "vc"]));
56+
}
57+
else
58+
{
59+
// Sort mode: the surviving keys come out sorted, each value following its key.
Assert.True(ctx.Keys.Span.SequenceEqual(["a", "b", "c"]));
60+
Assert.True(ctx.Values.Span.SequenceEqual(["va", "vb", "vc"]));
61+
}
62+
}
63+
64+
// Verifies that duplicate numeric keys are removed together with their values.
// StructureType.Array is requested and the captured context is asserted to be an ArrayContext,
// whose Keys/Values are then compared against the expected order for each mode.
[Theory]
65+
[InlineData(DeduplicationMode.HashSet)]
66+
[InlineData(DeduplicationMode.Sort)]
67+
public void GenerateKeyed_NumericDeduplication_RemovesDuplicates(DeduplicationMode mode)
68+
{
69+
// 3 appears twice; both occurrences carry the same value "v3".
int[] keys = [3, 1, 3, 2];
70+
string[] values = ["v3", "v1", "v3", "v2"];
71+
FastDataConfig config = new FastDataConfig(StructureType.Array);
72+
config.DeduplicationMode = mode;
73+
ContextCaptureGenerator generator = new ContextCaptureGenerator();
74+
75+
FastDataGenerator.GenerateKeyed(keys, values, config, generator);
76+
77+
ArrayContext<int, string> ctx = Assert.IsType<ArrayContext<int, string>>(generator.Context);
78+
if (mode == DeduplicationMode.HashSet)
79+
{
80+
// HashSet mode: the surviving keys stay in input order.
Assert.True(ctx.Keys.Span.SequenceEqual([3, 1, 2]));
81+
Assert.True(ctx.Values.Span.SequenceEqual(["v3", "v1", "v2"]));
82+
}
83+
else
84+
{
85+
// Sort mode: the surviving keys come out sorted, each value following its key.
Assert.True(ctx.Keys.Span.SequenceEqual([1, 2, 3]));
86+
Assert.True(ctx.Values.Span.SequenceEqual(["v1", "v2", "v3"]));
87+
}
1788
}
1889

1990
[Fact]

Src/FastData/DeduplicationMode.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
namespace Genbox.FastData;
2+
3+
/// <summary>Selects how duplicate keys in the input are detected and handled before a structure is generated.</summary>
public enum DeduplicationMode : byte
4+
{
5+
/// <summary>No duplicate detection; keys and values are copied through unchanged.</summary>
Disabled = 0,
6+
/// <summary>Duplicates are detected with a hash set and silently dropped. The first occurrence is kept and the input order is preserved.</summary>
HashSet,
7+
/// <summary>Duplicates are detected with a hash set; the first duplicate encountered raises an InvalidOperationException.</summary>
HashSetThrowOnDup,
8+
/// <summary>Keys are sorted and adjacent duplicates are silently dropped. The resulting keys are in sorted order.</summary>
Sort,
9+
/// <summary>Keys are sorted; the first adjacent duplicate encountered raises an InvalidOperationException.</summary>
SortThrowOnDup
10+
}

Src/FastData/FastDataConfig.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ public sealed class FastDataConfig(StructureType structureType = StructureType.A
99
/// <summary>The type of structure to create. Defaults to Auto.</summary>
1010
public StructureType StructureType { get; set; } = structureType;
1111

12+
/// <summary>Controls how duplicate keys in the input are detected and handled. Defaults to HashSet.</summary>
13+
public DeduplicationMode DeduplicationMode { get; set; } = DeduplicationMode.HashSet;
14+
1215
/// <summary>For hash-based structures, you can set this factor higher or lower to control how many slots are used. A factor higher than 1 will use more memory, but can improve performance by reducing collisions. A factor lower than 1 will use less memory, but can increase collisions and thus reduce performance.</summary>
1316
public int HashCapacityFactor { get; set; } = 1;
1417

Src/FastData/FastDataGenerator.Logging.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@ namespace Genbox.FastData;
55

66
public static partial class FastDataGenerator
77
{
8-
[LoggerMessage(LogLevel.Information, "There are {Count} unique items")]
9-
internal static partial void LogUniqueItems(ILogger logger, int count);
8+
[LoggerMessage(LogLevel.Information, "There are {Count} keys")]
9+
internal static partial void LogNumberOfKeys(ILogger logger, int count);
10+
11+
[LoggerMessage(LogLevel.Information, "There are {Count} keys, but only {UniqueCount} unique")]
12+
internal static partial void LogNumberOfUniqueKeys(ILogger logger, int count, int uniqueCount);
1013

1114
[LoggerMessage(LogLevel.Information, "Data consists of {KeyType}")]
1215
internal static partial void LogKeyType(ILogger logger, KeyType keyType);

Src/FastData/FastDataGenerator.cs

Lines changed: 130 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -76,24 +76,21 @@ private static string GenerateStringInternal<TValue>(ReadOnlyMemory<string> keys
7676
if (!values.IsEmpty && keys.Length != values.Length)
7777
throw new InvalidOperationException("The number of values does not match the number of keys.");
7878

79-
ReadOnlyMemory<string> keyMemory = keys;
80-
ReadOnlySpan<string> keySpan = keyMemory.Span;
81-
8279
factory ??= NullLoggerFactory.Instance;
8380

84-
//Validate that we only have unique data
85-
HashSet<string> uniq = new HashSet<string>(StringComparer.Ordinal);
86-
87-
for (int i = 0; i < keySpan.Length; i++)
88-
{
89-
string val = keySpan[i];
90-
if (!uniq.Add(val))
91-
throw new InvalidOperationException($"Duplicate data found: {val}");
92-
}
93-
9481
ILogger logger = factory.CreateLogger(typeof(FastDataGenerator));
9582
LogUserStructureType(logger, fdCfg.StructureType);
96-
LogUniqueItems(logger, uniq.Count);
83+
84+
int oldCount = keys.Length;
85+
86+
DeduplicateKeys(fdCfg, keys, values, StringComparer.Ordinal, StringComparer.Ordinal, out keys, out values, out int newCount);
87+
88+
if (oldCount == newCount)
89+
LogNumberOfKeys(logger, newCount);
90+
else
91+
LogNumberOfUniqueKeys(logger, oldCount, newCount);
92+
93+
ReadOnlySpan<string> keySpan = keys.Span;
9794

9895
const KeyType keyType = KeyType.String;
9996
LogKeyType(logger, keyType);
@@ -108,14 +105,14 @@ private static string GenerateStringInternal<TValue>(ReadOnlyMemory<string> keys
108105
{
109106
trimPrefix = strProps.DeltaData.Prefix;
110107
trimSuffix = strProps.DeltaData.Suffix;
111-
keyMemory = SubStringKeys(keySpan, strProps);
112-
keySpan = keyMemory.Span;
108+
keys = SubStringKeys(keySpan, strProps);
109+
keySpan = keys.Span;
113110
}
114111

115112
LogMinMaxLength(logger, strProps.LengthData.LengthMap.Min, strProps.LengthData.LengthMap.Max);
116113

117114
HashDetails hashDetails = new HashDetails();
118-
TempStringState<string, TValue> tempState = new TempStringState<string, TValue>(keyMemory, values, fdCfg, generator, strProps, hashDetails, trimPrefix, trimSuffix);
115+
TempStringState<string, TValue> tempState = new TempStringState<string, TValue>(keys, values, fdCfg, generator, strProps, hashDetails, trimPrefix, trimSuffix);
119116

120117
switch (fdCfg.StructureType)
121118
{
@@ -189,28 +186,25 @@ private static string GenerateNumericInternal<TKey, TValue>(ReadOnlyMemory<TKey>
189186
if (!values.IsEmpty && keys.Length != values.Length)
190187
throw new InvalidOperationException("The number of values does not match the number of keys.");
191188

192-
ReadOnlySpan<TKey> keySpan = keys.Span;
193-
194189
Type type = typeof(TKey);
195190

196191
if (type != typeof(char) && type != typeof(sbyte) && type != typeof(byte) && type != typeof(short) && type != typeof(ushort) && type != typeof(int) && type != typeof(uint) && type != typeof(long) && type != typeof(ulong) && type != typeof(float) && type != typeof(double))
197192
throw new InvalidOperationException($"Unsupported data type: {type.Name}");
198193

199194
factory ??= NullLoggerFactory.Instance;
195+
ILogger logger = factory.CreateLogger(typeof(FastDataGenerator));
200196

201-
//Validate that we only have unique data
202-
HashSet<TKey> uniq = new HashSet<TKey>();
197+
int oldCount = keys.Length;
198+
DeduplicateKeys(fdCfg, keys, values, EqualityComparer<TKey>.Default, Comparer<TKey>.Default, out keys, out values, out int newCount);
203199

204-
for (int i = 0; i < keySpan.Length; i++)
205-
{
206-
TKey key = keySpan[i];
207-
if (!uniq.Add(key))
208-
throw new InvalidOperationException($"Duplicate data found: {key}");
209-
}
200+
if (oldCount == newCount)
201+
LogNumberOfKeys(logger, newCount);
202+
else
203+
LogNumberOfUniqueKeys(logger, oldCount, newCount);
204+
205+
ReadOnlySpan<TKey> keySpan = keys.Span;
210206

211-
ILogger logger = factory.CreateLogger(typeof(FastDataGenerator));
212207
LogUserStructureType(logger, fdCfg.StructureType);
213-
LogUniqueItems(logger, uniq.Count);
214208

215209
KeyType keyType = (KeyType)Enum.Parse(typeof(KeyType), type.Name, false);
216210
LogKeyType(logger, keyType);
@@ -266,6 +260,113 @@ private static string GenerateNumericInternal<TKey, TValue>(ReadOnlyMemory<TKey>
266260
}
267261
}
268262

263+
/// <summary>
/// Applies the configured <see cref="DeduplicationMode"/> to <paramref name="keys"/> and, when supplied, the parallel <paramref name="values"/>.
/// The inputs are only read; results are written to newly allocated arrays.
/// </summary>
/// <param name="fdCfg">Configuration whose DeduplicationMode selects the strategy.</param>
/// <param name="keys">The input keys.</param>
/// <param name="values">Values parallel to <paramref name="keys"/>, or empty when there are none.</param>
/// <param name="equalityComparer">Decides whether two keys are duplicates of each other.</param>
/// <param name="sortComparer">Orders the keys in the sort-based modes.</param>
/// <param name="newKeys">The resulting keys: input order for Disabled and the hash set modes, sorted order for the sort modes.</param>
/// <param name="newValues">The values belonging to <paramref name="newKeys"/>; empty when no values were supplied.</param>
/// <param name="uniqueCount">The number of keys in <paramref name="newKeys"/>.</param>
/// <exception cref="InvalidOperationException">A duplicate key was found in one of the ThrowOnDup modes, or the configured mode is not supported.</exception>
private static void DeduplicateKeys<TKey, TValue>(FastDataConfig fdCfg, ReadOnlyMemory<TKey> keys, ReadOnlyMemory<TValue> values, IEqualityComparer<TKey> equalityComparer, IComparer<TKey> sortComparer, out ReadOnlyMemory<TKey> newKeys, out ReadOnlyMemory<TValue> newValues, out int uniqueCount)
{
    DeduplicationMode mode = fdCfg.DeduplicationMode;

    if (mode == DeduplicationMode.Disabled)
    {
        // Nothing to remove: hand back plain copies of the input.
        TKey[] keyCopy = new TKey[keys.Length];
        keys.CopyTo(keyCopy);
        newKeys = keyCopy;

        TValue[] valueCopy = new TValue[values.Length];
        values.CopyTo(valueCopy);
        newValues = valueCopy;

        uniqueCount = keyCopy.Length;
        return;
    }

    ReadOnlySpan<TKey> keySpan = keys.Span;
    ReadOnlySpan<TValue> valueSpan = values.Span;
    bool hasValues = !values.IsEmpty;

    if (mode is DeduplicationMode.HashSet or DeduplicationMode.HashSetThrowOnDup)
    {
        HashSet<TKey> uniq = new HashSet<TKey>(equalityComparer);
        TKey[] keyCopy = new TKey[keys.Length];
        TValue[] valueCopy = hasValues ? new TValue[values.Length] : [];

        // offset is the write position; it only advances for keys seen for the first time,
        // so the first occurrence wins and the input order is preserved.
        int offset = 0;
        for (int i = 0; i < keySpan.Length; i++)
        {
            TKey key = keySpan[i];

            if (!uniq.Add(key))
            {
                if (mode == DeduplicationMode.HashSetThrowOnDup)
                    throw new InvalidOperationException($"Duplicate key found: {key}");

                continue;
            }

            keyCopy[offset] = key;

            if (hasValues)
                valueCopy[offset] = valueSpan[i];

            offset++;
        }

        newKeys = keyCopy.AsMemory(0, offset);
        newValues = hasValues ? valueCopy.AsMemory(0, offset) : ReadOnlyMemory<TValue>.Empty;
        uniqueCount = offset;
        return;
    }

    if (mode is DeduplicationMode.Sort or DeduplicationMode.SortThrowOnDup)
    {
        // The first element is read unconditionally further down, so an empty input
        // must be handled here instead of indexing into an empty map.
        if (keys.Length == 0)
        {
            newKeys = ReadOnlyMemory<TKey>.Empty;
            newValues = ReadOnlyMemory<TValue>.Empty;
            uniqueCount = 0;
            return;
        }

        // map[i] records the original index of the key that ends up at sorted position i,
        // which lets the matching value be looked up after sorting.
        int[] map = new int[keys.Length];

        for (int i = 0; i < keys.Length; i++)
            map[i] = i;

        TKey[] keyCopy = new TKey[keys.Length];
        keys.CopyTo(keyCopy);

        // NOTE: Array.Sort is documented as an unstable sort, so when duplicate keys carry
        // different values, which of those values survives is not guaranteed.
        Array.Sort(keyCopy, map, sortComparer);

        TValue[] valueCopy = hasValues ? new TValue[values.Length] : [];

        // Handle the first key/value manually to avoid branching inside the for loop below
        int firstIndex = map[0];
        TKey last = keySpan[firstIndex]!;

        keyCopy[0] = last;

        if (hasValues)
            valueCopy[0] = valueSpan[firstIndex];

        // keyCopy is compacted in place: offset never exceeds i, and keys are read from the
        // untouched input span, so overwriting earlier slots is safe.
        int offset = 1;
        for (int i = 1; i < keys.Length; i++)
        {
            int sourceIndex = map[i];
            TKey key = keySpan[sourceIndex]!;

            // After sorting, duplicates are adjacent, so comparing with the previously kept key is enough.
            if (equalityComparer.Equals(key, last))
            {
                if (mode == DeduplicationMode.SortThrowOnDup)
                    throw new InvalidOperationException($"Duplicate key found: {key}");

                continue;
            }

            keyCopy[offset] = key;

            if (hasValues)
                valueCopy[offset] = valueSpan[sourceIndex];

            last = key;
            offset++;
        }

        newKeys = keyCopy.AsMemory(0, offset);
        newValues = hasValues ? valueCopy.AsMemory(0, offset) : ReadOnlyMemory<TValue>.Empty;
        uniqueCount = offset;
        return;
    }

    throw new InvalidOperationException("Unsupported deduplication mode: " + mode);
}
369+
269370
internal static string[] SubStringKeys(ReadOnlySpan<string> keys, StringKeyProperties props)
270371
{
271372
int prefix = props.DeltaData.Prefix.Length;

0 commit comments

Comments
 (0)