Skip to content

Commit bc262f8

Browse files
committed
Add benchmark for deduplication as well as docs on options
1 parent 5d827bb commit bc262f8

3 files changed

Lines changed: 64 additions & 2 deletions

File tree

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
namespace Genbox.FastData.Benchmarks.Benchmarks;
2+
3+
/// <summary>
/// Benchmarks <c>FastDataGenerator.DeduplicateKeys</c> for integer and string keys under the
/// deduplication modes listed in <see cref="Mode"/>.
/// </summary>
/// <remarks>
/// Only the non-throwing modes are listed in the parameter set.
/// NOTE(review): presumably the throw-on-duplicate modes are left out because the generated input
/// always contains duplicates - confirm.
/// </remarks>
[MemoryDiagnoser]
public class DeduplicationBenchmarks
{
    // Fixed seed so every run works on the same key sequence.
    private const int Seed = 42;

    // Number of keys generated per key type.
    private const int KeyCount = 1000;

    // Exclusive upper bound of the random values. It is smaller than KeyCount, so duplicates are guaranteed.
    private const int ValueBound = 200;

    private int[] _intKeys = [];
    private string[] _stringKeys = [];

    /// <summary>The deduplication mode under test.</summary>
    [Params(DeduplicationMode.Disabled, DeduplicationMode.HashSet, DeduplicationMode.Sort)]
    public DeduplicationMode Mode { get; set; }

    /// <summary>
    /// Fills both key arrays from a single seeded random source: the integer keys are drawn first,
    /// then the string keys ("key" followed by a number).
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        var random = new Random(Seed);

        var ints = new int[KeyCount];
        for (int index = 0; index < ints.Length; index++)
        {
            ints[index] = random.Next(0, ValueBound);
        }

        var strings = new string[KeyCount];
        for (int index = 0; index < strings.Length; index++)
        {
            strings[index] = $"key{random.Next(0, ValueBound)}";
        }

        _intKeys = ints;
        _stringKeys = strings;
    }

    /// <summary>Runs deduplication over the integer keys using the current <see cref="Mode"/>.</summary>
    [Benchmark]
    public void IntKeys()
    {
        var config = new FastDataConfig { DeduplicationMode = Mode };

        FastDataGenerator.DeduplicateKeys(config, _intKeys, ReadOnlyMemory<int>.Empty, EqualityComparer<int>.Default, Comparer<int>.Default, out _, out _, out int _);
    }

    /// <summary>Runs deduplication over the string keys using the current <see cref="Mode"/> and ordinal comparison.</summary>
    [Benchmark]
    public void StringKeys()
    {
        var config = new FastDataConfig { DeduplicationMode = Mode };

        FastDataGenerator.DeduplicateKeys(config, _stringKeys, ReadOnlyMemory<int>.Empty, StringComparer.Ordinal, StringComparer.Ordinal, out _, out _, out int _);
    }
}

Src/FastData/DeduplicationMode.cs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,28 @@ namespace Genbox.FastData;
22

33
/// <summary>
/// Selects how duplicate keys in the input are handled.
/// </summary>
public enum DeduplicationMode : byte
{
    /// <summary>
    /// No deduplication is performed. However, if there is a duplicate in the input, it might cause undefined behavior.
    /// </summary>
    Disabled = 0,

    /// <summary>
    /// Uses a hash set to deduplicate data. It is faster than sorting, but uses more memory. It does not change the order of keys.
    /// </summary>
    HashSet = 1,

    /// <summary>
    /// Same as <see cref="HashSet"/>, but throws an exception when it finds a duplicate.
    /// </summary>
    HashSetThrowOnDup = 2,

    /// <summary>
    /// Uses sorting to deduplicate data. It is not as fast as <see cref="HashSet"/>, but it uses about half the memory.
    /// As a side effect, it changes the order of keys, which might be desirable under certain circumstances.
    /// </summary>
    Sort = 3,

    /// <summary>
    /// Same as <see cref="Sort"/>, but throws an exception when it finds a duplicate.
    /// </summary>
    SortThrowOnDup = 4
}

Src/FastData/FastDataGenerator.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ private static string GenerateNumericInternal<TKey, TValue>(ReadOnlyMemory<TKey>
260260
}
261261
}
262262

263-
private static void DeduplicateKeys<TKey, TValue>(FastDataConfig fdCfg, ReadOnlyMemory<TKey> keys, ReadOnlyMemory<TValue> values, IEqualityComparer<TKey> equalityComparer, IComparer<TKey> sortComparer, out ReadOnlyMemory<TKey> newKeys, out ReadOnlyMemory<TValue> newValues, out int uniqueCount)
263+
internal static void DeduplicateKeys<TKey, TValue>(FastDataConfig fdCfg, ReadOnlyMemory<TKey> keys, ReadOnlyMemory<TValue> values, IEqualityComparer<TKey> equalityComparer, IComparer<TKey> sortComparer, out ReadOnlyMemory<TKey> newKeys, out ReadOnlyMemory<TValue> newValues, out int uniqueCount)
264264
{
265265
if (fdCfg.DeduplicationMode == DeduplicationMode.Disabled)
266266
{
@@ -518,4 +518,4 @@ private static void Benchmark(byte[] data, int iterations, Candidate candidate)
518518

519519
private readonly record struct TempStringState<TKey, TValue>(ReadOnlyMemory<TKey> Keys, ReadOnlyMemory<TValue> Values, FastDataConfig Config, ICodeGenerator Generator, StringKeyProperties StringKeyProperties, HashDetails HashDetails, string TrimPrefix, string TrimSuffix);
520520
private readonly record struct TempNumericState<TKey, TValue>(ReadOnlyMemory<TKey> Keys, ReadOnlyMemory<TValue> Values, FastDataConfig Config, ICodeGenerator Generator, NumericKeyProperties<TKey> NumericKeyProperties, HashDetails HashDetails, KeyType KeyType);
521-
}
521+
}

0 commit comments

Comments
 (0)