@@ -76,24 +76,21 @@ private static string GenerateStringInternal<TValue>(ReadOnlyMemory<string> keys
7676 if ( ! values . IsEmpty && keys . Length != values . Length )
7777 throw new InvalidOperationException ( "The number of values does not match the number of keys." ) ;
7878
79- ReadOnlyMemory < string > keyMemory = keys ;
80- ReadOnlySpan < string > keySpan = keyMemory . Span ;
81-
8279 factory ??= NullLoggerFactory . Instance ;
8380
84- //Validate that we only have unique data
85- HashSet < string > uniq = new HashSet < string > ( StringComparer . Ordinal ) ;
86-
87- for ( int i = 0 ; i < keySpan . Length ; i ++ )
88- {
89- string val = keySpan [ i ] ;
90- if ( ! uniq . Add ( val ) )
91- throw new InvalidOperationException ( $ "Duplicate data found: { val } ") ;
92- }
93-
9481 ILogger logger = factory . CreateLogger ( typeof ( FastDataGenerator ) ) ;
9582 LogUserStructureType ( logger , fdCfg . StructureType ) ;
96- LogUniqueItems ( logger , uniq . Count ) ;
83+
84+ int oldCount = keys . Length ;
85+
86+ DeduplicateKeys ( fdCfg , keys , values , StringComparer . Ordinal , StringComparer . Ordinal , out keys , out values , out int newCount ) ;
87+
88+ if ( oldCount == newCount )
89+ LogNumberOfKeys ( logger , newCount ) ;
90+ else
91+ LogNumberOfUniqueKeys ( logger , oldCount , newCount ) ;
92+
93+ ReadOnlySpan < string > keySpan = keys . Span ;
9794
9895 const KeyType keyType = KeyType . String ;
9996 LogKeyType ( logger , keyType ) ;
@@ -108,14 +105,14 @@ private static string GenerateStringInternal<TValue>(ReadOnlyMemory<string> keys
108105 {
109106 trimPrefix = strProps . DeltaData . Prefix ;
110107 trimSuffix = strProps . DeltaData . Suffix ;
111- keyMemory = SubStringKeys ( keySpan , strProps ) ;
112- keySpan = keyMemory . Span ;
108+ keys = SubStringKeys ( keySpan , strProps ) ;
109+ keySpan = keys . Span ;
113110 }
114111
115112 LogMinMaxLength ( logger , strProps . LengthData . LengthMap . Min , strProps . LengthData . LengthMap . Max ) ;
116113
117114 HashDetails hashDetails = new HashDetails ( ) ;
118- TempStringState < string , TValue > tempState = new TempStringState < string , TValue > ( keyMemory , values , fdCfg , generator , strProps , hashDetails , trimPrefix , trimSuffix ) ;
115+ TempStringState < string , TValue > tempState = new TempStringState < string , TValue > ( keys , values , fdCfg , generator , strProps , hashDetails , trimPrefix , trimSuffix ) ;
119116
120117 switch ( fdCfg . StructureType )
121118 {
@@ -189,28 +186,25 @@ private static string GenerateNumericInternal<TKey, TValue>(ReadOnlyMemory<TKey>
189186 if ( ! values . IsEmpty && keys . Length != values . Length )
190187 throw new InvalidOperationException ( "The number of values does not match the number of keys." ) ;
191188
192- ReadOnlySpan < TKey > keySpan = keys . Span ;
193-
194189 Type type = typeof ( TKey ) ;
195190
196191 if ( type != typeof ( char ) && type != typeof ( sbyte ) && type != typeof ( byte ) && type != typeof ( short ) && type != typeof ( ushort ) && type != typeof ( int ) && type != typeof ( uint ) && type != typeof ( long ) && type != typeof ( ulong ) && type != typeof ( float ) && type != typeof ( double ) )
197192 throw new InvalidOperationException ( $ "Unsupported data type: { type . Name } ") ;
198193
199194 factory ??= NullLoggerFactory . Instance ;
195+ ILogger logger = factory . CreateLogger ( typeof ( FastDataGenerator ) ) ;
200196
201- //Validate that we only have unique data
202- HashSet < TKey > uniq = new HashSet < TKey > ( ) ;
197+ int oldCount = keys . Length ;
198+ DeduplicateKeys ( fdCfg , keys , values , EqualityComparer < TKey > . Default , Comparer < TKey > . Default , out keys , out values , out int newCount ) ;
203199
204- for ( int i = 0 ; i < keySpan . Length ; i ++ )
205- {
206- TKey key = keySpan [ i ] ;
207- if ( ! uniq . Add ( key ) )
208- throw new InvalidOperationException ( $ "Duplicate data found: { key } " ) ;
209- }
200+ if ( oldCount == newCount )
201+ LogNumberOfKeys ( logger , newCount ) ;
202+ else
203+ LogNumberOfUniqueKeys ( logger , oldCount , newCount ) ;
204+
205+ ReadOnlySpan < TKey > keySpan = keys . Span ;
210206
211- ILogger logger = factory . CreateLogger ( typeof ( FastDataGenerator ) ) ;
212207 LogUserStructureType ( logger , fdCfg . StructureType ) ;
213- LogUniqueItems ( logger , uniq . Count ) ;
214208
215209 KeyType keyType = ( KeyType ) Enum . Parse ( typeof ( KeyType ) , type . Name , false ) ;
216210 LogKeyType ( logger , keyType ) ;
@@ -266,6 +260,113 @@ private static string GenerateNumericInternal<TKey, TValue>(ReadOnlyMemory<TKey>
266260 }
267261 }
268262
263+ private static void DeduplicateKeys < TKey , TValue > ( FastDataConfig fdCfg , ReadOnlyMemory < TKey > keys , ReadOnlyMemory < TValue > values , IEqualityComparer < TKey > equalityComparer , IComparer < TKey > sortComparer , out ReadOnlyMemory < TKey > newKeys , out ReadOnlyMemory < TValue > newValues , out int uniqueCount )
264+ {
265+ if ( fdCfg . DeduplicationMode == DeduplicationMode . Disabled )
266+ {
267+ TKey [ ] keyCopy = new TKey [ keys . Length ] ;
268+ keys . CopyTo ( keyCopy ) ;
269+ newKeys = keyCopy ;
270+
271+ TValue [ ] valueCopy = new TValue [ values . Length ] ;
272+ values . CopyTo ( valueCopy ) ;
273+ newValues = valueCopy ;
274+
275+ uniqueCount = keyCopy . Length ;
276+ return ;
277+ }
278+
279+ ReadOnlySpan < TKey > keySpan = keys . Span ;
280+ ReadOnlySpan < TValue > valueSpan = values . Span ;
281+ bool hasValues = ! values . IsEmpty ;
282+
283+ if ( fdCfg . DeduplicationMode is DeduplicationMode . HashSet or DeduplicationMode . HashSetThrowOnDup )
284+ {
285+ HashSet < TKey > uniq = new HashSet < TKey > ( equalityComparer ) ;
286+ TKey [ ] keyCopy = new TKey [ keys . Length ] ;
287+ TValue [ ] valueCopy = hasValues ? new TValue [ values . Length ] : [ ] ;
288+
289+ int offset = 0 ;
290+ for ( int i = 0 ; i < keySpan . Length ; i ++ )
291+ {
292+ TKey key = keySpan [ i ] ;
293+
294+ if ( ! uniq . Add ( key ) )
295+ {
296+ if ( fdCfg . DeduplicationMode == DeduplicationMode . HashSetThrowOnDup )
297+ throw new InvalidOperationException ( $ "Duplicate key found: { key } ") ;
298+
299+ continue ;
300+ }
301+
302+ keyCopy [ offset ] = key ;
303+
304+ if ( hasValues )
305+ valueCopy [ offset ] = valueSpan [ i ] ;
306+
307+ offset ++ ;
308+ }
309+
310+ newKeys = keyCopy . AsMemory ( 0 , offset ) ;
311+ newValues = hasValues ? valueCopy . AsMemory ( 0 , offset ) : ReadOnlyMemory < TValue > . Empty ;
312+ uniqueCount = offset ;
313+ return ;
314+ }
315+
316+ if ( fdCfg . DeduplicationMode is DeduplicationMode . Sort or DeduplicationMode . SortThrowOnDup )
317+ {
318+ int [ ] map = new int [ keys . Length ] ;
319+
320+ for ( int i = 0 ; i < keys . Length ; i ++ )
321+ map [ i ] = i ;
322+
323+ TKey [ ] keyCopy = new TKey [ keys . Length ] ;
324+ keys . CopyTo ( keyCopy ) ;
325+ Array . Sort ( keyCopy , map , sortComparer ) ;
326+
327+ TValue [ ] valueCopy = hasValues ? new TValue [ values . Length ] : [ ] ;
328+
329+ // Handle the first key/value manually to avoid branching inside the for loop below
330+ int firstIndex = map [ 0 ] ;
331+ TKey last = keySpan [ firstIndex ] ! ;
332+
333+ keyCopy [ 0 ] = last ;
334+
335+ if ( hasValues )
336+ valueCopy [ 0 ] = valueSpan [ firstIndex ] ;
337+
338+ int offset = 1 ;
339+ for ( int i = 1 ; i < keys . Length ; i ++ )
340+ {
341+ int sourceIndex = map [ i ] ;
342+ TKey key = keySpan [ sourceIndex ] ! ;
343+
344+ if ( equalityComparer . Equals ( key , last ) )
345+ {
346+ if ( fdCfg . DeduplicationMode == DeduplicationMode . SortThrowOnDup )
347+ throw new InvalidOperationException ( $ "Duplicate key found: { key } ") ;
348+
349+ continue ;
350+ }
351+
352+ keyCopy [ offset ] = key ;
353+
354+ if ( hasValues )
355+ valueCopy [ offset ] = valueSpan [ sourceIndex ] ;
356+
357+ last = key ;
358+ offset ++ ;
359+ }
360+
361+ newKeys = keyCopy . AsMemory ( 0 , offset ) ;
362+ newValues = hasValues ? valueCopy . AsMemory ( 0 , offset ) : ReadOnlyMemory < TValue > . Empty ;
363+ uniqueCount = offset ;
364+ return ;
365+ }
366+
367+ throw new InvalidOperationException ( "Unsupported deduplication mode: " + fdCfg . DeduplicationMode ) ;
368+ }
369+
269370 internal static string [ ] SubStringKeys ( ReadOnlySpan < string > keys , StringKeyProperties props )
270371 {
271372 int prefix = props . DeltaData . Prefix . Length ;
0 commit comments