Skip to content

Commit 51e610d

Browse files
authored
Add SOUNDEX and DIFFERENCE scalar functions for phonetic string matching (#18293)
1 parent 8f7dafe commit 51e610d

2 files changed

Lines changed: 58 additions & 0 deletions

File tree

pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import java.util.Base64;
2727
import java.util.UUID;
2828
import javax.annotation.Nullable;
29+
import org.apache.commons.codec.language.Soundex;
2930
import org.apache.commons.lang3.StringUtils;
3031
import org.apache.pinot.common.utils.URIUtils;
3132
import org.apache.pinot.spi.annotations.ScalarFunction;
@@ -44,6 +45,8 @@ public class StringFunctions {
4445
private StringFunctions() {
  // Private so the class cannot be instantiated from outside.
}
4647

48+
private static final Soundex SOUNDEX = new Soundex();
49+
4750
/**
4851
* @see StringUtils#reverse(String)
4952
* @param input
@@ -908,4 +911,37 @@ public static boolean isJson(String inputStr) {
908911
return false;
909912
}
910913
}
914+
915+
/**
916+
* Returns the Soundex code for a string. Empty string returns "0000" (SQL standard behaviour).
917+
*/
918+
@Nullable
919+
@ScalarFunction(nullableParameters = true)
920+
public static String soundex(@Nullable String input) {
921+
if (input == null) {
922+
return null;
923+
}
924+
if (input.isEmpty()) {
925+
return "0000";
926+
}
927+
return SOUNDEX.soundex(input);
928+
}
929+
930+
/**
931+
* Returns an integer 0-4 indicating how similar two strings sound based on their Soundex codes.
932+
* 4 means the codes are identical; 0 means they share no common code characters.
933+
* The framework null-propagates when either argument is null.
934+
*/
935+
@ScalarFunction
936+
public static int difference(String input1, String input2) {
937+
String code1 = soundex(input1);
938+
String code2 = soundex(input2);
939+
int matches = 0;
940+
for (int i = 0; i < 4; i++) {
941+
if (code1.charAt(i) == code2.charAt(i)) {
942+
matches++;
943+
}
944+
}
945+
return matches;
946+
}
911947
}

pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.testng.annotations.Test;
2525

2626
import static org.testng.Assert.assertEquals;
27+
import static org.testng.Assert.assertNull;
2728

2829

2930
public class StringFunctionsTest {
@@ -349,6 +350,27 @@ public void testHammingDistance() {
349350
assertEquals(StringFunctions.levenshteinDistance("cat", "cats"), 1); // Levenshtein can handle different lengths
350351
}
351352

353+
@Test
354+
public void testSoundex() {
355+
assertEquals(StringFunctions.soundex("Robert"), "R163");
356+
assertEquals(StringFunctions.soundex("Rupert"), "R163");
357+
assertEquals(StringFunctions.soundex("Ashcraft"), "A261");
358+
// Empty string returns SQL-standard fallback code
359+
assertEquals(StringFunctions.soundex(""), "0000");
360+
assertNull(StringFunctions.soundex(null));
361+
}
362+
363+
@Test
364+
public void testDifference() {
365+
assertEquals(StringFunctions.difference("Robert", "Rupert"), 4);
366+
assertEquals(StringFunctions.difference("Smith", "Johnson"), 1);
367+
assertEquals(StringFunctions.difference("Ann", "Ann"), 4);
368+
// "0000" vs "0000" — both encode to the standard empty fallback, all 4 positions match
369+
assertEquals(StringFunctions.difference("", ""), 4);
370+
// "R163" vs "0000" — first characters differ, no positions match
371+
assertEquals(StringFunctions.difference("Robert", ""), 0);
372+
}
373+
352374
@Test
353375
public void encodeUrl() {
354376
assertEquals(StringFunctions.encodeUrl(""), "");

0 commit comments

Comments
 (0)