Skip to content

Commit 7387ed2

Browse files
branch-4.0: [fix](inverted index) fix pinyin filter bug #60080 (#60336)
Cherry-picked from #60080 Co-authored-by: Ryan19929 <43268112+Ryan19929@users.noreply.github.com>
1 parent 3a9dcf4 commit 7387ed2

4 files changed

Lines changed: 273 additions & 26 deletions

File tree

be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -142,11 +142,17 @@ bool PinyinFilter::readTerm(Token* token) {
142142
if (!processed_original_ && has_current_token_) {
143143
bool should_add_original = config_->keepOriginal;
144144

145-
// For emoji/symbol fallback: check if ANY content was generated (candidates OR pending letters)
146-
// If nothing was generated, this is likely an emoji/symbol that should be preserved
147-
if (!should_add_original && candidate_.empty() && first_letters_.empty() &&
148-
full_pinyin_letters_.empty()) {
149-
// No candidates and no pending letters, this is emoji/symbol
145+
// For emoji/symbol fallback: check if ANY content WILL BE ACTUALLY OUTPUT
146+
// Not just whether buffers have content, but whether they will be processed
147+
// This handles cases like: keep_first_letter=false but first_letters_ has content
148+
bool will_output_first_letter = config_->keepFirstLetter && !first_letters_.empty();
149+
bool will_output_full_pinyin =
150+
config_->keepJoinedFullPinyin && !full_pinyin_letters_.empty();
151+
bool has_candidates = !candidate_.empty();
152+
153+
if (!should_add_original && !has_candidates && !will_output_first_letter &&
154+
!will_output_full_pinyin) {
155+
// No content will be output, trigger fallback to preserve original token
150156
should_add_original = true;
151157
}
152158

@@ -239,24 +245,6 @@ bool PinyinFilter::processCurrentToken() {
239245
PinyinUtil::instance().convert(source_codepoints, PinyinFormat::TONELESS_PINYIN_FORMAT);
240246
auto chinese_list = ChineseUtil::segmentChinese(source_codepoints);
241247

242-
// Early return optimization: if no Chinese characters found
243-
if (pinyin_list.empty() && chinese_list.empty()) {
244-
// Check if there are non-ASCII Unicode characters (like emoji) to preserve
245-
bool has_unicode_symbols = false;
246-
for (const auto& cp : source_codepoints) {
247-
if (cp >= 128) { // Non-ASCII character
248-
has_unicode_symbols = true;
249-
break;
250-
}
251-
}
252-
253-
// If no Unicode symbols, return false and let other filters handle it
254-
if (!has_unicode_symbols) {
255-
return false;
256-
}
257-
// Otherwise, continue processing to preserve Unicode symbols
258-
}
259-
260248
// Process each character and generate candidates
261249
position_ = 0;
262250
std::string first_letters_buffer;
@@ -306,8 +294,12 @@ bool PinyinFilter::processCurrentToken() {
306294
if (config_->keepNoneChineseInJoinedFullPinyin) {
307295
full_pinyin_buffer += static_cast<char>(codepoint);
308296
}
297+
} else if (is_ascii) {
298+
// For non-alphanumeric ASCII characters (like spaces, punctuation),
299+
// do nothing and continue to keep the buffer intact.
300+
continue;
309301
} else {
310-
// Process accumulated ASCII buffer when we hit non-ASCII
302+
// Process accumulated ASCII buffer when we hit non-ASCII (Chinese) characters
311303
if (!ascii_buffer.empty()) {
312304
processAsciiBuffer(ascii_buffer, ascii_buffer_start_pos, static_cast<int>(i));
313305
ascii_buffer.clear();

be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,4 +761,104 @@ TEST_F(PinyinFilterTest, TestTokenFilter_NonChineseCJK) {
761761
EXPECT_EQ(tokens2[0], "한글") << "Korean hangul should be preserved as-is";
762762
}
763763

764-
} // namespace doris::segment_v2::inverted_index
764+
TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithKeywordTokenizer) {
765+
std::unordered_map<std::string, std::string> config;
766+
config["keep_joined_full_pinyin"] = "true";
767+
config["keep_none_chinese"] = "true";
768+
config["keep_none_chinese_in_joined_full_pinyin"] = "true";
769+
config["none_chinese_pinyin_tokenize"] = "false";
770+
config["keep_original"] = "false";
771+
config["keep_first_letter"] = "false";
772+
config["keep_full_pinyin"] = "false";
773+
config["lowercase"] = "false";
774+
config["trim_whitespace"] = "false";
775+
config["ignore_pinyin_offset"] = "true";
776+
777+
// Test case 1: Pure English with space
778+
// Before fix: ["ALF", "Characters"] - space triggered buffer processing
779+
// After fix: ["ALFCharacters"] - space is skipped, buffer continues accumulating
780+
auto tokens1 = tokenizeWithFilter("ALF Characters", "keyword", config);
781+
EXPECT_EQ(tokens1.size(), 1) << "Should produce one token (space should not split)";
782+
EXPECT_EQ(tokens1[0], "ALFCharacters") << "Space should be ignored in joined output";
783+
784+
// Test case 2: English with multiple spaces
785+
auto tokens2 = tokenizeWithFilter("Hello World", "keyword", config);
786+
EXPECT_EQ(tokens2.size(), 1) << "Multiple spaces should not split tokens";
787+
EXPECT_EQ(tokens2[0], "HelloWorld") << "All spaces should be ignored";
788+
789+
// Test case 3: Mixed with punctuation
790+
auto tokens3 = tokenizeWithFilter("Test-Case_123", "keyword", config);
791+
EXPECT_EQ(tokens3.size(), 1) << "Punctuation should not split tokens";
792+
EXPECT_EQ(tokens3[0], "TestCase123") << "Non-alphanumeric ASCII chars should be ignored";
793+
}
794+
795+
// Test Bug #1: Space handling with Chinese-English mixed content
796+
TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithMixedContent) {
797+
std::unordered_map<std::string, std::string> config;
798+
config["keep_joined_full_pinyin"] = "true";
799+
config["keep_none_chinese"] = "true";
800+
config["keep_none_chinese_in_joined_full_pinyin"] = "true";
801+
config["none_chinese_pinyin_tokenize"] = "false";
802+
config["keep_original"] = "false";
803+
config["keep_first_letter"] = "false";
804+
config["keep_full_pinyin"] = "false";
805+
config["lowercase"] = "true";
806+
config["ignore_pinyin_offset"] = "true";
807+
808+
// Chinese-English mixed with spaces
809+
// The space should be ignored, English letters should be preserved in joined output
810+
auto tokens = tokenizeWithFilter("ALF 刘德华", "keyword", config);
811+
EXPECT_GT(tokens.size(), 0) << "Should produce tokens";
812+
813+
// Check that English and pinyin are joined together
814+
bool found_joined = false;
815+
for (const auto& token : tokens) {
816+
if (token.find("alf") != std::string::npos && token.find("liu") != std::string::npos) {
817+
found_joined = true;
818+
EXPECT_EQ(token, "alfliudehua") << "English and pinyin should be joined, space ignored";
819+
break;
820+
}
821+
}
822+
EXPECT_TRUE(found_joined) << "Should find joined English+Pinyin token";
823+
}
824+
825+
// Test Bug #2: Fallback mechanism for pure English text
826+
// When keep_none_chinese=false and input is pure English, should preserve original token (ES behavior)
827+
TEST_F(PinyinFilterTest, TestBugFix_PureEnglishFallback) {
828+
std::unordered_map<std::string, std::string> config;
829+
config["keep_none_chinese"] = "false"; // Don't generate separate English tokens
830+
config["keep_original"] = "false";
831+
config["keep_first_letter"] = "false";
832+
config["keep_full_pinyin"] = "false";
833+
config["keep_joined_full_pinyin"] = "true";
834+
config["ignore_pinyin_offset"] = "true";
835+
config["lowercase"] = "false"; // Preserve original case for testing
836+
config["trim_whitespace"] = "false"; // Preserve original whitespace
837+
// CRITICAL: Must set these to false to trigger fallback correctly
838+
config["keep_none_chinese_in_first_letter"] = "false";
839+
config["keep_none_chinese_in_joined_full_pinyin"] = "false";
840+
841+
// Test case 1: Pure English text (no Chinese to convert)
842+
// Before fix: [] - token was dropped because:
843+
// 1. processCurrentToken() returned false (early return removed in fix #1)
844+
// 2. Fallback checked first_letters_.empty() instead of will_output (fixed in this commit)
845+
// After fix: ["Lanky Kong"] - original token preserved via improved fallback mechanism
846+
// The fallback now checks if ANY content WILL BE OUTPUT, not just if buffers have content
847+
auto tokens1 = tokenizeWithFilter("Lanky Kong", "keyword", config);
848+
EXPECT_EQ(tokens1.size(), 1) << "Pure English should be preserved via fallback";
849+
EXPECT_EQ(tokens1[0], "Lanky Kong") << "Original token should be returned";
850+
851+
// Test case 2: Another pure English example
852+
// ES behavior: fallback preserves original text INCLUDING spaces
853+
// (trim_whitespace only removes leading/trailing, not middle spaces)
854+
auto tokens2 = tokenizeWithFilter("ALF Characters", "keyword", config);
855+
EXPECT_EQ(tokens2.size(), 1) << "Pure English with space should be preserved";
856+
EXPECT_EQ(tokens2[0], "ALF Characters") << "Original token preserved as-is (ES behavior)";
857+
858+
// Test case 3: Pure numbers
859+
auto tokens3 = tokenizeWithFilter("12345", "keyword", config);
860+
EXPECT_EQ(tokens3.size(), 1) << "Pure numbers should be preserved";
861+
EXPECT_EQ(tokens3[0], "12345");
862+
}
863+
864+
} // namespace doris::segment_v2::inverted_index

regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,21 @@
290290
-- !sql_ignore_offset_false_mixed --
291291
[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n "token": "a"\n }, {\n "token": "de"\n }]
292292

293+
-- !sql_bug1_mixed_tokenizer --
294+
[{\n "token": "ALFliudehua"\n }]
295+
296+
-- !sql_bug1_mixed_filter --
297+
[{\n "token": "ALFliudehua"\n }]
298+
299+
-- !sql_bug2_pure_english --
300+
[{\n "token": "Lanky Kong"\n }]
301+
302+
-- !sql_bug2_pure_numbers --
303+
[{\n "token": "12345"\n }]
304+
305+
-- !sql_bug2_chinese --
306+
[{\n "token": "liudehua"\n }]
307+
293308
-- !sql_table_ignore_offset_1 --
294309
1 刘德华
295310

regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy

Lines changed: 141 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ suite("test_custom_analyzer", "p0") {
9494
"type" = "pinyin",
9595
"keep_first_letter" = "true",
9696
"keep_full_pinyin" = "false",
97-
"keep_joined_full_pinyin " = "false",
97+
"keep_joined_full_pinyin" = "false",
9898
"keep_original" = "false",
9999
"lowercase" = "true"
100100
);
@@ -639,6 +639,146 @@ suite("test_custom_analyzer", "p0") {
639639
qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_true"'); """
640640
qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_false"'); """
641641

642+
// ==================== Bug Fix Tests ====================
643+
// Test Bug #1: Space handling consistency between pinyin tokenizer and pinyin filter
644+
// When using pinyin filter with keyword tokenizer, spaces should be ignored (not trigger buffer processing)
645+
// This matches ES behavior where spaces don't split the ASCII buffer
646+
647+
// Drop existing objects first to ensure clean state
648+
try {
649+
sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_space_test """
650+
} catch (Exception e) { /* ignore if not exists */ }
651+
try {
652+
sql """ DROP INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test """
653+
} catch (Exception e) { /* ignore if not exists */ }
654+
try {
655+
sql """ DROP INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test """
656+
} catch (Exception e) { /* ignore if not exists */ }
657+
try {
658+
sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test """
659+
} catch (Exception e) { /* ignore if not exists */ }
660+
661+
// Create pinyin tokenizer for comparison (spaces should be ignored in joined output)
662+
// Key settings: keep_none_chinese=false (don't output English separately)
663+
// keep_none_chinese_in_joined_full_pinyin=true (include English in joined output)
664+
sql """
665+
CREATE INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test
666+
PROPERTIES (
667+
"type" = "pinyin",
668+
"keep_first_letter" = "false",
669+
"keep_separate_first_letter" = "false",
670+
"keep_full_pinyin" = "false",
671+
"keep_joined_full_pinyin" = "true",
672+
"keep_none_chinese" = "false",
673+
"keep_none_chinese_in_joined_full_pinyin" = "true",
674+
"none_chinese_pinyin_tokenize" = "false",
675+
"keep_original" = "false",
676+
"lowercase" = "false",
677+
"trim_whitespace" = "false",
678+
"ignore_pinyin_offset" = "true"
679+
);
680+
"""
681+
682+
// Create pinyin filter with keyword tokenizer for comparison
683+
// Same settings as tokenizer to ensure consistent behavior
684+
sql """
685+
CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test
686+
PROPERTIES (
687+
"type" = "pinyin",
688+
"keep_first_letter" = "false",
689+
"keep_separate_first_letter" = "false",
690+
"keep_full_pinyin" = "false",
691+
"keep_joined_full_pinyin" = "true",
692+
"keep_none_chinese" = "false",
693+
"keep_none_chinese_in_joined_full_pinyin" = "true",
694+
"none_chinese_pinyin_tokenize" = "false",
695+
"keep_original" = "false",
696+
"lowercase" = "false",
697+
"trim_whitespace" = "false",
698+
"ignore_pinyin_offset" = "true"
699+
);
700+
"""
701+
702+
// Wait for tokenizer and filter to be ready before creating analyzers
703+
sql """ select sleep(15) """
704+
705+
sql """
706+
CREATE INVERTED INDEX ANALYZER pinyin_analyzer_space_test
707+
PROPERTIES (
708+
"tokenizer" = "pinyin_tokenizer_space_test"
709+
);
710+
"""
711+
712+
sql """
713+
CREATE INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test
714+
PROPERTIES (
715+
"tokenizer" = "keyword",
716+
"token_filter" = "pinyin_filter_space_test"
717+
);
718+
"""
719+
720+
// Wait for analyzers to be ready
721+
sql """ select sleep(15) """
722+
723+
// Bug #1 Test: Mixed Chinese and English with spaces
724+
// Input: "ALF 刘德华" - space should be ignored, English and pinyin should be joined
725+
// Key point: Space between "ALF" and "刘德华" should NOT split the ASCII buffer
726+
// Expected output: ["ALFliudehua"] - English and pinyin joined together
727+
qt_sql_bug1_mixed_tokenizer """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_analyzer_space_test"'); """
728+
qt_sql_bug1_mixed_filter """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_filter_analyzer_space_test"'); """
729+
730+
// Test Bug #2: Pure English fallback
731+
// When keep_none_chinese=false and input is pure English, should preserve original token (ES behavior)
732+
733+
// Drop existing objects first
734+
try {
735+
sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test """
736+
} catch (Exception e) { /* ignore if not exists */ }
737+
try {
738+
sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test """
739+
} catch (Exception e) { /* ignore if not exists */ }
740+
741+
sql """
742+
CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test
743+
PROPERTIES (
744+
"type" = "pinyin",
745+
"keep_none_chinese" = "false",
746+
"keep_original" = "false",
747+
"keep_first_letter" = "false",
748+
"keep_full_pinyin" = "false",
749+
"keep_joined_full_pinyin" = "true",
750+
"ignore_pinyin_offset" = "true",
751+
"keep_none_chinese_in_first_letter" = "false",
752+
"keep_none_chinese_in_joined_full_pinyin" = "false",
753+
"lowercase" = "false"
754+
);
755+
"""
756+
757+
// Wait for filter to be ready before creating analyzer
758+
sql """ select sleep(15) """
759+
760+
sql """
761+
CREATE INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test
762+
PROPERTIES (
763+
"tokenizer" = "keyword",
764+
"token_filter" = "pinyin_filter_fallback_test"
765+
);
766+
"""
767+
768+
// Wait for analyzer to be ready
769+
sql """ select sleep(15) """
770+
771+
// Bug #2 Test: Pure English should be preserved via fallback mechanism
772+
// Before fix: [] (token was dropped)
773+
// After fix: original token preserved
774+
qt_sql_bug2_pure_english """ select tokenize('Lanky Kong', '"analyzer"="pinyin_analyzer_fallback_test"'); """
775+
qt_sql_bug2_pure_numbers """ select tokenize('12345', '"analyzer"="pinyin_analyzer_fallback_test"'); """
776+
777+
// Bug #2 Test: Chinese should still work normally (output joined pinyin)
778+
qt_sql_bug2_chinese """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_fallback_test"'); """
779+
780+
// ==================== End Bug Fix Tests ====================
781+
642782
// Test table creation and queries with ignore_pinyin_offset
643783
def indexTbName7 = "test_custom_analyzer_pinyin_offset"
644784
sql "DROP TABLE IF EXISTS ${indexTbName7}"

0 commit comments

Comments
 (0)