branch-4.0: [fix](inverted index) fix pinyin filter bug #60080 (#60336)

github-actions[bot] · Ryan19929 · web-flow · commit 7387ed2267e5 · 2026-01-29T16:27:49.000+08:00
Cherry-picked from #60080 Co-authored-by: Ryan19929 <43268112+Ryan19929@users.noreply.github.com>
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
@@ -142,11 +142,17 @@ bool PinyinFilter::readTerm(Token* token) {
     if (!processed_original_ && has_current_token_) {
         bool should_add_original = config_->keepOriginal;
 
-        // For emoji/symbol fallback: check if ANY content was generated (candidates OR pending letters)
-        // If nothing was generated, this is likely an emoji/symbol that should be preserved
-        if (!should_add_original && candidate_.empty() && first_letters_.empty() &&
-            full_pinyin_letters_.empty()) {
-            // No candidates and no pending letters, this is emoji/symbol
+        // Emoji/symbol fallback: decide based on what will actually be emitted, not on
+        // whether the buffers merely hold content. A buffer whose option is disabled is
+        // treated as producing no output (e.g. keep_first_letter=false, first_letters_ non-empty).
+        bool will_output_first_letter = config_->keepFirstLetter && !first_letters_.empty();
+        bool will_output_full_pinyin =
+                config_->keepJoinedFullPinyin && !full_pinyin_letters_.empty();
+        bool has_candidates = !candidate_.empty();
+
+        if (!should_add_original && !has_candidates && !will_output_first_letter &&
+            !will_output_full_pinyin) {
+            // No content will be output, trigger fallback to preserve original token
             should_add_original = true;
         }
 
@@ -239,24 +245,6 @@ bool PinyinFilter::processCurrentToken() {
             PinyinUtil::instance().convert(source_codepoints, PinyinFormat::TONELESS_PINYIN_FORMAT);
     auto chinese_list = ChineseUtil::segmentChinese(source_codepoints);
 
-    // Early return optimization: if no Chinese characters found
-    if (pinyin_list.empty() && chinese_list.empty()) {
-        // Check if there are non-ASCII Unicode characters (like emoji) to preserve
-        bool has_unicode_symbols = false;
-        for (const auto& cp : source_codepoints) {
-            if (cp >= 128) { // Non-ASCII character
-                has_unicode_symbols = true;
-                break;
-            }
-        }
-
-        // If no Unicode symbols, return false and let other filters handle it
-        if (!has_unicode_symbols) {
-            return false;
-        }
-        // Otherwise, continue processing to preserve Unicode symbols
-    }
-
     // Process each character and generate candidates
     position_ = 0;
     std::string first_letters_buffer;
@@ -306,8 +294,12 @@ bool PinyinFilter::processCurrentToken() {
             if (config_->keepNoneChineseInJoinedFullPinyin) {
                 full_pinyin_buffer += static_cast<char>(codepoint);
             }
+        } else if (is_ascii) {
+            // For non-alphanumeric ASCII characters (like spaces, punctuation),
+            // do nothing and continue to keep the buffer intact.
+            continue;
         } else {
-            // Process accumulated ASCII buffer when we hit non-ASCII
+            // Process accumulated ASCII buffer when we hit non-ASCII (Chinese) characters
             if (!ascii_buffer.empty()) {
                 processAsciiBuffer(ascii_buffer, ascii_buffer_start_pos, static_cast<int>(i));
                 ascii_buffer.clear();
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
@@ -761,4 +761,104 @@ TEST_F(PinyinFilterTest, TestTokenFilter_NonChineseCJK) {
     EXPECT_EQ(tokens2[0], "한글") << "Korean hangul should be preserved as-is";
 }
 
-} // namespace doris::segment_v2::inverted_index
+TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithKeywordTokenizer) {
+    std::unordered_map<std::string, std::string> config;
+    config["keep_joined_full_pinyin"] = "true";
+    config["keep_none_chinese"] = "true";
+    config["keep_none_chinese_in_joined_full_pinyin"] = "true";
+    config["none_chinese_pinyin_tokenize"] = "false";
+    config["keep_original"] = "false";
+    config["keep_first_letter"] = "false";
+    config["keep_full_pinyin"] = "false";
+    config["lowercase"] = "false";
+    config["trim_whitespace"] = "false";
+    config["ignore_pinyin_offset"] = "true";
+
+    // Test case 1: Pure English with space (sizes use ASSERT_EQ so that tokensN[0] is
+    // never read from an empty vector). Before fix: ["ALF", "Characters"] - space split it
+    // After fix: ["ALFCharacters"] - space is skipped, buffer continues accumulating
+    auto tokens1 = tokenizeWithFilter("ALF Characters", "keyword", config);
+    ASSERT_EQ(tokens1.size(), 1) << "Should produce one token (space should not split)";
+    EXPECT_EQ(tokens1[0], "ALFCharacters") << "Space should be ignored in joined output";
+
+    // Test case 2: English with multiple spaces
+    auto tokens2 = tokenizeWithFilter("Hello   World", "keyword", config);
+    ASSERT_EQ(tokens2.size(), 1) << "Multiple spaces should not split tokens";
+    EXPECT_EQ(tokens2[0], "HelloWorld") << "All spaces should be ignored";
+
+    // Test case 3: Mixed with punctuation
+    auto tokens3 = tokenizeWithFilter("Test-Case_123", "keyword", config);
+    ASSERT_EQ(tokens3.size(), 1) << "Punctuation should not split tokens";
+    EXPECT_EQ(tokens3[0], "TestCase123") << "Non-alphanumeric ASCII chars should be ignored";
+}
+
+// Test Bug #1: Space handling with Chinese-English mixed content (keyword tokenizer + filter)
+TEST_F(PinyinFilterTest, TestBugFix_SpaceHandlingWithMixedContent) {
+    std::unordered_map<std::string, std::string> config;
+    config["keep_joined_full_pinyin"] = "true";
+    config["keep_none_chinese"] = "true";
+    config["keep_none_chinese_in_joined_full_pinyin"] = "true";
+    config["none_chinese_pinyin_tokenize"] = "false";
+    config["keep_original"] = "false";
+    config["keep_first_letter"] = "false";
+    config["keep_full_pinyin"] = "false";
+    config["lowercase"] = "true";
+    config["ignore_pinyin_offset"] = "true";
+
+    // Chinese-English mixed input with a space between the two parts.
+    // The space should be ignored, English letters should be preserved in joined output
+    auto tokens = tokenizeWithFilter("ALF 刘德华", "keyword", config);
+    EXPECT_GT(tokens.size(), 0) << "Should produce tokens";
+
+    // Find the joined English+pinyin token; lowercase=true above, so it is "alfliudehua"
+    bool found_joined = false;
+    for (const auto& token : tokens) {
+        if (token.find("alf") != std::string::npos && token.find("liu") != std::string::npos) {
+            found_joined = true;
+            EXPECT_EQ(token, "alfliudehua") << "English and pinyin should be joined, space ignored";
+            break;
+        }
+    }
+    EXPECT_TRUE(found_joined) << "Should find joined English+Pinyin token";
+}
+
+// Test Bug #2: Fallback mechanism for pure English text
+// When keep_none_chinese=false and input is pure English, should preserve original token (ES behavior)
+TEST_F(PinyinFilterTest, TestBugFix_PureEnglishFallback) {
+    std::unordered_map<std::string, std::string> config;
+    config["keep_none_chinese"] = "false"; // Don't generate separate English tokens
+    config["keep_original"] = "false";
+    config["keep_first_letter"] = "false";
+    config["keep_full_pinyin"] = "false";
+    config["keep_joined_full_pinyin"] = "true";
+    config["ignore_pinyin_offset"] = "true";
+    config["lowercase"] = "false";       // Preserve original case for testing
+    config["trim_whitespace"] = "false"; // Preserve original whitespace
+    // CRITICAL: Must set these to false to trigger fallback correctly
+    config["keep_none_chinese_in_first_letter"] = "false";
+    config["keep_none_chinese_in_joined_full_pinyin"] = "false";
+
+    // Test case 1: Pure English text (no Chinese to convert)
+    // Before fix: [] - token was dropped because:
+    //   1. processCurrentToken() returned false (early return removed in fix #1)
+    //   2. Fallback checked first_letters_.empty() instead of will_output (fixed in this commit)
+    // After fix: ["Lanky Kong"] - fallback keeps the original token when nothing would be output
+    //   Sizes use ASSERT_EQ so that tokensN[0] is never read from an empty vector on failure
+    auto tokens1 = tokenizeWithFilter("Lanky Kong", "keyword", config);
+    ASSERT_EQ(tokens1.size(), 1) << "Pure English should be preserved via fallback";
+    EXPECT_EQ(tokens1[0], "Lanky Kong") << "Original token should be returned";
+
+    // Test case 2: Another pure English example
+    // ES behavior: fallback preserves original text INCLUDING spaces
+    // (trim_whitespace only removes leading/trailing, not middle spaces)
+    auto tokens2 = tokenizeWithFilter("ALF Characters", "keyword", config);
+    ASSERT_EQ(tokens2.size(), 1) << "Pure English with space should be preserved";
+    EXPECT_EQ(tokens2[0], "ALF Characters") << "Original token preserved as-is (ES behavior)";
+
+    // Test case 3: Pure numbers
+    auto tokens3 = tokenizeWithFilter("12345", "keyword", config);
+    ASSERT_EQ(tokens3.size(), 1) << "Pure numbers should be preserved";
+    EXPECT_EQ(tokens3[0], "12345");
+}
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -290,6 +290,21 @@
 -- !sql_ignore_offset_false_mixed --
 [{\n        "token": "liu"\n    }, {\n        "token": "lad"\n    }, {\n        "token": "a"\n    }, {\n        "token": "de"\n    }]
 
+-- !sql_bug1_mixed_tokenizer --
+[{\n        "token": "ALFliudehua"\n    }]
+
+-- !sql_bug1_mixed_filter --
+[{\n        "token": "ALFliudehua"\n    }]
+
+-- !sql_bug2_pure_english --
+[{\n        "token": "Lanky Kong"\n    }]
+
+-- !sql_bug2_pure_numbers --
+[{\n        "token": "12345"\n    }]
+
+-- !sql_bug2_chinese --
+[{\n        "token": "liudehua"\n    }]
+
 -- !sql_table_ignore_offset_1 --
 1	刘德华
 
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -94,7 +94,7 @@ suite("test_custom_analyzer", "p0") {
             "type" = "pinyin",
             "keep_first_letter" = "true",
             "keep_full_pinyin" = "false",
-            "keep_joined_full_pinyin    " = "false",
+            "keep_joined_full_pinyin" = "false",
             "keep_original" = "false",
             "lowercase" = "true"
         );
@@ -639,6 +639,146 @@ suite("test_custom_analyzer", "p0") {
     qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_true"'); """
     qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_false"'); """
 
+    // ==================== Bug Fix Tests ====================
+    // Test Bug #1: Space handling consistency between pinyin tokenizer and pinyin filter
+    // When using pinyin filter with keyword tokenizer, spaces should be ignored (not trigger buffer processing)
+    // This matches ES behavior where spaces don't split the ASCII buffer
+    
+    // Drop existing objects first to ensure clean state
+    try {
+        sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_space_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    try {
+        sql """ DROP INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    try {
+        sql """ DROP INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    try {
+        sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    
+    // Create pinyin tokenizer for comparison (spaces should be ignored in joined output)
+    // Key settings: keep_none_chinese=false (don't output English separately)
+    //               keep_none_chinese_in_joined_full_pinyin=true (include English in joined output)
+    sql """
+        CREATE INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test
+        PROPERTIES (
+            "type" = "pinyin",
+            "keep_first_letter" = "false",
+            "keep_separate_first_letter" = "false",
+            "keep_full_pinyin" = "false",
+            "keep_joined_full_pinyin" = "true",
+            "keep_none_chinese" = "false",
+            "keep_none_chinese_in_joined_full_pinyin" = "true",
+            "none_chinese_pinyin_tokenize" = "false",
+            "keep_original" = "false",
+            "lowercase" = "false",
+            "trim_whitespace" = "false",
+            "ignore_pinyin_offset" = "true"
+        );
+    """
+    
+    // Create the pinyin token filter; the analyzer below pairs it with the keyword tokenizer
+    // Same settings as tokenizer to ensure consistent behavior
+    sql """
+        CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test
+        PROPERTIES (
+            "type" = "pinyin",
+            "keep_first_letter" = "false",
+            "keep_separate_first_letter" = "false",
+            "keep_full_pinyin" = "false",
+            "keep_joined_full_pinyin" = "true",
+            "keep_none_chinese" = "false",
+            "keep_none_chinese_in_joined_full_pinyin" = "true",
+            "none_chinese_pinyin_tokenize" = "false",
+            "keep_original" = "false",
+            "lowercase" = "false",
+            "trim_whitespace" = "false",
+            "ignore_pinyin_offset" = "true"
+        );
+    """
+    
+    // Wait for tokenizer and filter to be ready before creating analyzers
+    sql """ select sleep(15) """
+    
+    sql """
+        CREATE INVERTED INDEX ANALYZER pinyin_analyzer_space_test
+        PROPERTIES (
+            "tokenizer" = "pinyin_tokenizer_space_test"
+        );
+    """
+    
+    sql """
+        CREATE INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test
+        PROPERTIES (
+            "tokenizer" = "keyword",
+            "token_filter" = "pinyin_filter_space_test"
+        );
+    """
+    
+    // Wait for analyzers to be ready
+    sql """ select sleep(15) """
+    
+    // Bug #1 Test: Mixed Chinese and English with spaces
+    // Input: "ALF 刘德华" - space should be ignored, English and pinyin should be joined
+    // Key point: Space between "ALF" and "刘德华" should NOT split the ASCII buffer
+    // Expected output: ["ALFliudehua"] - English and pinyin joined together
+    qt_sql_bug1_mixed_tokenizer """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_analyzer_space_test"'); """
+    qt_sql_bug1_mixed_filter """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_filter_analyzer_space_test"'); """
+    
+    // Test Bug #2: Pure English fallback
+    // When keep_none_chinese=false and input is pure English, should preserve original token (ES behavior)
+    
+    // Drop existing objects first
+    try {
+        sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    try {
+        sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test """
+    } catch (Exception e) { /* ignore if not exists */ }
+    
+    sql """
+        CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test
+        PROPERTIES (
+            "type" = "pinyin",
+            "keep_none_chinese" = "false",
+            "keep_original" = "false",
+            "keep_first_letter" = "false",
+            "keep_full_pinyin" = "false",
+            "keep_joined_full_pinyin" = "true",
+            "ignore_pinyin_offset" = "true",
+            "keep_none_chinese_in_first_letter" = "false",
+            "keep_none_chinese_in_joined_full_pinyin" = "false",
+            "lowercase" = "false"
+        );
+    """
+    
+    // Wait for filter to be ready before creating analyzer
+    sql """ select sleep(15) """
+    
+    sql """
+        CREATE INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test
+        PROPERTIES (
+            "tokenizer" = "keyword",
+            "token_filter" = "pinyin_filter_fallback_test"
+        );
+    """
+    
+    // Wait for analyzer to be ready
+    sql """ select sleep(15) """
+    
+    // Bug #2 Test: Pure English should be preserved via fallback mechanism
+    // Before fix: [] (token was dropped)
+    // After fix: original token preserved
+    qt_sql_bug2_pure_english """ select tokenize('Lanky Kong', '"analyzer"="pinyin_analyzer_fallback_test"'); """
+    qt_sql_bug2_pure_numbers """ select tokenize('12345', '"analyzer"="pinyin_analyzer_fallback_test"'); """
+    
+    // Bug #2 Test: Chinese should still work normally (output joined pinyin)
+    qt_sql_bug2_chinese """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_fallback_test"'); """
+    
+    // ==================== End Bug Fix Tests ====================
+
     // Test table creation and queries with ignore_pinyin_offset
     def indexTbName7 = "test_custom_analyzer_pinyin_offset"
     sql "DROP TABLE IF EXISTS ${indexTbName7}"