@@ -94,7 +94,7 @@ suite("test_custom_analyzer", "p0") {
9494 "type" = "pinyin",
9595 "keep_first_letter" = "true",
9696 "keep_full_pinyin" = "false",
97- "keep_joined_full_pinyin " = "false",
97+ "keep_joined_full_pinyin" = "false",
9898 "keep_original" = "false",
9999 "lowercase" = "true"
100100 );
@@ -639,6 +639,146 @@ suite("test_custom_analyzer", "p0") {
639639 qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_true"'); """
640640 qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_false"'); """
641641
642+ // ==================== Bug Fix Tests ====================
643+ // Test Bug #1: Space handling must be consistent between the pinyin tokenizer and the pinyin filter
644+ // When the pinyin filter is used with the keyword tokenizer, spaces should be ignored (they must not trigger buffer processing)
645+ // This matches Elasticsearch (ES) behavior, where spaces do not split the ASCII buffer
646+
647+ // Drop existing objects first to ensure clean state
648+ try {
649+ sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_space_test """
650+ } catch (Exception e) { /* ignore if not exists */ }
651+ try {
652+ sql """ DROP INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test """
653+ } catch (Exception e) { /* ignore if not exists */ }
654+ try {
655+ sql """ DROP INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test """
656+ } catch (Exception e) { /* ignore if not exists */ }
657+ try {
658+ sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test """
659+ } catch (Exception e) { /* ignore if not exists */ }
660+
661+ // Create pinyin tokenizer for comparison (spaces should be ignored in joined output)
662+ // Key settings: keep_none_chinese=false (don't output English separately)
663+ // keep_none_chinese_in_joined_full_pinyin=true (include English in joined output)
664+ sql """
665+ CREATE INVERTED INDEX TOKENIZER pinyin_tokenizer_space_test
666+ PROPERTIES (
667+ "type" = "pinyin",
668+ "keep_first_letter" = "false",
669+ "keep_separate_first_letter" = "false",
670+ "keep_full_pinyin" = "false",
671+ "keep_joined_full_pinyin" = "true",
672+ "keep_none_chinese" = "false",
673+ "keep_none_chinese_in_joined_full_pinyin" = "true",
674+ "none_chinese_pinyin_tokenize" = "false",
675+ "keep_original" = "false",
676+ "lowercase" = "false",
677+ "trim_whitespace" = "false",
678+ "ignore_pinyin_offset" = "true"
679+ );
680+ """
681+
682+ // Create the pinyin token filter for comparison; it is combined with the keyword tokenizer in pinyin_filter_analyzer_space_test below
683+ // Same settings as pinyin_tokenizer_space_test to ensure consistent behavior
684+ sql """
685+ CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_space_test
686+ PROPERTIES (
687+ "type" = "pinyin",
688+ "keep_first_letter" = "false",
689+ "keep_separate_first_letter" = "false",
690+ "keep_full_pinyin" = "false",
691+ "keep_joined_full_pinyin" = "true",
692+ "keep_none_chinese" = "false",
693+ "keep_none_chinese_in_joined_full_pinyin" = "true",
694+ "none_chinese_pinyin_tokenize" = "false",
695+ "keep_original" = "false",
696+ "lowercase" = "false",
697+ "trim_whitespace" = "false",
698+ "ignore_pinyin_offset" = "true"
699+ );
700+ """
701+
702+ // Wait for tokenizer and filter to be ready before creating analyzers
703+ sql """ select sleep(15) """
704+
705+ sql """
706+ CREATE INVERTED INDEX ANALYZER pinyin_analyzer_space_test
707+ PROPERTIES (
708+ "tokenizer" = "pinyin_tokenizer_space_test"
709+ );
710+ """
711+
712+ sql """
713+ CREATE INVERTED INDEX ANALYZER pinyin_filter_analyzer_space_test
714+ PROPERTIES (
715+ "tokenizer" = "keyword",
716+ "token_filter" = "pinyin_filter_space_test"
717+ );
718+ """
719+
720+ // Wait for analyzers to be ready
721+ sql """ select sleep(15) """
722+
723+ // Bug #1 Test: Mixed Chinese and English with spaces
724+ // Input: "ALF 刘德华" - space should be ignored, English and pinyin should be joined
725+ // Key point: Space between "ALF" and "刘德华" should NOT split the ASCII buffer
726+ // Expected output: ["ALFliudehua"] - English and pinyin joined together
727+ qt_sql_bug1_mixed_tokenizer """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_analyzer_space_test"'); """
728+ qt_sql_bug1_mixed_filter """ select tokenize('ALF 刘德华', '"analyzer"="pinyin_filter_analyzer_space_test"'); """
729+
730+ // Test Bug #2: Pure English fallback (a digits-only input is exercised below as well)
731+ // When keep_none_chinese=false and the input is pure English, the original token should be preserved (ES behavior)
732+
733+ // Drop existing objects first
734+ try {
735+ sql """ DROP INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test """
736+ } catch (Exception e) { /* ignore if not exists */ }
737+ try {
738+ sql """ DROP INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test """
739+ } catch (Exception e) { /* ignore if not exists */ }
740+
741+ sql """
742+ CREATE INVERTED INDEX TOKEN_FILTER pinyin_filter_fallback_test
743+ PROPERTIES (
744+ "type" = "pinyin",
745+ "keep_none_chinese" = "false",
746+ "keep_original" = "false",
747+ "keep_first_letter" = "false",
748+ "keep_full_pinyin" = "false",
749+ "keep_joined_full_pinyin" = "true",
750+ "ignore_pinyin_offset" = "true",
751+ "keep_none_chinese_in_first_letter" = "false",
752+ "keep_none_chinese_in_joined_full_pinyin" = "false",
753+ "lowercase" = "false"
754+ );
755+ """
756+
757+ // Wait for filter to be ready before creating analyzer
758+ sql """ select sleep(15) """
759+
760+ sql """
761+ CREATE INVERTED INDEX ANALYZER pinyin_analyzer_fallback_test
762+ PROPERTIES (
763+ "tokenizer" = "keyword",
764+ "token_filter" = "pinyin_filter_fallback_test"
765+ );
766+ """
767+
768+ // Wait for analyzer to be ready
769+ sql """ select sleep(15) """
770+
771+ // Bug #2 Test: Pure English input should be preserved via the fallback mechanism (a digits-only input is checked the same way)
772+ // Before fix: [] (the token was dropped)
773+ // After fix: the original token is preserved
774+ qt_sql_bug2_pure_english """ select tokenize('Lanky Kong', '"analyzer"="pinyin_analyzer_fallback_test"'); """
775+ qt_sql_bug2_pure_numbers """ select tokenize('12345', '"analyzer"="pinyin_analyzer_fallback_test"'); """
776+
777+ // Bug #2 Test: Chinese should still work normally (output joined pinyin)
778+ qt_sql_bug2_chinese """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_fallback_test"'); """
779+
780+ // ==================== End Bug Fix Tests ====================
781+
642782 // Test table creation and queries with ignore_pinyin_offset
643783 def indexTbName7 = " test_custom_analyzer_pinyin_offset"
644784 sql " DROP TABLE IF EXISTS ${ indexTbName7} "
0 commit comments