diff --git a/docs/changelog.rst b/docs/changelog.rst
index 2fc68e9a..a3ce2372 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,5 +1,13 @@
 Changelog
 ---------
+2.2.24
+^^^^^^
+ - Fix SCC ingestion error when a doubled italic-off mid-row code
+   (9120 9120) appears before punctuation. The punctuation lookahead
+   now skips the error-correction duplicate, preventing an unwanted
+   space that pushed lines past the 32-character limit.
+
+
 2.2.23
 ^^^^^^
  - bumps nltk from 3.9.1 to 3.9.4.
diff --git a/docs/conf.py b/docs/conf.py
index 5a4c169f..1a7f72fb 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -53,7 +53,7 @@
 # built documents.
 #
 # The short X.Y version.
-version = "2.2.23"
+version = "2.2.24.dev1"
 # The full version, including alpha/beta/rc tags.
-release = "2.2.23"
+release = "2.2.24.dev1"
 
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
index 205dc2dc..fcd0d458 100644
--- a/pycaption/scc/__init__.py
+++ b/pycaption/scc/__init__.py
@@ -332,7 +332,15 @@ def _translate_line(self, line):
         for idx, word in enumerate(word_list):
             word = word.strip()
             if len(word) == 4:
-                next_command = word_list[idx + 1] if idx + 1 < len(word_list) else None
+                # Look ahead for the next command, skipping the duplicate
+                # that SCC uses for error-correction (same word repeated).
+                next_idx = idx + 1
+                if (next_idx < len(word_list)
+                        and word_list[next_idx].strip() == word):
+                    next_idx += 1
+                next_command = (
+                    word_list[next_idx] if next_idx < len(word_list) else None
+                )
                 self._translate_word(word=word, next_command=next_command)
 
     def _translate_word(self, word, next_command=None):
diff --git a/setup.py b/setup.py
index 59abf1d9..c22cc404 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name="pycaption",
-    version="2.2.23",
+    version="2.2.24.dev1",
     description="Closed caption converter",
     long_description=open(README_PATH).read(),
     author="Joe Norton",
diff --git a/tests/conftest.py b/tests/conftest.py
index 39bf9cd6..37987723 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,6 +91,7 @@
     sample_no_positioning_at_all_scc,
     sample_scc_created_dfxp_with_wrongly_closing_spans,
     sample_scc_duplicate_special_characters,
+    sample_scc_doubled_mid_row_before_punctuation,
     sample_scc_duplicate_tab_offset,
     sample_scc_empty,
     sample_scc_eoc_first_command,
diff --git a/tests/fixtures/scc.py b/tests/fixtures/scc.py
index b552dc8e..7605418d 100644
--- a/tests/fixtures/scc.py
+++ b/tests/fixtures/scc.py
@@ -659,3 +659,13 @@ def sample_scc_paint_on_edm():
 00:00:04;00 942c
 
 """
+
+
+@pytest.fixture(scope="session")
+def sample_scc_doubled_mid_row_before_punctuation():
+    return """\
+Scenarist_SCC V1.0
+
+00:26:48;29\t9420 9420 94d0 94d0 97a1 97a1 3e3e 2057 e5a7 ecec 2062 e520 6261 e36b 206e e5f8 f420 f7e5 e56b 20f7 e9f4 6880 9470 9470 616e eff4 68e5 f220 e570 e973 ef64 e520 efe6 91ae 91ae 4361 6e61 6461 2046 e9ec e573 9120 9120 ae80 942c 942c 8080 8080 942f 942f
+
+"""
diff --git a/tests/test_scc.py b/tests/test_scc.py
index e0b01d77..f6f16bac 100644
--- a/tests/test_scc.py
+++ b/tests/test_scc.py
@@ -392,6 +392,24 @@ def test_mid_row_codes_not_adding_space_if_there_is_one_before(
         ]
         assert expected_lines == actual_lines
 
+    def test_doubled_mid_row_before_punctuation_no_extra_space(
+        self,
+        sample_scc_doubled_mid_row_before_punctuation,
+    ):
+        caption_set = SCCReader().read(
+            sample_scc_doubled_mid_row_before_punctuation
+        )
+        captions = caption_set.get_captions("en-US")
+        text_nodes = [
+            node.content
+            for cap_ in captions
+            for node in cap_.nodes
+            if node.type_ == CaptionNode.TEXT
+        ]
+        full_text = "".join(text_nodes)
+        assert " ." not in full_text
+        assert full_text.endswith("Files.")
+
     def test_removing_spaces_at_end_of_lines(
         self,
         sample_scc_with_spaces_at_eol_pop,