# Email handling for the translate tagger (app.py), introduced by the change
# "handling email addresses in translate tagger".

# Matches a plain email address such as ``user.name+tag@example.org``.
email_regrex = re.compile(r'[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}')

# One translation unit. DOTALL because a unit may span several lines.
_translate_unit = re.compile(r'<translate>(.*?)</translate>', re.DOTALL)

# Any HTML-like tag; stripped before judging whether text is a short label.
_inline_tag = re.compile(r'<[^>]+>')

# Text allowed to follow the email in a "label + email" unit.
_TRAILING_PUNCTUATION = frozenset({'', '.', ',', ';', ':', '!', '?'})


def email_to_nospam(addr):
    """Return *addr* rewritten as a ``{{nospam|user|domain}}`` template call.

    The address is split at its first ``@``.

    Raises:
        ValueError: if *addr* contains no ``@``.

    >>> email_to_nospam('foo@bar.com')
    '{{nospam|foo|bar.com}}'
    """
    user, domain = addr.split('@', 1)
    return f'{{{{nospam|{user}|{domain}}}}}'


def _is_short_label(prefix, suffix):
    """Tell whether a unit is just a short label followed by an email.

    *prefix* and *suffix* are the unit text before and after the email.
    Tags are ignored; the suffix must be empty or a single punctuation mark
    and the prefix must be shorter than 50 characters and at most 10 words.
    """
    plain_prefix = _inline_tag.sub('', prefix).strip()
    plain_suffix = _inline_tag.sub('', suffix).strip()
    return (
        plain_suffix in _TRAILING_PUNCTUATION
        and len(plain_prefix) < 50
        and len(plain_prefix.split()) <= 10
    )


def _rewrite_unit(match):
    """Rewrite one ``<translate>`` unit (a regex match) that may hold emails."""
    content = match.group(1)
    # Scan once and reuse the match objects for both branches below.
    found = list(email_regrex.finditer(content))
    if not found:
        return match.group(0)

    # A lone email after a short label is not translatable text: move it
    # outside the unit so translators only see the label.
    if len(found) == 1:
        only = found[0]
        prefix = content[:only.start()]
        suffix = content[only.end():]
        if _is_short_label(prefix, suffix):
            nospam = email_to_nospam(only.group(0))
            if not prefix.strip():
                # Nothing left to translate, so the unit disappears entirely.
                return f'{nospam}{suffix}'
            return f'<translate>{prefix.rstrip()}</translate> {nospam}{suffix}'

    # Otherwise the email is part of a sentence: keep it in place, but shield
    # it from translators with a numbered tvar.
    # NOTE(review): the "emailN" variable naming is an assumption - confirm it
    # matches the tvar naming used elsewhere in this file.
    pieces = []
    cursor = 0
    for number, hit in enumerate(found, start=1):
        pieces.append(content[cursor:hit.start()])
        nospam = email_to_nospam(hit.group(0))
        pieces.append(f'<tvar name="email{number}">{nospam}</tvar>')
        cursor = hit.end()
    pieces.append(content[cursor:])
    return '<translate>' + ''.join(pieces) + '</translate>'


def process_emails_in_units(text):
    """Protect email addresses found inside ``<translate>`` units of *text*.

    Each address becomes a ``{{nospam|user|domain}}`` call. When a unit is
    only a short label plus one address (optionally followed by a single
    punctuation mark), the address is moved out of the unit; otherwise every
    address stays in place wrapped in a numbered ``<tvar>``. Text outside
    ``<translate>`` units, and units without addresses, are returned as is.

    >>> process_emails_in_units('<translate>Contact: foo@bar.com</translate>')
    '<translate>Contact:</translate> {{nospam|foo|bar.com}}'
    """
    return _translate_unit.sub(_rewrite_unit, text)
def convert_to_translatable_wikitext(wikitext): while curr < text_length : found = None - if wikitext[curr] == '=': - # Find the end of the line - end_line = wikitext.find('\n', curr) - if end_line == -1: - end_line = text_length - line = wikitext[curr:end_line] - if re.match(r'^(=+)[^=]+(=+)$', line.strip()): - if last < curr: - parts.append((wikitext[last:curr], _wrap_in_translate)) - parts.append((line, process_section_heading)) - curr = end_line - last = curr - continue # Syntax highlight block pattern = ' Date: Sun, 28 Jun 2026 13:30:36 +0530 Subject: [PATCH 2/2] adding tests --- tests.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests.py b/tests.py index 82f1c37..ce6786c 100644 --- a/tests.py +++ b/tests.py @@ -271,5 +271,30 @@ def test_italic_text(self): ), "[[m:Special:MyLanguage/Main Page|Main Page]]" ) + + def test_email_label_only_pushed_outside(self): + self.assertEqual( + convert_to_translatable_wikitext("Contact: foo@bar.com"), + "Contact: {{nospam|foo|bar.com}}" + ) + + def test_email_already_in_nospam_untouched(self): + self.assertEqual( + convert_to_translatable_wikitext("Email: {{nospam|foo|bar.com}}"), + "Email: {{nospam|foo|bar.com}}" + ) + + def test_email_with_trailing_punctuation(self): + self.assertEqual( + convert_to_translatable_wikitext("Write to: hello@example.org."), + "Write to: {{nospam|hello|example.org}}." + ) + + def test_email_with_surrounding_text_keeps_tvar(self): + self.assertEqual( + convert_to_translatable_wikitext("Send questions to info@wiki.org or use the form."), + 'Send questions to {{nospam|info|wiki.org}} or use the form.' + ) + if __name__ == '__main__': unittest.main(exit=False, failfast=True)