Skip to content
Merged
166 changes: 166 additions & 0 deletions notebooks/mt_browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import marimo

# Version string recorded in the notebook file; presumably the marimo release
# that wrote it (TODO confirm against marimo's file-format docs).
__generated_with = "0.22.0"
# Notebook application object; every cell below registers on it via @app.cell.
app = marimo.App(width="medium")


@app.cell
def _():
    # Make the marimo API available to the other cells under the alias `mo`.
    import marimo as mo

    return (mo,)


@app.cell
def _(mo):
    # Notebook title and overview; the text is kept in a cell-private local
    # and rendered as this cell's output.
    _intro = """
# Phase 1 Machine Translation Browser

Browse Phase 1 machine translations side-by-side with source, reference, and evaluation
scores (chrF, COMET, CometKiwi). Translations and backtranslations are shown in separate
sections.
"""
    mo.md(_intro)
    return


@app.cell
def _():
    # pathlib is used to build the data directory path; polars reads and
    # joins the translation and score tables in later cells.
    import pathlib

    import polars as pl

    return (pathlib, pl)


@app.cell
def _(mo):
    # Section header explaining how the data location is configured.
    _config_notes = """
## Configuration

Set `DATA_DIR` to the phase-1 data directory (e.g. the project drive or TigerData mount).
All data files are resolved relative to this path.
"""
    mo.md(_config_notes)
    return


@app.cell
def _(pathlib):
    # Root directory for all Phase 1 data files read by this notebook.
    # Edit this to point at the project drive / TigerData mount; by default it
    # is "data/Phase 1" two directory levels above this notebook file.
    _two_levels_up = pathlib.Path(__file__).parent.parent
    DATA_DIR = _two_levels_up.joinpath("data", "Phase 1")
    return (DATA_DIR,)


@app.cell
def _(DATA_DIR, pl):
    # Build the sentence-level table: one JSONL translation file and one CSV
    # score file per model, stacked and then left-joined on "tr_id" so every
    # translation row is kept even when it has no score row.
    # Reads the complete notion-sents corpus (all Notion concepts), not the
    # annotation subset.
    _models = ["google_tllm", "hymt", "gemma"]

    _meta = pl.concat(
        [pl.read_ndjson(DATA_DIR / f"notion-sents/mt-sents-{m}.jsonl") for m in _models]
    )
    _scores = pl.concat(
        [pl.read_csv(DATA_DIR / f"notion-sents/eval-sents-{m}.csv") for m in _models]
    )

    _joined = _meta.join(_scores, on="tr_id", how="left")
    # Round the three metric columns to three decimals for display.
    sents_df = _joined.with_columns(
        [pl.col(name).round(3) for name in ("chrf", "comet", "cometkiwi")]
    )
    return (sents_df,)


@app.cell
def _(DATA_DIR, pl):
    # Build the paragraph-level table: one JSONL translation file and one CSV
    # score file per model, stacked and then left-joined on "tr_id" so every
    # translation row is kept even when it has no score row.
    _models = ["google_tllm", "hymt", "gemma"]

    _meta = pl.concat(
        [pl.read_ndjson(DATA_DIR / f"mto-pars/mt-pars-{m}.jsonl") for m in _models]
    )
    _scores = pl.concat(
        [pl.read_csv(DATA_DIR / f"mto-pars/eval-pars-{m}.csv") for m in _models]
    )

    _joined = _meta.join(_scores, on="tr_id", how="left")
    # Round the three metric columns to three decimals for display.
    pars_df = _joined.with_columns(
        [pl.col(name).round(3) for name in ("chrf", "comet", "cometkiwi")]
    )
    return (pars_df,)


@app.cell
def _(mo):
    # Heading for the first table (sentence translations).
    _heading = """
## 1. Sentence translations
"""
    mo.md(_heading)
    return


@app.cell
def _(pl, sents_df):
    # Sentence rows whose source language is not English, reduced to the
    # identifier, language, score and text columns. The final expression is
    # the table this cell displays.
    _shown_columns = [
        "pair_id",
        "model",
        "src_lang",
        "tr_lang",
        "chrf",
        "comet",
        "cometkiwi",
        "src_text",
        "ref_text",
        "tr_text",
    ]
    _non_english_source = pl.col("src_lang") != "en"
    sents_df.filter(_non_english_source).select(_shown_columns)
    return


@app.cell
def _(mo):
    # Heading for the second table (sentence backtranslations).
    _heading = """
## 2. Sentence backtranslations
"""
    mo.md(_heading)
    return


@app.cell
def _(pl, sents_df):
    # Sentence rows whose source language is English, reduced to the
    # identifier, language, score and text columns. The final expression is
    # the table this cell displays.
    _shown_columns = [
        "pair_id",
        "model",
        "src_lang",
        "tr_lang",
        "chrf",
        "comet",
        "cometkiwi",
        "src_text",
        "ref_text",
        "tr_text",
    ]
    _english_source = pl.col("src_lang") == "en"
    sents_df.filter(_english_source).select(_shown_columns)
    return


@app.cell
def _(mo):
    # Heading for the third table (paragraph translations).
    _heading = """
## 3. Paragraph translations
"""
    mo.md(_heading)
    return


@app.cell
def _(pars_df, pl):
    # Paragraph rows whose source language is not English, reduced to the
    # identifier, language, score and text columns. The final expression is
    # the table this cell displays.
    _shown_columns = [
        "pair_id",
        "model",
        "src_lang",
        "tr_lang",
        "chrf",
        "comet",
        "cometkiwi",
        "src_text",
        "ref_text",
        "tr_text",
    ]
    _non_english_source = pl.col("src_lang") != "en"
    pars_df.filter(_non_english_source).select(_shown_columns)
    return


@app.cell
def _(mo):
    # Heading for the fourth table (paragraph backtranslations).
    _heading = """
## 4. Paragraph backtranslations
"""
    mo.md(_heading)
    return


@app.cell
def _(pars_df, pl):
    # Paragraph rows whose source language is English, reduced to the
    # identifier, language, score and text columns. The final expression is
    # the table this cell displays.
    _shown_columns = [
        "pair_id",
        "model",
        "src_lang",
        "tr_lang",
        "chrf",
        "comet",
        "cometkiwi",
        "src_text",
        "ref_text",
        "tr_text",
    ]
    _english_source = pl.col("src_lang") == "en"
    pars_df.filter(_english_source).select(_shown_columns)
    return


# Run the notebook's cells when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ classifiers = [
]
dependencies = [
"pip", # required for downloading spaCy model in uv-created virtualenvs
"marimo>=0.13.11",
"marimo>=0.22.0",
"polars>=0.20.4",
"numpy",
"ipython", # Required by transformers for trainer functionality
Expand Down Expand Up @@ -56,6 +56,8 @@ path = "src/muse/__init__.py"
target-version = "py312"
# Configure src path so ruff import fixes can identify local imports
src = [ "src" ]
# Ignore notebooks — marimo notebooks are not linted
extend-exclude = [ "notebooks/*.py" ]

[tool.ruff.lint]
# Include these rules in addition to ruff's defaults
Expand Down
53 changes: 48 additions & 5 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading