diff --git a/.github/workflows/pythontest.yml b/.github/workflows/pythontest.yml index e77c613e69..609a0a933d 100644 --- a/.github/workflows/pythontest.yml +++ b/.github/workflows/pythontest.yml @@ -62,6 +62,8 @@ jobs: - 6379:6379 steps: - uses: actions/checkout@v6 + with: + fetch-depth: 0 - name: Set up minio run: | docker run -d -p 9000:9000 --name minio \ @@ -79,6 +81,17 @@ jobs: run: | # Use uv to install dependencies directly from requirements files uv pip sync requirements.txt requirements-dev.txt + - name: Lint new migrations for unsafe operations + if: github.event_name == 'pull_request' + env: + BASE_REF: ${{ github.base_ref }} + DJANGO_SETTINGS_MODULE: contentcuration.not_production_settings + run: | + set -euo pipefail + git fetch --no-tags origin "$BASE_REF" + base="$(git merge-base "origin/$BASE_REF" HEAD)" + test -n "$base" + python contentcuration/manage.py lintmigrations --git-commit-id "$base" --no-cache --warnings-as-errors - name: Test pytest run: | sh -c './contentcuration/manage.py makemigrations --check' diff --git a/Makefile b/Makefile index 27c222a7d2..2f7ff57212 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,8 @@ migrate: # 4) Remove the management command from this `deploy-migrate` recipe # 5) Repeat! deploy-migrate: - echo "Nothing to do here!" + # studio#5974: remove at cutover. + python contentcuration/manage.py backfill_column --model contentcuration.File --source-field file_size --target-field file_size_bigint contentnodegc: python contentcuration/manage.py garbage_collect diff --git a/contentcuration/contentcuration/db/backends/__init__.py b/contentcuration/contentcuration/db/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/__init__.py b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py new file mode 100644 index 0000000000..ad23a39030 --- /dev/null +++ b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py @@ -0,0 +1,12 @@ +from django_prometheus.db.backends.postgresql.base import ( + DatabaseWrapper as PrometheusDatabaseWrapper, +) +from django_zero_downtime_migrations.backends.postgres.schema import ( + DatabaseSchemaEditor, +) + + +class DatabaseWrapper(PrometheusDatabaseWrapper): + """Prometheus query metrics + zero-downtime safe-DDL schema editor.""" + + SchemaEditorClass = DatabaseSchemaEditor diff --git a/contentcuration/contentcuration/db/dual_write.py b/contentcuration/contentcuration/db/dual_write.py new file mode 100644 index 0000000000..604875670e --- /dev/null +++ b/contentcuration/contentcuration/db/dual_write.py @@ -0,0 +1,31 @@ +import hashlib + +import pgtrigger + + +def mirror_field(source, target): + """Mirror Django field `source` into `target` via a BEFORE INSERT/UPDATE + trigger (expand/contract dual-write).""" + + def decorator(model): + source_col = model._meta.get_field(source).column + target_col = model._meta.get_field(target).column + name = "mirror_{}_to_{}".format(source_col, target_col) + if len(name) > 43: # stay safely under pgtrigger's trigger-name limit + digest = hashlib.sha1( + "{}_{}".format(source_col, target_col).encode() + ).hexdigest()[:8] + name = "mirror_{}".format(digest) + # Change-guard (IS DISTINCT FROM): keeps a read cutover from clobbering + # writes to the repointed column with the stale source value. + trigger = pgtrigger.Trigger( + name=name, + when=pgtrigger.Before, + operation=pgtrigger.Insert | pgtrigger.Update, + func="IF NEW.{s} IS DISTINCT FROM OLD.{s} THEN NEW.{t} = NEW.{s}; END IF; RETURN NEW;".format( + s=source_col, t=target_col + ), + ) + return pgtrigger.register(trigger)(model) + + return decorator diff --git a/contentcuration/contentcuration/management/commands/backfill_column.py b/contentcuration/contentcuration/management/commands/backfill_column.py new file mode 100644 index 0000000000..eb99c6d09c --- /dev/null +++ b/contentcuration/contentcuration/management/commands/backfill_column.py @@ -0,0 +1,99 @@ +import time + +from django.apps import apps +from django.core.exceptions import FieldDoesNotExist +from django.core.management.base import BaseCommand +from django.core.management.base import CommandError +from django.db import transaction +from django.db.models import F + + +class Command(BaseCommand): + help = "Idempotent, resumable, throttled online backfill of one column into another, in batches." + + def add_arguments(self, parser): + parser.add_argument("--model", required=True, help="app_label.ModelName") + parser.add_argument("--source-field", required=True) + parser.add_argument("--target-field", required=True) + parser.add_argument("--batch-size", type=int, default=1000) + parser.add_argument( + "--sleep", type=float, default=0.1, help="seconds between batches" + ) + parser.add_argument("--start-id", default=None, help="resume from this pk") + parser.add_argument( + "--verify", + action="store_true", + help="report unbackfilled rows, exit nonzero if any", + ) + + def _resolve_model_fields(self, model_label, source, target): + try: + model = apps.get_model(model_label) + except (LookupError, ValueError) as e: + raise CommandError("Bad --model {!r}: {}".format(model_label, e)) + try: + model._meta.get_field(source) + model._meta.get_field(target) + except FieldDoesNotExist as e: + raise CommandError(str(e)) + return model + + def handle(self, *args, **options): + if options["sleep"] < 0: + raise CommandError("--sleep must be >= 0") + if options["batch_size"] < 1: + raise CommandError("--batch-size must be >= 1") + source = options["source_field"] + target = options["target_field"] + model = self._resolve_model_fields(options["model"], source, target) + + pk_name = model._meta.pk.name + batch_size, throttle = options["batch_size"], options["sleep"] + only_unfilled = {target + "__isnull": True, source + "__isnull": False} + unfilled = model.objects.filter(**only_unfilled) + unfilled_pks = unfilled.order_by(pk_name).values_list("pk", flat=True) + + if options["verify"]: + pending = unfilled.count() + self.stdout.write("{} rows still need backfill.".format(pending)) + if pending: + raise CommandError( + "backfill incomplete: {} rows pending".format(pending) + ) + return + + # Start at the first unfilled pk (>= --start-id if given); re-runs and + # resumes skip straight past an already-filled prefix. + lo = unfilled_pks + if options["start_id"] is not None: + lo = lo.filter(pk__gte=options["start_id"]) + lo = lo.first() + + total = 0 + while lo is not None: + # hi = batch_size-th pk at/after lo (None on the final short batch). + # Keyset paging by pk — works for any pk type, int or UUID. + hi = ( + model.objects.filter(pk__gte=lo) + .order_by(pk_name) + .values_list("pk", flat=True)[batch_size - 1 : batch_size] + .first() + ) + window = {"pk__gte": lo} if hi is None else {"pk__gte": lo, "pk__lte": hi} + with transaction.atomic(): + total += model.objects.filter(**window, **only_unfilled).update( + **{target: F(source)} + ) + self.stdout.write( + "backfilled through pk={} (updated {} so far)".format( + hi if hi is not None else lo, total + ) + ) + if hi is None: + break + lo = unfilled_pks.filter(pk__gt=hi).first() + # Throttle between batches so WAL generation / replication lag and autovacuum + # can keep up on large tables. Pass --sleep 0 to disable. + if throttle: + time.sleep(throttle) + self.stdout.write("Done. {} rows updated.".format(total)) diff --git a/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py b/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py new file mode 100644 index 0000000000..4ce8906620 --- /dev/null +++ b/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py @@ -0,0 +1,41 @@ +# Generated by Django 3.2.24 on 2026-06-23 05:56 +import pgtrigger.compiler +import pgtrigger.migrations +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + dependencies = [ + ("contentcuration", "0166_add_usersubscription"), + ] + + operations = [ + migrations.AddField( + model_name="file", + name="file_size_bigint", + field=models.BigIntegerField(blank=True, null=True), + ), + migrations.AddIndex( + model_name="file", + index=models.Index( + fields=["checksum", "file_size_bigint"], + name="file_checksum_fsizebig_idx", + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="file", + trigger=pgtrigger.compiler.Trigger( + name="mirror_file_size_to_file_size_bigint", + sql=pgtrigger.compiler.UpsertTriggerSql( + func="IF NEW.file_size IS DISTINCT FROM OLD.file_size THEN NEW.file_size_bigint = NEW.file_size; END IF; RETURN NEW;", + hash="051e321c4cdf91ea81f96b9f9a29e3b5015def67", + operation="INSERT OR UPDATE", + pgid="pgtrigger_mirror_file_size_to_file_size_bigint_54326", + table="contentcuration_file", + when="BEFORE", + ), + ), + ), + ] diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index ae2ab2b615..3d2e384eb5 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -79,6 +79,7 @@ from contentcuration.constants import feedback from contentcuration.constants import user_history from contentcuration.constants.contentnode import kind_activity_map +from contentcuration.db.dual_write import mirror_field from contentcuration.db.models.expressions import Array from contentcuration.db.models.functions import ArrayRemove from contentcuration.db.models.functions import Unnest @@ -3255,6 +3256,8 @@ class StagedFile(models.Model): FILE_DISTINCT_INDEX_NAME = "file_checksum_file_size_idx" +# studio#5974: bigint shadow of FILE_DISTINCT_INDEX_NAME, for the file_size widening. +FILE_DISTINCT_BIGINT_INDEX_NAME = "file_checksum_fsizebig_idx" FILE_MODIFIED_DESC_INDEX_NAME = "file_modified_desc_idx" FILE_DURATION_CONSTRAINT = "file_media_duration_int" MEDIA_PRESETS = [ @@ -3266,6 +3269,14 @@ class StagedFile(models.Model): ] +# studio#5974 cutover (next release, after backfill completes): +# - drop the @mirror_field decorator and the file_size_bigint field below +# - file_size = models.BigIntegerField(blank=True, null=True, db_column="file_size_bigint") +# - FILE_DISTINCT_BIGINT_INDEX_NAME -> fields=["checksum", "file_size"] +# - migration: swap the mirror trigger + SeparateDatabaseAndState state realignment +# studio#5974 contract (later release, no old pods left): migration drops the old +# file_size int column (IgnoreMigration) + the trigger; no model change. +@mirror_field("file_size", "file_size_bigint") # studio#5974: dual-write int->bigint class File(models.Model): """ The bottom layer of the contentDB schema, defines the basic building brick for content. @@ -3275,6 +3286,9 @@ class File(models.Model): id = UUIDField(primary_key=True, default=uuid.uuid4) checksum = models.CharField(max_length=400, blank=True, db_index=True) file_size = models.IntegerField(blank=True, null=True) + file_size_bigint = models.BigIntegerField( + blank=True, null=True + ) # studio#5974 shadow file_on_disk = models.FileField( upload_to=object_storage_name, storage=default_storage, @@ -3485,6 +3499,10 @@ class Meta: models.Index( fields=["checksum", "file_size"], name=FILE_DISTINCT_INDEX_NAME ), + models.Index( + fields=["checksum", "file_size_bigint"], + name=FILE_DISTINCT_BIGINT_INDEX_NAME, + ), models.Index(fields=["-modified"], name=FILE_MODIFIED_DESC_INDEX_NAME), ] constraints = [ diff --git a/contentcuration/contentcuration/not_production_settings.py b/contentcuration/contentcuration/not_production_settings.py index afcc6460bc..35be6db3c2 100644 --- a/contentcuration/contentcuration/not_production_settings.py +++ b/contentcuration/contentcuration/not_production_settings.py @@ -20,5 +20,14 @@ AWS_AUTO_CREATE_BUCKET = True +INSTALLED_APPS += ("django_migration_linter",) # noqa F405 + +MIGRATION_LINTER_OPTIONS = { + "exclude_apps": [ + "kolibri_content" + ], # SQLite content-export app; not on the safe-DDL Postgres backend + "sql_analyser": "postgresql", +} + # Use local instance for curriculum automation for development CURRICULUM_AUTOMATION_API_URL = "http://localhost:8000" diff --git a/contentcuration/contentcuration/production_settings.py b/contentcuration/contentcuration/production_settings.py index a00bf43a41..92d63cff73 100644 --- a/contentcuration/contentcuration/production_settings.py +++ b/contentcuration/contentcuration/production_settings.py @@ -39,7 +39,7 @@ if SITE_READ_ONLY: CACHES["default"]["BACKEND"] = "django_prometheus.cache.backends.locmem.LocMemCache" -DATABASES["default"]["ENGINE"] = "django_prometheus.db.backends.postgresql" +DATABASES["default"]["ENGINE"] = "contentcuration.db.backends.zero_downtime_prometheus" REST_FRAMEWORK["DEFAULT_RENDERER_CLASSES"] = [ diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index 2d8bafaa9b..c0fb988996 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -92,6 +92,7 @@ "django_celery_results", "kolibri_public", "automation", + "pgtrigger", ) SESSION_ENGINE = "django.contrib.sessions.backends.cached_db" @@ -193,7 +194,7 @@ DATABASES = { "default": { - "ENGINE": "django.db.backends.postgresql_psycopg2", + "ENGINE": "django_zero_downtime_migrations.backends.postgres", "NAME": os.getenv("DATA_DB_NAME") or "kolibri-studio", # For dev purposes only "USER": os.getenv("DATA_DB_USER") or "learningequality", @@ -204,6 +205,13 @@ }, } +ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT = "5s" +ZERO_DOWNTIME_MIGRATIONS_STATEMENT_TIMEOUT = "15s" +ZERO_DOWNTIME_MIGRATIONS_FLEXIBLE_STATEMENT_TIMEOUT = ( + True # don't kill long-but-safe ops (e.g. CREATE INDEX CONCURRENTLY) +) +ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = True # surface unsafe DDL at runtime + IS_CONTENTNODE_TABLE_PARTITIONED = ( os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False ) diff --git a/contentcuration/contentcuration/test_settings.py b/contentcuration/contentcuration/test_settings.py index a1fcf20e6b..208e25f919 100644 --- a/contentcuration/contentcuration/test_settings.py +++ b/contentcuration/contentcuration/test_settings.py @@ -2,6 +2,11 @@ DEBUG = True +# Test DB build replays full migration history, including legacy migrations pre-dating safe-DDL. +# Disabling RAISE_FOR_UNSAFE here is safe: runtime/production still enforces it, CI migration +# linter gates new migrations. This is the intended steady state; migration history is not rewritten. +ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = False + WEBPACK_LOADER["DEFAULT"][ # noqa "LOADER_CLASS" ] = "contentcuration.tests.webpack_loader.TestWebpackLoader" diff --git a/contentcuration/contentcuration/tests/test_backfill_column.py b/contentcuration/contentcuration/tests/test_backfill_column.py new file mode 100644 index 0000000000..1ecf3e151f --- /dev/null +++ b/contentcuration/contentcuration/tests/test_backfill_column.py @@ -0,0 +1,194 @@ +import uuid +from io import StringIO +from unittest.mock import patch + +from django.core.management import call_command +from django.core.management import CommandError +from django.db import connection +from django.db import models +from django.db.models import F +from django.test import SimpleTestCase +from django.test import TransactionTestCase +from django.test.utils import isolate_apps + + +def _make_probe_class(): + class Probe(models.Model): + source = models.IntegerField(null=True) + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + return Probe + + +def _make_uuid_probe_class(): + class UUIDProbe(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4) + source = models.IntegerField(null=True) + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + return UUIDProbe + + +@isolate_apps("contentcuration") +class BackfillColumnTestCase(TransactionTestCase): + def _create_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.create_model(model) + self.addCleanup(self._delete_model, model) + + def _delete_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.delete_model(model) + + def _run_backfill(self, model, **kwargs): + out = StringIO() + with patch( + "contentcuration.management.commands.backfill_column.apps", + model._meta.apps, + ): + call_command( + "backfill_column", + stdout=out, + model="contentcuration.{}".format(model.__name__), + source_field="source", + target_field="shadow", + **kwargs, + ) + return out.getvalue() + + def _assert_all_synced(self, model): + self.assertEqual( + model.objects.count(), model.objects.filter(shadow=F("source")).count() + ) + + def _assert_synced_from(self, model, resume_pk): + below = model.objects.filter(pk__lt=resume_pk) + at_or_above = model.objects.filter(pk__gte=resume_pk) + self.assertEqual(below.count(), below.filter(shadow__isnull=True).count()) + self.assertEqual( + at_or_above.count(), at_or_above.filter(shadow=F("source")).count() + ) + + def test_backfills_all_rows(self): + Probe = _make_probe_class() + self._create_model(Probe) + for i in range(1, 6): + Probe.objects.create(source=i * 10, shadow=None) + + # batch_size=2 over 5 rows exercises the multi-batch loop. + self._run_backfill(Probe, batch_size=2, sleep=0) + + self._assert_all_synced(Probe) + + def test_idempotent(self): + Probe = _make_probe_class() + self._create_model(Probe) + for i in range(1, 4): + Probe.objects.create(source=i * 10, shadow=None) + + self._run_backfill(Probe, batch_size=10, sleep=0) + output = self._run_backfill(Probe, batch_size=10, sleep=0) + + self.assertIn("Done. 0 rows updated.", output) + self._assert_all_synced(Probe) + + def test_resumable(self): + Probe = _make_probe_class() + self._create_model(Probe) + objs = sorted( + [Probe.objects.create(source=i * 10, shadow=None) for i in range(1, 6)], + key=lambda o: o.pk, + ) + resume_pk = objs[2].pk + + self._run_backfill(Probe, batch_size=10, sleep=0, start_id=resume_pk) + + self._assert_synced_from(Probe, resume_pk) + + def test_null_source_safe(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=None, shadow=None) + Probe.objects.create(source=42, shadow=None) + + output = self._run_backfill(Probe, batch_size=10, sleep=0) + + self.assertIn("Done.", output) + self.assertIsNone(Probe.objects.get(source__isnull=True).shadow) + self.assertEqual(Probe.objects.get(source=42).shadow, 42) + + def test_backfills_uuid_pk_across_batches(self): + """Regression: paging must not assume an integer pk (File has a UUID pk).""" + UUIDProbe = _make_uuid_probe_class() + self._create_model(UUIDProbe) + for i in range(1, 6): + UUIDProbe.objects.create(source=i * 10, shadow=None) + + # batch_size=2 forces the lower-bound advance where integer arithmetic on a + # UUID pk would blow up. + self._run_backfill(UUIDProbe, batch_size=2, sleep=0) + + self._assert_all_synced(UUIDProbe) + + def test_resumable_uuid_pk(self): + """--start-id must accept a UUID and resume from it.""" + UUIDProbe = _make_uuid_probe_class() + self._create_model(UUIDProbe) + objs = sorted( + [UUIDProbe.objects.create(source=i * 10, shadow=None) for i in range(1, 6)], + key=lambda o: o.pk, + ) + resume_pk = objs[2].pk + + self._run_backfill(UUIDProbe, batch_size=10, sleep=0, start_id=str(resume_pk)) + + self._assert_synced_from(UUIDProbe, resume_pk) + + def test_verify_passes_when_complete(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=1, shadow=1) + Probe.objects.create(source=None, shadow=None) # null source doesn't count + + output = self._run_backfill(Probe, verify=True) + + self.assertIn("0 rows still need backfill.", output) + + def test_verify_fails_and_writes_nothing_when_incomplete(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=7, shadow=None) + + with self.assertRaisesRegex( + CommandError, "backfill incomplete: 1 rows pending" + ): + self._run_backfill(Probe, verify=True) + + self.assertIsNone(Probe.objects.get(source=7).shadow) # --verify must not write + + +class BackfillColumnArgValidationTestCase(SimpleTestCase): + def _call(self, **kwargs): + call_command( + "backfill_column", + model="contentcuration.Channel", + source_field="name", + target_field="name", + **kwargs, + ) + + def test_negative_sleep_raises(self): + with self.assertRaisesRegex(CommandError, "--sleep must be >= 0"): + self._call(sleep=-1) + + def test_non_positive_batch_size_raises(self): + for bad in (0, -1): + with self.subTest(batch_size=bad): + with self.assertRaisesRegex(CommandError, "--batch-size must be >= 1"): + self._call(batch_size=bad) diff --git a/contentcuration/contentcuration/tests/test_dual_write.py b/contentcuration/contentcuration/tests/test_dual_write.py new file mode 100644 index 0000000000..f123b39ad8 --- /dev/null +++ b/contentcuration/contentcuration/tests/test_dual_write.py @@ -0,0 +1,51 @@ +from django.db import connection +from django.db import models +from django.test import TransactionTestCase +from django.test.utils import isolate_apps + +from contentcuration.db.dual_write import mirror_field + + +@isolate_apps("contentcuration") +class MirrorFieldTestCase(TransactionTestCase): + def _delete_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.delete_model(model) + + def test_long_field_names_truncate_trigger_name(self): + """Trigger name from long field names is truncated, not raised.""" + + @mirror_field("a_very_long_source_field_name", "a_very_long_target_field_name") + class LongNameProbe(models.Model): + a_very_long_source_field_name = models.IntegerField() + a_very_long_target_field_name = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + self.assertEqual(len(LongNameProbe._meta.triggers), 1) + self.assertLessEqual(len(LongNameProbe._meta.triggers[0].name), 43) + + def test_syncs_shadow_column_in_db(self): + @mirror_field("source", "shadow") + class Probe(models.Model): + source = models.IntegerField() + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + with connection.schema_editor(atomic=False) as editor: + editor.create_model(Probe) + self.addCleanup(self._delete_model, Probe) + for trigger in Probe._meta.triggers: + trigger.install(Probe) + + obj = Probe.objects.create(source=5) + obj.refresh_from_db() + self.assertEqual(obj.shadow, 5) + + obj.source = 9 + obj.save() + obj.refresh_from_db() + self.assertEqual(obj.shadow, 9) diff --git a/docs/_index.md b/docs/_index.md index c3e67006ba..072419ef13 100644 --- a/docs/_index.md +++ b/docs/_index.md @@ -17,6 +17,10 @@ - [Docker + Kubernetes Studio Instance Setup](./docker_kubernetes_setup.md) +## Database + +- [Zero-downtime migrations (expand/contract runbook)](./zero_downtime_migrations.md) + ## API - [API Endpoints](./api_endpoints.md) diff --git a/docs/zero_downtime_migrations.md b/docs/zero_downtime_migrations.md new file mode 100644 index 0000000000..85dd169abc --- /dev/null +++ b/docs/zero_downtime_migrations.md @@ -0,0 +1,106 @@ +# Zero-downtime migrations — expand/contract runbook + +On large tables (`File` has ~100 M rows) a single migration can cause downtime two ways: by taking an `ACCESS EXCLUSIVE` lock / rewriting the table, or by shipping a schema the still-running old pods can't use (a dropped or renamed column). The expand/contract procedure below avoids both. Two guards (already configured) enforce it; the rest of this doc is the step-by-step. + +## Guards (already configured) + +- **Safe-DDL backend** — `DATABASES["default"]["ENGINE"]` in `settings.py` (prod adds Prometheus via `contentcuration.db.backends.zero_downtime_prometheus`). Lock/statement timeouts and `RAISE_FOR_UNSAFE` make rewriting or exclusively-locking DDL fail at migrate time. See the `ZERO_DOWNTIME_MIGRATIONS_*` settings. +- **`django-migration-linter`** (CI, `pythontest.yml`) — flags backward-incompatible schema (drops, renames, NOT NULL adds) the old pods would break on. The backend doesn't judge this axis. + +`test_settings.py` sets `RAISE_FOR_UNSAFE = False` so the test DB can replay pre-guard legacy migrations; runtime and CI still enforce both guards. + +## Procedure + +Goal: replace a column (here `File.file_size` int → bigint) with no downtime. The physical column ends up named `file_size_bigint`, aliased back to the ORM field `file_size` via `db_column` — a live rename is backward-incompatible, so we don't rename. + +### Release 1 — expand + +Add the shadow field and the dual-write trigger: + +```python +from contentcuration.db.dual_write import mirror_field + +@mirror_field("file_size", "file_size_bigint") +class File(models.Model): + file_size = models.IntegerField(blank=True, null=True) + file_size_bigint = models.BigIntegerField(blank=True, null=True) +``` + +`makemigrations` emits a nullable `AddField` and the `CreateTrigger` — both safe (no rewrite, no lock). New writes now land in both columns. + +Backfill old rows in the same release: wire `backfill_column` as a `deploy-migrate` step (Makefile), which runs after `migrate`, so the column and trigger already exist. Re-run until it reports `0 rows updated`, then gate on `--verify` (read-only, exits nonzero while rows remain): + +```bash +python contentcuration/manage.py backfill_column \ + --model contentcuration.File --source-field file_size --target-field file_size_bigint --verify +``` + +### Release 2 — read cutover (state-only, no DDL) + +Repoint the ORM field at the shadow column and drop the shadow field + decorator: + +```python +class File(models.Model): + file_size = models.BigIntegerField(blank=True, null=True, db_column="file_size_bigint") + # file_size_bigint field and @mirror_field decorator removed +``` + +The migration realigns Django state only — the physical trigger stays live through the rollout: + +```python +migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RemoveField("file", "file_size_bigint"), + migrations.AlterField( + "file", "file_size", + models.BigIntegerField(blank=True, null=True, db_column="file_size_bigint"), + ), + pgtrigger.migrations.RemoveTrigger("file", "mirror_file_size_to_file_size_bigint"), + ], + database_operations=[], # columns and trigger physically unchanged +) +``` + +The single forward trigger covers the mixed-pod window: old pods write the int column → trigger mirrors to bigint; new pods write bigint → the change-guard sees the int column unchanged and skips. The only gap — an old pod missing a new pod's `> 2.1 GB` value — already fails as `integer out of range` on old pods today, so it's not a regression. + +### Release 3 — contract (drop the old column) + +`models.py` is unchanged from release 2. Drop the old physical column and trigger in the database only, after release 2 is fully deployed: + +```python +operations = [ + IgnoreMigration(), # safe: read-cutover release deployed; nothing references old file_size + migrations.SeparateDatabaseAndState( + state_operations=[], + database_operations=[ + migrations.RunSQL( + sql=( + "DROP TRIGGER IF EXISTS pgtrigger_mirror_file_size_to_file_size_bigint_54326" + " ON contentcuration_file;" + 'ALTER TABLE contentcuration_file DROP COLUMN "file_size";' + ), + reverse_sql='ALTER TABLE contentcuration_file ADD COLUMN "file_size" integer;', + ), + ], + ), +] +``` + +Raw SQL because cutover detached both objects from Django state: the column lost its field, and `RemoveTrigger` was applied state-only to keep the physical trigger alive. `RemoveTrigger.database_forwards` and `RemoveField` look their target up in migration state, so with nothing there they have nothing to drop. Copy the trigger `pgid` from release 1's `AddTrigger` operation. + +The linter flags the drop as backward-incompatible — expected; `IgnoreMigration()` acknowledges it's safe by sequencing. + +## Tooling + +- **`@mirror_field(source, target)`** (`contentcuration/db/dual_write.py`) — BEFORE INSERT/UPDATE trigger copying field `source` → `target`. Change-guarded: load-bearing, not an optimization — an unconditional copy corrupts data at cutover. +- **`backfill_column`** — idempotent, resumable (`--start-id `), throttled (`--sleep`, `--batch-size`); one transaction per batch. `--verify` counts remaining rows without writing and exits nonzero if any remain. +- **`lintmigrations`** — run locally before pushing: + ```bash + python contentcuration/manage.py lintmigrations --git-commit-id --no-cache --warnings-as-errors + ``` + `--git-commit-id` is a flag, not positional — a positional value is read as an app label and lints nothing. +- **`IgnoreMigration()`** — escape hatch for a migration whose backward-incompatibility is made safe by release sequencing (see release 3). Always comment the sequencing guarantee. + +## Reclaiming the physical name (usually skip) + +`db_column` makes the `file_size_bigint` physical name invisible to all ORM/app code; only raw SQL sees it. Matching the physical name to `file_size` requires a second expand/contract cycle — needed only for raw-SQL or external consumers. diff --git a/requirements-dev.in b/requirements-dev.in index 8a9224102c..7cd6acd113 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -8,3 +8,4 @@ pytest-timeout pre-commit==4.5.1 nodeenv drf-yasg==1.21.10 +django-migration-linter==6.0.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index ac6539f932..7391a7cb93 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,7 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements-dev.in --output-file requirements-dev.txt +# uv pip compile requirements-dev.in -o requirements-dev.txt +appdirs==1.4.4 + # via django-migration-linter asgiref==3.3.4 # via # -c requirements.txt @@ -11,10 +13,13 @@ distlib==0.3.9 django==3.2.24 # via # -c requirements.txt + # django-migration-linter # djangorestframework # drf-yasg django-concurrent-test-helper==0.7.0 # via -r requirements-dev.in +django-migration-linter==6.0.0 + # via -r requirements-dev.in djangorestframework==3.15.1 # via # -c requirements.txt @@ -91,6 +96,8 @@ sqlparse==0.4.1 # django tblib==1.7.0 # via django-concurrent-test-helper +toml==0.10.2 + # via django-migration-linter tomli==1.2.3 # via pytest typing-extensions==4.15.0 diff --git a/requirements.in b/requirements.in index 2d59962414..581fb71410 100644 --- a/requirements.in +++ b/requirements.in @@ -39,3 +39,5 @@ pydantic==2.12.5 latex2mathml==3.78.1 markdown-it-py==4.0.0 stripe>=5.0.0,<6.0.0 +django-pg-zero-downtime-migrations==0.13 +django-pgtrigger==4.11.0 diff --git a/requirements.txt b/requirements.txt index 0d66015b7a..2c4771b609 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in --output-file requirements.txt +# uv pip compile requirements.in -o requirements.txt amqp==5.1.1 # via kombu annotated-types==0.7.0 @@ -58,6 +58,8 @@ django==3.2.24 # django-filter # django-js-reverse # django-model-utils + # django-pg-zero-downtime-migrations + # django-pgtrigger # django-redis # django-registration # django-s3-storage @@ -79,6 +81,10 @@ django-model-utils==5.0.0 # via -r requirements.in django-mptt==0.16.0 # via -r requirements.in +django-pg-zero-downtime-migrations==0.13 + # via -r requirements.in +django-pgtrigger==4.11.0 + # via -r requirements.in django-postmark==0.1.6 # via -r requirements.in django-prometheus==2.3.1