class Command(BaseCommand):
    """
    Management command that archives one tree of a channel to a JSON file.

    Positional argument `channel_id` selects the channel; the optional
    `--tree` argument (default 'main') selects which of its trees to archive.
    """

    def add_arguments(self, parser):
        parser.add_argument("channel_id", type=str)
        parser.add_argument("--tree", default='main')

    def handle(self, *args, **options):
        # Delegate the actual work; the helper returns the path it wrote to.
        saved_path = archive_channel_tree(
            options["channel_id"],
            tree=options["tree"],
        )

        # Guard clause: nothing on disk at the returned path means failure.
        if not os.path.exists(saved_path):
            logging.warning("Channel archive failed.")
            return

        self.stdout.write("Archive saved to {}".format(saved_path))
# TASKS
################################################################################


def archive_channel_tree(channel_id, tree="main"):
    """
    Serialize the `tree`_tree of channel `channel_id` to JSON, write it to the
    archives dir, and return the path of the file that was written.
    """
    channel = Channel.objects.get(id=channel_id)
    tree_name = tree + "_tree"

    # Serialize the whole node hierarchy, starting from the tree's root node.
    node_data = ContentNodeArchiveSerializer(getattr(channel, tree_name)).data

    # Serialize the channel metadata; it becomes the top-level JSON object.
    archive = ChannelMetadataArchiveSerializer(channel).data

    # Graft root-node attributes onto the channel object.
    # TODO: review if all these are necessary and archive-worthy
    archive["children"] = node_data["children"]
    archive["tree_name"] = tree_name  # to know what we're archiving
    root_keys = (
        "tree_id",  # to know what we're archiving
        "created",
        "modified",
        "extra_fields",
        "publishing",
        "published",
        "complete",
        "changed",
        "freeze_authoring_data",
    )
    for key in root_keys:
        archive[key] = node_data[key]

    # dict -> json (non-ASCII characters are kept as-is, not escaped)
    payload = json.dumps(archive, indent=4, ensure_ascii=False)

    # Save under <channel_id>_<tree>_<timestamp>.json
    stamp = datetime.now().strftime("%Y-%m-%d__%H%M")
    filename_ext = "_".join([channel_id, tree, stamp]) + ".json"
    return tmpcontent_write(filename_ext, payload)


# ARCHIVAL SERIALIZERS
################################################################################

# NOTE: the NODE_ATTRIBUTES was obtained from node.__dict__.keys() and is a complete
# picture of a content node (maybe too complete!).
# This is because ContentNodeSerializer only outputs a subset of fields
# https://github.com/learningequality/studio/blob/develop/contentcuration/contentcuration/viewsets/contentnode.py#L219-L238
# while for archival purposes we want to have the complete picture.
# TODO: do not reinvent the wheel: try to use ContentNodeSerializer if possible
# so as not to create a "custom json" format but reuse same structure as API json.

NODE_ATTRIBUTES = [
    # ids
    "kind_id", "id", "source_domain", "source_id", "content_id", "node_id",
    # data
    "title", "description", "language", "author", "aggregator", "provider",
    "thumbnail_encoding",
    # licensing metadata
    "license_id", "license_description", "copyright_holder",
    # domain-specific metadata
    "role_visibility",
    # content provenance
    "original_node_id", "cloned_source_id", "original_channel_id",
    "source_channel_id", "original_source_node_id", "source_node_id",
    # workflows
    "publishing", "published", "complete", "changed",
    "freeze_authoring_data",  # needed?
    # structural
    "parent_id", "sort_order",
    # via MPTTModel
    "tree_id",
    "level",  # TODO: remove me (info not needed)
    "lft",
    "rght",  # TODO: remove me (info not needed)
    # timestamps
    "created", "modified",
    # kind-specific extended attributes
    "extra_fields",
]

NODE_RELATIONS = ["children", "files", "assessment_items"]


# copied from
# https://github.com/learningequality/studio/blob/develop/contentcuration/contentcuration/viewsets/file.py#L74-L95


class FileArchiveSerializer(FileSerializer):
    """File serializer variant used when archiving a channel tree."""

    class Meta:
        model = File
        fields = (
            "id", "checksum", "file_size", "language", "file_format",
            "contentnode_id", "assessment_item_id", "file_on_disk",
            "preset_id", "language_id", "original_filename", "uploaded_by",
        )
        # NOTE(review): carried over from the viewset code this was copied
        # from; confirm whether anything reads it on a serializer Meta.
        field_map = {
            "url": retrieve_storage_url,
            "preset": "preset_id",
            "language": "language_id",
            "contentnode": "contentnode_id",
            "assessment_item": "assessment_item_id",
        }


# copied from
# https://github.com/learningequality/studio/blob/develop/contentcuration/contentcuration/viewsets/assessmentitem.py#L202-L218


class AssessmentItemArchiveSerializer(AssessmentItemSerializer):
    """Assessment item serializer variant used when archiving a channel tree."""

    class Meta:
        model = AssessmentItem
        fields = (
            "question", "type", "answers", "contentnode_id", "assessment_id",
            "hints", "raw_data", "order", "source_url", "randomize", "deleted",
        )
        # NOTE(review): carried over from the viewset code this was copied
        # from; confirm whether anything reads it on a serializer Meta.
        field_map = {"contentnode": "contentnode_id"}


class ContentNodeArchiveSerializer(serializers.ModelSerializer):
    """
    This is a read-only content node serializer used for channel archiving.
    """

    files = FileArchiveSerializer(many=True)
    assessment_items = AssessmentItemArchiveSerializer(many=True)
    # TODO: finish all fields (reusing existing serializers as much as possible)
    #   tags = TagField(required=False)?
    #   license as nested obj?
    #   prerequisites?

    class Meta:
        model = ContentNode
        fields = NODE_ATTRIBUTES + NODE_RELATIONS

    def get_fields(self):
        # The class name is not bound while the class body executes, so the
        # self-referencing `children` field is attached here instead.
        declared = super().get_fields()
        declared["children"] = ContentNodeArchiveSerializer(many=True)
        return declared


CHANNEL_ATTRIBUTES = [
    "id", "name", "description", "tagline", "version",
    "thumbnail", "thumbnail_encoding", "language_id",
    "trash_tree_id", "clipboard_tree_id", "main_tree_id",
    "staging_tree_id", "chef_tree_id", "previous_tree_id",
    "deleted", "public", "preferences", "content_defaults", "priority",
    "last_published", "source_url", "demo_server_url",
    "source_id", "source_domain", "ricecooker_version",
    "published_data", "icon_encoding",
    "total_resource_count", "published_kind_count", "published_size",
]

CHANNEL_RELATIONS = []


class ChannelMetadataArchiveSerializer(serializers.ModelSerializer):
    """
    This is a read-only channel metadata serializer used for channel archiving.
    """

    # TODO: finish all fields
    #   editors?
    #   viewers?
    #   secret_tokens

    class Meta:
        model = Channel
        fields = CHANNEL_ATTRIBUTES + CHANNEL_RELATIONS


# SAVE
################################################################################

settings_ARCHIVES_ROOT = "tmpcontent/archives"
# TODO: move me to GCP bucket.
# This dir could be a sibling of content/databases
# and content/storage, like content/archives/jsontrees/{channel_id}/?/?.json


def tmpcontent_write(filename_ext, jsondata, archives_root=None):
    """
    Write the string `jsondata` to `filename_ext` inside the archives directory.

    :param filename_ext: file name (including extension) of the archive file.
    :param jsondata: the already-serialized JSON text to store.
    :param archives_root: directory to write into; defaults to the module-level
        `settings_ARCHIVES_ROOT` when not given.
    :return: the path of the file that was written.
    :raises OSError: if the directory cannot be created or the file written.
    """
    if archives_root is None:
        archives_root = settings_ARCHIVES_ROOT

    # Create the directory on demand rather than once at import time, so a
    # write still succeeds if the directory was removed after import.
    os.makedirs(archives_root, exist_ok=True)

    save_to_path = os.path.join(archives_root, filename_ext)
    # The JSON may contain raw non-ASCII characters, so the encoding is pinned
    # to UTF-8 instead of depending on the platform's locale default.
    with open(save_to_path, "w", encoding="utf-8") as outf:
        outf.write(jsondata)
    return save_to_path


# TODO (continued): replace tmpcontent_write; sample code below
# def write(self, *args, **kwargs):
#     try:
#         filepath = self.get_write_to_path()
#         self._write_details(filepath)
#         saved_filename = "{}.{}".format(self.filename, self.ext)
#         save_to_path = os.path.sep.join([settings_settings_ARCHIVES_ROOT, saved_filename])
#         # Write file to default storage
#         with open(filepath, 'rb') as fobj:
#             default_storage.save(save_to_path, fobj)
#         return save_to_path
#     finally:
#         self.delete_tempfiles()