[sc-198001] Fix out of memory while exporting large datasets + cache optimizations (#12)

RoyTeddy · web-flow · commit 34ca92a94f68 · 2024-10-03T09:08:41.000+02:00
* [sc-198001] Fix out of memory while exporting large datasets + cache optimizations
* [sc-198001] Remove useless cache border check
* [sc-198001] PR review fixes
* Fix linter issues
* Optimization: use openpyxl in write only mode with lxml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## [Version 2.1.0](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v2.1.0) - Major release - 2024-09
+- Bug fix: one temporary workbook is used per dataset to avoid out-of-memory issues while exporting large datasets. All these temporary workbooks are merged at the end to generate the final Excel file.
+- Optimizations: use of a style cache to avoid useless copies, and openpyxl write-only mode with lxml
+
 ## [Version 2.0.0](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v2.0.0) - Major release - 2024-07
 - Important: Column type changed! From this version, cell types in Excel will reflect the storage type in DSS. For example, a string column containing only numbers will be exported as a text column. If you want a number column in Excel, you need to have an integer/float column in DSS
 - Export dataset conditional formatting colors (colors the cells, does not export rules)
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ This plugin relies on the [openpyxl](https://openpyxl.readthedocs.io/en/stable/)
 
 Once the plugin is successfully installed, select the datasets that you want to export as one excel file. 
 Then run the Multi-Sheet Excel Export recipe from the flow. 
-It will create a folder in your flow containing the output `.xls` file. Each sheet of this file contains one dataset and is named after this dataset.
+It will create a folder in your flow containing the output `.xlsx` file. Each sheet of this file contains one dataset and is named after this dataset.
  
 ## Running tests
 
@@ -24,4 +24,4 @@ In order to run the tests contained in `python-test\`, launch the following comm
 
 Copyright 2020-2022 Dataiku SAS
 
-This plugin is distributed under the Apache License version 2.0
+This plugin is distributed under the Apache License version 2.0
diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -1,2 +1,3 @@
 openpyxl==3.0.6
 pathvalidate==2.3.0
+lxml==5.3.0
diff --git a/custom-recipes/to-excel/recipe.py b/custom-recipes/to-excel/recipe.py
@@ -18,22 +18,23 @@
 from typing import Union
 
 DEFAULT_DATAIKU_SHEET_NAME = "Sheet1"
-READ_CHUNK_SIZE = 1024 * 1024 # 1Mbytes
+READ_CHUNK_SIZE = 1024 * 1024  # 1Mbytes
+
 
 def get_excel_worksheet(dataset: dataiku.Dataset, apply_conditional_formatting: bool) -> Union[Workbook, None]:
-    logger.info(f"Getting Excel workbook from DSS dataset {dataset.short_name}")
+    logger.info(f"Getting Excel workbook from DSS dataset '{dataset.short_name}'...")
     workbook = None
     with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
-        with dataset.raw_formatted_data(format="excel", format_params={ "applyColoring": apply_conditional_formatting }) as stream:
+        with dataset.raw_formatted_data(format="excel", format_params={"applyColoring": apply_conditional_formatting}) as stream:
             # read stream in chunks to save RAM
             chunk_size = READ_CHUNK_SIZE
             while True:
                 chunk = stream.read(chunk_size)
                 if not chunk:
                     break
                 tmp_file.write(chunk)
-        tmp_file.flush() # Make sure file is written on disk
-        tmp_file.seek(0) # Read back from start of file to load it in the workbook
+        tmp_file.flush()  # Make sure file is written on disk
+        tmp_file.seek(0)  # Read back from start of file to load it in the workbook
 
         # DEV WARNING : Excel exported file contains header row in Calibri and rest in Aptos Narrow font. But load_workbook converts everything into Calibri
         workbook = load_workbook(tmp_file)
@@ -42,10 +43,10 @@ def get_excel_worksheet(dataset: dataiku.Dataset, apply_conditional_formatting:
         if DEFAULT_DATAIKU_SHEET_NAME in workbook:
             return workbook[DEFAULT_DATAIKU_SHEET_NAME]
         elif len(workbook.sheetnames) == 1:
-            logger.warn(f"Default DSS default sheet name has changed from {DEFAULT_DATAIKU_SHEET_NAME} to {workbook.sheetnames[0]}")
+            logger.warning(f"Default DSS default sheet name has changed from '{DEFAULT_DATAIKU_SHEET_NAME}' to '{workbook.sheetnames[0]}'")
             return workbook[workbook.sheetnames[0]]
 
-    logger.error("Error getting Excel workbook from DSS dataset {dataset.short_name}, this dataset will not be exported")
+    logger.error(f"Error getting Excel workbook from DSS dataset '{dataset.short_name}', this dataset will not be exported")
     return None
 
 
diff --git a/plugin.json b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id" : "multisheet-excel-export",
-    "version" : "2.0.0",
+    "version" : "2.1.0",
 
 
     "meta" : {
diff --git a/python-lib/xlsx_writer.py b/python-lib/xlsx_writer.py
diff --git a/tests/python/unit/test_multi_sheet_export.py b/tests/python/unit/test_multi_sheet_export.py

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`openpyxl==3.0.6`
`2`	`2`	`pathvalidate==2.3.0`
	`3`	`+lxml==5.3.0`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"id" : "multisheet-excel-export",`
`3`		`- "version" : "2.0.0",`
	`3`	`+ "version" : "2.1.0",`
`4`	`4`
`5`	`5`
`6`	`6`	`"meta" : {`