[sc-184418] support dataset conditional formatting (#10)

louisbarjon · Ellana42 · web-flow · commit 7e6607291fd0 · 2024-07-12T13:58:16.000+02:00
* [sc-184418] support dataset conditional formatting in exported workbook

* [sc-184418] fix unit test

* [sc-182181] add a checkbox to apply CF

* [sc-18441] draft migrate to tmp file

* [sc-18441] code review comments + write on tmp file using chunks

* [sc-18441] update README

* Handle sheet names longer than 31 characters

* add unit test

* add unit test

* type readme

Co-authored-by: Mathilde K. <58742217+Ellana42@users.noreply.github.com>

* code review comments

* remove style header useless

* Add warning about Aptos Narrow font

* Fix wrong tag

---------

Co-authored-by: Mathilde K. <58742217+Ellana42@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -58,6 +58,7 @@ unit.xml
 .hypothesis/
 .pytest_cache/
 cover/
+tests/allure_report/
 
 # Translations
 *.mo
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## [Version 2.0.0](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v2.0.0) - Major release - 2024-07
+- Important: column types changed! From this version on, cell types in Excel reflect the storage type in DSS. For example, a string column containing only numbers is exported as a text column. If you want a number column in Excel, the column must have an integer/float type in DSS
+- Export dataset conditional formatting colors (colors the cells, does not export rules)
+- Bug fix: datasets with date types can now be exported
+
 ## [Version 1.1.4](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v1.1.4) - Bug release - 2024-06
 - Fix numpy issue with DSS 13
 
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
-# Plugin information
+# Multisheet excel export Plugin
 
 This plugin converts several DSS datasets to one multi-sheet excel (`.xlsx`) file containing one sheet per input dataset.
+More information in the [Documentation](https://www.dataiku.com/product/plugins/multisheet-excel-export/)
 
 # Prerequisites
 
@@ -16,4 +17,11 @@ It will create a folder in your flow containing the output `.xls` file. Each she
 ## Running tests
 
 In order to run the tests contained in `python-test\`, launch the following command from the plugin root directory: 
-`PYTHONPATH=$PYTHONPATH:/path/to/python-lib pytest`
+`PYTHONPATH=$PYTHONPATH:/path/to/python-lib pytest`
+
+
+### Licence
+
+Copyright 2020-2022 Dataiku SAS
+
+This plugin is distributed under the Apache License version 2.0
diff --git a/custom-recipes/to-excel/recipe.json b/custom-recipes/to-excel/recipe.json
@@ -37,8 +37,15 @@
             "type": "STRING",
             "defaultValue": "output",
             "mandatory": true
+        },
+        {
+            "name": "export_conditional_formatting",
+            "label": "Apply conditional formatting",
+            "description": "Color cells by rules, when applicable",
+            "type": "BOOLEAN",
+            "defaultValue": false,
+            "mandatory": true
         }
-
     ],
     "resourceKeys" : []
 }
diff --git a/custom-recipes/to-excel/recipe.py b/custom-recipes/to-excel/recipe.py
@@ -13,8 +13,41 @@
 from dataiku.customrecipe import get_input_names_for_role
 from dataiku.customrecipe import get_output_names_for_role
 from dataiku.customrecipe import get_recipe_config
+from openpyxl import load_workbook, Workbook
+from xlsx_writer import datasets_to_xlsx
+from typing import Union
+
+DEFAULT_DATAIKU_SHEET_NAME = "Sheet1"
+READ_CHUNK_SIZE = 1024 * 1024 # 1Mbytes
+
+def get_excel_worksheet(dataset: dataiku.Dataset, apply_conditional_formatting: bool) -> Union[Workbook, None]:
+    """
+    Export a DSS dataset to Excel through a temporary file and return the resulting sheet.
+
+    NOTE(review): the return annotation says ``Workbook`` but the value returned is a single
+    sheet picked from the loaded workbook (``workbook[<sheet name>]``); the annotation should
+    be corrected once the worksheet type is imported in this module.
+
+    :param dataset: the DSS dataset to export
+    :param apply_conditional_formatting: forwarded as the "applyColoring" format parameter of the excel export
+    :return: the sheet named DEFAULT_DATAIKU_SHEET_NAME if the workbook has one, else the only sheet
+             of the workbook when it contains exactly one sheet, else None (an error is logged)
+    """
+    logger.info(f"Getting Excel workbook from DSS dataset {dataset.short_name}")
+    with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
+        export_params = {"applyColoring": apply_conditional_formatting}
+        with dataset.raw_formatted_data(format="excel", format_params=export_params) as stream:
+            # Read the stream in chunks to save RAM
+            while True:
+                chunk = stream.read(READ_CHUNK_SIZE)
+                if not chunk:
+                    break
+                tmp_file.write(chunk)
+        tmp_file.flush()  # Make sure file is written on disk
+        tmp_file.seek(0)  # Read back from start of file to load it in the workbook
+
+        # DEV WARNING : Excel exported file contains header row in Calibri and rest in Aptos Narrow font. But load_workbook converts everything into Calibri
+        # The workbook must be loaded before leaving the "with" block: the temporary file is deleted on exit
+        workbook = load_workbook(tmp_file)
+
+    if DEFAULT_DATAIKU_SHEET_NAME in workbook:
+        return workbook[DEFAULT_DATAIKU_SHEET_NAME]
+    if len(workbook.sheetnames) == 1:
+        logger.warning(f"Default DSS default sheet name has changed from {DEFAULT_DATAIKU_SHEET_NAME} to {workbook.sheetnames[0]}")
+        return workbook[workbook.sheetnames[0]]
+
+    # Several sheets and none with the default name: there is no way to tell which one holds the data
+    logger.error(f"Error getting Excel workbook from DSS dataset {dataset.short_name}, this dataset will not be exported")
+    return None
 
-from xlsx_writer import dataframes_to_xlsx
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO, format='Multi-Sheet Excel Exporter | %(levelname)s - %(message)s')
@@ -37,6 +70,7 @@
 
 input_config = get_recipe_config()
 workbook_name = input_config.get('output_workbook_name', None)
+apply_conditional_formatting = input_config.get('export_conditional_formatting', False)
 
 if workbook_name is None:
     logger.warning("Received input received recipe config: {}".format(input_config))
@@ -54,7 +88,7 @@
     tmp_file_path = tmp_file.name
     logger.info("Intend to write the output xls file to the following location: {}".format(tmp_file_path))
 
-    dataframes_to_xlsx(input_datasets_names, tmp_file_path, lambda name: dataiku.Dataset(name).get_dataframe())
+    datasets_to_xlsx(input_datasets_names, tmp_file_path, lambda name: get_excel_worksheet(dataiku.Dataset(name), apply_conditional_formatting))
 
     with open(tmp_file_path, 'rb', encoding=None) as f:
         output_folder.upload_stream(output_file_name, f)
diff --git a/plugin.json b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id" : "multisheet-excel-export",
-    "version" : "1.1.4",
+    "version" : "2.0.0",
 
 
     "meta" : {
diff --git a/python-lib/xlsx_writer.py b/python-lib/xlsx_writer.py
@@ -8,52 +8,25 @@
 
 import logging
 import math
-from typing import Tuple
+from typing import Tuple, List, Dict
+from copy import copy
 
 from openpyxl.styles import Alignment, Font, PatternFill, Side
 from openpyxl.styles.borders import Border
 from openpyxl.styles.colors import WHITE
 from openpyxl.utils import get_column_letter
 from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
 from openpyxl.worksheet.worksheet import Worksheet
-import pandas as pd
+from openpyxl import Workbook
 
 DATAIKU_TEAL = "FF2AB1AC"
 LETTER_WIDTH = 1.20 # Approximative letter width to scale column width
 MAX_LENGTH_TO_SHOW = 45 # Limit copied from DSS native excel exporter
+EXCEL_MAX_LEN_SHEET_NAME = 31
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO, format='Multi-Sheet Excel Exporter | %(levelname)s - %(message)s')
 
-def style_header(worksheet: Worksheet, 
-                  font_name: str = "Calibri", 
-                  font_size: int = 11, 
-                  font_color : str = WHITE, 
-                  background_color : str = DATAIKU_TEAL,
-                  bold : bool = True
-                 ):
-    """
-    Style header of the worksheet
-    """
-
-    if worksheet.min_column < 1:
-        logger.warn(f"No header row for worksheet {worksheet}. Styling skipped.")
-        return
-
-    font = Font(name=font_name, size=font_size, color=font_color, bold=bold)
-    fill = PatternFill("solid", fgColor=background_color)
-
-    no_border_side = Side(border_style=None)
-    border = Border(left=no_border_side, right=no_border_side, top=no_border_side, bottom=no_border_side)
-
-    alignment = Alignment(vertical='bottom', horizontal='center')
-
-    for header_cell in worksheet[1]:
-        header_cell.font = font
-        header_cell.fill = fill
-        header_cell.border = border
-        header_cell.alignment = alignment
-
 def get_column_width(column: Tuple):
     """
     Find optimum column width based on content and header length
@@ -104,33 +77,94 @@ def auto_size_column_width(worksheet: Worksheet):
     worksheet.column_dimensions = dimension_holder
 
 
-def dataframes_to_xlsx(input_dataframes_names, xlsx_abs_path, dataframe_provider):
+
+
+# code inspired from https://openpyxl.readthedocs.io/en/stable/_modules/openpyxl/worksheet/copier.html
+def copy_sheet_to_workbook(source_sheet: Worksheet, target_workbook: Workbook) -> Worksheet:
     """
-    Write the input datasets into same excel into the folder
-    :param input_datasets_names:
-    :param writer:
-    :return:
+    Duplicate a worksheet, cell by cell, into a newly created sheet of another workbook.
+    Values and data types are always copied; font, border, fill, number format and
+    alignment are copied only for the cells that carry a style.
+    :param source_sheet: the worksheet to read the cells from
+    :param target_workbook: the workbook in which the new sheet is created
+    :return: the sheet created in the target workbook
+    """
+    logger.info(f"Copying sheet {source_sheet.title} to target workbook")
+    copied_sheet = target_workbook.create_sheet(source_sheet.title)
+    copied_style_attributes = ("font", "border", "fill", "number_format", "alignment")
+    for source_row in source_sheet:
+        for source_cell in source_row:
+            target_cell = copied_sheet.cell(row=source_cell.row, column=source_cell.column, value=source_cell.value)
+            target_cell.data_type = source_cell.data_type
+            if not source_cell.has_style:
+                continue
+            # Assign a copy of each style attribute rather than the source object itself
+            for attribute in copied_style_attributes:
+                setattr(target_cell, attribute, copy(getattr(source_cell, attribute)))
+
+    return copied_sheet
+
+def rename_too_long_dataset_names(input_dataset_names: List[str]) -> Dict[str, str]:
+    """
+    Excel allows at most EXCEL_MAX_LEN_SHEET_NAME (31) chars in sheet names, so every DS name longer than that is :
+        - truncated to leave room for a numeric suffix (29 chars are kept for a 2-digit suffix)
+        - suffixed with a counter starting at 00 and shared by all renamed DS, skipping any value
+          that would produce a name already in use
+    Names that fit in the limit are mapped to themselves.
+    :param input_dataset_names: the list of dataset names to remap
+    :returns: a Dict[str, str] mapping the DS names with the sheet names
     """
-    logger.info("Writing output xlsx file ...")
-    writer = pd.ExcelWriter(xlsx_abs_path, engine='openpyxl')

-    for name in input_dataframes_names:
-        df = dataframe_provider(name)
+    return_map = {}
+    # Names that cannot be reused : the DS names themselves plus every sheet name generated so far
+    taken_names = set(input_dataset_names)
+    index_rename = -1
+    for name in input_dataset_names:
+        if len(name) <= EXCEL_MAX_LEN_SHEET_NAME:
+            return_map[name] = name
+            continue

-        logger.info("Writing dataset into excel sheet...")
-        df.to_excel(writer, sheet_name=name, index=False, encoding='utf-8')
+        # Take the next counter value giving a free name. The kept prefix shrinks when the
+        # counter needs more than 2 digits so the result never exceeds the max length.
+        while True:
+            index_rename += 1
+            suffix = f"{index_rename:02d}"
+            rename = f"{name[:EXCEL_MAX_LEN_SHEET_NAME - len(suffix)]}{suffix}"
+            if rename not in taken_names:
+                break
+
+        taken_names.add(rename)
+        logger.info(f"Dataset {name} with a too long name will be stored as sheet {rename}")
+        return_map[name] = rename

-        worksheet = writer.sheets.get(name)
+    return return_map
 
-        if worksheet is None:
-            logger.warn(f"No worksheet for dataset {name}. Written but styling skipped.")
-            continue
 
-        logger.info(f"Styling excel sheet...")
-        style_header(worksheet)
-        auto_size_column_width(worksheet)
+def datasets_to_xlsx(input_dataset_names, xlsx_abs_path, worksheet_provider):
+    """
+    Write the input datasets into same excel into the folder
+    :param input_dataset_names: the list of dataset to put in a single excel file, using one sheet (excel tab) per dataset
+    :param xlsx_abs_path: the temporary path where to write the final excel file
+    :param worksheet_provider: a callable taking a dataset name and returning the worksheet holding
+                               its data, or None when the dataset must be skipped
+    """
+
+    logger.info(f"Building output excel file {xlsx_abs_path}")
+    # The final workbook where all dataset sheets will be written
+    workbook = Workbook()
+    # remove the default sheet created
+    workbook.remove(workbook.active)

-        logger.info("Finished writing dataset {} into excel sheet.".format(name))
+    renaming_map = rename_too_long_dataset_names(input_dataset_names)
+
+    for name in input_dataset_names:
+        dataset_worksheet = worksheet_provider(name)
+        if dataset_worksheet is None:
+            # The provider could not produce a sheet for this dataset: leave it out of the export
+            continue

-    writer.save()
+        # The provided sheet is retitled in place: the copy below reuses its title for the new sheet
+        if name in renaming_map:
+            dataset_worksheet.title = renaming_map[name]
+        else:
+            # should never happen
+            logger.warning(f"Failed to find a name for the workshhet {name}")
+            dataset_worksheet.title = name
+
+        target_sheet = copy_sheet_to_workbook(dataset_worksheet, workbook)
+
+        logger.info(f"Styling excel sheet {target_sheet.title} in target workbook")
+        auto_size_column_width(target_sheet)
+
+        logger.info(f"Finished writing dataset {name} into excel sheet.")
+
+    # NOTE(review): if every dataset was skipped the workbook has no sheet at this point - confirm how openpyxl handles saving it
+    workbook.save(xlsx_abs_path)
     logger.info("Done writing output xlsx file")
diff --git a/tests/python/unit/test_multi_sheet_export.py b/tests/python/unit/test_multi_sheet_export.py

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"id" : "multisheet-excel-export",`
`3`		`- "version" : "1.1.4",`
	`3`	`+ "version" : "2.0.0",`
`4`	`4`
`5`	`5`
`6`	`6`	`"meta" : {`