Skip to content

Commit 7e66072

Browse files
[sc-184418] support dataset conditional formatting (#10)
* [sc-184418] support dataset conditional formatting in exported workbook * [sc-184418] fix unit test * [sc-182181] add a checkbox to apply CF * [sc-18441] draft migrate to tmp file * [sc-18441] code review comments + write on tmp file using chunks * [sc-18441] update README * Handle sheet with too long name longer than 31 char * add unit test * add unit test * type readme Co-authored-by: Mathilde K. <58742217+Ellana42@users.noreply.github.com> * code review comments * remove style header useless * Add warning about Aptos Narrow font * Fix wrong tag --------- Co-authored-by: Mathilde K. <58742217+Ellana42@users.noreply.github.com>
1 parent 846802a commit 7e66072

8 files changed

Lines changed: 220 additions & 67 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ unit.xml
5858
.hypothesis/
5959
.pytest_cache/
6060
cover/
61+
tests/allure_report/
6162

6263
# Translations
6364
*.mo

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# Changelog
22

3+
## [Version 2.0.0](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v2.0.0) - Major release - 2024-07
4+
- Important: column types changed! From this version, cell types in Excel reflect the storage type in DSS. For example, a string column containing only numbers will be exported as a text column. If you want a number column in Excel, you need an integer/float column in DSS
5+
- Export dataset conditional formatting colors (colors the cells, does not export rules)
6+
- Bug fix: datasets with date types can now be exported
7+
38
## [Version 1.1.4](https://github.com/dataiku/dss-plugin-multisheet-excel-export/releases/tag/v1.1.4) - Bug release - 2024-06
49
- Fix numpy issue with DSS 13
510

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
# Plugin information
1+
# Multisheet excel export Plugin
22

33
This plugin converts several DSS datasets to one multi-sheet excel (`.xlsx`) file containing one sheet per input dataset.
4+
More information in the [Documentation](https://www.dataiku.com/product/plugins/multisheet-excel-export/)
45

56
# Prerequisites
67

@@ -16,4 +17,11 @@ It will create a folder in your flow containing the output `.xls` file. Each she
1617
## Running tests
1718

1819
In order to run the tests contained in `python-test\`, launch the following command from the plugin root directory:
19-
`PYTHONPATH=$PYTHONPATH:/path/to/python-lib pytest`
20+
`PYTHONPATH=$PYTHONPATH:/path/to/python-lib pytest`
21+
22+
23+
### Licence
24+
25+
Copyright 2020-2022 Dataiku SAS
26+
27+
This plugin is distributed under the Apache License version 2.0

custom-recipes/to-excel/recipe.json

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,15 @@
3737
"type": "STRING",
3838
"defaultValue": "output",
3939
"mandatory": true
40+
},
41+
{
42+
"name": "export_conditional_formatting",
43+
"label": "Apply conditional formatting",
44+
"description": "Color cells by rules, when applicable",
45+
"type": "BOOLEAN",
46+
"defaultValue": false,
47+
"mandatory": true
4048
}
41-
4249
],
4350
"resourceKeys" : []
4451
}

custom-recipes/to-excel/recipe.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,41 @@
1313
from dataiku.customrecipe import get_input_names_for_role
1414
from dataiku.customrecipe import get_output_names_for_role
1515
from dataiku.customrecipe import get_recipe_config
16+
from openpyxl import load_workbook, Workbook
17+
from xlsx_writer import datasets_to_xlsx
18+
from typing import Union
19+
20+
DEFAULT_DATAIKU_SHEET_NAME = "Sheet1"
21+
READ_CHUNK_SIZE = 1024 * 1024 # 1Mbytes
22+
23+
def get_excel_worksheet(dataset: dataiku.Dataset, apply_conditional_formatting: bool) -> Union[Workbook, None]:
    """
    Export a DSS dataset to Excel format and return the resulting worksheet.

    The dataset is streamed in "excel" format into a temporary file, which is then
    loaded with openpyxl. The sheet named DEFAULT_DATAIKU_SHEET_NAME is returned when
    present; otherwise, if the workbook holds exactly one sheet, that sheet is returned.

    NOTE(review): the return annotation mentions Workbook, but the value actually
    returned is a sheet taken out of the loaded workbook (or None).

    :param dataset: the DSS dataset to export
    :param apply_conditional_formatting: forwarded as the "applyColoring" format parameter of the export
    :return: the worksheet holding the dataset, or None when no suitable sheet was found
    """
    logger.info(f"Getting Excel workbook from DSS dataset {dataset.short_name}")
    with tempfile.NamedTemporaryFile(delete=True) as tmp_file:
        with dataset.raw_formatted_data(format="excel", format_params={"applyColoring": apply_conditional_formatting}) as stream:
            # Read the stream chunk by chunk to save RAM
            while True:
                chunk = stream.read(READ_CHUNK_SIZE)
                if not chunk:
                    break
                tmp_file.write(chunk)
        tmp_file.flush()  # Make sure file is written on disk
        tmp_file.seek(0)  # Read back from start of file to load it in the workbook

        # DEV WARNING : Excel exported file contains header row in Calibri and rest in Aptos Narrow font. But load_workbook converts everything into Calibri
        # The workbook must be loaded before leaving the block, as the temporary file is deleted on close
        workbook = load_workbook(tmp_file)

    if DEFAULT_DATAIKU_SHEET_NAME in workbook:
        return workbook[DEFAULT_DATAIKU_SHEET_NAME]

    if len(workbook.sheetnames) == 1:
        logger.warning(f"Default DSS default sheet name has changed from {DEFAULT_DATAIKU_SHEET_NAME} to {workbook.sheetnames[0]}")
        return workbook[workbook.sheetnames[0]]

    # The f-prefix is required here, otherwise the dataset name is never interpolated in the message
    logger.error(f"Error getting Excel workbook from DSS dataset {dataset.short_name}, this dataset will not be exported")
    return None
1650

17-
from xlsx_writer import dataframes_to_xlsx
1851

1952
logger = logging.getLogger(__name__)
2053
logging.basicConfig(level=logging.INFO, format='Multi-Sheet Excel Exporter | %(levelname)s - %(message)s')
@@ -37,6 +70,7 @@
3770

3871
input_config = get_recipe_config()
3972
workbook_name = input_config.get('output_workbook_name', None)
73+
apply_conditional_formatting = input_config.get('export_conditional_formatting', False)
4074

4175
if workbook_name is None:
4276
logger.warning("Received input received recipe config: {}".format(input_config))
@@ -54,7 +88,7 @@
5488
tmp_file_path = tmp_file.name
5589
logger.info("Intend to write the output xls file to the following location: {}".format(tmp_file_path))
5690

57-
dataframes_to_xlsx(input_datasets_names, tmp_file_path, lambda name: dataiku.Dataset(name).get_dataframe())
91+
datasets_to_xlsx(input_datasets_names, tmp_file_path, lambda name: get_excel_worksheet(dataiku.Dataset(name), apply_conditional_formatting))
5892

5993
with open(tmp_file_path, 'rb', encoding=None) as f:
6094
output_folder.upload_stream(output_file_name, f)

plugin.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"id" : "multisheet-excel-export",
3-
"version" : "1.1.4",
3+
"version" : "2.0.0",
44

55

66
"meta" : {

python-lib/xlsx_writer.py

Lines changed: 85 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,52 +8,25 @@
88

99
import logging
1010
import math
11-
from typing import Tuple
11+
from typing import Tuple, List, Dict
12+
from copy import copy
1213

1314
from openpyxl.styles import Alignment, Font, PatternFill, Side
1415
from openpyxl.styles.borders import Border
1516
from openpyxl.styles.colors import WHITE
1617
from openpyxl.utils import get_column_letter
1718
from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder
1819
from openpyxl.worksheet.worksheet import Worksheet
19-
import pandas as pd
20+
from openpyxl import Workbook
2021

2122
DATAIKU_TEAL = "FF2AB1AC"
2223
LETTER_WIDTH = 1.20 # Approximative letter width to scale column width
2324
MAX_LENGTH_TO_SHOW = 45 # Limit copied from DSS native excel exporter
25+
EXCEL_MAX_LEN_SHEET_NAME = 31
2426

2527
logger = logging.getLogger(__name__)
2628
logging.basicConfig(level=logging.INFO, format='Multi-Sheet Excel Exporter | %(levelname)s - %(message)s')
2729

28-
def style_header(worksheet: Worksheet,
                 font_name: str = "Calibri",
                 font_size: int = 11,
                 font_color: str = WHITE,
                 background_color: str = DATAIKU_TEAL,
                 bold: bool = True
                 ):
    """
    Style header of the worksheet

    Applies the same font, solid fill, border and alignment to every cell of the
    first row. Nothing is styled when the worksheet reports a minimum column below 1.

    :param worksheet: the worksheet whose first row is styled
    :param font_name: name of the header font
    :param font_size: size of the header font
    :param font_color: color of the header font
    :param background_color: foreground color of the solid fill applied to header cells
    :param bold: whether the header font is bold
    """

    if worksheet.min_column < 1:
        # Logger.warn is a deprecated alias, Logger.warning is the supported method
        logger.warning(f"No header row for worksheet {worksheet}. Styling skipped.")
        return

    font = Font(name=font_name, size=font_size, color=font_color, bold=bold)
    fill = PatternFill("solid", fgColor=background_color)

    # All four sides use a side without border style
    no_border_side = Side(border_style=None)
    border = Border(left=no_border_side, right=no_border_side, top=no_border_side, bottom=no_border_side)

    alignment = Alignment(vertical='bottom', horizontal='center')

    for header_cell in worksheet[1]:
        header_cell.font = font
        header_cell.fill = fill
        header_cell.border = border
        header_cell.alignment = alignment
56-
5730
def get_column_width(column: Tuple):
5831
"""
5932
Find optimum column width based on content and header length
@@ -104,33 +77,94 @@ def auto_size_column_width(worksheet: Worksheet):
10477
worksheet.column_dimensions = dimension_holder
10578

10679

107-
def dataframes_to_xlsx(input_dataframes_names, xlsx_abs_path, dataframe_provider):
80+
81+
82+
# code inspired from https://openpyxl.readthedocs.io/en/stable/_modules/openpyxl/worksheet/copier.html
83+
def copy_sheet_to_workbook(source_sheet: Worksheet, target_workbook: Workbook) -> Worksheet:
    """
    Duplicate a worksheet, cell by cell, into a new sheet of another workbook.

    A new sheet is requested in the target workbook with the title of the source sheet.
    Each source cell then has its value and data type copied at the same row and column;
    for cells carrying a style, the font, border, fill, number format and alignment are
    copied as well.

    :param source_sheet: the worksheet to read cells from
    :param target_workbook: the workbook that receives the new sheet
    :return: the sheet created in the target workbook
    """
    # Style attributes duplicated for styled cells, in the order they are applied
    copied_style_attributes = ("font", "border", "fill", "number_format", "alignment")

    logger.info(f"Copying sheet {source_sheet.title} to target workbook")
    destination = target_workbook.create_sheet(source_sheet.title)

    for source_row in source_sheet:
        for source_cell in source_row:
            copied_cell = destination.cell(
                row=source_cell.row,
                column=source_cell.column,
                value=source_cell.value,
            )
            copied_cell.data_type = source_cell.data_type

            if not source_cell.has_style:
                continue

            # Copy each style object so the new cell does not share it with the source cell
            for attribute in copied_style_attributes:
                setattr(copied_cell, attribute, copy(getattr(source_cell, attribute)))

    return destination
104+
105+
def rename_too_long_dataset_names(input_dataset_names: List[str]) -> Dict[str, str]:
    """
    Map every dataset name to a sheet name that fits in EXCEL_MAX_LEN_SHEET_NAME characters.

    Names that already fit are mapped to themselves. Longer names are truncated and
    suffixed with a counter shared by all renamed datasets, written on at least two
    digits (00, 01, ...). The truncation length is adapted to the suffix length, so the
    resulting sheet name never exceeds EXCEL_MAX_LEN_SHEET_NAME characters, even once
    the counter needs more than two digits.

    :param input_dataset_names: the list of dataset names to remap
    :returns: a Dict[str, str] mapping the DS names with the sheet names
    """

    return_map = {}
    # Names that cannot be reused: the input names, plus every sheet name generated so far
    taken_names = set(input_dataset_names)
    index_rename = -1

    for name in input_dataset_names:
        if len(name) <= EXCEL_MAX_LEN_SHEET_NAME:
            return_map[name] = name
            continue

        # Advance the counter until the candidate sheet name is free
        # (almost impossible case : a DS already has this name)
        while True:
            index_rename += 1
            suffix = f"{index_rename:02d}"
            rename = f"{name[:EXCEL_MAX_LEN_SHEET_NAME - len(suffix)]}{suffix}"
            if rename not in taken_names:
                break

        taken_names.add(rename)
        logger.info(f"Dataset {name} with a too long name will be stored as sheet {rename}")
        return_map[name] = rename

    return return_map
124132

125-
if worksheet is None:
126-
logger.warn(f"No worksheet for dataset {name}. Written but styling skipped.")
127-
continue
128133

129-
logger.info(f"Styling excel sheet...")
130-
style_header(worksheet)
131-
auto_size_column_width(worksheet)
134+
def datasets_to_xlsx(input_dataset_names, xlsx_abs_path, worksheet_provider):
    """
    Write the input datasets into a single excel file, using one sheet (excel tab) per dataset

    Datasets for which the provider returns None are skipped. Each remaining worksheet is
    renamed with its sheet name (shortened when the dataset name is too long), copied
    into the output workbook, and has its column widths auto-sized.

    :param input_dataset_names: the list of dataset to put in a single excel file
    :param xlsx_abs_path: the temporary path where to write the final excel file
    :param worksheet_provider: a callable taking a dataset name and returning the worksheet
        holding that dataset, or None when the dataset cannot be exported
    """

    logger.info(f"Building output excel file {xlsx_abs_path}")
    # The final workbook where all dataset sheets will be written
    workbook = Workbook()
    # remove the default sheet created
    workbook.remove(workbook.active)

    renaming_map = rename_too_long_dataset_names(input_dataset_names)

    for name in input_dataset_names:
        dataset_worksheet = worksheet_provider(name)
        if dataset_worksheet is None:
            continue

        if name in renaming_map:
            dataset_worksheet.title = renaming_map[name]
        else:
            # should never happen
            # Logger.warn is a deprecated alias, Logger.warning is the supported method
            logger.warning(f"Failed to find a name for the workshhet {name}")
            dataset_worksheet.title = name

        target_sheet = copy_sheet_to_workbook(dataset_worksheet, workbook)

        logger.info(f"Styling excel sheet {target_sheet.title} in target workbook")
        auto_size_column_width(target_sheet)

        logger.info(f"Finished writing dataset {name} into excel sheet.")

    workbook.save(xlsx_abs_path)
    logger.info("Done writing output xlsx file")

0 commit comments

Comments
 (0)