|
8 | 8 |
|
9 | 9 | import logging |
10 | 10 | import math |
11 | | -from typing import Tuple |
| 11 | +from typing import Tuple, List, Dict |
| 12 | +from copy import copy |
12 | 13 |
|
13 | 14 | from openpyxl.styles import Alignment, Font, PatternFill, Side |
14 | 15 | from openpyxl.styles.borders import Border |
15 | 16 | from openpyxl.styles.colors import WHITE |
16 | 17 | from openpyxl.utils import get_column_letter |
17 | 18 | from openpyxl.worksheet.dimensions import ColumnDimension, DimensionHolder |
18 | 19 | from openpyxl.worksheet.worksheet import Worksheet |
19 | | -import pandas as pd |
| 20 | +from openpyxl import Workbook |
20 | 21 |
|
21 | 22 | DATAIKU_TEAL = "FF2AB1AC" |
22 | 23 | LETTER_WIDTH = 1.20 # Approximative letter width to scale column width |
23 | 24 | MAX_LENGTH_TO_SHOW = 45 # Limit copied from DSS native excel exporter |
| 25 | +EXCEL_MAX_LEN_SHEET_NAME = 31 |
24 | 26 |
|
25 | 27 | logger = logging.getLogger(__name__) |
26 | 28 | logging.basicConfig(level=logging.INFO, format='Multi-Sheet Excel Exporter | %(levelname)s - %(message)s') |
27 | 29 |
|
28 | | -def style_header(worksheet: Worksheet, |
29 | | - font_name: str = "Calibri", |
30 | | - font_size: int = 11, |
31 | | - font_color : str = WHITE, |
32 | | - background_color : str = DATAIKU_TEAL, |
33 | | - bold : bool = True |
34 | | - ): |
35 | | - """ |
36 | | - Style header of the worksheet |
37 | | - """ |
38 | | - |
39 | | - if worksheet.min_column < 1: |
40 | | - logger.warn(f"No header row for worksheet {worksheet}. Styling skipped.") |
41 | | - return |
42 | | - |
43 | | - font = Font(name=font_name, size=font_size, color=font_color, bold=bold) |
44 | | - fill = PatternFill("solid", fgColor=background_color) |
45 | | - |
46 | | - no_border_side = Side(border_style=None) |
47 | | - border = Border(left=no_border_side, right=no_border_side, top=no_border_side, bottom=no_border_side) |
48 | | - |
49 | | - alignment = Alignment(vertical='bottom', horizontal='center') |
50 | | - |
51 | | - for header_cell in worksheet[1]: |
52 | | - header_cell.font = font |
53 | | - header_cell.fill = fill |
54 | | - header_cell.border = border |
55 | | - header_cell.alignment = alignment |
56 | | - |
57 | 30 | def get_column_width(column: Tuple): |
58 | 31 | """ |
59 | 32 | Find optimum column width based on content and header length |
@@ -104,33 +77,94 @@ def auto_size_column_width(worksheet: Worksheet): |
104 | 77 | worksheet.column_dimensions = dimension_holder |
105 | 78 |
|
106 | 79 |
|
107 | | -def dataframes_to_xlsx(input_dataframes_names, xlsx_abs_path, dataframe_provider): |
| 80 | + |
| 81 | + |
| 82 | +# code inspired from https://openpyxl.readthedocs.io/en/stable/_modules/openpyxl/worksheet/copier.html |
def copy_sheet_to_workbook(source_sheet: Worksheet, target_workbook: Workbook) -> Worksheet:
    """
    Duplicate a worksheet, cell by cell, into a new sheet of another workbook.

    The new sheet is created with the title of the source sheet. For every cell
    the value and data type are copied; font, border, fill, number format and
    alignment are copied only for cells that carry a style.

    :param source_sheet: the sheet to read the cells from
    :param target_workbook: the workbook in which the new sheet is created
    :return: the sheet created inside the target workbook
    """
    logger.info(f"Copying sheet {source_sheet.title} to target workbook")
    duplicate = target_workbook.create_sheet(source_sheet.title)

    # Style attributes are copied one by one (and in this order) with copy(),
    # so the target cell gets its own style objects.
    style_attributes = ("font", "border", "fill", "number_format", "alignment")

    for source_row in source_sheet:
        for source_cell in source_row:
            target_cell = duplicate.cell(
                row=source_cell.row,
                column=source_cell.column,
                value=source_cell.value,
            )
            # Assigned after the value so the source data type is the one kept
            target_cell.data_type = source_cell.data_type

            if not source_cell.has_style:
                continue
            for attribute in style_attributes:
                setattr(target_cell, attribute, copy(getattr(source_cell, attribute)))

    return duplicate
| 104 | + |
def rename_too_long_dataset_names(input_dataset_names: List[str]) -> Dict[str, str]:
    """
    Map every dataset name to a sheet name that fits the Excel sheet name limit.

    Sheet names are limited to EXCEL_MAX_LEN_SHEET_NAME characters. A dataset name
    within that limit is used as is. A longer name is truncated and completed with
    a numeric index (at least two digits: 00, 01, ...) so that the result is exactly
    EXCEL_MAX_LEN_SHEET_NAME characters long. The index is shared by all renamed
    datasets and only increases, and a candidate that is already a dataset name or
    an already generated sheet name is skipped.

    :param input_dataset_names: the list of dataset names to remap
    :returns: a Dict[str, str] mapping the dataset names to the sheet names
    """
    return_map = {}
    # Names a generated sheet name must not collide with: every input name (short
    # ones are used verbatim as sheet names) plus the sheet names generated so far.
    taken_names = set(input_dataset_names)
    index_rename = -1

    for name in input_dataset_names:
        if len(name) <= EXCEL_MAX_LEN_SHEET_NAME:
            return_map[name] = name
            continue

        while True:
            index_rename += 1
            suffix = f"{index_rename:02d}"
            # The kept prefix shrinks when the index needs more than two digits,
            # so the generated name never exceeds the limit.
            rename = f"{name[:EXCEL_MAX_LEN_SHEET_NAME - len(suffix)]}{suffix}"
            if rename not in taken_names:
                break

        taken_names.add(rename)
        logger.info(f"Dataset {name} with a too long name will be stored as sheet {rename}")
        return_map[name] = rename

    return return_map
124 | 132 |
|
125 | | - if worksheet is None: |
126 | | - logger.warn(f"No worksheet for dataset {name}. Written but styling skipped.") |
127 | | - continue |
128 | 133 |
|
129 | | - logger.info(f"Styling excel sheet...") |
130 | | - style_header(worksheet) |
131 | | - auto_size_column_width(worksheet) |
def datasets_to_xlsx(input_dataset_names, xlsx_abs_path, worksheet_provider):
    """
    Write the input datasets into a single excel file, one sheet (excel tab) per dataset.

    :param input_dataset_names: the list of datasets to put in the excel file
    :param xlsx_abs_path: the path where the final excel file is written
    :param worksheet_provider: a callable taking a dataset name and returning the
        worksheet holding its content, or None when the dataset must be skipped
    """
    logger.info(f"Building output excel file {xlsx_abs_path}")
    # The final workbook where all dataset sheets will be written
    workbook = Workbook()
    # Remove the default sheet created with the workbook
    workbook.remove(workbook.active)

    renaming_map = rename_too_long_dataset_names(input_dataset_names)

    for name in input_dataset_names:
        dataset_worksheet = worksheet_provider(name)
        if dataset_worksheet is None:
            logger.warning(f"No worksheet provided for dataset {name}. Dataset skipped.")
            continue

        # The copy is created under the title of the source sheet, so the source
        # sheet is retitled first. The map holds an entry for every input name;
        # the dataset name itself is only a defensive fallback.
        dataset_worksheet.title = renaming_map.get(name, name)

        target_sheet = copy_sheet_to_workbook(dataset_worksheet, workbook)

        logger.info(f"Styling excel sheet {target_sheet.title} in target workbook")
        auto_size_column_width(target_sheet)

        logger.info(f"Finished writing dataset {name} into excel sheet.")

    workbook.save(xlsx_abs_path)
    logger.info("Done writing output xlsx file")
0 commit comments