@@ -211,6 +211,7 @@ class OptimizedRegex:
# TODO: RELS_CONTENT_TYPE may be incorrect or not well named, needs review.
# Despite its name, the value below is the core-properties content type (the
# same string as MimeType.CORE_PROPERTIES), not the relationships content type
# held by MimeType.RELS.
RELS_CONTENT_TYPE = "application/vnd.openxmlformats-package.core-properties+xml"
RELS_FOLDER_NAME = "_rels"
CORE_PROPERTIES_FOLDER_NAME = "docProps"
214215
215216primitives = (bool , str , int , float , type (None ))
216217
@@ -225,6 +226,20 @@ class MimeType(Enum):
225226 RELS = "application/vnd.openxmlformats-package.relationships+xml"
226227 CORE_PROPERTIES = "application/vnd.openxmlformats-package.core-properties+xml"
227228 EXTENDED_CORE_PROPERTIES = "application/x-extended-core-properties+xml"
229+ JPEG = "image/jpeg"
230+ PNG = "image/png"
231+ TIFF = "image/tiff"
232+ GIF = "image/gif"
233+ SVG = "image/svg+xml"
234+ DOC = "application/msword"
235+ DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
236+ XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
237+ XML = "application/xml"
238+ JSON = "application/json"
239+ TXT = "text/plain"
240+ MARKDOWN = "text/markdown"
241+ HTML = "text/html"
242+ ZIP = "application/zip"
228243
229244 def __str__ (self ):
230245 return self .value
@@ -282,6 +297,120 @@ class RawFile:
282297 content : Optional [BytesIO ] = field (default = None )
283298
284299
300+ # ===================================
301+ # MIME TYPE MAPPINGS
302+ # ===================================
303+
# Primary mapping: MimeType enum → file extension (without leading dot).
# NOTE: the mapping is many-to-one — CORE_PROPERTIES, EXTENDED_CORE_PROPERTIES
# and XML all map to "xml" — so it cannot be inverted unambiguously, and the
# entry order matters to any code that scans it for the first matching
# extension.
MIME_TYPE_TO_EXTENSION: dict[MimeType, str] = {
    MimeType.CSV: "csv",
    MimeType.HDF5: "h5",
    MimeType.PARQUET: "parquet",
    MimeType.PDF: "pdf",
    MimeType.RELS: "rels",
    MimeType.CORE_PROPERTIES: "xml",
    MimeType.EXTENDED_CORE_PROPERTIES: "xml",
    MimeType.JPEG: "jpg",
    MimeType.PNG: "png",
    MimeType.TIFF: "tiff",
    MimeType.GIF: "gif",
    MimeType.SVG: "svg",
    MimeType.DOC: "doc",
    MimeType.DOCX: "docx",
    MimeType.XLSX: "xlsx",
    MimeType.XML: "xml",
    MimeType.JSON: "json",
    MimeType.TXT: "txt",
    MimeType.MARKDOWN: "md",
    MimeType.HTML: "html",
    MimeType.ZIP: "zip",
}
328+
# Alternative MIME type strings (aliases and variants), keyed in lower case and
# grouped here by the MimeType member each one resolves to.
MIME_TYPE_ALIASES: dict[str, MimeType] = {
    alias: canonical
    for canonical, aliases in (
        (MimeType.PARQUET, ("application/parquet", "application/vnd.apache.parquet")),
        (MimeType.XML, ("text/xml",)),
        (MimeType.JPEG, ("image/jpg",)),
    )
    for alias in aliases
}
336+
# Alternative file extensions: each key is an accepted spelling (lower case,
# no leading dot) and each value is the extension used in
# MIME_TYPE_TO_EXTENSION.
EXTENSION_ALIASES: dict[str, str] = dict(
    hdf5="h5",
    jpeg="jpg",
    tif="tiff",
    markdown="md",
    htm="html",
)
345+
346+
def mime_type_to_file_extension(mime_type: str) -> Optional[str]:
    """
    Convert MIME type to file extension using the MimeType enum and aliases.

    The lookup is case-insensitive and ignores surrounding whitespace and any
    media-type parameters (everything from the first ";" onwards), so
    "text/csv; charset=utf-8" is treated like "text/csv".

    Args:
        mime_type: MIME type string, optionally followed by parameters

    Returns:
        File extension without leading dot, or None if not found

    Examples:
        >>> mime_type_to_file_extension("text/csv")
        'csv'
        >>> mime_type_to_file_extension("application/parquet")
        'parquet'
        >>> mime_type_to_file_extension("application/json; charset=utf-8")
        'json'
    """
    if not mime_type:
        return None

    # Keep only the "type/subtype" part: drop parameters and padding.
    essence = mime_type.partition(";")[0].strip().lower()
    if not essence:
        return None

    # Canonical values declared on the MimeType enum take precedence.
    for mime_enum in MimeType:
        if mime_enum.value.lower() == essence:
            return MIME_TYPE_TO_EXTENSION.get(mime_enum)

    # Fall back to the known alternative spellings.
    alias_target = MIME_TYPE_ALIASES.get(essence)
    if alias_target is not None:
        return MIME_TYPE_TO_EXTENSION.get(alias_target)

    return None
379+
380+
def file_extension_to_mime_type(extension: str) -> Optional[str]:
    """
    Convert file extension to MIME type using the MimeType enum.

    Several MimeType members share the "xml" extension (the generic XML type
    and the two core-properties types). A bare "xml" extension resolves to the
    generic ``MimeType.XML`` value rather than to whichever member happens to
    be listed first in MIME_TYPE_TO_EXTENSION.

    Args:
        extension: File extension with or without leading dot (case-insensitive)

    Returns:
        MIME type string, or None if not found

    Examples:
        >>> file_extension_to_mime_type("csv")
        'text/csv'
        >>> file_extension_to_mime_type(".json")
        'application/json'
        >>> file_extension_to_mime_type("xml")
        'application/xml'
    """
    if not extension:
        return None

    # Remove leading dot if present, then fold case
    ext_lower = extension.lstrip(".").lower()

    # Normalize through aliases first (e.g. "jpeg" -> "jpg")
    ext_normalized = EXTENSION_ALIASES.get(ext_lower, ext_lower)

    # Extensions claimed by more than one MimeType: pin the generic type so the
    # result does not depend on the entry order of MIME_TYPE_TO_EXTENSION.
    preferred = {"xml": MimeType.XML}
    if ext_normalized in preferred:
        return preferred[ext_normalized].value

    # Otherwise the first MimeType mapped to this extension wins
    for mime_enum, ext in MIME_TYPE_TO_EXTENSION.items():
        if ext == ext_normalized:
            return mime_enum.value

    return None
412+
413+
285414# ===================================
286415# OPTIMIZED UTILITY FUNCTIONS
287416# ===================================
@@ -499,54 +628,6 @@ def extract_uuid_from_string(s: str) -> Optional[str]:
499628 return None
500629
501630
502- def mime_type_to_file_extension (mime_type : str ) -> Optional [str ]:
503- """Convert MIME type to file extension"""
504- if not mime_type :
505- return None
506-
507- mime_type_lower = mime_type .lower ()
508-
509- # Use dict for faster lookup than if/elif chain
510- mime_to_ext = {
511- "application/x-parquet" : "parquet" ,
512- "application/parquet" : "parquet" ,
513- "application/vnd.apache.parquet" : "parquet" ,
514- "application/x-hdf5" : "h5" ,
515- "text/csv" : "csv" ,
516- "application/vnd.openxmlformats-package.relationships+xml" : "rels" ,
517- "application/pdf" : "pdf" ,
518- "application/xml" : "xml" ,
519- "text/xml" : "xml" ,
520- "application/json" : "json" ,
521- "application/vnd.openxmlformats-package.core-properties+xml" : "xml" ,
522- "application/x-extended-core-properties+xml" : "xml" ,
523- }
524-
525- return mime_to_ext .get (mime_type_lower )
526-
527-
528- def file_extension_to_mime_type (extension : str ) -> Optional [str ]:
529- """Convert file extension to MIME type"""
530- if not extension :
531- return None
532-
533- ext_lower = extension .lower ()
534-
535- # Use dict for faster lookup than if/elif chain
536- ext_to_mime = {
537- "parquet" : "application/x-parquet" ,
538- "h5" : "application/x-hdf5" ,
539- "hdf5" : "application/x-hdf5" ,
540- "csv" : "text/csv" ,
541- "rels" : "application/vnd.openxmlformats-package.relationships+xml" ,
542- "pdf" : "application/pdf" ,
543- "xml" : "application/xml" ,
544- "json" : "application/json" ,
545- }
546-
547- return ext_to_mime .get (ext_lower )
548-
549-
550631# ===================================
551632# PATH UTILITIES
552633# ===================================
0 commit comments