Skip to content

Commit 2570e86

Browse files
author
Jah-yee
committed
Add .doc file extension support
- Add .doc to ACCEPTED_FILE_EXTENSIONS in DocxConverter
- Add DOC type to DocumentIntelligenceFileType enum
- Add MIME type and extension mappings for .doc files

Fixes: #23
1 parent 6cb5307 commit 2570e86

2 files changed

Lines changed: 10 additions & 3 deletions

File tree

packages/markitdown/src/markitdown/converters/_doc_intel_converter.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ class DocumentIntelligenceFileType(str, Enum):
5656
"""Enum of file types supported by the Document Intelligence Converter."""
5757

5858
# No OCR
59+
DOC = "doc"
5960
DOCX = "docx"
6061
PPTX = "pptx"
6162
XLSX = "xlsx"
@@ -72,7 +73,11 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
7273
"""Get the MIME type prefixes for the given file types."""
7374
prefixes: List[str] = []
7475
for type_ in types:
75-
if type_ == DocumentIntelligenceFileType.DOCX:
76+
if type_ == DocumentIntelligenceFileType.DOC:
77+
prefixes.append(
78+
"application/msword"
79+
)
80+
elif type_ == DocumentIntelligenceFileType.DOCX:
7681
prefixes.append(
7782
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
7883
)
@@ -105,7 +110,9 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
105110
"""Get the file extensions for the given file types."""
106111
extensions: List[str] = []
107112
for type_ in types:
108-
if type_ == DocumentIntelligenceFileType.DOCX:
113+
if type_ == DocumentIntelligenceFileType.DOC:
114+
extensions.append(".doc")
115+
elif type_ == DocumentIntelligenceFileType.DOCX:
109116
extensions.append(".docx")
110117
elif type_ == DocumentIntelligenceFileType.PPTX:
111118
extensions.append(".pptx")

packages/markitdown/src/markitdown/converters/_docx_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
2626
]
2727

28-
ACCEPTED_FILE_EXTENSIONS = [".docx"]
28+
ACCEPTED_FILE_EXTENSIONS = [".docx", ".doc"]
2929

3030

3131
class DocxConverter(HtmlConverter):

0 commit comments

Comments (0)