Skip to content

Commit 181361d

Browse files
author
RoomWithOutRoof
committed
Add .doc file support via Document Intelligence
Add support for the legacy .doc format in addition to .docx.

- Added DOC to the DocumentIntelligenceFileType enum
- Added the .doc MIME type (application/msword)
- Added .doc extension handling
- Updated the default file_types to include DOC
- Added DOC to the no_ocr_types list

Fixes: #23
1 parent 6cb5307 commit 181361d

1 file changed

Lines changed: 9 additions & 2 deletions

File tree

packages/markitdown/src/markitdown/converters/_doc_intel_converter.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ class DocumentIntelligenceFileType(str, Enum):
5656
"""Enum of file types supported by the Document Intelligence Converter."""
5757

5858
# No OCR
59+
DOC = "doc"
5960
DOCX = "docx"
6061
PPTX = "pptx"
6162
XLSX = "xlsx"
@@ -72,7 +73,9 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s
7273
"""Get the MIME type prefixes for the given file types."""
7374
prefixes: List[str] = []
7475
for type_ in types:
75-
if type_ == DocumentIntelligenceFileType.DOCX:
76+
if type_ == DocumentIntelligenceFileType.DOC:
77+
prefixes.append("application/msword")
78+
elif type_ == DocumentIntelligenceFileType.DOCX:
7679
prefixes.append(
7780
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
7881
)
@@ -105,7 +108,9 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
105108
"""Get the file extensions for the given file types."""
106109
extensions: List[str] = []
107110
for type_ in types:
108-
if type_ == DocumentIntelligenceFileType.DOCX:
111+
if type_ == DocumentIntelligenceFileType.DOC:
112+
extensions.append(".doc")
113+
elif type_ == DocumentIntelligenceFileType.DOCX:
109114
extensions.append(".docx")
110115
elif type_ == DocumentIntelligenceFileType.PPTX:
111116
extensions.append(".pptx")
@@ -137,6 +142,7 @@ def __init__(
137142
api_version: str = "2024-07-31-preview",
138143
credential: AzureKeyCredential | TokenCredential | None = None,
139144
file_types: List[DocumentIntelligenceFileType] = [
145+
DocumentIntelligenceFileType.DOC,
140146
DocumentIntelligenceFileType.DOCX,
141147
DocumentIntelligenceFileType.PPTX,
142148
DocumentIntelligenceFileType.XLSX,
@@ -215,6 +221,7 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
215221

216222
# Types that don't support ocr
217223
no_ocr_types = [
224+
DocumentIntelligenceFileType.DOC,
218225
DocumentIntelligenceFileType.DOCX,
219226
DocumentIntelligenceFileType.PPTX,
220227
DocumentIntelligenceFileType.XLSX,

0 commit comments

Comments
 (0)