microsoft · Jah-yee · Mar 11, 2026 · Apr 13, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
 
 > [!TIP]
-> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
+> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
 
 > [!IMPORTANT]
 > Breaking changes between 0.0.1 to 0.1.0:
@@ -26,7 +26,7 @@ MarkItDown currently supports the conversion from:
 - HTML
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
-- Youtube URLs
+- YouTube URLs
 - EPubs
 - ... and more!
 

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -191,6 +192,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(WikipediaConverter())
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
+            self.register_converter(DocConverter())
             self.register_converter(DocxConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter

diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,97 @@
+import subprocess
+import sys
+import tempfile
+import io
+
+from typing import BinaryIO, Any
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import MissingDependencyException
+
+
+ACCEPTED_MIME_TYPE_PREFIXES = [  # matched as prefixes of the lower-cased MIME type in DocConverter.accepts
+    "application/msword",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]  # compared with the lower-cased extension, so entries must be lower-case
+
+
+def _check_antiword() -> None:
+    """Check if antiword is available on the system."""
+    try:
+        subprocess.run(
+            ["antiword"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=True,
+        )
+    except FileNotFoundError:
+        msg = (
+            f"DOCConverter recognized the input as a potential .doc file, "
+            "but the 'antiword' tool is not installed. "
+            "On Debian/Ubuntu, install it with: sudo apt install antiword"
+        )
+        raise MissingDependencyException(msg)
+    except subprocess.CalledProcessError as e:
+        msg = (
+            f"DOCConverter recognized the input as a potential .doc file, "
+            "but the 'antiword' tool is not installed. "
+            "On Debian/Ubuntu, install it with: sudo apt install antiword"
+        )
+        raise MissingDependencyException(msg) from e
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts legacy DOC files (not DOCX) to Markdown using antiword.
+    DOC is the older binary format that was used before Office Open XML (DOCX).
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        mimetype = (stream_info.mimetype or "").lower()
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Check if antiword is available
+        _check_antiword()
+
+        # Write stream to temp file
+        with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as tmp:
+            tmp.write(file_stream.read())
+            tmp_path = tmp.name
+
+        try:
+            result = subprocess.run(
+                ["antiword", "-w", "0", tmp_path],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            text_content = result.stdout
+        except subprocess.CalledProcessError as e:
+            text_content = e.stdout if e.stdout else ""
+            text_content += f"\n\n[Warning: Conversion had errors: {e.stderr}]"
+
+        return DocumentConverterResult(markdown=text_content)
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,5 +1,8 @@
+import re
 import sys
-from typing import BinaryIO, Any
+from io import BytesIO
+from typing import Any, BinaryIO
+
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -10,7 +13,7 @@
 _xlsx_dependency_exc_info = None
 try:
     import pandas as pd
-    import openpyxl  # noqa: F401
+    import openpyxl
 except ImportError:
     _xlsx_dependency_exc_info = sys.exc_info()
 
@@ -32,6 +35,109 @@
 ]
 ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 
+# Matches a quoted currency symbol, a currency symbol followed by a digit, or any single "#" / "0".
+CURRENCY_FORMAT_PATTERN = re.compile(r'["\']([$€£¥₹])["\']|([$€£¥₹])\d|#|0')  # NOTE(review): not referenced in the visible code
+
+
+def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
+    """
+    Format a cell value, preserving currency and other number formats.
+    """
+    if cell.value is None:
+        return ""
+
+    # Check if it's a number type
+    if isinstance(cell.value, (int, float)):
+        number_format = cell.number_format
+
+        # Check if the number format contains currency symbols
+        # Common currency formats: "$"#,##0.00, €#,##0.00, $#,##0.00
+        if "$" in number_format or "€" in number_format or "£" in number_format or "¥" in number_format or "₹" in number_format:
+            # Try to use openpyxl's built-in formatting
+            try:
+                formatted = openpyxl.styles.numbers.format(cell.value, number_format)
+                # Clean up the formatted value (remove extra spaces, fix formatting)
+                formatted = formatted.strip()
+                if formatted and formatted != str(cell.value):
+                    return formatted
+            except Exception:
+                pass
+
+            # Fallback: extract currency symbol from format string
+            currency_match = re.search(r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=\d|#)', number_format)
+            if currency_match:
+                currency_symbol = currency_match.group(1) or currency_match.group(2)
+                # Format with currency symbol
+                if isinstance(cell.value, float):
+                    return f"{currency_symbol}{cell.value:,.2f}"
+                else:
+                    return f"{currency_symbol}{cell.value:,}"
+
+        # Handle percentage format
+        if "%" in number_format and isinstance(cell.value, (int, float)):
+            return f"{cell.value * 100:.2f}%"
+
+        # Handle decimal places from format
+        if "#" in number_format or "0" in number_format:
+            # Try to preserve decimal places
+            decimal_match = re.search(r'\.(0+|#+)', number_format)
+            if decimal_match:
+                decimal_places = len(decimal_match.group(1))
+                if isinstance(cell.value, float):
+                    return f"{cell.value:,.{decimal_places}f}"
+
+        # Default number formatting with thousand separators
+        if isinstance(cell.value, float):
+            return f"{cell.value:,.2f}"
+        elif isinstance(cell.value, int):
+            return f"{cell.value:,}"
+
+    return str(cell.value)
+
+
+def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
+    """
+    Convert an openpyxl worksheet to a Markdown table, preserving number formats.
+    """
+    rows = list(ws.iter_rows(values_only=True))
+    if not rows:
+        return ""
+
+    # Get the max column count
+    max_cols = max(len(row) for row in rows)
+
+    # Build markdown table
+    lines = []
+
+    # Header row
+    header = [str(cell) if cell is not None else "" for cell in rows[0]]
+    lines.append("| " + " | ".join(header) + " |")
+    lines.append("| " + " | ".join(["---"] * len(header)) + " |")
+
+    # Data rows - need to use openpyxl cells to get formatting
+    for row_idx in range(1, len(rows)):
+        row = rows[row_idx]
+        # Pad row if needed
+        row = list(row) + [""] * (max_cols - len(row))
+
+        # Get cell objects for formatting
+        cells = list(ws[row_idx + 1])[:max_cols]  # +1 because openpyxl is 1-indexed
+
+        formatted_cells = []
+        for i, cell in enumerate(cells):
+            if cell.value is not None:
+                # Check if we need to use cell object for formatting
+                if isinstance(cell.value, (int, float)):
+                    formatted_cells.append(_format_cell_value(cell))
+                else:
+                    formatted_cells.append(str(cell.value))
+            else:
+                formatted_cells.append("")
+
+        lines.append("| " + " | ".join(formatted_cells) + " |")
+
+    return "\n".join(lines)
+
 
 class XlsxConverter(DocumentConverter):
     """
@@ -80,17 +186,15 @@ def convert(
                 _xlsx_dependency_exc_info[2]
             )
 
-        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
+        # Read the Excel file using openpyxl to preserve number formats
+        file_stream.seek(0)
+        wb = openpyxl.load_workbook(file_stream, data_only=True)
+
         md_content = ""
-        for s in sheets:
-            md_content += f"## {s}\n"
-            html_content = sheets[s].to_html(index=False)
-            md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
-                + "\n\n"
-            )
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            md_content += f"## {sheet_name}\n"
+            md_content += _convert_sheet_to_markdown(ws) + "\n\n"
 
         return DocumentConverterResult(markdown=md_content.strip())