Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

> [!TIP]
> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.

> [!IMPORTANT]
> Breaking changes between 0.0.1 and 0.1.0:
Expand All @@ -26,7 +26,7 @@ MarkItDown currently supports the conversion from:
- HTML
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- Youtube URLs
- YouTube URLs
- EPubs
- ... and more!

Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BingSerpConverter,
PdfConverter,
DocxConverter,
DocConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
Expand Down Expand Up @@ -191,6 +192,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
Expand Down
1 change: 1 addition & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._doc_converter import DocConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
Expand Down
97 changes: 97 additions & 0 deletions packages/markitdown/src/markitdown/converters/_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import subprocess
import sys
import tempfile
import io

from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException


# MIME type prefixes that DocConverter.accepts() treats as legacy Word documents;
# they are compared with str.startswith() against the lower-cased stream MIME type.
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/msword",
]

# File extensions (lower-case, including the leading dot) that
# DocConverter.accepts() recognizes without looking at the MIME type.
ACCEPTED_FILE_EXTENSIONS = [".doc"]


def _check_antiword() -> None:
    """Verify that the ``antiword`` executable can be found on ``PATH``.

    The lookup is done with :func:`shutil.which` rather than by launching
    ``antiword`` with no arguments: a bare invocation has no input file to
    process, so a non-zero exit status from it says nothing about whether the
    tool is installed and must not be reported as a missing dependency.

    Raises:
        MissingDependencyException: If no ``antiword`` executable is found.
    """
    # Function-scope import so this helper does not rely on a file-level import.
    import shutil

    if shutil.which("antiword") is None:
        msg = (
            "DOCConverter recognized the input as a potential .doc file, "
            "but the 'antiword' tool is not installed. "
            "On Debian/Ubuntu, install it with: sudo apt install antiword"
        )
        raise MissingDependencyException(msg)


class DocConverter(DocumentConverter):
    """
    Converts legacy DOC files (not DOCX) to Markdown using antiword.
    DOC is the older binary format that was used before Office Open XML (DOCX).

    The conversion shells out to the external ``antiword`` program; its plain
    text output is returned unchanged as the Markdown body.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like a legacy ``.doc`` file.

        A stream is accepted if its extension is listed in
        ``ACCEPTED_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_MIME_TYPE_PREFIXES``. Both comparisons are case-insensitive
        and a missing extension or MIME type is treated as an empty string.
        ``file_stream`` itself is not read.
        """
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        mimetype = (stream_info.mimetype or "").lower()
        return any(
            mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert a ``.doc`` stream to text by running ``antiword`` on it.

        The remaining content of ``file_stream`` is copied to a temporary
        ``.doc`` file, ``antiword -w 0 <file>`` is run on it, and the captured
        standard output becomes the result. The temporary file is always
        removed afterwards, whether or not the conversion succeeded.

        If ``antiword`` exits with a non-zero status, whatever it wrote to
        standard output is kept and a bracketed warning containing its standard
        error is appended, instead of raising.

        Raises:
            MissingDependencyException: If ``antiword`` is not available
                (raised by ``_check_antiword``).
        """
        # Function-scope import so the cleanup below does not rely on a
        # file-level import of ``os``.
        import os

        # Check if antiword is available
        _check_antiword()

        # delete=False lets the file be closed (and therefore flushed) before
        # antiword opens it by name; removal is handled in ``finally`` below.
        tmp = tempfile.NamedTemporaryFile(suffix=".doc", delete=False)
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(file_stream.read())

            try:
                result = subprocess.run(
                    ["antiword", "-w", "0", tmp_path],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                text_content = result.stdout
            except subprocess.CalledProcessError as e:
                # Best effort: keep any partial output and surface the error text.
                text_content = e.stdout if e.stdout else ""
                text_content += f"\n\n[Warning: Conversion had errors: {e.stderr}]"
        finally:
            # Without this, every conversion would leave a stray temp file behind.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

        return DocumentConverterResult(markdown=text_content)
128 changes: 116 additions & 12 deletions packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import re
import sys
from typing import BinaryIO, Any
from io import BytesIO
from typing import Any, BinaryIO

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand All @@ -10,7 +13,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -32,6 +35,109 @@
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]

# Regex over Excel number-format strings. It matches any of: a currency symbol
# ($, €, £, ¥, ₹) wrapped in single or double quotes (e.g. "$"#,##0.00), a
# currency symbol immediately followed by a digit, a literal "#", or a literal "0".
# NOTE(review): because of the bare "#" and "0" alternatives this also matches
# ordinary numeric formats, and nothing in the visible part of this file
# references the constant — confirm whether it is still needed.
CURRENCY_FORMAT_PATTERN = re.compile(r'["\']([$€£¥₹])["\']|([$€£¥₹])\d|#|0')


def _format_cell_value(cell: "openpyxl.cell.Cell") -> str:
"""
Format a cell value, preserving currency and other number formats.
"""
if cell.value is None:
return ""

# Check if it's a number type
if isinstance(cell.value, (int, float)):
number_format = cell.number_format

# Check if the number format contains currency symbols
# Common currency formats: "$"#,##0.00, €#,##0.00, $#,##0.00
if "$" in number_format or "€" in number_format or "£" in number_format or "¥" in number_format or "₹" in number_format:
# Try to use openpyxl's built-in formatting
try:
formatted = openpyxl.styles.numbers.format(cell.value, number_format)
# Clean up the formatted value (remove extra spaces, fix formatting)
formatted = formatted.strip()
if formatted and formatted != str(cell.value):
return formatted
except Exception:
pass

# Fallback: extract currency symbol from format string
currency_match = re.search(r'["\']([$€£¥₹])["\']|([$€£¥₹])(?=\d|#)', number_format)
if currency_match:
currency_symbol = currency_match.group(1) or currency_match.group(2)
# Format with currency symbol
if isinstance(cell.value, float):
return f"{currency_symbol}{cell.value:,.2f}"
else:
return f"{currency_symbol}{cell.value:,}"

# Handle percentage format
if "%" in number_format and isinstance(cell.value, (int, float)):
return f"{cell.value * 100:.2f}%"

# Handle decimal places from format
if "#" in number_format or "0" in number_format:
# Try to preserve decimal places
decimal_match = re.search(r'\.(0+|#+)', number_format)
if decimal_match:
decimal_places = len(decimal_match.group(1))
if isinstance(cell.value, float):
return f"{cell.value:,.{decimal_places}f}"

# Default number formatting with thousand separators
if isinstance(cell.value, float):
return f"{cell.value:,.2f}"
elif isinstance(cell.value, int):
return f"{cell.value:,}"

return str(cell.value)


def _convert_sheet_to_markdown(ws: "openpyxl.worksheet.worksheet.Worksheet") -> str:
"""
Convert an openpyxl worksheet to a Markdown table, preserving number formats.
"""
rows = list(ws.iter_rows(values_only=True))
if not rows:
return ""

# Get the max column count
max_cols = max(len(row) for row in rows)

# Build markdown table
lines = []

# Header row
header = [str(cell) if cell is not None else "" for cell in rows[0]]
lines.append("| " + " | ".join(header) + " |")
lines.append("| " + " | ".join(["---"] * len(header)) + " |")

# Data rows - need to use openpyxl cells to get formatting
for row_idx in range(1, len(rows)):
row = rows[row_idx]
# Pad row if needed
row = list(row) + [""] * (max_cols - len(row))

# Get cell objects for formatting
cells = list(ws[row_idx + 1])[:max_cols] # +1 because openpyxl is 1-indexed

formatted_cells = []
for i, cell in enumerate(cells):
if cell.value is not None:
# Check if we need to use cell object for formatting
if isinstance(cell.value, (int, float)):
formatted_cells.append(_format_cell_value(cell))
else:
formatted_cells.append(str(cell.value))
else:
formatted_cells.append("")

lines.append("| " + " | ".join(formatted_cells) + " |")

return "\n".join(lines)


class XlsxConverter(DocumentConverter):
"""
Expand Down Expand Up @@ -80,17 +186,15 @@ def convert(
_xlsx_dependency_exc_info[2]
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
# Read the Excel file using openpyxl to preserve number formats
file_stream.seek(0)
wb = openpyxl.load_workbook(file_stream, data_only=True)

md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
md_content += f"## {sheet_name}\n"
md_content += _convert_sheet_to_markdown(ws) + "\n\n"

return DocumentConverterResult(markdown=md_content.strip())

Expand Down