diff --git a/tests/workspace/test_parsers.py b/tests/workspace/test_parsers.py index 0c01a43182..e4e9dfd653 100644 --- a/tests/workspace/test_parsers.py +++ b/tests/workspace/test_parsers.py @@ -1,3 +1,8 @@ +import sys +import types +from pathlib import Path +from unittest.mock import MagicMock, patch + import pytest from workspace.constants import BINARY_SUFFIXES, CODE_SUFFIXES, MARKDOWN_SUFFIXES, PARSEABLE_SUFFIXES @@ -55,3 +60,114 @@ class TestParsingConfig: cfg = ParsingConfig.model_validate({"default": "markitdown", "future_key": True}) assert cfg.default == "markitdown" assert not hasattr(cfg, "future_key") + + +class TestFileParserABC: + def test_cannot_instantiate_directly(self): + from workspace.parsers import FileParser + + with pytest.raises(TypeError): + FileParser() + + def test_parse_returns_none_on_exception(self): + from workspace.parsers import FileParser + + class ExplodingParser(FileParser): + name = "exploding" + + def supported_suffixes(self) -> frozenset[str]: + return frozenset({".boom"}) + + def _convert(self, path: Path) -> str: + raise RuntimeError("kaboom") + + parser = ExplodingParser() + result = parser.parse(Path("/fake/file.boom")) + assert result is None + + def test_parse_returns_none_on_empty_output(self): + from workspace.parsers import FileParser + + class EmptyParser(FileParser): + name = "empty" + + def supported_suffixes(self) -> frozenset[str]: + return frozenset({".empty"}) + + def _convert(self, path: Path) -> str: + return " \n \n " + + parser = EmptyParser() + result = parser.parse(Path("/fake/file.empty")) + assert result is None + + def test_parse_returns_content_on_success(self): + from workspace.parsers import FileParser + + class GoodParser(FileParser): + name = "good" + + def supported_suffixes(self) -> frozenset[str]: + return frozenset({".good"}) + + def _convert(self, path: Path) -> str: + return "# Converted\n\nContent." + + parser = GoodParser() + result = parser.parse(Path("/fake/file.good")) + assert result == "# Converted\n\nContent." + + +@pytest.fixture +def mock_markitdown(): + """Mock the markitdown module regardless of whether it's installed.""" + mock_result = MagicMock() + mock_result.markdown = "# Converted\n\nParsed content from document." + mock_instance = MagicMock() + mock_instance.convert.return_value = mock_result + mock_class = MagicMock(return_value=mock_instance) + + module = types.ModuleType("markitdown") + module.MarkItDown = mock_class + + with patch.dict(sys.modules, {"markitdown": module}): + yield mock_instance + + +class TestMarkitdownParser: + def test_name(self): + from workspace.parsers import MarkitdownParser + + assert MarkitdownParser.name == "markitdown" + + def test_supported_suffixes(self): + from workspace.parsers import MarkitdownParser + + suffixes = MarkitdownParser().supported_suffixes() + assert ".pdf" in suffixes + assert ".docx" in suffixes + assert ".pptx" in suffixes + + def test_parse_calls_markitdown_convert(self, tmp_path, mock_markitdown): + from workspace.parsers import MarkitdownParser + + pdf = tmp_path / "report.pdf" + pdf.write_bytes(b"%PDF-1.4 fake content") + + parser = MarkitdownParser() + result = parser.parse(pdf) + + assert result == "# Converted\n\nParsed content from document." + mock_markitdown.convert.assert_called_once_with(str(pdf)) + + def test_parse_returns_none_when_markitdown_not_installed(self, tmp_path): + from workspace.parsers import MarkitdownParser + + pdf = tmp_path / "report.pdf" + pdf.write_bytes(b"%PDF-1.4 fake") + + with patch.dict(sys.modules, {"markitdown": None}): + parser = MarkitdownParser() + result = parser.parse(pdf) + + assert result is None diff --git a/workspace/parsers.py b/workspace/parsers.py new file mode 100644 index 0000000000..77f27aed59 --- /dev/null +++ b/workspace/parsers.py @@ -0,0 +1,44 @@ +"""File parsers -- convert binary formats (.pdf, .docx, .pptx) to markdown. + +FileParser ABC defines the contract. Built-in backends: markitdown, pandoc. +CompositeParser routes extensions to the configured backend. +""" + +import logging +from abc import ABC, abstractmethod +from pathlib import Path +from typing import ClassVar + +log = logging.getLogger(__name__) + + +class FileParser(ABC): + name: ClassVar[str] + + @abstractmethod + def supported_suffixes(self) -> frozenset[str]: ... + + @abstractmethod + def _convert(self, path: Path) -> str: ... + + def parse(self, path: Path) -> str | None: + try: + result = self._convert(path) + return result if result and result.strip() else None + except Exception as exc: + log.warning("Parser %s failed on %s: %s", self.name, path, exc) + return None + + +class MarkitdownParser(FileParser): + name = "markitdown" + + def supported_suffixes(self) -> frozenset[str]: + return frozenset({".pdf", ".docx", ".pptx"}) + + def _convert(self, path: Path) -> str: + from markitdown import MarkItDown + + md = MarkItDown() + result = md.convert(str(path)) + return result.markdown