feat(workspace): FileParser ABC and MarkitdownParser

This commit is contained in:
alt-glitch
2026-04-20 03:38:32 +05:30
parent ec0fa5a2be
commit 5008f123ae
2 changed files with 160 additions and 0 deletions
+116
View File
@@ -1,3 +1,8 @@
import sys
import types
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from workspace.constants import BINARY_SUFFIXES, CODE_SUFFIXES, MARKDOWN_SUFFIXES, PARSEABLE_SUFFIXES
@@ -55,3 +60,114 @@ class TestParsingConfig:
cfg = ParsingConfig.model_validate({"default": "markitdown", "future_key": True})
assert cfg.default == "markitdown"
assert not hasattr(cfg, "future_key")
class TestFileParserABC:
    """Contract tests for the FileParser base class and its parse() wrapper."""

    def test_cannot_instantiate_directly(self):
        """Constructing the abstract base itself raises TypeError."""
        from workspace.parsers import FileParser

        with pytest.raises(TypeError):
            FileParser()

    def test_parse_returns_none_on_exception(self):
        """An exception raised by _convert is turned into a None result."""
        from workspace.parsers import FileParser

        class _Exploding(FileParser):
            name = "exploding"

            def supported_suffixes(self) -> frozenset[str]:
                return frozenset((".boom",))

            def _convert(self, path: Path) -> str:
                raise RuntimeError("kaboom")

        assert _Exploding().parse(Path("/fake/file.boom")) is None

    def test_parse_returns_none_on_empty_output(self):
        """Whitespace-only conversion output is reported as None."""
        from workspace.parsers import FileParser

        class _Blank(FileParser):
            name = "empty"

            def supported_suffixes(self) -> frozenset[str]:
                return frozenset((".empty",))

            def _convert(self, path: Path) -> str:
                return " \n \n "

        assert _Blank().parse(Path("/fake/file.empty")) is None

    def test_parse_returns_content_on_success(self):
        """Non-blank conversion output is passed through unchanged."""
        from workspace.parsers import FileParser

        expected = "# Converted\n\nContent."

        class _Working(FileParser):
            name = "good"

            def supported_suffixes(self) -> frozenset[str]:
                return frozenset((".good",))

            def _convert(self, path: Path) -> str:
                return "# Converted\n\nContent."

        assert _Working().parse(Path("/fake/file.good")) == expected
@pytest.fixture
def mock_markitdown():
    """Install a fake ``markitdown`` module in sys.modules for one test.

    The fake module exposes a ``MarkItDown`` callable that returns a mock
    converter; that converter's ``convert()`` result carries fixed markdown.
    The fixture yields the converter so tests can inspect its calls, and the
    sys.modules entry is restored when the test finishes.
    """
    converter = MagicMock()
    converter.convert.return_value.markdown = (
        "# Converted\n\nParsed content from document."
    )

    fake_module = types.ModuleType("markitdown")
    fake_module.MarkItDown = MagicMock(return_value=converter)

    # patch.dict puts back whatever was registered under "markitdown" on exit.
    with patch.dict(sys.modules, {"markitdown": fake_module}):
        yield converter
class TestMarkitdownParser:
    """Tests for the markitdown-backed parser."""

    def test_name(self):
        """The backend identifies itself as "markitdown"."""
        from workspace.parsers import MarkitdownParser

        assert MarkitdownParser.name == "markitdown"

    def test_supported_suffixes(self):
        """PDF, Word and PowerPoint suffixes are all advertised."""
        from workspace.parsers import MarkitdownParser

        supported = MarkitdownParser().supported_suffixes()
        for suffix in (".pdf", ".docx", ".pptx"):
            assert suffix in supported

    def test_parse_calls_markitdown_convert(self, tmp_path, mock_markitdown):
        """parse() hands the path, as a string, to MarkItDown.convert()."""
        from workspace.parsers import MarkitdownParser

        document = tmp_path / "report.pdf"
        document.write_bytes(b"%PDF-1.4 fake content")

        markdown = MarkitdownParser().parse(document)

        assert markdown == "# Converted\n\nParsed content from document."
        mock_markitdown.convert.assert_called_once_with(str(document))

    def test_parse_returns_none_when_markitdown_not_installed(self, tmp_path):
        """A failing markitdown import results in None from parse()."""
        from workspace.parsers import MarkitdownParser

        document = tmp_path / "report.pdf"
        document.write_bytes(b"%PDF-1.4 fake")

        # A None entry in sys.modules makes "import markitdown" raise
        # ImportError, simulating an environment without the package.
        with patch.dict(sys.modules, {"markitdown": None}):
            outcome = MarkitdownParser().parse(document)

        assert outcome is None
+44
View File
@@ -0,0 +1,44 @@
"""File parsers -- convert binary formats (.pdf, .docx, .pptx) to markdown.
FileParser ABC defines the contract. Built-in backend: markitdown.
TODO: the pandoc backend and the CompositeParser (which routes extensions to
the configured backend) are not defined in this module yet.
"""
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import ClassVar
log = logging.getLogger(__name__)
class FileParser(ABC):
name: ClassVar[str]
@abstractmethod
def supported_suffixes(self) -> frozenset[str]: ...
@abstractmethod
def _convert(self, path: Path) -> str: ...
def parse(self, path: Path) -> str | None:
try:
result = self._convert(path)
return result if result and result.strip() else None
except Exception as exc:
log.warning("Parser %s failed on %s: %s", self.name, path, exc)
return None
class MarkitdownParser(FileParser):
    """FileParser backend that delegates conversion to the markitdown package."""

    name = "markitdown"

    def supported_suffixes(self) -> frozenset[str]:
        """Return the suffixes routed to markitdown: .pdf, .docx and .pptx."""
        handled = (".pdf", ".docx", ".pptx")
        return frozenset(handled)

    def _convert(self, path: Path) -> str:
        """Run markitdown on *path* and return the markdown it produces."""
        # Imported at call time, so merely importing this module does not
        # require markitdown; an ImportError surfaces here instead.
        from markitdown import MarkItDown

        converter = MarkItDown()
        return converter.convert(str(path)).markdown