feat(workspace): FileParser ABC and MarkitdownParser
This commit is contained in:
@@ -1,3 +1,8 @@
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from workspace.constants import BINARY_SUFFIXES, CODE_SUFFIXES, MARKDOWN_SUFFIXES, PARSEABLE_SUFFIXES
|
||||
@@ -55,3 +60,114 @@ class TestParsingConfig:
|
||||
cfg = ParsingConfig.model_validate({"default": "markitdown", "future_key": True})
|
||||
assert cfg.default == "markitdown"
|
||||
assert not hasattr(cfg, "future_key")
|
||||
|
||||
|
||||
class TestFileParserABC:
|
||||
def test_cannot_instantiate_directly(self):
|
||||
from workspace.parsers import FileParser
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
FileParser()
|
||||
|
||||
def test_parse_returns_none_on_exception(self):
|
||||
from workspace.parsers import FileParser
|
||||
|
||||
class ExplodingParser(FileParser):
|
||||
name = "exploding"
|
||||
|
||||
def supported_suffixes(self) -> frozenset[str]:
|
||||
return frozenset({".boom"})
|
||||
|
||||
def _convert(self, path: Path) -> str:
|
||||
raise RuntimeError("kaboom")
|
||||
|
||||
parser = ExplodingParser()
|
||||
result = parser.parse(Path("/fake/file.boom"))
|
||||
assert result is None
|
||||
|
||||
def test_parse_returns_none_on_empty_output(self):
|
||||
from workspace.parsers import FileParser
|
||||
|
||||
class EmptyParser(FileParser):
|
||||
name = "empty"
|
||||
|
||||
def supported_suffixes(self) -> frozenset[str]:
|
||||
return frozenset({".empty"})
|
||||
|
||||
def _convert(self, path: Path) -> str:
|
||||
return " \n \n "
|
||||
|
||||
parser = EmptyParser()
|
||||
result = parser.parse(Path("/fake/file.empty"))
|
||||
assert result is None
|
||||
|
||||
def test_parse_returns_content_on_success(self):
|
||||
from workspace.parsers import FileParser
|
||||
|
||||
class GoodParser(FileParser):
|
||||
name = "good"
|
||||
|
||||
def supported_suffixes(self) -> frozenset[str]:
|
||||
return frozenset({".good"})
|
||||
|
||||
def _convert(self, path: Path) -> str:
|
||||
return "# Converted\n\nContent."
|
||||
|
||||
parser = GoodParser()
|
||||
result = parser.parse(Path("/fake/file.good"))
|
||||
assert result == "# Converted\n\nContent."
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_markitdown():
|
||||
"""Mock the markitdown module regardless of whether it's installed."""
|
||||
mock_result = MagicMock()
|
||||
mock_result.markdown = "# Converted\n\nParsed content from document."
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.convert.return_value = mock_result
|
||||
mock_class = MagicMock(return_value=mock_instance)
|
||||
|
||||
module = types.ModuleType("markitdown")
|
||||
module.MarkItDown = mock_class
|
||||
|
||||
with patch.dict(sys.modules, {"markitdown": module}):
|
||||
yield mock_instance
|
||||
|
||||
|
||||
class TestMarkitdownParser:
|
||||
def test_name(self):
|
||||
from workspace.parsers import MarkitdownParser
|
||||
|
||||
assert MarkitdownParser.name == "markitdown"
|
||||
|
||||
def test_supported_suffixes(self):
|
||||
from workspace.parsers import MarkitdownParser
|
||||
|
||||
suffixes = MarkitdownParser().supported_suffixes()
|
||||
assert ".pdf" in suffixes
|
||||
assert ".docx" in suffixes
|
||||
assert ".pptx" in suffixes
|
||||
|
||||
def test_parse_calls_markitdown_convert(self, tmp_path, mock_markitdown):
|
||||
from workspace.parsers import MarkitdownParser
|
||||
|
||||
pdf = tmp_path / "report.pdf"
|
||||
pdf.write_bytes(b"%PDF-1.4 fake content")
|
||||
|
||||
parser = MarkitdownParser()
|
||||
result = parser.parse(pdf)
|
||||
|
||||
assert result == "# Converted\n\nParsed content from document."
|
||||
mock_markitdown.convert.assert_called_once_with(str(pdf))
|
||||
|
||||
def test_parse_returns_none_when_markitdown_not_installed(self, tmp_path):
|
||||
from workspace.parsers import MarkitdownParser
|
||||
|
||||
pdf = tmp_path / "report.pdf"
|
||||
pdf.write_bytes(b"%PDF-1.4 fake")
|
||||
|
||||
with patch.dict(sys.modules, {"markitdown": None}):
|
||||
parser = MarkitdownParser()
|
||||
result = parser.parse(pdf)
|
||||
|
||||
assert result is None
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
"""File parsers -- convert binary formats (.pdf, .docx, .pptx) to markdown.
|
||||
|
||||
FileParser ABC defines the contract. Built-in backends: markitdown, pandoc.
|
||||
CompositeParser routes extensions to the configured backend.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import ClassVar
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileParser(ABC):
|
||||
name: ClassVar[str]
|
||||
|
||||
@abstractmethod
|
||||
def supported_suffixes(self) -> frozenset[str]: ...
|
||||
|
||||
@abstractmethod
|
||||
def _convert(self, path: Path) -> str: ...
|
||||
|
||||
def parse(self, path: Path) -> str | None:
|
||||
try:
|
||||
result = self._convert(path)
|
||||
return result if result and result.strip() else None
|
||||
except Exception as exc:
|
||||
log.warning("Parser %s failed on %s: %s", self.name, path, exc)
|
||||
return None
|
||||
|
||||
|
||||
class MarkitdownParser(FileParser):
|
||||
name = "markitdown"
|
||||
|
||||
def supported_suffixes(self) -> frozenset[str]:
|
||||
return frozenset({".pdf", ".docx", ".pptx"})
|
||||
|
||||
def _convert(self, path: Path) -> str:
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert(str(path))
|
||||
return result.markdown
|
||||
Reference in New Issue
Block a user