mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-21 18:55:16 +02:00
chore: linting
This commit is contained in:
parent
219a5977b7
commit
c187b04e82
25 changed files with 102 additions and 108 deletions
|
|
@ -903,9 +903,7 @@ async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
|
|||
describe_mock.assert_not_called()
|
||||
|
||||
|
||||
async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
|
||||
tmp_path, mocker
|
||||
):
|
||||
async def test_extract_pdf_with_vision_llm_swallows_describe_failure(tmp_path, mocker):
|
||||
"""A pypdf or vision LLM blow-up never fails the document upload."""
|
||||
pdf_file = tmp_path / "report.pdf"
|
||||
pdf_file.write_bytes(b"%PDF-1.4 fake content")
|
||||
|
|
@ -976,9 +974,7 @@ async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
|
||||
tmp_path, mocker
|
||||
):
|
||||
async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(tmp_path, mocker):
|
||||
"""The ETL service must wire an ocr_runner kwarg to describe_pictures."""
|
||||
pdf_file = tmp_path / "report.pdf"
|
||||
pdf_file.write_bytes(b"%PDF-1.4 fake content")
|
||||
|
|
@ -1027,9 +1023,7 @@ async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
|
|||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
|
||||
fake_docling = mocker.AsyncMock()
|
||||
fake_docling.process_document.return_value = {
|
||||
"content": "Slice 24 / 60 L R"
|
||||
}
|
||||
fake_docling.process_document.return_value = {"content": "Slice 24 / 60 L R"}
|
||||
mocker.patch(
|
||||
"app.services.docling_service.create_docling_service",
|
||||
return_value=fake_docling,
|
||||
|
|
@ -1074,7 +1068,7 @@ async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
|
|||
pdf_file = tmp_path / "report.pdf"
|
||||
pdf_file.write_bytes(b"%PDF-1.4 fake content")
|
||||
weird_image = tmp_path / "Im0.jp2" # JPEG2000, unlikely to be supported
|
||||
weird_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)
|
||||
weird_image.write_bytes(b"\x00\x00\x00\x0cjP" + b"\x00" * 50)
|
||||
|
||||
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||
|
||||
|
|
|
|||
|
|
@ -330,11 +330,17 @@ def test_inject_handles_multiple_images_in_order():
|
|||
result = PictureExtractionResult(
|
||||
descriptions=[
|
||||
PictureDescription(
|
||||
page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
|
||||
page_number=1,
|
||||
ordinal_in_page=0,
|
||||
name="Im0",
|
||||
sha256="aa",
|
||||
description="Desc A",
|
||||
),
|
||||
PictureDescription(
|
||||
page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
|
||||
page_number=2,
|
||||
ordinal_in_page=0,
|
||||
name="Im1",
|
||||
sha256="bb",
|
||||
description="Desc B",
|
||||
),
|
||||
]
|
||||
|
|
@ -511,9 +517,7 @@ async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
|
|||
assert by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
|
||||
|
||||
|
||||
async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
|
||||
tmp_path, mocker
|
||||
):
|
||||
async def test_describe_pictures_runs_vision_and_ocr_in_parallel(tmp_path, mocker):
|
||||
"""Vision LLM and OCR run concurrently per image, not sequentially.
|
||||
|
||||
We verify this by recording call timestamps: if both finish within
|
||||
|
|
@ -656,9 +660,7 @@ async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
|
|||
assert result.failed == 1
|
||||
|
||||
|
||||
async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
|
||||
tmp_path, mocker
|
||||
):
|
||||
async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(tmp_path, mocker):
|
||||
"""Backward compat: omitting ocr_runner produces description-only blocks."""
|
||||
pdf_file = tmp_path / "report.pdf"
|
||||
pdf_file.write_bytes(b"%PDF-1.4 fake")
|
||||
|
|
@ -824,11 +826,17 @@ def test_inject_handles_multiple_figures_in_document_order():
|
|||
result = PictureExtractionResult(
|
||||
descriptions=[
|
||||
PictureDescription(
|
||||
page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
|
||||
page_number=1,
|
||||
ordinal_in_page=0,
|
||||
name="Im0",
|
||||
sha256="aa",
|
||||
description="Description of chart A.",
|
||||
),
|
||||
PictureDescription(
|
||||
page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
|
||||
page_number=2,
|
||||
ordinal_in_page=0,
|
||||
name="Im1",
|
||||
sha256="bb",
|
||||
description="Description of chart B.",
|
||||
),
|
||||
]
|
||||
|
|
@ -842,9 +850,7 @@ def test_inject_handles_multiple_figures_in_document_order():
|
|||
assert out.count("</figure>") == 2
|
||||
assert "Description of chart A." in out
|
||||
assert "Description of chart B." in out
|
||||
assert out.index("Description of chart A.") < out.index(
|
||||
"Description of chart B."
|
||||
)
|
||||
assert out.index("Description of chart A.") < out.index("Description of chart B.")
|
||||
# Each description appears AFTER its corresponding </figure>.
|
||||
first_close = out.index("</figure>")
|
||||
assert first_close < out.index("Description of chart A.")
|
||||
|
|
@ -856,7 +862,7 @@ def test_inject_figures_with_attributes_and_nested_tags():
|
|||
"""``<figure>`` with attributes and nested tags is matched and preserved."""
|
||||
markdown = (
|
||||
'<figure id="fig-3" class="chart">\n'
|
||||
'<figcaption>Source: Pew Research</figcaption>\n'
|
||||
"<figcaption>Source: Pew Research</figcaption>\n"
|
||||
"<table><tr><td>Republican</td><td>57</td></tr></table>\n"
|
||||
"</figure>\n"
|
||||
)
|
||||
|
|
@ -899,8 +905,7 @@ def test_inject_figures_more_descriptions_than_figures_returns_remaining():
|
|||
def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
|
||||
"""Two figures, one description -> first figure enriched, second left raw."""
|
||||
markdown = (
|
||||
"<figure>\nfigure 1 content\n</figure>\n"
|
||||
"<figure>\nfigure 2 content\n</figure>\n"
|
||||
"<figure>\nfigure 1 content\n</figure>\n<figure>\nfigure 2 content\n</figure>\n"
|
||||
)
|
||||
result = PictureExtractionResult(
|
||||
descriptions=[_desc(name="Im0", description="Only description.")]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue