refactor: improve content extraction and encoding handling

- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
- Updated LLMRouterService to log premium model strings more clearly.
- Added automatic encoding detection for file reading in document processors.
- Improved error handling for empty markdown content extraction in file processors.
- Refactored DocumentUploadTab component for better accessibility and user interaction.
2026-06-22 21:28:12 +02:00 · 2026-04-16 00:25:46 -07:00 · 2026-04-16 00:25:46 -07:00 · 2f793e7a69
commit 2f793e7a69
parent 4a51ccdc2c
5 changed files with 91 additions and 33 deletions
--- a/surfsense_backend/app/tasks/document_processors/_direct_converters.py
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@ -21,6 +21,42 @@ from markdownify import markdownify
 # at import time so every csv.reader call in this module can handle large fields.
 csv.field_size_limit(2**31 - 1)

+# Byte-order-mark prefix -> codec used to decode the file.
+# Order matters: the 4-byte UTF-32 marks come before the 2-byte UTF-16 marks
+# because the UTF-32-LE BOM begins with the UTF-16-LE BOM.
+# The endianness-neutral "utf-16" / "utf-32" codecs are used on purpose: they
+# read the BOM to pick the byte order and strip it, whereas the explicit
+# "-le" / "-be" codecs leave it in the decoded text as a leading U+FEFF.
+_BOM_ENCODINGS: list[tuple[bytes, str]] = [
+    (b"\xff\xfe\x00\x00", "utf-32"),
+    (b"\x00\x00\xfe\xff", "utf-32"),
+    (b"\xff\xfe", "utf-16"),
+    (b"\xfe\xff", "utf-16"),
+    (b"\xef\xbb\xbf", "utf-8-sig"),
+]
+
+
+def _detect_encoding(file_path: str) -> str:
+    """Sniff the BOM to pick an encoding, falling back to utf-8.
+
+    Only the first four bytes are read (the longest BOM listed in
+    ``_BOM_ENCODINGS``), so the file is not loaded in full just to be sniffed.
+    """
+    with open(file_path, "rb") as fh:
+        head = fh.read(4)
+    # A file shorter than a given BOM cannot match it and falls through.
+    for bom, encoding in _BOM_ENCODINGS:
+        if head.startswith(bom):
+            return encoding
+    return "utf-8"
+
+
+def _read_text(file_path: str) -> str:
+    """Return the text of *file_path*, guessing its encoding.
+
+    Decoding is attempted with the BOM-detected codec, then with utf-8
+    (when that was not already the detected codec), and finally with
+    latin-1, which cannot fail because it maps every byte value.
+    """
+    source = Path(file_path)
+    detected = _detect_encoding(file_path)
+    candidates = [detected] if detected == "utf-8" else [detected, "utf-8"]
+    for codec in candidates:
+        try:
+            return source.read_text(encoding=codec)
+        except UnicodeError:
+            # UnicodeDecodeError is a UnicodeError; move on to the next codec.
+            continue
+    return source.read_text(encoding="latin-1")
+

 def _escape_pipe(cell: str) -> str:
    """Escape literal pipe characters inside a markdown table cell."""
@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
    The first row is treated as the header.  An empty file returns an
    empty string so the caller can decide how to handle it.
    """
-    with open(file_path, encoding="utf-8", newline="") as fh:
-        reader = csv.reader(fh, delimiter=delimiter)
-        rows = list(reader)
+    text = _read_text(file_path)
+    reader = csv.reader(text.splitlines(), delimiter=delimiter)
+    rows = list(reader)

    if not rows:
        return ""
@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:

 def html_to_markdown(file_path: str) -> str:
    """Convert an HTML file to markdown via ``markdownify``."""
-    html = Path(file_path).read_text(encoding="utf-8")
+    # Decode through _read_text (BOM sniffing, then utf-8, then latin-1) so
+    # HTML that is not valid UTF-8 no longer raises UnicodeDecodeError here.
+    html = _read_text(file_path)
    return markdownify(html).strip()


--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -436,7 +436,7 @@ async def _extract_file_content(
    with contextlib.suppress(Exception):
        os.unlink(file_path)

-    if not result.markdown_content:
+    if not result.markdown_content or not result.markdown_content.strip():
        raise RuntimeError(f"Failed to extract content from file: {filename}")

    return result.markdown_content, result.etl_service, billable_pages