diff --git a/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py b/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py index c15e18e97..41de98455 100644 --- a/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py +++ b/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py @@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence( f"after {len(attempt_errors)} failures" ) - if not result.content: - return "" + content = result.content or "" + if not content.strip(): + raise RuntimeError( + "Azure Document Intelligence returned empty/whitespace-only content" + ) - return result.content + return content except ClientAuthenticationError: raise diff --git a/surfsense_backend/app/services/llm_router_service.py b/surfsense_backend/app/services/llm_router_service.py index 65f10daed..35dfdd44e 100644 --- a/surfsense_backend/app/services/llm_router_service.py +++ b/surfsense_backend/app/services/llm_router_service.py @@ -186,8 +186,12 @@ class LLMRouterService: if deployment: model_list.append(deployment) if config.get("billing_tier") == "premium": - model_string = deployment["litellm_params"]["model"] + params = deployment["litellm_params"] + model_string = params["model"] premium_models.add(model_string) + base = params.get("base_model") or config.get("model_name", "") + if base and base != model_string: + premium_models.add(base) if not model_list: logger.warning("No valid LLM configs found for router initialization") @@ -197,9 +201,9 @@ class LLMRouterService: instance._premium_model_strings = premium_models instance._router_settings = router_settings or {} logger.info( - "Router pool: %d deployments (%d premium)", + "Router pool: %d deployments, premium model strings: %s", len(model_list), - len(premium_models), + sorted(premium_models), ) # Default router settings optimized for rate limit handling @@ -258,9 +262,18 @@ class LLMRouterService: def compute_premium_tokens(cls, calls: list) -> int: """Sum ``total_tokens`` for calls whose model is premium.""" instance = cls.get_instance() - return sum( + total = sum( c.total_tokens for c in calls if c.model in instance._premium_model_strings ) + if calls: + call_models = [c.model for c in calls] + logger.info( + "[premium_tokens] call models=%s, premium_set=%s, result=%d", + call_models, + sorted(instance._premium_model_strings), + total, + ) + return total @classmethod def _build_context_fallback_groups( diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py index bbff4838e..76c3ea209 100644 --- a/surfsense_backend/app/tasks/document_processors/_direct_converters.py +++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py @@ -21,6 +21,42 @@ from markdownify import markdownify # at import time so every csv.reader call in this module can handle large fields. csv.field_size_limit(2**31 - 1) +_BOM_ENCODINGS: list[tuple[bytes, str]] = [ + (b"\xff\xfe\x00\x00", "utf-32-le"), + (b"\x00\x00\xfe\xff", "utf-32-be"), + (b"\xff\xfe", "utf-16-le"), + (b"\xfe\xff", "utf-16-be"), + (b"\xef\xbb\xbf", "utf-8-sig"), +] + + +def _detect_encoding(file_path: str) -> str: + """Sniff the BOM to pick an encoding, falling back to utf-8.""" + head = Path(file_path).read_bytes()[:4] + for bom, encoding in _BOM_ENCODINGS: + if head.startswith(bom): + return encoding + return "utf-8" + + +def _read_text(file_path: str) -> str: + """Read a file with automatic encoding detection. + + Tries BOM-based detection first, then utf-8, then latin-1 as a + last resort (latin-1 accepts every byte value). + """ + encoding = _detect_encoding(file_path) + try: + return Path(file_path).read_text(encoding=encoding) + except (UnicodeDecodeError, UnicodeError): + pass + if encoding != "utf-8": + try: + return Path(file_path).read_text(encoding="utf-8") + except (UnicodeDecodeError, UnicodeError): + pass + return Path(file_path).read_text(encoding="latin-1") + def _escape_pipe(cell: str) -> str: """Escape literal pipe characters inside a markdown table cell.""" @@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str: The first row is treated as the header. An empty file returns an empty string so the caller can decide how to handle it. """ - with open(file_path, encoding="utf-8", newline="") as fh: - reader = csv.reader(fh, delimiter=delimiter) - rows = list(reader) + text = _read_text(file_path) + reader = csv.reader(text.splitlines(), delimiter=delimiter) + rows = list(reader) if not rows: return "" @@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str: def html_to_markdown(file_path: str) -> str: """Convert an HTML file to markdown via ``markdownify``.""" - html = Path(file_path).read_text(encoding="utf-8") + html = _read_text(file_path) return markdownify(html).strip() diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index a2d1a6412..1271550df 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -436,7 +436,7 @@ async def _extract_file_content( with contextlib.suppress(Exception): os.unlink(file_path) - if not result.markdown_content: + if not result.markdown_content or not result.markdown_content.strip(): raise RuntimeError(f"Failed to extract content from file: {filename}") return result.markdown_content, result.etl_service, billable_pages diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 9e1905dc3..65fa117f7 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -546,29 +546,35 @@ export function DocumentUploadTab({ ) ) : ( - + {renderBrowseButton({ fullWidth: true })} + + )}