refactor: improve content extraction and encoding handling

- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
- Updated LLMRouterService to log premium model strings more clearly.
- Added automatic encoding detection for file reading in document processors.
- Improved error handling for empty markdown content extraction in file processors.
- Refactored DocumentUploadTab component for better accessibility and user interaction.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-16 00:25:46 -07:00
parent 4a51ccdc2c
commit 2f793e7a69
5 changed files with 91 additions and 33 deletions

View file

@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence(
f"after {len(attempt_errors)} failures"
)
if not result.content:
return ""
content = result.content or ""
if not content.strip():
raise RuntimeError(
"Azure Document Intelligence returned empty/whitespace-only content"
)
return result.content
return content
except ClientAuthenticationError:
raise

View file

@ -186,8 +186,12 @@ class LLMRouterService:
if deployment:
model_list.append(deployment)
if config.get("billing_tier") == "premium":
model_string = deployment["litellm_params"]["model"]
params = deployment["litellm_params"]
model_string = params["model"]
premium_models.add(model_string)
base = params.get("base_model") or config.get("model_name", "")
if base and base != model_string:
premium_models.add(base)
if not model_list:
logger.warning("No valid LLM configs found for router initialization")
@ -197,9 +201,9 @@ class LLMRouterService:
instance._premium_model_strings = premium_models
instance._router_settings = router_settings or {}
logger.info(
"Router pool: %d deployments (%d premium)",
"Router pool: %d deployments, premium model strings: %s",
len(model_list),
len(premium_models),
sorted(premium_models),
)
# Default router settings optimized for rate limit handling
@ -258,9 +262,18 @@ class LLMRouterService:
def compute_premium_tokens(cls, calls: list) -> int:
"""Sum ``total_tokens`` for calls whose model is premium."""
instance = cls.get_instance()
return sum(
total = sum(
c.total_tokens for c in calls if c.model in instance._premium_model_strings
)
if calls:
call_models = [c.model for c in calls]
logger.info(
"[premium_tokens] call models=%s, premium_set=%s, result=%d",
call_models,
sorted(instance._premium_model_strings),
total,
)
return total
@classmethod
def _build_context_fallback_groups(

View file

@ -21,6 +21,42 @@ from markdownify import markdownify
# at import time so every csv.reader call in this module can handle large fields.
csv.field_size_limit(2**31 - 1)
# Byte-order marks mapped to the codec used to decode a file that starts with
# them. Order matters: the UTF-32 LE mark begins with the UTF-16 LE mark, so
# the 4-byte marks must be tested before the 2-byte ones.
#
# The BOM-aware codecs ("utf-16", "utf-32", "utf-8-sig") are used on purpose:
# they read the mark to pick the byte order and strip it from the decoded
# text. The endian-specific codecs ("utf-16-le", ...) would leave a stray
# U+FEFF at the start of the text.
_BOM_ENCODINGS: list[tuple[bytes, str]] = [
    (b"\xff\xfe\x00\x00", "utf-32"),
    (b"\x00\x00\xfe\xff", "utf-32"),
    (b"\xff\xfe", "utf-16"),
    (b"\xfe\xff", "utf-16"),
    (b"\xef\xbb\xbf", "utf-8-sig"),
]


def _detect_encoding(file_path: str) -> str:
    """Sniff the BOM to pick an encoding, falling back to utf-8.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The codec name paired with the first matching byte-order mark in
        ``_BOM_ENCODINGS``, or ``"utf-8"`` when the file has no recognised
        mark (including when it is empty).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Only the first four bytes are needed; avoid loading the whole file.
    with open(file_path, "rb") as fh:
        head = fh.read(4)
    for bom, encoding in _BOM_ENCODINGS:
        if head.startswith(bom):
            return encoding
    return "utf-8"
def _read_text(file_path: str) -> str:
    """Return the text of *file_path*, choosing the encoding automatically.

    Decoding is attempted with the encoding reported by
    ``_detect_encoding`` first, then with utf-8 (unless that was already
    the detected encoding), and finally with latin-1, which accepts every
    byte value and therefore cannot raise a decode error.
    """
    source = Path(file_path)
    detected = _detect_encoding(file_path)

    # utf-8 is only retried when it differs from the detected encoding.
    candidates = [detected] if detected == "utf-8" else [detected, "utf-8"]
    for codec in candidates:
        try:
            return source.read_text(encoding=codec)
        except UnicodeError:
            # UnicodeDecodeError is a subclass of UnicodeError; move on to
            # the next candidate.
            continue

    return source.read_text(encoding="latin-1")
def _escape_pipe(cell: str) -> str:
"""Escape literal pipe characters inside a markdown table cell."""
@ -33,8 +69,8 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
The first row is treated as the header. An empty file returns an
empty string so the caller can decide how to handle it.
"""
with open(file_path, encoding="utf-8", newline="") as fh:
reader = csv.reader(fh, delimiter=delimiter)
text = _read_text(file_path)
reader = csv.reader(text.splitlines(), delimiter=delimiter)
rows = list(reader)
if not rows:
@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
def html_to_markdown(file_path: str) -> str:
"""Convert an HTML file to markdown via ``markdownify``."""
html = Path(file_path).read_text(encoding="utf-8")
html = _read_text(file_path)
return markdownify(html).strip()

View file

@ -436,7 +436,7 @@ async def _extract_file_content(
with contextlib.suppress(Exception):
os.unlink(file_path)
if not result.markdown_content:
if not result.markdown_content or not result.markdown_content.strip():
raise RuntimeError(f"Failed to extract content from file: {filename}")
return result.markdown_content, result.etl_service, billable_pages

View file

@ -546,13 +546,19 @@ export function DocumentUploadTab({
</button>
)
) : (
<button
type="button"
<div
role="button"
tabIndex={0}
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
onClick={() => {
if (!isElectron) fileInputRef.current?.click();
}}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
if (!isElectron) fileInputRef.current?.click();
}
}}
>
<Upload className="h-10 w-10 text-muted-foreground" />
<div className="text-center space-y-1.5">
@ -568,7 +574,7 @@ export function DocumentUploadTab({
>
{renderBrowseButton({ fullWidth: true })}
</fieldset>
</button>
</div>
)}
</div>