refactor: improve content extraction and encoding handling

- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
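- Updated LLMRouterService to also register each premium deployment's base model name as premium, and to log the premium model strings explicitly instead of only their count.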
- Added automatic encoding detection for file reading in document processors.
- Improved error handling for empty markdown content extraction in file processors.
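- Refactored the DocumentUploadTab drop zone from a `<button>` into a keyboard-operable `<div role="button">` (Enter/Space open the file picker) for better accessibility and user interaction.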
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-16 00:25:46 -07:00
parent 4a51ccdc2c
commit 2f793e7a69
5 changed files with 91 additions and 33 deletions

View file

@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence(
f"after {len(attempt_errors)} failures" f"after {len(attempt_errors)} failures"
) )
if not result.content: content = result.content or ""
return "" if not content.strip():
raise RuntimeError(
"Azure Document Intelligence returned empty/whitespace-only content"
)
return result.content return content
except ClientAuthenticationError: except ClientAuthenticationError:
raise raise

View file

@ -186,8 +186,12 @@ class LLMRouterService:
if deployment: if deployment:
model_list.append(deployment) model_list.append(deployment)
if config.get("billing_tier") == "premium": if config.get("billing_tier") == "premium":
model_string = deployment["litellm_params"]["model"] params = deployment["litellm_params"]
model_string = params["model"]
premium_models.add(model_string) premium_models.add(model_string)
base = params.get("base_model") or config.get("model_name", "")
if base and base != model_string:
premium_models.add(base)
if not model_list: if not model_list:
logger.warning("No valid LLM configs found for router initialization") logger.warning("No valid LLM configs found for router initialization")
@ -197,9 +201,9 @@ class LLMRouterService:
instance._premium_model_strings = premium_models instance._premium_model_strings = premium_models
instance._router_settings = router_settings or {} instance._router_settings = router_settings or {}
logger.info( logger.info(
"Router pool: %d deployments (%d premium)", "Router pool: %d deployments, premium model strings: %s",
len(model_list), len(model_list),
len(premium_models), sorted(premium_models),
) )
# Default router settings optimized for rate limit handling # Default router settings optimized for rate limit handling
@ -258,9 +262,18 @@ class LLMRouterService:
def compute_premium_tokens(cls, calls: list) -> int: def compute_premium_tokens(cls, calls: list) -> int:
"""Sum ``total_tokens`` for calls whose model is premium.""" """Sum ``total_tokens`` for calls whose model is premium."""
instance = cls.get_instance() instance = cls.get_instance()
return sum( total = sum(
c.total_tokens for c in calls if c.model in instance._premium_model_strings c.total_tokens for c in calls if c.model in instance._premium_model_strings
) )
if calls:
call_models = [c.model for c in calls]
logger.info(
"[premium_tokens] call models=%s, premium_set=%s, result=%d",
call_models,
sorted(instance._premium_model_strings),
total,
)
return total
@classmethod @classmethod
def _build_context_fallback_groups( def _build_context_fallback_groups(

View file

@ -21,6 +21,42 @@ from markdownify import markdownify
# at import time so every csv.reader call in this module can handle large fields. # at import time so every csv.reader call in this module can handle large fields.
csv.field_size_limit(2**31 - 1) csv.field_size_limit(2**31 - 1)
_BOM_ENCODINGS: list[tuple[bytes, str]] = [
(b"\xff\xfe\x00\x00", "utf-32-le"),
(b"\x00\x00\xfe\xff", "utf-32-be"),
(b"\xff\xfe", "utf-16-le"),
(b"\xfe\xff", "utf-16-be"),
(b"\xef\xbb\xbf", "utf-8-sig"),
]
def _detect_encoding(file_path: str) -> str:
    """Sniff the BOM to pick an encoding, falling back to utf-8.

    Only the first four bytes of the file are read, which is enough to
    cover the longest BOM in ``_BOM_ENCODINGS``; the rest of the file is
    never loaded here.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding paired with the first matching BOM in
        ``_BOM_ENCODINGS``, or ``"utf-8"`` when no BOM matches
        (including for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as fh:
        head = fh.read(4)
    for bom, encoding in _BOM_ENCODINGS:
        if head.startswith(bom):
            return encoding
    return "utf-8"
def _read_text(file_path: str) -> str:
    """Return the text of *file_path*, choosing the encoding automatically.

    Decoding is attempted in this order, stopping at the first success:

    1. the encoding reported by ``_detect_encoding``;
    2. utf-8, unless that was already the detected encoding;
    3. latin-1, which accepts every byte value and therefore never
       fails to decode.
    """
    source = Path(file_path)
    detected = _detect_encoding(file_path)

    # utf-8 is only retried when it was not the first attempt.
    candidates = [detected] if detected == "utf-8" else [detected, "utf-8"]
    for candidate in candidates:
        try:
            return source.read_text(encoding=candidate)
        except UnicodeError:
            # UnicodeDecodeError is a subclass of UnicodeError; move on to
            # the next candidate encoding.
            continue

    return source.read_text(encoding="latin-1")
def _escape_pipe(cell: str) -> str: def _escape_pipe(cell: str) -> str:
"""Escape literal pipe characters inside a markdown table cell.""" """Escape literal pipe characters inside a markdown table cell."""
@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
The first row is treated as the header. An empty file returns an The first row is treated as the header. An empty file returns an
empty string so the caller can decide how to handle it. empty string so the caller can decide how to handle it.
""" """
with open(file_path, encoding="utf-8", newline="") as fh: text = _read_text(file_path)
reader = csv.reader(fh, delimiter=delimiter) reader = csv.reader(text.splitlines(), delimiter=delimiter)
rows = list(reader) rows = list(reader)
if not rows: if not rows:
return "" return ""
@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
def html_to_markdown(file_path: str) -> str: def html_to_markdown(file_path: str) -> str:
"""Convert an HTML file to markdown via ``markdownify``.""" """Convert an HTML file to markdown via ``markdownify``."""
html = Path(file_path).read_text(encoding="utf-8") html = _read_text(file_path)
return markdownify(html).strip() return markdownify(html).strip()

View file

@ -436,7 +436,7 @@ async def _extract_file_content(
with contextlib.suppress(Exception): with contextlib.suppress(Exception):
os.unlink(file_path) os.unlink(file_path)
if not result.markdown_content: if not result.markdown_content or not result.markdown_content.strip():
raise RuntimeError(f"Failed to extract content from file: {filename}") raise RuntimeError(f"Failed to extract content from file: {filename}")
return result.markdown_content, result.etl_service, billable_pages return result.markdown_content, result.etl_service, billable_pages

View file

@ -546,29 +546,35 @@ export function DocumentUploadTab({
</button> </button>
) )
) : ( ) : (
<button <div
type="button" role="button"
tabIndex={0} tabIndex={0}
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none" className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
onClick={() => { onClick={() => {
if (!isElectron) fileInputRef.current?.click();
}}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
if (!isElectron) fileInputRef.current?.click(); if (!isElectron) fileInputRef.current?.click();
}} }
}}
>
<Upload className="h-10 w-10 text-muted-foreground" />
<div className="text-center space-y-1.5">
<p className="text-base font-medium">
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
</p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div>
<fieldset
className="w-full mt-1 border-none p-0 m-0"
onClick={(e) => e.stopPropagation()}
onKeyDown={(e) => e.stopPropagation()}
> >
<Upload className="h-10 w-10 text-muted-foreground" /> {renderBrowseButton({ fullWidth: true })}
<div className="text-center space-y-1.5"> </fieldset>
<p className="text-base font-medium"> </div>
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
</p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div>
<fieldset
className="w-full mt-1 border-none p-0 m-0"
onClick={(e) => e.stopPropagation()}
onKeyDown={(e) => e.stopPropagation()}
>
{renderBrowseButton({ fullWidth: true })}
</fieldset>
</button>
)} )}
</div> </div>