mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
refactor: improve content extraction and encoding handling
- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
- Updated LLMRouterService to log premium model strings more clearly.
- Added automatic encoding detection for file reading in document processors.
- Improved error handling for empty markdown content extraction in file processors.
- Refactored DocumentUploadTab component for better accessibility and user interaction.
This commit is contained in:
parent
4a51ccdc2c
commit
2f793e7a69
5 changed files with 91 additions and 33 deletions
|
|
@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence(
|
||||||
f"after {len(attempt_errors)} failures"
|
f"after {len(attempt_errors)} failures"
|
||||||
)
|
)
|
||||||
|
|
||||||
if not result.content:
|
content = result.content or ""
|
||||||
return ""
|
if not content.strip():
|
||||||
|
raise RuntimeError(
|
||||||
|
"Azure Document Intelligence returned empty/whitespace-only content"
|
||||||
|
)
|
||||||
|
|
||||||
return result.content
|
return content
|
||||||
|
|
||||||
except ClientAuthenticationError:
|
except ClientAuthenticationError:
|
||||||
raise
|
raise
|
||||||
|
|
|
||||||
|
|
@ -186,8 +186,12 @@ class LLMRouterService:
|
||||||
if deployment:
|
if deployment:
|
||||||
model_list.append(deployment)
|
model_list.append(deployment)
|
||||||
if config.get("billing_tier") == "premium":
|
if config.get("billing_tier") == "premium":
|
||||||
model_string = deployment["litellm_params"]["model"]
|
params = deployment["litellm_params"]
|
||||||
|
model_string = params["model"]
|
||||||
premium_models.add(model_string)
|
premium_models.add(model_string)
|
||||||
|
base = params.get("base_model") or config.get("model_name", "")
|
||||||
|
if base and base != model_string:
|
||||||
|
premium_models.add(base)
|
||||||
|
|
||||||
if not model_list:
|
if not model_list:
|
||||||
logger.warning("No valid LLM configs found for router initialization")
|
logger.warning("No valid LLM configs found for router initialization")
|
||||||
|
|
@ -197,9 +201,9 @@ class LLMRouterService:
|
||||||
instance._premium_model_strings = premium_models
|
instance._premium_model_strings = premium_models
|
||||||
instance._router_settings = router_settings or {}
|
instance._router_settings = router_settings or {}
|
||||||
logger.info(
|
logger.info(
|
||||||
"Router pool: %d deployments (%d premium)",
|
"Router pool: %d deployments, premium model strings: %s",
|
||||||
len(model_list),
|
len(model_list),
|
||||||
len(premium_models),
|
sorted(premium_models),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Default router settings optimized for rate limit handling
|
# Default router settings optimized for rate limit handling
|
||||||
|
|
@ -258,9 +262,18 @@ class LLMRouterService:
|
||||||
def compute_premium_tokens(cls, calls: list) -> int:
|
def compute_premium_tokens(cls, calls: list) -> int:
|
||||||
"""Sum ``total_tokens`` for calls whose model is premium."""
|
"""Sum ``total_tokens`` for calls whose model is premium."""
|
||||||
instance = cls.get_instance()
|
instance = cls.get_instance()
|
||||||
return sum(
|
total = sum(
|
||||||
c.total_tokens for c in calls if c.model in instance._premium_model_strings
|
c.total_tokens for c in calls if c.model in instance._premium_model_strings
|
||||||
)
|
)
|
||||||
|
if calls:
|
||||||
|
call_models = [c.model for c in calls]
|
||||||
|
logger.info(
|
||||||
|
"[premium_tokens] call models=%s, premium_set=%s, result=%d",
|
||||||
|
call_models,
|
||||||
|
sorted(instance._premium_model_strings),
|
||||||
|
total,
|
||||||
|
)
|
||||||
|
return total
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _build_context_fallback_groups(
|
def _build_context_fallback_groups(
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,42 @@ from markdownify import markdownify
|
||||||
# at import time so every csv.reader call in this module can handle large fields.
|
# at import time so every csv.reader call in this module can handle large fields.
|
||||||
csv.field_size_limit(2**31 - 1)
|
csv.field_size_limit(2**31 - 1)
|
||||||
|
|
||||||
|
# Byte-order-mark signatures mapped to the codec used to decode the file.
#
# Order matters: the UTF-32-LE BOM starts with the same two bytes as the
# UTF-16-LE BOM, so the longer UTF-32 signatures must be tried first.
#
# The BOM-aware codecs ("utf-32", "utf-16", "utf-8-sig") are used on purpose:
# they consume the BOM while decoding.  The endian-specific codecs
# ("utf-16-le", "utf-32-be", ...) would leave it in the decoded text as a
# leading U+FEFF character, which then leaks into the first CSV cell or the
# first line of converted HTML.
_BOM_ENCODINGS: list[tuple[bytes, str]] = [
    (b"\xff\xfe\x00\x00", "utf-32"),
    (b"\x00\x00\xfe\xff", "utf-32"),
    (b"\xff\xfe", "utf-16"),
    (b"\xfe\xff", "utf-16"),
    (b"\xef\xbb\xbf", "utf-8-sig"),
]


def _detect_encoding(file_path: str) -> str:
    """Sniff the BOM to pick an encoding, falling back to utf-8.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of a BOM-aware codec when the file starts with a known
        byte-order mark, otherwise ``"utf-8"``.
    """
    # The longest signature is four bytes, so read only that much instead of
    # loading the whole file just to look at its head.
    with open(file_path, "rb") as fh:
        head = fh.read(4)
    for bom, encoding in _BOM_ENCODINGS:
        if head.startswith(bom):
            return encoding
    return "utf-8"


def _read_text(file_path: str) -> str:
    """Read a file with automatic encoding detection.

    Tries BOM-based detection first, then utf-8, then latin-1 as a
    last resort (latin-1 accepts every byte value).

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents, without any leading byte-order mark
        when one was detected.
    """
    detected = _detect_encoding(file_path)
    # dict.fromkeys de-duplicates while keeping order, so utf-8 is not
    # attempted twice when it is also the detected encoding.
    for candidate in dict.fromkeys((detected, "utf-8")):
        try:
            return Path(file_path).read_text(encoding=candidate)
        except UnicodeError:
            # UnicodeDecodeError is a subclass of UnicodeError; move on to
            # the next candidate encoding.
            continue
    return Path(file_path).read_text(encoding="latin-1")
|
||||||
|
|
||||||
|
|
||||||
def _escape_pipe(cell: str) -> str:
|
def _escape_pipe(cell: str) -> str:
|
||||||
"""Escape literal pipe characters inside a markdown table cell."""
|
"""Escape literal pipe characters inside a markdown table cell."""
|
||||||
|
|
@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
|
||||||
The first row is treated as the header. An empty file returns an
|
The first row is treated as the header. An empty file returns an
|
||||||
empty string so the caller can decide how to handle it.
|
empty string so the caller can decide how to handle it.
|
||||||
"""
|
"""
|
||||||
with open(file_path, encoding="utf-8", newline="") as fh:
|
text = _read_text(file_path)
|
||||||
reader = csv.reader(fh, delimiter=delimiter)
|
reader = csv.reader(text.splitlines(), delimiter=delimiter)
|
||||||
rows = list(reader)
|
rows = list(reader)
|
||||||
|
|
||||||
if not rows:
|
if not rows:
|
||||||
return ""
|
return ""
|
||||||
|
|
@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
|
||||||
|
|
||||||
def html_to_markdown(file_path: str) -> str:
    """Read the HTML file at ``file_path`` and return it as markdown.

    The file is loaded through ``_read_text`` and converted with
    ``markdownify``; surrounding whitespace is stripped from the result.
    """
    raw_html = _read_text(file_path)
    converted = markdownify(raw_html)
    return converted.strip()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -436,7 +436,7 @@ async def _extract_file_content(
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
os.unlink(file_path)
|
os.unlink(file_path)
|
||||||
|
|
||||||
if not result.markdown_content:
|
if not result.markdown_content or not result.markdown_content.strip():
|
||||||
raise RuntimeError(f"Failed to extract content from file: {filename}")
|
raise RuntimeError(f"Failed to extract content from file: {filename}")
|
||||||
|
|
||||||
return result.markdown_content, result.etl_service, billable_pages
|
return result.markdown_content, result.etl_service, billable_pages
|
||||||
|
|
|
||||||
|
|
@ -546,29 +546,35 @@ export function DocumentUploadTab({
|
||||||
</button>
|
</button>
|
||||||
)
|
)
|
||||||
) : (
|
) : (
|
||||||
<button
|
<div
|
||||||
type="button"
|
role="button"
|
||||||
tabIndex={0}
|
tabIndex={0}
|
||||||
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
|
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
|
||||||
onClick={() => {
|
onClick={() => {
|
||||||
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
|
}}
|
||||||
|
onKeyDown={(e) => {
|
||||||
|
if (e.key === "Enter" || e.key === " ") {
|
||||||
|
e.preventDefault();
|
||||||
if (!isElectron) fileInputRef.current?.click();
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
}}
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<Upload className="h-10 w-10 text-muted-foreground" />
|
||||||
|
<div className="text-center space-y-1.5">
|
||||||
|
<p className="text-base font-medium">
|
||||||
|
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
|
||||||
|
</p>
|
||||||
|
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
||||||
|
</div>
|
||||||
|
<fieldset
|
||||||
|
className="w-full mt-1 border-none p-0 m-0"
|
||||||
|
onClick={(e) => e.stopPropagation()}
|
||||||
|
onKeyDown={(e) => e.stopPropagation()}
|
||||||
>
|
>
|
||||||
<Upload className="h-10 w-10 text-muted-foreground" />
|
{renderBrowseButton({ fullWidth: true })}
|
||||||
<div className="text-center space-y-1.5">
|
</fieldset>
|
||||||
<p className="text-base font-medium">
|
</div>
|
||||||
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
|
|
||||||
</p>
|
|
||||||
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
|
||||||
</div>
|
|
||||||
<fieldset
|
|
||||||
className="w-full mt-1 border-none p-0 m-0"
|
|
||||||
onClick={(e) => e.stopPropagation()}
|
|
||||||
onKeyDown={(e) => e.stopPropagation()}
|
|
||||||
>
|
|
||||||
{renderBrowseButton({ fullWidth: true })}
|
|
||||||
</fieldset>
|
|
||||||
</button>
|
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue