mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
fix: guard missing text_as_html in Table element markdown conversion
When the Unstructured API returns a Table element without text_as_html in its metadata (e.g. local install or free-tier API), the lambda raised KeyError: 'text_as_html', crashing the entire document indexing pipeline for any file containing tables. Guard the key access with .get() and, when the value is missing or empty, fall back to the plain extracted text content (x) so the pipeline continues and the table content is still indexed, just without HTML formatting.
This commit is contained in:
parent
ee241e0ff2
commit
e16e4e2c5c
1 changed file with 5 additions and 1 deletion
|
|
@@ -221,7 +221,11 @@ async def convert_element_to_markdown(element) -> str:
|
||||||
"EmailAddress": lambda x: f"`{x}`",
|
"EmailAddress": lambda x: f"`{x}`",
|
||||||
"Image": lambda x: f"",
|
"Image": lambda x: f"",
|
||||||
"PageBreak": lambda x: "\n---\n",
|
"PageBreak": lambda x: "\n---\n",
|
||||||
"Table": lambda x: f"```html\n{element.metadata['text_as_html']}\n```",
|
"Table": lambda x: (
|
||||||
|
f"```html\n{element.metadata['text_as_html']}\n```"
|
||||||
|
if element.metadata.get("text_as_html")
|
||||||
|
else x
|
||||||
|
),
|
||||||
"Header": lambda x: f"## {x}\n\n",
|
"Header": lambda x: f"## {x}\n\n",
|
||||||
"Footer": lambda x: f"*{x}*\n\n",
|
"Footer": lambda x: f"*{x}*\n\n",
|
||||||
"CodeSnippet": lambda x: f"```\n{x}\n```",
|
"CodeSnippet": lambda x: f"```\n{x}\n```",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue