mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00
chore: cleanup
This commit is contained in:
parent
33ab74f698
commit
48fc70a08b
22 changed files with 8 additions and 1540 deletions
|
|
@ -222,88 +222,6 @@ async def convert_document_to_markdown(elements):
|
|||
return "".join(markdown_parts)
|
||||
|
||||
|
||||
def convert_chunks_to_langchain_documents(chunks):
|
||||
"""
|
||||
Convert chunks from hybrid search results to LangChain Document objects.
|
||||
|
||||
Args:
|
||||
chunks: List of chunk dictionaries from hybrid search results
|
||||
|
||||
Returns:
|
||||
List of LangChain Document objects
|
||||
"""
|
||||
try:
|
||||
from langchain_core.documents import Document as LangChainDocument
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"LangChain is not installed. Please install it with `pip install langchain langchain-core`"
|
||||
) from None
|
||||
|
||||
langchain_docs = []
|
||||
|
||||
for chunk in chunks:
|
||||
# Extract content from the chunk
|
||||
content = chunk.get("content", "")
|
||||
|
||||
# Create metadata dictionary
|
||||
metadata = {
|
||||
"chunk_id": chunk.get("chunk_id"),
|
||||
"score": chunk.get("score"),
|
||||
"rank": chunk.get("rank") if "rank" in chunk else None,
|
||||
}
|
||||
|
||||
# Add document information to metadata
|
||||
if "document" in chunk:
|
||||
doc = chunk["document"]
|
||||
metadata.update(
|
||||
{
|
||||
"document_id": doc.get("id"),
|
||||
"document_title": doc.get("title"),
|
||||
"document_type": doc.get("document_type"),
|
||||
}
|
||||
)
|
||||
|
||||
# Add document metadata if available
|
||||
if "metadata" in doc:
|
||||
# Prefix document metadata keys to avoid conflicts
|
||||
doc_metadata = {
|
||||
f"doc_meta_{k}": v for k, v in doc.get("metadata", {}).items()
|
||||
}
|
||||
metadata.update(doc_metadata)
|
||||
|
||||
# Add source URL if available in metadata
|
||||
if "url" in doc.get("metadata", {}):
|
||||
metadata["source"] = doc["metadata"]["url"]
|
||||
elif "sourceURL" in doc.get("metadata", {}):
|
||||
metadata["source"] = doc["metadata"]["sourceURL"]
|
||||
|
||||
# Ensure source_id is set for citation purposes
|
||||
# Use document_id as the source_id if available
|
||||
if "document_id" in metadata:
|
||||
metadata["source_id"] = metadata["document_id"]
|
||||
|
||||
# Update content for citation mode - format as XML with explicit source_id
|
||||
new_content = f"""
|
||||
<document>
|
||||
<metadata>
|
||||
<source_id>{metadata.get("source_id", metadata.get("document_id", "unknown"))}</source_id>
|
||||
</metadata>
|
||||
<content>
|
||||
<text>
|
||||
{content}
|
||||
</text>
|
||||
</content>
|
||||
</document>
|
||||
"""
|
||||
|
||||
# Create LangChain Document
|
||||
langchain_doc = LangChainDocument(page_content=new_content, metadata=metadata)
|
||||
|
||||
langchain_docs.append(langchain_doc)
|
||||
|
||||
return langchain_docs
|
||||
|
||||
|
||||
def generate_content_hash(content: str, search_space_id: int) -> str:
|
||||
"""Generate SHA-256 hash for the given content combined with search space ID."""
|
||||
combined_data = f"{search_space_id}:{content}"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue