mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 09:46:25 +02:00
feat: add chunking and embedding logic to indexer
This commit is contained in:
parent
fff851ae3f
commit
2e83ed8dcd
1 changed file with 22 additions and 0 deletions
|
|
@@ -8,6 +8,9 @@ import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from app.config import config
|
||||||
|
from app.db import SurfsenseDocsChunk
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Path to docs relative to project root
|
# Path to docs relative to project root
|
||||||
|
|
@@ -62,3 +65,22 @@ def generate_surfsense_docs_content_hash(content: str) -> str:
|
||||||
"""Generate SHA-256 hash for Surfsense docs content."""
|
"""Generate SHA-256 hash for Surfsense docs content."""
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def create_surfsense_docs_chunks(content: str) -> list[SurfsenseDocsChunk]:
    """
    Split Surfsense documentation text into embedded chunk records.

    The text is handed to ``config.chunker_instance.chunk``; the ``text`` of
    every piece it yields is embedded via
    ``config.embedding_model_instance.embed`` and wrapped, together with that
    embedding, in a ``SurfsenseDocsChunk``.

    Args:
        content: Document content to chunk

    Returns:
        List of SurfsenseDocsChunk objects with embeddings, in the order the
        chunker produced the pieces
    """
    docs_chunks: list[SurfsenseDocsChunk] = []
    for piece in config.chunker_instance.chunk(content):
        # One embed call per piece; the embedding model is looked up on
        # ``config`` each iteration, only when there is a piece to embed.
        docs_chunks.append(
            SurfsenseDocsChunk(
                content=piece.text,
                embedding=config.embedding_model_instance.embed(piece.text),
            )
        )
    return docs_chunks
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue