Fixed all ruff lint and formatting errors

This commit is contained in:
Utkarsh-Patel-13 2025-07-24 14:43:48 -07:00
parent 0a03c42cc5
commit d359a59f6d
85 changed files with 5520 additions and 3870 deletions

View file

@ -2,34 +2,41 @@ class ChucksHybridSearchRetriever:
def __init__(self, db_session):
    """
    Initialize the hybrid search retriever with a database session.

    Args:
        db_session: SQLAlchemy AsyncSession from FastAPI dependency injection
    """
    self.db_session = db_session
async def vector_search(
    self,
    query_text: str,
    top_k: int,
    user_id: str,
    search_space_id: int | None = None,
) -> list:
    """
    Perform vector similarity search on chunks.

    Args:
        query_text: The search query text
        top_k: Number of results to return
        user_id: The ID of the user performing the search
        search_space_id: Optional search space ID to filter results

    Returns:
        List of chunks sorted by vector similarity (closest first)
    """
    from sqlalchemy import select
    from sqlalchemy.orm import joinedload

    from app.config import config
    from app.db import Chunk, Document, SearchSpace

    # Embed the query with the app-configured embedding model
    embedding_model = config.embedding_model_instance
    query_embedding = embedding_model.embed(query_text)

    # Base query with user-ownership check (chunk -> document -> search space)
    # NOTE(review): the .options()/.join(Document, ...) lines were omitted at a
    # diff-hunk boundary; reconstructed from the parallel full_text_search
    # query — confirm against the original file.
    query = (
        select(Chunk)
        .options(joinedload(Chunk.document))
        .join(Document, Chunk.document_id == Document.id)
        .join(SearchSpace, Document.search_space_id == SearchSpace.id)
        .where(SearchSpace.user_id == user_id)
    )

    # Narrow to a single search space if requested
    if search_space_id is not None:
        query = query.where(Document.search_space_id == search_space_id)

    # Order by pgvector cosine distance ("<=>"), smallest distance first
    query = query.order_by(Chunk.embedding.op("<=>")(query_embedding)).limit(top_k)

    result = await self.db_session.execute(query)
    return result.scalars().all()
async def full_text_search(
    self,
    query_text: str,
    top_k: int,
    user_id: str,
    search_space_id: int | None = None,
) -> list:
    """
    Perform full-text keyword search on chunks.

    Args:
        query_text: The search query text
        top_k: Number of results to return
        user_id: The ID of the user performing the search
        search_space_id: Optional search space ID to filter results

    Returns:
        List of chunks sorted by text relevance (best match first)
    """
    from sqlalchemy import func, select
    from sqlalchemy.orm import joinedload

    from app.db import Chunk, Document, SearchSpace

    # PostgreSQL full-text search constructs
    tsvector = func.to_tsvector("english", Chunk.content)
    tsquery = func.plainto_tsquery("english", query_text)

    # Base query with user-ownership check; only rows matching the tsquery
    # NOTE(review): the .options(joinedload(...)) line was omitted at a
    # diff-hunk boundary; reconstructed — confirm against the original file.
    query = (
        select(Chunk)
        .options(joinedload(Chunk.document))
        .join(Document, Chunk.document_id == Document.id)
        .join(SearchSpace, Document.search_space_id == SearchSpace.id)
        .where(SearchSpace.user_id == user_id)
        .where(
            tsvector.op("@@")(tsquery)
        )  # Only include results that match the query
    )

    # Narrow to a single search space if requested
    if search_space_id is not None:
        query = query.where(Document.search_space_id == search_space_id)

    # Rank by cover-density relevance, best first
    query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k)

    result = await self.db_session.execute(query)
    return result.scalars().all()
async def hybrid_search(
    self,
    query_text: str,
    top_k: int,
    user_id: str,
    search_space_id: int | None = None,
    document_type: str | None = None,
) -> list:
    """
    Combine vector similarity and full-text search results using
    Reciprocal Rank Fusion (RRF).

    Args:
        query_text: The search query text
        top_k: Number of results to return
        user_id: The ID of the user performing the search
        search_space_id: Optional search space ID to filter results
        document_type: Optional document type to filter results
            (e.g., "FILE", "CRAWLED_URL")

    Returns:
        List of dictionaries containing chunk data and relevance scores
    """
    from sqlalchemy import func, select, text
    from sqlalchemy.orm import joinedload

    from app.config import config
    from app.db import Chunk, Document, DocumentType, SearchSpace

    # Embed the query with the app-configured embedding model
    embedding_model = config.embedding_model_instance
    query_embedding = embedding_model.embed(query_text)

    k = 60  # RRF smoothing constant
    n_results = top_k * 2  # Over-fetch candidates for better fusion

    # PostgreSQL full-text search constructs
    tsvector = func.to_tsvector("english", Chunk.content)
    tsquery = func.plainto_tsquery("english", query_text)

    # Shared filtering conditions (user ownership always enforced)
    base_conditions = [SearchSpace.user_id == user_id]

    if search_space_id is not None:
        base_conditions.append(Document.search_space_id == search_space_id)

    if document_type is not None:
        # Convert string to enum value if needed.
        # NOTE(review): the conversion below was omitted at a diff-hunk
        # boundary and is reconstructed from the surviving `return []` /
        # `else:` lines — confirm against the original file.
        if isinstance(document_type, str):
            try:
                doc_type_enum = DocumentType(document_type)
            except ValueError:
                # Unknown type: no documents can match, so short-circuit
                return []
            base_conditions.append(Document.document_type == doc_type_enum)
        else:
            base_conditions.append(Document.document_type == document_type)

    # CTE ranking chunks by vector distance, restricted to owned documents
    semantic_search_cte = (
        select(
            Chunk.id,
            func.rank()
            .over(order_by=Chunk.embedding.op("<=>")(query_embedding))
            .label("rank"),
        )
        .join(Document, Chunk.document_id == Document.id)
        .join(SearchSpace, Document.search_space_id == SearchSpace.id)
        .where(*base_conditions)
    )
    semantic_search_cte = (
        semantic_search_cte.order_by(Chunk.embedding.op("<=>")(query_embedding))
        .limit(n_results)
        .cte("semantic_search")
    )

    # CTE ranking chunks by full-text relevance, same ownership checks
    keyword_search_cte = (
        select(
            Chunk.id,
            func.rank()
            .over(order_by=func.ts_rank_cd(tsvector, tsquery).desc())
            .label("rank"),
        )
        .join(Document, Chunk.document_id == Document.id)
        .join(SearchSpace, Document.search_space_id == SearchSpace.id)
        .where(*base_conditions)
        .where(tsvector.op("@@")(tsquery))
    )
    keyword_search_cte = (
        keyword_search_cte.order_by(func.ts_rank_cd(tsvector, tsquery).desc())
        .limit(n_results)
        .cte("keyword_search")
    )

    # FULL OUTER JOIN the two rankings; RRF score = sum of 1/(k + rank),
    # with COALESCE(..., 0.0) when a chunk appears in only one ranking
    final_query = (
        select(
            Chunk,
            (
                func.coalesce(1.0 / (k + semantic_search_cte.c.rank), 0.0)
                + func.coalesce(1.0 / (k + keyword_search_cte.c.rank), 0.0)
            ).label("score"),
        )
        .select_from(
            semantic_search_cte.outerjoin(
                keyword_search_cte,
                semantic_search_cte.c.id == keyword_search_cte.c.id,
                full=True,
            )
        )
        .join(
            Chunk,
            Chunk.id
            == func.coalesce(semantic_search_cte.c.id, keyword_search_cte.c.id),
        )
        .options(joinedload(Chunk.document))
        .order_by(text("score DESC"))
        .limit(top_k)
    )

    result = await self.db_session.execute(final_query)
    chunks_with_scores = result.all()

    if not chunks_with_scores:
        return []

    # Serialize to plain dictionaries for the caller
    serialized_results = []
    for chunk, score in chunks_with_scores:
        serialized_results.append(
            {
                "chunk_id": chunk.id,
                "content": chunk.content,
                "score": float(score),  # Ensure score is a Python float
                "document": {
                    "id": chunk.document.id,
                    "title": chunk.document.title,
                    "document_type": chunk.document.document_type.value
                    if hasattr(chunk.document, "document_type")
                    else None,
                    "metadata": chunk.document.document_metadata,
                },
            }
        )
    return serialized_results