Make Firecrawl URL crawling async: switch to AsyncFirecrawlApp, await scrape_url, and drop the debug print

This commit is contained in:
samkul-swe 2025-11-04 15:27:57 -08:00
parent b03365cded
commit e49c455c01

View file

@ -5,7 +5,7 @@ URL crawler document processor.
import logging import logging
import validators import validators
from firecrawl import FirecrawlApp from firecrawl import AsyncFirecrawlApp
from langchain_community.document_loaders import AsyncChromiumLoader from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document as LangchainDocument from langchain_core.documents import Document as LangchainDocument
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
@ -76,7 +76,7 @@ async def add_crawled_url_document(
if use_firecrawl: if use_firecrawl:
# Use Firecrawl SDK directly # Use Firecrawl SDK directly
firecrawl_app = FirecrawlApp(api_key=config.FIRECRAWL_API_KEY) firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
else: else:
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True) crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
@ -84,17 +84,15 @@ async def add_crawled_url_document(
await task_logger.log_task_progress( await task_logger.log_task_progress(
log_entry, log_entry,
f"Crawling URL content: {url}", f"Crawling URL content: {url}",
{"stage": "crawling", "crawler_type": "FirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"}, {"stage": "crawling", "crawler_type": "AsyncFirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
) )
if use_firecrawl: if use_firecrawl:
# Use Firecrawl SDK directly with v1 API # Use async Firecrawl SDK with v1 API - properly awaited
scrape_result = firecrawl_app.scrape_url( scrape_result = await firecrawl_app.scrape_url(
url=url, url=url,
formats=['markdown'] formats=['markdown']
) )
print(scrape_result)
# scrape_result is a Pydantic ScrapeResponse object # scrape_result is a Pydantic ScrapeResponse object
# Access attributes directly # Access attributes directly
@ -102,7 +100,7 @@ async def add_crawled_url_document(
# Extract markdown content # Extract markdown content
markdown_content = scrape_result.markdown or '' markdown_content = scrape_result.markdown or ''
# Extract metadata - this is a DICT, not a Pydantic object # Extract metadata - this is a DICT
metadata = scrape_result.metadata if scrape_result.metadata else {} metadata = scrape_result.metadata if scrape_result.metadata else {}
# Convert to LangChain Document format # Convert to LangChain Document format