mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
Making async
This commit is contained in:
parent
b03365cded
commit
e49c455c01
1 changed file with 6 additions and 8 deletions
|
|
@@ -5,7 +5,7 @@ URL crawler document processor.
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import validators
|
import validators
|
||||||
from firecrawl import FirecrawlApp
|
from firecrawl import AsyncFirecrawlApp
|
||||||
from langchain_community.document_loaders import AsyncChromiumLoader
|
from langchain_community.document_loaders import AsyncChromiumLoader
|
||||||
from langchain_core.documents import Document as LangchainDocument
|
from langchain_core.documents import Document as LangchainDocument
|
||||||
from sqlalchemy.exc import SQLAlchemyError
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
|
@@ -76,7 +76,7 @@ async def add_crawled_url_document(
|
||||||
|
|
||||||
if use_firecrawl:
|
if use_firecrawl:
|
||||||
# Use Firecrawl SDK directly
|
# Use Firecrawl SDK directly
|
||||||
firecrawl_app = FirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
|
firecrawl_app = AsyncFirecrawlApp(api_key=config.FIRECRAWL_API_KEY)
|
||||||
else:
|
else:
|
||||||
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
|
crawl_loader = AsyncChromiumLoader(urls=[url], headless=True)
|
||||||
|
|
||||||
|
|
@@ -84,25 +84,23 @@ async def add_crawled_url_document(
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
log_entry,
|
log_entry,
|
||||||
f"Crawling URL content: {url}",
|
f"Crawling URL content: {url}",
|
||||||
{"stage": "crawling", "crawler_type": "FirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
|
{"stage": "crawling", "crawler_type": "AsyncFirecrawlApp" if use_firecrawl else "AsyncChromiumLoader"},
|
||||||
)
|
)
|
||||||
|
|
||||||
if use_firecrawl:
|
if use_firecrawl:
|
||||||
# Use Firecrawl SDK directly with v1 API
|
# Use async Firecrawl SDK with v1 API - properly awaited
|
||||||
scrape_result = firecrawl_app.scrape_url(
|
scrape_result = await firecrawl_app.scrape_url(
|
||||||
url=url,
|
url=url,
|
||||||
formats=['markdown']
|
formats=['markdown']
|
||||||
)
|
)
|
||||||
|
|
||||||
print(scrape_result)
|
|
||||||
|
|
||||||
# scrape_result is a Pydantic ScrapeResponse object
|
# scrape_result is a Pydantic ScrapeResponse object
|
||||||
# Access attributes directly
|
# Access attributes directly
|
||||||
if scrape_result and scrape_result.success:
|
if scrape_result and scrape_result.success:
|
||||||
# Extract markdown content
|
# Extract markdown content
|
||||||
markdown_content = scrape_result.markdown or ''
|
markdown_content = scrape_result.markdown or ''
|
||||||
|
|
||||||
# Extract metadata - this is a DICT, not a Pydantic object
|
# Extract metadata - this is a DICT
|
||||||
metadata = scrape_result.metadata if scrape_result.metadata else {}
|
metadata = scrape_result.metadata if scrape_result.metadata else {}
|
||||||
|
|
||||||
# Convert to LangChain Document format
|
# Convert to LangChain Document format
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue