mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-30 21:59:46 +02:00
feat(proxy): integrate Scrapling for enhanced web scraping capabilities
- Replaced Playwright with Scrapling's fetchers in the web crawling and YouTube processing modules for improved performance and flexibility. - Updated proxy configuration to support dynamic proxy selection via environment variables. - Enhanced logging to track performance metrics during web scraping operations. - Refactored related modules to utilize the new proxy utilities and streamline the scraping process.
This commit is contained in:
parent
41a93ca8fb
commit
640ef5f15d
16 changed files with 5770 additions and 4886 deletions
40
surfsense_backend/app/utils/proxy/__init__.py
Normal file
40
surfsense_backend/app/utils/proxy/__init__.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""Modular residential / rotating proxy provider package.
|
||||
|
||||
Selects a provider via the ``PROXY_PROVIDER`` env var (see ``registry.py``) and
|
||||
exposes proxy settings in the formats different HTTP libraries expect. Add new
|
||||
vendors by implementing :class:`ProxyProvider` in ``providers/`` and registering
|
||||
them in ``registry.py``.
|
||||
"""
|
||||
|
||||
from app.utils.proxy.base import ProxyProvider
|
||||
from app.utils.proxy.registry import get_active_provider
|
||||
|
||||
|
||||
def get_proxy_url() -> str | None:
    """Return the active provider's proxy URL for Scrapling/curl_cffi.

    The value is delegated to the provider returned by
    ``get_active_provider()`` and is expected in the canonical
    ``http://user:pass@host:port`` form.

    Returns:
        The proxy URL string, or ``None`` (presumably when no proxy is
        configured -- confirm against the provider implementations).
    """
    provider = get_active_provider()
    return provider.get_proxy_url()
|
||||
|
||||
|
||||
def get_playwright_proxy() -> dict[str, str] | None:
    """Return the active provider's proxy settings as a Playwright-style dict.

    The lookup is delegated to the provider returned by
    ``get_active_provider()``.

    Returns:
        A ``dict[str, str]`` in the shape Playwright expects, or ``None``
        when no proxy is configured.
    """
    provider = get_active_provider()
    return provider.get_playwright_proxy()
|
||||
|
||||
|
||||
def get_requests_proxies() -> dict[str, str] | None:
    """Return the active provider's proxy settings for requests/aiohttp.

    The lookup is delegated to the provider returned by
    ``get_active_provider()``.

    Returns:
        A ``{"http": ..., "https": ...}`` mapping, or ``None``.
    """
    provider = get_active_provider()
    return provider.get_requests_proxies()
|
||||
|
||||
|
||||
def get_residential_proxy_url() -> str | None:
    """Return the proxy URL; backward-compatible alias for :func:`get_proxy_url`.

    Kept so callers using the older name keep working; the result is exactly
    what :func:`get_proxy_url` returns.
    """
    proxy_url = get_proxy_url()
    return proxy_url
|
||||
|
||||
|
||||
# Public API of the proxy package: names re-exported from the ``base`` and
# ``registry`` submodules first, then the helper functions defined here.
__all__ = [
    # Re-exported from submodules.
    "ProxyProvider",
    "get_active_provider",
    # Convenience accessors defined in this module.
    "get_playwright_proxy",
    "get_proxy_url",
    "get_requests_proxies",
    "get_residential_proxy_url",
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue