mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-30 21:59:46 +02:00
feat(proxy): integrate Scrapling for enhanced web scraping capabilities
- Replaced Playwright with Scrapling's fetchers in the web crawling and YouTube processing modules for improved performance and flexibility.
- Updated proxy configuration to support dynamic proxy selection via environment variables.
- Enhanced logging to track performance metrics during web scraping operations.
- Refactored related modules to utilize the new proxy utilities and streamline the scraping process.
This commit is contained in:
parent
41a93ca8fb
commit
640ef5f15d
16 changed files with 5770 additions and 4886 deletions
40
surfsense_backend/app/utils/proxy/__init__.py
Normal file
40
surfsense_backend/app/utils/proxy/__init__.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""Modular residential / rotating proxy provider package.
|
||||
|
||||
Selects a provider via the ``PROXY_PROVIDER`` env var (see ``registry.py``) and
|
||||
exposes proxy settings in the formats different HTTP libraries expect. Add new
|
||||
vendors by implementing :class:`ProxyProvider` in ``providers/`` and registering
|
||||
them in ``registry.py``.
|
||||
"""
|
||||
|
||||
from app.utils.proxy.base import ProxyProvider
|
||||
from app.utils.proxy.registry import get_active_provider
|
||||
|
||||
|
||||
def get_proxy_url() -> str | None:
    """Return the active provider's proxy URL for Scrapling/curl_cffi.

    The value is whatever the active provider's ``get_proxy_url`` returns:
    the canonical ``http://user:pass@host:port`` string, or ``None``.
    """
    provider = get_active_provider()
    return provider.get_proxy_url()
|
||||
|
||||
|
||||
def get_playwright_proxy() -> dict[str, str] | None:
    """Return the active provider's Playwright-style proxy dict.

    Delegates to the active provider's ``get_playwright_proxy``; the result
    is ``None`` when no proxy is configured.
    """
    provider = get_active_provider()
    return provider.get_playwright_proxy()
|
||||
|
||||
|
||||
def get_requests_proxies() -> dict[str, str] | None:
    """Return the active provider's ``{"http": ..., "https": ...}`` mapping.

    Intended for requests/aiohttp. Delegates to the active provider's
    ``get_requests_proxies``; the result may be ``None``.
    """
    provider = get_active_provider()
    return provider.get_requests_proxies()
|
||||
|
||||
|
||||
def get_residential_proxy_url() -> str | None:
    """Return the same value as :func:`get_proxy_url`.

    Kept as a backward-compatible alias under the older name.
    """
    proxy_url = get_proxy_url()
    return proxy_url
|
||||
|
||||
|
||||
# Public API of the proxy package: the provider interface, the registry
# accessor, and the per-library convenience helpers defined in this module.
__all__ = [
    # Provider interface and registry accessor.
    "ProxyProvider",
    "get_active_provider",
    # Convenience helpers, one per proxy format.
    "get_playwright_proxy",
    "get_proxy_url",
    "get_requests_proxies",
    "get_residential_proxy_url",
]
|
||||
46
surfsense_backend/app/utils/proxy/base.py
Normal file
46
surfsense_backend/app/utils/proxy/base.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
"""Abstract base class for residential / rotating proxy providers.
|
||||
|
||||
Each provider reads its own credentials from the application Config and exposes
|
||||
proxy settings in the formats the different HTTP stacks expect:
|
||||
|
||||
* ``get_proxy_url`` -> canonical ``http://user:pass@host:port`` string consumed
|
||||
by Scrapling's fetchers (curl_cffi / patchright / camoufox).
|
||||
* ``get_requests_proxies`` -> ``{"http": ..., "https": ...}`` dict for
|
||||
``requests`` / ``aiohttp``.
|
||||
* ``get_playwright_proxy`` -> Playwright-style ``{"server", "username",
|
||||
"password"}`` dict.
|
||||
|
||||
Add a new vendor by subclassing :class:`ProxyProvider` in ``providers/`` and
|
||||
registering it in ``registry.py``.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class ProxyProvider(ABC):
|
||||
"""Interface every proxy provider must implement."""
|
||||
|
||||
#: Unique key used to select this provider via the ``PROXY_PROVIDER`` env var.
|
||||
name: str = "base"
|
||||
|
||||
@abstractmethod
|
||||
def get_proxy_url(self) -> str | None:
|
||||
"""Return ``http://user:pass@host:port`` (no trailing slash), or ``None``.
|
||||
|
||||
This is the canonical form Scrapling/curl_cffi consume directly.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_playwright_proxy(self) -> dict[str, str] | None:
|
||||
"""Return a Playwright proxy dict, or ``None`` when not configured."""
|
||||
|
||||
def get_requests_proxies(self) -> dict[str, str] | None:
|
||||
"""Return a ``requests``/``aiohttp`` proxies dict, or ``None``.
|
||||
|
||||
Built from :meth:`get_proxy_url` by default; override if a provider needs
|
||||
different http vs https endpoints.
|
||||
"""
|
||||
proxy_url = self.get_proxy_url()
|
||||
if proxy_url is None:
|
||||
return None
|
||||
return {"http": proxy_url, "https": proxy_url}
|
||||
1
surfsense_backend/app/utils/proxy/providers/__init__.py
Normal file
1
surfsense_backend/app/utils/proxy/providers/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Concrete proxy provider implementations."""
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
"""anonymous-proxies.net residential / rotating proxy provider.
|
||||
|
||||
The vendor (``rotating.dnsproxifier.com``) encodes the location and rotation
|
||||
``type`` options inside a base64-encoded JSON "password". The hostname already
|
||||
includes the port (e.g. ``rotating.dnsproxifier.com:31230``).
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
|
||||
from app.config import Config
|
||||
from app.utils.proxy.base import ProxyProvider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnonymousProxiesProvider(ProxyProvider):
    """Proxy provider backed by the ``RESIDENTIAL_PROXY_*`` config values.

    Targets anonymous-proxies.net: the proxy "password" sent to the vendor is
    a base64-encoded JSON object carrying the real password plus the location
    and rotation-type options.
    """

    name = "anonymous_proxies"

    def _password_b64(self) -> str | None:
        """Return the base64-encoded JSON password blob, or ``None``.

        ``None`` is returned when ``Config.RESIDENTIAL_PROXY_PASSWORD`` is
        unset or empty.
        """
        secret = Config.RESIDENTIAL_PROXY_PASSWORD
        if not secret:
            return None

        # Vendor payload keys: "p" = password, "l" = location, "t" = type.
        payload = json.dumps(
            {
                "p": secret,
                "l": Config.RESIDENTIAL_PROXY_LOCATION,
                "t": Config.RESIDENTIAL_PROXY_TYPE,
            }
        )
        return base64.b64encode(payload.encode("utf-8")).decode("utf-8")

    def _credentials(self) -> tuple[str, str, str] | None:
        """Return ``(username, hostname, password_b64)``, or ``None``.

        ``None`` is returned when any of the three pieces is missing or empty.
        """
        user = Config.RESIDENTIAL_PROXY_USERNAME
        host = Config.RESIDENTIAL_PROXY_HOSTNAME
        encoded = self._password_b64()
        if user and host and encoded:
            return user, host, encoded
        return None

    def get_proxy_url(self) -> str | None:
        """Return the ``http://user:pass@host`` proxy URL, or ``None``.

        The hostname is used verbatim, so any port must already be part of
        ``Config.RESIDENTIAL_PROXY_HOSTNAME``.
        """
        creds = self._credentials()
        if creds is None:
            return None

        user, host, encoded = creds
        # No trailing slash: curl_cffi (Scrapling static fetcher) expects a bare
        # ``http://user:pass@host:port`` URL.
        # NOTE(review): the username and base64 password are interpolated
        # verbatim, not percent-encoded; base64 output can contain "+", "/"
        # and "=" - confirm every consumer parses such userinfo correctly.
        return f"http://{user}:{encoded}@{host}"

    def get_playwright_proxy(self) -> dict[str, str] | None:
        """Return a ``{"server", "username", "password"}`` dict, or ``None``."""
        creds = self._credentials()
        if creds is None:
            return None

        user, host, encoded = creds
        return {
            "server": f"http://{host}",
            "username": user,
            "password": encoded,
        }
|
||||
44
surfsense_backend/app/utils/proxy/registry.py
Normal file
44
surfsense_backend/app/utils/proxy/registry.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
"""Proxy provider registry.
|
||||
|
||||
Maps the ``PROXY_PROVIDER`` config value to a :class:`ProxyProvider`
|
||||
implementation. To add a new vendor: implement a provider in ``providers/`` and
|
||||
add a single entry to ``_PROVIDERS`` below - no caller changes required.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from app.config import Config
|
||||
from app.utils.proxy.base import ProxyProvider
|
||||
from app.utils.proxy.providers.anonymous_proxies import AnonymousProxiesProvider
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Registered proxy providers, keyed by their ``name``.
|
||||
# Provider classes available for selection, keyed by each class's ``name``.
_PROVIDERS: dict[str, type[ProxyProvider]] = {
    provider_cls.name: provider_cls for provider_cls in (AnonymousProxiesProvider,)
}

# Key used when ``PROXY_PROVIDER`` is unset or names an unknown provider.
_DEFAULT_PROVIDER = AnonymousProxiesProvider.name

# Lazily created provider instance; filled in by ``get_active_provider``.
_active_provider: ProxyProvider | None = None
|
||||
|
||||
|
||||
def get_active_provider() -> ProxyProvider:
    """Return the configured proxy provider instance (cached for the process).

    The provider is chosen from ``_PROVIDERS`` using ``Config.PROXY_PROVIDER``.
    An unset, empty, or whitespace-only value selects the default provider
    silently; an unrecognised value logs a warning and also falls back to the
    default. The instance is created on the first call and reused afterwards.

    Returns:
        The process-wide :class:`ProxyProvider` instance.
    """
    global _active_provider
    if _active_provider is not None:
        return _active_provider

    # Strip *before* applying the default so a whitespace-only setting is
    # treated like an unset one instead of triggering an "Unknown ''" warning.
    configured = (Config.PROXY_PROVIDER or "").strip()
    key = configured or _DEFAULT_PROVIDER

    provider_cls = _PROVIDERS.get(key)
    if provider_cls is None:
        logger.warning(
            "Unknown PROXY_PROVIDER '%s'; falling back to '%s'. Available: %s",
            key,
            _DEFAULT_PROVIDER,
            ", ".join(sorted(_PROVIDERS)),
        )
        provider_cls = _PROVIDERS[_DEFAULT_PROVIDER]

    _active_provider = provider_cls()
    return _active_provider
|
||||
Loading…
Add table
Add a link
Reference in a new issue