feat(proxy): integrate Scrapling for enhanced web scraping capabilities

- Replaced Playwright with Scrapling's fetchers in the web crawling and YouTube processing modules for improved performance and flexibility.
- Updated proxy configuration to support dynamic proxy selection via environment variables.
- Enhanced logging to track performance metrics during web scraping operations.
- Refactored related modules to utilize the new proxy utilities and streamline the scraping process.
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-06-09 00:15:10 -07:00
parent 41a93ca8fb
commit 640ef5f15d
16 changed files with 5770 additions and 4886 deletions

View file

@ -0,0 +1,40 @@
"""Modular residential / rotating proxy provider package.
Selects a provider via the ``PROXY_PROVIDER`` env var (see ``registry.py``) and
exposes proxy settings in the formats different HTTP libraries expect. Add new
vendors by implementing :class:`ProxyProvider` in ``providers/`` and registering
them in ``registry.py``.
"""
from app.utils.proxy.base import ProxyProvider
from app.utils.proxy.registry import get_active_provider
def get_proxy_url() -> str | None:
    """Return the active provider's proxy URL for Scrapling/curl_cffi.

    The value has the canonical ``http://user:pass@host:port`` shape, or is
    ``None`` when the provider reports no proxy.
    """
    provider = get_active_provider()
    return provider.get_proxy_url()
def get_playwright_proxy() -> dict[str, str] | None:
    """Return the active provider's Playwright-style proxy dict.

    ``None`` is returned when the provider has no proxy configured.
    """
    provider = get_active_provider()
    return provider.get_playwright_proxy()
def get_requests_proxies() -> dict[str, str] | None:
    """Return the active provider's ``{"http": ..., "https": ...}`` mapping.

    Intended for requests/aiohttp; ``None`` when no proxy is configured.
    """
    provider = get_active_provider()
    return provider.get_requests_proxies()
def get_residential_proxy_url() -> str | None:
    """Return the proxy URL under its legacy name.

    Kept for backward compatibility; simply forwards to :func:`get_proxy_url`.
    """
    proxy_url = get_proxy_url()
    return proxy_url
# Public API of the package: the provider interface, the registry accessor,
# and the module-level convenience helpers defined in this file.
__all__ = [
    "ProxyProvider",
    "get_active_provider",
    "get_playwright_proxy",
    "get_proxy_url",
    "get_requests_proxies",
    "get_residential_proxy_url",
]

View file

@ -0,0 +1,46 @@
"""Abstract base class for residential / rotating proxy providers.
Each provider reads its own credentials from the application Config and exposes
proxy settings in the formats the different HTTP stacks expect:
* ``get_proxy_url`` -> canonical ``http://user:pass@host:port`` string consumed
by Scrapling's fetchers (curl_cffi / patchright / camoufox).
* ``get_requests_proxies`` -> ``{"http": ..., "https": ...}`` dict for
``requests`` / ``aiohttp``.
* ``get_playwright_proxy`` -> Playwright-style ``{"server", "username",
"password"}`` dict.
Add a new vendor by subclassing :class:`ProxyProvider` in ``providers/`` and
registering it in ``registry.py``.
"""
from abc import ABC, abstractmethod
class ProxyProvider(ABC):
"""Interface every proxy provider must implement."""
#: Unique key used to select this provider via the ``PROXY_PROVIDER`` env var.
name: str = "base"
@abstractmethod
def get_proxy_url(self) -> str | None:
"""Return ``http://user:pass@host:port`` (no trailing slash), or ``None``.
This is the canonical form Scrapling/curl_cffi consume directly.
"""
@abstractmethod
def get_playwright_proxy(self) -> dict[str, str] | None:
"""Return a Playwright proxy dict, or ``None`` when not configured."""
def get_requests_proxies(self) -> dict[str, str] | None:
"""Return a ``requests``/``aiohttp`` proxies dict, or ``None``.
Built from :meth:`get_proxy_url` by default; override if a provider needs
different http vs https endpoints.
"""
proxy_url = self.get_proxy_url()
if proxy_url is None:
return None
return {"http": proxy_url, "https": proxy_url}

View file

@ -0,0 +1 @@
"""Concrete proxy provider implementations."""

View file

@ -0,0 +1,65 @@
"""anonymous-proxies.net residential / rotating proxy provider.
The vendor (``rotating.dnsproxifier.com``) encodes the location and rotation
``type`` options inside a base64-encoded JSON "password". The hostname already
includes the port (e.g. ``rotating.dnsproxifier.com:31230``).
"""
import base64
import json
import logging
from urllib.parse import quote

from app.config import Config
from app.utils.proxy.base import ProxyProvider
logger = logging.getLogger(__name__)
class AnonymousProxiesProvider(ProxyProvider):
    """Provider for anonymous-proxies.net credentials in ``RESIDENTIAL_PROXY_*``.

    All settings are read from ``Config`` on every call:
    ``RESIDENTIAL_PROXY_USERNAME``, ``RESIDENTIAL_PROXY_PASSWORD``,
    ``RESIDENTIAL_PROXY_HOSTNAME`` (expected to already include the port),
    ``RESIDENTIAL_PROXY_LOCATION`` and ``RESIDENTIAL_PROXY_TYPE``.
    """

    name = "anonymous_proxies"

    #: Characters left unescaped in the URL userinfo in addition to the
    #: unreserved set: the RFC 3986 "sub-delims". ``:``, ``@``, ``/`` and ``%``
    #: are deliberately absent so they are always percent-encoded.
    _USERINFO_SAFE = "!$&'()*+,;="

    def _password_b64(self) -> str | None:
        """Build the base64-encoded password dict required by the vendor.

        The JSON object carries the raw password (``"p"``), the location
        (``"l"``) and the rotation type (``"t"``).

        Returns:
            The base64 text, or ``None`` when the password is not configured.
        """
        password = Config.RESIDENTIAL_PROXY_PASSWORD
        if not password:
            return None
        password_dict = {
            "p": password,
            "l": Config.RESIDENTIAL_PROXY_LOCATION,
            "t": Config.RESIDENTIAL_PROXY_TYPE,
        }
        return base64.b64encode(
            json.dumps(password_dict).encode("utf-8")
        ).decode("utf-8")

    def _credentials(self) -> tuple[str, str, str] | None:
        """Return ``(username, hostname, password_b64)`` or ``None``.

        ``None`` is returned as soon as any of the three values is missing or
        empty, so callers can treat the proxy as "not configured".
        """
        username = Config.RESIDENTIAL_PROXY_USERNAME
        hostname = Config.RESIDENTIAL_PROXY_HOSTNAME
        password_b64 = self._password_b64()
        if not all([username, hostname, password_b64]):
            return None
        return username, hostname, password_b64

    def get_proxy_url(self) -> str | None:
        """Return ``http://user:pass@host:port`` (no trailing slash), or ``None``.

        Username and password are percent-encoded where necessary: the
        standard base64 alphabet contains ``/``, and a username may contain
        ``:`` or ``@``; left raw, those characters would be read as URL
        delimiters and corrupt the proxy URL. Characters that are legal in
        the userinfo component (including base64's ``+`` and ``=``) are left
        untouched, so credentials that already formed a valid URL produce
        exactly the same string as before.
        """
        credentials = self._credentials()
        if credentials is None:
            return None
        username, hostname, password_b64 = credentials
        # NOTE(review): URL consumers are expected to percent-decode the
        # userinfo (curl documents this for proxy URLs) - confirm for any
        # other client that receives this string.
        user = quote(str(username), safe=self._USERINFO_SAFE)
        secret = quote(password_b64, safe=self._USERINFO_SAFE)
        # No trailing slash: curl_cffi (Scrapling static fetcher) expects a bare
        # ``http://user:pass@host:port`` URL.
        return f"http://{user}:{secret}@{hostname}"

    def get_playwright_proxy(self) -> dict[str, str] | None:
        """Return a ``{"server", "username", "password"}`` dict, or ``None``.

        Credentials are passed as separate fields here, so they are returned
        verbatim (no percent-encoding).
        """
        credentials = self._credentials()
        if credentials is None:
            return None
        username, hostname, password_b64 = credentials
        return {
            "server": f"http://{hostname}",
            "username": username,
            "password": password_b64,
        }

View file

@ -0,0 +1,44 @@
"""Proxy provider registry.
Maps the ``PROXY_PROVIDER`` config value to a :class:`ProxyProvider`
implementation. To add a new vendor: implement a provider in ``providers/`` and
add a single entry to ``_PROVIDERS`` below - no caller changes required.
"""
import logging
from app.config import Config
from app.utils.proxy.base import ProxyProvider
from app.utils.proxy.providers.anonymous_proxies import AnonymousProxiesProvider
logger = logging.getLogger(__name__)
# Registered proxy providers, keyed by their ``name``.
_PROVIDERS: dict[str, type[ProxyProvider]] = {
    AnonymousProxiesProvider.name: AnonymousProxiesProvider,
}
# Key used when ``PROXY_PROVIDER`` is unset/empty, and the fallback when it
# names a provider that is not registered above.
_DEFAULT_PROVIDER = AnonymousProxiesProvider.name
# Lazily created provider instance; set by the first ``get_active_provider()``
# call and reused afterwards.
_active_provider: ProxyProvider | None = None
def get_active_provider() -> ProxyProvider:
    """Return the process-wide proxy provider, creating it on first use.

    The provider is chosen by ``Config.PROXY_PROVIDER`` (surrounding
    whitespace ignored), defaulting to ``_DEFAULT_PROVIDER`` when the setting
    is empty. An unregistered key is logged as a warning and replaced by the
    default provider. The instance is cached in ``_active_provider``.
    """
    global _active_provider

    if _active_provider is None:
        requested = (Config.PROXY_PROVIDER or _DEFAULT_PROVIDER).strip()
        try:
            factory = _PROVIDERS[requested]
        except KeyError:
            logger.warning(
                "Unknown PROXY_PROVIDER '%s'; falling back to '%s'. Available: %s",
                requested,
                _DEFAULT_PROVIDER,
                ", ".join(sorted(_PROVIDERS)),
            )
            factory = _PROVIDERS[_DEFAULT_PROVIDER]
        _active_provider = factory()

    return _active_provider