mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-02 22:01:05 +02:00
feat(proxy): integrate Scrapling for enhanced web scraping capabilities
- Replaced Playwright with Scrapling's fetchers in the web crawling and YouTube processing modules for improved performance and flexibility. - Updated proxy configuration to support dynamic proxy selection via environment variables. - Enhanced logging to track performance metrics during web scraping operations. - Refactored related modules to utilize the new proxy utilities and streamline the scraping process.
This commit is contained in:
parent
41a93ca8fb
commit
640ef5f15d
16 changed files with 5770 additions and 4886 deletions
46
surfsense_backend/app/utils/proxy/base.py
Normal file
46
surfsense_backend/app/utils/proxy/base.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
"""Abstract base class for residential / rotating proxy providers.
|
||||
|
||||
Each provider reads its own credentials from the application Config and exposes
|
||||
proxy settings in the formats the different HTTP stacks expect:
|
||||
|
||||
* ``get_proxy_url`` -> canonical ``http://user:pass@host:port`` string consumed
|
||||
by Scrapling's fetchers (curl_cffi / patchright / camoufox).
|
||||
* ``get_requests_proxies`` -> ``{"http": ..., "https": ...}`` dict for
|
||||
``requests`` / ``aiohttp``.
|
||||
* ``get_playwright_proxy`` -> Playwright-style ``{"server", "username",
|
||||
"password"}`` dict.
|
||||
|
||||
Add a new vendor by subclassing :class:`ProxyProvider` in ``providers/`` and
|
||||
registering it in ``registry.py``.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class ProxyProvider(ABC):
|
||||
"""Interface every proxy provider must implement."""
|
||||
|
||||
#: Unique key used to select this provider via the ``PROXY_PROVIDER`` env var.
|
||||
name: str = "base"
|
||||
|
||||
@abstractmethod
|
||||
def get_proxy_url(self) -> str | None:
|
||||
"""Return ``http://user:pass@host:port`` (no trailing slash), or ``None``.
|
||||
|
||||
This is the canonical form Scrapling/curl_cffi consume directly.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def get_playwright_proxy(self) -> dict[str, str] | None:
|
||||
"""Return a Playwright proxy dict, or ``None`` when not configured."""
|
||||
|
||||
def get_requests_proxies(self) -> dict[str, str] | None:
|
||||
"""Return a ``requests``/``aiohttp`` proxies dict, or ``None``.
|
||||
|
||||
Built from :meth:`get_proxy_url` by default; override if a provider needs
|
||||
different http vs https endpoints.
|
||||
"""
|
||||
proxy_url = self.get_proxy_url()
|
||||
if proxy_url is None:
|
||||
return None
|
||||
return {"http": proxy_url, "https": proxy_url}
|
||||
Loading…
Add table
Add a link
Reference in a new issue