Unify the WebBrowserEngine API so that it returns a WebPage object

This commit is contained in:
shenchucheng 2023-08-07 16:35:08 +08:00
parent c62c870ab9
commit ede23b2fe9
7 changed files with 228 additions and 69 deletions

View file

@ -0,0 +1,57 @@
#!/usr/bin/env python
from __future__ import annotations
from typing import Generator, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from pydantic import BaseModel
class WebPage(BaseModel):
    """A web page's text, raw HTML and URL, with lazily derived views.

    The parsed document (``soup``) and the page ``title`` are computed on
    first access and then cached in private attributes.
    """

    inner_text: str
    html: str
    url: str

    class Config:
        # Lets the underscore-prefixed attributes below act as private
        # attributes instead of model fields.
        underscore_attrs_are_private = True

    # Caches backing the ``soup`` and ``title`` properties.
    _soup: Optional[BeautifulSoup] = None
    _title: Optional[str] = None

    @property
    def soup(self) -> BeautifulSoup:
        """Return ``html`` parsed with ``html.parser``; parsed once, then cached."""
        if self._soup is not None:
            return self._soup
        self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    @property
    def title(self):
        """Return the stripped text of the first ``<title>`` tag, or ``""`` if absent."""
        if self._title is not None:
            return self._title
        tag = self.soup.find("title")
        if tag is None:
            self._title = ""
        else:
            self._title = tag.text.strip()
        return self._title

    def get_links(self) -> Generator[str, None, None]:
        """Yield the target of every ``<a>`` tag that carries an ``href``.

        ``data:`` URLs are skipped. Targets that do not start with
        ``http://`` or ``https://`` are resolved against ``self.url``.
        """
        for anchor in self.soup.find_all("a", href=True):
            href = anchor["href"]
            if href.startswith("data:"):
                continue
            is_http = href.startswith(("http://", "https://"))
            yield href if is_http else urljoin(self.url, href)
def get_html_content(page: str, base: str):
    """Return the text content of an HTML document.

    Args:
        page: HTML markup to extract text from.
        base: Not used by this function.

    Returns:
        The result of ``get_text(strip=True)`` on the soup produced by
        ``_get_soup(page)``.
    """
    return _get_soup(page).get_text(strip=True)
def _get_soup(page: str):
    """Parse *page* with ``html.parser`` and remove non-content elements.

    Every element matching the tag names listed below is extracted from the
    tree before the soup is returned.
    """
    parsed = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
    hidden_tags = ["style", "script", "[document]", "head", "title"]
    for element in parsed.find_all(hidden_tags):
        element.extract()
    return parsed