mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-04-28 02:23:52 +02:00
Unified WebPage object return for the WebBrowserEngine API
This commit is contained in:
parent
c62c870ab9
commit
ede23b2fe9
7 changed files with 228 additions and 69 deletions
57
metagpt/utils/parse_html.py
Normal file
57
metagpt/utils/parse_html.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Generator, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class WebPage(BaseModel):
    """A fetched web page: raw ``html``, extracted ``inner_text``, and source ``url``.

    BeautifulSoup parsing and <title> extraction are performed lazily and
    cached in private attributes (pydantic v1 ``underscore_attrs_are_private``).
    """

    inner_text: str
    html: str
    url: str

    class Config:
        underscore_attrs_are_private = True

    # Lazily-populated caches; never set by callers.
    _soup: Optional[BeautifulSoup] = None
    _title: Optional[str] = None

    @property
    def soup(self) -> BeautifulSoup:
        """Parse ``html`` on first access and cache the resulting soup."""
        if self._soup is None:
            self._soup = BeautifulSoup(self.html, "html.parser")
        return self._soup

    @property
    def title(self):
        """The stripped <title> text, or ``""`` when the page has none; cached."""
        if self._title is None:
            tag = self.soup.find("title")
            self._title = "" if tag is None else tag.text.strip()
        return self._title

    def get_links(self) -> Generator[str, None, None]:
        """Yield the href of every anchor tag, skipping ``data:`` URIs.

        Relative links are resolved against ``self.url``.
        """
        for anchor in self.soup.find_all("a", href=True):
            link = anchor["href"]
            if link.startswith("data:"):
                continue
            yield link if link.startswith(("http://", "https://")) else urljoin(self.url, link)
|
||||
|
||||
|
||||
def get_html_content(page: str, base: str):
    """Return the visible text content of *page*.

    NOTE(review): *base* is accepted for interface compatibility but is not
    used by the current implementation.
    """
    return _get_soup(page).get_text(strip=True)
|
||||
|
||||
|
||||
def _get_soup(page: str):
    """Parse *page* into a soup with non-visible elements removed.

    Strips style/script/head/title/document nodes so that only text a user
    would actually see remains.
    """
    parsed = BeautifulSoup(page, "html.parser")
    # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
    invisible_tags = ["style", "script", "[document]", "head", "title"]
    for node in parsed(invisible_tags):
        node.extract()
    return parsed
|
||||
Loading…
Add table
Add a link
Reference in a new issue