mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
fix Gmail HTML rendering: unwrap layout tables and use native img
This commit is contained in:
parent
dfb1c6534d
commit
0bb1b730dd
3 changed files with 32 additions and 10 deletions
|
|
@ -10,6 +10,9 @@ from collections.abc import Awaitable, Callable
|
|||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from markdownify import markdownify as md
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
|
@ -130,6 +133,16 @@ class ComposioGmailConnector(ComposioConnector):
|
|||
message_id=message_id,
|
||||
)
|
||||
|
||||
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    Email bodies are commonly built from nested layout tables, which would
    otherwise be rendered as markdown tables. The table scaffolding is
    therefore unwrapped (the tags are removed, their children are kept)
    before the remaining markup is handed to markdownify.

    Args:
        html: Raw HTML markup of the message body.

    Returns:
        The markdown rendering with leading/trailing whitespace stripped,
        or an empty string when ``html`` is empty.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Styles, scripts and images are dropped entirely (tag and contents).
    for tag in soup.find_all(["style", "script", "img"]):
        tag.decompose()

    # find_all() returns a pre-built list, so mutating the tree here is safe.
    for tag in soup.find_all(
        ["table", "thead", "tbody", "tfoot", "tr", "td", "th"]
    ):
        if tag.name in ("td", "th"):
            # Without a separator, adjacent cells in minified HTML
            # ("<td>Name</td><td>Value</td>") would fuse into "NameValue"
            # once the cell tags are unwrapped.
            tag.insert_after(" ")
        tag.unwrap()

    return md(str(soup)).strip()
|
||||
|
||||
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
|
||||
"""
|
||||
Format a Gmail message to markdown.
|
||||
|
|
@ -178,9 +191,10 @@ class ComposioGmailConnector(ComposioConnector):
|
|||
|
||||
markdown_content += "\n---\n\n"
|
||||
|
||||
# Composio provides full message text in 'messageText'
|
||||
# Composio provides full message text in 'messageText' which is often raw HTML
|
||||
message_text = message.get("messageText", "")
|
||||
if message_text:
|
||||
message_text = self._html_to_markdown(message_text)
|
||||
markdown_content += f"## Content\n\n{message_text}\n\n"
|
||||
else:
|
||||
# Fallback to snippet if no messageText
|
||||
|
|
|
|||
|
|
@ -7,9 +7,11 @@ Allows fetching emails from Gmail mailbox using Google OAuth credentials.
|
|||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from markdownify import markdownify as md
|
||||
|
||||
from google.auth.transport.requests import Request
|
||||
from google.oauth2.credentials import Credentials
|
||||
from googleapiclient.discovery import build
|
||||
|
|
@ -348,6 +350,16 @@ class GoogleGmailConnector:
|
|||
except Exception as e:
|
||||
return [], f"Error fetching recent messages: {e!s}"
|
||||
|
||||
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    Layout-table tags are unwrapped (removed while keeping their children)
    so that table-based email layouts are not rendered as markdown tables;
    the remaining markup is then converted by markdownify.

    Args:
        html: Raw HTML markup of the message body.

    Returns:
        The markdown rendering with leading/trailing whitespace stripped,
        or an empty string when ``html`` is empty.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Remove style/script/img elements together with their contents.
    for tag in soup.find_all(["style", "script", "img"]):
        tag.decompose()

    # The result of find_all() is materialized up front, so the tree can be
    # edited while looping over it.
    for tag in soup.find_all(
        ["table", "thead", "tbody", "tfoot", "tr", "td", "th"]
    ):
        if tag.name in ("td", "th"):
            # Keep neighbouring cells apart: unwrapping back-to-back cells
            # would otherwise join their text into a single word.
            tag.insert_after(" ")
        tag.unwrap()

    return md(str(soup)).strip()
|
||||
|
||||
def extract_message_text(self, message: dict[str, Any]) -> str:
|
||||
"""
|
||||
Extract text content from a Gmail message.
|
||||
|
|
@ -387,13 +399,10 @@ class GoogleGmailConnector:
|
|||
)
|
||||
text_content += decoded_data + "\n"
|
||||
elif mime_type == "text/html" and data and not text_content:
|
||||
# Use HTML as fallback if no plain text
|
||||
decoded_data = base64.urlsafe_b64decode(data + "===").decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
# Basic HTML tag removal (you might want to use a proper HTML parser)
|
||||
|
||||
text_content = re.sub(r"<[^>]+>", "", decoded_data)
|
||||
text_content = self._html_to_markdown(decoded_data)
|
||||
|
||||
return text_content.strip()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
import { createCodePlugin } from "@streamdown/code";
|
||||
import { createMathPlugin } from "@streamdown/math";
|
||||
import Image from "next/image";
|
||||
import { Streamdown, type StreamdownProps } from "streamdown";
|
||||
import "katex/dist/katex.min.css";
|
||||
import { cn } from "@/lib/utils";
|
||||
|
|
@ -126,12 +125,12 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
|
|||
),
|
||||
hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
|
||||
img: ({ src, alt, width: _w, height: _h, ...props }) => (
|
||||
<Image
|
||||
// eslint-disable-next-line @next/next/no-img-element
|
||||
<img
|
||||
className="max-w-full h-auto my-4 rounded"
|
||||
alt={alt || "markdown image"}
|
||||
height={100}
|
||||
width={100}
|
||||
src={typeof src === "string" ? src : ""}
|
||||
loading="lazy"
|
||||
{...props}
|
||||
/>
|
||||
),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue