diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py index e83ba5cfb..94ee7b14c 100644 --- a/surfsense_backend/app/connectors/composio_gmail_connector.py +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -10,6 +10,9 @@ from collections.abc import Awaitable, Callable from datetime import UTC, datetime from typing import Any +from bs4 import BeautifulSoup +from markdownify import markdownify as md + from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -130,6 +133,16 @@ class ComposioGmailConnector(ComposioConnector): message_id=message_id, ) + @staticmethod + def _html_to_markdown(html: str) -> str: + """Convert HTML (especially email layouts with nested tables) to clean markdown.""" + soup = BeautifulSoup(html, "html.parser") + for tag in soup.find_all(["style", "script", "img"]): + tag.decompose() + for tag in soup.find_all(["table", "thead", "tbody", "tfoot", "tr", "td", "th"]): + tag.unwrap() + return md(str(soup)).strip() + def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str: """ Format a Gmail message to markdown. @@ -178,9 +191,10 @@ class ComposioGmailConnector(ComposioConnector): markdown_content += "\n---\n\n" - # Composio provides full message text in 'messageText' + # Composio provides full message text in 'messageText' which is often raw HTML message_text = message.get("messageText", "") if message_text: + message_text = self._html_to_markdown(message_text) markdown_content += f"## Content\n\n{message_text}\n\n" else: # Fallback to snippet if no messageText diff --git a/surfsense_backend/app/connectors/google_gmail_connector.py b/surfsense_backend/app/connectors/google_gmail_connector.py index 7c7262bff..46b825253 100644 --- a/surfsense_backend/app/connectors/google_gmail_connector.py +++ b/surfsense_backend/app/connectors/google_gmail_connector.py @@ -7,9 +7,11 @@ Allows fetching emails from Gmail mailbox using Google OAuth credentials. import base64 import json import logging -import re from typing import Any +from bs4 import BeautifulSoup +from markdownify import markdownify as md + from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials from googleapiclient.discovery import build @@ -348,6 +350,16 @@ class GoogleGmailConnector: except Exception as e: return [], f"Error fetching recent messages: {e!s}" + @staticmethod + def _html_to_markdown(html: str) -> str: + """Convert HTML (especially email layouts with nested tables) to clean markdown.""" + soup = BeautifulSoup(html, "html.parser") + for tag in soup.find_all(["style", "script", "img"]): + tag.decompose() + for tag in soup.find_all(["table", "thead", "tbody", "tfoot", "tr", "td", "th"]): + tag.unwrap() + return md(str(soup)).strip() + def extract_message_text(self, message: dict[str, Any]) -> str: """ Extract text content from a Gmail message. @@ -387,13 +399,10 @@ class GoogleGmailConnector: ) text_content += decoded_data + "\n" elif mime_type == "text/html" and data and not text_content: - # Use HTML as fallback if no plain text decoded_data = base64.urlsafe_b64decode(data + "===").decode( "utf-8", errors="ignore" ) - # Basic HTML tag removal (you might want to use a proper HTML parser) - - text_content = re.sub(r"<[^>]+>", "", decoded_data) + text_content = self._html_to_markdown(decoded_data) return text_content.strip() diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index 7e0fc17b1..e22df8998 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -1,6 +1,5 @@ import { createCodePlugin } from "@streamdown/code"; import { createMathPlugin } from "@streamdown/math"; -import Image from "next/image"; import { Streamdown, type StreamdownProps } from "streamdown"; import "katex/dist/katex.min.css"; import { cn } from "@/lib/utils"; @@ -126,12 +125,12 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) { ), hr: ({ ...props }) =>
, img: ({ src, alt, width: _w, height: _h, ...props }) => ( - {alt ),