fix Gmail HTML rendering: unwrap layout tables and use native img

This commit is contained in:
CREDO23 2026-03-10 18:57:49 +02:00
parent dfb1c6534d
commit 0bb1b730dd
3 changed files with 32 additions and 10 deletions

View file

@ -10,6 +10,9 @@ from collections.abc import Awaitable, Callable
from datetime import UTC, datetime
from typing import Any
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
@ -130,6 +133,16 @@ class ComposioGmailConnector(ComposioConnector):
message_id=message_id,
)
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    Steps:
      1. Parse the input with BeautifulSoup's built-in ``html.parser``.
      2. Remove ``<style>``, ``<script>`` and ``<img>`` elements entirely.
      3. Unwrap table-structure tags (``table``, ``thead``, ``tbody``,
         ``tfoot``, ``tr``, ``td``, ``th``) so only their children remain.
      4. Convert the remaining markup with markdownify and strip the
         surrounding whitespace.

    Args:
        html: Raw HTML string (e.g. the body of an email).

    Returns:
        The markdown rendering of ``html`` with leading/trailing
        whitespace removed.
    """
    soup = BeautifulSoup(html, "html.parser")

    # These elements are removed together with everything inside them.
    for tag in soup.find_all(["style", "script", "img"]):
        tag.decompose()

    for tag in soup.find_all(
        ["table", "thead", "tbody", "tfoot", "tr", "td", "th"]
    ):
        # unwrap() removes the element boundary, so without a separator the
        # text of adjacent cells/rows in compact markup would be glued
        # together (e.g. "<td>Name</td><td>Value</td>" -> "NameValue").
        if tag.name in ("td", "th"):
            tag.insert_after(" ")
        elif tag.name == "tr":
            tag.insert_after("\n")
        tag.unwrap()

    return md(str(soup)).strip()
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
"""
Format a Gmail message to markdown.
@ -178,9 +191,10 @@ class ComposioGmailConnector(ComposioConnector):
markdown_content += "\n---\n\n"
# Composio provides full message text in 'messageText'
# Composio provides full message text in 'messageText' which is often raw HTML
message_text = message.get("messageText", "")
if message_text:
message_text = self._html_to_markdown(message_text)
markdown_content += f"## Content\n\n{message_text}\n\n"
else:
# Fallback to snippet if no messageText

View file

@ -7,9 +7,11 @@ Allows fetching emails from Gmail mailbox using Google OAuth credentials.
import base64
import json
import logging
import re
from typing import Any
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
@ -348,6 +350,16 @@ class GoogleGmailConnector:
except Exception as e:
return [], f"Error fetching recent messages: {e!s}"
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    The input is parsed with BeautifulSoup's built-in ``html.parser``;
    ``<style>``, ``<script>`` and ``<img>`` elements are removed entirely;
    table-structure tags (``table``, ``thead``, ``tbody``, ``tfoot``,
    ``tr``, ``td``, ``th``) are unwrapped so only their children remain;
    the result is converted with markdownify and stripped.

    Args:
        html: Raw HTML string (e.g. a decoded ``text/html`` message part).

    Returns:
        The markdown rendering of ``html`` with leading/trailing
        whitespace removed.
    """
    removed_tags = ["style", "script", "img"]
    cell_tags = ("td", "th")
    table_tags = ["table", "thead", "tbody", "tfoot", "tr", *cell_tags]

    soup = BeautifulSoup(html, "html.parser")

    # decompose() discards the element and all of its contents.
    for tag in soup.find_all(removed_tags):
        tag.decompose()

    for tag in soup.find_all(table_tags):
        # Keep a whitespace boundary where the cell/row used to end;
        # otherwise unwrapping compact markup such as
        # "<td>Name</td><td>Value</td>" yields the glued text "NameValue".
        if tag.name in cell_tags:
            tag.insert_after(" ")
        elif tag.name == "tr":
            tag.insert_after("\n")
        tag.unwrap()

    return md(str(soup)).strip()
def extract_message_text(self, message: dict[str, Any]) -> str:
"""
Extract text content from a Gmail message.
@ -387,13 +399,10 @@ class GoogleGmailConnector:
)
text_content += decoded_data + "\n"
elif mime_type == "text/html" and data and not text_content:
# Use HTML as fallback if no plain text
decoded_data = base64.urlsafe_b64decode(data + "===").decode(
"utf-8", errors="ignore"
)
# Basic HTML tag removal (you might want to use a proper HTML parser)
text_content = re.sub(r"<[^>]+>", "", decoded_data)
text_content = self._html_to_markdown(decoded_data)
return text_content.strip()

View file

@ -1,6 +1,5 @@
import { createCodePlugin } from "@streamdown/code";
import { createMathPlugin } from "@streamdown/math";
import Image from "next/image";
import { Streamdown, type StreamdownProps } from "streamdown";
import "katex/dist/katex.min.css";
import { cn } from "@/lib/utils";
@ -126,12 +125,12 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
),
hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
img: ({ src, alt, width: _w, height: _h, ...props }) => (
<Image
// eslint-disable-next-line @next/next/no-img-element
<img
className="max-w-full h-auto my-4 rounded"
alt={alt || "markdown image"}
height={100}
width={100}
src={typeof src === "string" ? src : ""}
loading="lazy"
{...props}
/>
),