mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-07-04 22:02:16 +02:00
fix Gmail HTML rendering: unwrap layout tables and use native img
This commit is contained in:
parent
dfb1c6534d
commit
0bb1b730dd
3 changed files with 32 additions and 10 deletions
|
|
@ -10,6 +10,9 @@ from collections.abc import Awaitable, Callable
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from markdownify import markdownify as md
|
||||||
|
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy.future import select
|
from sqlalchemy.future import select
|
||||||
from sqlalchemy.orm import selectinload
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
@ -130,6 +133,16 @@ class ComposioGmailConnector(ComposioConnector):
|
||||||
message_id=message_id,
|
message_id=message_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    Layout tables are flattened rather than rendered as markdown tables:
    every table row becomes its own paragraph and the text of adjacent
    cells is kept apart by a space, so content from different rows or
    cells is never glued together.

    Args:
        html: Raw HTML markup, e.g. an email body.

    Returns:
        The markdown rendering with surrounding whitespace stripped, or an
        empty string when ``html`` is empty.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Styles and scripts carry no readable content; images are dropped too.
    for tag in soup.find_all(["style", "script", "img"]):
        tag.decompose()

    # Minified email HTML has no whitespace between cells, so unwrapping
    # them directly would fuse their text ("Total" + "$5" -> "Total$5").
    for cell in soup.find_all(["td", "th"]):
        if cell.find_next_sibling(["td", "th"]) is not None:
            cell.insert_after(" ")

    # Turn each row into a paragraph so rows stay on separate lines;
    # rows left empty (e.g. spacer rows) produce no output at all.
    for row in soup.find_all("tr"):
        row.name = "p"

    # The remaining table scaffolding is purely structural: keep the children.
    for tag in soup.find_all(["table", "thead", "tbody", "tfoot", "td", "th"]):
        tag.unwrap()

    return md(str(soup)).strip()
|
||||||
|
|
||||||
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
|
def format_gmail_message_to_markdown(self, message: dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Format a Gmail message to markdown.
|
Format a Gmail message to markdown.
|
||||||
|
|
@ -178,9 +191,10 @@ class ComposioGmailConnector(ComposioConnector):
|
||||||
|
|
||||||
markdown_content += "\n---\n\n"
|
markdown_content += "\n---\n\n"
|
||||||
|
|
||||||
# Composio provides full message text in 'messageText'
|
# Composio provides full message text in 'messageText' which is often raw HTML
|
||||||
message_text = message.get("messageText", "")
|
message_text = message.get("messageText", "")
|
||||||
if message_text:
|
if message_text:
|
||||||
|
message_text = self._html_to_markdown(message_text)
|
||||||
markdown_content += f"## Content\n\n{message_text}\n\n"
|
markdown_content += f"## Content\n\n{message_text}\n\n"
|
||||||
else:
|
else:
|
||||||
# Fallback to snippet if no messageText
|
# Fallback to snippet if no messageText
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,11 @@ Allows fetching emails from Gmail mailbox using Google OAuth credentials.
|
||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from markdownify import markdownify as md
|
||||||
|
|
||||||
from google.auth.transport.requests import Request
|
from google.auth.transport.requests import Request
|
||||||
from google.oauth2.credentials import Credentials
|
from google.oauth2.credentials import Credentials
|
||||||
from googleapiclient.discovery import build
|
from googleapiclient.discovery import build
|
||||||
|
|
@ -348,6 +350,16 @@ class GoogleGmailConnector:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return [], f"Error fetching recent messages: {e!s}"
|
return [], f"Error fetching recent messages: {e!s}"
|
||||||
|
|
||||||
|
@staticmethod
def _html_to_markdown(html: str) -> str:
    """Convert HTML (especially email layouts with nested tables) to clean markdown.

    Layout tables are flattened rather than rendered as markdown tables:
    every table row becomes its own paragraph and the text of adjacent
    cells is kept apart by a space, so content from different rows or
    cells is never glued together.

    Args:
        html: Raw HTML markup, e.g. a decoded ``text/html`` message part.

    Returns:
        The markdown rendering with surrounding whitespace stripped, or an
        empty string when ``html`` is empty.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Styles and scripts carry no readable content; images are dropped too.
    for tag in soup.find_all(["style", "script", "img"]):
        tag.decompose()

    # Minified email HTML has no whitespace between cells, so unwrapping
    # them directly would fuse their text ("Total" + "$5" -> "Total$5").
    for cell in soup.find_all(["td", "th"]):
        if cell.find_next_sibling(["td", "th"]) is not None:
            cell.insert_after(" ")

    # Turn each row into a paragraph so rows stay on separate lines;
    # rows left empty (e.g. spacer rows) produce no output at all.
    for row in soup.find_all("tr"):
        row.name = "p"

    # The remaining table scaffolding is purely structural: keep the children.
    for tag in soup.find_all(["table", "thead", "tbody", "tfoot", "td", "th"]):
        tag.unwrap()

    return md(str(soup)).strip()
|
||||||
|
|
||||||
def extract_message_text(self, message: dict[str, Any]) -> str:
|
def extract_message_text(self, message: dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Extract text content from a Gmail message.
|
Extract text content from a Gmail message.
|
||||||
|
|
@ -387,13 +399,10 @@ class GoogleGmailConnector:
|
||||||
)
|
)
|
||||||
text_content += decoded_data + "\n"
|
text_content += decoded_data + "\n"
|
||||||
elif mime_type == "text/html" and data and not text_content:
|
elif mime_type == "text/html" and data and not text_content:
|
||||||
# Use HTML as fallback if no plain text
|
|
||||||
decoded_data = base64.urlsafe_b64decode(data + "===").decode(
|
decoded_data = base64.urlsafe_b64decode(data + "===").decode(
|
||||||
"utf-8", errors="ignore"
|
"utf-8", errors="ignore"
|
||||||
)
|
)
|
||||||
# Basic HTML tag removal (you might want to use a proper HTML parser)
|
text_content = self._html_to_markdown(decoded_data)
|
||||||
|
|
||||||
text_content = re.sub(r"<[^>]+>", "", decoded_data)
|
|
||||||
|
|
||||||
return text_content.strip()
|
return text_content.strip()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
import { createCodePlugin } from "@streamdown/code";
|
import { createCodePlugin } from "@streamdown/code";
|
||||||
import { createMathPlugin } from "@streamdown/math";
|
import { createMathPlugin } from "@streamdown/math";
|
||||||
import Image from "next/image";
|
|
||||||
import { Streamdown, type StreamdownProps } from "streamdown";
|
import { Streamdown, type StreamdownProps } from "streamdown";
|
||||||
import "katex/dist/katex.min.css";
|
import "katex/dist/katex.min.css";
|
||||||
import { cn } from "@/lib/utils";
|
import { cn } from "@/lib/utils";
|
||||||
|
|
@ -126,12 +125,12 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
|
||||||
),
|
),
|
||||||
hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
|
hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
|
||||||
img: ({ src, alt, width: _w, height: _h, ...props }) => (
|
img: ({ src, alt, width: _w, height: _h, ...props }) => (
|
||||||
<Image
|
// eslint-disable-next-line @next/next/no-img-element
|
||||||
|
<img
|
||||||
className="max-w-full h-auto my-4 rounded"
|
className="max-w-full h-auto my-4 rounded"
|
||||||
alt={alt || "markdown image"}
|
alt={alt || "markdown image"}
|
||||||
height={100}
|
|
||||||
width={100}
|
|
||||||
src={typeof src === "string" ? src : ""}
|
src={typeof src === "string" ? src : ""}
|
||||||
|
loading="lazy"
|
||||||
{...props}
|
{...props}
|
||||||
/>
|
/>
|
||||||
),
|
),
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue