mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 16:56:22 +02:00
610 lines
21 KiB
Python
610 lines
21 KiB
Python
"""
|
|
Confluence OAuth Connector.

Handles OAuth-based authentication and token refresh for Confluence API access.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
import httpx
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy.future import select
|
|
|
|
from app.config import config
|
|
from app.connectors.confluence_connector import ConfluenceConnector
|
|
from app.db import SearchSourceConnector
|
|
from app.routes.confluence_add_connector_route import refresh_confluence_token
|
|
from app.schemas.atlassian_auth_credentials import AtlassianAuthCredentialsBase
|
|
from app.utils.oauth_security import TokenEncryption
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ConfluenceHistoryConnector:
    """
    OAuth-aware Confluence connector that refreshes its token on demand.

    Requests to the Confluence API are authenticated with an OAuth 2.0
    access token; an expired token is refreshed automatically before use.
    Connectors that were set up with a legacy API token remain supported
    for backward compatibility.
    """
|
|
|
|
def __init__(
    self,
    session: AsyncSession,
    connector_id: int,
    credentials: AtlassianAuthCredentialsBase | None = None,
):
    """
    Set up connector state; nothing is fetched until the first API call.

    Args:
        session: Database session used to look up the connector
        connector_id: ID of the connector whose config holds the credentials
        credentials: Confluence OAuth credentials (optional; read from the
            connector config on first use when not provided)
    """
    # Caller-supplied state
    self._session = session
    self._connector_id = connector_id
    self._credentials = credentials

    # OAuth is assumed until the stored connector config says otherwise
    self._use_oauth = True
    self._cloud_id: str | None = None
    self._base_url: str | None = None
    self._http_client: httpx.AsyncClient | None = None

    # Legacy (email + API token) state, populated lazily from the config
    self._legacy_email: str | None = None
    self._legacy_api_token: str | None = None
    self._legacy_confluence_client: ConfluenceConnector | None = None
|
|
|
|
async def _get_valid_token(self) -> str:
    """
    Get a valid Confluence access token, loading and refreshing it if needed.

    On first use the connector config is read from the database. A config
    that carries an access token (or is flagged as encrypted) is treated as
    OAuth: tokens are decrypted when flagged and parsed into credentials.
    Any other config is treated as legacy email / API token authentication.
    An expired, refreshable OAuth token is refreshed before it is returned.

    NOTE(review): when credentials are passed to the constructor, this
    method never sets ``_cloud_id`` / ``_base_url`` (they are only read from
    the connector config), so ``_get_base_url`` raises for OAuth until a
    refresh happens - confirm callers rely on loading from the database.

    Returns:
        Valid access token, or "" when legacy API token auth is in use

    Raises:
        ValueError: If the connector or its credentials are missing or invalid
        Exception: If token refresh fails
    """

    def decrypt_tokens(cfg: dict) -> bool:
        """Decrypt stored tokens in place; return True when decryption ran."""
        if not cfg.get("_token_encrypted", False):
            return False
        if not config.SECRET_KEY:
            # The stored values are still used as-is in this case; the
            # warning makes the resulting auth failure diagnosable.
            logger.warning(
                f"Confluence tokens for connector {self._connector_id} are marked "
                "encrypted but SECRET_KEY is not configured; using stored values as-is"
            )
            return False

        token_encryption = TokenEncryption(config.SECRET_KEY)
        for field in ("access_token", "refresh_token"):
            if cfg.get(field):
                cfg[field] = token_encryption.decrypt_token(cfg[field])
        return True

    # Legacy auth never populates self._credentials, so without this guard
    # every call would query the database again just to return "".
    if (
        self._credentials is None
        and not self._use_oauth
        and self._legacy_email
        and self._legacy_api_token
        and self._base_url
    ):
        return ""

    # Load credentials from DB if not provided
    if self._credentials is None:
        result = await self._session.execute(
            select(SearchSourceConnector).filter(
                SearchSourceConnector.id == self._connector_id
            )
        )
        connector = result.scalars().first()

        if not connector:
            raise ValueError(f"Connector {self._connector_id} not found")

        # A missing config is treated as empty so it ends in the
        # "credentials not found" error below instead of an AttributeError.
        config_data = dict(connector.config or {})

        # Check if using OAuth or legacy API token
        is_oauth = config_data.get("_token_encrypted", False) or config_data.get(
            "access_token"
        )

        if is_oauth:
            # OAuth 2.0 authentication
            if not config_data.get("access_token"):
                raise ValueError(
                    "Confluence access token not found. "
                    "Please reconnect your Confluence account."
                )

            # Decrypt credentials if they are encrypted
            try:
                if decrypt_tokens(config_data):
                    logger.info(
                        f"Decrypted Confluence credentials for connector {self._connector_id}"
                    )
            except Exception as e:
                logger.error(
                    f"Failed to decrypt Confluence credentials for connector {self._connector_id}: {e!s}"
                )
                raise ValueError(
                    f"Failed to decrypt Confluence credentials: {e!s}"
                ) from e

            # Final validation after decryption
            final_token = config_data.get("access_token")
            if not final_token or (
                isinstance(final_token, str) and not final_token.strip()
            ):
                raise ValueError(
                    "Confluence access token is invalid or empty. "
                    "Please reconnect your Confluence account."
                )

            try:
                self._credentials = AtlassianAuthCredentialsBase.from_dict(
                    config_data
                )
                # Store cloud_id and base_url for API calls
                # (with backward compatibility for site_url)
                self._cloud_id = config_data.get("cloud_id")
                self._base_url = config_data.get("base_url") or config_data.get(
                    "site_url"
                )
                self._use_oauth = True
            except Exception as e:
                raise ValueError(
                    f"Invalid Confluence OAuth credentials: {e!s}"
                ) from e
        else:
            # Legacy API token authentication
            self._legacy_email = config_data.get("CONFLUENCE_EMAIL")
            self._legacy_api_token = config_data.get("CONFLUENCE_API_TOKEN")
            self._base_url = config_data.get("CONFLUENCE_BASE_URL")
            self._use_oauth = False

            if (
                not self._legacy_email
                or not self._legacy_api_token
                or not self._base_url
            ):
                raise ValueError(
                    "Confluence credentials not found in connector config"
                )

    # Check if token is expired and refreshable (only for OAuth)
    if (
        self._use_oauth
        and self._credentials.is_expired
        and self._credentials.is_refreshable
    ):
        try:
            logger.info(
                f"Confluence token expired for connector {self._connector_id}, refreshing..."
            )

            # Get connector for refresh
            result = await self._session.execute(
                select(SearchSourceConnector).filter(
                    SearchSourceConnector.id == self._connector_id
                )
            )
            connector = result.scalars().first()

            if not connector:
                raise RuntimeError(
                    f"Connector {self._connector_id} not found; cannot refresh token."
                )

            # Refresh token
            connector = await refresh_confluence_token(self._session, connector)

            # Reload credentials after refresh
            config_data = connector.config.copy()
            decrypt_tokens(config_data)

            self._credentials = AtlassianAuthCredentialsBase.from_dict(config_data)
            self._cloud_id = config_data.get("cloud_id")
            # Handle backward compatibility: check both base_url and site_url
            self._base_url = config_data.get("base_url") or config_data.get(
                "site_url"
            )

            # Invalidate cached client so it's recreated with new token
            if self._http_client:
                await self._http_client.aclose()
                self._http_client = None

            logger.info(
                f"Successfully refreshed Confluence token for connector {self._connector_id}"
            )
        except Exception as e:
            logger.error(
                f"Failed to refresh Confluence token for connector {self._connector_id}: {e!s}"
            )
            raise Exception(
                f"Failed to refresh Confluence OAuth credentials: {e!s}"
            ) from e

    if self._use_oauth:
        return self._credentials.access_token

    # For legacy auth, return empty string (not used for token-based auth)
    return ""
|
|
|
|
async def _get_client(self) -> httpx.AsyncClient:
    """
    Return the shared HTTP client, creating it on first use.

    Returns:
        httpx.AsyncClient instance (created with a 30 second timeout)
    """
    client = self._http_client
    if client is None:
        client = httpx.AsyncClient(timeout=30.0)
        self._http_client = client
    return client
|
|
|
|
async def _get_legacy_client(self) -> ConfluenceConnector:
    """
    Return the cached legacy ConfluenceConnector, building it on first use.

    The client is built from the legacy base URL, email and API token
    currently stored on this instance.

    Returns:
        ConfluenceConnector instance
    """
    legacy_client = self._legacy_confluence_client
    if legacy_client is None:
        legacy_client = ConfluenceConnector(
            base_url=self._base_url,
            email=self._legacy_email,
            api_token=self._legacy_api_token,
        )
        self._legacy_confluence_client = legacy_client
    return legacy_client
|
|
|
|
async def _get_base_url(self) -> str:
    """
    Resolve the base URL that Confluence API calls are built on.

    Returns:
        For OAuth, the Atlassian gateway URL for the stored cloud ID;
        for legacy auth, the stored base URL ("" when unset)

    Raises:
        ValueError: If OAuth is in use and no cloud ID is available
    """
    if self._use_oauth:
        if not self._cloud_id:
            raise ValueError("Cloud ID not available. Cannot construct API URL.")
        # Atlassian API format: https://api.atlassian.com/ex/confluence/{cloudid}
        return f"https://api.atlassian.com/ex/confluence/{self._cloud_id}"

    # Legacy auth uses the configured base URL directly
    return self._base_url or ""
|
|
|
|
async def _make_api_request(
|
|
self, endpoint: str, params: dict[str, Any] | None = None
|
|
) -> dict[str, Any]:
|
|
"""
|
|
Make a request to the Confluence API.
|
|
|
|
Args:
|
|
endpoint: API endpoint (without base URL)
|
|
params: Query parameters for the request (optional)
|
|
|
|
Returns:
|
|
Response data from the API
|
|
|
|
Raises:
|
|
ValueError: If credentials have not been set
|
|
Exception: If the API request fails
|
|
"""
|
|
if not self._use_oauth:
|
|
# Use legacy ConfluenceConnector for API requests
|
|
client = await self._get_legacy_client()
|
|
# ConfluenceConnector uses synchronous requests, so we need to handle this differently
|
|
# For now, we'll use the legacy client's make_api_request method
|
|
# But since it's sync, we'll need to wrap it
|
|
import asyncio
|
|
|
|
loop = asyncio.get_event_loop()
|
|
return await loop.run_in_executor(
|
|
None, client.make_api_request, endpoint, params
|
|
)
|
|
|
|
# OAuth flow
|
|
token = await self._get_valid_token()
|
|
base_url = await self._get_base_url()
|
|
http_client = await self._get_client()
|
|
|
|
url = f"{base_url}/wiki/api/v2/{endpoint}"
|
|
headers = {
|
|
"Content-Type": "application/json",
|
|
"Authorization": f"Bearer {token}",
|
|
"Accept": "application/json",
|
|
}
|
|
|
|
try:
|
|
response = await http_client.get(url, headers=headers, params=params)
|
|
response.raise_for_status()
|
|
return response.json()
|
|
except httpx.HTTPStatusError as e:
|
|
# Enhanced error logging to see the actual error
|
|
error_detail = {
|
|
"status_code": e.response.status_code,
|
|
"url": str(e.request.url),
|
|
"response_text": e.response.text,
|
|
"headers": dict(e.response.headers),
|
|
}
|
|
logger.error(f"Confluence API HTTP error: {error_detail}")
|
|
raise Exception(
|
|
f"Confluence API request failed (HTTP {e.response.status_code}): {e.response.text}"
|
|
) from e
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Confluence API request error: {e!s}", exc_info=True)
|
|
raise Exception(f"Confluence API request failed: {e!s}") from e
|
|
|
|
async def get_all_spaces(self) -> list[dict[str, Any]]:
    """
    Fetch all spaces from Confluence, following cursor pagination.

    The cursor is parsed out of the ``_links.next`` URL and percent-decoded
    before being sent back as a query parameter, so the HTTP client does not
    encode it a second time.

    Returns:
        List of space objects

    Raises:
        ValueError: If credentials have not been set
        Exception: If the API request fails or returns an invalid response
    """
    from urllib.parse import parse_qs, urlsplit

    params: dict[str, Any] = {"limit": 100}
    all_spaces: list[dict[str, Any]] = []

    while True:
        result = await self._make_api_request("spaces", params)

        if not isinstance(result, dict) or "results" not in result:
            raise Exception("Invalid response from Confluence API")

        all_spaces.extend(result["results"])

        # Check if there are more spaces to fetch
        next_link = (result.get("_links") or {}).get("next")
        if not isinstance(next_link, str) or not next_link:
            break

        # parse_qs decodes the value and drops a blank cursor
        cursor_values = parse_qs(urlsplit(next_link).query).get("cursor")
        if not cursor_values:
            break

        cursor = cursor_values[0]
        if cursor == params.get("cursor"):
            # Same cursor again would repeat the request forever
            break
        params["cursor"] = cursor

    return all_spaces
|
|
|
|
async def get_pages_in_space(
    self, space_id: str, include_body: bool = True
) -> list[dict[str, Any]]:
    """
    Fetch all pages in a specific space, following cursor pagination.

    The cursor is parsed out of the ``_links.next`` URL and percent-decoded
    before being sent back as a query parameter, so the HTTP client does not
    encode it a second time.

    Args:
        space_id: The ID of the space to fetch pages from
        include_body: Whether to include page body content (requested in
            "storage" format)

    Returns:
        List of page objects

    Raises:
        ValueError: If credentials have not been set
        Exception: If the API request fails or returns an invalid response
    """
    from urllib.parse import parse_qs, urlsplit

    params: dict[str, Any] = {"limit": 100}
    if include_body:
        params["body-format"] = "storage"

    all_pages: list[dict[str, Any]] = []

    while True:
        result = await self._make_api_request(f"spaces/{space_id}/pages", params)

        if not isinstance(result, dict) or "results" not in result:
            raise Exception("Invalid response from Confluence API")

        all_pages.extend(result["results"])

        # Check if there are more pages to fetch
        next_link = (result.get("_links") or {}).get("next")
        if not isinstance(next_link, str) or not next_link:
            break

        # parse_qs decodes the value and drops a blank cursor
        cursor_values = parse_qs(urlsplit(next_link).query).get("cursor")
        if not cursor_values:
            break

        cursor = cursor_values[0]
        if cursor == params.get("cursor"):
            # Same cursor again would repeat the request forever
            break
        params["cursor"] = cursor

    return all_pages
|
|
|
|
async def get_page_comments(self, page_id: str) -> list[dict[str, Any]]:
    """
    Collect every comment on a page: footer comments first, then inline.

    Args:
        page_id: The ID of the page to fetch comments from

    Returns:
        List of comment objects

    Raises:
        ValueError: If credentials have not been set
        Exception: If the API request fails
    """
    # The two comment kinds live behind separate endpoints
    footer = await self._get_comments_for_page(page_id, "footer-comments")
    inline = await self._get_comments_for_page(page_id, "inline-comments")
    return [*footer, *inline]
|
|
|
|
async def _get_comments_for_page(
    self, page_id: str, comment_type: str
) -> list[dict[str, Any]]:
    """
    Helper method to fetch comments of a specific type for a page.

    Pagination stops quietly (returning what was collected so far) when a
    response is not a dict with a "results" key. The cursor is parsed out of
    the ``_links.next`` URL and percent-decoded before being sent back, so
    the HTTP client does not encode it a second time.

    Args:
        page_id: The ID of the page
        comment_type: Type of comments ('footer-comments' or 'inline-comments')

    Returns:
        List of comment objects
    """
    from urllib.parse import parse_qs, urlsplit

    params: dict[str, Any] = {
        "limit": 100,
        "body-format": "storage",
    }
    all_comments: list[dict[str, Any]] = []

    while True:
        result = await self._make_api_request(
            f"pages/{page_id}/{comment_type}", params
        )

        if not isinstance(result, dict) or "results" not in result:
            break  # No comments or invalid response

        all_comments.extend(result["results"])

        # Check if there are more comments to fetch
        next_link = (result.get("_links") or {}).get("next")
        if not isinstance(next_link, str) or not next_link:
            break

        # parse_qs decodes the value and drops a blank cursor
        cursor_values = parse_qs(urlsplit(next_link).query).get("cursor")
        if not cursor_values:
            break

        cursor = cursor_values[0]
        if cursor == params.get("cursor"):
            # Same cursor again would repeat the request forever
            break
        params["cursor"] = cursor

    return all_comments
|
|
|
|
async def get_pages_by_date_range(
|
|
self,
|
|
start_date: str,
|
|
end_date: str,
|
|
space_ids: list[str] | None = None,
|
|
include_comments: bool = True,
|
|
) -> tuple[list[dict[str, Any]], str | None]:
|
|
"""
|
|
Fetch pages within a date range, optionally filtered by spaces.
|
|
|
|
Args:
|
|
start_date: Start date in YYYY-MM-DD format
|
|
end_date: End date in YYYY-MM-DD format (inclusive)
|
|
space_ids: Optional list of space IDs to filter pages
|
|
include_comments: Whether to include comments for each page
|
|
|
|
Returns:
|
|
Tuple containing (pages list with comments, error message or None)
|
|
"""
|
|
try:
|
|
if not self._use_oauth:
|
|
# Use legacy ConfluenceConnector for API requests
|
|
client = await self._get_legacy_client()
|
|
# Ensure credentials are loaded
|
|
await self._get_valid_token()
|
|
# ConfluenceConnector.get_pages_by_date_range is synchronous
|
|
import asyncio
|
|
|
|
loop = asyncio.get_event_loop()
|
|
return await loop.run_in_executor(
|
|
None,
|
|
client.get_pages_by_date_range,
|
|
start_date,
|
|
end_date,
|
|
space_ids,
|
|
include_comments,
|
|
)
|
|
|
|
# OAuth flow
|
|
all_pages = []
|
|
|
|
if space_ids:
|
|
# Fetch pages from specific spaces
|
|
for space_id in space_ids:
|
|
pages = await self.get_pages_in_space(space_id, include_body=True)
|
|
all_pages.extend(pages)
|
|
else:
|
|
# Fetch all pages (this might be expensive for large instances)
|
|
params = {
|
|
"limit": 100,
|
|
"body-format": "storage",
|
|
}
|
|
|
|
cursor = None
|
|
while True:
|
|
if cursor:
|
|
params["cursor"] = cursor
|
|
|
|
result = await self._make_api_request("pages", params)
|
|
if not isinstance(result, dict) or "results" not in result:
|
|
break
|
|
|
|
pages = result["results"]
|
|
all_pages.extend(pages)
|
|
|
|
links = result.get("_links", {})
|
|
if "next" not in links:
|
|
break
|
|
|
|
next_link = links["next"]
|
|
if "cursor=" in next_link:
|
|
cursor = next_link.split("cursor=")[1].split("&")[0]
|
|
else:
|
|
break
|
|
|
|
return all_pages, None
|
|
|
|
except Exception as e:
|
|
return [], f"Error fetching pages: {e!s}"
|
|
|
|
async def close(self):
    """Close the HTTP client, if one is open, and drop the cached legacy client."""
    http_client = self._http_client
    if http_client:
        await http_client.aclose()
        self._http_client = None

    # The legacy client holds nothing that needs explicit closing
    self._legacy_confluence_client = None
|
|
|
|
async def __aenter__(self):
    """Enter the async context; the connector itself is the managed object."""
    return self
|
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the async context, closing the connector whether or not an error occurred."""
    await self.close()
|