SurfSense/surfsense_backend/app/etl_pipeline/file_classifier.py

143 lines
3.2 KiB
Python

from enum import Enum
from pathlib import PurePosixPath
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
get_document_extensions_for_service,
)
# Lower-case, dot-prefixed extensions that classify_file() maps to
# FileCategory.PLAINTEXT. Grouped by kind purely for readability; the
# resulting frozenset is unordered.
PLAINTEXT_EXTENSIONS = frozenset(
    {
        # Prose, notes, markup and logs
        ".md", ".markdown", ".txt", ".text", ".log", ".rst",
        ".tex", ".bib", ".org", ".adoc", ".asciidoc",
        # Structured data and configuration
        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg",
        ".conf", ".xml", ".env", ".gitignore", ".dockerignore",
        ".editorconfig",
        # Stylesheets
        ".css", ".scss", ".less", ".sass",
        # Python
        ".py", ".pyw", ".pyi", ".pyx",
        # JavaScript / TypeScript and component formats
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
        ".vue", ".svelte", ".astro",
        # Java, Kotlin, Scala, Groovy
        ".java", ".kt", ".kts", ".scala", ".groovy",
        # C / C++ sources and headers
        ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",
        # C# / F#
        ".cs", ".fs", ".fsx",
        # Other programming languages
        ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", ".swift",
        ".m", ".mm", ".r", ".jl",
        # Shell and batch scripts
        ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1",
        # Query and schema languages
        ".sql", ".graphql", ".gql", ".proto",
        # Build and infrastructure definitions
        ".makefile", ".cmake", ".tf", ".hcl",
    }
)
# Lower-case, dot-prefixed extensions that classify_file() maps to
# FileCategory.AUDIO.
AUDIO_EXTENSIONS = frozenset(
    (
        ".mp3",
        ".mp4",
        ".mpeg",
        ".mpga",
        ".m4a",
        ".wav",
        ".webm",
    )
)
# Lower-case, dot-prefixed extensions that classify_file() maps to
# FileCategory.DIRECT_CONVERT.
DIRECT_CONVERT_EXTENSIONS = frozenset(
    (
        ".csv",
        ".tsv",
        ".html",
        ".htm",
        ".xhtml",
    )
)
class FileCategory(Enum):
    """Categories that ``classify_file`` can assign to a filename.

    Each member's value is its lower-case string label.
    """

    # Suffix found in PLAINTEXT_EXTENSIONS.
    PLAINTEXT = "plaintext"
    # Suffix found in AUDIO_EXTENSIONS.
    AUDIO = "audio"
    # Suffix found in DIRECT_CONVERT_EXTENSIONS.
    DIRECT_CONVERT = "direct_convert"
    # Suffix found in IMAGE_EXTENSIONS.
    IMAGE = "image"
    # No extension set matched; should_skip_for_service() always skips these.
    UNSUPPORTED = "unsupported"
    # Suffix found in DOCUMENT_EXTENSIONS.
    DOCUMENT = "document"
def classify_file(filename: str) -> FileCategory:
    """Return the :class:`FileCategory` for *filename* based on its extension.

    The final suffix is lower-cased and looked up in the extension sets in a
    fixed order: plaintext, audio, direct-convert, image, document. The first
    set containing it decides the category; if none matches the file is
    ``FileCategory.UNSUPPORTED``.

    Bare dotfiles such as ``.gitignore`` or ``.env`` have an empty
    ``PurePosixPath.suffix`` because pathlib treats the whole name as the
    stem. For those, the lower-cased file name itself is checked against
    ``PLAINTEXT_EXTENSIONS`` so the dotfile entries listed there are matched.

    Args:
        filename: A file name or POSIX-style path; only the final path
            component is inspected.

    Returns:
        The matching category, or ``FileCategory.UNSUPPORTED``.
    """
    path = PurePosixPath(filename)
    suffix = path.suffix.lower()

    if not suffix:
        # Without this fallback, ".gitignore", ".env", ".dockerignore" and
        # ".editorconfig" could never match their own set entries. The
        # fallback is limited to the plaintext set so other categories keep
        # their existing behavior.
        if path.name.lower() in PLAINTEXT_EXTENSIONS:
            return FileCategory.PLAINTEXT
        return FileCategory.UNSUPPORTED

    if suffix in PLAINTEXT_EXTENSIONS:
        return FileCategory.PLAINTEXT
    if suffix in AUDIO_EXTENSIONS:
        return FileCategory.AUDIO
    if suffix in DIRECT_CONVERT_EXTENSIONS:
        return FileCategory.DIRECT_CONVERT
    if suffix in IMAGE_EXTENSIONS:
        return FileCategory.IMAGE
    if suffix in DOCUMENT_EXTENSIONS:
        return FileCategory.DOCUMENT
    return FileCategory.UNSUPPORTED
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Decide whether *filename* must be skipped for *etl_service*.

    Unsupported files are always skipped. Plaintext, audio, and
    direct-convert files do not depend on the parser and are never skipped.
    Document and image files are skipped when their suffix is missing from
    the extension set reported for the service (images fall back to the
    document parser when no vision LLM is available, so they are held to the
    same per-service constraint).

    Args:
        filename: A file name or POSIX-style path.
        etl_service: Identifier of the ETL service, or ``None``; passed
            through unchanged to ``get_document_extensions_for_service``.

    Returns:
        ``True`` if the file cannot be processed by the service.
    """
    kind = classify_file(filename)

    if kind == FileCategory.UNSUPPORTED:
        return True

    # Everything except documents and images is parser-agnostic.
    if kind not in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        return False

    extension = PurePosixPath(filename).suffix.lower()
    supported = get_document_extensions_for_service(etl_service)
    return extension not in supported