mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-28 18:36:23 +02:00
feat: implement ETL pipeline with file classification and extraction services
This commit is contained in:
parent
9c0af6569d
commit
5d22349dc1
6 changed files with 188 additions and 0 deletions
21
surfsense_backend/app/etl_pipeline/etl_document.py
Normal file
21
surfsense_backend/app/etl_pipeline/etl_document.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from pydantic import BaseModel, field_validator
|
||||
|
||||
|
||||
class EtlRequest(BaseModel):
    """Request model carrying a file path, a filename and an estimated page count.

    ``estimated_pages`` defaults to 0 when the caller does not supply it.
    """

    file_path: str
    filename: str
    estimated_pages: int = 0

    @field_validator("filename")
    @classmethod
    def filename_must_not_be_empty(cls, v: str) -> str:
        """Reject a filename that is empty or consists only of whitespace.

        The value is returned unchanged (it is not stripped) when it passes.

        Raises:
            ValueError: if ``v`` has no non-whitespace characters.
        """
        # Guard-clause form: the happy path returns first, failure falls through.
        if v.strip():
            return v
        raise ValueError("filename must not be empty")
|
||||
|
||||
|
||||
class EtlResult(BaseModel):
    """Result model holding markdown content plus metadata about how it was produced.

    ``actual_pages`` defaults to 0; the other three fields are required.
    """

    # Field order is kept as declared; it determines the model's field ordering.
    markdown_content: str
    etl_service: str
    actual_pages: int = 0
    content_type: str
|
||||
Loading…
Add table
Add a link
Reference in a new issue