# Origin: commit 2824410be225e43d5b22335776ca009c8c1ae2d1
# ("feat: add plaintext parser to ETL pipeline for reading text files"),
# file surfsense_backend/app/etl_pipeline/parsers/plaintext.py.
def read_plaintext(file_path: str) -> str:
    """Read a text file and return its full contents as a string.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    stripped so it does not leak into the returned text. Bytes that are not
    valid UTF-8 are replaced with U+FFFD instead of raising, so decoding is
    best-effort. Line endings are translated by Python's default text mode.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents of the file.

    Raises:
        ValueError: If the decoded text contains a null character, which is
            treated as a sign that a binary file was opened as text.
        OSError: Propagated from ``open`` if the file cannot be opened.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM;
    # files without a BOM are read unchanged.
    with open(file_path, encoding="utf-8-sig", errors="replace") as f:
        content = f.read()

    # errors="replace" never fails on binary input, so a null character is
    # the only remaining signal that this was not really a text file.
    if "\x00" in content:
        raise ValueError(
            f"File contains null bytes — likely a binary file opened as text: {file_path}"
        )
    return content