trustgraph/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py

84 lines
1.9 KiB
Python
Raw Normal View History

2024-07-10 23:20:06 +01:00
"""
Simple decoder, accepts PDF documents on input, outputs pages from the
PDF document as text as separate output objects.
"""
import tempfile
import base64
from langchain_community.document_loaders import PyPDFLoader
2024-07-10 23:20:06 +01:00
from ... schema import Document, TextDocument, Metadata
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
2024-07-10 23:20:06 +01:00
# Identifier used by Processor when no "id" parameter is supplied, and
# passed to Processor.launch() by run().
default_ident = "pdf-decoder"
class Processor(FlowProcessor):
    """Flow processor that turns a PDF document into per-page text messages.

    Registers an "input" consumer of ``Document`` messages, handled by
    :meth:`on_message`, and an "output" producer of ``TextDocument``
    messages.
    """

    def __init__(self, **params):
        """Initialise the processor and register its input/output specs.

        Args:
            **params: Forwarded to ``FlowProcessor.__init__``. If no
                ``"id"`` key is present, ``default_ident`` is used.
        """
        # Named ``ident`` rather than ``id`` so the builtin is not shadowed.
        ident = params.get("id", default_ident)

        super().__init__(**params | {"id": ident})

        self.register_specification(
            ConsumerSpec(
                name="input",
                schema=Document,
                handler=self.on_message,
            )
        )

        self.register_specification(
            ProducerSpec(
                name="output",
                schema=TextDocument,
            )
        )

        print("PDF inited", flush=True)

    async def on_message(self, msg, consumer, flow):
        """Decode one PDF message and send each page's text to "output".

        Args:
            msg: Incoming message; ``msg.value()`` must return an object
                with a ``metadata`` attribute and a base64-encoded
                ``data`` attribute holding the PDF bytes.
            consumer: Not used by this handler.
            flow: Callable; ``flow("output")`` yields the producer that
                each page's ``TextDocument`` is sent to.
        """
        print("PDF message received", flush=True)

        v = msg.value()

        print(f"Decoding {v.metadata.id}...", flush=True)

        # delete_on_close=False keeps the file on disk after close(), so the
        # loader can open it by path; it is removed when the context exits.
        with tempfile.NamedTemporaryFile(delete_on_close=False) as fp:

            fp.write(base64.b64decode(v.data))
            # Close to flush the decoded bytes before the loader reads them.
            fp.close()

            # PyPDFLoader takes the path itself, so no separate open() of
            # the temporary file is needed here.
            loader = PyPDFLoader(fp.name)
            pages = loader.load()

            for ix, page in enumerate(pages):

                print("page", ix, flush=True)

                # Every page carries the metadata of the source document.
                r = TextDocument(
                    metadata=v.metadata,
                    text=page.page_content.encode("utf-8"),
                )

                await flow("output").send(r)

        print("Done.", flush=True)

    @staticmethod
    def add_args(parser):
        """Add command-line arguments by delegating to ``FlowProcessor``.

        Args:
            parser: Argument parser handed on to
                ``FlowProcessor.add_args``.
        """
        FlowProcessor.add_args(parser)
2024-07-10 23:20:06 +01:00
def run() -> None:
    """Entry point: launch the processor.

    Calls ``Processor.launch`` with ``default_ident`` and this module's
    docstring.
    """
    Processor.launch(default_ident, __doc__)
2024-07-10 23:20:06 +01:00