Implement logging strategy (#444)

* Add a logging strategy and convert all print() calls to logging invocations
2026-06-20 04:08:06 +02:00 · 2025-07-30 23:18:38 +01:00 · 2025-07-30 23:18:38 +01:00 · dd70aade11
commit dd70aade11
parent 3e0651222b
117 changed files with 1216 additions and 667 deletions
--- a/trustgraph-ocr/trustgraph/decoding/ocr/pdf_decoder.py
+++ b/trustgraph-ocr/trustgraph/decoding/ocr/pdf_decoder.py
@ -6,12 +6,16 @@ PDF document as text as separate output objects.

 import tempfile
 import base64
+import logging
 import pytesseract
 from pdf2image import convert_from_bytes

 from ... schema import Document, TextDocument, Metadata
 from ... base import FlowProcessor, ConsumerSpec, ProducerSpec

+# Module logger
+logger = logging.getLogger(__name__)
+
 default_ident = "pdf-decoder"

 class Processor(FlowProcessor):
@ -41,15 +45,15 @@ class Processor(FlowProcessor):
            )
        )

-        print("PDF OCR inited")
+        logger.info("PDF OCR processor initialized")

    async def on_message(self, msg, consumer, flow):

-        print("PDF message received", flush=True)
+        logger.info("PDF message received")

        v = msg.value()

-        print(f"Decoding {v.metadata.id}...", flush=True)
+        logger.info(f"Decoding {v.metadata.id}...")

        blob = base64.b64decode(v.data)

@ -60,7 +64,7 @@ class Processor(FlowProcessor):
            try:
                text = pytesseract.image_to_string(page, lang='eng')
            except Exception as e:
-                print(f"Page did not OCR: {e}")
+                logger.warning(f"Page did not OCR: {e}")
                continue

            r = TextDocument(
@ -70,7 +74,7 @@ class Processor(FlowProcessor):

            await flow("output").send(r)

-        print("Done.", flush=True)
+        logger.info("PDF decoding complete")

    @staticmethod
    def add_args(parser):