Fix issue #821: defer optional SDK imports to runtime in provider modules (#828)

This commit is contained in:
Syed Ishmum Ahnaf 2026-04-18 16:14:52 +06:00 committed by GitHub
parent 290922858f
commit b341bf5ea1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 39 additions and 31 deletions

View file

@@ -5,7 +5,6 @@ as text as separate output objects.
"""
import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Metadata, Triples
@@ -42,6 +41,9 @@ class Processor(ChunkingService):
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
from langchain_text_splitters import RecursiveCharacterTextSplitter
self.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
@@ -50,7 +52,7 @@ class Processor(ChunkingService):
2500, 4000, 6400, 10000, 16000]
)
self.text_splitter = RecursiveCharacterTextSplitter(
self.text_splitter = self.RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
@@ -103,7 +105,7 @@ class Processor(ChunkingService):
chunk_overlap = int(chunk_overlap)
# Create text splitter with effective parameters
text_splitter = RecursiveCharacterTextSplitter(
text_splitter = self.RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,

View file

@@ -5,7 +5,6 @@ as text as separate output objects.
"""
import logging
from langchain_text_splitters import TokenTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Metadata, Triples
@@ -42,6 +41,9 @@ class Processor(ChunkingService):
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
from langchain_text_splitters import TokenTextSplitter
self.TokenTextSplitter = TokenTextSplitter
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
@@ -50,7 +52,7 @@ class Processor(ChunkingService):
2500, 4000, 6400, 10000, 16000]
)
self.text_splitter = TokenTextSplitter(
self.text_splitter = self.TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
@@ -102,7 +104,7 @@ class Processor(ChunkingService):
chunk_overlap = int(chunk_overlap)
# Create text splitter with effective parameters
text_splitter = TokenTextSplitter(
text_splitter = self.TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,

View file

@@ -11,13 +11,10 @@ import os
import tempfile
import base64
import logging
from langchain_community.document_loaders import PyPDFLoader
from ... schema import Document, TextDocument, Metadata
from ... schema import librarian_request_queue, librarian_response_queue
from ... schema import Triples
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
from ... provenance import (
document_uri, page_uri as make_page_uri, derived_entity_triples,
set_graph, GRAPH_SOURCE,
@@ -131,6 +128,7 @@ class Processor(FlowProcessor):
fp.write(base64.b64decode(v.data))
fp.close()
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(temp_path)
pages = loader.load()