release/v1.4 -> master (#548)

This commit is contained in:
cybermaggedon 2025-10-06 17:54:26 +01:00 committed by GitHub
parent 3ec2cd54f9
commit 2bd68ed7f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
94 changed files with 8571 additions and 1740 deletions

View file

@@ -9,14 +9,14 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
# Module logger
logger = logging.getLogger(__name__)
default_ident = "chunker"
class Processor(FlowProcessor):
class Processor(ChunkingService):
def __init__(self, **params):
@@ -28,6 +28,10 @@ class Processor(FlowProcessor):
**params | { "id": id }
)
# Store default values for parameter override
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
@@ -65,7 +69,22 @@ class Processor(FlowProcessor):
v = msg.value()
logger.info(f"Chunking document {v.metadata.id}...")
texts = self.text_splitter.create_documents(
# Extract chunk parameters from flow (allows runtime override)
chunk_size, chunk_overlap = await self.chunk_document(
msg, consumer, flow,
self.default_chunk_size,
self.default_chunk_overlap
)
# Create text splitter with effective parameters
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
is_separator_regex=False,
)
texts = text_splitter.create_documents(
[v.text.decode("utf-8")]
)
@@ -89,7 +108,7 @@ class Processor(FlowProcessor):
@staticmethod
def add_args(parser):
FlowProcessor.add_args(parser)
ChunkingService.add_args(parser)
parser.add_argument(
'-z', '--chunk-size',

View file

@@ -9,14 +9,14 @@ from langchain_text_splitters import TokenTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
from ... base import ChunkingService, ConsumerSpec, ProducerSpec
# Module logger
logger = logging.getLogger(__name__)
default_ident = "chunker"
class Processor(FlowProcessor):
class Processor(ChunkingService):
def __init__(self, **params):
@@ -28,6 +28,10 @@ class Processor(FlowProcessor):
**params | { "id": id }
)
# Store default values for parameter override
self.default_chunk_size = chunk_size
self.default_chunk_overlap = chunk_overlap
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
@@ -64,7 +68,21 @@ class Processor(FlowProcessor):
v = msg.value()
logger.info(f"Chunking document {v.metadata.id}...")
texts = self.text_splitter.create_documents(
# Extract chunk parameters from flow (allows runtime override)
chunk_size, chunk_overlap = await self.chunk_document(
msg, consumer, flow,
self.default_chunk_size,
self.default_chunk_overlap
)
# Create text splitter with effective parameters
text_splitter = TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
texts = text_splitter.create_documents(
[v.text.decode("utf-8")]
)
@@ -88,7 +106,7 @@ class Processor(FlowProcessor):
@staticmethod
def add_args(parser):
FlowProcessor.add_args(parser)
ChunkingService.add_args(parser)
parser.add_argument(
'-z', '--chunk-size',