From 5c4a1ce0925afc602b55015244d51705bd628a1a Mon Sep 17 00:00:00 2001 From: Cyber MacGeddon Date: Tue, 16 Jul 2024 19:34:43 +0100 Subject: [PATCH] Configuration of chunk size / overlap --- trustgraph/chunker/recursive/chunker.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/trustgraph/chunker/recursive/chunker.py b/trustgraph/chunker/recursive/chunker.py index f1389186..16e3c992 100755 --- a/trustgraph/chunker/recursive/chunker.py +++ b/trustgraph/chunker/recursive/chunker.py @@ -30,6 +30,8 @@ class Processor: output_queue=default_output_queue, subscriber=default_subscriber, log_level=LogLevel.INFO, + chunk_size=2000, + chunk_overlap=100, ): self.client = None @@ -50,8 +52,8 @@ class Processor: ) self.text_splitter = RecursiveCharacterTextSplitter( - chunk_size=1000, - chunk_overlap=20, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False, ) @@ -146,6 +148,20 @@ def run(): help=f'Output queue (default: info)' ) + parser.add_argument( + '-z', '--chunk-size', + type=int, + default=2000, + help=f'Chunk size (default: 2000)' + ) + + parser.add_argument( + '-v', '--chunk-overlap', + type=int, + default=100, + help=f'Chunk overlap (default: 100)' + ) + args = parser.parse_args() @@ -159,6 +175,8 @@ def run(): output_queue=args.output_queue, subscriber=args.subscriber, log_level=args.log_level, + chunk_size=args.chunk_size, + chunk_overlap=args.chunk_overlap, ) p.run()