# mirror of https://github.com/katanemo/plano.git
# synced 2026-04-30 11:26:27 +02:00
import numpy as np
def split_text_into_chunks(text, max_words=300):
    """Break *text* into whitespace-delimited chunks of at most *max_words* words.

    Word count is used as a rough stand-in for tokenizer tokens: the target
    tokenizer accepts at most 512 tokens, so the conservative 300-word default
    keeps each chunk safely under that limit.

    Returns a list of strings; empty input yields an empty list.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start : start + max_words])
        for start in range(0, len(tokens), max_words)
    ]
def softmax(x):
    """Compute the softmax of *x* along axis 0.

    Subtracts the per-column maximum before exponentiating (the standard
    numerical-stability trick): softmax is shift-invariant, so the result is
    mathematically identical, but np.exp no longer overflows to inf/nan for
    large logits.

    Parameters
    ----------
    x : array_like
        Scores/logits; softmax is normalized over axis 0.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as *x*, non-negative, summing to 1 along axis 0.
    """
    # keepdims=True so the max broadcasts correctly for both 1-D and 2-D input
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)