Optimal Chunk Parameters for Text Splitting
This script calculates the optimal chunk size and overlap for splitting a given text into a specified number of chunks, ensuring each chunk respects the minimum and maximum size constraints.
unknown
python
a year ago
2.0 kB
8
Indexable
import re
from langchain.text_splitter import CharacterTextSplitter
def get_optimal_chunk_params(text, target_chunks=4, min_chunk_size=100, max_chunk_size=1000):
    """
    Determine optimal chunk size and overlap for text splitting.

    The chunk size starts at ``len(text) // target_chunks``, is clamped to
    ``[min_chunk_size, max_chunk_size]``, snapped to a whole multiple of the
    average sentence length, and clamped again. The overlap is 20% of the
    chunk size, likewise snapped to a multiple of the average sentence
    length. For empty text (average sentence length 0) the sentence
    alignment is skipped and the overlap is simply 20% of the chunk size.

    The resulting pair is also printed to stdout.

    Args:
        text (str): The input text to be split.
        target_chunks (int): The desired number of chunks (default 4).
            Must be non-zero.
        min_chunk_size (int): Minimum chunk size (default 100).
        max_chunk_size (int): Maximum chunk size (default 1000).

    Returns:
        tuple: (chunk_size, chunk_overlap), both truncated to int.

    Raises:
        ZeroDivisionError: If target_chunks is 0.
    """
    # Calculate text statistics. Sentences are delimited by whitespace that
    # follows '.', '!' or '?'; re.split always returns at least one piece.
    total_length = len(text)
    sentences = re.split(r'(?<=[.!?])\s+', text)
    avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)

    # Initial chunk size: an even share of the text, kept within bounds.
    chunk_size = max(min_chunk_size, min(total_length // target_chunks, max_chunk_size))

    if avg_sentence_length > 0:
        # Snap to a multiple of the average sentence length so chunks tend
        # to hold whole sentences, then re-apply the bounds because the
        # rounding may have pushed the value outside them.
        chunk_size = round(chunk_size / avg_sentence_length) * avg_sentence_length
        chunk_size = max(min_chunk_size, min(chunk_size, max_chunk_size))
        # Overlap: 20% of the chunk size, rounded to whole sentences.
        chunk_overlap = round((0.2 * chunk_size) / avg_sentence_length) * avg_sentence_length
    else:
        # Empty text yields a single empty "sentence"; dividing by its zero
        # length would raise, so fall back to an unaligned 20% overlap.
        chunk_overlap = round(0.2 * chunk_size)

    chunk_size, chunk_overlap = int(chunk_size), int(chunk_overlap)
    print(chunk_size, chunk_overlap)
    return chunk_size, chunk_overlap
def get_text_chunks(text, chunk_size=250, chunk_overlap=25):
    """
    Split text into chunks with a newline-separated CharacterTextSplitter.

    Args:
        text (str): The text to split.
        chunk_size (int): Chunk size handed to the splitter (default 250).
        chunk_overlap (int): Chunk overlap handed to the splitter (default 25).

    Returns:
        The result of ``CharacterTextSplitter.split_text(text)``
        (presumably a list of strings; confirm against the langchain API).
    """
    # Lengths are measured with the built-in len, i.e. in characters.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separator": "\n",
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def activeSplitChuncks(sample_text):
    """
    Split sample_text with automatically chosen parameters and print the chunks.

    The chunk size and overlap come from get_optimal_chunk_params (called
    with its default settings); the text is then split by get_text_chunks.
    The number of chunks and each chunk, numbered from 1, are written to
    stdout.

    Args:
        sample_text (str): The text to split and display.

    Returns:
        None
    """
    chunk_size, chunk_overlap = get_optimal_chunk_params(sample_text)
    text_chunks = get_text_chunks(sample_text, chunk_size, chunk_overlap)
    print(f"Number of chunks: {len(text_chunks)}")
    for i, chunk in enumerate(text_chunks, 1):
        print(f"\nChunk {i}:")
        print(chunk)
Leave a Comment