Optimal Chunk Parameters for Text Splitting

This script estimates a chunk size and overlap for splitting a given text into approximately a target number of chunks, keeping the chunk size within the minimum and maximum size constraints.
mail@pastecode.io avatar
unknown
python
9 days ago
2.0 kB
1
Indexable
Never
import re
from langchain.text_splitter import CharacterTextSplitter

def get_optimal_chunk_params(text, target_chunks=4, min_chunk_size=100, max_chunk_size=1000):
    """
    Determine optimal chunk size and overlap for text splitting.

    The chunk size starts as ``len(text) // target_chunks`` clamped to the
    size bounds, is rounded to the nearest multiple of the average sentence
    length, and is then clamped again. The overlap is 20% of the chunk size,
    also rounded to a multiple of the average sentence length. When the
    average sentence length is zero (empty text), the sentence alignment is
    skipped and the plain clamped size and 20% overlap are used instead.

    The resulting pair is also printed to standard output.

    Args:
    text (str): The input text to be split.
    target_chunks (int): The desired number of chunks (default 4).
    min_chunk_size (int): Minimum chunk size (default 100).
    max_chunk_size (int): Maximum chunk size (default 1000).

    Returns:
    tuple: (chunk_size, chunk_overlap), both truncated to int.

    Raises:
    ValueError: If target_chunks is not positive.
    """
    if target_chunks <= 0:
        raise ValueError("target_chunks must be a positive number")

    # Calculate text statistics
    total_length = len(text)
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # re.split always yields at least one piece, so len(sentences) >= 1;
    # the average itself is 0 when the text is empty.
    avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)

    # Calculate initial chunk size
    chunk_size = max(min_chunk_size, min(total_length // target_chunks, max_chunk_size))

    if avg_sentence_length > 0:
        # Adjust chunk size to be a multiple of average sentence length
        chunk_size = round(chunk_size / avg_sentence_length) * avg_sentence_length

        # Rounding may have pushed the size outside the bounds; clamp again
        chunk_size = max(min_chunk_size, min(chunk_size, max_chunk_size))

        # Calculate overlap (20% of chunk size, rounded to nearest sentence)
        chunk_overlap = round((0.2 * chunk_size) / avg_sentence_length) * avg_sentence_length
    else:
        # No sentence length to align to: dividing by it would raise
        # ZeroDivisionError, so fall back to the unaligned 20% overlap.
        chunk_overlap = 0.2 * chunk_size

    print(int(chunk_size), int(chunk_overlap))
    return int(chunk_size), int(chunk_overlap)


def get_text_chunks(text, chunk_size=250, chunk_overlap=25):
    """
    Split text with a CharacterTextSplitter that separates on newlines.

    Args:
    text (str): The input text to be split.
    chunk_size (int): Chunk size handed to the splitter (default 250).
    chunk_overlap (int): Chunk overlap handed to the splitter (default 25).

    Returns:
    The result of the splitter's ``split_text`` call on ``text``.
    """
    # Lengths are measured with the built-in len, i.e. in characters.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separator": "\n",
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

def activeSplitChuncks(sample_text):
    """
    Split sample_text with computed parameters and print the chunks.

    The chunk size and overlap come from get_optimal_chunk_params (called
    with its default settings) and are passed to get_text_chunks. The number
    of chunks is printed first, followed by each chunk under a 1-based
    "Chunk i:" heading.

    Args:
    sample_text (str): The text to split and display.
    """
    size, overlap = get_optimal_chunk_params(sample_text)
    pieces = get_text_chunks(sample_text, size, overlap)

    print(f"Number of chunks: {len(pieces)}")
    for position, piece in enumerate(pieces, start=1):
        print(f"\nChunk {position}:")
        print(piece)
Leave a Comment