Optimal Chunk Parameters for Text Splitting
This script calculates a chunk size and overlap for splitting a given text into a target number of chunks, clamping the chunk size to the minimum and maximum size constraints.
unknown
python
a year ago
2.0 kB
5
Indexable
import re

try:
    from langchain.text_splitter import CharacterTextSplitter
except ImportError:
    # langchain is only needed by get_text_chunks. Keep the pure-Python
    # parameter calculation importable without it; get_text_chunks raises
    # a clear ImportError at call time instead.
    CharacterTextSplitter = None

# A sentence boundary is whitespace that follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

# The overlap is targeted at this fraction of the chunk size.
_OVERLAP_RATIO = 0.2


def _clamp(value, lower, upper):
    """Return *value* limited to the range [lower, upper]."""
    return max(lower, min(value, upper))


def get_optimal_chunk_params(
    text, target_chunks=4, min_chunk_size=100, max_chunk_size=1000
):
    """
    Determine optimal chunk size and overlap for text splitting.

    The chunk size starts at ``len(text) // target_chunks``, is clamped to
    ``[min_chunk_size, max_chunk_size]``, rounded to the nearest multiple of
    the average sentence length, and clamped again. The overlap is 20% of
    the chunk size, rounded to the nearest multiple of the average sentence
    length. Both values are truncated to ``int`` and also printed to stdout.

    Args:
        text (str): The input text to be split.
        target_chunks (int): The desired number of chunks (default 4).
        min_chunk_size (int): Minimum chunk size (default 100).
        max_chunk_size (int): Maximum chunk size (default 1000).

    Returns:
        tuple: (chunk_size, chunk_overlap)

    Raises:
        ValueError: If ``target_chunks`` is less than 1.
    """
    if target_chunks < 1:
        raise ValueError(f"target_chunks must be >= 1, got {target_chunks!r}")

    # re.split leaves an empty trailing piece when the text ends with
    # whitespace after sentence punctuation (and [''] for empty text).
    # Those are not sentences and must not drag the average down.
    sentences = [s for s in _SENTENCE_BOUNDARY.split(text) if s]

    chunk_size = _clamp(
        len(text) // target_chunks, min_chunk_size, max_chunk_size
    )

    if sentences:
        avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
        # Snap to a whole number of average-length sentences, then re-clamp
        # because the snapping can leave the allowed range.
        chunk_size = _clamp(
            round(chunk_size / avg_sentence_length) * avg_sentence_length,
            min_chunk_size,
            max_chunk_size,
        )
        chunk_overlap = (
            round((_OVERLAP_RATIO * chunk_size) / avg_sentence_length)
            * avg_sentence_length
        )
    else:
        # Empty text: there is no sentence length to align to, so fall back
        # to the plain ratio instead of dividing by zero.
        chunk_overlap = _OVERLAP_RATIO * chunk_size

    # Kept from the original implementation: callers see the chosen
    # parameters on stdout.
    print(int(chunk_size), int(chunk_overlap))
    return int(chunk_size), int(chunk_overlap)


def get_text_chunks(text, chunk_size=250, chunk_overlap=25):
    """
    Split text with langchain's ``CharacterTextSplitter``.

    The splitter is configured with a newline separator and ``len`` as the
    length function.

    Args:
        text (str): The input text to be split.
        chunk_size (int): Chunk size passed to the splitter (default 250).
        chunk_overlap (int): Chunk overlap passed to the splitter
            (default 25).

    Returns:
        The result of ``CharacterTextSplitter.split_text(text)``.

    Raises:
        ImportError: If langchain is not installed.
    """
    if CharacterTextSplitter is None:
        raise ImportError(
            "langchain is required for get_text_chunks "
            "(could not import langchain.text_splitter.CharacterTextSplitter)"
        )

    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator="\n",
        length_function=len,
    )
    return splitter.split_text(text)


def activeSplitChuncks(sample_text):
    """
    Split sample_text using computed parameters and print every chunk.

    The chunk size and overlap come from ``get_optimal_chunk_params`` with
    its default settings; the chunks come from ``get_text_chunks``. Prints
    the number of chunks followed by each chunk, numbered from 1.

    Args:
        sample_text (str): The text to split and print.
    """
    chunk_size, chunk_overlap = get_optimal_chunk_params(sample_text)
    text_chunks = get_text_chunks(sample_text, chunk_size, chunk_overlap)

    print(f"Number of chunks: {len(text_chunks)}")
    for i, chunk in enumerate(text_chunks, 1):
        print(f"\nChunk {i}:")
        print(chunk)
Editor is loading...
Leave a Comment