Untitled
"""Benchmark per-page masked-LM processing over OCR'd page JSON files.

Reads every ``page*.json`` file in ``SAMPLE_CASE_BASE_DIR``, joins each
page's ``textLines`` into one string, runs it through ModernBERT, and
reports per-page word counts and timings plus an overall summary.
"""

from transformers import AutoTokenizer, AutoModelForMaskedLM
from dotenv import load_dotenv
import os
import json
import time
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

load_dotenv()

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model.eval()  # inference only — disable dropout etc.

parent_folder = os.environ.get("SAMPLE_CASE_BASE_DIR")


def process_page(file):
    """Load one page JSON, run its text through the model, and report timing.

    Args:
        file: Filename (relative to ``parent_folder``) of a page JSON file
            expected to contain a ``"textLines"`` list of strings.

    Returns:
        Tuple ``(word_count, page_time)`` — whitespace-delimited word count
        of the page text and wall-clock seconds spent on this page.
    """
    page_start = time.time()

    with open(os.path.join(parent_folder, file), "r", encoding="utf-8") as f:
        page_data = json.load(f)

    seq = "".join(page_data["textLines"])
    word_count = len(seq.split())

    # no_grad: this is pure inference — skip building the autograd graph.
    # truncation guards against pages longer than the model's max length,
    # which would otherwise crash the whole run.
    with torch.no_grad():
        tokens = tokenizer(seq, return_tensors="pt", truncation=True)
        model(**tokens)

    page_time = time.time() - page_start
    print(f"Page {file}:")
    print(f"Word count: {word_count}")
    print(f"Processing time: {page_time:.2f} seconds")
    print("---")
    return word_count, page_time


def main():
    """Fan page processing out over a thread pool and print a summary."""
    if parent_folder is None:
        # Fail fast with a clear message instead of a TypeError in os.path.join.
        raise RuntimeError("SAMPLE_CASE_BASE_DIR environment variable is not set")

    total_start_time = time.time()
    total_words = 0
    page_count = 0
    total_page_time = 0

    json_files = [
        f for f in os.listdir(parent_folder)
        if f.startswith("page") and f.endswith(".json")
    ]

    # NOTE: threads only overlap here insofar as torch releases the GIL
    # inside its C kernels; for fully CPU-bound inference a process pool
    # would parallelize better, but that would change memory behavior
    # (one model copy per worker), so threads are kept.
    max_workers = cpu_count()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(process_page, file): file for file in json_files
        }
        for future in as_completed(future_to_file):
            word_count, page_time = future.result()
            total_words += word_count
            total_page_time += page_time
            page_count += 1

    # Guard the averages against an empty input directory.
    avg_words = total_words / page_count if page_count > 0 else 0
    avg_time = total_page_time / page_count if page_count > 0 else 0
    total_time = time.time() - total_start_time

    print("\nSummary:")
    print(f"Average words per page: {avg_words:.1f}")
    print(f"Average time per page: {avg_time:.2f} seconds")
    print(f"Total time taken: {total_time:.2f} seconds")
    print(f"Total pages processed: {page_count}")


if __name__ == "__main__":
    main()
Leave a Comment