Untitled
"""Benchmark per-page masked-LM processing over OCR'd page JSON files.

Reads every ``page*.json`` file in ``SAMPLE_CASE_BASE_DIR``, joins each
page's ``textLines`` into one string, runs it through ModernBERT, and
reports per-page word counts and timings plus an overall summary.
"""

from transformers import AutoTokenizer, AutoModelForMaskedLM
from dotenv import load_dotenv
import os
import json
import time
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

load_dotenv()

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model.eval()  # inference only — disable dropout etc.

parent_folder = os.environ.get("SAMPLE_CASE_BASE_DIR")


def process_page(file):
    """Load one page JSON, run its text through the model, and report timing.

    Args:
        file: Filename (relative to ``parent_folder``) of a page JSON file
            expected to contain a ``"textLines"`` list of strings.

    Returns:
        Tuple ``(word_count, page_time)`` — whitespace-delimited word count
        of the page text and wall-clock seconds spent on this page.
    """
    page_start = time.time()

    with open(os.path.join(parent_folder, file), "r", encoding="utf-8") as f:
        page_data = json.load(f)

    seq = "".join(page_data["textLines"])
    word_count = len(seq.split())

    # no_grad: this is pure inference — skip building the autograd graph.
    # truncation guards against pages longer than the model's max length,
    # which would otherwise crash the whole run.
    with torch.no_grad():
        tokens = tokenizer(seq, return_tensors="pt", truncation=True)
        model(**tokens)

    page_time = time.time() - page_start
    print(f"Page {file}:")
    print(f"Word count: {word_count}")
    print(f"Processing time: {page_time:.2f} seconds")
    print("---")
    return word_count, page_time


def main():
    """Fan page processing out over a thread pool and print a summary."""
    if parent_folder is None:
        # Fail fast with a clear message instead of a TypeError in os.path.join.
        raise RuntimeError("SAMPLE_CASE_BASE_DIR environment variable is not set")

    total_start_time = time.time()
    total_words = 0
    page_count = 0
    total_page_time = 0

    json_files = [
        f for f in os.listdir(parent_folder)
        if f.startswith("page") and f.endswith(".json")
    ]

    # NOTE: threads only overlap here insofar as torch releases the GIL
    # inside its C kernels; for fully CPU-bound inference a process pool
    # would parallelize better, but that would change memory behavior
    # (one model copy per worker), so threads are kept.
    max_workers = cpu_count()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(process_page, file): file for file in json_files
        }
        for future in as_completed(future_to_file):
            word_count, page_time = future.result()
            total_words += word_count
            total_page_time += page_time
            page_count += 1

    # Guard the averages against an empty input directory.
    avg_words = total_words / page_count if page_count > 0 else 0
    avg_time = total_page_time / page_count if page_count > 0 else 0
    total_time = time.time() - total_start_time

    print("\nSummary:")
    print(f"Average words per page: {avg_words:.1f}")
    print(f"Average time per page: {avg_time:.2f} seconds")
    print(f"Total time taken: {total_time:.2f} seconds")
    print(f"Total pages processed: {page_count}")


if __name__ == "__main__":
    main()
Leave a Comment