import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForMaskedLM
load_dotenv()

model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)
model.eval()  # inference only, no training

parent_folder = os.environ.get("SAMPLE_CASE_BASE_DIR")
if not parent_folder:
    raise RuntimeError("SAMPLE_CASE_BASE_DIR environment variable is not set")
def process_page(file):
    """Load one page's JSON, count its words, and time a forward pass through the model."""
    page_start = time.time()

    with open(os.path.join(parent_folder, file), "r") as f:
        page_data = json.load(f)

    seq = "".join(page_data["textLines"])

    # Count words
    word_count = len(seq.split())

    # Tokenize (truncating to the model's max length) and run inference without gradients
    tokens = tokenizer(seq, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)

    page_time = time.time() - page_start
    print(f"Page {file}:")
    print(f"Word count: {word_count}")
    print(f"Processing time: {page_time:.2f} seconds")
    print("---")

    return word_count, page_time

total_start_time = time.time()
total_words = 0
page_count = 0
total_page_time = 0
# Get list of JSON files
json_files = [f for f in os.listdir(parent_folder) if f.startswith("page") and f.endswith(".json")]
# Use ThreadPoolExecutor for parallel processing
max_workers = cpu_count()
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks
    future_to_file = {executor.submit(process_page, file): file for file in json_files}

    # Process completed tasks
    for future in as_completed(future_to_file):
        word_count, page_time = future.result()
        total_words += word_count
        total_page_time += page_time
        page_count += 1
avg_words = total_words / page_count if page_count > 0 else 0
avg_time = total_page_time / page_count if page_count > 0 else 0
total_time = time.time() - total_start_time
print("\nSummary:")
print(f"Average words per page: {avg_words:.1f}")
print(f"Average time per page: {avg_time:.2f} seconds")
print(f"Total time taken: {total_time:.2f} seconds")
print(f"Total pages processed: {page_count}")