Untitled
unknown
plain_text
3 months ago
4.0 kB
4
Indexable
# Split the most recent PDF in /content/ into one-page PDFs whose names are
# built from OCR'd order details, then zip the results for download.
# Written to run in Google Colab.
#
# pytesseract, pdf2image, PIL and google.colab are imported inside the
# functions that use them so the pure file-naming helpers below can be
# imported (and tested) without the OCR stack or a Colab runtime.
import os
import re
import shutil

# Month abbreviation -> two-digit month number used in output file names.
_MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

_ORDER_NUMBER_PATTERN = re.compile(r'\b\d{4}-\d{2}[A-Za-z]+\b')
_DESPATCH_NOTE_PATTERN = re.compile(r'\bD\d+\b')  # Example: D12154
# "DD Mon YYYY". Only real month abbreviations are accepted (in any letter
# case) so look-alikes such as "01 234 5678" cannot be mistaken for a date.
_DATE_PATTERN = re.compile(
    r'\b(\d{2}) (' + '|'.join(_MONTH_NUMBERS) + r') (\d{4})\b',
    re.IGNORECASE,
)


def generate_new_file_name(text):
    """Build a PDF file name from the details found in OCR'd page text.

    Args:
        text: Text extracted from one page.

    Returns:
        A name of the form "YYYY.MM.DD - <order number> - <despatch note>.pdf".
        A missing date becomes "date_not_found"; a missing order number or
        despatch note becomes "Not_found".
    """
    order_match = _ORDER_NUMBER_PATTERN.search(text)
    despatch_match = _DESPATCH_NOTE_PATTERN.search(text)
    date_match = _DATE_PATTERN.search(text)

    order_number = order_match.group(0) if order_match else "Not_found"
    despatch_note = despatch_match.group(0) if despatch_match else "Not_found"

    if date_match:
        day, month, year = date_match.groups()
        # Normalise "JAN"/"jan" to the "Jan" key used by the lookup table.
        month_number = _MONTH_NUMBERS[month.capitalize()]
        formatted_date = f"{year}.{month_number}.{day}"
    else:
        formatted_date = "date_not_found"

    return f"{formatted_date} - {order_number} - {despatch_note}.pdf"


def get_unique_file_name(base_dir, new_file_name):
    """Return a file name that does not yet exist in ``base_dir``.

    Args:
        base_dir: Directory the file is going to be written to.
        new_file_name: Preferred file name, including its extension.

    Returns:
        ``new_file_name`` if it is free, otherwise the first free variant of
        the form "<stem> (1)<ext>", "<stem> (2)<ext>", ...
    """
    base_name, extension = os.path.splitext(new_file_name)
    unique_file_name = new_file_name
    counter = 1
    while os.path.exists(os.path.join(base_dir, unique_file_name)):
        unique_file_name = f"{base_name} ({counter}){extension}"
        counter += 1
    return unique_file_name


def process_and_save_page(image, page_number, base_dir):
    """OCR one page image and save it as a PDF named after its contents.

    Args:
        image: PIL image of the page.
        page_number: Page index, used only to name the temporary JPEG.
        base_dir: Directory that receives the temporary JPEG and the PDF.

    Returns:
        Path of the PDF that was written.
    """
    import pytesseract
    from PIL import Image

    temp_img_path = os.path.join(base_dir, f"temp_page_{page_number}.jpg")
    try:
        image.save(temp_img_path, 'JPEG')
        # Close the handle before the file is deleted.
        with Image.open(temp_img_path) as ocr_image:
            text = pytesseract.image_to_string(ocr_image, lang='eng')
    finally:
        # Remove the temporary image even when OCR fails, so it never ends
        # up in the zipped output directory.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    new_file_name = generate_new_file_name(text)
    unique_file_name = get_unique_file_name(base_dir, new_file_name)
    new_file_path = os.path.join(base_dir, unique_file_name)

    # NOTE(review): pages are rendered at 400 DPI by the caller but saved
    # with resolution=25, which makes the PDF page far larger than the
    # scanned sheet. Kept as-is; confirm whether this is intended.
    image.save(new_file_path, 'PDF', resolution=25)
    return new_file_path


def _find_most_recent_pdf(pdf_dir):
    """Return the most recently modified PDF in ``pdf_dir``, or None."""
    pdf_paths = [
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_dir, name))
    ]
    if not pdf_paths:
        return None
    return max(pdf_paths, key=os.path.getmtime)


def main():
    """Split the newest PDF in /content/ into renamed pages and download them."""
    from google.colab import files
    from pdf2image import convert_from_path

    pdf_path = _find_most_recent_pdf("/content/")
    if pdf_path is None:
        print("❌ No PDF files found in the /content/ directory.")
        return
    print(f"📄 Processing most recent file: {pdf_path}")

    # Folder that collects the processed one-page PDFs.
    base_dir = "/content/processed_pdfs"
    os.makedirs(base_dir, exist_ok=True)

    # Render every page of the PDF to an image at 400 DPI.
    pages = convert_from_path(pdf_path, 400)

    for page_number, page_image in enumerate(pages):
        new_file_path = process_and_save_page(page_image, page_number, base_dir)
        print(f"✅ Saved as: {new_file_path}")

    # Zip the output folder and hand it to the browser.
    shutil.make_archive("Processed_PDFs", 'zip', base_dir)
    files.download("Processed_PDFs.zip")
    print("✅ Processing complete! Download the ZIP file.")


if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment