# Untitled

import os
import re
import shutil
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from google.colab import files

# Function to extract information and return a new file name
def generate_new_file_name(text):
    """Build a PDF file name from identifiers found in OCR'd page text.

    The name has the form ``"<date> - <order number> - <despatch note>.pdf"``.

    Args:
        text: Text to search (the caller passes OCR output for one page).

    Returns:
        The new file name. Any component that cannot be found is replaced
        by a placeholder: ``"date_not_found"`` for the date and
        ``"Not_found"`` for the order number or despatch note.
    """
    # Define regex patterns
    order_number_pattern = r'\b\d{4}-\d{2}[A-Za-z]+\b'  # Order number pattern
    despatch_note_pattern = r'\bD\d+\b'  # Example: D12154
    date_pattern = r'\b(\d{2}) (\w{3}) (\d{4})\b'  # DD Mon YYYY format

    month_numbers = {
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
        "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"
    }

    # Search for matches
    order_number_match = re.search(order_number_pattern, text)
    despatch_note_match = re.search(despatch_note_pattern, text)

    # Extract matches
    order_number = order_number_match.group(0) if order_number_match else "Not_found"
    despatch_note = despatch_note_match.group(0) if despatch_note_match else "Not_found"

    # Convert date format from DD Mon YYYY to YYYY.MM.DD.
    # The date pattern is loose (\w{3} also matches digits, e.g. "12 345 6789"),
    # so walk the candidates and take the first one whose middle token is a
    # real month abbreviation instead of failing on an unknown month.
    formatted_date = "date_not_found"
    for date_match in re.finditer(date_pattern, text):
        day, month, year = date_match.groups()
        # capitalize() lets "JAN" / "jan" resolve to the "Jan" key.
        month_number = month_numbers.get(month.capitalize())
        if month_number is not None:
            formatted_date = f"{year}.{month_number}.{day}"
            break

    # Create new file name
    return f"{formatted_date} - {order_number} - {despatch_note}.pdf"

# Function to check if the file name already exists and modify it if necessary
def get_unique_file_name(base_dir, new_file_name):
    """Return a file name that does not yet exist inside *base_dir*.

    Args:
        base_dir: Directory in which the name must be unused.
        new_file_name: Preferred file name, including its extension.

    Returns:
        *new_file_name* unchanged when no such path exists in *base_dir*;
        otherwise the name with " (1)", " (2)", ... inserted before the
        extension, using the first number that is not already taken.
    """
    stem, ext = os.path.splitext(new_file_name)
    candidate = new_file_name
    suffix = 0
    while True:
        # First candidate that is not on disk wins.
        if not os.path.exists(os.path.join(base_dir, candidate)):
            return candidate
        suffix += 1
        candidate = f"{stem} ({suffix}){ext}"

# Function to process each page
def process_and_save_page(image, page_number, base_dir):
    """OCR one page image and save it as a PDF named after the extracted text.

    Args:
        image: Page image; must support ``save(path, format, ...)``
            (presumably a PIL image from ``convert_from_path`` -- verify
            against caller).
        page_number: Page index, used only to name the temporary JPEG.
        base_dir: Directory that receives both the temporary JPEG and the
            final PDF.

    Returns:
        Full path of the PDF that was written.
    """
    # Save the image temporarily
    temp_img_path = os.path.join(base_dir, f"temp_page_{page_number}.jpg")
    try:
        image.save(temp_img_path, 'JPEG')

        # Perform OCR to extract text. The context manager closes the file
        # handle that Image.open() keeps on the temporary JPEG.
        with Image.open(temp_img_path) as temp_image:
            text = pytesseract.image_to_string(temp_image, lang='eng')
    finally:
        # Cleanup the temporary image file even when saving or OCR fails,
        # so a stray temp_page_*.jpg is not left in the output folder.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    # Generate new file name based on extracted text
    new_file_name = generate_new_file_name(text)

    # Ensure the file name is unique
    unique_file_name = get_unique_file_name(base_dir, new_file_name)

    # Convert the image back to PDF and save with new name
    new_file_path = os.path.join(base_dir, unique_file_name)
    # NOTE(review): resolution=25 is kept as-is; confirm it is intended
    # given the DPI the pages are rendered at by the caller.
    image.save(new_file_path, 'PDF', resolution=25)

    return new_file_path

# 📂 Find the most recent PDF file in /content/
pdf_dir = "/content/"
# Match the extension case-insensitively (so "SCAN.PDF" is picked up) and
# ignore directories whose name happens to end in ".pdf".
pdf_files = [
    f for f in os.listdir(pdf_dir)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_dir, f))
]
pdf_paths = [os.path.join(pdf_dir, f) for f in pdf_files]

# Get the most recent PDF by checking modification time
if pdf_paths:
    most_recent_pdf = max(pdf_paths, key=os.path.getmtime)
    pdf_path = most_recent_pdf
    print(f"📄 Processing most recent file: {pdf_path}")
else:
    print("❌ No PDF files found in the /content/ directory.")
    # exit() is an interactive helper installed by the `site` module;
    # raising SystemExit is the supported way to stop a program.
    raise SystemExit

# 📁 Create a folder for processed PDFs
base_dir = "/content/processed_pdfs"
os.makedirs(base_dir, exist_ok=True)

# 📄 Convert PDF to images (second argument is the render DPI)
pages = convert_from_path(pdf_path, 400)

# Process each page; each page becomes its own renamed PDF in base_dir
for page_number, page_image in enumerate(pages):
    new_file_path = process_and_save_page(page_image, page_number, base_dir)
    print(f"✅ Saved as: {new_file_path}")

# 📦 Zip and Download Processed PDFs
# NOTE(review): the archive contains everything in base_dir, including
# files left there by earlier runs -- confirm that is intended.
shutil.make_archive("Processed_PDFs", 'zip', base_dir)
files.download("Processed_PDFs.zip")
print("✅ Processing complete! Download the ZIP file.")
# Editor is loading...