Untitled
unknown
plain_text
a year ago
4.0 kB
8
Indexable
import os
import re
import shutil
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from google.colab import files
# Function to extract information and return a new file name
def generate_new_file_name(text):
    """Build a PDF file name from identifiers found in OCR'd *text*.

    Three pieces of information are searched for:

    * an order number such as ``1234-56AB`` (four digits, a hyphen, two
      digits, then one or more letters),
    * a despatch note such as ``D12154`` (the letter ``D`` followed by digits),
    * a date written as ``DD Mon YYYY`` (for example ``05 Jan 2024``).

    Args:
        text: The text to scan, typically OCR output for one page.

    Returns:
        A string of the form
        ``"<YYYY.MM.DD> - <order number> - <despatch note>.pdf"``.
        A missing order number or despatch note is replaced by
        ``"Not_found"``; a missing or unrecognisable date is replaced by
        ``"date_not_found"``.
    """
    order_number_match = re.search(r'\b\d{4}-\d{2}[A-Za-z]+\b', text)
    despatch_note_match = re.search(r'\bD\d+\b', text)

    order_number = order_number_match.group(0) if order_number_match else "Not_found"
    despatch_note = despatch_note_match.group(0) if despatch_note_match else "Not_found"

    month_numbers = {
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
        "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
        "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
    }

    # Convert the date from DD Mon YYYY to YYYY.MM.DD.
    # The month token is restricted to letters and looked up case-insensitively,
    # and candidates whose "month" is not a real abbreviation are skipped, so
    # OCR noise such as "12 abc 2024" or "05 JAN 2024" can no longer raise a
    # KeyError.
    formatted_date = "date_not_found"
    for date_match in re.finditer(r'\b(\d{2}) ([A-Za-z]{3}) (\d{4})\b', text):
        day, month, year = date_match.groups()
        month_number = month_numbers.get(month.capitalize())
        if month_number is not None:
            formatted_date = f"{year}.{month_number}.{day}"
            break

    return f"{formatted_date} - {order_number} - {despatch_note}.pdf"
# Function to check if the file name already exists and modify it if necessary
def get_unique_file_name(base_dir, new_file_name):
    """Return a file name that does not yet exist inside *base_dir*.

    If ``new_file_name`` is free it is returned unchanged. Otherwise a
    numeric suffix is inserted before the extension -- ``"name (1).ext"``,
    ``"name (2).ext"``, ... -- and the first name that is not present on
    disk is returned.

    Args:
        base_dir: Directory in which the name must be unique.
        new_file_name: The preferred file name (not a full path).

    Returns:
        The chosen file name (not a full path).
    """
    stem, suffix = os.path.splitext(new_file_name)
    candidate = new_file_name
    attempt = 0
    while True:
        if not os.path.exists(os.path.join(base_dir, candidate)):
            return candidate
        # Name is taken: try the next numbered variant.
        attempt += 1
        candidate = f"{stem} ({attempt}){suffix}"
# Function to process each page
def process_and_save_page(image, page_number, base_dir):
    """OCR one page image and save it as a single-page PDF named after its content.

    The page is written to a temporary JPEG in *base_dir*, OCR'd with
    Tesseract (English), and the recognised text is passed to
    ``generate_new_file_name``. The resulting name is made unique within
    *base_dir* via ``get_unique_file_name`` and the page is saved under it
    as a PDF.

    Args:
        image: The page image; it must provide a PIL-style ``save`` method.
        page_number: Index of the page, used only to name the temporary file.
        base_dir: Directory that receives the temporary JPEG and the final PDF.

    Returns:
        The full path of the PDF that was written.
    """
    temp_img_path = os.path.join(base_dir, f"temp_page_{page_number}.jpg")
    try:
        # Save the image temporarily and OCR the saved copy.
        image.save(temp_img_path, 'JPEG')
        # The context manager closes the file handle that Image.open holds,
        # which the previous bare Image.open(...) call leaked.
        with Image.open(temp_img_path) as ocr_image:
            text = pytesseract.image_to_string(ocr_image, lang='eng')
    finally:
        # Always remove the temporary image, even when saving or OCR fails,
        # so stray temp_page_*.jpg files do not accumulate in base_dir.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    # Derive the file name from the extracted text and make sure it is unique.
    new_file_name = generate_new_file_name(text)
    unique_file_name = get_unique_file_name(base_dir, new_file_name)

    # Convert the image back to PDF and save it under the new name.
    # NOTE(review): resolution=25 is kept from the original; confirm that it
    # yields the intended PDF page size for the rendered page images.
    new_file_path = os.path.join(base_dir, unique_file_name)
    image.save(new_file_path, 'PDF', resolution=25)
    return new_file_path
# 📂 Find the most recent PDF file in /content/
pdf_dir = "/content/"
# Match the extension case-insensitively (so "SCAN.PDF" is found too) and
# ignore directories that merely happen to end in ".pdf".
pdf_files = [
    f for f in os.listdir(pdf_dir)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_dir, f))
]
pdf_paths = [os.path.join(pdf_dir, f) for f in pdf_files]

# Get the most recent PDF by checking modification time
if not pdf_paths:
    print("❌ No PDF files found in the /content/ directory.")
    # Raise SystemExit instead of calling the interactive exit() helper:
    # exit() is injected by the site module for interactive use and is not a
    # dependable way to stop a program. A non-zero status marks the failure.
    raise SystemExit(1)

most_recent_pdf = max(pdf_paths, key=os.path.getmtime)
pdf_path = most_recent_pdf
print(f"📄 Processing most recent file: {pdf_path}")

# 📁 Create a folder for processed PDFs
base_dir = "/content/processed_pdfs"
os.makedirs(base_dir, exist_ok=True)

# 📄 Convert PDF to images (one image per page, rendered at 400 DPI)
pages = convert_from_path(pdf_path, 400)

# Process each page: OCR it and save it as its own renamed PDF
for page_number, page_image in enumerate(pages):
    new_file_path = process_and_save_page(page_image, page_number, base_dir)
    print(f"✅ Saved as: {new_file_path}")

# 📦 Zip and Download Processed PDFs
# make_archive appends ".zip" itself and writes the archive relative to the
# current working directory, which is where files.download looks for it.
shutil.make_archive("Processed_PDFs", 'zip', base_dir)
files.download("Processed_PDFs.zip")
print("✅ Processing complete! Download the ZIP file.")
Editor is loading...
Leave a Comment