Untitled

 avatar
unknown
plain_text
6 days ago
1.1 kB
74
Indexable
import pdfplumber
import pandas as pd
import os

def extract_basic_data(pdf_path):
    full_text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                full_text += page_text + "\n"

    return full_text


print("Tool initialized for data extraction...")

# Folder where main.py is located
current_folder = os.path.dirname(os.path.abspath(__file__))

data = []

# Loop through all files in the folder
for file_name in os.listdir(current_folder):
    if file_name.lower().endswith(".pdf"):
        pdf_path = os.path.join(current_folder, file_name)
        print(f"Extracting data from: {file_name}")

        text = extract_basic_data(pdf_path)

        data.append({
            "PDF File": file_name,
            "Extracted Text": text
        })

# Create Excel file
df = pd.DataFrame(data)
df.to_excel("pdf_extracted_data.xlsx", index=False)

print("Extraction completed! File 'pdf_extracted_data.xlsx' created successfully.")
Editor is loading...
Leave a Comment