# Untitled
# unknown
# plain_text
# a year ago
# 4.2 kB
# 6
# Indexable
import pdfplumber
import pandas as pd
import re
from pathlib import Path
def extract_chargen_nr(lines):
    """Return the batch number found on the line after a "Chargen Nr." header.

    The value is the fourth single-space-separated field of the line that
    immediately follows a line containing "Chargen Nr.".

    Args:
        lines: Text lines of the document, in reading order.

    Returns:
        The field as a string, or None when no header line is followed by a
        line with at least four fields.
    """
    for index, line in enumerate(lines):
        if "Chargen Nr." not in line:
            continue
        # The header may be the very last line, and the value line may be
        # shorter than expected; neither case should raise.
        fields = lines[index + 1].split(" ") if index + 1 < len(lines) else []
        if len(fields) > 3:
            return fields[3]
    return None
def extract_kd_wst_bez(lines):
    """Return the text between "Kd Wst Bez" and "Werkstoff" on the first line that has both.

    Args:
        lines: Text lines to scan, in order.

    Returns:
        The captured text between the two labels (surrounding whitespace
        excluded), or None when no line matches.
    """
    finder = re.compile(r"Kd Wst Bez\s+(.*?)\s+Werkstoff")
    hits = (finder.search(text) for text in lines)
    # Take the first successful search; fall back to None if there is none.
    return next((hit.group(1) for hit in hits if hit), None)
def extract_abmessung(lines):
    """Return the numbers on the first line starting with 'Abmessung' that contains any.

    Numbers are read with '.' as an optional thousands separator and ',' as
    the decimal separator, e.g. "1.250,5" and "1250,5" both give 1250.5.

    Args:
        lines: Text lines to scan, in order.

    Returns:
        A list of floats in the order they appear on the line, or None when
        no 'Abmessung' line contains a number.
    """
    # First alternative: digits grouped by thousands dots. Second: a plain
    # digit run of any length. Without the second alternative a value such as
    # "1200,00" would be split into "120" and "0,00".
    number_re = re.compile(r'\d{1,3}(?:\.\d{3})+(?:,\d+)?|\d+(?:,\d+)?')
    for line in lines:
        if not line.startswith('Abmessung'):
            continue
        tokens = number_re.findall(line)
        if tokens:
            return [
                float(token.replace('.', '').replace(',', '.'))
                for token in tokens
            ]
    return None
def extract_reduction_ratio(lines):
    """Return the comma-decimal number directly preceding ":1" on a 'Verformungsgrad' line.

    Args:
        lines: Text lines to scan, in order.

    Returns:
        The matched number as a string (comma kept as written), taken from
        the first line starting with "Verformungsgrad" that contains such a
        value; None when there is no such line.
    """
    candidates = (entry for entry in lines if entry.startswith("Verformungsgrad"))
    for entry in candidates:
        found = re.search(r'(\d+,\d+)(?=:1)', entry)
        if found is not None:
            return found.group(1)
    return None
def _parse_karbid_values(item, keyword):
    """Parse the '-'-separated numbers that follow the last *keyword* in *item*."""
    tail = item.split(keyword)[-1]
    tokens = (token.strip() for token in tail.split('-'))
    # Accept a decimal comma as well as a decimal point, and skip blank
    # tokens (keyword with no value, or a stray dash) instead of raising.
    return [float(token.replace(',', '.')) for token in tokens if token]


def extract_karbid(lines):
    """Collect the Karbidnetzwerk / Karbidzeiligkeit values listed under "Glühgefüge".

    For every line starting with "Glühgefüge", the four lines beginning two
    lines below it are inspected. On each of them that contains one of the two
    keywords, the '-'-separated numbers after the keyword are appended to the
    matching result list.

    Args:
        lines: Text lines of the document, in reading order.

    Returns:
        A dict with keys 'Karbidnetzwerk' and 'Karbidzeiligkeit', each mapping
        to a list of floats (empty when nothing was found).

    Raises:
        ValueError: If a non-blank token after a keyword is not a number.
    """
    results = {
        'Karbidnetzwerk': [],
        'Karbidzeiligkeit': [],
    }
    for i, line in enumerate(lines):
        if not line.startswith("Glühgefüge"):
            continue
        # The line directly below the heading is skipped; only the next four
        # lines are searched.
        for item in lines[i + 2:i + 6]:
            for keyword, values in results.items():
                if keyword in item:
                    values.extend(_parse_karbid_values(item, keyword))
    return results
def process_pdf(pdf_path):
    """Process a single PDF and return its extracted data as a dictionary.

    The text of all pages is extracted, split into stripped non-empty lines,
    and passed to the individual field extractors.

    Args:
        pdf_path: Location of the PDF; it is handed to ``pdfplumber.open`` and
            its ``name`` attribute is stored as the file name.

    Returns:
        A dict with the keys 'filename', 'chargen_nr', 'kd_wst_bez',
        'reduction_ratio', 'Karbidnetzwerk', 'Karbidzeiligkeit' and
        'Abmessung'.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join pages with a newline so the last line of one page is not fused
        # with the first line of the next. The `or ""` guards pages without
        # text (presumably None in some pdfplumber versions - verify).
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    stripped = (raw.strip() for raw in text.split("\n"))
    lines = [line for line in stripped if line]

    karbid_results = extract_karbid(lines)
    return {
        'filename': pdf_path.name,
        'chargen_nr': extract_chargen_nr(lines),
        'kd_wst_bez': extract_kd_wst_bez(lines),
        'reduction_ratio': extract_reduction_ratio(lines),
        'Karbidnetzwerk': karbid_results['Karbidnetzwerk'],
        'Karbidzeiligkeit': karbid_results['Karbidzeiligkeit'],
        'Abmessung': extract_abmessung(lines),
    }
def process_multiple_pdfs(pdf_folder):
    """Process all PDFs in a folder and return a DataFrame.

    Args:
        pdf_folder: Folder whose ``*.pdf`` files (non-recursive) are processed.

    Returns:
        A DataFrame with one row per successfully processed PDF, in sorted
        path order. The 'Karbidnetzwerk' and 'Karbidzeiligkeit' columns hold
        comma-separated strings instead of lists. Files that fail are reported
        on stdout and left out.
    """
    pdf_folder = Path(pdf_folder)
    all_data = []
    # glob() yields entries in filesystem-dependent order; sorting keeps the
    # row order reproducible between runs and machines.
    for pdf_path in sorted(pdf_folder.glob('*.pdf')):
        try:
            all_data.append(process_pdf(pdf_path))
        except Exception as e:
            # Batch boundary: one unreadable file must not abort the others.
            print(f"Error processing {pdf_path}: {str(e)}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    # Convert Karbid lists to strings with values joined by ','
    for column in ('Karbidnetzwerk', 'Karbidzeiligkeit'):
        if column in df.columns:
            df[column] = df[column].apply(
                lambda x: ','.join(map(str, x)) if isinstance(x, list) else x
            )
    return df
# Example usage:
#   python <this_script> [pdf_folder]
if __name__ == "__main__":
    import sys

    # Folder to scan: the first command-line argument when given, otherwise
    # the built-in default path below.
    default_folder = r"D:\Programming\Python\shaeffler_task\pdf"
    pdf_folder = sys.argv[1] if len(sys.argv) > 1 else default_folder

    # Process all PDFs and create DataFrame
    df = process_multiple_pdfs(pdf_folder)

    # Save to CSV (written to the current working directory)
    df.to_csv("extracted_data.csv", index=False)

    print("Data extraction completed!")
    print(f"Processed {len(df)} PDF files")
    print("\nFirst few rows of the DataFrame:")
    print(df.head())
# Editor is loading...
# Leave a Comment