Untitled
unknown
plain_text
6 months ago
4.2 kB
3
Indexable
"""Extract selected fields from the text of PDF files and save them as CSV.

Each PDF is converted to a list of stripped, non-empty text lines; small
helper functions then pick individual fields out of those lines.
"""

import re
import sys
from pathlib import Path

import pandas as pd

# NOTE: pdfplumber is imported inside process_pdf() so that the pure
# text-parsing helpers below can be imported (and unit-tested) without the
# PDF dependency being installed.

# Folder used when the script is started without a command-line argument.
DEFAULT_PDF_FOLDER = r"D:\Programming\Python\shaeffler_task\pdf"

# Patterns are compiled once instead of once per scanned line.
_KD_WST_BEZ_RE = re.compile(r"Kd Wst Bez\s+(.*?)\s+Werkstoff")
# Either a dot-grouped number ("1.250,75") or a plain digit run ("1500",
# "200,5"). The plain alternative keeps "1500" from being split into
# "150" and "0", which a bare \d{1,3} prefix would do.
_GERMAN_NUMBER_RE = re.compile(r'\d{1,3}(?:\.\d{3})+(?:,\d+)?|\d+(?:,\d+)?')
_REDUCTION_RATIO_RE = re.compile(r'(\d+,\d+)(?=:1)')

# Keywords collected by extract_karbid(), in the order they are checked.
_KARBID_KEYS = ('Karbidnetzwerk', 'Karbidzeiligkeit')


def extract_chargen_nr(lines):
    """Return the 4th space-separated token of the line after "Chargen Nr.".

    Only the first line containing "Chargen Nr." is considered.

    Args:
        lines: List of text lines.

    Returns:
        The token as a string, or None when the header is missing, is the
        last line, or the following line has fewer than four tokens.
    """
    for i, line in enumerate(lines):
        if "Chargen Nr." in line:
            if i + 1 >= len(lines):
                return None
            # split(" ") (not split()) is kept so token positions stay the
            # same as before for lines that already parsed correctly.
            tokens = lines[i + 1].split(" ")
            return tokens[3] if len(tokens) > 3 else None
    return None


def extract_kd_wst_bez(lines):
    """Return the text between "Kd Wst Bez" and "Werkstoff" on one line.

    Args:
        lines: List of text lines.

    Returns:
        The captured text from the first matching line, or None.
    """
    for line in lines:
        match = _KD_WST_BEZ_RE.search(line)
        if match:
            return match.group(1)
    return None


def extract_abmessung(lines):
    """Return all numbers found on the first usable "Abmessung" line.

    Numbers are read with '.' as thousands separator and ',' as decimal
    separator. Lines that start with "Abmessung" but contain no number
    are skipped.

    Args:
        lines: List of text lines.

    Returns:
        A list of floats, or None when no such line contains a number.
    """
    for line in lines:
        if not line.startswith('Abmessung'):
            continue
        matches = _GERMAN_NUMBER_RE.findall(line)
        if matches:
            return [
                float(num.replace('.', '').replace(',', '.'))
                for num in matches
            ]
    return None


def extract_reduction_ratio(lines):
    """Return the comma-decimal number directly preceding ":1".

    Only lines starting with "Verformungsgrad" are inspected.

    Args:
        lines: List of text lines.

    Returns:
        The number as it appears in the text (a string such as "12,5"),
        or None when nothing matches.
    """
    for line in lines:
        if line.startswith("Verformungsgrad"):
            match = _REDUCTION_RATIO_RE.search(line)
            if match:
                return match.group(1)
    return None


def _parse_karbid_values(text):
    """Parse a '-'-separated run of numbers into a list of floats.

    A decimal comma is accepted and empty tokens are ignored. Any other
    non-numeric token still raises ValueError so bad data is not hidden.
    """
    return [
        float(token.strip().replace(',', '.'))
        for token in text.split('-')
        if token.strip()
    ]


def extract_karbid(lines):
    """Collect the Karbidnetzwerk / Karbidzeiligkeit values.

    For every line starting with "Glühgefüge", the four lines beginning two
    lines below it are scanned; the text after each keyword is split on '-'
    and converted to floats.

    Args:
        lines: List of text lines.

    Returns:
        Dict with the keys 'Karbidnetzwerk' and 'Karbidzeiligkeit', each
        mapping to a (possibly empty) list of floats.

    Raises:
        ValueError: If a value after a keyword is not a number.
    """
    results = {key: [] for key in _KARBID_KEYS}
    for i, line in enumerate(lines):
        if not line.startswith("Glühgefüge"):
            continue
        for item in lines[i + 2:i + 6]:
            for key in _KARBID_KEYS:
                if key in item:
                    remainder = item.split(key)[-1]
                    results[key].extend(_parse_karbid_values(remainder))
    return results


def process_pdf(pdf_path):
    """Process a single PDF and return its extracted data as a dictionary.

    Args:
        pdf_path: Path to the PDF file (str or pathlib.Path).

    Returns:
        Dict with the keys 'filename', 'chargen_nr', 'kd_wst_bez',
        'reduction_ratio', 'Karbidnetzwerk', 'Karbidzeiligkeit' and
        'Abmessung'.
    """
    import pdfplumber  # deferred, see the note at the top of the file

    pdf_path = Path(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # Join pages with a newline so the last line of one page is not
        # fused with the first line of the next; "or ''" guards against a
        # page for which no text is returned.
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    lines = [line.strip() for line in text.split("\n") if line.strip()]

    karbid_results = extract_karbid(lines)
    abmessung_results = extract_abmessung(lines)

    return {
        'filename': pdf_path.name,
        'chargen_nr': extract_chargen_nr(lines),
        'kd_wst_bez': extract_kd_wst_bez(lines),
        'reduction_ratio': extract_reduction_ratio(lines),
        'Karbidnetzwerk': karbid_results['Karbidnetzwerk'],
        'Karbidzeiligkeit': karbid_results['Karbidzeiligkeit'],
        'Abmessung': abmessung_results,
    }


def _join_values(value):
    """Render a list as a ','-joined string; pass anything else through."""
    if isinstance(value, list):
        return ','.join(map(str, value))
    return value


def process_multiple_pdfs(pdf_folder):
    """Process all PDFs in a folder and return a DataFrame.

    Files that fail to process are reported on stdout and skipped.

    Args:
        pdf_folder: Folder containing the '*.pdf' files (str or Path).

    Returns:
        pandas.DataFrame with one row per successfully processed PDF.
    """
    pdf_folder = Path(pdf_folder)
    all_data = []

    # Sorted so the row order does not depend on directory listing order.
    for pdf_path in sorted(pdf_folder.glob('*.pdf')):
        try:
            all_data.append(process_pdf(pdf_path))
        except ImportError:
            # A missing PDF library affects every file; fail fast instead
            # of logging the same error once per PDF.
            raise
        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    # Convert Karbid lists to strings with values joined by ','
    for column in _KARBID_KEYS:
        if column in df.columns:
            df[column] = df[column].apply(_join_values)

    return df


def main(pdf_folder=DEFAULT_PDF_FOLDER, output_csv="extracted_data.csv"):
    """Process every PDF in *pdf_folder* and write the result to CSV."""
    # Process all PDFs and create DataFrame
    df = process_multiple_pdfs(pdf_folder)

    # Save to CSV
    df.to_csv(output_csv, index=False)

    print("Data extraction completed!")
    print(f"Processed {len(df)} PDF files")
    print("\nFirst few rows of the DataFrame:")
    print(df.head())


# Example usage:
if __name__ == "__main__":
    # An optional first command-line argument overrides the default folder.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_FOLDER)
Editor is loading...
Leave a Comment