Untitled
unknown
plain_text
a year ago
5.1 kB
1
Indexable
Never
"""Join diagnostic result tables with the students' individual code tables.

The script runs top to bottom when executed:

1. Every ZIP archive in ``path`` is scanned and the PDF entries whose name
   contains 'код' are unpacked into ``path_to_exctract``.
2. All tables of the unpacked PDFs that have a 'фио обучающегося' column are
   merged into ``df_codes`` and written to 'df_codes.xlsx'.
3. All tables with a 'Код диагн.' column found in the PDFs below the results
   folder are merged into ``df_res`` (plus the header fields parsed by
   ``read_result``) and written to 'df_res.xlsx'.
4. ``df_res`` is left-joined with ``df_codes`` on the code and printed.
"""
import os
import re
import shutil
import zipfile

import numpy as np
import pandas as pd
import pdfplumber
import tabula
from PyPDF2 import PdfReader
from tabula import read_pdf

# zipfile decodes entry names as cp437 unless the entry carries the UTF-8
# flag; the archives processed here store Cyrillic names as cp866 bytes, so
# un-flagged names have to be re-decoded.
_ZIP_ASSUMED_CODEC = 'cp437'
_ZIP_ACTUAL_CODEC = 'cp866'
_ZIP_UTF8_FLAG = 0x800


def _is_pdf(file_name):
    """Return True when *file_name* has a '.pdf' extension (any letter case)."""
    return file_name.lower().endswith('.pdf')


def _redecode(text):
    """Re-interpret text that zipfile decoded as cp437 as cp866 text."""
    return text.encode(_ZIP_ASSUMED_CODEC).decode(_ZIP_ACTUAL_CODEC)


def _entry_name(entry):
    """Return the readable name of a ZIP entry.

    Entries flagged as UTF-8 are already decoded correctly by zipfile;
    re-encoding them to cp437 would raise UnicodeEncodeError.
    """
    if entry.flag_bits & _ZIP_UTF8_FLAG:
        return entry.filename
    return _redecode(entry.filename)


def _prune_empty_dirs(start_dir, stop_dir):
    """Remove empty directories from *start_dir* upwards, never *stop_dir*."""
    stop_dir = os.path.normpath(stop_dir)
    current = os.path.normpath(start_dir)
    while current != stop_dir and current.startswith(stop_dir):
        try:
            os.rmdir(current)
        except OSError:
            # Directory still has content (or cannot be removed): stop here.
            break
        current = os.path.dirname(current)


def _extract_entry(archive, entry, target_dir):
    """Extract one entry and move it to its correctly decoded path.

    The path returned by ``ZipFile.extract`` is used as the source because
    zipfile may sanitise the stored name, so it is not always
    ``target_dir + '/' + entry.filename``.

    Returns the final path of the extracted file.
    """
    extracted = archive.extract(entry, target_dir)
    if entry.flag_bits & _ZIP_UTF8_FLAG:
        return extracted

    # Sanitising only touches ASCII characters, which are identical in cp437
    # and cp866, so every path component can be re-decoded on its own.
    relative = os.path.relpath(extracted, target_dir)
    decoded_parts = [_redecode(part) for part in relative.split(os.sep)]
    final_path = os.path.normpath(os.path.join(target_dir, *decoded_parts))
    if final_path == os.path.normpath(extracted):
        return extracted

    os.makedirs(os.path.dirname(final_path), exist_ok=True)
    # os.replace overwrites a file left by a previous run; os.rename raises
    # on Windows when the destination already exists.
    os.replace(extracted, final_path)
    _prune_empty_dirs(os.path.dirname(extracted), target_dir)
    return final_path


def _extract_code_pdfs(zip_dir, target_dir):
    """Unpack the 'код' PDFs of every ZIP archive in *zip_dir*.

    Anything in *zip_dir* that is not a ZIP archive is skipped. This includes
    the output folder itself, which lives inside *zip_dir* and used to make
    a second run crash.
    """
    for file_name in os.listdir(zip_dir):
        archive_path = os.path.join(zip_dir, file_name)
        if not zipfile.is_zipfile(archive_path):
            continue
        with zipfile.ZipFile(archive_path) as archive:
            for entry in archive.infolist():
                if entry.is_dir():
                    continue
                lowered = _entry_name(entry).lower()
                if 'код' in lowered and lowered.endswith('.pdf'):
                    _extract_entry(archive, entry, target_dir)


def _load_code_tables(root, expected_columns):
    """Collect every table with a 'фио обучающегося' column below *root*.

    Column names are lower-cased before the check. The returned frame always
    contains *expected_columns* (first, in that order) followed by any other
    columns that were found.
    """
    frames = []
    for folder, _dirs, file_names in os.walk(root):
        for file_name in file_names:
            if not _is_pdf(file_name):
                continue
            pdf_path = os.path.join(folder, file_name)
            for table in tabula.read_pdf(pdf_path, pages="all"):
                table.columns = [str(col).lower() for col in table.columns]
                if 'фио обучающегося' in table.columns:
                    frames.append(table)

    # One concat at the end instead of re-copying the frame for every table.
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    extra = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=list(expected_columns) + extra)


def _require_match(pattern, text, label):
    """Return the text matched by *pattern* or raise ValueError naming *label*."""
    found = re.search(pattern, text)
    if found is None:
        raise ValueError(f'{label} not found in the PDF header text')
    return found[0]


def read_result(text):
    """Parse the header fields from the text of a result PDF page.

    Args:
        text: Text extracted from the page; ``None`` is treated as ''.

    Returns:
        ``[school, subject, class_name, mcko_date]`` as strings, each with
        its 'Предмет: ' / 'Класс: ' / 'Дата: ' prefix removed.

    Raises:
        ValueError: If one of the four fields cannot be found.
    """
    text = text or ''
    school = _require_match(r'Государственное.*', text, 'school name')
    # NOTE(review): the pattern stops after the first word and keeps the
    # trailing space; kept as is because the PDF layout is not visible here.
    subject = _require_match(r'Предмет: [а-яА-ЯёЁ]*\b ', text, 'subject')
    subject = subject.replace('Предмет: ', '')
    class_name = _require_match(r'Класс: [0-9а-яА-ЯёЁ]*\b', text, 'class')
    class_name = class_name.replace('Класс: ', '')
    mcko_date = _require_match(r'Дата: .*\d\d\d\d', text, 'date')
    mcko_date = mcko_date.replace('Дата: ', '')
    return [school, subject, class_name, mcko_date]


def _load_result_tables(root):
    """Collect every table with a 'Код диагн.' column below *root*.

    Each table gets 'school', 'subject', 'class_name' and 'mcko_date' columns
    filled from the first page of its PDF. The returned frame always has a
    'Код диагн.' column, even when nothing was found.
    """
    frames = []
    for folder, _dirs, file_names in os.walk(root):
        for file_name in file_names:
            if not _is_pdf(file_name):
                continue
            pdf_path = os.path.join(folder, file_name)
            first_page_text = PdfReader(pdf_path).pages[0].extract_text()
            try:
                school, subject, class_name, mcko_date = read_result(first_page_text)
            except ValueError as err:
                # Say which file is malformed, not only which field.
                raise ValueError(f'{pdf_path}: {err}') from err
            for table in tabula.read_pdf(pdf_path, pages="all"):
                if "Код диагн." not in table.columns:
                    continue
                table['school'] = school
                table['subject'] = subject
                table['class_name'] = class_name
                table['mcko_date'] = mcko_date
                frames.append(table)

    if not frames:
        return pd.DataFrame(columns=["Код диагн."])
    return pd.concat(frames, ignore_index=True)


def _as_join_key(codes):
    """Convert codes to strings that compare equal across both tables.

    A column holding numbers and NaN is stored as float, so a plain
    ``astype(str)`` would turn 123 into '123.0' and never match '123'.
    """
    def to_text(value):
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    return codes.map(to_text)


path = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/zip'
path_to_exctract = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/zip/unpacked'

_extract_code_pdfs(path, path_to_exctract)

columns_names = ['фио обучающегося', 'индивидуальный\rкод диагностики', 'код диагностики', 'код', 'пароль', 'класс', 'комплект']
df_codes = _load_code_tables(path_to_exctract, columns_names)

# The code may sit in any of three differently named columns; take the first
# one that is filled in, in the original order of precedence.
df_codes['код'] = (
    df_codes['код']
    .combine_first(df_codes['код диагностики'])
    .combine_first(df_codes['индивидуальный\rкод диагностики'])
)
df_codes = df_codes.drop(['код диагностики', 'индивидуальный\rкод диагностики'], axis=1)
df_codes.to_excel('df_codes.xlsx')

path = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/results'
df_res = _load_result_tables(path)
df_res = df_res.rename(columns={'Код диагн.': 'код'})
df_res.to_excel('df_res.xlsx')

# Rows without a code would all become the key 'nan' and match each other,
# so remember which code rows are usable before converting to text.
has_code = df_codes['код'].notna()
df_codes['код'] = _as_join_key(df_codes['код'])
df_res['код'] = _as_join_key(df_res['код'])

# rsuffix only renames columns present in both tables, where an empty suffix
# would make the join raise.
df_final = df_res.set_index('код').join(
    df_codes[has_code].set_index('код'),
    on='код',
    how='left',
    rsuffix='_codes',
)
print(df_final)