# Untitled
# unknown
# plain_text
# 2 years ago
# 5.1 kB
# 11
# Indexable
import os
import zipfile
import re
from PyPDF2 import PdfReader
import re
import pandas as pd
from tabula import read_pdf
import pdfplumber
import tabula
import shutil
import numpy as np
# Directory that holds the source ZIP archives, and the directory the
# matching PDFs are unpacked into (a sub-directory of the former).
path = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/zip'
path_to_exctract = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/zip/unpacked'

# General-purpose flag bit 11 of a ZIP entry: the stored name is UTF-8.
_ZIP_UTF8_FLAG = 0x800


def _decode_entry_name(entry):
    """Return the member name of *entry*, repairing cp866-encoded names.

    ``zipfile`` decodes names that lack the UTF-8 flag as cp437, so they are
    re-encoded as cp437 and decoded as cp866 (this script assumes the
    archives store Cyrillic names in cp866). Names flagged as UTF-8 are
    returned unchanged, because re-encoding them as cp437 would raise
    ``UnicodeEncodeError``.
    """
    if entry.flag_bits & _ZIP_UTF8_FLAG:
        return entry.filename
    return entry.filename.encode('cp437').decode('cp866')


for file in os.listdir(path):
    file_path = path + '/' + file
    # The output directory lives inside `path`; skip it (and anything else
    # that is not a ZIP archive) instead of crashing in ZipFile().
    if not zipfile.is_zipfile(file_path):
        continue
    # The context manager closes the archive handle even if a rename fails.
    with zipfile.ZipFile(file_path) as archive:
        for entry in archive.infolist():
            name = _decode_entry_name(entry)
            lowered = name.lower()
            # Only PDFs whose (decoded) name mentions 'код' are extracted.
            if 'код' not in lowered or 'pdf' not in lowered:
                continue
            archive.extract(entry.filename, path_to_exctract)
            if name == entry.filename:
                # Name was already correct; nothing to rename.
                continue
            extracted = path_to_exctract + '/' + entry.filename
            if re.search('.*/.*/.*', name) is None:
                # Fewer than two '/' in the name: rename the whole relative path.
                os.rename(extracted, path_to_exctract + '/' + name)
                continue
            # Deeper paths: first fix the file name inside its extracted
            # folder, then try to fix the folder name itself.
            old_folder = os.path.dirname(extracted)
            os.rename(extracted, old_folder + '/' + os.path.basename(name))
            new_folder = path_to_exctract + '/' + os.path.dirname(name)
            try:
                os.rename(old_folder, new_folder)
            except OSError:
                # Best effort, as before: the folder rename can fail (for
                # example when the target already exists); the file then
                # stays in the folder it was extracted to.
                pass
# Columns expected in the code tables. 'код диагностики' and
# 'индивидуальный\rкод диагностики' are used below as fallbacks for 'код'
# and dropped afterwards.
columns_names = ['фио обучающегося', 'индивидуальный\rкод диагностики', 'код диагностики', 'код', 'пароль', 'класс', 'комплект']
df_codes = pd.DataFrame(columns=columns_names)

# Collect the matching tables and concatenate once: calling pd.concat inside
# the loop re-copies every previously accumulated row on each iteration.
code_tables = [df_codes]
for address, dirs, files in os.walk(path_to_exctract):
    for name in files:
        file_path = os.path.join(address, name)
        for d in tabula.read_pdf(file_path, pages="all"):
            # Header matching is case-insensitive.
            d.columns = [x.lower() for x in d.columns]
            if 'фио обучающегося' in d.columns:
                code_tables.append(d)
df_codes = pd.concat(code_tables, ignore_index=True)

# Fill missing 'код' values from the alternative code columns, in order.
for fallback_column in ('код диагностики', 'индивидуальный\rкод диагностики'):
    df_codes['код'] = df_codes['код'].fillna(df_codes[fallback_column])
df_codes = df_codes.drop(['код диагностики', 'индивидуальный\rкод диагностики'], axis=1)
df_codes.to_excel('df_codes.xlsx')
def read_result(text):
    """Pull the header fields out of the text of a result page.

    Returns the list ``[school, subject, class_name, mcko_date]``:

    * school -- from 'Государственное' to the end of that line;
    * subject -- the single word after 'Предмет: ', including the space
      that follows it;
    * class_name -- the digits/letters after 'Класс: ';
    * mcko_date -- what follows 'Дата: ' up to the last four-digit run on
      that line.

    A field that cannot be found raises ``TypeError``, because the failed
    search result is subscripted directly.
    """
    def _first_match(pattern):
        # Whole matched text of the first occurrence of `pattern`.
        return re.search(pattern, text)[0]

    school = _first_match(r'Государственное.*')
    subject = _first_match(r'Предмет: [а-яА-ЯёЁ]*\b ').replace('Предмет: ', '')
    class_name = _first_match(r'Класс: [0-9а-яА-ЯёЁ]*\b').replace('Класс: ', '')
    mcko_date = _first_match(r'Дата: .*\d\d\d\d').replace('Дата: ', '')
    return [school, subject, class_name, mcko_date]
# Directory tree that holds the result PDFs.
path = 'C:/Users/Maria Smirnova/Desktop/КУ/5k/MVP_mcko/Исходники 547/results'
df_res = pd.DataFrame()

# Header fields from read_result() are attached to every table row under
# these column names, in this order.
info_columns = ('school', 'subject', 'class_name', 'mcko_date')

# Collect the matching tables and concatenate once: calling pd.concat inside
# the loop re-copies every previously accumulated row on each iteration.
result_tables = [df_res]
for address, dirs, files in os.walk(path):
    for name in files:
        if 'pdf' not in name:
            continue
        file_path = os.path.join(address, name)
        # The header fields are read from the text of the first page only.
        reader = PdfReader(file_path)
        text = reader.pages[0].extract_text()
        info_arr = read_result(text)
        for d in tabula.read_pdf(file_path, pages="all"):
            # Only tables that carry a "Код диагн." column are kept.
            if "Код диагн." not in d.columns:
                continue
            for column, value in zip(info_columns, info_arr):
                d[column] = value
            result_tables.append(d)
df_res = pd.concat(result_tables, ignore_index=True)

# Use the same key column name as df_codes.
df_res = df_res.rename(columns={'Код диагн.': 'код'})
df_res.to_excel('df_res.xlsx')
# Cast the join key to str on both sides so the values are compared as text.
# NOTE(review): astype(str) turns missing codes into the literal 'nan', so
# rows without a code in both frames would match each other -- confirm this
# is acceptable.
df_codes['код'] = df_codes['код'].astype(str)
df_res['код'] = df_res['код'].astype(str)
# Left join: every result row is kept; code columns are attached where the
# key matches.
df_final = df_res.set_index('код').join(df_codes.set_index('код'), on='код', how='left', lsuffix='', rsuffix='')
print(df_final)