PDF Main
unknown
python
2 years ago
2.4 kB
7
Indexable
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTFigure, LTChar


class CEETemplate:
    """Describe the figure sizes that identify each field of a PDF layout.

    ``ParsePDF`` copies these three attributes from the template it is given.
    """

    # Field name -> {'h': height, 'w': width} of the figure holding the field.
    keys: dict
    # Field name -> {'h': (low, high), 'w': (low, high)} inclusive size bounds;
    # ParsePDF consults it only for the 'consumo' and 'emisiones' fields.
    maxCoords: dict
    # Stored by ParsePDF but not read by any of its methods;
    # presumably a region code -- confirm with the template author.
    ccaa: str


class ParsePDF:
    """Extract named text fields from a PDF by matching figure sizes.

    Every ``LTFigure`` that contains text is reduced to its text plus its
    height and width. A field is recognised when a figure's size matches the
    size recorded for that field in the template.
    """

    # Fields located through an inclusive size range (``maxCoords``) instead
    # of an exact height/width match (``keys``).
    _RANGE_MATCHED_KEYS = ('consumo', 'emisiones')

    def __init__(self, base):
        """Copy ``keys``, ``maxCoords`` and ``ccaa`` from the template *base*."""
        self.keys = base.keys
        self.maxCoords = base.maxCoords
        self.ccaa = base.ccaa

    def extractElements(self, input):
        """Collect the text and size of every text-bearing figure in a PDF.

        Args:
            input: Passed unchanged to ``pdfminer.high_level.extract_pages``.

        Returns:
            A list of ``{'value': text, 'h': height, 'w': width}`` dicts, one
            per ``LTFigure`` found directly in a page layout whose direct
            ``LTChar`` children join to a non-empty string. Figures are
            listed in the order ``extract_pages`` yields them.
        """
        mapingFigures = []
        for page_layout in extract_pages(input):
            for element in page_layout:
                if not isinstance(element, LTFigure):
                    continue
                # Only characters sitting directly inside the figure count;
                # nested containers are not descended into.
                text = ''.join(
                    child.get_text()
                    for child in element
                    if isinstance(child, LTChar)
                )
                if text:
                    mapingFigures.append({
                        'value': text,
                        'h': element.height,
                        'w': element.width,
                    })
        return mapingFigures

    def parseCoordsPDF(self, input):
        """Match extracted figures against the template's field sizes.

        Args:
            input: Iterable of dicts with 'value', 'h' and 'w' entries, in the
                shape produced by ``extractElements``.

        Returns:
            A list of single-entry ``{field: text}`` dicts in the order the
            matches were found. Each field is reported at most once; a single
            figure may satisfy several fields.
        """
        outputMap = []
        # Work on a copy: a matched field is removed so it is reported once,
        # and ``self.keys`` stays intact for the next call.
        pending = self.keys.copy()
        for element in input:
            text = element['value']
            height = element['h']
            width = element['w']
            # Snapshot the names because ``pending`` shrinks inside the loop.
            for key in list(pending):
                if key in self._RANGE_MATCHED_KEYS:
                    bounds = self.maxCoords[key]
                    matched = (
                        bounds['h'][0] <= height <= bounds['h'][1]
                        and bounds['w'][0] <= width <= bounds['w'][1]
                    )
                else:
                    # The exact size is looked up only here, so range-matched
                    # fields do not need 'h'/'w' entries in ``keys``.
                    expected = pending[key]
                    matched = (
                        height == expected['h'] and width == expected['w']
                    )
                if matched:
                    outputMap.append({f'{key}': text})
                    del pending[key]
        return outputMap

    def parse(self, pdf):
        """Extract the template's fields from *pdf*.

        Args:
            pdf: Forwarded to ``extractElements``.

        Returns:
            A dict mapping each matched field name to the text of the figure
            that matched it.
        """
        matches = self.parseCoordsPDF(self.extractElements(pdf))
        return {
            key: value
            for match in matches
            for key, value in match.items()
        }
Editor is loading...