# PDF Main
# unknown
# python
# 3 years ago
# 2.4 kB
# 10
# Indexable
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTFigure, LTChar
class CEETemplate:
    """Layout template consumed by ``ParsePDF``.

    Only annotations are declared here; no values are assigned at class
    level, so whoever provides a template is expected to set the three
    attributes before handing it to ``ParsePDF``.
    """

    # Field name -> {'h': height, 'w': width}. ParsePDF.parseCoordsPDF
    # compares these against the size of each extracted figure.
    keys: dict
    # Field name -> {'h': [low, high], 'w': [low, high]} (inclusive bounds).
    # ParsePDF.parseCoordsPDF reads it for the 'consumo' and 'emisiones' keys.
    maxCoords: dict
    # Copied onto ParsePDF instances but not otherwise read in this file.
    # NOTE(review): presumably a region identifier - confirm with the caller.
    ccaa: str
class ParsePDF:
    """Pull template fields out of a PDF by matching figure dimensions.

    Each ``LTFigure`` found in the document is reduced to its text plus its
    height and width; a figure is then assigned to a template key when its
    size matches that key's entry in the template.
    """

    # Keys matched against an inclusive size range taken from ``maxCoords``
    # instead of an exact height/width taken from ``keys``.
    RANGE_KEYS = frozenset({'consumo', 'emisiones'})

    def __init__(self, base):
        """Copy the template attributes from *base*.

        Args:
            base: Object exposing ``keys``, ``maxCoords`` and ``ccaa``
                attributes (for example a ``CEETemplate``).
        """
        self.keys = base.keys
        self.maxCoords = base.maxCoords
        self.ccaa = base.ccaa

    def extractElements(self, input):
        """Collect the text and size of every non-empty figure in a PDF.

        Args:
            input: Whatever ``pdfminer.high_level.extract_pages`` accepts as
                its PDF argument.

        Returns:
            A list of ``{'value': text, 'h': height, 'w': width}`` dicts, one
            per ``LTFigure`` whose direct ``LTChar`` children yield non-empty
            text, in document order.
        """
        # NOTE(review): the original indentation was lost; figures are
        # processed once each, in page order - confirm this was the intent.
        mapingFigures = []
        for page_layout in extract_pages(input):
            for element in page_layout:
                if not isinstance(element, LTFigure):
                    continue
                # Only characters that are direct children of the figure
                # contribute to its text.
                text = ''.join(
                    child.get_text()
                    for child in element
                    if isinstance(child, LTChar)
                )
                if text:
                    mapingFigures.append({
                        'value': text,
                        'h': element.height,
                        'w': element.width,
                    })
        return mapingFigures

    def _matches(self, key, spec, height, width):
        """Return True when a figure of the given size fits *key*'s template."""
        if key in self.RANGE_KEYS:
            bounds = self.maxCoords[key]
            return (bounds['h'][0] <= height <= bounds['h'][1]
                    and bounds['w'][0] <= width <= bounds['w'][1])
        # Exact comparison: the expected size is only read for keys that
        # actually use it, so range keys need no 'h'/'w' entry in ``keys``.
        return height == spec['h'] and width == spec['w']

    def parseCoordsPDF(self, input):
        """Assign extracted figures to template keys by their dimensions.

        Args:
            input: Iterable of dicts with ``'value'``, ``'h'`` and ``'w'``
                entries, as produced by ``extractElements``.

        Returns:
            A list of single-entry ``{key: value}`` dicts in match order.
            Every template key is matched at most once (first figure wins);
            one figure may satisfy several keys.
        """
        outputMap = []
        # Shallow copy so matched keys can be dropped without mutating the
        # template shared through ``self.keys``.
        search = self.keys.copy()
        for element in input:
            if not search:
                # Every key already has a value; nothing left to match.
                break
            value = element['value']
            height = element['h']
            width = element['w']
            # Iterate over a snapshot because matched keys are removed.
            for key in list(search):
                if self._matches(key, search[key], height, width):
                    # Keys are emitted as strings.
                    outputMap.append({str(key): value})
                    del search[key]
        return outputMap

    def parse(self, pdf):
        """Extract figures from *pdf* and return the matched fields.

        Args:
            pdf: Passed unchanged to ``extractElements``.

        Returns:
            A single dict merging every ``{key: value}`` match.
        """
        matches = self.parseCoordsPDF(self.extractElements(pdf))
        return {
            key: value
            for match in matches
            for key, value in match.items()
        }
# Editor is loading...