PDF Main

mail@pastecode.io avatar
unknown
python
a year ago
2.4 kB
5
Indexable
Never
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTFigure, LTChar


class CEETemplate:
    """Declare the fields a template object handed to ``ParsePDF`` carries.

    Only annotations are declared here; no values are assigned, so the
    attributes do not exist until something sets them.
    """

    # Field name -> size entry; ParsePDF reads the 'h' and 'w' items of
    # an entry when matching a field by size.
    keys: dict
    # Field name -> {'h': [low, high], 'w': [low, high]} bounds; ParsePDF
    # indexes it for the 'consumo' and 'emisiones' fields.
    maxCoords: dict
    # Copied onto ParsePDF instances by its constructor.
    # NOTE(review): presumably a region identifier -- confirm with the caller.
    ccaa: str


class ParsePDF:
    """Pull named values out of a PDF by matching figure sizes to a template.

    The template supplies ``keys`` (field name -> ``{'h': ..., 'w': ...}``
    exact size) and ``maxCoords`` (field name ->
    ``{'h': [low, high], 'w': [low, high]}`` inclusive size window).
    """

    # Fields matched against the inclusive window in ``maxCoords`` rather
    # than the exact size stored in ``keys``.
    _RANGE_KEYS = ('consumo', 'emisiones')

    def __init__(self, base):
        """Store references to the template's ``keys``, ``maxCoords`` and ``ccaa``.

        Args:
            base: Object exposing ``keys``, ``maxCoords`` and ``ccaa``
                attributes. The objects are referenced, not copied.
        """
        self.keys = base.keys
        self.maxCoords = base.maxCoords
        self.ccaa = base.ccaa

    def extractElements(self, input):
        """Collect the text and size of every text-bearing figure in a PDF.

        Args:
            input: Whatever ``pdfminer.high_level.extract_pages`` accepts.
                (The name shadows the builtin but is kept so keyword
                callers keep working.)

        Returns:
            A list of ``{'value': text, 'h': height, 'w': width}`` dicts,
            one per top-level ``LTFigure`` that directly contains at least
            one ``LTChar`` with non-empty text, in document order.
        """
        mapingFigures = []

        for page_layout in extract_pages(input):
            for element in page_layout:
                if not isinstance(element, LTFigure):
                    continue

                # Only characters that are direct children of the figure
                # contribute; nested containers are not descended into.
                text = ''.join(
                    child.get_text()
                    for child in element
                    if isinstance(child, LTChar)
                )
                if text:
                    mapingFigures.append({
                        'value': text,
                        'h': element.height,
                        'w': element.width,
                    })

        return mapingFigures

    def parseCoordsPDF(self, input):
        """Assign extracted figure texts to template fields by figure size.

        Each field is matched at most once: the first element (in input
        order) that fits a field claims it. A single element may claim
        several fields if its size fits each of them.

        Args:
            input: Iterable of dicts with ``'value'``, ``'h'`` and ``'w'``
                items, as produced by ``extractElements``.

        Returns:
            A list of single-item dicts ``{field_name: value}`` in the
            order the matches were found.

        Raises:
            KeyError: If a range-matched field is missing from
                ``maxCoords``, or an exact-matched field's entry lacks
                ``'h'``/``'w'``.
        """
        outputMap = []

        # Shallow copy: matched fields are removed from it, and
        # ``self.keys`` must stay intact for later calls.
        pending = dict(self.keys)

        for element in input:
            value = element['value']
            height = element['h']
            width = element['w']

            # Iterate over a snapshot because matches delete from ``pending``.
            for key in list(pending):
                if key in self._RANGE_KEYS:
                    # Range fields never use the exact size, so it is not
                    # looked up here; their ``keys`` entry may omit it.
                    bounds = self.maxCoords[key]
                    matched = (
                        bounds['h'][0] <= height <= bounds['h'][1]
                        and bounds['w'][0] <= width <= bounds['w'][1]
                    )
                else:
                    size = pending[key]
                    # Exact comparison on purpose: the template stores the
                    # precise figure dimensions.
                    matched = height == size['h'] and width == size['w']

                if matched:
                    outputMap.append({f'{key}': value})
                    del pending[key]

        return outputMap

    def parse(self, pdf):
        """Extract a PDF's figures and return the matched fields as one dict.

        Args:
            pdf: Passed straight to ``extractElements``.

        Returns:
            A dict mapping each matched field name to its text value.
        """
        elements = self.extractElements(pdf)
        matches = self.parseCoordsPDF(elements)
        return {key: value for match in matches for key, value in match.items()}