Untitled

mail@pastecode.io avatar
unknown
plain_text
8 months ago
671 B
5
Indexable
Never
import pdfplumber
import re

def cidToChar(cidx, offset=29):
    """Convert a ``(cid:N)`` marker into a single character.

    The first ``(cid:N)`` marker found in *cidx* is parsed, and the character
    whose code point is ``N + offset`` is returned.

    Args:
        cidx: Text containing at least one ``(cid:N)`` marker, where ``N`` is
            a decimal integer.
        offset: Value added to ``N`` before the conversion to a character.
            The default of 29 is what this script has always used; the right
            value presumably depends on the PDF font's encoding, so callers
            working with other documents may need a different one.

    Returns:
        The one-character string ``chr(N + offset)``.

    Raises:
        IndexError: If *cidx* contains no ``(cid:N)`` marker.
        ValueError: If ``N + offset`` is outside the valid code point range
            (raised by ``chr``).
    """
    match = re.search(r'\(cid\:(\d+)\)', cidx)
    if match is None:
        # IndexError is kept for compatibility with callers that relied on
        # the failure mode of indexing an empty result list.
        raise IndexError('no (cid:N) marker found in %r' % (cidx,))
    return chr(int(match.group(1)) + offset)

# Matches one "(cid:N)" marker as emitted for glyphs without a text mapping.
_CID_MARKER = re.compile(r'\(cid\:\d+\)')


def _escape_unprintable(line):
    """Return *line* with backslashes and non-printable characters escaped.

    Escapes are written the way ``repr`` writes them (``\\n``, ``\\x0c``,
    ``\\\\`` ...), but no surrounding quotes are added and quote characters
    inside the text are left untouched.
    """
    return ''.join(
        ch if ch.isprintable() and ch != '\\' else repr(ch)[1:-1]
        for ch in line
    )


def extract_data(file):
    """Extract the text of a PDF, decoding ``(cid:N)`` markers.

    Every page is read with pdfplumber and split into lines. Empty lines and
    lines consisting solely of ``(cid:3)`` are dropped to keep the output
    compact. In the remaining lines each ``(cid:N)`` marker is replaced by
    the character returned by ``cidToChar``, and non-printable characters
    are escaped so they stay visible in the result.

    Args:
        file: Anything ``pdfplumber.open`` accepts (the script passes a
            file-system path).

    Returns:
        The extracted lines, each terminated by ``'\\n'``, concatenated into
        one string. The string is empty if no line survives the filtering.
    """
    pieces = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # NOTE(review): some pdfplumber versions appear to return None
            # for a page without text; treat that the same as an empty page.
            page_text = page.extract_text() or ''
            for line in page_text.split('\n'):
                if line in ('', '(cid:3)'):  # merely to compact the output
                    continue
                # Single pass over the line; each marker is decoded in place.
                line = _CID_MARKER.sub(
                    lambda m: cidToChar(m.group(0)), line
                )
                pieces.append(_escape_unprintable(line) + '\n')
    return ''.join(pieces)
if __name__ == '__main__':
    # Run the extraction only when executed as a script, not on import.
    # Raw string so the Windows backslashes need no escaping.
    print(extract_data(r'C:\Users\dan.hd\Downloads\file.pdf'))
Leave a Comment