Untitled
unknown
plain_text
8 months ago
671 B
5
Indexable
Never
"""Extract text from a PDF and decode pdfplumber's ``(cid:N)`` placeholders."""

import re

# One placeholder emitted for a glyph that has no Unicode mapping, e.g. "(cid:36)".
_CID_PATTERN = re.compile(r'\(cid\:(\d+)\)')

# Amount added to a CID number to obtain a code point.
# NOTE(review): 29 is presumably specific to the font of the PDF this script
# was written for -- confirm before reusing it on other documents.
_CID_OFFSET = 29


def cidToChar(cidx, offset=_CID_OFFSET):
    """Return the character encoded by the first ``(cid:N)`` token in *cidx*.

    Args:
        cidx: Text containing at least one ``(cid:N)`` token.
        offset: Value added to ``N`` to obtain the Unicode code point.

    Returns:
        The single character ``chr(N + offset)``.

    Raises:
        IndexError: If *cidx* contains no ``(cid:N)`` token.
    """
    return chr(int(_CID_PATTERN.findall(cidx)[0]) + offset)


def _escape_unprintable(text):
    """Return *text* with backslashes and non-printable characters escaped.

    This mirrors what ``repr()`` shows inside its quotes, but never adds
    surrounding quotes and never touches apostrophes, so a line such as
    ``it's`` is no longer wrapped in stray double quotes.
    """
    return ''.join(
        ch if ch.isprintable() and ch != '\\'
        else ch.encode('unicode_escape').decode('ascii')
        for ch in text
    )


def decode_line(line, cid_offset=_CID_OFFSET):
    """Decode every ``(cid:N)`` token in *line* and escape unprintable chars.

    Args:
        line: One line of extracted text.
        cid_offset: Offset passed on to :func:`cidToChar`.

    Returns:
        The decoded line, with control characters shown as escape sequences.
    """
    decoded = _CID_PATTERN.sub(
        lambda match: cidToChar(match.group(0), cid_offset), line
    )
    return _escape_unprintable(decoded)


def extract_data(file, cid_offset=_CID_OFFSET):
    """Return the text of every page of a PDF, one decoded line per row.

    Args:
        file: Path or file object accepted by ``pdfplumber.open``.
        cid_offset: Offset used to turn ``(cid:N)`` tokens into characters.

    Returns:
        A string in which each kept line is followed by a newline. Empty
        lines and lines consisting solely of ``(cid:3)`` are dropped.
    """
    # Imported here so the pure-text helpers above stay importable on
    # machines that do not have pdfplumber installed.
    import pdfplumber

    lines = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # Guard against a page yielding None instead of a string
            # (reportedly the case for empty pages in some pdfplumber
            # versions -- verify against the installed release).
            page_text = page.extract_text() or ''
            for line in page_text.split('\n'):
                # Merely to compact the output: with the default offset,
                # cid 3 decodes to chr(32), i.e. a line holding one space.
                if line == '' or line == '(cid:3)':
                    continue
                lines.append(decode_line(line, cid_offset) + '\n')
    return ''.join(lines)


if __name__ == '__main__':
    # Raw string: the original literal had a stray third backslash before
    # "Downloads", producing an invalid "\D" escape and a doubled separator.
    print(extract_data(r'C:\Users\dan.hd\Downloads\file.pdf'))
Leave a Comment