Untitled
unknown
plain_text
2 years ago
671 B
16
Indexable
import pdfplumber
import re
def cidToChar(cidx, offset=29):
    """Convert a ``(cid:N)`` placeholder into the character it stands for.

    The numeric id N of the first ``(cid:N)`` marker found in *cidx* is
    shifted by *offset* and turned into a character with ``chr``.

    Args:
        cidx: Text containing at least one ``(cid:N)`` marker; only the
            first marker is used.
        offset: Value added to the numeric id before calling ``chr``.
            Defaults to 29, the shift this script has always applied.
            NOTE(review): presumably specific to the font/encoding of the
            PDFs this was written for -- confirm before reusing elsewhere.

    Returns:
        The single character ``chr(N + offset)``.

    Raises:
        IndexError: If *cidx* contains no ``(cid:N)`` marker.
    """
    codes = re.findall(r'\(cid\:(\d+)\)', cidx)
    if not codes:
        # Same exception type as indexing the empty match list, but with a
        # message that says what was wrong with the input.
        raise IndexError(f"no '(cid:N)' marker found in {cidx!r}")
    return chr(int(codes[0]) + offset)
def extract_data(file):
    """Extract the text of a PDF with ``(cid:N)`` placeholders decoded.

    Every page of the PDF is read with pdfplumber and split into lines.
    Empty lines, and lines consisting solely of ``(cid:3)``, are skipped.
    In the remaining lines each ``(cid:N)`` marker is replaced by the
    character returned by ``cidToChar``.

    Args:
        file: The PDF to read; passed unchanged to ``pdfplumber.open``.

    Returns:
        A single string with one processed line per kept input line, each
        terminated by a newline character. Lines are rendered in their
        ``repr`` form (without the surrounding quotes), so backslashes and
        non-printable characters appear as escape sequences.
    """
    cid_marker = re.compile(r'\(cid\:\d+\)')
    lines = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # NOTE(review): extract_text() may yield None for a page with
            # no text in some pdfplumber versions -- confirm; a falsy
            # result is treated as an empty page here.
            page_text = page.extract_text() or ''
            for raw_line in page_text.split('\n'):
                if raw_line in ('', '(cid:3)'):  # merely to compact the output
                    continue
                # One pass over the line: each marker is swapped for its
                # decoded character.
                decoded = cid_marker.sub(
                    lambda match: cidToChar(match.group(0)), raw_line
                )
                # Slice off exactly the two quotes repr() wraps around the
                # text. Stripping "'" characters instead leaves the quotes
                # in place whenever repr() switches to double quotes (a
                # line containing an apostrophe).
                lines.append(repr(decoded)[1:-1] + '\n')
    return ''.join(lines)
# Raw string: the backslashes of the Windows path are taken literally, so no
# sequence such as "\D" can be read as an (invalid) escape.
print(extract_data(r'C:\Users\dan.hd\Downloads\file.pdf'))
Leave a Comment