Untitled
unknown
plain_text
a year ago
9.7 kB
0
Indexable
Never
"""Extract line-item data from the second page of each PDF in a folder.

Two modes, chosen interactively:
  1 = crop the top strip of every page image and stack them into AllBills.png
  2 = OCR every page image, parse product / SKU / quantity, and print CSV rows
"""
import os
import pytesseract  # A popular OCR library
# Set the path to the command :
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
import fitz  # PyMuPDF - to convert PDFs to images
from PIL import Image, ImageDraw, ImageFont

# Use these if you want to try pasting to google sheet- I didn't test this yet. Change this note when you do. :)
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Set up Google Sheets API credentials
SERVICE_ACCOUNT_FILE = 'C:/Users/ricky/OneDrive/Desktop/Scheduled Bots(Dont Run Any)/J-ModsNewUpdatesAllBots/LateShipment(Scheduled Only)/atlantean-site-346902-e0304f09a5d9.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('sheets', 'v4', credentials=credentials)

# Google Sheets ID and range
SPREADSHEET_ID = '1drAk3tqjFnxJ83fGM09tqdbrRexF2kSP1_DNP3lyMjk'
#RANGE = 'Atlantis!A1'

# Use this for looping throuh multiple files in a folder.
# PDFs folder path
pdf_folder = r'C:\Users\ricky\OneDrive\Desktop\Tiktok PDFs\2023-04-162'

# Use this if you want to try pasting to google sheet- I didn't test this yet. Change this note when you do. :)
#pdf_data = []
# Use this if you just wanted it printed to console to copy and paste it yourself
pdf_data = ""


def doParse() :
    """OCR the current page image and append one CSV row to ``pdf_data``.

    Reads the module-level ``img``, ``file`` and ``tot``; updates the
    module-level ``pdf_data``, ``success`` and ``numErr``.  When the OCR text
    does not match the expected layout, the raw text is printed and also
    written next to the PDF as a ``.txt`` file, and the row carries the
    error message in its last column.
    """
    # success / numErr are rebound below, so they must be declared global;
    # without this the "+=" raises UnboundLocalError.
    global pdf_data, success, numErr

    # OCR the image
    text = pytesseract.image_to_string(img, lang='eng')

    # Note: The 'goOCR' variable is used throughout the code as a flag to tell us if it's ok
    # to keep going (if no error is found yet.)
    # This way we also don't need to keep nesting our "if" statements and make indentation too big.

    # Initialize these even if goOCR = false, because we still append them to pdf_data later
    prod, sku, qty = '', '', ''
    tlines = []
    t, p, n = '', -1, 3
    blankFound = False   # True once the blank line ending the product section was reached

    # Start parsing the text
    goOCR = (text.startswith("In transit by: "))
    errMsg = "'In transit by:' not found" if not goOCR else ""

    if goOCR :
        tlines = text.splitlines()
        # Sometimes there's a blank line before this, sometimes after.
        # So just put the 2 lines together to check it.
        goOCR = (len(tlines) >= 3
                 and tlines[1]+tlines[2]=="Product Name SKU[Seller SKU] Qty")
        # Sometimes the "Qty" is missing off the end, and "aty" shows up instead,
        # but 2 lines down, and then another blank line.  Check for this :
        if not goOCR :
            goOCR = (len(tlines) >= 5
                     and tlines[1] + tlines[2] + tlines[3] + tlines[4] =="Product Name SKU[Seller SKU]aty")
            n += 2
        errMsg = "field headers not found in expected format" if not goOCR else ""

    # Nothing at all after the header: report it instead of indexing past the end.
    if goOCR and n >= len(tlines) :
        goOCR = False
        errMsg = "No blank line found after product/sku/quantity section."

    while goOCR :
        t = tlines[n]
        # Add a space between lines for prod and for sku, unless the last
        # line ended with a dash, or it was the 1st line.
        # And find the pipe char to split between product name and sku
        p = (t+'|').find('|')
        prod += ('' if prod.endswith('-') or prod=='' else ' ') + t[:p]
        sku += ('' if sku.endswith('-') or sku=='' else ' ') + t[p+1:]
        n += 1
        goOCR = (n < len(tlines))
        errMsg = "No blank line found after product/sku/quantity section." if not goOCR else ""

        # Once we reach a blank line, that should normally be the end of the line item.
        if goOCR and tlines[n]=='' :
            blankFound = True
            # Now see if the next line starts with a number, followed by " (Order ID: "
            # If so, that number is the total.  Yeah, the OCR got it out of order on a lot of them.
            t = tlines[n+1] if n+1 < len(tlines) else ''
            p = t.find(' ')
            goOCR = (p > 0 and t[:p].isdigit() and t[p:].startswith(' (Order ID: '))
            break

    if goOCR :
        qty = t[:p]
    elif blankFound :
        # Don't give up yet-- Sometimes the product name continues past
        # the blank line. (see 0098-7178.txt)
        # If that's the case, goOCR will be false here, but let's check the next 2 lines
        # before we give up...  If it's a blank line followed by "OTickTok shop...",
        # then qty follows the last pipe in that line.
        # Only attempted when a blank line was actually reached, so that earlier,
        # more specific error messages are kept.
        prod += ('' if prod.endswith('-') else ' ') + t
        t = tlines[n+2] + tlines[n+3] if n+3 < len(tlines) else ''
        goOCR = (t.startswith('OTikTok Shop _ | Total|'))
        errMsg = "total qty not found" if not goOCR else ""
        if goOCR :
            qty = t.split('|')[-1]

    if goOCR :
        success += 1
        print(f"\n File parsed successfully: {success} / {tot} : {file}")
    else :
        numErr += 1
        print(f"\n File error #{numErr} / {tot} : {file} doesn't look as expected: " + errMsg
              + "\n\n Here is the full file: \n\n" + text)
        # Explicit encoding: OCR output may contain characters the platform
        # default code page cannot encode.
        with open(os.path.join(pdf_folder, file.replace(".pdf", ".txt")), 'w', encoding="utf-8") as f :
            f.write(text)

    # Use this if you just wanted it printed to console to copy and paste it yourself
    pdf_data += '"' + prod + '","' + sku + '","' + qty + '","' + file + '","' + errMsg + '"\n'

    # Use this if you want to try pasting to google sheet- I didn't test this yet. Change this note when you do. :)
    #pdf_data.append([prod, sku, qty])


def doCombineImage() :
    """Crop the current page image and paste it into the combined image.

    Reads the module-level ``file`` and ``files``; rebinds ``img`` to the
    cropped, labelled strip and creates ``fullImg`` on first use.  Each strip
    is placed at a vertical offset derived from ``tot``.
    """
    global tot, img, fullImg

    # Crop the image and add it to a full image file containing all the line item cropped images
    img = img.crop((0, 0, img.width, 250))
    draw = ImageDraw.Draw(img)
    fnt = ImageFont.truetype("arial.ttf", 20)
    draw.text((350,10),file, font=fnt, fill="green")
    #img.show()  # for debugging- show the image in default image program

    if fullImg is None:
        # Tall enough for one strip per PDF in the folder.
        fullImg = Image.new(img.mode, (img.width, img.height * len(files)))
    y = img.height * (tot - 1)
    print(f"pasting at {y}")
    fullImg.paste(img, (0, y))
    #fullImg.show()  # for debugging
    print(f"{tot} / {len(files)} images collected from: {file}")


# Print instruction to screen.
# NOTE(review): the line breaks inside this message were lost in the pasted
# source and have been reconstructed; the wording is unchanged.
print("""
In order to parse the data properly, the program's expecting the text extracted
from the PDF (by Tesseract OCR) to be in the following format:
(This is an example)

In transit by: 11/04/2023 15:59:59
Product Name SKU[Seller SKU] Qty
Helmet motor motorcycle helmet Topi keledar | RED/BLACK- 1
double visor open face motosikal bike helmet | MEDIUM
moto Stylish dual lens

1 (Order ID: 577182552601626782
Package ID: 1153192398784137374
dTikTok Shop __ {Total

The program will let you know if it encounters text that is not in this format.
""")

numErr, success, tot = 0, 0, 0
fullImg = None

# Keep asking until a valid option is entered.
while True:
    choice = input("""
1 = Collect line item images into one image.
2 = Use OCR to extract and save text files, and parse and combine the data from them.
Choose option:""")
    if choice in ('1','2'): break

#For testing one file:
#for file in ["C:/Users/admin.todd/Downloads/0001-_6782.pdf"] :
#For looping through all PDFs in folder:
files = [a for a in os.listdir(pdf_folder) if a.endswith(".pdf")]

for file in files :
    tot += 1
    print(f"Next file #{tot}: {file}")
    file_path = os.path.join(pdf_folder, file)

    # Open the PDF; always close it again, even if rendering fails.
    pdf = fitz.open(file_path)
    try :
        # Check if the PDF has at least two pages
        if pdf.page_count <= 1 :
            print(f"Only 1 page in pdf: {file}")
            continue

        # Get the second page
        page = pdf.load_page(1)

        # Convert the page to an image
        zoom_x = 2.0
        zoom_y = 2.0
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)

        # Prepare the image to be used in OCR
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally :
        pdf.close()

    # debugging code to help verify the image:
    #img.save("temp1.png")
    #print(img.size)

    # Looks like we need to rotate these clockwise.
    img = img.rotate(270, expand=True)

    prod, sku, qty = '', '', ''  #Initialize vars

    if choice == '2' : doParse()
    if choice == '1': doCombineImage()
    #if tot >= 3: break # for debugging

# Save the combined image file as "AllBills.png".
if choice == '1' :
    if fullImg is None :
        # No PDF had a second page, so there is nothing to save.
        print("\n No images were collected; AllBills.png was not written.")
    else :
        fullImg.save(os.path.join(pdf_folder, "AllBills.png"))
        print("\n All images are combined into file = AllBills.png.")

if choice == '2' :
    # Use this part if you want to just copy the extracted data to paste in Google Sheet
    print("""
Here is the data the program reorganized, to be copied and pasted into the google sheet
(all at once -- then use google sheet's 'text to columns' feature, with separator=comma) :
""", pdf_data)

    # Use this part if you want the program to try auto pasting to google sheet.
    # I didn't test this yet. Change this note when you do. :)
    # (Needs pdf_data to be a list of rows and RANGE to be defined above.)
    #
    # body = { 'values': pdf_data }
    # result = service.spreadsheets().values().append(
    #     spreadsheetId=SPREADSHEET_ID, range=RANGE,
    #     valueInputOption='RAW', body=body).execute()
    # print(f'{result.get("updates").get("updatedCells")} cells updated in the Google Sheet.')