# Pasted from pastecode.io (mail@pastecode.io), about a year ago, 9.7 kB.
import os
import pytesseract      # OCR library (Python wrapper around the Tesseract executable)

# Tell pytesseract where the Tesseract executable is installed on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

import fitz  # PyMuPDF - used to render PDF pages to images
from PIL import Image, ImageDraw, ImageFont

# The two imports below are only needed for the Google Sheets upload.
# NOTE: the upload path has not been tested yet. Change this note when it has. :)

from google.oauth2 import service_account
from googleapiclient.discovery import build

# Set up Google Sheets API credentials.
# NOTE(review): the credentials file is loaded and the Sheets client is built
# at import time, so the script presumably fails here when the key file is
# missing, whichever menu option is chosen later - confirm.
SERVICE_ACCOUNT_FILE = 'C:/Users/ricky/OneDrive/Desktop/Scheduled Bots(Dont Run Any)/J-ModsNewUpdatesAllBots/LateShipment(Scheduled Only)/atlantean-site-346902-e0304f09a5d9.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('sheets', 'v4', credentials=credentials)

# Target spreadsheet ID and the range (sheet name + anchor cell) to write to.
SPREADSHEET_ID = '1drAk3tqjFnxJ83fGM09tqdbrRexF2kSP1_DNP3lyMjk'
RANGE = 'Atlantis!A1'

# Folder that is scanned for the PDF files to process.
pdf_folder = r'C:\Users\ricky\OneDrive\Desktop\Tiktok PDFs\2023-04-16'

# pdf_data accumulates the parsed results. Keep only ONE of the two
# initialisations below active; as written, the second assignment overrides
# the first, so pdf_data starts out as a string.

# List form: rows to upload to the Google Sheet (untested, see note above).
pdf_data = []

# String form: text printed to the console, to be copied and pasted by hand.
pdf_data = ""

# Running counters: files that failed to parse, files parsed successfully,
# and total files visited so far.
numErr, success, tot = 0, 0, 0
def _csvField(value) :
    """Return value wrapped in double quotes, with embedded quotes doubled (CSV style)."""
    return '"' + str(value).replace('"', '""') + '"'


def _parseOcrText(text) :
    """Extract product name, SKU and quantity from the OCR text of one page.

    Args:
        text: The full OCR output for the page.

    Returns:
        A tuple (prod, sku, qty, errMsg). errMsg is '' when parsing succeeded;
        otherwise it says which expected part of the layout was not found, and
        prod/sku hold whatever was collected up to that point.
    """
    prod, sku, qty = '', '', ''

    if not text.startswith("In transit by: ") :
        return prod, sku, qty, "'In transit by:' not found"

    tlines = text.splitlines()

    def line(i) :
        # Lines past the end read as blank, so short or truncated OCR output
        # is reported as a format error instead of raising IndexError.
        return tlines[i] if i < len(tlines) else ''

    # Sometimes there's a blank line before the header, sometimes after,
    # so join the 2 lines together to check it.
    if line(1) + line(2) == "Product Name SKU[Seller SKU] Qty" :
        n = 3
    # Sometimes the "Qty" is missing off the end, and "aty" shows up instead,
    # but 2 lines down, and then another blank line.
    elif line(1) + line(2) + line(3) + line(4) == "Product Name SKU[Seller SKU]aty" :
        n = 5
    else :
        return prod, sku, qty, "field headers not found in expected format"

    # Collect the line-item rows until the next line is blank. The first row
    # is always consumed, even if it is blank itself.
    foundBlank = False
    while n < len(tlines) :
        # The pipe char splits product name from sku; no pipe means the whole
        # line belongs to the product name.
        namePart, _, skuPart = tlines[n].partition('|')

        # Add a space between lines for prod and for sku, unless the last
        # line ended with a dash, or it was the 1st line.
        prod += ('' if prod.endswith('-') or prod == '' else ' ') + namePart
        sku += ('' if sku.endswith('-') or sku == '' else ' ') + skuPart

        n += 1
        if n < len(tlines) and tlines[n] == '' :
            # A blank line normally ends the line item. Stop here so the
            # quantity line is not swallowed into the product name.
            foundBlank = True
            break

    if not foundBlank :
        return prod, sku, qty, "No blank line found after product/sku/quantity section."

    # If the line after the blank starts with a number followed by
    # " (Order ID: ", that number is the total. (The OCR returns the fields
    # out of order on a lot of these files.)
    t = line(n + 1)
    p = t.find(' ')
    if p > 0 and t[:p].isdigit() and t[p:].startswith(' (Order ID: ') :
        return prod, sku, t[:p], ''

    # Otherwise don't give up yet -- sometimes the product name continues past
    # the blank line (see 0098-7178.txt). Treat that line as more product
    # name, then look for the "TikTok Shop ... Total" footer; the quantity
    # follows the last pipe in it.
    prod += ('' if prod.endswith('-') else ' ') + t
    t = line(n + 2) + line(n + 3)
    if t.startswith('OTikTok Shop _ | Total|') :
        return prod, sku, t.split('|')[-1], ''

    return prod, sku, qty, "total qty not found"


def doParse() :
    """OCR the current page image, parse it, and record the result.

    Reads the module-level globals img (page image), file (PDF file name),
    tot (files visited so far) and pdf_folder. Updates the success / numErr
    counters, saves the raw OCR text next to the PDF as a .txt file, and adds
    the parsed fields to pdf_data: as a [prod, sku, qty] row when pdf_data is
    a list (Google Sheets mode), otherwise as one quoted CSV line (console
    copy/paste mode).
    """
    global pdf_data, numErr, success

    # OCR the image
    text = pytesseract.image_to_string(img, lang='eng')

    prod, sku, qty, errMsg = _parseOcrText(text)

    if not errMsg :
        success += 1
        print(f"\n File parsed successfully: {success} / {tot} : {file}")
    else :
        numErr += 1
        print(f"\n File error #{numErr} / {tot} :  {file} doesn't look as expected: " + errMsg \
            +"\n\n Here is the full file: \n\n" + text)

    # Always keep the raw OCR text so problem files can be inspected later.
    # UTF-8 avoids encoding errors on characters outside the platform default.
    with open(os.path.join(pdf_folder, file.replace(".pdf", ".txt")), 'w', encoding='utf-8') as f :
        f.write(text)

    if isinstance(pdf_data, list) :
        # Row destined for the Google Sheet upload.
        pdf_data.append([prod, sku, qty])
    else :
        # CSV line to be printed to the console for manual copy and paste.
        pdf_data += ','.join(_csvField(v) for v in (prod, sku, qty, file, errMsg)) + '\n'

def doCombineImage() :
    """Crop the current page image, label it, and paste it into the combined image.

    Works on the module-level globals: img is replaced by its cropped and
    labelled version, and fullImg is created on the first call and then
    filled one strip per call. Also reads file, files and tot.
    """
    global img, fullImg

    # Keep only the top 250 pixel rows of the page, at full width.
    img = img.crop((0, 0, img.width, 250))

    # Stamp the source file name onto the strip.
    pen = ImageDraw.Draw(img)
    pen.text((350,10), file, font=ImageFont.truetype("arial.ttf", 20), fill="green")

    # Create the canvas on first use, tall enough for one strip per listed file.
    if fullImg is None:
        canvasSize = (img.width, img.height * len(files))
        fullImg = Image.new(img.mode, canvasSize)

    # The vertical slot is picked by the file counter (1-based).
    top = (tot - 1) * img.height
    print(f"pasting at {top}")
    fullImg.paste(img, (0, top))

    print(f"{tot} / {len(files)} images collected from: {file}")

    # Print instruction to screen.
# Explain the OCR text layout the parser expects before asking what to do.
print("""
In order to parse the data properly, the program's expecting
the text extracted from the PDF (by Tesseract OCR) to be in the following format:
(This is an example)

In transit by: 11/04/2023 15:59:59
Product Name SKU[Seller SKU] Qty

Helmet motor motorcycle helmet Topi keledar | RED/BLACK- 1
double visor open face motosikal bike helmet | MEDIUM
moto Stylish dual lens

1 (Order ID: 577182552601626782
Package ID: 1153192398784137374

dTikTok Shop __ {Total

The program will let you know if it encounters text that is not in this format.
""")

# Combined image canvas; stays None until the first strip is pasted.
fullImg = None

# Keep asking until the user types one of the two valid options.
choice = None
while choice not in ('1','2'):
    choice = input("""

1 = Collect line item images into one image.
2 = Use OCR to extract and save text files, and parse and combine the data from them.

Choose option:""")

    #For testing one file:
#for file in ["C:/Users/admin.todd/Downloads/0001-_6782.pdf"] :

    #For looping through all PDFs in folder:
# Sort the names: os.listdir returns entries in arbitrary order, and the order
# decides where each strip lands in the combined image / each row in the data.
files = sorted(a for a in os.listdir(pdf_folder) if a.endswith(".pdf"))
for file in files :

    tot += 1
    print(f"Next file #{tot}: {file}")
    file_path = os.path.join(pdf_folder, file)

    # Open the PDF; the context manager closes it again once the page is rendered.
    with fitz.open(file_path) as pdf :

        # The data we need is on the second page, so skip shorter PDFs.
        if pdf.page_count <= 1 :
            print(f"Only 1 page in pdf: {file}")
            continue

        # Get the second page (page numbers are 0-based).
        page = pdf.load_page(1)

        # Render the page at 2x zoom in both directions.
        zoom_x = 2.0
        zoom_y = 2.0
        mat = fitz.Matrix(zoom_x, zoom_y)
        pix = page.get_pixmap(matrix=mat)

        # Prepare the image to be used in OCR
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # Looks like we need to rotate these clockwise.
    img = img.rotate(270, expand=True)

    # Hand the page image (global img) to the handler for the chosen mode.
    if choice == '2' :
        doParse()
    elif choice == '1' :
        doCombineImage()

# Save the combined image file as "AllBills.png".
if choice == '1' :
    if fullImg is None :
        # No page was ever pasted (no PDFs found, or none had a second page).
        print("\n No images were collected, so AllBills.png was not created.")
    else :
        fullImg.save(os.path.join(pdf_folder, "AllBills.png"))
        print("\n All images are combined into file = AllBills.png.")

if choice == '2' :
    if isinstance(pdf_data, list) :
        # pdf_data holds rows: try auto pasting them to the Google Sheet.
        # NOTE: this path has not been tested yet. Change this note when it has. :)
        if pdf_data :
            body = {
                'values': pdf_data
            }
            # NOTE(review): valueInputOption is an assumption -- 'USER_ENTERED'
            # lets Sheets parse quantities as numbers; use 'RAW' to store the
            # values exactly as text. Confirm which one is wanted.
            result = service.spreadsheets().values().append(
                spreadsheetId=SPREADSHEET_ID,
                range=RANGE,
                valueInputOption='USER_ENTERED',
                body=body,
            ).execute()

            print(f'{result.get("updates").get("updatedCells")} cells updated in the Google Sheet.')
        else :
            print("\n No rows were parsed, so nothing was sent to the Google Sheet.")
    else :
        # pdf_data holds CSV text: print it so it can be copied by hand.
        print("""

    Here is the data the program reorganized, to be copied and pasted into the
    google sheet (all at once -- then use google sheet's 'text to columns' feature, with separator=comma) :

    """, pdf_data)