Untitled

 avatar
unknown
plain_text
a year ago
1.1 kB
9
Indexable
import os

import fitz

def Appooz(pdf_path):
    """Extract italic text spans from a PDF and save them to a text file.

    Every text span whose ``flags`` value has bit value 2 set (the italic
    bit in PyMuPDF's span flags) is recorded as ``"Page <n>: <text>"``,
    with pages numbered from 1. The results are written, one per line, to
    a file next to the PDF whose name is the PDF path with its extension
    replaced by ``_italicized_words.txt``.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        None. Progress and errors are reported with ``print``; any
        exception raised while reading or writing is caught and printed
        rather than propagated.
    """
    italic_flag = 2  # bit value set in span["flags"] for italic text

    try:
        italicized_words = []
        # The context manager closes the document even when extraction
        # fails, and nothing is closed if opening itself fails (the old
        # ``finally`` touched an unbound name in that case).
        with fitz.open(pdf_path) as pdf_document:
            for page_num, page in enumerate(pdf_document, start=1):
                for block in page.get_text("dict").get("blocks", []):
                    # Blocks without a "lines" key (e.g. image blocks)
                    # have no text spans to inspect.
                    for line in block.get("lines", []):
                        italicized_words.extend(
                            f"Page {page_num}: {span['text']}"
                            for span in line["spans"]
                            if span["flags"] & italic_flag
                        )

        # Swap only the real extension. str.replace would also rewrite
        # ".pdf" inside directory names and, for paths not containing a
        # lowercase ".pdf", would return the input path unchanged so the
        # source PDF itself got overwritten.
        root, _ext = os.path.splitext(os.fspath(pdf_path))
        output_path = f"{root}_italicized_words.txt"

        # Explicit UTF-8 so non-ASCII text from the PDF is written the
        # same way on every platform.
        with open(output_path, "w", encoding="utf-8") as file:
            file.writelines(f"{entry}\n" for entry in italicized_words)

        print(f"Results saved to {output_path}")

    except Exception as e:
        # Top-level boundary of the script: report and carry on.
        print(f"An error occurred: {e}")

# Input document for the script run; edit this path to scan a different PDF.
pdf_path = "/Users/appoos/Documents/Thesis/Abhijith_PhD-Corrected-19-06-24.pdf"

# Only run the extraction when executed as a script, so importing this
# module does not trigger file I/O as a side effect.
if __name__ == "__main__":
    Appooz(pdf_path)
Editor is loading...
Leave a Comment