Untitled

 avatar
unknown
plain_text
a month ago
4.1 kB
3
Indexable
import fitz  # PyMuPDF
import re
from docx import Document
from docx.shared import Pt


def extract_bold_location_and_comment(pdf_path, date_marker="Sept. 1"):
    """Extract bold entries, their location text and comments from a PDF.

    The document is walked page by page, block by block and line by line
    using the ``page.get_text("dict")`` layout.  Within each text line:

    * spans whose font name contains "bold" (case-insensitive) are joined
      into the line's bold text;
    * a non-bold span containing *date_marker* contributes the text before
      the marker to the location and starts a comment with the text after
      the first occurrence of the marker;
    * other non-bold spans that follow bold text on the same line are added
      to the location;
    * remaining non-bold spans are appended to the comment currently being
      captured, if any.

    A comment being captured is closed as soon as the next bold span is
    seen, or at the end of the document.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        date_marker: Text that separates the location from the comment.
            Defaults to ``"Sept. 1"``.

    Returns:
        A 5-tuple of lists ``(bold_words, locations, comments, counter, nth)``:

        * ``bold_words`` -- stripped bold text of every accepted line (a
          line is accepted when its bold text contains at least one
          letter, digit or one of the listed punctuation characters).
        * ``locations`` -- stripped location text of the same lines.
        * ``comments`` -- captured comments in the order they were closed;
          an empty string is recorded for an accepted line when no comment
          is being captured at that moment.
        * ``counter`` -- for each accepted line, the 1-based rank at which
          its ``(bold_text, location_text)`` pair was first seen.
        * ``nth`` -- for each accepted line, how many times that pair has
          been seen so far (1 for the first occurrence).

        ``bold_words``, ``locations`` and ``comments`` are padded with
        empty strings to a common length; ``counter`` and ``nth`` are not
        padded.

    Raises:
        ValueError: If *date_marker* is empty.
    """
    if not date_marker:
        # An empty marker would match every span and cannot be split on.
        raise ValueError("date_marker must be a non-empty string")

    bold_words = []
    locations = []
    comments = []
    counter = []
    nth = []
    # (bold_text, location_text) -> (first-seen rank, occurrences so far)
    occurrences = {}

    # Compiled once; used to reject bold text without any meaningful character.
    has_content = re.compile(r'[a-zA-Z0-9.,*^!@#$%&+\-=?]').search

    current_comment = ""
    capturing_comment = False

    # The context manager closes the document even if parsing fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # nothing to scan in a block without text lines

                for line in block["lines"]:
                    bold_text = ""
                    location_text = ""

                    for span in line["spans"]:
                        text = span["text"]

                        if "bold" in span["font"].lower():
                            # A new bold span ends the comment in progress.
                            if capturing_comment:
                                comments.append(current_comment.strip())
                                current_comment = ""
                                capturing_comment = False
                            bold_text += text + " "
                        elif date_marker in text:
                            # Split on the first marker only so that a later
                            # repetition of the marker stays in the comment.
                            before, _, after = text.partition(date_marker)
                            location_text += before
                            capturing_comment = True
                            current_comment = after + " "
                        elif bold_text:
                            # NOTE(review): this also catches non-bold spans
                            # that come after the marker span on the same
                            # line -- confirm that is intended.
                            location_text += text + " "
                        elif capturing_comment:
                            current_comment += text + " "

                    bold_text = bold_text.strip()
                    location_text = location_text.strip()

                    if not (bold_text and has_content(bold_text)):
                        continue

                    bold_words.append(bold_text)
                    locations.append(location_text)
                    # NOTE(review): comments stay aligned with entries only
                    # when each accepted line has a single run of bold text
                    # and markers appear on accepted lines -- verify against
                    # the input PDFs.
                    if not capturing_comment:
                        comments.append("")  # no comment captured for this entry

                    key = (bold_text, location_text)
                    rank, seen = occurrences.get(key, (len(occurrences) + 1, 0))
                    seen += 1
                    occurrences[key] = (rank, seen)
                    counter.append(rank)
                    nth.append(seen)

    # Close the comment that was still open when the document ended.
    if capturing_comment:
        comments.append(current_comment.strip())

    # Pad the three text lists to the same length.
    max_len = max(len(bold_words), len(locations), len(comments))
    bold_words += [''] * (max_len - len(bold_words))
    locations += [''] * (max_len - len(locations))
    comments += [''] * (max_len - len(comments))

    return bold_words, locations, comments, counter, nth


# Usage: pull the entries out of the source PDF
pdf_path = "/Users/larryzhi/PycharmProjects/prac/poo3.pdf"
bold_words, locations, comments, counter, nth = extract_bold_location_and_comment(pdf_path)

# Start a new Word document whose default style is Times New Roman, 12 pt
doc = Document()
style = doc.styles['Normal']
normal_font = style.font
normal_font.name = 'Times New Roman'
normal_font.size = Pt(12)

# Combine the five parallel lists into rows and order them by
# (counter, nth), i.e. by first-seen rank and then by occurrence number
sorted_results = list(zip(bold_words, locations, comments, counter, nth))
sorted_results.sort(key=lambda row: (row[3], row[4]))

for bold, location, comment, _rank, _occurrence in sorted_results:
    # Rows missing either the bold name or the location are not written
    if not (bold and location):
        continue

    # Heading paragraph: the name in bold, then location and date in regular weight
    p = doc.add_paragraph()
    name_run = p.add_run(bold)
    name_run.bold = True
    p.add_run(f" {location} Sept. 1")

    # The comment, when present, gets its own paragraph
    if comment:
        doc.add_paragraph(comment)

    # An empty paragraph provides spacing between entries
    doc.add_paragraph()

    print(location)

# Write the document to disk
doc.save("output.docx")

print("Document has been created: output.docx")

Editor is loading...
Leave a Comment