Untitled
unknown
plain_text
a month ago
4.1 kB
3
Indexable
import re
import sys

import fitz  # PyMuPDF
from docx import Document
from docx.shared import Pt

# Defaults used when the script is run without command-line arguments.
DEFAULT_PDF_PATH = "/Users/larryzhi/PycharmProjects/prac/poo3.pdf"
DEFAULT_OUTPUT_PATH = "output.docx"

# Text that separates an entry's location from the comment that follows it.
DATE_MARKER = "Sept. 1"

# A bold run is only recorded as an entry if it contains at least one of
# these characters. Compiled once instead of on every line.
_ENTRY_NAME_RE = re.compile(r'[a-zA-Z0-9.,*^!@#$%&+\-=?]')


def extract_bold_location_and_comment(pdf_path, date_marker=DATE_MARKER):
    """Extract bold names, their locations and trailing comments from a PDF.

    Every text line of every page is scanned span by span:

    * spans whose font name contains "bold" are concatenated into the
      entry name;
    * non-bold spans that follow a bold span on the same line (and do not
      contain ``date_marker``) are concatenated into the location;
    * in a span containing ``date_marker``, the text before the marker is
      appended to the location and the text after it starts a comment;
    * while a comment is being captured, further non-bold spans (also on
      later lines) are appended to it; the next bold span ends the comment.

    Args:
        pdf_path: Path of the PDF file to read.
        date_marker: Marker separating location from comment. Defaults to
            ``"Sept. 1"``. Must be non-empty.

    Returns:
        A tuple ``(bold_words, locations, comments, counter, nth)`` of lists.
        ``counter[i]`` is the 1-based first-seen rank of the
        ``(bold, location)`` pair of entry ``i`` and ``nth[i]`` is how many
        times that pair has occurred up to and including entry ``i``.
        ``bold_words``, ``locations`` and ``comments`` are padded with ``''``
        to a common length; ``counter`` and ``nth`` are not padded and always
        have one element per recorded entry.

    Raises:
        ValueError: If ``date_marker`` is empty.
    """
    if not date_marker:
        # An empty marker is "contained" in every string and cannot be
        # partitioned on, so reject it up front.
        raise ValueError("date_marker must be a non-empty string")

    bold_words = []
    locations = []
    comments = []
    counter = []
    nth = []
    # (bold_text, location_text) -> (first-seen rank, occurrences so far)
    seen_pairs = {}

    current_comment = ""
    capturing_comment = False

    # The context manager closes the PDF even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key are skipped.
                for line in block.get("lines", ()):
                    bold_text = ""
                    location_text = ""
                    for span in line["spans"]:
                        text = span["text"]
                        if "bold" in span["font"].lower():
                            # A bold span ends the comment in progress.
                            if capturing_comment:
                                comments.append(current_comment.strip())
                                current_comment = ""
                                capturing_comment = False
                            bold_text += text + " "
                        elif bold_text and date_marker not in text:
                            location_text += text + " "
                        elif date_marker in text:
                            # partition keeps everything after the first
                            # marker, even if the marker occurs again in
                            # the same span.
                            before, _, after = text.partition(date_marker)
                            location_text += before
                            capturing_comment = True
                            current_comment = after + " "
                        elif capturing_comment:
                            current_comment += text + " "

                    bold_text = bold_text.strip()
                    location_text = location_text.strip()
                    if not (bold_text and _ENTRY_NAME_RE.search(bold_text)):
                        continue

                    bold_words.append(bold_text)
                    locations.append(location_text)
                    # NOTE: comments are paired with entries purely by list
                    # position. An entry whose comment is still being
                    # captured gets its comment appended later, when the
                    # next bold span (or the end of the document) closes it.
                    if not capturing_comment:
                        comments.append("")

                    pair = (bold_text, location_text)
                    rank, occurrences = seen_pairs.get(
                        pair, (len(seen_pairs) + 1, 0)
                    )
                    occurrences += 1
                    seen_pairs[pair] = (rank, occurrences)
                    counter.append(rank)
                    nth.append(occurrences)

    # Flush the comment that was still open at the end of the document.
    if capturing_comment:
        comments.append(current_comment.strip())

    # Pad the three text lists to a common length.
    max_len = max(len(bold_words), len(locations), len(comments))
    bold_words += [''] * (max_len - len(bold_words))
    locations += [''] * (max_len - len(locations))
    comments += [''] * (max_len - len(comments))

    return bold_words, locations, comments, counter, nth


def main(pdf_path=DEFAULT_PDF_PATH, output_path=DEFAULT_OUTPUT_PATH,
         date_marker=DATE_MARKER):
    """Read entries from ``pdf_path`` and write them to a Word document.

    Entries are ordered by first appearance of their (name, location) pair,
    then by occurrence number. For each entry that has both a name and a
    location, the location is also printed to stdout.

    Args:
        pdf_path: PDF file to read.
        output_path: Path of the ``.docx`` file to create.
        date_marker: Marker passed to the extractor and re-appended after
            each location in the output.
    """
    bold_words, locations, comments, counter, nth = (
        extract_bold_location_and_comment(pdf_path, date_marker)
    )

    # Create a new Word document and set the font for the entire document.
    doc = Document()
    style = doc.styles['Normal']
    style.font.name = 'Times New Roman'
    style.font.size = Pt(12)

    # zip stops at the shortest list, so padded rows beyond the last
    # recorded entry are dropped here.
    sorted_results = sorted(
        zip(bold_words, locations, comments, counter, nth),
        key=lambda entry: (entry[3], entry[4]),
    )

    for bold, location, comment, _rank, _occurrence in sorted_results:
        if not (bold and location):
            continue

        # Bold name followed by the location and date (not bold).
        paragraph = doc.add_paragraph()
        paragraph.add_run(bold).bold = True
        paragraph.add_run(f" {location} {date_marker}")

        if comment:
            doc.add_paragraph(comment)

        # Blank paragraph for spacing between entries.
        doc.add_paragraph()

        print(location)

    doc.save(output_path)
    print(f"Document has been created: {output_path}")


if __name__ == "__main__":
    # Optional overrides: script.py [pdf_path [output_path]]
    main(*sys.argv[1:3])
Editor is loading...
Leave a Comment