Untitled
unknown
plain_text
7 months ago
4.1 kB
5
Indexable
import fitz # PyMuPDF
import re
from docx import Document
from docx.shared import Pt
def extract_bold_location_and_comment(pdf_path, date_marker="Sept. 1"):
    """Collect bold headings, their trailing location text and comments from a PDF.

    Every text line of the PDF is scanned span by span:

    * spans whose font name contains "bold" (case-insensitive) form the
      line's bold text;
    * non-bold spans that follow bold text on the same line form the
      location text, up to ``date_marker``;
    * whatever follows ``date_marker`` starts a comment, which keeps
      absorbing non-bold text (also from later lines) until the next bold
      span is met or the document ends.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        date_marker: Non-empty literal text separating the location from the
            comment. Defaults to ``"Sept. 1"``.

    Returns:
        A tuple ``(bold_words, locations, comments, counter, nth)`` of lists.
        ``counter[i]`` is the 1-based order in which the
        ``(bold, location)`` pair of entry ``i`` was first seen and
        ``nth[i]`` tells how many times that pair has occurred so far.
        ``bold_words``, ``locations`` and ``comments`` are padded with ``''``
        to a common length; ``counter`` and ``nth`` are not padded.

    Raises:
        ValueError: If ``date_marker`` is empty.
    """
    if not date_marker:
        raise ValueError("date_marker must be a non-empty string")

    bold_words = []
    locations = []
    comments = []
    counter = []
    nth = []
    # (bold_text, location_text) -> (first-appearance index, occurrences so far)
    seen = {}
    # Compiled once; a line only counts as an entry when its bold text
    # contains at least one of these characters.
    meaningful_char = re.compile(r'[a-zA-Z0-9.,*^!@#$%&+\-=?]')

    # Comment state deliberately survives across lines, blocks and pages.
    current_comment = ""
    capturing_comment = False

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry have nothing to scan.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    bold_text = ""
                    location_text = ""
                    for span in line["spans"]:
                        text = span["text"]
                        if "bold" in span["font"].lower():
                            # A new bold span ends the comment in progress.
                            if capturing_comment:
                                comments.append(current_comment.strip())
                                current_comment = ""
                                capturing_comment = False
                            bold_text += text + " "
                        elif bold_text and date_marker not in text:
                            location_text += text + " "
                        elif date_marker in text:
                            # Split on the first marker only, so text after a
                            # repeated marker stays in the comment.
                            before, _, after = text.partition(date_marker)
                            location_text += before
                            capturing_comment = True
                            current_comment = after + " "
                        elif capturing_comment:
                            current_comment += text + " "

                    bold_text = bold_text.strip()
                    location_text = location_text.strip()
                    if not (bold_text and meaningful_char.search(bold_text)):
                        continue

                    bold_words.append(bold_text)
                    locations.append(location_text)
                    if not capturing_comment:
                        # Keep comments aligned: this entry has no comment.
                        comments.append("")

                    key = (bold_text, location_text)
                    # New pairs get the next index and start at occurrence 1;
                    # known pairs keep their index and bump the occurrence.
                    first, second = seen.get(key, (len(seen) + 1, 0))
                    second += 1
                    seen[key] = (first, second)
                    nth.append(second)
                    counter.append(first)

    # Flush the comment that was still open when the document ended.
    if capturing_comment:
        comments.append(current_comment.strip())

    # Pad the three text lists to a common length.
    max_len = max(len(bold_words), len(locations), len(comments))
    bold_words += [''] * (max_len - len(bold_words))
    locations += [''] * (max_len - len(locations))
    comments += [''] * (max_len - len(comments))
    return bold_words, locations, comments, counter, nth
# Usage: pull the entries out of the source PDF.
pdf_path = "/Users/larryzhi/PycharmProjects/prac/poo3.pdf"
bold_words, locations, comments, counter, nth = extract_bold_location_and_comment(pdf_path)

# Start a new Word document whose default text is 12 pt Times New Roman.
doc = Document()
style = doc.styles['Normal']
style.font.name = 'Times New Roman'
style.font.size = Pt(12)

# Bundle the parallel lists into (bold, location, comment, counter, nth) rows
# and order them by first-appearance index, then by occurrence number.
sorted_results = sorted(
    zip(bold_words, locations, comments, counter, nth),
    key=lambda row: row[3:5],
)

for bold, location, comment, _first_seen, _occurrence in sorted_results:
    # Rows lacking either a name or a location are not written out.
    if not (bold and location):
        continue

    # Heading paragraph: bold name followed by the plain location and date.
    heading = doc.add_paragraph()
    name_run = heading.add_run(bold)
    name_run.bold = True
    heading.add_run(f" {location} Sept. 1")

    # Optional comment paragraph.
    if comment:
        doc.add_paragraph(comment)

    # Empty paragraph as a spacer between entries.
    doc.add_paragraph()
    print(location)

# Write the result to disk.
doc.save("output.docx")
print("Document has been created: output.docx")
Editor is loading...
Leave a Comment