Untitled
unknown
plain_text
9 months ago
12 kB
5
Indexable
import re
import logging
from fpdf import FPDF
def parse_trends_text_and_table(md_string):
    """
    Split a Markdown-like string around its first table.

    A "table line" is any line containing a '|' character. The table is the
    first contiguous run of such lines.

    Returns a 3-tuple (text_before, table_data, text_after):
      - text_before: stripped text preceding the table; the whole, unmodified
        input when no table line exists
      - table_data:  rows produced by extract_markdown_table (list of lists)
      - text_after:  stripped text following the table ('' when none)
    """
    rows = md_string.split("\n")

    # Index of the first line that looks like part of a table.
    first_table_idx = next(
        (idx for idx, row in enumerate(rows) if "|" in row),
        None,
    )

    # No table at all: everything is leading text.
    if first_table_idx is None:
        return md_string, [], ""

    # The table stops at the first following line without a '|';
    # if there is no such line it runs to the end of the input.
    end_of_table_idx = next(
        (
            idx
            for idx in range(first_table_idx + 1, len(rows))
            if "|" not in rows[idx]
        ),
        len(rows),
    )

    leading_text = "\n".join(rows[:first_table_idx]).strip()
    trailing_text = "\n".join(rows[end_of_table_idx:]).strip()
    table_block = "\n".join(rows[first_table_idx:end_of_table_idx])

    return leading_text, extract_markdown_table(table_block), trailing_text
def extract_markdown_table(md_string):
    """
    Extract the rows of a Markdown table from a block of text.

    Every line containing '|' is treated as a table row and split into
    stripped cells. Separator rows (e.g. '|---|---|', '| :--- | ---: |')
    are skipped. Only the empty strings produced by the outer border pipes
    are removed, so empty cells inside a row are kept as '' and every row
    keeps its column positions. Rows whose cells are all empty are dropped.

    Args:
        md_string: Text that may contain Markdown table lines.

    Returns:
        A list of rows, each row being a list of cell strings.
    """
    # A separator row consists solely of dash runs (with optional alignment
    # colons) between pipes. The pattern is anchored at both ends so that a
    # real data row whose first cell is empty or '-' is not mistaken for one.
    separator_row = re.compile(r'^\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?$')

    table_data = []
    for raw_line in md_string.split('\n'):
        line = raw_line.strip()
        if '|' not in line:
            continue
        if separator_row.match(line):
            continue

        cells = [cell.strip() for cell in line.split('|')]
        # Drop only the artefacts of the leading/trailing border pipes.
        if line.startswith('|'):
            cells = cells[1:]
        if line.endswith('|') and cells:
            cells = cells[:-1]

        # Skip rows that carry no content at all (e.g. a lone '|').
        if any(cells):
            table_data.append(cells)
    return table_data
def draw_measured_table(pdf, table_data, col_widths, line_height=5):
    """
    Draw a table with wrapped cell text in two passes (measure, then print).

    The first row of ``table_data`` defines the number of columns. Rows that
    are shorter are padded with empty cells and longer rows are truncated, so
    a ragged table no longer raises IndexError. Cell values are converted to
    strings (None becomes '').

    Before each row is printed, its height is compared against
    ``pdf.page_break_trigger``; a row that would not fit starts a new page.

    Args:
        pdf: FPDF-like object providing multi_cell, rect, get_x/get_y,
            set_xy, add_page, page_break_trigger and l_margin.
        table_data: List of rows, each a list of cell values.
        col_widths: Width for each column; must cover every column of the
            first row.
        line_height: Height of one text line inside a cell.
    """
    if not table_data:
        return

    num_cols = len(table_data[0])

    # Normalise every row to exactly num_cols string cells.
    rows = []
    for row in table_data:
        cells = ["" if cell is None else str(cell) for cell in row[:num_cols]]
        cells.extend([""] * (num_cols - len(cells)))
        rows.append(cells)

    # ---------- PASS 1: MEASURE ----------
    # For each row, find how many wrapped lines its tallest cell needs.
    row_max_lines = []
    for cells in rows:
        max_lines_in_row = 1
        for col_i, text in enumerate(cells):
            wrapped = pdf.multi_cell(col_widths[col_i], line_height, text,
                                     border=0, align='L', split_only=True)
            max_lines_in_row = max(max_lines_in_row, len(wrapped))
        row_max_lines.append(max_lines_in_row)

    # ---------- PASS 2: PRINT (with page-break handling) ----------
    for cells, line_count in zip(rows, row_max_lines):
        row_height = line_count * line_height

        x_start = pdf.get_x()
        y_start = pdf.get_y()

        # Start a new page when the whole row would cross the break trigger.
        if y_start + row_height > pdf.page_break_trigger:
            pdf.add_page()
            x_start = pdf.l_margin
            y_start = pdf.get_y()
        pdf.set_xy(x_start, y_start)

        for col_i, text in enumerate(cells):
            cell_x = pdf.get_x()
            cell_y = pdf.get_y()
            # The border is drawn separately so every cell in the row gets
            # the full row height, not just the height of its own text.
            pdf.rect(cell_x, cell_y, col_widths[col_i], row_height)
            pdf.multi_cell(col_widths[col_i], line_height, text,
                           border=0, align='L')
            # multi_cell moves the cursor; put it at the top of the next column.
            pdf.set_xy(cell_x + col_widths[col_i], cell_y)

        # Move to the start of the next row.
        pdf.set_xy(x_start, y_start + row_height)
def fix_smart_chars(text):
    """
    Replace troublesome Unicode punctuation with basic ASCII equivalents.

    Covers dashes and hyphen variants, curly quotes, the horizontal
    ellipsis and the bullet character. All other characters are returned
    unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string with the mapped characters replaced.
    """
    replacements = {
        "\u2010": "-",    # hyphen
        "\u2011": "-",    # non-breaking hyphen
        "\u2013": "-",    # en dash
        "\u2014": "-",    # em dash
        "\u2212": "-",    # minus sign
        "\u2018": "'",    # left single quote
        "\u2019": "'",    # right single quote
        "\u201C": '"',    # left double quote
        "\u201D": '"',    # right double quote
        "\u2026": "...",  # horizontal ellipsis
        "\u2022": "-",    # bullet
    }
    # One pass over the string instead of one .replace() call per character.
    return text.translate(str.maketrans(replacements))
def basic_bold_parser(pdf, text, font_family="Arial", font_size=12):
    """
    Print text to the PDF, rendering '**bold**' segments in bold.

    The text is first passed through fix_smart_chars, then split on
    '**...**' markers. Each non-empty segment is written with its own
    multi_cell call: marked segments in bold (markers removed), everything
    else in the regular style. The regular style is active again when the
    function returns.

    Args:
        pdf: FPDF object to write to.
        text: Text that may contain '**bold**' markers.
        font_family: Font family used for both styles.
        font_size: Font size used for both styles (default 12, the body size).
    """
    text = fix_smart_chars(text)

    pdf.set_font(font_family, '', font_size)
    # The capturing group keeps the '**...**' pieces in the split result.
    for segment in re.split(r'(\*\*[^*]+\*\*)', text):
        # re.split yields '' before a leading or after a trailing bold span;
        # printing those would insert a spurious blank line.
        if not segment:
            continue
        if segment.startswith('**') and segment.endswith('**'):
            bold_text = segment.strip('*')
            pdf.set_font(font_family, 'B', font_size)
            pdf.multi_cell(0, 5, bold_text)
            pdf.set_font(font_family, '', font_size)
        else:
            pdf.multi_cell(0, 5, segment)
def draw_relevant_titles_table(pdf, relevant_titles):
    """
    Draw the 'Top 5 Relevant Titles' as a two-column table (Title, Features).

    Each item is read through its "Title" and "Features" keys. A missing or
    None title becomes an empty cell. "Features" may be a list (joined with
    ', ') or an already formatted string (used as-is, instead of being
    joined character by character); a missing value becomes an empty cell.

    The table is drawn with draw_measured_table using two equal-width
    columns spanning the printable page width, in Arial 10.

    Args:
        pdf: FPDF object to draw on.
        relevant_titles: Iterable of mapping-like items with "Title" and
            "Features" entries. Nothing is drawn when it is empty.
    """
    if not relevant_titles:
        return

    # Header row followed by one row per title.
    table_data = [["Title", "Features"]]
    for item in relevant_titles:
        title = item.get("Title")
        features = item.get("Features") or []
        if isinstance(features, str):
            features_text = features
        else:
            features_text = ", ".join(str(feature) for feature in features)
        table_data.append(["" if title is None else str(title), features_text])

    # The font must be set before drawing because cell wrapping is measured
    # with the current font.
    pdf.set_font("Arial", size=10)
    page_width = pdf.w - 2 * pdf.l_margin
    num_cols = len(table_data[0])
    col_widths = [page_width / num_cols] * num_cols

    draw_measured_table(pdf, table_data, col_widths, line_height=6)
def preprocess_text(text):
    """
    Pre-process Markdown-ish text before it is handed to the bold parser.

    Steps, in order:
      1) Lines starting with two or more '#' become a bold line:
         '## Title' -> '**Title**'. Asterisks inside the heading are removed
         so the result is always a single well-formed '**...**' span; a
         heading with no text is removed.
      2) Bullet lines starting with '* ' get the prefix '--- ' instead.
      3) Numbered items broken across lines are merged:
         '1.\\nTitle' -> '1. Title'.
      4) Runs of blank lines are collapsed to a single blank line.

    Args:
        text: The raw Markdown-like text.

    Returns:
        The processed text.
    """
    def heading_to_bold(match):
        # Remove '*' so '## **Title**' does not become '****Title****',
        # which the '**...**' pattern of the bold parser cannot match.
        title = match.group(1).replace("*", "").strip()
        return f"**{title}**" if title else ""

    # 1) Headings. '[ \t]*' (not '\s*') keeps the match on a single line, and
    #    '#{2,}' consumes every hash so '### X' does not leave a stray '#'.
    text = re.sub(r'^#{2,}[ \t]*(.*)$', heading_to_bold, text, flags=re.MULTILINE)
    # 2) Bullet lines.
    text = re.sub(r'^\*\s+', '--- ', text, flags=re.MULTILINE)
    # 3) Merge numbered list items that are broken across lines,
    #    e.g. "1.\nChrono Trigger" -> "1. Chrono Trigger".
    text = re.sub(r'(\d+)\.\s*\n\s*([A-Za-z])', r'\1. \2', text)
    # 4) Reduce extra blank lines.
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    return text
def _add_figure(pdf, image_path, caption, x, width):
    """Place an image with a centred caption, or a fallback note if it fails."""
    try:
        pdf.image(image_path, x=x, w=width)
        pdf.ln(3)
        pdf.multi_cell(0, 10, caption, align='C')
    except Exception:
        # Figures are optional: a missing or unreadable image must not abort
        # the report, so note it in the PDF and the log and carry on.
        logging.warning("Could not load image %s", image_path, exc_info=True)
        pdf.multi_cell(0, 5, f"Could not load {image_path}")


def generate_pdf_report(remake_df, insights, output_path="RPG_Remake_Report.pdf"):
    """
    Generate the PDF report.

    Sections, in order:
      1) Title and total count (len(remake_df))
      2) Trends summary: text before the table, the table, text after it
      3) Chrono Trigger recommendations
      4) Visualizations (two optional PNG figures read from the working
         directory)
      5) 'Top 5 Relevant Titles' table

    All free text is run through preprocess_text and printed with
    basic_bold_parser so headings and bold markers render consistently.

    Any error is logged (with traceback) and swallowed; the function
    returns None either way.

    Args:
        remake_df: Sized collection of remakes/remasters; only its length
            is used.
        insights: Dict read through the keys "trends_summary",
            "chrono_trigger_summary" and "relevant_titles". Missing or None
            values fall back to '', 'N/A' and [] respectively.
        output_path: Where the PDF is written.
    """
    try:
        logging.basicConfig(level=logging.INFO)
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()

        # --- Title (slightly bigger font) ---
        pdf.set_font("Arial", size=16)
        pdf.cell(0, 10, txt="RPG Remake/Remaster Analysis Report", ln=True, align='C')
        pdf.ln(5)

        # --- Total Count (normal body font = 12) ---
        pdf.set_font("Arial", size=12)
        pdf.cell(0, 10, txt=f"Total RPG Remakes/Remasters: {len(remake_df)}", ln=True)

        # --- PART 1: Trends Summary + Table ---
        # 'or ""' also covers a key that is present but holds None.
        trends_summary = insights.get("trends_summary") or ""
        text_before, table_data, text_after = parse_trends_text_and_table(trends_summary)

        # 1. Text before the table, so headings like '**I. Common Features**'
        #    become bold consistently.
        if text_before.strip():
            basic_bold_parser(pdf, preprocess_text(text_before),
                              font_family="Arial", font_size=12)
            pdf.ln(3)

        # 2. The table, in equal-width columns across the printable width.
        if table_data:
            pdf.set_font("Arial", size=10)
            page_width = pdf.w - 2 * pdf.l_margin
            num_cols = len(table_data[0])
            col_widths = [page_width / num_cols] * num_cols
            draw_measured_table(pdf, table_data, col_widths, line_height=6)
            pdf.ln(5)

        # 3. Text after the table, preprocessed like the other text sections
        #    so its headings and bullets are rendered the same way.
        if text_after.strip():
            basic_bold_parser(pdf, preprocess_text(text_after),
                              font_family="Arial", font_size=12)

        # --- PART 2: Chrono Trigger Insights ---
        pdf.ln(10)
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Recommendations for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)
        pdf.set_font("Arial", size=12)
        chrono_trigger_text = insights.get("chrono_trigger_summary") or "N/A"
        basic_bold_parser(pdf, preprocess_text(chrono_trigger_text),
                          font_family="Arial", font_size=12)

        # --- PART 3: Visualizations (optional) ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Visualizations:", ln=True)
        pdf.ln(3)
        pdf.set_font("Arial", size=12)
        image_width = 120
        x_center = (pdf.w - image_width) / 2  # left edge that centres the image

        _add_figure(
            pdf,
            'classification_distribution.png',
            "Figure 1: Pie chart representing remake/remaster distribution",
            x_center,
            image_width,
        )
        pdf.ln(5)
        _add_figure(
            pdf,
            'feature_distribution.png',
            "Figure 2: Bar chart showing key features in remakes",
            x_center,
            image_width,
        )

        # --- PART 4: Second Table - Top 5 Relevant Titles ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Top 5 Relevant Titles for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)
        relevant_titles = insights.get("relevant_titles") or []
        if relevant_titles:
            draw_relevant_titles_table(pdf, relevant_titles)
        else:
            pdf.set_font("Arial", size=12)
            pdf.multi_cell(0, 5, "No relevant titles found.")

        # --- Finish & Save ---
        pdf.output(output_path)
        logging.info("PDF report generated successfully as '%s'", output_path)
    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead
        # of propagating it to the caller.
        logging.exception("Error generating PDF report: %s", e)
Editor is loading...
Leave a Comment