Untitled
unknown
plain_text
9 months ago
12 kB
5
Indexable
import logging
import re

logger = logging.getLogger(__name__)

# Default location of the generated report (relative to the working directory).
DEFAULT_REPORT_PATH = "RPG_Remake_Report.pdf"

# Typographic punctuation mapped to plain ASCII look-alikes.
_SMART_CHAR_TABLE = str.maketrans({
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2018": "'",    # left single quote
    "\u2019": "'",    # right single quote
    "\u201C": '"',    # left double quote
    "\u201D": '"',    # right double quote
    "\u2026": "...",  # horizontal ellipsis
    "\u2022": "-",    # bullet
})

# One cell of a Markdown separator row: dashes with optional alignment colons.
_SEPARATOR_CELL_RE = re.compile(r":?-+:?")

# Splits text into plain and '**bold**' segments (the bold ones are kept).
_BOLD_SPLIT_RE = re.compile(r"(\*\*[^*]+\*\*)")


def parse_trends_text_and_table(md_string):
    """
    Split a Markdown-like string around its first table.

    A "table line" is any line containing '|'. The table starts at the first
    such line and ends at the next line without a '|' (or at the end of the
    text).

    Returns a tuple (text_before, table_data, text_after):
      - text_before: stripped text preceding the table
      - table_data:  list of rows, each a list of cell strings
      - text_after:  stripped text following the table

    If there is no table, the input is returned unchanged as text_before.
    Empty or None input yields ("", [], "").
    """
    if not md_string:
        return "", [], ""

    lines = md_string.split("\n")

    table_start = next((i for i, line in enumerate(lines) if "|" in line), None)
    if table_start is None:
        return md_string, [], ""

    # First line after the table start that has no '|'; otherwise the table
    # runs to the end of the text.
    table_end = next(
        (i for i in range(table_start + 1, len(lines)) if "|" not in lines[i]),
        len(lines),
    )

    text_before = "\n".join(lines[:table_start]).strip()
    text_after = "\n".join(lines[table_end:]).strip()
    table_data = extract_markdown_table("\n".join(lines[table_start:table_end]))
    return text_before, table_data, text_after


def extract_markdown_table(md_string):
    """
    Extract the rows of a Markdown table from md_string.

    Every line containing '|' is split into stripped cells. Only the optional
    leading/trailing '|' delimiters are discarded, so empty interior cells are
    kept and columns stay aligned. Separator rows (e.g. '|---|---|' or
    '|:---|---:|', with or without outer pipes) and rows with no content at
    all are skipped.

    Returns a list of lists (rows -> columns).
    """
    table_data = []
    for line in md_string.split("\n"):
        stripped = line.strip()
        if "|" not in stripped:
            continue

        # Remove only the outer delimiters; filtering out every empty string
        # would also drop genuinely empty cells and shift later columns.
        if stripped.startswith("|"):
            stripped = stripped[1:]
        if stripped.endswith("|"):
            stripped = stripped[:-1]

        cells = [cell.strip() for cell in stripped.split("|")]
        if not any(cells):
            continue  # nothing but pipes and whitespace
        if all(_SEPARATOR_CELL_RE.fullmatch(cell) for cell in cells):
            continue  # header/body separator row
        table_data.append(cells)
    return table_data


def _normalize_row(row, num_cols):
    """Return row as exactly num_cols sanitized strings (padded or truncated)."""
    cells = [
        "" if cell is None else fix_smart_chars(str(cell))
        for cell in row[:num_cols]
    ]
    cells.extend([""] * (num_cols - len(cells)))
    return cells


def draw_measured_table(pdf, table_data, col_widths, line_height=5):
    """
    Draw a table with wrapped cell text in two passes (MEASURE -> PRINT).

    pdf:         FPDF-like object; the font currently set on it is used.
    table_data:  list of rows; the first row defines the column count.
                 Shorter rows are padded with empty cells, longer rows are
                 truncated, and cell text is passed through fix_smart_chars.
    col_widths:  one width per column.
    line_height: height of a single text line inside a cell.

    A new page is started before any row that would cross the page-break
    trigger. Does nothing when table_data is empty.
    """
    if not table_data:
        return

    num_cols = len(table_data[0])
    rows = [_normalize_row(row, num_cols) for row in table_data]

    # ---------- PASS 1: MEASURE ----------
    # The tallest cell (in wrapped lines) decides the height of its row.
    row_heights = []
    for cells in rows:
        max_lines = 1
        for col_i, text in enumerate(cells):
            if not text:
                continue  # an empty cell occupies a single line
            wrapped = pdf.multi_cell(
                col_widths[col_i], line_height, text,
                border=0, align='L', split_only=True,
            )
            max_lines = max(max_lines, len(wrapped))
        row_heights.append(max_lines * line_height)

    # ---------- PASS 2: PRINT (with page-break handling) ----------
    for cells, row_height in zip(rows, row_heights):
        x_start = pdf.get_x()
        y_start = pdf.get_y()

        # Start a new page if this row would overflow the current one.
        if y_start + row_height > pdf.page_break_trigger:
            pdf.add_page()
            x_start = pdf.l_margin
            y_start = pdf.get_y()
            pdf.set_xy(x_start, y_start)

        for col_i, text in enumerate(cells):
            cell_x = pdf.get_x()
            cell_y = pdf.get_y()

            # Border sized to the whole row so every cell has equal height.
            pdf.rect(cell_x, cell_y, col_widths[col_i], row_height)
            if text:
                pdf.multi_cell(col_widths[col_i], line_height, text,
                               border=0, align='L')

            # multi_cell moves the cursor; put it at the next column's top-left.
            pdf.set_xy(cell_x + col_widths[col_i], cell_y)

        # Move down to the start of the next row.
        pdf.set_xy(x_start, y_start + row_height)


def fix_smart_chars(text):
    """
    Replace troublesome Unicode punctuation (dashes, curly quotes, ellipsis,
    bullet) with basic ASCII equivalents and return the new string.
    """
    return text.translate(_SMART_CHAR_TABLE)


def basic_bold_parser(pdf, text, font_family="Arial", font_size=12):
    """
    Print text to pdf, rendering '**bold**' segments in bold.

    The text is first passed through fix_smart_chars. Each segment (bold or
    plain) is written with its own multi_cell call, so it starts on a new
    line. Empty segments, which the split produces when a bold run sits at
    the very start or end of the text or next to another bold run, are
    skipped so they do not add blank lines.
    """
    text = fix_smart_chars(text)

    pdf.set_font(font_family, '', font_size)
    for segment in _BOLD_SPLIT_RE.split(text):
        if not segment:
            continue

        if segment.startswith('**') and segment.endswith('**'):
            bold_text = segment.strip('*')
            if not bold_text:
                continue
            pdf.set_font(font_family, 'B', font_size)
            pdf.multi_cell(0, 5, bold_text)
            pdf.set_font(font_family, '', font_size)
        else:
            pdf.multi_cell(0, 5, segment)


def _draw_equal_width_table(pdf, table_data, line_height=6):
    """Draw table_data in Arial 10 with equal columns spanning the page width."""
    if not table_data or not table_data[0]:
        return
    pdf.set_font("Arial", size=10)
    page_width = pdf.w - 2 * pdf.l_margin
    num_cols = len(table_data[0])
    col_widths = [page_width / num_cols] * num_cols
    draw_measured_table(pdf, table_data, col_widths, line_height=line_height)


def draw_relevant_titles_table(pdf, relevant_titles):
    """
    Draw the 'Top 5 Relevant Titles' as a 2-column table (Title, Features).

    relevant_titles is a list of dicts with the keys "Title" and "Features".
    "Features" may be a list (joined with ", ") or a single string (used as
    is); a missing key is rendered as an empty cell. Does nothing when
    relevant_titles is empty.
    """
    if not relevant_titles:
        return

    table_data = [["Title", "Features"]]
    for item in relevant_titles:
        features = item.get("Features") or []
        if not isinstance(features, str):
            # Joining a plain string would split it into single characters.
            features = ", ".join(str(feature) for feature in features)
        table_data.append([str(item.get("Title") or ""), features])

    _draw_equal_width_table(pdf, table_data, line_height=6)


def preprocess_text(text):
    """
    Normalize Markdown-ish text before it is handed to basic_bold_parser:
      1) Turn heading lines starting with two to six '#' into '**heading**'.
      2) Replace a leading '* ' bullet marker with '--- '.
      3) Merge numbered headings split over two lines:
         "1.\\nTitle" -> "1. Title"
      4) Collapse runs of blank lines into a single blank line.
    """
    # 1) Consume the whole run of '#' so '### Foo' does not keep a stray '#'.
    text = re.sub(r'^#{2,6}\s*(.*)$', r'**\1**', text, flags=re.MULTILINE)

    # 2) Bullet lines.
    text = re.sub(r'^\*\s+', '--- ', text, flags=re.MULTILINE)

    # 3) e.g. "1.\nChrono Trigger" -> "1. Chrono Trigger"
    text = re.sub(r'(\d+)\.\s*\n\s*([A-Za-z])', r'\1. \2', text)

    # 4) Reduce extra blank lines.
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    return text


def _add_figure(pdf, image_path, caption, x, width):
    """Place an image with a centered caption, or a notice if it cannot be loaded."""
    try:
        pdf.image(image_path, x=x, w=width)
    except Exception:
        # Best effort: a missing or unreadable chart must not abort the report.
        logger.warning("Could not load %s", image_path, exc_info=True)
        pdf.multi_cell(0, 5, f"Could not load {image_path}")
        return
    pdf.ln(3)
    pdf.multi_cell(0, 10, caption, align='C')


def generate_pdf_report(remake_df, insights, output_path=DEFAULT_REPORT_PATH):
    """
    Generate the PDF report.

    remake_df:   sized collection of remakes/remasters; only len() is used.
    insights:    dict read via the keys "trends_summary",
                 "chrono_trigger_summary" and "relevant_titles".
    output_path: where the PDF is written.

    Report layout:
      1) Title and total count
      2) Trends summary: text before the table, the table, text after it
      3) Chrono Trigger recommendations
      4) Visualizations (classification_distribution.png and
         feature_distribution.png, read from the working directory)
      5) "Top 5 Relevant Titles" table

    Returns output_path on success. Errors raised while building or writing
    the report are logged and None is returned. A missing fpdf package
    raises ImportError.
    """
    # Imported here so the parsing helpers above stay importable without fpdf.
    from fpdf import FPDF

    try:
        logging.basicConfig(level=logging.INFO)

        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()

        # --- Title (slightly bigger font) ---
        pdf.set_font("Arial", size=16)
        pdf.cell(0, 10, txt="RPG Remake/Remaster Analysis Report", ln=True, align='C')
        pdf.ln(5)

        # --- Total Count (normal body font = 12) ---
        pdf.set_font("Arial", size=12)
        pdf.cell(0, 10, txt=f"Total RPG Remakes/Remasters: {len(remake_df)}", ln=True)

        # --- PART 1: Trends Summary + Table ---
        trends_summary = insights.get("trends_summary", "")
        text_before, table_data, text_after = parse_trends_text_and_table(trends_summary)

        if text_before.strip():
            basic_bold_parser(pdf, preprocess_text(text_before),
                              font_family="Arial", font_size=12)
            pdf.ln(3)

        if table_data:
            _draw_equal_width_table(pdf, table_data, line_height=6)
            pdf.ln(5)

        if text_after.strip():
            # Preprocessed like text_before so headings and bullets that
            # follow the table are rendered the same way.
            basic_bold_parser(pdf, preprocess_text(text_after),
                              font_family="Arial", font_size=12)

        # --- PART 2: Chrono Trigger Insights ---
        pdf.ln(10)
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Recommendations for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)

        pdf.set_font("Arial", size=12)
        chrono_trigger_text = insights.get("chrono_trigger_summary") or "N/A"
        basic_bold_parser(pdf, preprocess_text(chrono_trigger_text),
                          font_family="Arial", font_size=12)

        # --- PART 3: Visualizations ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Visualizations:", ln=True)
        pdf.ln(3)

        pdf.set_font("Arial", size=12)
        image_width = 120
        x_center = (pdf.w - image_width) / 2  # horizontally center the image

        _add_figure(
            pdf, 'classification_distribution.png',
            "Figure 1: Pie chart representing remake/remaster distribution",
            x_center, image_width,
        )
        pdf.ln(5)
        _add_figure(
            pdf, 'feature_distribution.png',
            "Figure 2: Bar chart showing key features in remakes",
            x_center, image_width,
        )

        # --- PART 4: Second Table - Top 5 Relevant Titles ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Top 5 Relevant Titles for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)

        relevant_titles = insights.get("relevant_titles", [])
        if relevant_titles:
            draw_relevant_titles_table(pdf, relevant_titles)
        else:
            pdf.set_font("Arial", size=12)
            pdf.multi_cell(0, 5, "No relevant titles found.")

        # --- Finish & Save ---
        pdf.output(output_path)
        logger.info("PDF report generated successfully as '%s'", output_path)
        return output_path

    except Exception as e:
        # Top-level boundary: report the failure (with traceback), don't raise.
        logger.exception("Error generating PDF report: %s", e)
        return None
Editor is loading...
Leave a Comment