# Untitled
#
# avatar
# unknown
# plain_text
# 18 days ago
# 12 kB
# 3
# Indexable
import re
import logging
from fpdf import FPDF


def parse_trends_text_and_table(md_string):
    """
    Split a Markdown-like string around its first table.

    A "table line" is any line that contains a '|' character; the table is
    the first unbroken run of such lines.

    Returns a 3-tuple (text_before, table_data, text_after):
      - text_before: stripped text that precedes the table
      - table_data:  rows returned by extract_markdown_table() for the run
      - text_after:  stripped text that follows the table
    If no line contains '|', the input is returned unchanged as text_before,
    together with an empty list and an empty string.
    """
    all_lines = md_string.split("\n")
    has_pipe = ["|" in ln for ln in all_lines]

    # No table at all: hand the original string back untouched.
    if True not in has_pipe:
        return md_string, [], ""

    first = has_pipe.index(True)

    # The table stops at the first pipe-less line after it started;
    # if there is none, it runs to the end of the input.
    try:
        stop = has_pipe.index(False, first + 1)
    except ValueError:
        stop = len(all_lines)

    before = "\n".join(all_lines[:first]).strip()
    after = "\n".join(all_lines[stop:]).strip()
    rows = extract_markdown_table("\n".join(all_lines[first:stop]))

    return before, rows, after


def extract_markdown_table(md_string):
    """
    Extract the rows of a Markdown table from a string.

    Every line containing '|' is split into cells on that character and each
    cell is whitespace-stripped. Delimiter rows (e.g. ``|---|``,
    ``| --- | :---: |``) are skipped, as are rows whose cells are all empty.

    Empty cells inside a row are preserved as '' so that the remaining cells
    stay in their own columns; only the empty fragments produced by a leading
    or trailing '|' are removed.

    Returns a list of lists (rows -> columns).
    """
    # A delimiter row: every cell consists solely of dashes, optionally
    # wrapped in alignment colons. Matching the WHOLE line avoids discarding
    # data rows that merely start with a blank or '-' cell.
    separator_row = re.compile(r'\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?')

    table_data = []
    for line in md_string.split('\n'):
        stripped = line.strip()
        if '|' not in stripped or separator_row.fullmatch(stripped):
            continue

        cells = [cell.strip() for cell in stripped.split('|')]
        # Drop only the artefacts of the outer pipes, never interior cells.
        if stripped.startswith('|'):
            cells = cells[1:]
        if stripped.endswith('|') and cells:
            cells = cells[:-1]

        # Skip rows that carry no content at all (e.g. a lone '|').
        if any(cells):
            table_data.append(cells)
    return table_data


def draw_measured_table(pdf, table_data, col_widths, line_height=5):
    """
    Draw a table whose cells may wrap onto several lines.

    Works in two passes:
      1) MEASURE - ask ``pdf.multi_cell(..., split_only=True)`` for the
         wrapped lines of every cell and record the tallest cell per row.
      2) PRINT   - draw a bounding rectangle and the text for every cell,
         starting a new page first when the row would cross
         ``pdf.page_break_trigger``.

    Parameters:
      pdf         - FPDF-like object providing multi_cell, rect, get_x,
                    get_y, set_xy, add_page, l_margin and page_break_trigger.
      table_data  - list of rows; the first row fixes the column count.
      col_widths  - one width per column (indexed by column position, so it
                    must have at least as many entries as the first row).
      line_height - height of a single text line inside a cell.

    Rows that are shorter than the first row are padded with empty cells and
    longer rows are truncated, so ragged input no longer raises IndexError.
    Nothing is drawn when table_data is empty.
    """
    if not table_data:
        return

    num_cols = len(table_data[0])

    # Normalise every row to exactly num_cols cells.
    rows = [(list(row) + [""] * num_cols)[:num_cols] for row in table_data]

    # ---------- PASS 1: MEASURE ----------
    row_max_lines = []
    for row in rows:
        max_lines_in_row = 1
        for col_i, text in enumerate(row):
            lines = pdf.multi_cell(col_widths[col_i], line_height, text,
                                   border=0, align='L', split_only=True)
            max_lines_in_row = max(max_lines_in_row, len(lines))
        row_max_lines.append(max_lines_in_row)

    # ---------- PASS 2: PRINT (with page-break handling) ----------
    for row, line_count in zip(rows, row_max_lines):
        row_height = line_count * line_height

        x_start = pdf.get_x()
        y_start = pdf.get_y()

        # Start a fresh page when the whole row would not fit on this one.
        if y_start + row_height > pdf.page_break_trigger:
            pdf.add_page()
            x_start = pdf.l_margin
            y_start = pdf.get_y()
            pdf.set_xy(x_start, y_start)

        for col_i, text in enumerate(row):
            cell_x = pdf.get_x()
            cell_y = pdf.get_y()

            # The border is drawn at full row height so that all cells of a
            # row line up even when their text wraps differently.
            pdf.rect(cell_x, cell_y, col_widths[col_i], row_height)
            pdf.multi_cell(col_widths[col_i], line_height, text, border=0, align='L')

            # multi_cell moves the cursor; put it at the top of the next column.
            pdf.set_xy(cell_x + col_widths[col_i], cell_y)

        # Move down to the start of the next row.
        pdf.set_xy(x_start, y_start + row_height)


def fix_smart_chars(text):
    """
    Map typographic ("smart") punctuation to plain ASCII.

    En and em dashes become '-', curly single quotes become "'" and curly
    double quotes become '"'. Every other character passes through unchanged.
    """
    ascii_for = str.maketrans({
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2018": "'",  # left single quote
        "\u2019": "'",  # right single quote
        "\u201C": '"',  # left double quote
        "\u201D": '"',  # right double quote
    })
    return text.translate(ascii_for)


def basic_bold_parser(pdf, text, font_family="Arial", font_size=12):
    """
    Print text to the PDF, rendering '**bold**' spans in bold.

    The text is first passed through fix_smart_chars(), then split on
    '**...**' spans. Each piece is written with its own
    ``pdf.multi_cell(0, 5, ...)`` call: bold spans with the 'B' style (and
    without their asterisks), everything else with the regular style.

    Parameters:
      pdf         - FPDF-like object providing set_font and multi_cell.
      text        - the Markdown-ish text to print.
      font_family - font family used for both regular and bold pieces.
      font_size   - font size used for both regular and bold pieces
                    (defaults to 12 to match the Chrono Trigger section).
    """
    # Replace typographic punctuation with ASCII equivalents first.
    text = fix_smart_chars(text)

    segments = re.split(r'(\*\*[^*]+\*\*)', text)
    pdf.set_font(font_family, '', font_size)

    for segment in segments:
        # re.split yields '' when the text starts or ends with a bold span,
        # or when two bold spans are adjacent. Those pieces carry no text,
        # so printing them would only add an empty cell to the page.
        if not segment:
            continue

        if segment.startswith('**') and segment.endswith('**'):
            bold_text = segment.strip('*')
            pdf.set_font(font_family, 'B', font_size)
            pdf.multi_cell(0, 5, bold_text)
            # Restore the regular style for whatever follows.
            pdf.set_font(font_family, '', font_size)
        else:
            pdf.multi_cell(0, 5, segment)



def draw_relevant_titles_table(pdf, relevant_titles):
    """
    Render the relevant-titles list as a two-column table.

    Each item of relevant_titles is indexed with the keys "Title" and
    "Features"; the features are joined with ", " into a single cell.
    A header row ("Title", "Features") is placed on top and the table is
    drawn with draw_measured_table() using two equally wide columns that
    together span the page width minus twice the left margin.

    Does nothing when relevant_titles is empty.
    """
    if not relevant_titles:
        return

    header = ["Title", "Features"]
    body_rows = [
        [entry["Title"], ", ".join(entry["Features"])]
        for entry in relevant_titles
    ]
    table_data = [header, *body_rows]

    # Font is set before drawing so the table is measured and printed at size 10.
    pdf.set_font("Arial", size=10)

    usable_width = pdf.w - 2 * pdf.l_margin
    column_count = len(header)
    col_widths = [usable_width / column_count] * column_count

    draw_measured_table(pdf, table_data, col_widths, line_height=6)

def preprocess_text(text):
    """
    Pre-process Markdown-ish text before it is handed to the bold parser.

    Steps, applied in this order:
      1) Heading lines starting with two to six '#' characters
         ('## Title', '### Title', ...) become '**Title**'. Single-'#'
         lines are left untouched.
      2) Bullet lines starting with '* ' have the marker replaced by '--- '.
      3) Numbered headings broken across lines are merged:
         "1.\\nTitle" -> "1. Title"
      4) Runs of blank (or whitespace-only) lines collapse to one blank line.

    Returns the transformed string.
    """
    # 1) Headings. All leading hashes are consumed so that '### Title' does
    #    not leave a stray '#' inside the bold span; [ \t]* keeps the match
    #    on a single line.
    text = re.sub(r'^#{2,6}[ \t]*(.*)$', r'**\1**', text, flags=re.MULTILINE)

    # 2) Bullets: '* item' -> '--- item'. The whitespace requirement keeps
    #    '**bold**' lines (including those produced by step 1) intact.
    text = re.sub(r'^\*\s+', '--- ', text, flags=re.MULTILINE)

    # 3) Merge numbered list items that are broken across lines:
    #    e.g., "1.\nChrono Trigger" -> "1. Chrono Trigger"
    text = re.sub(r'(\d+)\.\s*\n\s*([A-Za-z])', r'\1. \2', text)

    # 4) Reduce extra blank lines
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    return text



def _add_figure(pdf, image_path, caption, x, width):
    """Place one image with a centered caption, or a fallback note if that fails."""
    try:
        pdf.image(image_path, x=x, w=width)
        pdf.ln(3)
        pdf.multi_cell(0, 10, caption, align='C')
    except Exception as exc:
        # Best effort: a missing or unreadable image must not abort the
        # report, but the reason is logged instead of being swallowed.
        logging.warning("Could not load %s: %s", image_path, exc)
        pdf.multi_cell(0, 5, f"Could not load {image_path}")


def generate_pdf_report(remake_df, insights):
    """
    Generate 'RPG_Remake_Report.pdf' in the current working directory.

    Parameters:
      remake_df - only ``len(remake_df)`` is used, for the total count line.
      insights  - mapping read with ``.get()`` for the keys
                  "trends_summary", "chrono_trigger_summary" and
                  "relevant_titles".

    Report layout:
      1) Title and total count
      2) Trends summary: text before the first table, the table itself,
         then the text after it (both texts go through preprocess_text and
         the bold parser)
      3) Chrono Trigger recommendations
      4) Visualizations: two PNG figures, each replaced by a short note when
         it cannot be loaded
      5) "Top 5 Relevant Titles" table

    Any exception is logged (with traceback) and not re-raised, so the
    function always returns None.
    """
    try:
        logging.basicConfig(level=logging.INFO)

        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()

        # --- Title (slightly bigger font) ---
        pdf.set_font("Arial", size=16)
        pdf.cell(0, 10, txt="RPG Remake/Remaster Analysis Report", ln=True, align='C')
        pdf.ln(5)

        # --- Total Count (normal body font = 12) ---
        pdf.set_font("Arial", size=12)
        pdf.cell(0, 10, txt=f"Total RPG Remakes/Remasters: {len(remake_df)}", ln=True)

        # --- PART 1: Trends Summary + Table ---
        # 'or ""' also covers a key that is present but set to None.
        trends_summary = insights.get("trends_summary", "") or ""
        text_before, table_data, text_after = parse_trends_text_and_table(trends_summary)

        # 1. Text before the table, so headings like '**I. Common Features**'
        #    become bold consistently.
        if text_before.strip():
            text_before_pp = preprocess_text(text_before)
            basic_bold_parser(pdf, text_before_pp, font_family="Arial", font_size=12)
            pdf.ln(3)

        # 2. The table, in equally wide columns spanning the printable width.
        if table_data:
            pdf.set_font("Arial", size=10)
            page_width = pdf.w - 2 * pdf.l_margin
            num_cols = len(table_data[0])
            col_widths = [page_width / num_cols] * num_cols
            draw_measured_table(pdf, table_data, col_widths, line_height=6)
            pdf.ln(5)

        # 3. Text after the table. It gets the same pre-processing as the
        #    other text sections so headings and bullets render the same way.
        if text_after.strip():
            text_after_pp = preprocess_text(text_after)
            basic_bold_parser(pdf, text_after_pp, font_family="Arial", font_size=12)

        # --- PART 2: Chrono Trigger Insights ---
        pdf.ln(10)
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Recommendations for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)

        pdf.set_font("Arial", size=12)
        chrono_trigger_text = insights.get("chrono_trigger_summary", "N/A") or "N/A"
        chrono_trigger_text_pp = preprocess_text(chrono_trigger_text)
        basic_bold_parser(pdf, chrono_trigger_text_pp, font_family="Arial", font_size=12)

        # --- PART 3: Visualizations ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Visualizations:", ln=True)
        pdf.ln(3)
        pdf.set_font("Arial", size=12)

        image_width = 120
        x_center = (pdf.w - image_width) / 2  # left edge that centers the image

        _add_figure(
            pdf,
            'classification_distribution.png',
            "Figure 1: Pie chart representing remake/remaster distribution",
            x_center,
            image_width,
        )

        pdf.ln(5)

        _add_figure(
            pdf,
            'feature_distribution.png',
            "Figure 2: Bar chart showing key features in remakes",
            x_center,
            image_width,
        )

        # --- PART 4: Second Table - Top 5 Relevant Titles ---
        pdf.add_page()
        pdf.set_font("Arial", size=14)
        pdf.cell(0, 10, txt="Top 5 Relevant Titles for Chrono Trigger Remake:", ln=True)
        pdf.ln(3)

        relevant_titles = insights.get("relevant_titles", [])
        if relevant_titles:
            draw_relevant_titles_table(pdf, relevant_titles)
        else:
            pdf.set_font("Arial", size=12)
            pdf.multi_cell(0, 5, "No relevant titles found.")

        # --- Finish & Save ---
        pdf.output("RPG_Remake_Report.pdf")
        logging.info("PDF report generated successfully as 'RPG_Remake_Report.pdf'")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        logging.exception("Error generating PDF report: %s", e)
# Leave a Comment