Untitled

 avatar
unknown
plain_text
a month ago
22 kB
2
Indexable
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'exceptions'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[5], line 7\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m      6\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdocx\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menum\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m WD_PARAGRAPH_ALIGNMENT\n\u001b[0;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdocx\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mshared\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Pt, RGBColor\n\u001b[0;32m      9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mre\u001b[39;00m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\docx.py:30\u001b[0m\n\u001b[0;32m     27\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m     28\u001b[0m     TAGS \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m---> 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexceptions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;167;01mPendingDeprecationWarning\u001b[39;00m\n\u001b[0;32m     31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mwarnings\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m warn\n\u001b[0;32m     33\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlogging\u001b[39;00m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'exceptions'"
     ]
    }
   ],
   "source": [
    "# import modules\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import *\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "from docx.enum.text import WD_PARAGRAPH_ALIGNMENT\n",
    "from docx.shared import Pt, RGBColor\n",
    "import re\n",
    "import json\n",
    "import os\n",
    "from docx2pdf import convert\n",
    "import pdfplumber\n",
    "import docx\n",
    "import PyPDF2\n",
    "import fitz\n",
    "from PyPDF2 import PdfReader, PdfWriter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Useful Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define functions\n",
    "def convert_docx_to_pdf(docx_path, pdf_path):\n",
    "    \"\"\"\n",
    "    Convert a DOCX document to a PDF document while preserving the layout and page count.\n",
    "\n",
    "    :param docx_path: Path to the input DOCX file.\n",
    "    :param pdf_path: Path to the output PDF file.\n",
    "    \"\"\"\n",
    "    # Convert the DOCX file to PDF\n",
    "    convert(docx_path, pdf_path)\n",
    "\n",
    "def find_closest_header(lines, position):\n",
    "    \"\"\"\n",
    "    Find the closest header above the given position.\n",
    "    \"\"\"\n",
    "    closest_header = \"\"\n",
    "    min_distance = float('inf')\n",
    "    \n",
    "    for i, line in enumerate(lines):\n",
    "        # Get the y-coordinate of the line\n",
    "        line_position = line['top']\n",
    "        \n",
    "        # Check if the line is above the given position\n",
    "        if line_position < position:\n",
    "            distance = position - line_position\n",
    "            if distance < min_distance:\n",
    "                min_distance = distance\n",
    "                closest_header = line['text']\n",
    "    \n",
    "    return closest_header\n",
    "\n",
    "def extract_tables_and_images(pdf_path):\n",
    "    # Open the PDF file\n",
    "    with pdfplumber.open(pdf_path) as pdf:\n",
    "        # Initialize result dictionary\n",
    "        result = {\n",
    "            \"tables\": [],\n",
    "            \"images\": []\n",
    "        }\n",
    "        \n",
    "        # Iterate through each page\n",
    "        for page_num, page in enumerate(pdf.pages):\n",
    "            lines = page.extract_text_lines()\n",
    "            \n",
    "            # Extract tables\n",
    "            tables = page.find_tables()\n",
    "            for table_index, table in enumerate(tables):\n",
    "                # Find the closest header above the table\n",
    "                table_title = find_closest_header(lines, table.bbox[1])\n",
    "                result[\"tables\"].append({\n",
    "                    \"title\": table_title,\n",
    "                    \"page\": page_num + 1\n",
    "                })\n",
    "            \n",
    "            # Extract images\n",
    "            images = page.images\n",
    "            for img_index, img in enumerate(images):\n",
    "                # Find the closest header above the image\n",
    "                image_title = find_closest_header(lines, img['y1'])\n",
    "                result[\"images\"].append({\n",
    "                    \"title\": image_title,\n",
    "                    \"page\": page_num + 1\n",
    "                })\n",
    "    \n",
    "    return result\n",
    "\n",
    "def create_toc(table_pd, image_pd):\n",
    "    \"\"\"Create a Table of Contents (TOC) in a new DOCX file.\"\"\"\n",
    "    toc_doc = docx.Document()\n",
    "    heading = toc_doc.add_heading('LIST OF TABLES', level=1)\n",
    "    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER\n",
    "    run = heading.runs[0]\n",
    "    run.font.name = 'Times New Roman'\n",
    "    run.font.size = Pt(12)\n",
    "    run.font.color.rgb = RGBColor(0, 0, 0)\n",
    "    paragraph = toc_doc.add_paragraph()\n",
    "    paragraph.add_run('\\n')\n",
    "    # Add tables to TOC\n",
    "    page_heading = toc_doc.add_heading('PAGE', level=2)\n",
    "    page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT\n",
    "    run = page_heading.runs[0]\n",
    "    run.font.name = 'Times New Roman'\n",
    "    run.font.size = Pt(10)\n",
    "    run.font.color.rgb = RGBColor(0, 0, 0)\n",
    "    for i in range(len(table_pd)):\n",
    "        table_name = table_pd.loc[i, 'title']\n",
    "        page_number = table_pd.loc[i, 'page']\n",
    "        paragraph = toc_doc.add_paragraph()\n",
    "        run = paragraph.add_run(f\"{table_name}\")\n",
    "        run.font.name = 'Times New Roman'\n",
    "        run.font.size = Pt(10)\n",
    "        paragraph.add_run(\" \")\n",
    "        total_len = len(table_name)+len(str(page_number))+2\n",
    "        if total_len < 80:\n",
    "            dots = '.'*(80-total_len)+' '\n",
    "        elif total_len == 80:\n",
    "            dots = '.'*(160-total_len)+' '\n",
    "        else:\n",
    "            dots = '.'*(80-(total_len%80))+' '\n",
    "        paragraph.add_run(dots)\n",
    "        paragraph.add_run(f\" {page_number}\")\n",
    "\n",
    "    # Add images to TOC\n",
    "    toc_doc.add_page_break()\n",
    "    heading = toc_doc.add_heading('LIST OF FIGURES', level=1)\n",
    "    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER\n",
    "    run = heading.runs[0]\n",
    "    run.font.name = 'Times New Roman'\n",
    "    run.font.size = Pt(12)\n",
    "    run.font.color.rgb = RGBColor(0, 0, 0)\n",
    "    paragraph.add_run(\"\\n\")\n",
    "    page_heading = toc_doc.add_heading('PAGE', level=2)\n",
    "    page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT\n",
    "    run = page_heading.runs[0]\n",
    "    run.font.name = 'Times New Roman'\n",
    "    run.font.size = Pt(10)\n",
    "    run.font.color.rgb = RGBColor(0, 0, 0)\n",
    "    for i in range(len(image_pd)):\n",
    "        table_name = image_pd.loc[i, 'title']\n",
    "        page_number = image_pd.loc[i, 'page']\n",
    "        paragraph = toc_doc.add_paragraph()\n",
    "        run = paragraph.add_run(f\"{table_name}\")\n",
    "        run.font.name = 'Times New Roman'\n",
    "        run.font.size = Pt(10)\n",
    "        paragraph.add_run(\" \")\n",
    "        total_len = len(table_name)+len(str(page_number))+2\n",
    "        if total_len < 80:\n",
    "            dots = '.'*(80-total_len)+' '\n",
    "        elif total_len == 80:\n",
    "            dots = '.'*(160-total_len)+' '\n",
    "        else:\n",
    "            dots = '.'*(80-(total_len%80))+' '\n",
    "        paragraph.add_run(dots)\n",
    "        paragraph.add_run(f\" {page_number}\")\n",
    "    toc_doc.save('Table_of_Contents.docx')\n",
    "\n",
    "\n",
    "def count_docx_pages(docx_path):\n",
    "    # Convert DOCX to PDF\n",
    "    pdf_path = docx_path.replace('.docx', '.pdf')\n",
    "    convert(docx_path, pdf_path)\n",
    "    \n",
    "    # Count the number of pages in the PDF\n",
    "    with open(pdf_path, 'rb') as pdf_file:\n",
    "        pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
    "        num_pages = len(pdf_reader.pages)\n",
    "    \n",
    "    # Clean up the generated PDF file\n",
    "    os.remove(pdf_path)\n",
    "    \n",
    "    return num_pages\n",
    "\n",
    "def is_strictly_in(target, string):\n",
    "    return target in string\n",
    "\n",
    "def extract_toc_location(pdf_path, table_pd, image_pd):\n",
    "    # Convert DOCX to PDF\n",
    "    with pdfplumber.open(pdf_path) as pdf:\n",
    "        for page_num, page in enumerate(pdf.pages):\n",
    "            text = str(page.extract_text())\n",
    "            for j in range(len(table_pd)):\n",
    "                table_name = table_pd.loc[j, 'title']\n",
    "                x = is_strictly_in(table_name.replace(' ', ''), text.replace(' ', '').replace('\\n', ''))\n",
    "                if x:\n",
    "                    table_pd.loc[j, 'toc_loc'] = page_num\n",
    "        for page_num, page in enumerate(pdf.pages):\n",
    "            text = str(page.extract_text())\n",
    "            for k in range(len(image_pd)):\n",
    "                image_name = image_pd.loc[k, 'title']\n",
    "                y = is_strictly_in(image_name.replace(' ', ''), text.replace(' ', '').replace('\\n', ''))\n",
    "                if y:\n",
    "                    image_pd.loc[k, 'toc_loc'] = page_num\n",
    "    return table_pd, image_pd\n",
    "\n",
    "def delete_old_TOC(source_path, target_path, pages_to_delete):\n",
    "    # 打开源PDF文件\n",
    "    source_pdf = PdfReader(source_path)\n",
    "    # 创建一个PdfWriter对象,用于写入新的PDF文件\n",
    "    writer = PdfWriter()\n",
    "    # 将除了要删除的页面之外的所有页面添加到writer中\n",
    "    for i in range(len(source_pdf.pages)):\n",
    "        if i not in pages_to_delete:\n",
    "            writer.add_page(source_pdf.pages[i])\n",
    "    # 将合并后的PDF写入到新的文件中\n",
    "    with open(target_path, \"wb\") as output_pdf:\n",
    "        writer.write(output_pdf)\n",
    "\n",
    "def write_new_TOC(source_path1, insert_path, output_path, page_start):\n",
    "    # 创建PdfReader对象,用于读取PDF文件\n",
    "    source_reader = PdfReader(source_path1)\n",
    "    insert_reader = PdfReader(insert_path)\n",
    "\n",
    "    # 创建PdfWriter对象,用于写入PDF文件\n",
    "    writer = PdfWriter()\n",
    "\n",
    "    # 确定插入的位置\n",
    "    insert_index = page_start  # 假设您想要在源PDF的第三页(索引为2)后插入\n",
    "\n",
    "    # 将源PDF文件的页面添加到writer中,直到您想要插入的位置\n",
    "    for i in range(insert_index):\n",
    "        page = source_reader.pages[i]\n",
    "        writer.add_page(page)\n",
    "\n",
    "    # 将插入的PDF文件的所有页面添加到writer中\n",
    "    for page in insert_reader.pages:\n",
    "        writer.add_page(page)\n",
    "\n",
    "    # 继续添加源PDF文件的剩余页面\n",
    "    for i in range(insert_index, len(source_reader.pages)):\n",
    "        page = source_reader.pages[i]\n",
    "        writer.add_page(page)\n",
    "\n",
    "    # 将合并后的PDF写入到新的文件中\n",
    "    with open(output_path, \"wb\") as output_pdf:\n",
    "        writer.write(output_pdf)\n",
    "\n",
    "def add_hyperlink_to_pdf(input_pdf_path, output_pdf_path, table_pd, image_pd, TOC_gap, page_start):\n",
    "    # Open the input PDF\n",
    "    doc = fitz.open(input_pdf_path)\n",
    "    \n",
    "    # Ensure the document has at least 2 pages\n",
    "    if doc.page_count < 2:\n",
    "        raise ValueError(\"The PDF must have at least 2 pages.\")\n",
    "    for j in range(len(table_pd)):\n",
    "        table_name = '.. '+str(table_pd.loc[j, 'page'])\n",
    "        n1 = int(table_pd.loc[j, 'toc_loc'])+page_start\n",
    "        # Get the first page\n",
    "        first_page = doc[n1]\n",
    "        # Define the text and its position\n",
    "        text = table_name\n",
    "    \n",
    "    # Get the text rectangle\n",
    "        text_rect = first_page.search_for(text)[table_pd.loc[j, 'count']]\n",
    "    \n",
    "    # Add the hyperlink to the text\n",
    "        first_page.insert_link({\n",
    "            \"kind\": fitz.LINK_GOTO,  # Type of link\n",
    "            \"page\": int(table_pd.loc[j, 'page'])+ TOC_gap - 1,  # Page number to go to (0-based index, so last page)\n",
    "            \"from\": text_rect\n",
    "        })\n",
    "    for j in range(len(image_pd)):\n",
    "        table_name = '.. '+str(image_pd.loc[j, 'page'])\n",
    "        n1 = int(image_pd.loc[j, 'toc_loc'])+page_start\n",
    "        # Get the first page\n",
    "        first_page = doc[n1]\n",
    "    \n",
    "        # Define the text and its position\n",
    "        text = table_name\n",
    "    \n",
    "    # Get the text rectangle\n",
    "        text_rect = first_page.search_for(text)[image_pd.loc[j, 'count']]\n",
    "    \n",
    "    # Add the hyperlink to the text\n",
    "        first_page.insert_link({\n",
    "            \"kind\": fitz.LINK_GOTO,  # Type of link\n",
    "            \"page\": int(image_pd.loc[j, 'page'])+ TOC_gap - 1,  # Page number to go to (0-based index, so last page)\n",
    "            \"from\": text_rect\n",
    "        })\n",
    "    # Save the modified PDF to the output path\n",
    "    doc.save(output_pdf_path)\n",
    "    doc.close()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Workflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "abbd5a8b01fb4849a2590c25fa66b054",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The DOCX document has 180 pages.\n"
     ]
    }
   ],
   "source": [
    "# count raw document pages\n",
    "docx_path = 'test2.docx'\n",
    "num_pages = count_docx_pages(docx_path)\n",
    "print(f'The DOCX document has {num_pages} pages.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fea11e8ba42c4a538318c5092f528320",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# convert docx to pdf\n",
    "pdf_path = \"test3.pdf\"\n",
    "convert_docx_to_pdf(docx_path, pdf_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract tables and images for the document\n",
    "result = extract_tables_and_images(pdf_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get table list and image list\n",
    "table_pd = pd.DataFrame(result.get('tables'))\n",
    "image_pd = pd.DataFrame(result.get('images'))\n",
    "table_pd.loc[:,'toc_loc'] = 999\n",
    "image_pd.loc[:,'toc_loc'] = 999\n",
    "table_pd.loc[:, 'ok'] = 1\n",
    "image_pd.loc[:, 'ok'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(1, len(table_pd)):\n",
    "    target = table_pd.loc[i, 'title']\n",
    "    if target == 'Protocol Final':\n",
    "        table_pd.loc[i, 'ok'] = 0\n",
    "    if (table_pd.loc[i, 'title'] == table_pd.loc[i-1, 'title']):\n",
    "        table_pd.loc[i, 'ok'] = 0\n",
    "for i in range(1, len(image_pd)):\n",
    "    target = image_pd.loc[i, 'title']\n",
    "    if target == 'Protocol Final':\n",
    "        image_pd.loc[i, 'ok'] = 0\n",
    "    if (image_pd.loc[i, 'title'] == image_pd.loc[i-1, 'title']):\n",
    "        image_pd.loc[i, 'ok'] = 0\n",
    "table_pd = table_pd[table_pd['ok'] == 1]\n",
    "table_pd = table_pd.drop(['ok'], axis = 1)\n",
    "table_pd.index = range(len(table_pd))\n",
    "image_pd = image_pd[image_pd['ok'] == 1]\n",
    "image_pd = image_pd.drop(['ok'], axis = 1)\n",
    "image_pd.index = range(len(image_pd))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6425e7a36c0f4a69844919146b79b5d1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The DOCX document has 3 pages.\n"
     ]
    }
   ],
   "source": [
    "# create initial Table of Contents\n",
    "create_toc(table_pd, image_pd)\n",
    "num_pages = count_docx_pages('Table_of_Contents.docx')\n",
    "print(f'The DOCX document has {num_pages} pages.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4ccd8790a80d42ef87e9c2df0b09dad7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "docx_path = 'Table_of_Contents.docx'\n",
    "pdf_path = \"Table_of_Contents.pdf\"\n",
    "convert_docx_to_pdf(docx_path, pdf_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# modify the table of contents\n",
    "table_pd, image_pd = extract_toc_location(pdf_path, table_pd, image_pd)\n",
    "table_pd.loc[:, 'count'] = 0\n",
    "image_pd.loc[:, 'count'] = 0\n",
    "count1 = 0\n",
    "for i in range(1,len(table_pd)):\n",
    "    if table_pd.loc[i, 'page'] == table_pd.loc[i-1, 'page']:\n",
    "        count1 = count1+1\n",
    "        table_pd.loc[i, 'count'] = count1\n",
    "    else:\n",
    "        count1 = 0\n",
    "count2 = 0\n",
    "for i in range(1,len(image_pd)):\n",
    "    if image_pd.loc[i, 'page'] == image_pd.loc[i-1, 'page']:\n",
    "        count2 = count2+1\n",
    "        image_pd.loc[i, 'count'] = count2\n",
    "    else:\n",
    "        count1 = 0\n",
    "create_toc(table_pd, image_pd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e6cde854c8dc4171871a2d366bc2e23e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# convert Table of Contents to PDF and merge all the PDFs\n",
    "docx_path = 'Table_of_Contents.docx'\n",
    "pdf_path = 'Table_of_Contents.pdf'\n",
    "convert_docx_to_pdf(docx_path, pdf_path)\n",
    "pages_to_delete = [9,10,11]\n",
    "old_len = len(pages_to_delete)\n",
    "delete_old_TOC('test3.pdf', 'test4.pdf', pages_to_delete)\n",
    "write_new_TOC('test4.pdf', pdf_path, 'test5.pdf', 9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "TOC_gap = num_pages-old_len\n",
    "page_start = 9\n",
    "# create final PDF\n",
    "input_pdf_path = 'test5.pdf'\n",
    "output_pdf_path = 'final_result0107.pdf'\n",
    "add_hyperlink_to_pdf(input_pdf_path, output_pdf_path, table_pd, image_pd, TOC_gap, page_start)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Editor is loading...
Leave a Comment