Untitled
unknown
plain_text
a month ago
22 kB
2
Indexable
# %% [markdown]
# # Rebuild the "List of Tables" / "List of Figures" of a DOCX report
#
# Pipeline implemented below:
#
# 1. Convert the source DOCX to PDF.
# 2. Detect every table and image in the PDF and take the closest text line
#    above each one as its title.
# 3. Write a new list of tables / list of figures to a DOCX and convert it to PDF.
# 4. Remove the old list pages from the source PDF and insert the new ones.
# 5. Add GOTO links from each list entry to the page of its table / figure.

# %%
# Imports (stdlib -> third-party).
import json
import os
import re
import tempfile
import warnings
from pathlib import Path

# Kept ahead of the third-party imports, as in the original cell, so that
# import-time warnings are silenced as well.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm

# `docx` must come from the *python-docx* distribution. The legacy `docx`
# distribution installs a single Python-2-only `docx.py` that fails with
# "ModuleNotFoundError: No module named 'exceptions'" on import.
import docx
import fitz
import pdfplumber
import PyPDF2
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, RGBColor
from docx2pdf import convert
from PyPDF2 import PdfReader, PdfWriter

# %%
# Configuration: everything a reader might need to change for another document.
SOURCE_DOCX = 'test2.docx'
SOURCE_PDF = 'test3.pdf'
TOC_DOCX = 'Table_of_Contents.docx'
TOC_PDF = 'Table_of_Contents.pdf'
STRIPPED_PDF = 'test4.pdf'           # source PDF with the old list pages removed
MERGED_PDF = 'test5.pdf'             # stripped PDF with the new list pages inserted
FINAL_PDF = 'final_result0107.pdf'   # merged PDF with hyperlinks added

OLD_TOC_PAGES = [9, 10, 11]          # 0-based page indices removed from SOURCE_PDF
TOC_INSERT_INDEX = 9                 # 0-based index at which the new list pages start

# Titles that are never real captions.
# NOTE(review): 'Protocol Final' looks like a running header/footer - confirm.
IGNORED_TITLES = ('Protocol Final',)

TOC_PAGE_NOT_FOUND = 999             # sentinel for "entry not located in the TOC PDF"
TOC_FONT = 'Times New Roman'
TOC_LINE_WIDTH = 80                  # characters per TOC line used to size the dot leader

# %% [markdown]
# # Useful Functions

# %%
def convert_docx_to_pdf(docx_path, pdf_path):
    """
    Convert a DOCX document to PDF with ``docx2pdf.convert``.

    :param docx_path: Path to the input DOCX file.
    :param pdf_path: Path to the output PDF file.
    """
    convert(docx_path, pdf_path)


def count_pdf_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``."""
    with open(pdf_path, 'rb') as pdf_file:
        return len(PdfReader(pdf_file).pages)


def count_docx_pages(docx_path):
    """
    Return the page count of a DOCX file by converting it to PDF.

    The PDF is written into a temporary directory. The original implementation
    wrote ``docx_path.replace('.docx', '.pdf')`` next to the source and then
    deleted it, which clobbered any existing PDF of that name and, for a path
    without a lowercase ``.docx``, overwrote and deleted the source itself.

    :param docx_path: Path to the DOCX file.
    :return: Number of pages of the converted PDF.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, Path(docx_path).stem + '.pdf')
        convert(docx_path, pdf_path)
        return count_pdf_pages(pdf_path)


def find_closest_header(lines, position):
    """
    Return the text of the lowest line that lies strictly above ``position``.

    :param lines: Iterable of dicts with ``'top'`` (distance from the top of
        the page) and ``'text'`` keys.
    :param position: Vertical position, measured from the top of the page.
    :return: The line's text, or ``""`` when no line is above ``position``.
    """
    above = [line for line in lines if line['top'] < position]
    if not above:
        return ""
    # Largest 'top' below the threshold == smallest distance to `position`;
    # on ties max() keeps the first line, like the original strict comparison.
    return max(above, key=lambda line: line['top'])['text']


def extract_tables_and_images(pdf_path):
    """
    Locate every table and image in a PDF and guess a title for each.

    The title is the closest text line above the object on the same page.

    :param pdf_path: Path to the PDF file.
    :return: ``{"tables": [...], "images": [...]}`` where every item is
        ``{"title": str, "page": int}`` with 1-based page numbers.
    """
    result = {"tables": [], "images": []}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            lines = page.extract_text_lines()
            page_number = page_num + 1

            for table in page.find_tables():
                # bbox is (x0, top, x1, bottom); index 1 is the top edge.
                result["tables"].append({
                    "title": find_closest_header(lines, table.bbox[1]),
                    "page": page_number,
                })

            for img in page.images:
                # Use 'top' (measured from the top of the page, like the text
                # lines). 'y1' is measured from the bottom of the page and
                # cannot be compared with line['top'].
                result["images"].append({
                    "title": find_closest_header(lines, img['top']),
                    "page": page_number,
                })
    return result


def _dot_leader(title, page_number, width=TOC_LINE_WIDTH):
    """Return the run of dots (plus a trailing space) padding a TOC line to ``width``."""
    total_len = len(title) + len(str(page_number)) + 2
    # Pads to the next multiple of `width`; an exact multiple gets a full row.
    return '.' * (width - total_len % width) + ' '


def _add_toc_heading(document, text, level, size, alignment):
    """Add a black heading in the TOC font with the given level, size and alignment."""
    heading = document.add_heading(text, level=level)
    heading.alignment = alignment
    run = heading.runs[0]
    run.font.name = TOC_FONT
    run.font.size = Pt(size)
    run.font.color.rgb = RGBColor(0, 0, 0)


def _add_toc_section(document, section_title, entries):
    """Add a section heading, a 'PAGE' column heading and one line per entry."""
    _add_toc_heading(document, section_title, 1, 12, WD_PARAGRAPH_ALIGNMENT.CENTER)
    # Spacer paragraph of its own. The original figures section appended this
    # newline to the last *table* entry because it reused a stale variable.
    document.add_paragraph().add_run('\n')
    _add_toc_heading(document, 'PAGE', 2, 10, WD_PARAGRAPH_ALIGNMENT.RIGHT)

    if entries.empty:
        return
    for title, page_number in zip(entries['title'], entries['page']):
        title = str(title)
        paragraph = document.add_paragraph()
        run = paragraph.add_run(title)
        run.font.name = TOC_FONT
        run.font.size = Pt(10)
        paragraph.add_run(" ")
        paragraph.add_run(_dot_leader(title, page_number))
        paragraph.add_run(f" {page_number}")


def create_toc(table_pd, image_pd, output_path='Table_of_Contents.docx'):
    """
    Write a 'LIST OF TABLES' and a 'LIST OF FIGURES' to a new DOCX file.

    :param table_pd: DataFrame with ``'title'`` and ``'page'`` columns.
    :param image_pd: DataFrame with ``'title'`` and ``'page'`` columns.
    :param output_path: Where to save the DOCX.
    """
    toc_doc = docx.Document()
    _add_toc_section(toc_doc, 'LIST OF TABLES', table_pd)
    toc_doc.add_page_break()
    _add_toc_section(toc_doc, 'LIST OF FIGURES', image_pd)
    toc_doc.save(output_path)


def is_strictly_in(target, string):
    """Return True when ``target`` is a substring of ``string``."""
    return target in string


def extract_toc_location(pdf_path, table_pd, image_pd):
    """
    Record, for every entry, the TOC PDF page on which its title appears.

    Spaces and newlines are ignored when matching. When a title occurs on
    several pages the last one wins. Both frames are modified in place
    (column ``'toc_loc'``, 0-based page index) and returned.

    :param pdf_path: Path to the TOC PDF.
    :param table_pd: DataFrame with a ``'title'`` column.
    :param image_pd: DataFrame with a ``'title'`` column.
    :return: ``(table_pd, image_pd)``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # Extract and normalise each page's text once for both frames.
            page_text = str(page.extract_text()).replace(' ', '').replace('\n', '')
            for entries in (table_pd, image_pd):
                if entries.empty:
                    continue
                for label, title in list(entries['title'].items()):
                    if is_strictly_in(title.replace(' ', ''), page_text):
                        entries.loc[label, 'toc_loc'] = page_num
    return table_pd, image_pd


def delete_old_TOC(source_path, target_path, pages_to_delete):
    """
    Copy a PDF while leaving out the given pages.

    :param source_path: Path to the input PDF.
    :param target_path: Path to the output PDF.
    :param pages_to_delete: Iterable of 0-based page indices to drop.
    """
    skipped = set(pages_to_delete)
    source_pdf = PdfReader(source_path)
    writer = PdfWriter()
    for index, page in enumerate(source_pdf.pages):
        if index not in skipped:
            writer.add_page(page)
    with open(target_path, "wb") as output_pdf:
        writer.write(output_pdf)


def write_new_TOC(source_path1, insert_path, output_path, page_start):
    """
    Insert all pages of one PDF into another at a given position.

    :param source_path1: Path to the PDF receiving the pages.
    :param insert_path: Path to the PDF whose pages are inserted.
    :param output_path: Path to the merged output PDF.
    :param page_start: 0-based index in the source at which the inserted
        pages start; ``len(source)`` appends them.
    :raises IndexError: If ``page_start`` is outside ``0..len(source)``.
    """
    source_reader = PdfReader(source_path1)
    insert_reader = PdfReader(insert_path)
    source_total = len(source_reader.pages)
    if not 0 <= page_start <= source_total:
        raise IndexError(
            f"page_start {page_start} is outside the source PDF (0..{source_total})"
        )

    writer = PdfWriter()
    # Source pages before the insertion point.
    for i in range(page_start):
        writer.add_page(source_reader.pages[i])
    # All pages of the inserted PDF.
    for page in insert_reader.pages:
        writer.add_page(page)
    # Remaining source pages.
    for i in range(page_start, source_total):
        writer.add_page(source_reader.pages[i])

    with open(output_path, "wb") as output_pdf:
        writer.write(output_pdf)


def _link_entries(doc, entries, TOC_gap, page_start):
    """Add one GOTO link per row of ``entries`` to the open document ``doc``."""
    if entries.empty:
        return
    for page_number, toc_loc, hit_index in zip(
        entries['page'], entries['toc_loc'], entries['count']
    ):
        toc_page_index = int(toc_loc) + page_start
        if not 0 <= toc_page_index < doc.page_count:
            raise IndexError(
                f"TOC page index {toc_page_index} for the entry on page "
                f"{page_number} is outside the document"
            )
        toc_page = doc[toc_page_index]

        # The clickable area is the tail of the dot leader plus the page number.
        # NOTE(review): '.. 1' is also a substring of '.. 12', so entries whose
        # page numbers share a prefix on one TOC page can shift the hit index.
        hits = toc_page.search_for('.. ' + str(page_number))
        hit_index = int(hit_index)
        if hit_index >= len(hits):
            raise IndexError(
                f"occurrence {hit_index} of page number {page_number} "
                f"not found on TOC page index {toc_page_index}"
            )

        toc_page.insert_link({
            "kind": fitz.LINK_GOTO,
            # Entry pages are 1-based; link targets are 0-based.
            "page": int(page_number) + TOC_gap - 1,
            "from": hits[hit_index],
        })


def add_hyperlink_to_pdf(input_pdf_path, output_pdf_path, table_pd, image_pd, TOC_gap, page_start):
    """
    Turn the page numbers of the TOC entries into links to their pages.

    :param input_pdf_path: PDF that already contains the new TOC pages.
    :param output_pdf_path: Where to save the linked PDF.
    :param table_pd: DataFrame with ``'page'`` (1-based page of the table),
        ``'toc_loc'`` (0-based page inside the TOC) and ``'count'`` (which
        occurrence of the page number on that TOC page belongs to the entry).
    :param image_pd: Same columns, for figures.
    :param TOC_gap: Offset added to every target page number.
    :param page_start: 0-based index of the first TOC page in the input PDF.
    :raises ValueError: If the PDF has fewer than 2 pages.
    :raises IndexError: If a TOC page or a page-number occurrence is missing.
    """
    doc = fitz.open(input_pdf_path)
    try:
        if doc.page_count < 2:
            raise ValueError("The PDF must have at least 2 pages.")
        _link_entries(doc, table_pd, TOC_gap, page_start)
        _link_entries(doc, image_pd, TOC_gap, page_start)
        doc.save(output_pdf_path)
    finally:
        # Release the file handle even when a lookup fails.
        doc.close()


def drop_unwanted_entries(entries, ignored_titles=IGNORED_TITLES):
    """
    Return ``entries`` without ignored titles and without consecutive repeats.

    A row is dropped when its title is in ``ignored_titles`` or equals the
    title of the row directly before it. Unlike the original loop, which
    started at row 1, the first row is checked against ``ignored_titles`` too.

    :param entries: DataFrame with a ``'title'`` column.
    :return: New DataFrame with a fresh 0..n-1 index.
    """
    if entries.empty:
        return entries.reset_index(drop=True)
    titles = entries['title']
    keep = ~titles.isin(ignored_titles) & titles.ne(titles.shift())
    return entries.loc[keep].reset_index(drop=True)


def number_same_page_entries(entries):
    """
    Number each entry within its run of consecutive entries on the same page.

    The first entry of a run gets 0, the next 1, and so on. The original image
    loop reset the wrong counter (``count1``), so its numbering never restarted.

    :param entries: DataFrame with a ``'page'`` column.
    :return: Integer Series aligned with ``entries``.
    """
    pages = entries['page']
    run_id = pages.ne(pages.shift()).cumsum()
    return entries.groupby(run_id).cumcount()

# %% [markdown]
# # Workflow

# %%
# Convert the source document once and count the pages of the resulting PDF.
convert_docx_to_pdf(SOURCE_DOCX, SOURCE_PDF)
source_page_count = count_pdf_pages(SOURCE_PDF)
print(f'The DOCX document has {source_page_count} pages.')

# %%
# Extract tables and images from the document.
extracted = extract_tables_and_images(SOURCE_PDF)

# %%
# Build the table list and the image list.
table_entries = drop_unwanted_entries(
    pd.DataFrame(extracted['tables'], columns=['title', 'page'])
)
image_entries = drop_unwanted_entries(
    pd.DataFrame(extracted['images'], columns=['title', 'page'])
)

# %%
# Create the new lists and convert them once.
create_toc(table_entries, image_entries, TOC_DOCX)
convert_docx_to_pdf(TOC_DOCX, TOC_PDF)
toc_page_count = count_pdf_pages(TOC_PDF)
print(f'The DOCX document has {toc_page_count} pages.')

# %%
# Locate every entry inside the TOC PDF. The lists are not regenerated here:
# create_toc only reads 'title' and 'page', so a second call would write an
# identical file.
table_links, image_links = extract_toc_location(
    TOC_PDF,
    table_entries.assign(
        toc_loc=TOC_PAGE_NOT_FOUND,
        count=number_same_page_entries(table_entries),
    ),
    image_entries.assign(
        toc_loc=TOC_PAGE_NOT_FOUND,
        count=number_same_page_entries(image_entries),
    ),
)

# %%
# Replace the old list pages with the new ones.
delete_old_TOC(SOURCE_PDF, STRIPPED_PDF, OLD_TOC_PAGES)
write_new_TOC(STRIPPED_PDF, TOC_PDF, MERGED_PDF, TOC_INSERT_INDEX)

# %%
# Create the final PDF. Pages after the lists move by the difference between
# the new and the old list lengths.
toc_gap = toc_page_count - len(OLD_TOC_PAGES)
add_hyperlink_to_pdf(
    MERGED_PDF, FINAL_PDF, table_links, image_links, toc_gap, TOC_INSERT_INDEX
)
Editor is loading...
Leave a Comment