Untitled
unknown
plain_text
a year ago
22 kB
6
Indexable
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'exceptions'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m 6\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdocx\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01menum\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m WD_PARAGRAPH_ALIGNMENT\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdocx\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mshared\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Pt, RGBColor\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mre\u001b[39;00m\n",
"File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\docx.py:30\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m 28\u001b[0m TAGS \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m---> 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexceptions\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;167;01mPendingDeprecationWarning\u001b[39;00m\n\u001b[0;32m 31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mwarnings\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m warn\n\u001b[0;32m 33\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlogging\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'exceptions'"
]
}
],
"source": [
"# import modules\n",
"import numpy as np\n",
"import pandas as pd\n",
"from tqdm import *\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from docx.enum.text import WD_PARAGRAPH_ALIGNMENT\n",
"from docx.shared import Pt, RGBColor\n",
"import re\n",
"import json\n",
"import os\n",
"from docx2pdf import convert\n",
"import pdfplumber\n",
"import docx\n",
"import PyPDF2\n",
"import fitz\n",
"from PyPDF2 import PdfReader, PdfWriter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Useful Functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# define functions\n",
"def convert_docx_to_pdf(docx_path, pdf_path):\n",
"    \"\"\"\n",
"    Render a DOCX file as a PDF by handing both paths to docx2pdf's ``convert``.\n",
"\n",
"    :param docx_path: Location of the DOCX file to read.\n",
"    :param pdf_path: Location where the PDF is written.\n",
"    \"\"\"\n",
"    source_file, target_file = docx_path, pdf_path\n",
"    convert(source_file, target_file)\n",
"\n",
"def find_closest_header(lines, position):\n",
"    \"\"\"\n",
"    Return the text of the line that sits nearest above ``position``.\n",
"\n",
"    Every element of ``lines`` is read through its ``'top'`` and ``'text'`` keys.\n",
"    Only lines whose ``'top'`` is strictly smaller than ``position`` compete; the\n",
"    smallest gap wins, the earliest of several equally close lines is kept, and\n",
"    ``\"\"`` is returned when no line qualifies.\n",
"    \"\"\"\n",
"    best_text = \"\"\n",
"    best_gap = float('inf')\n",
"\n",
"    for entry in lines:\n",
"        top = entry['top']\n",
"        # Lines at or below the reference position are never candidates.\n",
"        if not top < position:\n",
"            continue\n",
"        gap = position - top\n",
"        if gap < best_gap:\n",
"            best_gap, best_text = gap, entry['text']\n",
"\n",
"    return best_text\n",
"\n",
"def extract_tables_and_images(pdf_path):\n",
"    \"\"\"\n",
"    Collect a caption guess and a page number for every table and image in a PDF.\n",
"\n",
"    The text lines of each page are extracted once; every table and image found\n",
"    on that page is then paired with the text of the nearest line above it\n",
"    (via ``find_closest_header``).\n",
"\n",
"    :param pdf_path: Path of the PDF to scan with pdfplumber.\n",
"    :return: ``{\"tables\": [...], \"images\": [...]}``; each entry is a dict with\n",
"        ``\"title\"`` (text of the nearest line above, ``\"\"`` if there is none)\n",
"        and ``\"page\"`` (1-based page number).\n",
"    \"\"\"\n",
"    result = {\n",
"        \"tables\": [],\n",
"        \"images\": []\n",
"    }\n",
"\n",
"    with pdfplumber.open(pdf_path) as pdf:\n",
"        for page_number, page in enumerate(pdf.pages, start=1):\n",
"            lines = page.extract_text_lines()\n",
"\n",
"            for table in page.find_tables():\n",
"                # table.bbox is (x0, top, x1, bottom): index 1 is the top edge,\n",
"                # measured from the top of the page like the text lines' 'top'.\n",
"                result[\"tables\"].append({\n",
"                    \"title\": find_closest_header(lines, table.bbox[1]),\n",
"                    \"page\": page_number\n",
"                })\n",
"\n",
"            for img in page.images:\n",
"                # Use 'top' (distance from the top of the page) so the value is in\n",
"                # the same coordinate system as the text lines. 'y1' is measured\n",
"                # from the bottom of the page and would pick the wrong line.\n",
"                result[\"images\"].append({\n",
"                    \"title\": find_closest_header(lines, img['top']),\n",
"                    \"page\": page_number\n",
"                })\n",
"\n",
"    return result\n",
"\n",
"def _toc_leader(title, page_number, line_width=80):\n",
"    \"\"\"Build the dotted leader that pads a TOC entry towards ``line_width`` characters.\"\"\"\n",
"    total_len = len(title) + len(str(page_number)) + 2\n",
"    if total_len < line_width:\n",
"        dot_count = line_width - total_len\n",
"    else:\n",
"        # Entry is at least one full line long: pad what is left of its last\n",
"        # line (a whole line of dots when it ends exactly on a line boundary).\n",
"        dot_count = line_width - (total_len % line_width)\n",
"    return '.' * dot_count + ' '\n",
"\n",
"\n",
"def _add_toc_heading(toc_doc, text, level, size, alignment):\n",
"    \"\"\"Add a black Times New Roman heading with the given level, point size and alignment.\"\"\"\n",
"    heading = toc_doc.add_heading(text, level=level)\n",
"    heading.alignment = alignment\n",
"    run = heading.runs[0]\n",
"    run.font.name = 'Times New Roman'\n",
"    run.font.size = Pt(size)\n",
"    run.font.color.rgb = RGBColor(0, 0, 0)\n",
"\n",
"\n",
"def _add_toc_section(toc_doc, section_title, entries):\n",
"    \"\"\"Write one list: centred heading, spacer paragraph, right-aligned 'PAGE' label, one paragraph per entry.\"\"\"\n",
"    _add_toc_heading(toc_doc, section_title, 1, 12, WD_PARAGRAPH_ALIGNMENT.CENTER)\n",
"    # Spacer goes into its own paragraph so it always follows this section's heading.\n",
"    toc_doc.add_paragraph().add_run('\\n')\n",
"    _add_toc_heading(toc_doc, 'PAGE', 2, 10, WD_PARAGRAPH_ALIGNMENT.RIGHT)\n",
"\n",
"    # DataFrame.get tolerates an empty frame that has no 'title' / 'page' columns.\n",
"    for raw_title, page_number in zip(entries.get('title', []), entries.get('page', [])):\n",
"        title = str(raw_title)\n",
"        paragraph = toc_doc.add_paragraph()\n",
"        run = paragraph.add_run(title)\n",
"        run.font.name = 'Times New Roman'\n",
"        run.font.size = Pt(10)\n",
"        paragraph.add_run(\" \")\n",
"        # Keep the dots-then-page-number layout: add_hyperlink_to_pdf locates\n",
"        # entries by searching the rendered page for '.. <page>'.\n",
"        paragraph.add_run(_toc_leader(title, page_number))\n",
"        paragraph.add_run(f\" {page_number}\")\n",
"\n",
"\n",
"def create_toc(table_pd, image_pd, output_path='Table_of_Contents.docx'):\n",
"    \"\"\"\n",
"    Create a DOCX holding a 'LIST OF TABLES' and a 'LIST OF FIGURES'.\n",
"\n",
"    The two lists are separated by a page break. Each entry is the title, a\n",
"    dotted leader and the page number.\n",
"\n",
"    :param table_pd: DataFrame with 'title' and 'page' columns, one row per table.\n",
"    :param image_pd: DataFrame with 'title' and 'page' columns, one row per figure.\n",
"    :param output_path: Where the DOCX is saved; defaults to the file name the\n",
"        rest of the notebook expects.\n",
"    \"\"\"\n",
"    toc_doc = docx.Document()\n",
"    _add_toc_section(toc_doc, 'LIST OF TABLES', table_pd)\n",
"    toc_doc.add_page_break()\n",
"    _add_toc_section(toc_doc, 'LIST OF FIGURES', image_pd)\n",
"    toc_doc.save(output_path)\n",
"\n",
"\n",
"def count_docx_pages(docx_path):\n",
"    \"\"\"\n",
"    Count the pages of a DOCX file by rendering it to a throw-away PDF.\n",
"\n",
"    The PDF is written to a uniquely named temporary file next to the DOCX, so\n",
"    neither the source document nor an existing PDF with a similar name can be\n",
"    overwritten or deleted. The temporary file is removed even if conversion\n",
"    or reading fails.\n",
"\n",
"    :param docx_path: Path of the DOCX file.\n",
"    :return: Number of pages in the rendered PDF.\n",
"    \"\"\"\n",
"    import tempfile  # local import keeps this function self-contained\n",
"\n",
"    folder = os.path.dirname(os.path.abspath(docx_path))\n",
"    handle, pdf_path = tempfile.mkstemp(suffix='.pdf', dir=folder)\n",
"    os.close(handle)  # only the reserved name is needed; convert() rewrites the file\n",
"    try:\n",
"        convert(docx_path, pdf_path)\n",
"        with open(pdf_path, 'rb') as pdf_file:\n",
"            return len(PyPDF2.PdfReader(pdf_file).pages)\n",
"    finally:\n",
"        if os.path.exists(pdf_path):\n",
"            os.remove(pdf_path)\n",
"\n",
"def is_strictly_in(target, string):\n",
"    \"\"\"Report whether ``target`` occurs inside ``string`` (plain ``in`` membership test).\"\"\"\n",
"    found = target in string\n",
"    return found\n",
"\n",
"def extract_toc_location(pdf_path, table_pd, image_pd):\n",
"    \"\"\"\n",
"    Record, for every table / figure title, the PDF page on which it appears.\n",
"\n",
"    Spaces are removed from the titles, and spaces and newlines from the page\n",
"    text, before matching, so wrapped lines do not prevent a match. When a\n",
"    title is found on several pages the last one wins; rows whose title is\n",
"    never found keep their existing ``toc_loc`` value.\n",
"\n",
"    :param pdf_path: PDF to scan (in this notebook: the rendered TOC).\n",
"    :param table_pd: DataFrame with a 'title' column and a 0..n-1 index.\n",
"    :param image_pd: DataFrame with a 'title' column and a 0..n-1 index.\n",
"    :return: ``(table_pd, image_pd)`` - the same objects, updated in place with\n",
"        the 0-based page index in ``toc_loc``.\n",
"    \"\"\"\n",
"    with pdfplumber.open(pdf_path) as pdf:\n",
"        for page_num, page in enumerate(pdf.pages):\n",
"            # Extract and normalise each page once. A missing result is treated\n",
"            # as empty text rather than as the literal string 'None'.\n",
"            page_text = (page.extract_text() or '').replace(' ', '').replace('\\n', '')\n",
"            for frame in (table_pd, image_pd):\n",
"                for row in range(len(frame)):\n",
"                    title = frame.loc[row, 'title']\n",
"                    if is_strictly_in(title.replace(' ', ''), page_text):\n",
"                        frame.loc[row, 'toc_loc'] = page_num\n",
"    return table_pd, image_pd\n",
"\n",
"def delete_old_TOC(source_path, target_path, pages_to_delete):\n",
"    \"\"\"\n",
"    Copy a PDF while leaving out the pages listed in ``pages_to_delete``.\n",
"\n",
"    :param source_path: PDF to read.\n",
"    :param target_path: Where the trimmed PDF is written.\n",
"    :param pages_to_delete: Collection of 0-based page indexes to drop.\n",
"    \"\"\"\n",
"    reader = PdfReader(source_path)\n",
"    trimmed = PdfWriter()\n",
"\n",
"    # Carry over every page whose 0-based index is not marked for removal.\n",
"    for index, page in enumerate(reader.pages):\n",
"        if index in pages_to_delete:\n",
"            continue\n",
"        trimmed.add_page(page)\n",
"\n",
"    # Write the remaining pages to the new file.\n",
"    with open(target_path, \"wb\") as output_pdf:\n",
"        trimmed.write(output_pdf)\n",
"\n",
"def write_new_TOC(source_path1, insert_path, output_path, page_start):\n",
"    \"\"\"\n",
"    Splice every page of one PDF into another PDF at a given page index.\n",
"\n",
"    :param source_path1: PDF that receives the inserted pages.\n",
"    :param insert_path: PDF whose pages are inserted.\n",
"    :param output_path: Where the combined PDF is written.\n",
"    :param page_start: 0-based index in the source PDF before which the\n",
"        inserted pages are placed (that many source pages come first).\n",
"    \"\"\"\n",
"    source_reader = PdfReader(source_path1)\n",
"    insert_reader = PdfReader(insert_path)\n",
"    merged = PdfWriter()\n",
"\n",
"    # Source pages before the insertion point, the inserted document, then the\n",
"    # rest of the source document.\n",
"    leading = [source_reader.pages[index] for index in range(page_start)]\n",
"    inserted = [page for page in insert_reader.pages]\n",
"    trailing = [\n",
"        source_reader.pages[index]\n",
"        for index in range(page_start, len(source_reader.pages))\n",
"    ]\n",
"\n",
"    for page in leading + inserted + trailing:\n",
"        merged.add_page(page)\n",
"\n",
"    # Write the combined document to the new file.\n",
"    with open(output_path, \"wb\") as output_pdf:\n",
"        merged.write(output_pdf)\n",
"\n",
"def _link_toc_entries(doc, entries, TOC_gap, page_start):\n",
"    \"\"\"\n",
"    Add one internal GOTO link per row of ``entries``.\n",
"\n",
"    For each row the text ``'.. <page>'`` is searched on document page\n",
"    ``toc_loc + page_start``. The hit selected by the row's ``count`` value\n",
"    becomes the clickable area and points to 0-based page ``page + TOC_gap - 1``.\n",
"    \"\"\"\n",
"    for row in range(len(entries)):\n",
"        page_label = entries.loc[row, 'page']\n",
"        toc_page_index = int(entries.loc[row, 'toc_loc']) + page_start\n",
"        occurrence = int(entries.loc[row, 'count'])\n",
"        needle = '.. ' + str(page_label)\n",
"\n",
"        if toc_page_index >= doc.page_count:\n",
"            raise IndexError(\n",
"                f'TOC page index {toc_page_index} for row {row} is outside the '\n",
"                f'{doc.page_count}-page document'\n",
"            )\n",
"        toc_page = doc[toc_page_index]\n",
"\n",
"        # Several entries on one TOC page can end in the same page number;\n",
"        # 'count' selects which of the matching rectangles belongs to this row.\n",
"        hits = toc_page.search_for(needle)\n",
"        if occurrence >= len(hits):\n",
"            raise IndexError(\n",
"                f'{needle!r} was found {len(hits)} time(s) on page index '\n",
"                f'{toc_page_index}, but occurrence {occurrence} was requested'\n",
"            )\n",
"\n",
"        toc_page.insert_link({\n",
"            \"kind\": fitz.LINK_GOTO,  # internal jump to another page\n",
"            \"page\": int(page_label) + TOC_gap - 1,  # 0-based target page\n",
"            \"from\": hits[occurrence]\n",
"        })\n",
"\n",
"\n",
"def add_hyperlink_to_pdf(input_pdf_path, output_pdf_path, table_pd, image_pd, TOC_gap, page_start):\n",
"    \"\"\"\n",
"    Make the page numbers of the inserted TOC clickable and save the result.\n",
"\n",
"    :param input_pdf_path: PDF that already contains the new TOC pages.\n",
"    :param output_pdf_path: Where the linked PDF is saved.\n",
"    :param table_pd: DataFrame with 'page', 'toc_loc' and 'count' columns for tables.\n",
"    :param image_pd: DataFrame with the same columns for figures.\n",
"    :param TOC_gap: Offset added to each entry's page number to get its position\n",
"        in the final PDF.\n",
"    :param page_start: 0-based index of the first TOC page in the input PDF.\n",
"    :raises ValueError: If the PDF has fewer than 2 pages.\n",
"    :raises IndexError: If a TOC page or the expected text occurrence is missing.\n",
"    \"\"\"\n",
"    doc = fitz.open(input_pdf_path)\n",
"    try:\n",
"        if doc.page_count < 2:\n",
"            raise ValueError(\"The PDF must have at least 2 pages.\")\n",
"        for entries in (table_pd, image_pd):\n",
"            _link_toc_entries(doc, entries, TOC_gap, page_start)\n",
"        doc.save(output_pdf_path)\n",
"    finally:\n",
"        # Release the file handle even when a lookup above fails.\n",
"        doc.close()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Workflow"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "abbd5a8b01fb4849a2590c25fa66b054",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The DOCX document has 180 pages.\n"
]
}
],
"source": [
"# Count the pages of the source document (count_docx_pages renders the DOCX\n",
"# to PDF and counts the pages of that PDF).\n",
"# NOTE: docx_path and num_pages are both reassigned by later cells, where they\n",
"# refer to the generated Table of Contents instead of this document.\n",
"docx_path = 'test2.docx'\n",
"num_pages = count_docx_pages(docx_path)\n",
"print(f'The DOCX document has {num_pages} pages.')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fea11e8ba42c4a538318c5092f528320",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Convert the source DOCX (docx_path from the previous cell) to 'test3.pdf'.\n",
"# pdf_path is reassigned to the TOC PDF further down, which is why later cells\n",
"# refer to 'test3.pdf' by its literal name.\n",
"pdf_path = \"test3.pdf\"\n",
"convert_docx_to_pdf(docx_path, pdf_path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Scan the converted PDF: result is {'tables': [...], 'images': [...]}, each\n",
"# entry holding a 'title' (nearest text line above) and a 1-based 'page'.\n",
"result = extract_tables_and_images(pdf_path)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Turn the extracted table / image lists into DataFrames (one row per item).\n",
"table_pd = pd.DataFrame(result.get('tables'))\n",
"image_pd = pd.DataFrame(result.get('images'))\n",
"# 'toc_loc' will hold the TOC page on which each title is found; 999 is a\n",
"# placeholder until extract_toc_location overwrites it.\n",
"table_pd.loc[:,'toc_loc'] = 999\n",
"image_pd.loc[:,'toc_loc'] = 999\n",
"# 'ok' is a keep flag: 1 = keep the row, 0 = drop it in the next cell.\n",
"table_pd.loc[:, 'ok'] = 1\n",
"image_pd.loc[:, 'ok'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def _drop_unwanted_titles(frame):\n",
"    \"\"\"\n",
"    Remove rows that should not appear in the Table of Contents.\n",
"\n",
"    A row is dropped when its title is 'Protocol Final' (presumably running\n",
"    header / footer text picked up as the nearest line - confirm against the\n",
"    source document) or when its title equals the title of the row directly\n",
"    before it. Unlike a loop starting at index 1, the first row is also checked\n",
"    for 'Protocol Final'. The helper 'ok' column is removed and the index is\n",
"    renumbered 0..n-1.\n",
"    \"\"\"\n",
"    frame = frame.drop(columns=['ok'], errors='ignore')\n",
"    if 'title' not in frame.columns:\n",
"        # Nothing was extracted, so there is nothing to filter.\n",
"        return frame.reset_index(drop=True)\n",
"    titles = frame['title']\n",
"    keep = titles.ne('Protocol Final') & titles.ne(titles.shift())\n",
"    return frame[keep].reset_index(drop=True)\n",
"\n",
"\n",
"table_pd = _drop_unwanted_titles(table_pd)\n",
"image_pd = _drop_unwanted_titles(image_pd)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6425e7a36c0f4a69844919146b79b5d1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The DOCX document has 3 pages.\n"
]
}
],
"source": [
"# Create the initial Table of Contents and measure how many pages it takes.\n",
"create_toc(table_pd, image_pd)\n",
"# NOTE: num_pages now holds the page count of the generated TOC (not of the\n",
"# source document); the final cell uses it to compute TOC_gap.\n",
"num_pages = count_docx_pages('Table_of_Contents.docx')\n",
"print(f'The DOCX document has {num_pages} pages.')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4ccd8790a80d42ef87e9c2df0b09dad7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render the generated TOC to PDF so the titles can be located on its pages.\n",
"# From here on docx_path / pdf_path refer to the TOC files, not the source document.\n",
"docx_path = 'Table_of_Contents.docx'\n",
"pdf_path = \"Table_of_Contents.pdf\"\n",
"convert_docx_to_pdf(docx_path, pdf_path)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Locate every title on the pages of the rendered TOC (pdf_path is the TOC PDF here).\n",
"table_pd, image_pd = extract_toc_location(pdf_path, table_pd, image_pd)\n",
"\n",
"\n",
"def _rank_within_page_run(frame):\n",
"    \"\"\"\n",
"    Return, for each row, how many directly preceding rows share its 'page' value.\n",
"\n",
"    The first row of every run of equal page numbers gets 0, the next one 1,\n",
"    and so on; the counter restarts whenever the page number changes.\n",
"    \"\"\"\n",
"    if 'page' not in frame.columns:\n",
"        return pd.Series(0, index=frame.index, dtype='int64')\n",
"    run_id = frame['page'].ne(frame['page'].shift()).cumsum()\n",
"    return frame.groupby(run_id).cumcount()\n",
"\n",
"\n",
"# 'count' tells add_hyperlink_to_pdf which occurrence of '.. <page>' on the TOC\n",
"# page belongs to the row. Tables and figures are ranked independently, each\n",
"# with its own counter that restarts on every page change.\n",
"table_pd.loc[:, 'count'] = _rank_within_page_run(table_pd)\n",
"image_pd.loc[:, 'count'] = _rank_within_page_run(image_pd)\n",
"create_toc(table_pd, image_pd)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e6cde854c8dc4171871a2d366bc2e23e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Convert the regenerated Table of Contents to PDF and splice it into the document.\n",
"docx_path = 'Table_of_Contents.docx'\n",
"pdf_path = 'Table_of_Contents.pdf'\n",
"convert_docx_to_pdf(docx_path, pdf_path)\n",
"# 0-based indexes of the pages removed from 'test3.pdf' - presumably the old\n",
"# TOC pages of this particular document; adjust for other inputs.\n",
"pages_to_delete = [9,10,11]\n",
"# Number of removed pages; the final cell subtracts it from the new TOC length.\n",
"old_len = len(pages_to_delete)\n",
"delete_old_TOC('test3.pdf', 'test4.pdf', pages_to_delete)\n",
"# Insert the new TOC before source page index 9 (the first removed index; the\n",
"# final cell sets page_start to the same value).\n",
"write_new_TOC('test4.pdf', pdf_path, 'test5.pdf', 9)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Net page shift caused by swapping the TOC: pages of the new TOC (num_pages,\n",
"# set when the initial TOC was counted) minus the pages that were removed.\n",
"TOC_gap = num_pages-old_len\n",
"# 0-based index of the first TOC page in 'test5.pdf'; must match the insertion\n",
"# index passed to write_new_TOC in the previous cell.\n",
"page_start = 9\n",
"# Add the clickable page numbers and write the final PDF.\n",
"input_pdf_path = 'test5.pdf'\n",
"output_pdf_path = 'final_result0107.pdf'\n",
"add_hyperlink_to_pdf(input_pdf_path, output_pdf_path, table_pd, image_pd, TOC_gap, page_start)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Editor is loading...
Leave a Comment