# Untitled
# pastecode.io page metadata: mail@pastecode.io avatar · unknown · plain_text ·
# a year ago · 1.5 kB · 1 · Indexable · Never
from bs4 import BeautifulSoup
import docx

# Read the entire HTML source file as UTF-8 text
with open('your_html_file.html', 'r', encoding='utf-8') as html_file:
    html_content = html_file.read()

# Parse the markup using the 'html.parser' backend
soup = BeautifulSoup(html_content, 'html.parser')

# Collect one [id, problem, info, solution] row per <li> that carries an href
table_data = []

for li in soup.find_all('li', href=True):
    # Strip the "idm" marker from the href; it is re-added below, both when
    # the row is stored and when looking for the terminating sibling.
    id_value = li['href'].replace('idm', '')
    problem = li.get_text()

    info = ""
    solution = ""

    # Scan the elements that follow this <li> at the same nesting level.
    # A later matching heading overwrites an earlier one.
    for tag in li.find_next_siblings():
        if tag.name == 'h2':
            heading = tag.get_text()
            # find_next() returns None when no <p> follows the heading, so
            # guard before reading its text rather than raising
            # AttributeError on a heading without a paragraph.
            paragraph = tag.find_next('p')
            text = paragraph.get_text() if paragraph is not None else ""
            if "Info" in heading:
                info = text
            elif "Solution" in heading:
                solution = text
        elif tag.name == 'li' and tag.get('href') == f'#idm{id_value}':
            # Stop at a sibling <li> whose href is "#idm" + id_value.
            # NOTE(review): if the hrefs already start with "#", id_value
            # keeps that "#" and this comparison cannot match -- confirm
            # against the real HTML.
            break

    table_data.append([f"idm{id_value}", problem, info, solution])

# Each row in table_data holds four fields (id, problem, info, solution), so
# the table needs four columns. Pair every header label with its column
# width; the widths total 6 inches and can be adjusted as needed.
column_specs = [
    ("ID", docx.shared.Inches(1)),
    ("Problem", docx.shared.Inches(1.5)),
    ("Info", docx.shared.Inches(1.75)),
    ("Solution", docx.shared.Inches(1.75)),
]

# Create a Word document and add a table with a single (header) row
doc = docx.Document()
table = doc.add_table(rows=1, cols=len(column_specs))
# Fixed layout, so the explicit column widths below are not auto-adjusted
table.autofit = False

# Label the initial row and apply the column widths
header_cells = table.rows[0].cells
for index, (heading, width) in enumerate(column_specs):
    table.columns[index].width = width
    header_cells[index].text = heading

# Add one table row per extracted record, filling every field
for row_data in table_data:
    cells = table.add_row().cells
    for cell, value in zip(cells, row_data):
        cell.text = value

# Save the Word document
doc.save('output_table.docx')