# LyX document to Word (.docx) with formulas, using Python.
# Adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198
# Paste-site metadata: author unknown · python · 2 years ago · 7.7 kB · 8 · Indexable
from docx import Document
from lxml import etree
import latex2mathml.converter
import os
import re
import collections
import sys
# adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198
# --- Pre-compiled patterns used to clean LaTeX input and MathML output ------

# A bare '&' that is NOT the start of an XML entity such as '&lt;' or '&#38;'.
re_sanitize_ampersand = re.compile(r"&(?!#?[a-zA-Z0-9]+;)")
# Unicode middle dot (U+00B7).
re_unicode_middot_to_cdot = re.compile(r"·")
# '+' is a regex quantifier and must be escaped: the unescaped pattern "+"
# raises re.error ("nothing to repeat") as soon as the module is imported.
# NOTE(review): judging by the name, the pattern presumably targeted a
# non-ASCII plus sign that was lost in transit -- confirm the intended character.
re_unicode_plus_to_plus = re.compile(r"\+")
# Unicode minus sign (U+2212).
re_unicode_minus_to_minus = re.compile(r"−")
# The literal command name '\ensuremath' (its braces are left in place).
re_remove_ensuremath = re.compile(r"\\ensuremath")
# Display-math delimiters '\[' and '\]'.
re_remove_begin_eqn = re.compile(r"\\\[")
re_remove_end_eqn = re.compile(r"\\\]")
# A whole single-line string wrapped in '$...$'; group 1 is the content.
re_remove_inline_dollar = re.compile(r"^\$(.*)\$$")
def latex_to_word(latex_input):
    """Convert a LaTeX math snippet into a Word math XML element.

    The snippet is stripped of ``\\ensuremath``, ``\\[`` / ``\\]`` and an
    enclosing ``$...$`` pair, converted to MathML with ``latex2mathml`` and
    then transformed with the ``MML2OMML.XSL`` stylesheet, which must be
    located in the same directory as this script.

    Args:
        latex_input: LaTeX source of a single formula.

    Returns:
        The root element of the XSLT result, or ``None`` when nothing is
        left of the input after the delimiters have been removed.

    Raises:
        Whatever ``latex2mathml``, ``etree.fromstring``, ``etree.parse`` or
        the XSLT transform raise; no error is handled here.
    """
    latex_input = re_remove_ensuremath.sub('', latex_input)
    latex_input = re_remove_begin_eqn.sub('', latex_input)
    latex_input = re_remove_end_eqn.sub('', latex_input)
    # Strip BEFORE removing the dollars: the '$...$' pattern is anchored at
    # both ends, so surrounding whitespace would otherwise keep it from
    # matching and the dollars would reach the converter.
    latex_input = latex_input.strip()
    latex_input = re_remove_inline_dollar.sub(R'\1', latex_input)
    latex_input = latex_input.strip()
    if not latex_input:
        return None

    mathml = latex2mathml.converter.convert(latex_input)
    # Escape bare ampersands (those not starting an entity) so the MathML
    # string is well-formed XML before it is parsed.
    mathml = re_sanitize_ampersand.sub('&amp;', mathml)
    tree = etree.fromstring(mathml)

    scriptdir = os.path.dirname(os.path.realpath(__file__))
    xslt = etree.parse(os.path.join(scriptdir, 'MML2OMML.XSL'))
    transform = etree.XSLT(xslt)
    return transform(tree).getroot()
def demo():
    """Write ``demo.docx`` next to this script with a few sample formulas.

    The document contains one formula alone in a paragraph, the same formula
    between two text runs, and three further stand-alone formulas.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    document = Document()

    # Braces are balanced here; the group opened before \frac must be closed.
    sum_formula = R"\sum_{i=1}^{10}{\frac{\sigma_{zp,i}}{E_i}} kN"

    p = document.add_paragraph()
    p._element.append(latex_to_word(sum_formula))

    # Convert the formula again instead of re-using the element from above:
    # an lxml element has exactly one parent, so appending the same element
    # to a second paragraph would move it out of the first one.
    p = document.add_paragraph()
    p.add_run('before formula ')
    p._element.append(latex_to_word(sum_formula))
    p.add_run(' after formula ')

    standalone_formulas = (
        R"\sum_{n=1}^{10}{n^{2}}",
        R"\sqrt{\frac{a}{b}}",
        R"\begin{matrix*}[r]a & b \\ c & d \end{matrix*}",
    )
    for formula in standalone_formulas:
        p = document.add_paragraph()
        p._element.append(latex_to_word(formula))

    document.save(os.path.join(scriptdir, 'demo.docx'))
    print("Done.")
# --- LyX structure tags ------------------------------------------------------
# Every "begin" pattern exposes three groups: (1) element type, (2) format,
# (3) rest of the line. Every "end" pattern exposes the element type as group 1.

# '\begin_body' has no format, hence the empty second group.
begin_body_re = re.compile(r"^\\begin_(body)()(.*)$")
# Not read anywhere in the visible code; kept so the module namespace is unchanged.
begin_body_found = False
end_body_re = re.compile(r"^\\end_(body)$")
# Only 'Standard' layouts are recognised as opening tags.
begin_layout_re = re.compile(r"^\\begin_(layout) (Standard)(.*)$")
end_layout_re = re.compile(r"^\\end_(layout)$")
# Only 'Formula' insets are recognised as opening tags.
begin_inset_re = re.compile(r"^\\begin_(inset) (Formula)(.*)$")
end_inset_re = re.compile(r"^\\end_(inset)$")

# Lookup tables keyed by element type. The key for insets is 'inset' (it was
# misspelled 'insert'), so each key equals group 1 of its patterns.
begin_regexes = {
    'body': begin_body_re,
    'layout': begin_layout_re,
    'inset': begin_inset_re,
}
end_regexes = {
    'body': end_body_re,
    'layout': end_layout_re,
    'inset': end_inset_re,
}
def _match_first(regexes, line):
    """Return ``(key, match)`` for the first pattern in *regexes* matching *line*, else ``(None, None)``."""
    for key, regex in regexes.items():
        match = regex.match(line)
        if match is not None:
            return key, match
    return None, None


def _emit_closed_element(word_doc, elem):
    """Append the Word content for a just-closed Formula inset or Standard layout to *word_doc*."""
    if elem["type"] == "inset" and elem["format"] == "Formula":
        elem["xml"] = []
        paragraph = word_doc.add_paragraph()
        word_math_xml = latex_to_word("\n".join(elem["lines"]))
        elem["xml"].append(word_math_xml)
        # latex_to_word returns None for an empty formula; the paragraph
        # then simply stays empty.
        if word_math_xml is not None:
            paragraph._element.append(word_math_xml)
    elif elem["type"] == "layout" and elem["format"] == "Standard":
        paragraph = word_doc.add_paragraph()
        for text in elem["lines"]:
            paragraph.add_run(text)


def lyx2word(lyx_filepath: str, worddoc_filepath: str):
    """Convert the body of a LyX file into a Word document.

    The file is read line by line. Lines matching one of ``begin_regexes``
    push a new element on a stack, lines matching one of ``end_regexes`` pop
    it again, and every other non-empty line inside the body is stored
    (stripped) in the ``"lines"`` list of the innermost open element.

    Content is written when an element is *closed*: a Formula inset becomes
    a paragraph holding the converted formula, a Standard layout becomes a
    paragraph with one run per stored line. A formula nested in a layout is
    therefore written as its own paragraph before the text of that layout.

    Args:
        lyx_filepath: Path of the LyX file, read as UTF-8.
        worddoc_filepath: Path of the ``.docx`` file to write. Nothing is
            saved when it is ``None`` or empty.

    Returns:
        The parse stack: a list of element dicts (keys ``"type"``,
        ``"lines"``, ``"childs"``; parsed elements also have ``"format"``).
        Its first entry is the document root; closed elements are found in
        the ``"childs"`` list of their parent.

    Structural problems (unbalanced tags) are reported with ``print`` and
    the offending line is skipped. Errors raised by ``latex_to_word`` are
    not handled.
    """
    parse_stack = [{'type': 'document', 'lines': [], 'childs': []}]
    word_doc = Document()
    lyx_body = False

    with open(lyx_filepath, 'r', encoding='utf-8') as l_fh:
        for l_idx, line in enumerate(l_fh, start=1):
            # --- opening tag: push a new element -------------------------
            key, m_begin = _match_first(begin_regexes, line)
            if m_begin is not None:
                elem = {
                    "type": m_begin.group(1),
                    "format": m_begin.group(2),
                    "lines": [],
                    "childs": [],
                }
                # Text following the tag on the same line belongs to the element.
                remainder = m_begin.group(3).strip()
                if remainder:
                    elem["lines"].append(remainder)
                if key == "body":
                    lyx_body = True
                parse_stack.append(elem)
                continue

            # --- closing tag: pop and emit the element -------------------
            key, m_end = _match_first(end_regexes, line)
            if m_end is not None:
                if key == "body":
                    lyx_body = False
                if not parse_stack:
                    # Nothing is open: report it and skip the line. (The old
                    # handler formatted an undefined name and then went on
                    # to use an unbound variable.)
                    print(F"Error: unbalanced tree: Closing tag {m_end.group(0)} but stack is empty, Lyx Line {l_idx:d} = '{line}'")
                    continue
                stack_elem: dict = parse_stack[-1]
                if stack_elem.get('type', '') != m_end.group(1):
                    print(F"Error: unbalanced tree: Closing tag {m_end.group(0)} does not match stack element {repr(stack_elem)}, Lyx Line {l_idx:d} = '{line}'")
                    continue
                parse_stack.pop()  # matching, remove
                _emit_closed_element(word_doc, stack_elem)
                try:
                    parse_stack[-1]["childs"].append(stack_elem)
                except (IndexError, KeyError) as e:
                    print(F"Error {repr(e)} adding"+repr(stack_elem)+" to childs of parse stack "+repr(parse_stack)+F", Lyx Line {l_idx:d} = '{line}'")
                continue

            # --- plain line: keep non-empty body lines, ignore the header ---
            stripped = line.strip()
            if lyx_body and stripped:
                try:
                    parse_stack[-1]["lines"].append(stripped)
                except (IndexError, KeyError) as e:
                    print("ERROR: "+repr(e))

    if (worddoc_filepath is not None) and (worddoc_filepath != ""):
        word_doc.save(worddoc_filepath)
        print("Converted doc written to "+repr(worddoc_filepath)+".")
    return parse_stack
if __name__ == "__main__":
    # Usage: <script> LYX_FILE [DOCX_FILE]
    # Without DOCX_FILE the output is written to LYX_FILE + ".docx".
    if len(sys.argv) >= 2:
        lyx_fp = sys.argv[1]
        word_fp = sys.argv[2] if len(sys.argv) >= 3 else lyx_fp + ".docx"
        r = lyx2word(lyx_fp, word_fp)
        # Report success only when a conversion actually ran.
        print("Converted.")
    else:
        print("Argument lyx_filepath missing")
        sys.exit(1)