LyX document to Word with formulas using Python
Adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198 (author: unknown)
python
2 years ago
7.7 kB
5
Indexable
"""Convert a LyX document to a Word (.docx) file, including formulas.

Formulas are converted from LaTeX to MathML with ``latex2mathml`` and then
transformed with the ``MML2OMML.XSL`` stylesheet, which is loaded from the
directory that contains this script.
"""

import collections
import functools
import os
import re
import sys

import latex2mathml.converter
from docx import Document
from lxml import etree

# adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198

# Matches an ampersand that does not start an XML entity / character reference.
re_sanitize_ampersand = re.compile(R"&(?!#?[a-zA-Z0-9]+;)")
# The three re_unicode_* patterns are not applied anywhere in this file; they
# are kept as module-level names for backward compatibility.
re_unicode_middot_to_cdot = re.compile(R"·")
# The plus sign must be escaped: a bare "+" is a quantifier and re.compile
# raises "nothing to repeat".
# NOTE(review): the original pattern probably targeted a Unicode plus variant
# that was lost in transcription - confirm the intended character.
re_unicode_plus_to_plus = re.compile(R"\+")
re_unicode_minus_to_minus = re.compile(R"−")
re_remove_ensuremath = re.compile(R"\\ensuremath")
re_remove_begin_eqn = re.compile(R"\\\[")
re_remove_end_eqn = re.compile(R"\\\]")
re_remove_inline_dollar = re.compile(R"^\$(.*)\$$")


@functools.lru_cache(maxsize=1)
def _load_mml2omml_transform():
    """Return the XSLT transform built from MML2OMML.XSL next to this script.

    The result is cached so the stylesheet is parsed only once per process
    instead of once per formula.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    return etree.XSLT(etree.parse(os.path.join(scriptdir, 'MML2OMML.XSL')))


def latex_to_word(latex_input):
    """Convert a LaTeX formula into the XML element produced by MML2OMML.XSL.

    ``\\ensuremath``, ``\\[``, ``\\]`` and a single pair of enclosing ``$``
    are removed before the conversion.

    Args:
        latex_input: LaTeX source of one formula.

    Returns:
        The root element of the XSLT result, or ``None`` when nothing but
        whitespace is left after removing the delimiters.
    """
    latex_input = re_remove_ensuremath.sub('', latex_input)
    latex_input = re_remove_begin_eqn.sub('', latex_input)
    latex_input = re_remove_end_eqn.sub('', latex_input)
    # Strip first so that "$...$" surrounded by whitespace is still anchored.
    latex_input = re_remove_inline_dollar.sub(R'\1', latex_input.strip())
    latex_input = latex_input.strip()
    if not latex_input:
        return None

    mathml = latex2mathml.converter.convert(latex_input)
    # Escape bare ampersands; otherwise the XML parser below rejects them.
    mathml = re_sanitize_ampersand.sub('&amp;', mathml)
    tree = etree.fromstring(mathml)
    new_dom = _load_mml2omml_transform()(tree)
    return new_dom.getroot()


def demo():
    """Write ``demo.docx`` next to this script with a few sample formulas."""
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    document = Document()

    # NOTE(review): this formula has one more "{" than "}" - confirm that
    # latex2mathml accepts it as intended.
    sum_formula = R"\sum_{i=1}^{10}{\frac{\sigma_{zp,i}}{E_i} kN"

    p = document.add_paragraph()
    p._element.append(latex_to_word(sum_formula))

    p = document.add_paragraph()
    p.add_run('before formula ')
    # Convert again instead of re-appending the first element: lxml moves an
    # element that already has a parent, which would empty the paragraph above.
    p._element.append(latex_to_word(sum_formula))
    p.add_run(' after formula ')

    for formula in (
        R"\sum_{n=1}^{10}{n^{2}}",
        R"\sqrt{\frac{a}{b}}",
        R"\begin{matrix*}[r]a & b \\ c & d \end{matrix*}",
    ):
        p = document.add_paragraph()
        p._element.append(latex_to_word(formula))

    document.save(os.path.join(scriptdir, 'demo.docx'))
    print("Done.)")


# Patterns for the LyX tags this converter understands. Every begin pattern
# captures (type, format, rest of line); every end pattern captures (type).
begin_body_re = re.compile(R"^\\begin_(body)()(.*)$")
begin_body_found = False  # not referenced by the code in this file
end_body_re = re.compile(R"^\\end_(body)$")
begin_layout_re = re.compile(R"^\\begin_(layout) (Standard)(.*)$")
end_layout_re = re.compile(R"^\\end_(layout)$")
begin_inset_re = re.compile(R"^\\begin_(inset) (Formula)(.*)$")
end_inset_re = re.compile(R"^\\end_(inset)$")
# The dict keys are labels only; the parser compares them against "body" and
# takes the element type from the regex capture group instead.
begin_regexes = {'body': begin_body_re, 'layout': begin_layout_re, 'insert': begin_inset_re}
end_regexes = {'body': end_body_re, 'layout': end_layout_re, 'insert': end_inset_re}


def _match_first(regexes, line):
    """Return ``(key, match)`` for the first pattern matching *line*, else ``(None, None)``."""
    for key, regex in regexes.items():
        match = regex.match(line)
        if match is not None:
            return key, match
    return None, None


def _close_element(parse_stack, word_doc, m_end, l_idx, line):
    """Pop the element closed by *m_end* and write its content to *word_doc*.

    If the closing tag does not match the element on top of the stack, an
    error is printed and the stack is left untouched.
    """
    top = parse_stack[-1]
    if top.get('type', '') != m_end.group(1):
        print(F"Error: unbalanced tree: Closing tag {m_end.group(0)} does not match stack element {repr(top)}, Lyx Line {l_idx:d} = '{line}'")
        return

    elem = parse_stack.pop()
    if elem["type"] == "inset" and elem["format"] == "Formula":
        # A formula gets a paragraph of its own.
        p = word_doc.add_paragraph()
        word_math_xml = latex_to_word("\n".join(elem["lines"]))
        elem["xml"] = [word_math_xml]
        if word_math_xml is not None:
            p._element.append(word_math_xml)
    elif elem["type"] == "layout" and elem["format"] == "Standard":
        p = word_doc.add_paragraph()
        # NOTE(review): each stored line was stripped and becomes its own run
        # with no separator, so words at LyX line breaks end up adjacent -
        # confirm whether a space should be inserted between runs.
        for text in elem["lines"]:
            p.add_run(text)

    # The root 'document' entry never matches a closing tag, so it is never
    # popped and the stack still has a parent here.
    parse_stack[-1]["childs"].append(elem)


def lyx2word(lyx_filepath: str, worddoc_filepath: str):
    """Convert the LyX file at *lyx_filepath* into a Word document.

    Only the body, ``Standard`` layouts and ``Formula`` insets are recognised
    by the begin patterns; other non-empty body lines are stored as plain text
    of the element that is currently open.

    Args:
        lyx_filepath: Path of the LyX file, read as UTF-8.
        worddoc_filepath: Path of the .docx file to write. Nothing is saved
            when it is ``None`` or empty.

    Returns:
        The parse stack: a list whose first entry is the root ``document``
        dict (closed elements are nested under ``"childs"``), followed by any
        elements that were still open at the end of the file.
    """
    parse_stack = [{'type': 'document', 'lines': [], 'childs': []}]
    word_doc = Document()
    lyx_body = False

    with open(lyx_filepath, 'r', encoding='utf-8') as l_fh:
        for l_idx, line in enumerate(l_fh, start=1):
            key, m_begin = _match_first(begin_regexes, line)
            if m_begin is not None:
                elem = {
                    "type": m_begin.group(1),
                    "format": m_begin.group(2),
                    "lines": [],
                    "childs": [],
                }
                rest = m_begin.group(3).strip()
                if rest:
                    # Text after the opening tag, e.g. an inline formula.
                    elem["lines"].append(rest)
                if key == "body":
                    lyx_body = True
                parse_stack.append(elem)
                continue

            key, m_end = _match_first(end_regexes, line)
            if m_end is not None:
                if key == "body":
                    lyx_body = False
                _close_element(parse_stack, word_doc, m_end, l_idx, line)
                continue

            # Plain line: keep non-empty lines inside the body, skip the header.
            stripped = line.strip()
            if lyx_body and stripped:
                parse_stack[-1]["lines"].append(stripped)

    if worddoc_filepath:
        word_doc.save(worddoc_filepath)
        print("Converted doc written to " + repr(worddoc_filepath) + ".")
    return parse_stack


def main(argv=None):
    """Run the converter from the command line and return the exit status.

    Usage: ``script LYX_FILE [DOCX_FILE]``; without DOCX_FILE the output path
    is LYX_FILE with ``.docx`` appended.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) >= 3:
        lyx2word(argv[1], argv[2])
    elif len(argv) == 2:
        lyx2word(argv[1], argv[1] + ".docx")
    else:
        # Do not report success when nothing was converted.
        print("Argument lyx_filepath missing")
        return 1
    print("Converted.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Editor is loading...