LyX document to Word with formulas using Python

Adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198
mail@pastecode.io avatar
unknown
python
7 months ago
7.7 kB
1
Indexable
Never
from docx import Document
from lxml import etree
import latex2mathml.converter
import os
import re
import collections
import sys
# adapted from https://github.com/python-openxml/python-docx/issues/320#issuecomment-798749198

# Matches a "&" that is not the start of an entity-like token such as
# "&amp;" or "&#x2212;" (optional "#", alphanumerics, then ";").
re_sanitize_ampersand = re.compile(R"&(?!#?[a-zA-Z0-9]+;)")
re_unicode_middot_to_cdot = re.compile(R"·")
# A bare "+" is a quantifier with nothing to repeat, so compiling it raises
# re.error as soon as the module is imported; the plus sign must be escaped.
# NOTE(review): judging by the name, the original pattern probably held a
# non-ASCII plus (or an entity) that was lost in transit - confirm which one.
re_unicode_plus_to_plus = re.compile(R"\+")
re_unicode_minus_to_minus = re.compile(R"−")

# Patterns for the LaTeX math delimiters that are removed before conversion.
re_remove_ensuremath = re.compile(R"\\ensuremath")
re_remove_begin_eqn = re.compile(R"\\\[")
re_remove_end_eqn = re.compile(R"\\\]")
# Only matches when the whole (single-line) string is wrapped in "$...$".
re_remove_inline_dollar = re.compile(R"^\$(.*)\$$")

def latex_to_word(latex_input):
    r"""Convert a LaTeX math snippet into a Word math (OMML) XML element.

    ``\ensuremath``, ``\[`` / ``\]`` and one enclosing ``$...$`` pair are
    removed, the remainder is converted to MathML with ``latex2mathml`` and
    the MathML is transformed with the ``MML2OMML.XSL`` stylesheet, which is
    loaded from the directory containing this script.

    Args:
        latex_input: LaTeX source of a single formula.

    Returns:
        The root element of the XSLT result, or ``None`` when nothing but
        whitespace is left after the delimiters have been removed.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))

    latex_input = re_remove_ensuremath.sub('', latex_input)
    latex_input = re_remove_begin_eqn.sub('', latex_input)
    latex_input = re_remove_end_eqn.sub('', latex_input)
    # Strip before matching: the "$...$" pattern is anchored at both ends, so
    # leading or trailing whitespace would otherwise keep the dollars in place.
    latex_input = re_remove_inline_dollar.sub(R'\1', latex_input.strip()).strip()

    if not latex_input:
        return None

    mathml = latex2mathml.converter.convert(latex_input)
    # Escape every "&" that is not already part of an entity so the string is
    # well-formed XML for etree.fromstring (replacing "&" by "&" did nothing).
    mathml = re_sanitize_ampersand.sub('&amp;', mathml)

    tree = etree.fromstring(mathml)
    xslt = etree.parse(os.path.join(scriptdir, 'MML2OMML.XSL'))
    transform = etree.XSLT(xslt)
    return transform(tree).getroot()

def demo():
    """Write ``demo.docx`` next to this script with a few sample formulas."""
    import copy  # local import: only needed to duplicate the XML element below

    scriptdir = os.path.dirname(os.path.realpath(__file__))
    document = Document()

    # A formula on its own in a paragraph (closing brace of the summand added;
    # the sample was unbalanced).
    word_math = latex_to_word(R"\sum_{i=1}^{10}{\frac{\sigma_{zp,i}}{E_i}} kN")
    p = document.add_paragraph()
    p._element.append(word_math)

    # The same formula embedded in text. An lxml element can only have one
    # parent, so appending `word_math` a second time would move it out of the
    # first paragraph; a deep copy is appended instead.
    p = document.add_paragraph()
    p.add_run('before formula ')
    p._element.append(copy.deepcopy(word_math))
    p.add_run(' after formula ')

    # Further samples, one paragraph each.
    samples = (
        R"\sum_{n=1}^{10}{n^{2}}",
        R"\sqrt{\frac{a}{b}}",
        R"\begin{matrix*}[r]a & b \\ c & d \end{matrix*}",
    )
    for latex in samples:
        p = document.add_paragraph()
        p._element.append(latex_to_word(latex))

    document.save(os.path.join(scriptdir, 'demo.docx'))

    print("Done.")



# Opening tags the parser recognises. Every pattern captures three groups:
# (1) the element type, (2) its format (empty for the body) and (3) whatever
# follows on the same line.
begin_body_re = re.compile(r"^\\begin_(body)()(.*)$")
begin_layout_re = re.compile(r"^\\begin_(layout) (Standard)(.*)$")
begin_inset_re = re.compile(r"^\\begin_(inset) (Formula)(.*)$")

# Closing tags; group 1 is the element type being closed.
end_body_re = re.compile(r"^\\end_(body)$")
end_layout_re = re.compile(r"^\\end_(layout)$")
end_inset_re = re.compile(r"^\\end_(inset)$")

begin_body_found = False

# Lookup tables iterated in insertion order: body, layout, inset.
# NOTE(review): the 'insert' key looks like a typo for 'inset'; it is kept
# unchanged because the key is part of the runtime data.
begin_regexes = {
    'body': begin_body_re,
    'layout': begin_layout_re,
    'insert': begin_inset_re,
}
end_regexes = {
    'body': end_body_re,
    'layout': end_layout_re,
    'insert': end_inset_re,
}


def _first_match(regexes, line):
    """Return ``(key, match)`` for the first pattern matching *line*, else ``(None, None)``."""
    for key, pattern in regexes.items():
        match = pattern.match(line)
        if match is not None:
            return key, match
    return None, None


def _write_closed_element(word_doc, elem, parent):
    """Emit a just-closed *elem* into *word_doc*, or queue it on *parent*."""
    if elem["type"] == "inset" and elem["format"] == "Formula":
        word_math_xml = latex_to_word("\n".join(elem["lines"]))
        elem["xml"] = [word_math_xml]
        if word_math_xml is None:
            return
        if parent["type"] == "layout" and parent.get("format") == "Standard":
            # Keep the formula at its position inside the enclosing paragraph;
            # it is written out when that paragraph is closed.
            parent["content"].append(word_math_xml)
        else:
            word_doc.add_paragraph()._element.append(word_math_xml)
    elif elem["type"] == "layout" and elem["format"] == "Standard":
        p = word_doc.add_paragraph()
        for item in elem["content"]:
            if isinstance(item, str):
                p.add_run(item)
            else:
                p._element.append(item)


def lyx2word(lyx_filepath: str, worddoc_filepath: str) -> list:
    """Convert the body of a LyX file into a Word document.

    Only the tags matched by ``begin_regexes`` / ``end_regexes`` are
    interpreted: the body, ``Standard`` layouts (written as paragraphs) and
    ``Formula`` insets (converted with ``latex_to_word``). Every other
    non-empty line inside the body is stored as text of the innermost open
    element.

    Args:
        lyx_filepath: Path of the LyX file, read as UTF-8.
        worddoc_filepath: Path of the .docx file to write; nothing is saved
            when it is ``None`` or empty.

    Returns:
        The parse stack. Its first entry is the root ``'document'`` element;
        closed elements are reachable through the ``'childs'`` lists. Each
        element holds ``'lines'`` (stripped text lines) and ``'content'``
        (unstripped text and formula XML in source order).
    """
    # The root element has type 'document', which no closing tag can match,
    # so it is never popped and parse_stack[-1] always exists.
    parse_stack = [{'type': 'document', 'lines': [], 'childs': [], 'content': []}]
    word_doc = Document()
    lyx_body = False

    with open(lyx_filepath, 'r', encoding='utf-8') as l_fh:
        for l_idx, line in enumerate(l_fh, start=1):
            key, m_begin = _first_match(begin_regexes, line)
            if m_begin is not None:
                rest_of_line = m_begin.group(3).strip()
                if key == "body":
                    lyx_body = True
                parse_stack.append({
                    "type": m_begin.group(1),
                    "format": m_begin.group(2),
                    "lines": [rest_of_line] if rest_of_line else [],
                    "childs": [],
                    "content": [],
                })
                continue

            key, m_end = _first_match(end_regexes, line)
            if m_end is not None:
                if key == "body":
                    lyx_body = False

                top = parse_stack[-1]
                if top.get('type', '') == m_end.group(1):
                    parse_stack.pop()  # matching, remove
                    parent = parse_stack[-1]
                    _write_closed_element(word_doc, top, parent)
                    parent["childs"].append(top)
                else:
                    print(F"Error: unbalanced tree: Closing tag {m_end.group(0)} does not match stack element {repr(top)}, Lyx Line {l_idx:d} = '{line}'")
                continue

            # Neither an opening nor a closing tag: keep non-empty body lines,
            # ignore everything outside the body (the header).
            if lyx_body and line.strip():
                current = parse_stack[-1]
                current["lines"].append(line.strip())
                # Runs are built from the unstripped text so the spaces at
                # line wraps and around inline formulas are not lost.
                current["content"].append(line.rstrip("\r\n"))

    if worddoc_filepath:
        word_doc.save(worddoc_filepath)
        print("Converted doc written to "+repr(worddoc_filepath)+".")
    return parse_stack
               
                    
                    




    
if __name__ == "__main__":
    # Usage: <script> LYX_FILE [DOCX_FILE]
    # Without DOCX_FILE the output is written to LYX_FILE + ".docx".
    if len(sys.argv) < 2:
        print("Argument lyx_filepath missing")
        # Exit with a failure status instead of reporting a conversion that
        # never happened.
        sys.exit(1)

    lyx_fp = sys.argv[1]
    word_fp = sys.argv[2] if len(sys.argv) >= 3 else lyx_fp + ".docx"
    lyx2word(lyx_fp, word_fp)
    print("Converted.")