Untitled

 avatar
unknown
plain_text
5 months ago
1.0 kB
3
Indexable
from src.ingestion.loaders.loaderBase import LoaderBase
from docx import Document

class LoaderDOCX(LoaderBase):
    """Loader that extracts core metadata and paragraph text from a .docx file.

    The file is opened with python-docx's ``Document`` each time an
    extraction method is called; nothing is read at construction time.
    """

    def __init__(self, filepath: str):
        """Remember the path of the .docx file to load.

        Args:
            filepath: Path handed to ``docx.Document`` by the extract methods.
        """
        self.filepath = filepath

    def extract_metadata(self):
        """Read the document's core properties into a metadata dict.

        The dict is always stored on ``self.metadata``, even when it is
        incomplete.

        Returns:
            The metadata dict when every entry has a non-empty value,
            otherwise ``False``.
        """
        doc_info = Document(self.filepath).core_properties
        # 'creator' and 'producer' are deliberately filled from the author
        # field, exactly as before.
        # NOTE(review): presumably this mirrors the key set used by other
        # loaders -- confirm against LoaderBase / sibling loaders.
        author = doc_info.author
        self.metadata = {
            'author': author,
            'creator': author,
            'producer': author,
            'subject': doc_info.subject,
            'title': doc_info.title,
        }

        if self.all_keys_have_values(metadata=self.metadata):
            return self.metadata
        return False

    def extract_text(self):
        """Return the text of all body paragraphs, one paragraph per line.

        Paragraphs are joined with a newline so that the last word of one
        paragraph is not fused with the first word of the next.

        Returns:
            A single string; empty when the document has no paragraphs.
        """
        doc = Document(self.filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def all_keys_have_values(self, metadata, value_check=lambda x: x is not None and x != ''):
        """Tell whether every value in ``metadata`` passes ``value_check``.

        Args:
            metadata: Mapping whose values are checked.
            value_check: Predicate applied to each value; by default a value
                passes when it is neither ``None`` nor the empty string.

        Returns:
            ``True`` when all values pass (also for an empty mapping),
            ``False`` otherwise.
        """
        return all(value_check(value) for value in metadata.values())
Editor is loading...
Leave a Comment