Untitled
unknown
plain_text
a year ago
1.0 kB
5
Indexable
from src.ingestion.loaders.loaderBase import LoaderBase
from docx import Document
class LoaderDOCX(LoaderBase):
    """Loader that pulls metadata and paragraph text out of a .docx file.

    The file is opened with ``docx.Document`` each time an ``extract_*``
    method is called; nothing is read at construction time.
    """

    def __init__(self, filepath: str):
        """Store the path of the document to load.

        Args:
            filepath: Path handed to ``docx.Document`` by the extract methods.
        """
        self.filepath = filepath

    def extract_metadata(self):
        """Read the document's core properties and cache them on the instance.

        ``self.metadata`` is always assigned, even when the completeness
        check fails.

        Returns:
            The metadata dict when every value passes
            ``all_keys_have_values``; otherwise ``False``.
        """
        core = Document(self.filepath).core_properties
        # 'creator' and 'producer' are filled from the author property.
        # NOTE(review): presumably this mirrors a metadata schema shared with
        # other loaders -- confirm it is intentional and not a copy-paste slip.
        self.metadata = {
            'author': core.author,
            'creator': core.author,
            'producer': core.author,
            'subject': core.subject,
            'title': core.title,
        }
        if self.all_keys_have_values(metadata=self.metadata):
            return self.metadata
        return False

    def extract_text(self) -> str:
        """Return the text of every paragraph in ``doc.paragraphs``.

        Paragraphs are joined with a newline so that the last word of one
        paragraph is not fused with the first word of the next.

        Returns:
            The paragraph texts in document order, one per line; an empty
            string when the document exposes no paragraphs.
        """
        doc = Document(self.filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def all_keys_have_values(self, metadata, value_check=lambda x: x is not None and x != ''):
        """Tell whether every value in ``metadata`` passes ``value_check``.

        Args:
            metadata: Mapping whose values are inspected (keys are ignored).
            value_check: Predicate applied to each value; by default a value
                passes when it is neither ``None`` nor the empty string.

        Returns:
            ``True`` if the predicate accepts all values (vacuously ``True``
            for an empty mapping), else ``False``.
        """
        return all(value_check(value) for value in metadata.values())
Leave a Comment