Untitled
unknown
plain_text
5 months ago
1.0 kB
3
Indexable
from docx import Document

from src.ingestion.loaders.loaderBase import LoaderBase


class LoaderDOCX(LoaderBase):
    """Loader that pulls core metadata and paragraph text out of a .docx file.

    The document at ``filepath`` is opened with ``docx.Document`` each time
    ``extract_metadata`` or ``extract_text`` is called; nothing is cached
    between calls apart from ``self.metadata``.
    """

    def __init__(self, filepath: str):
        """Remember the path of the document to load.

        Args:
            filepath: Path handed to ``docx.Document`` when extracting.
        """
        # NOTE(review): LoaderBase.__init__ is not invoked here (unchanged
        # from the original) -- confirm the base class needs no setup.
        self.filepath = filepath

    def extract_metadata(self):
        """Read the document's core properties into ``self.metadata``.

        Returns:
            The metadata dict when every value passes
            ``all_keys_have_values``; otherwise ``False``. ``self.metadata``
            is assigned in both cases.
        """
        doc_info = Document(self.filepath).core_properties
        # 'creator' and 'producer' are both filled from the author property.
        # NOTE(review): presumably this keeps the key set aligned with other
        # loaders -- confirm it is intentional.
        metadata = {
            'author': doc_info.author,
            'creator': doc_info.author,
            'producer': doc_info.author,
            'subject': doc_info.subject,
            'title': doc_info.title,
        }
        self.metadata = metadata
        if self.all_keys_have_values(metadata=self.metadata):
            return self.metadata
        return False

    def extract_text(self) -> str:
        """Return the text of every paragraph, one paragraph per line.

        Paragraphs are joined with a newline so that the last word of one
        paragraph does not run into the first word of the next. A document
        with no paragraphs yields an empty string.
        """
        doc = Document(self.filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def all_keys_have_values(self, metadata, value_check=lambda x: x is not None and x != ''):
        """Tell whether every value in ``metadata`` passes ``value_check``.

        Args:
            metadata: Mapping whose values are inspected.
            value_check: Predicate applied to each value; by default a value
                passes when it is neither ``None`` nor the empty string.

        Returns:
            ``True`` if all values pass (also for an empty mapping),
            ``False`` otherwise.
        """
        return all(value_check(value) for value in metadata.values())
Editor is loading...
Leave a Comment