Untitled
unknown
plain_text
a year ago
418 B
12
Indexable
from src.ingestion.loaders.loaderBase import LoaderBase
import html2text
import re
class LoaderHTML(LoaderBase):
    """Loader that turns an HTML file on disk into text using html2text."""

    def __init__(self, filepath: str):
        """Remember which HTML file this loader works on.

        Args:
            filepath: Path of the HTML file to load.
        """
        self.filepath = filepath

    def extract_metadata(self):
        """Metadata extraction is not implemented for this loader.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def extract_text(self) -> str:
        """Read the HTML file and return its content converted by html2text.

        Returns:
            The string produced by ``html2text.HTML2Text.handle`` for the
            file's markup.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # HTML2Text.handle() expects HTML markup, not a path. Passing
        # self.filepath directly would convert the path string itself, so
        # the file has to be read first.
        # NOTE(review): UTF-8 is assumed here; confirm against the actual
        # encoding of the ingested files.
        with open(self.filepath, encoding="utf-8") as source:
            markup = source.read()

        converter = html2text.HTML2Text()
        return converter.handle(markup)
Editor is loading...
Leave a Comment