Untitled
unknown
python
5 months ago
1.5 kB
4
Indexable
"""Chunk the project's source documents and add them to the vector store.

The script reads every document under ``resources/documents/<DOCS_VER>``,
splits it into overlapping text chunks, and adds the chunks in fixed-size
batches to the vector store returned by ``get_milvus_connection``.
All settings come from ``load_config()``.
"""
import sys  # nopep8
# An empty sys.path entry makes imports resolve against the current working
# directory, so the ``src`` package is found when the script is started from
# the directory that contains it. This must run before the ``src`` imports.
sys.path.append("")  # nopep8
from src.utils.utils import load_config
from src.helper.milvus import get_milvus_connection
from src.utils.documents import load_all_documents
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma, Milvus
from langchain.chains import ConversationalRetrievalChain
import os
import nltk
from tqdm import tqdm

# NLTK resources fetched up front.
# NOTE(review): presumably required by the loaders behind load_all_documents;
# confirm before removing.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# --- Configuration ---------------------------------------------------------
config = load_config()
LIST_MODELS = config['LIST_MODELS']
LIST_EMB_MODELS = config['LIST_EMB_MODELS']
MODEL = config['MODEL']
EMB_MODEL = config['EMB_MODEL']
DOCS_VER = config['DOCS_VER']

# Input documents and the per-version, per-embedding-model vector directory.
folder_path = f"resources/documents/{DOCS_VER}"
persist_directory = f"resources/vectors/{DOCS_VER}/{EMB_MODEL}"

# --- Embeddings, vector store and splitter ---------------------------------
# EMB_MODEL is a key into LIST_EMB_MODELS; the looked-up value is passed to
# OpenAIEmbeddings as the model name.
embeddings = OpenAIEmbeddings(model=LIST_EMB_MODELS[EMB_MODEL])
vectorstore = get_milvus_connection(embeddings=embeddings)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                               chunk_overlap=200)

# The directory is created here but not otherwise referenced in this script.
os.makedirs(persist_directory, exist_ok=True)

# --- Load, split and index -------------------------------------------------
raw_documents = load_all_documents(folder_path)
split_documents = text_splitter.split_documents(raw_documents)

# Add the chunks in slices of batch_size so each add_documents call handles a
# bounded number of chunks and tqdm can report progress per batch.
batch_size = 50
for i in tqdm(range(0, len(split_documents), batch_size),
              desc="Processing batches"):
    batch = split_documents[i:i+batch_size]
    vectorstore.add_documents(batch)
Editor is loading...
Leave a Comment