Before refactor
#*************** add metadata course ID, document name, and document ID to chunks
new_chunks = []
for doc in chunks:
    #*************** Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
    if len(doc.page_content) > 7000:
        resplit = resplit_chunk(doc)
        for resplit_doc in resplit:
            resplit_doc.metadata['header1'] = doc.metadata['header1']
            resplit_doc.metadata['header2'] = doc.metadata['header2']
            resplit_doc.metadata['header3'] = doc.metadata['header3']
            resplit_doc.metadata['header4'] = doc.metadata['header4']
            resplit_doc.metadata["source"] = f"{course_id}"
            resplit_doc.metadata["document_name"] = f"{file_name}"
            resplit_doc.metadata["document_id"] = f"{course_document_id}"
            # count embedding tokens per resplit chunk, not per original chunk
            x = tokens_embbeding(resplit_doc.page_content)
            resplit_doc.metadata["tokens_embbed"] = x
            doc_tokens += x
        new_chunks.extend(resplit)
    #*************** end of Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
    else:
        doc.metadata["source"] = f"{course_id}"
        doc.metadata["document_name"] = f"{file_name}"
        doc.metadata["document_id"] = f"{course_document_id}"
        x = tokens_embbeding(doc.page_content)
        doc.metadata["tokens_embbed"] = x
        doc_tokens += x
        new_chunks.append(doc)
chunks = new_chunks
#*************** end of add metadata course ID, document name, and document ID to chunks

print(f"Token Usage for uploading: {doc_tokens}")
doc_tokens += get_semantic_chunker_token()
print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
print(f"chunks: {len(chunks)}")
print(f"token usage : {doc_tokens}")

if doc_tokens < 1000000:
    print("Embedding Process")
    Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
    print("Embeddings done")
else:
    error = "PDF Too Large"
    callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
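The helpers called above (resplit_chunk, tokens_embbeding, get_semantic_chunker_token, Embbed_openaAI, callErrorWebhook) are defined elsewhere in the project and not included in this paste. Below is a minimal sketch of what they might look like, assuming LangChain text splitters and tiktoken for token counting; only the call shapes come from the snippet, everything else is an assumption, not the project's actual implementation.

# Hypothetical stubs for the helpers the snippet depends on.
# Signatures mirror the calls above; bodies are illustrative assumptions.
from typing import List

import requests
import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


def resplit_chunk(doc: Document) -> List[Document]:
    # Assumed: break an oversized chunk into smaller pieces; the caller
    # re-attaches header/source metadata afterwards.
    splitter = RecursiveCharacterTextSplitter(chunk_size=7000, chunk_overlap=200)
    return splitter.create_documents([doc.page_content])


def tokens_embbeding(text: str) -> int:
    # Assumed: count tokens with the embedding model's encoding.
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def get_semantic_chunker_token() -> int:
    # Assumed: return the tokens already spent during semantic chunking.
    return 0


def Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens):
    # Assumed: embed the chunks and persist them with their metadata.
    ...


def callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error):
    # Assumed: report a failed upload to an external webhook.
    requests.post(url_error_webhook, json={
        "course_id": course_id,
        "document_id": course_document_id,
        "tokens": doc_tokens,
        "error": error,
    })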