#*************** add metadata: course ID, document name, and document ID to chunks
# (doc_tokens is assumed to be initialized earlier in the script)
new_chunks = []
for doc in chunks:
    #*************** Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
    if len(doc.page_content) > 7000:
        resplit = resplit_chunk(doc)
        for resplit_doc in resplit:
            resplit_doc.metadata['header1'] = doc.metadata['header1']
            resplit_doc.metadata['header2'] = doc.metadata['header2']
            resplit_doc.metadata['header3'] = doc.metadata['header3']
            resplit_doc.metadata['header4'] = doc.metadata['header4']
            resplit_doc.metadata["source"] = f"{course_id}"
            resplit_doc.metadata["document_name"] = f"{file_name}"
            resplit_doc.metadata["document_id"] = f"{course_document_id}"
            # Count tokens of the resplit chunk itself, not the original oversized chunk
            x = tokens_embbeding(resplit_doc.page_content)
            resplit_doc.metadata["tokens_embbed"] = x
            doc_tokens += x
        new_chunks.extend(resplit)
    #*************** end of Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
    else:
        doc.metadata["source"] = f"{course_id}"
        doc.metadata["document_name"] = f"{file_name}"
        doc.metadata["document_id"] = f"{course_document_id}"
        x = tokens_embbeding(doc.page_content)
        doc.metadata["tokens_embbed"] = x
        doc_tokens += x
        new_chunks.append(doc)
chunks = new_chunks
#*************** end of add metadata: course ID, document name, and document ID to chunks
print(f"Token Usage for uploading: {doc_tokens}")
doc_tokens += get_semantic_chunker_token()
print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
print(f"chunks: {len(chunks)}")
print(f"token usage : {doc_tokens}")
if doc_tokens < 1000000:
    print("Embedding Process")
    Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
    print("Embeddings done")
else:
    error = "PDF Too Large"
    callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
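
#*************** hypothetical sketches of helpers assumed by this snippet (not part of the original paste)
# resplit_chunk and tokens_embbeding are project helpers defined elsewhere; the versions
# below are illustrative only, assuming LangChain Document chunks and the cl100k_base
# tokenizer. The real implementations may differ.
import tiktoken
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def resplit_chunk(doc: Document) -> list[Document]:
    # Re-split an oversized chunk into smaller pieces; split_documents copies the
    # original metadata, which the loop above then overwrites per piece.
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    return splitter.split_documents([doc])

def tokens_embbeding(text: str) -> int:
    # Count how many tokens a chunk will consume when sent to the embedding model.
    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))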
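
#*************** hypothetical sketches of the final-step helpers (not part of the original paste)
# Embbed_openaAI and callErrorWebhook are also project-specific. As an assumption,
# Embbed_openaAI embeds each chunk's text with OpenAI embeddings (vector storage is
# omitted because the backend is not shown), and callErrorWebhook posts the failure
# details to the configured webhook URL.
import requests
from langchain.embeddings import OpenAIEmbeddings

def Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens):
    # Embed every chunk's text; persisting the vectors (with course_id /
    # course_document_id as metadata) is left out since the vector store is unknown.
    embedder = OpenAIEmbeddings()  # reads OPENAI_API_KEY from the environment
    vectors = embedder.embed_documents([doc.page_content for doc in chunks])
    return vectors

def callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error):
    # Notify an external service that the document could not be embedded.
    payload = {
        "course_id": course_id,
        "document_id": course_document_id,
        "token_usage": doc_tokens,
        "error": error,
    }
    requests.post(url_error_webhook, json=payload, timeout=10)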