Untitled

mail@pastecode.io avatar
unknown
plain_text
a month ago
2.2 kB
4
Indexable
Never
 #*************** add metadata course ID, document name, and document ID to chunks
    new_chunks = []
    #*************** Header metadata carried over from an oversized chunk onto each of its re-split pieces
    header_keys = ("header1", "header2", "header3", "header4")
    for doc in chunks:
        #*************** Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
        if len(doc.page_content) > 7000:
            pieces = resplit_chunk(doc)
            for piece in pieces:
                # NOTE(review): doc.metadata is indexed directly, so a chunk that lacks
                # any of these header keys raises KeyError - confirm every chunk carries all four.
                for header_key in header_keys:
                    piece.metadata[header_key] = doc.metadata[header_key]
        #*************** end of Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
        else:
            pieces = [doc]

        #*************** Tag every resulting piece and add its own token count to the running total
        for piece in pieces:
            piece.metadata["source"] = f"{course_id}"
            piece.metadata["document_name"] = f"{file_name}"
            piece.metadata["document_id"] = f"{course_document_id}"
            # Count the tokens of the piece itself. Counting the parent chunk's text here
            # would record the parent's size on every piece and add the full oversized
            # chunk to doc_tokens once per piece, inflating the total.
            piece_tokens = tokens_embbeding(piece.page_content)
            piece.metadata["tokens_embbed"] = piece_tokens
            doc_tokens += piece_tokens
        new_chunks.extend(pieces)

    chunks = new_chunks
    #*************** end of add metadata course ID, document name, and document ID to chunks
    #*************** Report token usage, first for the chunks alone and then including the semantic chunker's tokens
    print(f"Token Usage for uploading: {doc_tokens}")
    doc_tokens += get_semantic_chunker_token()
    print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
    print(f"chunks: {len(chunks)}")
    print(f"token usage : {doc_tokens}")

    #*************** Embed only while the total stays under the token limit; otherwise report the failure through the error webhook
    token_limit = 1_000_000
    if doc_tokens >= token_limit:
        error = "PDF Too Large"
        callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
    else:
        print("Embedding Process")
        Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
        print("Embeddings done")
Leave a Comment