Before Refactor

mail@pastecode.io avatar
unknown
python
24 days ago
3.0 kB
2
Indexable
Never
def callRequest(URL, course_id, file_name, course_document_id):
    """Load a document, chunk it, tag the chunks with metadata, and embed them.

    The document at ``URL`` is passed through the module's helpers
    (``load_doc`` -> ``parsing_pdf_html`` -> ``remove_break_add_whitespace``
    -> ``create_document_by_splitting`` -> ``extract_headers``).  Every
    resulting chunk gets ``source``, ``document_name``, ``document_id`` and
    ``tokens_embbed`` metadata.  Chunks whose ``page_content`` is longer than
    7000 characters are re-split with ``resplit_chunk`` and each piece
    inherits the parent's ``header1``..``header4`` metadata.

    If the total token count (chunk tokens plus
    ``get_semantic_chunker_token()``) is below 1,000,000 the chunks are handed
    to ``Embbed_openaAI``; otherwise ``callErrorWebhook`` is called with the
    error ``"PDF Too Large"``.

    Args:
        URL: Location of the document, passed unchanged to ``load_doc``.
        course_id: Course identifier; stored (stringified) as ``source``.
        file_name: Document name; stored (stringified) as ``document_name``.
        course_document_id: Document identifier; stored (stringified) as
            ``document_id``.

    Returns:
        None.
    """
    # Chunks longer than this many characters are re-split before embedding.
    max_chunk_chars = 7000
    # Documents at or above this many tokens are rejected instead of embedded.
    max_doc_tokens = 1_000_000
    header_keys = ('header1', 'header2', 'header3', 'header4')

    logging.basicConfig(level=logging.INFO,  # Set the logging level
                        format='%(asctime)s [%(levelname)s] - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    logger = logging.getLogger(__name__)

    pdf_page = load_doc(URL)
    if pdf_page:
        # Use %s placeholders: passing bare extra args without placeholders
        # makes the logging module fail to format the record.
        logger.info('Document downloaded... Course ID: %s || Document ID: %s',
                    course_id, course_document_id)
    # NOTE(review): processing continues even when load_doc returns a falsy
    # value -- confirm whether load_doc raises on failure or whether an early
    # return is wanted here.

    pdf_page_parsed = parsing_pdf_html(pdf_page)
    removed_break = remove_break_add_whitespace(pdf_page_parsed)
    split_document = create_document_by_splitting(removed_break)
    chunks = extract_headers(split_document)

    # The same three identifying values are written onto every chunk.
    source = f"{course_id}"
    document_name = f"{file_name}"
    document_id = f"{course_document_id}"

    doc_tokens = 0
    print("insert metedata information")
    #*************** add metadata course ID, document name, and document ID to chunks
    new_chunks = []
    for chunk in chunks:
        #*************** Check if chunk.page_content character count is more than 7000. If yes, resplit the chunk
        if len(chunk.page_content) > max_chunk_chars:
            resplit = resplit_chunk(chunk)
            for piece in resplit:
                # Re-split pieces keep the header context of their parent.
                for key in header_keys:
                    piece.metadata[key] = chunk.metadata[key]
                piece.metadata["source"] = source
                piece.metadata["document_name"] = document_name
                piece.metadata["document_id"] = document_id
                # Count the tokens of this piece, not of the whole parent
                # chunk; counting the parent once per piece would inflate
                # both the per-piece metadata and the document total.
                piece_tokens = tokens_embbeding(piece.page_content)
                piece.metadata["tokens_embbed"] = piece_tokens
                doc_tokens += piece_tokens
            new_chunks.extend(resplit)
        #*************** end of Check if chunk.page_content character count is more than 7000. If yes, resplit the chunk
        else:
            chunk.metadata["source"] = source
            chunk.metadata["document_name"] = document_name
            chunk.metadata["document_id"] = document_id
            chunk_tokens = tokens_embbeding(chunk.page_content)
            chunk.metadata["tokens_embbed"] = chunk_tokens
            doc_tokens += chunk_tokens
            new_chunks.append(chunk)

    chunks = new_chunks
    #*************** end of add metadata course ID, document name, and document ID to chunks
    print(f"Token Usage for uploading: {doc_tokens}")
    doc_tokens += get_semantic_chunker_token()
    print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
    print(f"chunks: {len(chunks)}")
    print(f"token usage : {doc_tokens}")
    if doc_tokens < max_doc_tokens:
        print("Embedding Process")
        Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
        print("Embeddings done")
    else:
        error = "PDF Too Large"
        callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
Leave a Comment