After Refactor

mail@pastecode.io avatar
unknown
python
19 days ago
1.7 kB
3
Indexable
Never
def upload_document(
        URL,
        course_id,
        file_name,
        course_document_id,
    ):
    """
    Upload and process a document from a PDF URL to AstraDB.

    The document is loaded, parsed, split into chunks, and tagged with
    course/document metadata. The chunks are then either handed to
    ``embbed_openai`` or, when the upload is rejected, reported through
    the error webhook.

    Args:
        URL (str): URL to the document.
        course_id (str): ID of the course.
        file_name (str): Name of the file.
        course_document_id (str): ID of the course document.

    Returns:
        None. The outcome is communicated through side effects only:
        on success the chunks are passed to ``embbed_openai``; otherwise
        a "Failed Uplaod" payload is pushed to ``url_error_webhook``.
    """
    pdf_page = load_doc(URL)

    # NOTE(review): processing continues even when is_document() is falsy;
    # confirm whether a failed load should abort here instead.
    if is_document(pdf_page):
        # Build the message up front: the IDs used to be passed as extra
        # positional arguments with no placeholders in the message, which
        # stdlib logging cannot interpolate (it reports a formatting error
        # instead of emitting the line).
        LOGGER.info(
            'Success Document Loaded... Course ID: {} || Document ID: {}'.format(
                course_id, course_document_id
            )
        )

    pdf_page_parsed = parsing_pdf_html(pdf_page)
    removed_break = remove_break_add_whitespace(pdf_page_parsed)
    doc = create_document_by_splitting(removed_break)
    chunks = extract_headers(doc)
    doc_tokens = 0
    print("insert metedata information")
    #*************** add metadata course ID, document name, and document ID to chunks
    new_chunks, doc_tokens = process_chunks(
        chunks, course_id, file_name, course_document_id, doc_tokens
    )

    # The token limit is tested first so check_upload_document() is only
    # called for documents under the limit (short-circuit `and`).
    if doc_tokens < 1000000 and check_upload_document(
        new_chunks, course_id, course_document_id, doc_tokens
    ):
        embbed_openai(new_chunks, course_id, course_document_id, doc_tokens)
    else:
        # NOTE(review): this branch is also taken when
        # check_upload_document() returns a falsy value, yet the reported
        # error is always "PDF Too Large" -- confirm that is intended.
        # The status/error strings are left untouched because the webhook
        # consumer may match on them.
        status = "Failed Uplaod"
        webhook_payload = {
            'course_id': course_id,
            'document_id': course_document_id,
            'tokens_embbed': doc_tokens,
            'error': "PDF Too Large",
        }
        push_payload(
            url_error_webhook,
            status,
            webhook_payload,
        )
Leave a Comment