After Refactor
unknown
python
a year ago
1.7 kB
13
Indexable
def upload_document(
    URL,
    course_id,
    file_name,
    course_document_id,
):
    """
    Upload and process a document from a PDF URL to AstraDB.

    The document is loaded with ``load_doc`` and, if ``is_document`` accepts
    it, run through ``parsing_pdf_html``, ``remove_break_add_whitespace``,
    ``create_document_by_splitting`` and ``extract_headers``. The resulting
    chunks are passed to ``process_chunks`` together with the course ID, file
    name and course document ID, which also yields the document token count.

    If the token count is below the limit and ``check_upload_document``
    returns a truthy value, the chunks are handed to ``embbed_openai``.
    Otherwise a failure payload is pushed to ``url_error_webhook`` via
    ``push_payload``.

    Args:
        URL (str): URL to the document.
        course_id (str): ID of the course.
        file_name (str): Name of the file.
        course_document_id (str): ID of the course document.

    Returns:
        None. The function works through side effects only (embedding the
        chunks or pushing an error webhook).
    """
    # Documents at or above this many tokens are rejected instead of embedded.
    max_doc_tokens = 1000000

    pdf_page = load_doc(URL)
    if not is_document(pdf_page):
        # The original fell through silently here; the log makes the skipped
        # upload visible without changing what callers observe.
        LOGGER.warning(
            'Document not loaded, skipping upload... Course ID: %s || Document ID: %s',
            course_id,
            course_document_id,
        )
        return

    # Lazy %-style arguments: logging formats the message as ``msg % args``,
    # so every extra argument needs a matching placeholder.
    LOGGER.info(
        'Success Document Loaded... Course ID: %s || Document ID: %s',
        course_id,
        course_document_id,
    )

    pdf_page_parsed = parsing_pdf_html(pdf_page)
    removed_break = remove_break_add_whitespace(pdf_page_parsed)
    doc = create_document_by_splitting(removed_break)
    chunks = extract_headers(doc)

    print("insert metedata information")
    # *************** add metadata course ID, document name, and document ID to chunks
    # The trailing 0 is the starting token count handed to process_chunks.
    new_chunks, doc_tokens = process_chunks(
        chunks, course_id, file_name, course_document_id, 0
    )

    # Short-circuit: check_upload_document is only consulted when the
    # document is under the token limit.
    if doc_tokens < max_doc_tokens and check_upload_document(
        new_chunks, course_id, course_document_id, doc_tokens
    ):
        embbed_openai(new_chunks, course_id, course_document_id, doc_tokens)
        return

    # NOTE(review): the status typo and the "PDF Too Large" message are kept
    # byte-for-byte because the webhook receiver may match on them. This
    # branch is also reached when check_upload_document rejects the chunks,
    # so the error text can be misleading -- confirm with the backend.
    status = "Failed Uplaod"
    webhook_payload = {
        'course_id': course_id,
        'document_id': course_document_id,
        'tokens_embbed': doc_tokens,
        'error': "PDF Too Large",
    }
    push_payload(
        url_error_webhook,
        status,
        webhook_payload,
    )
Leave a Comment