Before Refactor
import logging

# Helper functions (load_doc, parsing_pdf_html, remove_break_add_whitespace, create_document_by_splitting,
# extract_headers, resplit_chunk, tokens_embbeding, get_semantic_chunker_token, Embbed_openaAI,
# callErrorWebhook) and url_error_webhook are defined elsewhere in the project.

def callRequest(URL, course_id, file_name, course_document_id):
    # Configure logging on each call; consider moving this to module level.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [%(levelname)s] - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(__name__)

    # Download the PDF, parse it, and split it into header-aware chunks.
    pdf_page = load_doc(URL)
    if pdf_page:
        logger.info('Document downloaded... Course ID: %s || Document ID: %s',
                    course_id, course_document_id)
        pdf_page_parsed = parsing_pdf_html(pdf_page)
        removed_break = remove_break_add_whitespace(pdf_page_parsed)
        doc = create_document_by_splitting(removed_break)
        chunks = extract_headers(doc)
        doc_tokens = 0

        print("insert metadata information")
        #*************** add metadata course ID, document name, and document ID to chunks
        new_chunks = []
        for chunk in chunks:
            #*************** if chunk.page_content is longer than 7000 characters, resplit the chunk
            if len(chunk.page_content) > 7000:
                resplit = resplit_chunk(chunk)
                for resplit_doc in resplit:
                    # Carry the parent chunk's header hierarchy over to each resplit piece.
                    for header in ('header1', 'header2', 'header3', 'header4'):
                        resplit_doc.metadata[header] = chunk.metadata[header]
                    resplit_doc.metadata["source"] = f"{course_id}"
                    resplit_doc.metadata["document_name"] = f"{file_name}"
                    resplit_doc.metadata["document_id"] = f"{course_document_id}"
                    # Count tokens of the resplit piece itself (the original counted the parent chunk here).
                    tokens = tokens_embbeding(resplit_doc.page_content)
                    resplit_doc.metadata["tokens_embbed"] = tokens
                    doc_tokens += tokens
                new_chunks.extend(resplit)
            #*************** end of resplit check
            else:
                chunk.metadata["source"] = f"{course_id}"
                chunk.metadata["document_name"] = f"{file_name}"
                chunk.metadata["document_id"] = f"{course_document_id}"
                tokens = tokens_embbeding(chunk.page_content)
                chunk.metadata["tokens_embbed"] = tokens
                doc_tokens += tokens
                new_chunks.append(chunk)
        chunks = new_chunks
        #*************** end of add metadata course ID, document name, and document ID to chunks

        print(f"Token Usage for uploading: {doc_tokens}")
        doc_tokens += get_semantic_chunker_token()
        print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
        print(f"chunks: {len(chunks)}")
        print(f"token usage: {doc_tokens}")

        # Embed only while total token usage stays under the 1,000,000-token budget;
        # otherwise report the failure through the error webhook.
        if doc_tokens < 1000000:
            print("Embedding Process")
            Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
            print("Embeddings done")
        else:
            error = "PDF Too Large"
            callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
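For context, a minimal invocation sketch; the URL, IDs, and file name below are hypothetical placeholders, not values from the original snippet:

# Hypothetical usage example (placeholder values only).
if __name__ == "__main__":
    callRequest(
        URL="https://example.com/sample-course.pdf",   # placeholder document URL
        course_id="course-123",                        # placeholder course ID
        file_name="sample-course.pdf",                 # placeholder file name
        course_document_id="doc-456",                  # placeholder document ID
    )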