Before Refactor
import logging

# NOTE: the helper functions used below (load_doc, parsing_pdf_html,
# remove_break_add_whitespace, create_document_by_splitting, extract_headers,
# resplit_chunk, tokens_embbeding, get_semantic_chunker_token, Embbed_openaAI,
# callErrorWebhook) and url_error_webhook are assumed to be defined or imported
# elsewhere in the project.

def callRequest(URL, course_id, file_name, course_document_id):
    logging.basicConfig(level=logging.INFO,  # Set the logging level
                        format='%(asctime)s [%(levelname)s] - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger(__name__)

    pdf_page = load_doc(URL)
    if pdf_page:
        logger.info('Document downloaded... Course ID: %s || Document ID: %s',
                    course_id, course_document_id)
        pdf_page_parsed = parsing_pdf_html(pdf_page)
        removed_break = remove_break_add_whitespace(pdf_page_parsed)
        doc = create_document_by_splitting(removed_break)
        chunks = extract_headers(doc)
        doc_tokens = 0

        print("insert metadata information")
        #*************** add metadata course ID, document name, and document ID to chunks
        new_chunks = []
        for doc in chunks:
            #*************** Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
            if len(doc.page_content) > 7000:
                resplit = resplit_chunk(doc)
                for resplit_doc in resplit:
                    resplit_doc.metadata['header1'] = doc.metadata['header1']
                    resplit_doc.metadata['header2'] = doc.metadata['header2']
                    resplit_doc.metadata['header3'] = doc.metadata['header3']
                    resplit_doc.metadata['header4'] = doc.metadata['header4']
                    resplit_doc.metadata["source"] = f"{course_id}"
                    resplit_doc.metadata["document_name"] = f"{file_name}"
                    resplit_doc.metadata["document_id"] = f"{course_document_id}"
                    # Count tokens of the resplit chunk itself, not the original oversized chunk
                    x = tokens_embbeding(resplit_doc.page_content)
                    resplit_doc.metadata["tokens_embbed"] = x
                    doc_tokens += x
                new_chunks.extend(resplit)
            #*************** end of Check if doc.page_content character count is more than 7000. If yes, resplit the chunk
            else:
                doc.metadata["source"] = f"{course_id}"
                doc.metadata["document_name"] = f"{file_name}"
                doc.metadata["document_id"] = f"{course_document_id}"
                x = tokens_embbeding(doc.page_content)
                doc.metadata["tokens_embbed"] = x
                doc_tokens += x
                new_chunks.append(doc)
        chunks = new_chunks
        #*************** end of add metadata course ID, document name, and document ID to chunks

        print(f"Token Usage for uploading: {doc_tokens}")
        doc_tokens += get_semantic_chunker_token()
        print(f"Token Usage for uploading + semantic chunking: {doc_tokens}")
        print(f"chunks: {len(chunks)}")
        print(f"token usage : {doc_tokens}")

        if doc_tokens < 1000000:
            print("Embedding Process")
            Embbed_openaAI(chunks, course_id, course_document_id, doc_tokens)
            print("Embeddings done")
        else:
            error = "PDF Too Large"
            callErrorWebhook(url_error_webhook, course_id, course_document_id, doc_tokens, error)
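
# A minimal usage sketch (not part of the original paste): the URL, IDs, and
# file name below are hypothetical placeholders, and callRequest is assumed to
# be importable from the module that defines it together with its helpers.
if __name__ == "__main__":
    callRequest(
        URL="https://example.com/sample-course.pdf",   # hypothetical document URL
        course_id="course-123",                        # hypothetical course ID
        file_name="sample-course.pdf",                 # hypothetical file name
        course_document_id="doc-456",                  # hypothetical document ID
    )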