Untitled
unknown
plain_text
a year ago
2.4 kB
4
Indexable
def _upsert_points(qdrant_client, collection_name, ids, vectors, payloads):
    """Send one non-blocking batch upsert to ``collection_name``.

    Does nothing when ``ids`` is empty, so callers may pass slices that turn
    out to hold no points.
    """
    if not ids:
        return
    qdrant_client.upsert(
        collection_name=collection_name,
        points=models.Batch(ids=ids, vectors=vectors, payloads=payloads),
        wait=False,
    )


def process_batch(qdrant_client, batch):
    """Embed one batch of rows and upsert them into two Qdrant collections.

    Each row is read by position: ``row[0]`` is stored as
    ``publication_number`` (and hashed with ``fast_hash`` to form the abstract
    point id), ``row[2]`` is the abstract text, ``row[3]`` the claims text,
    ``row[4]`` is stored as ``priority_date``, ``row[5]`` is a comma-separated
    string of CPC codes and ``row[6]`` a comma-separated string of assignees.

    Abstracts are embedded one vector per row and written to the
    ``"abstract"`` collection. Claims are split with ``text_splitter``, every
    chunk is embedded separately and written to ``"claims-recursive"`` under a
    fresh UUID, carrying the payload of the row it came from.

    Args:
        qdrant_client: Client object exposing ``upsert(collection_name=...,
            points=..., wait=...)``.
        batch: Sequence of indexable rows laid out as described above. It is
            iterated more than once, so it must not be a one-shot iterator.

    Returns:
        None.

    Raises:
        Exception: Whatever ``qdrant_client.upsert`` raises when the retry
            (abstracts) or the split fallback (claims) also fails.
    """
    abstracts = [row[2] for row in batch]
    claims = [row[3] for row in batch]
    abstract_ids = [fast_hash(row[0]) for row in batch]

    # Nothing to embed or store for an empty batch.
    if not abstract_ids:
        return

    abstract_payloads = [
        {
            "publication_number": row[0],
            "priority_date": row[4],
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the stored list is the same from run to run.
            "cpc_codes": list(
                dict.fromkeys(code.strip() for code in row[5].split(","))
            ),
            # Split on commas, except a comma directly followed by "Ltd.",
            # so names such as "Foo Co., Ltd." stay in one piece.
            "assignees": re.split(r',\s*(?!Ltd\.)', row[6]),
        }
        for row in batch
    ]

    claims_chunked = []
    claims_payloads = []
    claims_ids = []
    for patent_claims, payload in zip(claims, abstract_payloads):
        chunks = text_splitter.split_text(patent_claims)
        claims_chunked.extend(chunks)
        # Every chunk carries the payload of the patent it was cut from.
        claims_payloads.extend([payload] * len(chunks))
        claims_ids.extend(str(uuid.uuid4()) for _ in chunks)

    # embed_many returns (vectors, token_count); the count is not needed here.
    abstract_vectors, _ = embed_many(abstracts)

    # Try the abstract upsert twice; a second failure propagates to the caller.
    # Broad catch on purpose: the client's exception types are not visible here.
    try:
        _upsert_points(
            qdrant_client, "abstract",
            abstract_ids, abstract_vectors, abstract_payloads,
        )
    except Exception:
        _upsert_points(
            qdrant_client, "abstract",
            abstract_ids, abstract_vectors, abstract_payloads,
        )

    # No claim chunks means nothing to embed or write for the second collection.
    if not claims_chunked:
        return

    claims_vectors, _ = embed_many(claims_chunked)

    try:
        _upsert_points(
            qdrant_client, "claims-recursive",
            claims_ids, claims_vectors, claims_payloads,
        )
    except Exception:
        # Fall back to two smaller requests covering the same points.
        mid = len(claims_ids) // 2
        _upsert_points(
            qdrant_client, "claims-recursive",
            claims_ids[:mid], claims_vectors[:mid], claims_payloads[:mid],
        )
        _upsert_points(
            qdrant_client, "claims-recursive",
            claims_ids[mid:], claims_vectors[mid:], claims_payloads[mid:],
        )
Editor is loading...
Leave a Comment