Untitled
unknown
plain_text
a year ago
2.4 kB
8
Indexable
def _patent_payload(row):
    """Build the payload dict stored alongside every point of one patent row."""
    return {
        "publication_number": row[0],
        "priority_date": row[4],
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # stored list is the same on every run (iterating a set is not).
        "cpc_codes": list(dict.fromkeys(code.strip() for code in row[5].split(","))),
        # Split on commas, except a comma that introduces "Ltd." (as in
        # "Foo Co., Ltd."). The lookahead has to come before the whitespace:
        # with `,\s*(?!Ltd\.)` the `\s*` backtracks to zero width, the
        # lookahead then sees " Ltd." and the name is split anyway.
        "assignees": re.split(r",(?!\s*Ltd\.)\s*", row[6]),
    }


def _upsert_points(qdrant_client, collection_name, ids, vectors, payloads):
    """Send one batch upsert with wait=False; do nothing when there are no points."""
    if not ids:
        return
    qdrant_client.upsert(
        collection_name=collection_name,
        points=models.Batch(ids=ids, vectors=vectors, payloads=payloads),
        wait=False,
    )


def process_batch(qdrant_client, batch):
    """Embed a batch of patent rows and upsert them into two Qdrant collections.

    Rows are indexed positionally: ``row[0]`` publication number, ``row[2]``
    abstract text, ``row[3]`` claims text, ``row[4]`` priority date,
    ``row[5]`` comma-separated CPC codes, ``row[6]`` comma-separated
    assignees. ``row[1]`` is not used here.

    * Collection ``"abstract"`` receives one point per row, with the id
      ``fast_hash(publication_number)``.
    * Collection ``"claims-recursive"`` receives one point per chunk produced
      by ``text_splitter.split_text`` on the claims text, each with a fresh
      random UUID and the same payload as its parent row.

    Args:
        qdrant_client: Client object exposing ``upsert(collection_name=...,
            points=..., wait=...)``.
        batch: Sequence of rows laid out as described above.

    Raises:
        Exception: Whatever the client raises when a retry attempt fails too;
            only the first failure per collection is caught.
    """
    import logging

    if not batch:
        return
    logger = logging.getLogger(__name__)

    abstracts = [row[2] for row in batch]
    abstract_ids = [fast_hash(row[0]) for row in batch]
    abstract_payloads = [_patent_payload(row) for row in batch]

    claims_chunked = []
    claims_payloads = []
    for row, payload in zip(batch, abstract_payloads):
        chunks = text_splitter.split_text(row[3])
        claims_chunked.extend(chunks)
        # Every chunk of a patent carries that patent's payload.
        claims_payloads.extend([payload] * len(chunks))
    claims_ids = [str(uuid.uuid4()) for _ in claims_chunked]

    # Both embedding calls run before any upsert so that an embedding failure
    # leaves neither collection modified by this batch.
    abstract_vectors, _ = embed_many(abstracts)
    if claims_chunked:
        claims_vectors, _ = embed_many(claims_chunked)
    else:
        claims_vectors = []

    # Abstracts: on failure, retry the identical request once.
    # The client's exception types are not visible here, hence the broad catch;
    # a failure of the retry propagates to the caller.
    try:
        _upsert_points(
            qdrant_client, "abstract", abstract_ids, abstract_vectors, abstract_payloads
        )
    except Exception:
        logger.warning("Upsert into 'abstract' failed; retrying once", exc_info=True)
        _upsert_points(
            qdrant_client, "abstract", abstract_ids, abstract_vectors, abstract_payloads
        )

    # Claims: on failure, resend as two half-sized requests.
    try:
        _upsert_points(
            qdrant_client, "claims-recursive", claims_ids, claims_vectors, claims_payloads
        )
    except Exception:
        logger.warning(
            "Upsert into 'claims-recursive' failed; retrying in two halves",
            exc_info=True,
        )
        mid = len(claims_ids) // 2
        # With a single point the first half is empty; _upsert_points skips it.
        for part in (slice(None, mid), slice(mid, None)):
            _upsert_points(
                qdrant_client,
                "claims-recursive",
                claims_ids[part],
                claims_vectors[part],
                claims_payloads[part],
            )
Leave a Comment