Untitled

 avatar
unknown
plain_text
a year ago
2.4 kB
4
Indexable
def _upsert_points(qdrant_client, collection_name, ids, vectors, payloads):
    """Send one batch upsert to ``collection_name`` with ``wait=False``."""
    qdrant_client.upsert(
        collection_name=collection_name,
        points=models.Batch(ids=ids, vectors=vectors, payloads=payloads),
        wait=False,
    )


def process_batch(qdrant_client, batch):
    """Embed a batch of patent rows and upsert them into two collections.

    For every row, the abstract is embedded and written to the ``"abstract"``
    collection, and the claims text is split into chunks, embedded, and
    written to the ``"claims-recursive"`` collection. Each claims chunk
    carries the same payload dict as its parent row.

    Args:
        qdrant_client: Client object exposing ``upsert(collection_name=...,
            points=..., wait=...)``.
        batch: Sequence of indexable rows. The fields read here are:
            ``row[0]`` publication number, ``row[2]`` abstract text,
            ``row[3]`` claims text, ``row[4]`` priority date, ``row[5]``
            comma-separated CPC codes, ``row[6]`` assignee string.
            ``row[1]`` is not read.

    Returns:
        None. An empty ``batch`` returns immediately without calling the
        embedder or the client.

    Raises:
        Exception: Whatever the retry / split-fallback upsert raises if it
            fails as well; only the first failure per collection is caught.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Nothing to embed or store; avoid sending empty requests downstream.
    if not batch:
        return

    abstracts = [row[2] for row in batch]
    abstract_ids = [fast_hash(row[0]) for row in batch]
    abstract_payloads = [
        {
            "publication_number": row[0],
            "priority_date": row[4],
            # De-duplicate, drop blanks left by empty fields or trailing
            # commas, and sort so the stored order is deterministic.
            "cpc_codes": sorted(
                {code.strip() for code in row[5].split(",")} - {""}
            ),
            # Split on commas, except a comma directly followed by "Ltd."
            "assignees": re.split(r',\s*(?!Ltd\.)', row[6]),
        }
        for row in batch
    ]

    claims_chunked = []
    claims_payloads = []
    for row, payload in zip(batch, abstract_payloads):
        chunks = text_splitter.split_text(row[3])
        claims_chunked.extend(chunks)
        # Every chunk shares the parent row's payload dict.
        claims_payloads.extend([payload] * len(chunks))
    # NOTE(review): chunk IDs are random UUID4 strings, so re-processing the
    # same rows presumably adds new claim points instead of overwriting the
    # old ones -- confirm this is intended.
    claims_ids = [str(uuid.uuid4()) for _ in claims_chunked]

    # embed_many returns a pair; only the vectors (first element) are used.
    abstract_vectors, _ = embed_many(abstracts)
    if claims_chunked:
        claims_vectors, _ = embed_many(claims_chunked)
    else:
        claims_vectors = []

    # Abstracts: one attempt plus one identical retry.
    try:
        _upsert_points(
            qdrant_client, "abstract",
            abstract_ids, abstract_vectors, abstract_payloads,
        )
    except Exception:
        logger.warning(
            "Upsert into 'abstract' failed; retrying once", exc_info=True
        )
        _upsert_points(
            qdrant_client, "abstract",
            abstract_ids, abstract_vectors, abstract_payloads,
        )

    if not claims_ids:
        return

    # Claims: one attempt with everything, then fall back to two halves.
    try:
        _upsert_points(
            qdrant_client, "claims-recursive",
            claims_ids, claims_vectors, claims_payloads,
        )
    except Exception:
        logger.warning(
            "Upsert into 'claims-recursive' failed; retrying in two halves",
            exc_info=True,
        )
        mid = len(claims_ids) // 2
        for part in (slice(None, mid), slice(mid, None)):
            # With a single chunk the first half is empty; skip it.
            if claims_ids[part]:
                _upsert_points(
                    qdrant_client, "claims-recursive",
                    claims_ids[part], claims_vectors[part],
                    claims_payloads[part],
                )
Editor is loading...
Leave a Comment