Untitled

mail@pastecode.io avatar
unknown
python
5 months ago
7.2 kB
4
Indexable
import pandas as pd
import time
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

class NameClassifier:
    """Decide whether a piece of text is a person's name using a Groq chat model.

    The prompt asks the model to answer 'Yes' for a person's name and
    'Unclaimed' otherwise. Failed model calls are retried.

    Args:
        api_key: API key handed to ``ChatGroq``.
        model_name: Model identifier handed to ``ChatGroq``.
        max_retries: Maximum number of model calls made per input.
        base_wait_time: Seconds to wait after a rate-limit error on the first
            attempt; the wait doubles with every further attempt.
    """

    # Seconds to pause before retrying after an error that is not a rate limit.
    _GENERIC_RETRY_DELAY = 5
    # Quotes, markdown markers and punctuation tolerated around the model's answer.
    _ANSWER_PADDING = "'\"`*.,!:;"

    def __init__(self, api_key, model_name="mixtral-8x7b-32768", max_retries=5, base_wait_time=60):
        self.llm = ChatGroq(model_name=model_name, api_key=api_key)
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant specialized in identifying person names. Your task is to determine if the given text is a person's name or not."),
            ("human", "Is the following text a person's name? Respond with 'Yes' if it's a person's name, or 'Unclaimed' if it's not.\n\nText: {text}\n\nClassification:")
        ])
        # The leading mapping wraps whatever the chain is invoked with under the
        # "text" key, so the chain must be invoked with the raw text itself.
        self.chain = (
            {"text": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        self.max_retries = max_retries
        self.base_wait_time = base_wait_time

    @staticmethod
    def _is_affirmative(reply):
        """Return True when the first word of the model reply is 'yes'."""
        words = reply.split()
        if not words:
            return False
        return words[0].strip(NameClassifier._ANSWER_PADDING).lower() == "yes"

    def classify_name(self, text):
        """Classify ``text`` and return it unchanged when it is a person's name.

        Args:
            text: The text to classify.

        Returns:
            ``text`` itself when the model answers yes, otherwise the string
            "Unclaimed". "Unclaimed" is also returned when every attempt
            fails; the method never returns None.
        """
        for attempt in range(self.max_retries):
            try:
                # Pass the bare text: the chain adds the {"text": ...} wrapper
                # itself, so passing a dict would put the dict's repr in the prompt.
                result = self.chain.invoke(text)
            except Exception as e:
                print(f"Error classifying name (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt >= self.max_retries - 1:
                    # No attempts left, so sleeping again would only waste time.
                    break
                if "rate_limit_exceeded" in str(e):
                    wait_time = self.base_wait_time * (2 ** attempt)
                    print(f"Rate limit exceeded. Waiting for {wait_time} seconds before retrying.")
                    time.sleep(wait_time)
                else:
                    time.sleep(self._GENERIC_RETRY_DELAY)
                continue
            # Return the original name if it's classified as a person's name.
            return text if self._is_affirmative(result) else "Unclaimed"

        print(f"All retry attempts failed. Returning 'Unclaimed' for '{text}'.")
        return "Unclaimed"

def process_name_list(name_list, api_key, model_name="mixtral-8x7b-32768"):
    """Classify each entry of ``name_list`` and gather the outcomes in a DataFrame.

    A progress line is printed for every entry, and after every 100th entry
    the rows collected so far are written to a numbered intermediate CSV file.

    Args:
        name_list: Sequence of texts to classify.
        api_key: API key forwarded to ``NameClassifier``.
        model_name: Model identifier forwarded to ``NameClassifier``.

    Returns:
        A DataFrame with the columns "Input" and "Classification".
    """
    checkpoint_every = 100
    name_classifier = NameClassifier(api_key, model_name)
    rows = []

    for position, name in enumerate(name_list, start=1):
        print(f"Processing name {position}/{len(name_list)}: {name}")
        rows.append({
            "Input": name,
            "Classification": name_classifier.classify_name(name),
        })

        if position % checkpoint_every != 0:
            continue

        # Checkpoint: persist everything classified so far.
        pd.DataFrame(rows).to_csv(
            f'intermediate_name_classification_results_{position}.csv', index=False
        )
        print(f"Saved intermediate results at iteration {position}")

    return pd.DataFrame(rows)

# Example usage: classify a small sample of texts and save the outcome as CSV.
if __name__ == "__main__":
    api_key = "your_api_key_here"  # Replace with your actual API key
    output_path = 'name_classification_results.csv'

    # A mix of person names and other texts, to exercise both outcomes.
    sample_inputs = [
        "John Doe", "Apple Inc.",
        "Jane Smith", "New York City",
        "William Shakespeare", "Artificial Intelligence",
        "Emily Johnson", "Python Programming",
        "Michael Jackson", "United Nations",
    ]

    result_df = process_name_list(sample_inputs, api_key)
    print(result_df)
    result_df.to_csv(output_path, index=False)
    print(f"Results saved to '{output_path}'")


















import pandas as pd
import time
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

class NameClassifier:
    """Decide whether a piece of text is a person's name using a Groq chat model.

    The prompt asks the model to answer 'Yes' for a person's name and
    'Unclaimed' otherwise. Failed model calls are retried.

    Args:
        api_key: API key handed to ``ChatGroq``.
        model_name: Model identifier handed to ``ChatGroq``.
        max_retries: Maximum number of model calls made per input.
        base_wait_time: Seconds to wait after a rate-limit error on the first
            attempt; the wait doubles with every further attempt.
    """

    # Seconds to pause before retrying after an error that is not a rate limit.
    _GENERIC_RETRY_DELAY = 5
    # Quotes, markdown markers and punctuation tolerated around the model's answer.
    _ANSWER_PADDING = "'\"`*.,!:;"

    def __init__(self, api_key, model_name="mixtral-8x7b-32768", max_retries=5, base_wait_time=60):
        self.llm = ChatGroq(model_name=model_name, api_key=api_key)
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an AI assistant specialized in identifying person names. Your task is to determine if the given text is a person's name or not."),
            ("human", "Is the following text a person's name? Respond with 'Yes' if it's a person's name, or 'Unclaimed' if it's not.\n\nText: {text}\n\nClassification:")
        ])
        # The leading mapping wraps whatever the chain is invoked with under the
        # "text" key, so the chain must be invoked with the raw text itself.
        self.chain = (
            {"text": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        self.max_retries = max_retries
        self.base_wait_time = base_wait_time

    @staticmethod
    def _is_affirmative(reply):
        """Return True when the first word of the model reply is 'yes'."""
        words = reply.split()
        if not words:
            return False
        return words[0].strip(NameClassifier._ANSWER_PADDING).lower() == "yes"

    def classify_name(self, text):
        """Classify ``text`` and return it unchanged when it is a person's name.

        Args:
            text: The text to classify.

        Returns:
            ``text`` itself when the model answers yes, otherwise the string
            "Unclaimed". "Unclaimed" is also returned when every attempt
            fails; the method never returns None.
        """
        for attempt in range(self.max_retries):
            try:
                # Pass the bare text: the chain adds the {"text": ...} wrapper
                # itself, so passing a dict would put the dict's repr in the prompt.
                result = self.chain.invoke(text)
            except Exception as e:
                print(f"Error classifying name (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt >= self.max_retries - 1:
                    # No attempts left, so sleeping again would only waste time.
                    break
                if "rate_limit_exceeded" in str(e):
                    wait_time = self.base_wait_time * (2 ** attempt)
                    print(f"Rate limit exceeded. Waiting for {wait_time} seconds before retrying.")
                    time.sleep(wait_time)
                else:
                    time.sleep(self._GENERIC_RETRY_DELAY)
                continue
            # Return the original name if it's classified as a person's name.
            return text if self._is_affirmative(result) else "Unclaimed"

        print(f"All retry attempts failed. Returning 'Unclaimed' for '{text}'.")
        return "Unclaimed"

def normalize_spaces(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)

def process_name_list(name_list, api_key, model_name="mixtral-8x7b-32768"):
    """Classify each entry of ``name_list`` and gather the outcomes in a DataFrame.

    Every entry is classified as given and additionally stored in a
    whitespace-normalized form. A progress line is printed for every entry,
    and after every 100th entry the rows collected so far are written to a
    numbered intermediate CSV file.

    Args:
        name_list: Sequence of texts to classify.
        api_key: API key forwarded to ``NameClassifier``.
        model_name: Model identifier forwarded to ``NameClassifier``.

    Returns:
        A DataFrame with the columns "Input", "Classification" and
        "Normalized Name".
    """
    checkpoint_every = 100
    name_classifier = NameClassifier(api_key, model_name)
    rows = []

    for position, name in enumerate(name_list, start=1):
        print(f"Processing name {position}/{len(name_list)}: {name}")
        # Dict values are evaluated in order: classification runs on the raw
        # input first, then the input is normalized for the extra column.
        rows.append({
            "Input": name,
            "Classification": name_classifier.classify_name(name),
            "Normalized Name": normalize_spaces(name),
        })

        if position % checkpoint_every != 0:
            continue

        # Checkpoint: persist everything classified so far.
        pd.DataFrame(rows).to_csv(
            f'intermediate_name_classification_results_{position}.csv', index=False
        )
        print(f"Saved intermediate results at iteration {position}")

    return pd.DataFrame(rows)

# Example usage: classify a small sample of texts and save the outcome as CSV.
if __name__ == "__main__":
    api_key = "your_api_key_here"  # Replace with your actual API key
    output_path = 'name_classification_results.csv'

    # Some person names carry repeated spaces on purpose, to show the
    # "Normalized Name" column at work.
    sample_inputs = [
        "John Doe", "Apple Inc.",
        "Jane  Smith", "New York City",
        "William  Shakespeare", "Artificial Intelligence",
        "Emily   Johnson", "Python Programming",
        "Michael  Jackson", "United Nations",
    ]

    result_df = process_name_list(sample_inputs, api_key)
    print(result_df)
    result_df.to_csv(output_path, index=False)
    print(f"Results saved to '{output_path}'")
Leave a Comment