# Untitled

import re
import time
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
from tqdm import tqdm
import datetime

class ConversationAnalyzer:
    """Ask a ChatGroq chat model to rate how coherent a piece of text is.

    The analyzer builds a prompt -> model -> string-parser chain once and
    reuses it for every call to :meth:`analyze_text`.
    """

    # The first integer or decimal token in the model's reply is taken as the score.
    _SCORE_PATTERN = re.compile(r"\b\d+(\.\d+)?\b")

    def __init__(self, api_key, model_name="mixtral-8x7b-32768", max_retries=5, base_wait_time=60):
        """Create the model chain and store the retry settings.

        Args:
            api_key: API key passed to ``ChatGroq``.
            model_name: Name of the model passed to ``ChatGroq``.
            max_retries: Maximum number of attempts per text.
            base_wait_time: Seconds to wait after the first rate-limit error;
                doubled on every subsequent attempt.
        """
        self.model = ChatGroq(model_name=model_name, api_key=api_key)
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert in analyzing conversations. Your task is to determine the coherence of a given text. Focus on the overall coherence and natural flow of the text."),
            ("human", "Text: {text}\n\nAnalyze the coherence of this text and respond with a score between 0 (not at all coherent) and 1 (completely coherent), and explain your reasoning.")
        ])
        self.chain = self.prompt | self.model | StrOutputParser()
        self.max_retries = max_retries
        self.base_wait_time = base_wait_time

    def analyze_text(self, text):
        """Return ``(score, explanation)`` for ``text``.

        The chain is invoked up to ``max_retries`` times. A reply without any
        numeric token counts as a failed attempt and is retried like any other
        error. Rate-limit errors (detected by the substring
        ``"rate_limit_exceeded"`` in the exception message) back off
        exponentially; other errors wait a fixed 5 seconds.

        Args:
            text: The text to send to the model.

        Returns:
            A ``(float, str)`` tuple. The score is the first number found in
            the reply; the explanation is everything after the reply's first
            line (or the whole reply when it is a single line). When every
            attempt fails, ``(0.0, "Failed to analyze text after multiple
            attempts.")`` is returned -- this method never returns ``None``.
        """
        final_attempt = self.max_retries - 1
        for attempt in range(self.max_retries):
            try:
                result = self.chain.invoke({"text": text})
                score_match = self._SCORE_PATTERN.search(result)
                if score_match is None:
                    raise ValueError("No numeric score found in the result")
                score = float(score_match.group(0))
                _, newline, remainder = result.partition("\n")
                explanation = (remainder if newline else result).strip()
                return score, explanation
            except Exception as e:
                # Check for the last attempt first: previously a rate-limit
                # error on the final attempt slept and then fell off the end
                # of the loop, implicitly returning None to the caller.
                if attempt >= final_attempt:
                    print(f"All retry attempts failed. Error: {e}")
                    break
                if "rate_limit_exceeded" in str(e):
                    wait_time = self.base_wait_time * (2 ** attempt)
                    print(f"Rate limit exceeded. Waiting for {wait_time} seconds before retrying.")
                    time.sleep(wait_time)
                else:
                    print(f"Error analyzing text (attempt {attempt + 1}/{self.max_retries}): {e}")
                    time.sleep(5)

        # Also reached when max_retries <= 0 and the loop body never runs.
        return 0.0, "Failed to analyze text after multiple attempts."

def find_coherent_texts(loader, analyzer, file_name, log_file):
    """Score every sentence in a data file and collect those scoring above 0.

    Args:
        loader: Object whose ``load_data(file_name)`` returns the data to
            process (assumed to be a pandas DataFrame with a ``'sentence'``
            column -- confirm against the loader implementation).
        analyzer: Object whose ``analyze_text(text)`` returns a
            ``(score, explanation)`` tuple.
        file_name: Passed to ``loader.load_data`` and shown in the progress bar.
        log_file: Path of a text file; one entry is appended per text whose
            score is above 0.

    Returns:
        A DataFrame with the columns ``index``, ``text``, ``score`` and
        ``explanation``, one row per text whose score is above 0. The columns
        are present even when no row qualifies, so callers can always select
        them.

    Side effects:
        Appends to ``log_file``, prints texts scoring at least 0.8, and writes
        ``intermediate_results_<n>.csv`` to the working directory after every
        100 processed texts.
    """
    result_columns = ['index', 'text', 'score', 'explanation']
    coherent_texts = []
    df = loader.load_data(file_name)
    sentences = df['sentence']

    # Explicit UTF-8 so non-ASCII texts do not fail on platforms whose default
    # locale encoding cannot represent them.
    with open(log_file, 'a', encoding='utf-8') as f:
        progress = tqdm(sentences.items(), total=len(sentences), desc=f"Processing {file_name}")
        # Iterate (label, text) pairs rather than range(len(df)) + .loc so a
        # non-default index does not raise KeyError; for a default 0..n-1
        # index the labels are the same integers as before.
        for processed, (idx, text) in enumerate(progress, start=1):
            score, explanation = analyzer.analyze_text(text)
            if score > 0:
                coherent_texts.append({
                    'index': idx,
                    'text': text,
                    'score': score,
                    'explanation': explanation
                })
                log_entry = (f"Coherent text found with score {score}:\n"
                             f"Text: {text}\n"
                             f"Explanation: {explanation}\n"
                             f"{'-' * 50}\n")
                f.write(log_entry)
            if score >= 0.8:
                print(f"Coherent text found with score {score}:")
                print(f"Text: {text}")
                print(f"Explanation: {explanation}")
                print("-" * 50)

            # Save intermediate results every 100 iterations
            if processed % 100 == 0:
                intermediate_df = pd.DataFrame(coherent_texts, columns=result_columns)
                intermediate_df.to_csv(f'intermediate_results_{processed}.csv', index=False)
                print(f"Saved intermediate results at iteration {processed}")

    # Explicit columns keep the schema stable when nothing scored above 0.
    return pd.DataFrame(coherent_texts, columns=result_columns)

def main():
    """Run the coherence analysis end to end and write the result CSV files.

    Writes the raw results to ``results.csv`` and the results enriched with
    the ``path``, ``start_cd``, ``end_cd`` and ``times`` columns of the source
    data to ``coherent_texts_results.csv``. Any exception is reported on
    stdout rather than propagated; start time, end time and elapsed time are
    always printed.

    NOTE(review): ``DataLoader`` is not defined or imported in the visible
    part of this file -- confirm where it comes from.
    """
    # Bound before the try block so the timing report in `finally` can never
    # hit an unbound `start`.
    start = datetime.datetime.now()
    print(start)
    try:
        loader = DataLoader()
        analyzer = ConversationAnalyzer(api_key="your_api_key_here")  # Replace with your actual API key
        file_name = 'df_for_dori2.pkl'
        log_file = "logs.txt"
        results = find_coherent_texts(loader, analyzer, file_name, log_file)

        output_file = 'results.csv'
        results.to_csv(output_file, index=False)
        print("Saved results")

        extra_columns = ['path', 'start_cd', 'end_cd', 'times']
        final_columns = ['index', 'path', 'text', 'start_cd', 'end_cd', 'times', 'score', 'explanation']

        if results.empty:
            # No text qualified (for example every API call failed). An empty
            # frame may lack the 'index' column, so skip the enrichment and
            # emit a header-only file instead of raising KeyError.
            results = pd.DataFrame(columns=final_columns)
        else:
            # Load original dataframe to get additional information
            original_df = loader.load_data(file_name)

            # Look up each extra column by the row label stored in 'index'.
            for column in extra_columns:
                results[column] = results['index'].map(original_df[column])

            # Reorder columns
            results = results[final_columns]

        # Save results
        output_file = 'coherent_texts_results.csv'
        results.to_csv(output_file, index=False)
        print(f"Found {len(results)} coherent texts. Results saved to '{output_file}'")
    except Exception as e:
        print(f"Failed to run! Error: {e}")
    finally:
        end = datetime.datetime.now()
        print(end)
        print(f"Time that took: {end - start}")

if __name__ == "__main__":
    main()