# Untitled

#@title test3 מעולה זהו !!
# @title ChatGroq
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
from tqdm import tqdm
from google.colab import userdata
import re

class ConversationAnalyzer:
    """Ask a Groq-hosted chat model whether two texts form a coherent exchange.

    The constructor wires a prompt, the chat model and a string output parser
    into ``self.chain``. ``analyze_pair`` runs that chain and pulls a coherence
    score in the range 0..1 out of the model's free-text reply.
    """

    # A number the reply explicitly labels, e.g. "Score: 0.8" or
    # "a score of 0.8". The lookbehind rejects the numerals of "Text 1" /
    # "Text 2", which the prompt itself puts into many replies.
    _LABELLED_SCORE_RE = re.compile(
        r"score\b[^\d\n]{0,20}?(?<!text\s)(\d+(?:\.\d+)?)", re.IGNORECASE
    )
    # Fallback: any standalone number, again skipping "Text 1" / "Text 2".
    _NUMBER_RE = re.compile(
        r"(?<![\w.])(?<!text\s)\d+(?:\.\d+)?(?!\w)", re.IGNORECASE
    )

    def __init__(self):
        self.model = ChatGroq(model_name="llama3-70b-8192", api_key=userdata.get('LlamaKey'))
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert in analyzing conversations. Your task is to determine if two given texts could be consecutive turns in a natural, coherent dialogue. Consider context, relevance, and natural flow of conversation."),
            ("human", "Text 1: {text1}\nText 2: {text2}\n\nCould these two texts be consecutive turns in a natural, coherent dialogue? Respond with a score between 0 (not at all coherent) and 1 (completely coherent), and explain your reasoning."),
        ])
        self.chain = self.prompt | self.model | StrOutputParser()

    @classmethod
    def _extract_score(cls, result):
        """Return the first plausible score (0 <= value <= 1) in *result*.

        Labelled numbers ("score ... 0.8") are tried before bare numbers so a
        stray figure earlier in the reply does not win. This is a heuristic
        over free text, not a strict parser. Returns ``None`` when no number
        in the requested range is present.
        """
        candidates = cls._LABELLED_SCORE_RE.findall(result) + cls._NUMBER_RE.findall(result)
        for token in candidates:
            value = float(token)
            if 0.0 <= value <= 1.0:
                return value
        return None

    def analyze_pair(self, text1, text2):
        """Score how well *text2* follows *text1* as the next dialogue turn.

        Returns:
            A ``(score, explanation)`` tuple. ``score`` is a float in 0..1;
            ``explanation`` is the reply text after its first line, or the
            whole reply when nothing follows the first line. If the reply
            cannot be parsed the result is ``(0.0, <failure message>)``.

        Exceptions raised by the chain call itself are not caught here.
        """
        result = self.chain.invoke({"text1": text1, "text2": text2})
        try:
            score = self._extract_score(result)
            if score is None:
                raise ValueError("No score between 0 and 1 found in the result")
            _, _, tail = result.partition("\n")
            # Single-line replies (or an empty remainder) keep the full text.
            explanation = tail.strip() or result.strip()
        except ValueError as ve:
            print(f"Error parsing result: {result}\nError: {ve}")
            score = 0.0
            explanation = "Failed to parse model output: no score between 0 and 1 found."
        except Exception as e:
            # Best-effort: a malformed reply must not abort a long batch run.
            print(f"Unexpected error parsing result: {result}\nError: {e}")
            score = 0.0
            explanation = "Failed to parse model output: unexpected error."
        return score, explanation

class DataLoader:
    """Read a pickled DataFrame and cache it between calls."""

    def __init__(self):
        # Most recently loaded frame (index reset to 0..n-1); None before any load.
        self.dataframe = None
        # Path the cached frame was read from; None until the first load.
        self._loaded_file = None

    def load_data(self, file_name):
        """Return the DataFrame stored in *file_name*, loading it if needed.

        The frame is read on the first call and re-read whenever a different
        *file_name* is requested, so the cache never hands back another
        file's data. The index is replaced with a fresh 0..n-1 range.
        """
        stale = self._loaded_file is not None and self._loaded_file != file_name
        if self.dataframe is None or stale:
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            frame = pd.read_pickle(file_name)
            self.dataframe = frame.reset_index(drop=True)
            self._loaded_file = file_name
            print(f"Loaded {file_name}. Shape: {self.dataframe.shape}")
        return self.dataframe

def find_coherent_conversations(loader, analyzer, file_name, min_score=0.0):
    """Score every ordered pair of sentences and keep the coherent ones.

    Each row's 'sentence' is paired with every later row's 'sentence' and
    passed to ``analyzer.analyze_pair``; this is one analyzer call per pair,
    i.e. n*(n-1)/2 calls for n rows.

    Args:
        loader: Object whose ``load_data(file_name)`` returns a DataFrame
            with a 'sentence' column.
        analyzer: Object whose ``analyze_pair(text1, text2)`` returns
            ``(score, explanation)``.
        file_name: Path handed to the loader; also shown in the progress bar.
        min_score: Pairs are kept only when their score is strictly greater
            than this value. The default keeps any positive score.

    Returns:
        DataFrame with columns index1, index2, text1, text2, score and
        explanation. ``index1``/``index2`` are row positions in the loaded
        frame. The columns are present even when no pair qualifies.
    """
    result_columns = ['index1', 'index2', 'text1', 'text2', 'score', 'explanation']
    df = loader.load_data(file_name)
    # Pull the column out once instead of doing a label lookup per pair.
    sentences = df['sentence'].tolist()
    total_pairs = len(sentences)
    coherent_pairs = []

    for idx1 in tqdm(range(total_pairs), desc=f"Processing {file_name}"):
        text1 = sentences[idx1]
        for idx2 in range(idx1 + 1, total_pairs):
            text2 = sentences[idx2]
            score, explanation = analyzer.analyze_pair(text1, text2)
            if score > min_score:
                coherent_pairs.append({
                    'index1': idx1,
                    'index2': idx2,
                    'text1': text1,
                    'text2': text2,
                    'score': score,
                    'explanation': explanation
                })
                print(f"Coherent pair found with score {score}:")
                print(f"Text 1: {text1}")
                print(f"Text 2: {text2}")
                print(f"Explanation: {explanation}")
                print("-" * 50)
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(coherent_pairs, columns=result_columns)

# Driver: score all sentence pairs in the pickle, attach per-row metadata
# for both sides of each pair, and write the result to CSV.
loader = DataLoader()
analyzer = ConversationAnalyzer()
file_name = '/content/df_for_dori.pkl'

results = find_coherent_conversations(loader, analyzer, file_name)

# Load original dataframe to get additional information
original_df = loader.load_data(file_name)

# A search that finds nothing can yield a frame with no columns at all;
# reindexing guarantees the pair columns exist so the lookups below and the
# final column selection work on an empty result too.
pair_columns = ['index1', 'index2', 'text1', 'text2', 'score', 'explanation']
results = results.reindex(columns=pair_columns)

# Add additional information to results: for each side of the pair, look up
# the source row's metadata by its row index.
for side in ('1', '2'):
    row_ids = results[f'index{side}']
    for field in ('path', 'start_cd', 'end_cd', 'times'):
        results[f'{field}{side}'] = row_ids.map(original_df[field])

# Reorder columns
results = results[['index1', 'path1', 'text1', 'start_cd1', 'end_cd1', 'times1',
                   'index2', 'path2', 'text2', 'start_cd2', 'end_cd2', 'times2',
                   'score', 'explanation']]

# Save results
output_file = 'coherent_conversations_results.csv'
results.to_csv(output_file, index=False)
print(f"Found {len(results)} coherent conversation pairs. Results saved to '{output_file}'")
# Editor is loading...