Untitled

 avatar
unknown
python
10 months ago
6.0 kB
6
Indexable
import datetime
import os
import re

import pandas as pd
from tqdm import tqdm

from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain_groq import ChatGroq

class ConversationAnalyzer:
    """Scores how likely a text snippet is to be part of a natural
    conversation, using a Groq-hosted Mixtral model via LangChain.

    The chain is: prompt -> ChatGroq -> string output parser.
    """

    def __init__(self):
        # API key is read from the environment instead of being hard-coded.
        # The old "?" placeholder is kept as the fallback so behavior is
        # unchanged when GROQ_API_KEY is unset.
        self.model = ChatGroq(
            model_name="mixtral-8x7b-32768",
            api_key=os.environ.get("GROQ_API_KEY", "?"),
        )
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert at analyzing conversations. Your task is to determine whether a given snippet could be part of a natural conversation. Follow these guidelines:

1. Single sentences: Always give a high score (0.8-1.0) to snippets containing only one sentence, regardless of language.

2. Repeated sentences in the same language: Give a high score (0.8-1.0) to snippets where the same sentence is repeated twice in the same language. This simulates a conversation where one person repeats what was said.

3. Same sentence in two different languages: Give a high score (0.8-1.0) to snippets where the same sentence appears in two different languages. This simulates a conversation with an interpreter or translator.

4. For all other cases: Analyze the naturalness of the conversation based on context, coherence, and flow.

Conversations may be multi-lingual, with sentences in different languages."""),
            ("human", """Text: {text}

Analyze the text and respond with:
1. A score between 0 (not at all part of a natural conversation) and 1 (very likely to be part of a natural conversation).
2. A brief explanation of your reasoning, referencing the guidelines if applicable.
3. Identification of the language(s) used in the snippet.""")
        ])
        self.chain = self.prompt | self.model | StrOutputParser()

    def analyze_text(self, text):
        """Run the chain on *text* and return ``(score, explanation)``.

        On any parsing failure the score is 0.0 and the explanation
        describes the failure, so callers never see an exception.
        """
        result = self.chain.invoke({"text": text})
        try:
            # The prompt asks the model to answer as a numbered list, so the
            # first number in the reply is frequently the list marker "1.",
            # not the score.  Collect every number, keep only those in the
            # valid [0, 1] score range, and prefer a fractional value
            # (e.g. "0.85") over a bare integer that is likely a marker.
            numbers = re.findall(r"\d+(?:\.\d+)?", result)
            in_range = [n for n in numbers if 0.0 <= float(n) <= 1.0]
            dotted = [n for n in in_range if "." in n]
            if dotted:
                score = float(dotted[0])
            elif in_range:
                score = float(in_range[0])
            else:
                raise ValueError("No numeric score found in the result")
            explanation = result.split("\n", 1)[1].strip() if "\n" in result else result.strip()
        except ValueError as ve:
            print(f"Error parsing result: {result}\nError: {ve}")
            score = 0.0
            explanation = "Failed to parse model output: could not convert string to float."
        except Exception as e:
            print(f"Unexpected error parsing result: {result}\nError: {e}")
            score = 0.0
            explanation = "Failed to parse model output: unexpected error."
        return score, explanation

class DataLoader:
    """Lazily loads a pickled DataFrame and caches it for reuse."""

    def __init__(self):
        # Cached frame; stays None until the first load_data() call.
        self.dataframe = None

    def load_data(self, file_name):
        """Return the cached DataFrame, reading *file_name* on first use.

        The index is reset to a clean 0..n-1 range on load.
        NOTE(review): the cache ignores *file_name* on later calls — the
        first file loaded is returned for every subsequent request.
        """
        if self.dataframe is not None:
            return self.dataframe
        self.dataframe = pd.read_pickle(file_name).reset_index(drop=True)
        print(f"Loaded {file_name}. Shape: {self.dataframe.shape}")
        return self.dataframe

def find_coherent_texts(loader, analyzer, file_name, log_file):
    """Score every 'sentence' row of the pickled DataFrame with *analyzer*.

    Rows with score > 0 are collected and appended to *log_file*; rows with
    score >= 0.8 are additionally echoed to stdout.  Returns a DataFrame
    with columns index/text/score/explanation (empty if nothing matched).
    """
    df = loader.load_data(file_name)
    hits = []
    with open(log_file, 'w') as log:
        for row_idx in tqdm(range(len(df)), desc=f"Processing {file_name}"):
            sentence = df.loc[row_idx, 'sentence']
            score, explanation = analyzer.analyze_text(sentence)
            if score > 0:
                hits.append({
                    'index': row_idx,
                    'text': sentence,
                    'score': score,
                    'explanation': explanation,
                })
                log.write(
                    f"Coherent text found with score {score}:\n"
                    f"Text: {sentence}\n"
                    f"Explanation: {explanation}\n"
                    f"{'-' * 50}\n"
                )
            if score >= 0.8:
                print(f"Coherent text found with score {score}:")
                print(f"Text: {sentence}")
                print(f"Explanation: {explanation}")
                print("-" * 50)
    return pd.DataFrame(hits)


def main(log_file, output_file, output_file2, file_name='df_for_dori2.pkl'):
    """Run the conversation-coherence pipeline end to end.

    Args:
        log_file: path of the text log written while scanning.
        output_file: CSV path for the raw hit list (before enrichment).
        output_file2: basename (no extension) for the enriched CSV/pickle.
        file_name: pickled source DataFrame; the default preserves the
            previously hard-coded value, so existing callers are unaffected.

    Any exception is caught, reported, and swallowed; elapsed time is
    always printed.
    """
    # Taken before the try block so the finally clause can always compute
    # the elapsed time — previously `start` was assigned inside the try,
    # which risked an UnboundLocalError in finally on an early failure.
    start = datetime.datetime.now()
    print(start)
    try:
        loader = DataLoader()
        analyzer = ConversationAnalyzer()
        results = find_coherent_texts(loader, analyzer, file_name, log_file)

        results.to_csv(output_file, index=False)
        print("Saved results")

        # Re-use the loader's cache to fetch per-row metadata for the hits.
        original_df = loader.load_data(file_name)

        # results['index'] holds row positions in the original frame, so
        # Series.map aligns each hit with its source row's metadata.
        results['path'] = results['index'].map(original_df['path'])
        results['start_cd'] = results['index'].map(original_df['start_cd'])
        results['end_cd'] = results['index'].map(original_df['end_cd'])
        results['times'] = results['index'].map(original_df['times'])

        # Reorder columns for the enriched output.
        results = results[['index', 'path', 'text', 'start_cd', 'end_cd', 'times', 'score', 'explanation']]

        # Save enriched results in both CSV and pickle form.
        results.to_csv(f"{output_file2}.csv", index=False)
        results.to_pickle(f"{output_file2}.pkl")
        print(f"Found {len(results)} coherent texts. Results saved to '{output_file2}'")
    except Exception as e:
        # Best-effort top-level boundary: report and fall through to timing.
        print(f"Failed to run! Error: {e}")
    finally:
        end = datetime.datetime.now()
        print(end)
        print(f"Time that took: {end - start}")


if __name__ == "__main__":
    # Output destinations for the "claude" run.
    run_log = "logsClaude.txt"
    run_csv = "resultsClaude.csv"
    run_basename = "coherent_texts_results_data_Claude"
    main(run_log, run_csv, run_basename)
Editor is loading...
Leave a Comment