Untitled

mail@pastecode.io avatar
unknown
python
a month ago
4.9 kB
3
Indexable
Never
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
from tqdm import tqdm
import re
import datetime

class ConversationAnalyzer:
    """Scores the coherence of a text via an LLM (Groq-hosted Mixtral) chain.

    The chain returns free-form prose; `_parse_result` extracts a numeric
    score and an explanation from that prose.
    """

    # Matches the first standalone number in the model's reply, e.g. "0.85".
    # Compiled once at class definition instead of on every call.
    _SCORE_RE = re.compile(r"\b\d+(\.\d+)?\b")

    def __init__(self):
        # NOTE(review): the API key is hardcoded as an empty string — it should
        # be read from an environment variable / secret store before deployment.
        self.model = ChatGroq(model_name="mixtral-8x7b-32768", api_key="")
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "You are an expert in analyzing conversations. Your task is to determine the coherence of a given text. Focus on the overall coherence and natural flow of the text."),
            ("human", "Text: {text}\n\nAnalyze the coherence of this text and respond with a score between 0 (not at all coherent) and 1 (completely coherent), and explain your reasoning.")
        ])
        # prompt -> model -> plain-string output.
        self.chain = self.prompt | self.model | StrOutputParser()

    def _parse_result(self, result):
        """Extract ``(score, explanation)`` from the raw model reply.

        The score is the first numeric token in the reply, clamped to [0, 1]
        because the model may ignore the requested range. The explanation is
        everything after the first newline (or the whole reply if single-line).
        On any parse failure, returns ``(0.0, <failure message>)``.
        """
        try:
            score_match = self._SCORE_RE.search(result)
            if score_match:
                # Clamp: the prompt asks for [0, 1] but the model may not comply.
                score = min(max(float(score_match.group(0)), 0.0), 1.0)
            else:
                raise ValueError("No numeric score found in the result")
            explanation = result.split("\n", 1)[1].strip() if "\n" in result else result.strip()
        except ValueError as ve:
            print(f"Error parsing result: {result}\nError: {ve}")
            score = 0.0
            explanation = "Failed to parse model output: could not convert string to float."
        except Exception as e:
            print(f"Unexpected error parsing result: {result}\nError: {e}")
            score = 0.0
            explanation = "Failed to parse model output: unexpected error."
        return score, explanation

    def analyze_text(self, text):
        """Run the LLM chain on ``text`` and return ``(score, explanation)``."""
        result = self.chain.invoke({"text": text})
        return self._parse_result(result)

class DataLoader:
    """Loads a pickled DataFrame once and serves it from memory afterwards."""

    def __init__(self):
        # Populated lazily by the first load_data() call.
        self.dataframe = None

    def load_data(self, file_name):
        """Return the cached DataFrame, reading ``file_name`` on first use."""
        if self.dataframe is not None:
            return self.dataframe
        loaded = pd.read_pickle(file_name)
        # Positional index so rows can be addressed 0..n-1 regardless of the
        # index stored in the pickle.
        self.dataframe = loaded.reset_index(drop=True)
        print(f"Loaded {file_name}. Shape: {self.dataframe.shape}")
        return self.dataframe

def find_coherent_texts(loader, analyzer, file_name, log_file):
    """Score every 'sentence' in the dataset for coherence.

    Rows with score > 0 are collected and appended to ``log_file``
    (truncated at start); rows with score >= 0.8 are also echoed to stdout.
    Returns a DataFrame with columns index/text/score/explanation.
    """
    df = loader.load_data(file_name)
    found = []
    with open(log_file, 'w') as log:
        for row_idx in tqdm(range(len(df)), desc=f"Processing {file_name}"):
            sentence = df.loc[row_idx, 'sentence']
            score, explanation = analyzer.analyze_text(sentence)
            if score > 0:
                found.append({
                    'index': row_idx,
                    'text': sentence,
                    'score': score,
                    'explanation': explanation
                })
                log.write(f"Coherent text found with score {score}:\n"
                          f"Text: {sentence}\n"
                          f"Explanation: {explanation}\n"
                          f"{'-' * 50}\n")
            if score >= 0.8:
                # High-confidence hits are surfaced on the console as well.
                print(f"Coherent text found with score {score}:")
                print(f"Text: {sentence}")
                print(f"Explanation: {explanation}")
                print("-" * 50)
    return pd.DataFrame(found)


def main():
    """End-to-end driver: score texts, enrich results, persist CSV/pickle output."""
    # Assigned BEFORE the try block so the `finally` clause can always
    # compute `end - start` even if the first statement inside `try` raises.
    start = datetime.datetime.now()
    print(start)
    try:
        loader = DataLoader()
        analyzer = ConversationAnalyzer()
        file_name = 'df_for_dori3.pkl'
        log_file = "logs3.txt"
        results = find_coherent_texts(loader, analyzer, file_name, log_file)

        # Intermediate save so partial results survive any failure below.
        output_file = 'results3.csv'
        results.to_csv(output_file, index=False)
        print("Saved results")

        if results.empty:
            # An empty result DataFrame has no 'index' column, so the
            # enrichment below would raise KeyError — bail out early.
            print("Found 0 coherent texts. Nothing to enrich.")
            return

        # Load original dataframe (served from the loader's cache) to pull
        # extra columns, keyed by positional index (reset on load).
        original_df = loader.load_data(file_name)
        for col in ('path', 'start_cd', 'end_cd', 'times'):
            results[col] = results['index'].map(original_df[col])

        # Reorder columns for readability.
        results = results[['index', 'path', 'text', 'start_cd', 'end_cd', 'times', 'score', 'explanation']]

        # Save enriched results.
        output_file = 'coherent_texts_results_data_3.csv'
        results.to_csv(output_file, index=False)
        results.to_pickle("coherent_texts_results_data_3.pkl")
        print(f"Found {len(results)} coherent texts. Results saved to '{output_file}'")
    except Exception as e:
        # Top-level boundary: report and fall through to the timing output.
        print(f"Failed to run! Error: {e}")
    finally:
        end = datetime.datetime.now()
        print(end)
        print(f"Time that took: {end - start}")
Leave a Comment