from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
from tqdm import tqdm
import re
import datetime


class ConversationAnalyzer:
    def __init__(self):
        # NOTE: "?" is a redacted placeholder; supply a real Groq API key
        # (e.g. read it from the GROQ_API_KEY environment variable).
        self.model = ChatGroq(model_name="mixtral-8x7b-32768", api_key="?")
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert at analyzing conversations. Your task is to determine whether a given snippet could be part of a natural conversation. Follow these guidelines:

1. Single sentences: Always give a high score (0.8-1.0) to snippets containing only one sentence, regardless of language.
2. Repeated sentences in the same language: Give a high score (0.8-1.0) to snippets where the same sentence is repeated twice in the same language. This simulates a conversation where one person repeats what was said.
3. Same sentence in two different languages: Give a high score (0.8-1.0) to snippets where the same sentence appears in two different languages. This simulates a conversation with an interpreter or translator.
4. For all other cases: Analyze the naturalness of the conversation based on context, coherence, and flow. Conversations may be multi-lingual, with sentences in different languages."""),
            ("human", """Text: {text}

Analyze the text and respond with:
1. A score between 0 (not at all part of a natural conversation) and 1 (very likely to be part of a natural conversation).
2. A brief explanation of your reasoning, referencing the guidelines if applicable.
3. Identification of the language(s) used in the snippet.""")
        ])
        self.chain = self.prompt | self.model | StrOutputParser()

    def analyze_text(self, text):
        result = self.chain.invoke({"text": text})
        try:
            # Take the first numeric value in the response as the score.
            score_match = re.search(r"\b\d+(\.\d+)?\b", result)
            if score_match:
                score = float(score_match.group(0))
            else:
                raise ValueError("No numeric score found in the result")
            # Everything after the first line is treated as the explanation.
            explanation = result.split("\n", 1)[1].strip() if "\n" in result else result.strip()
        except ValueError as ve:
            print(f"Error parsing result: {result}\nError: {ve}")
            score = 0.0
            explanation = "Failed to parse model output: could not convert string to float."
        except Exception as e:
            print(f"Unexpected error parsing result: {result}\nError: {e}")
            score = 0.0
            explanation = "Failed to parse model output: unexpected error."
        return score, explanation


class DataLoader:
    def __init__(self):
        self.dataframe = None

    def load_data(self, file_name):
        # Cache the dataframe so repeated calls don't re-read the pickle.
        if self.dataframe is None:
            self.dataframe = pd.read_pickle(file_name)
            self.dataframe = self.dataframe.reset_index(drop=True)
            print(f"Loaded {file_name}. Shape: {self.dataframe.shape}")
        return self.dataframe


def find_coherent_texts(loader, analyzer, file_name, log_file):
    coherent_texts = []
    df = loader.load_data(file_name)
    total_texts = len(df)
    with open(log_file, 'w') as f:
        for idx in tqdm(range(total_texts), desc=f"Processing {file_name}"):
            text = df.loc[idx, 'sentence']
            score, explanation = analyzer.analyze_text(text)
            if score > 0:
                coherent_texts.append({
                    'index': idx,
                    'text': text,
                    'score': score,
                    'explanation': explanation
                })
                log_entry = (f"Coherent text found with score {score}:\n"
                             f"Text: {text}\n"
                             f"Explanation: {explanation}\n"
                             f"{'-' * 50}\n")
                f.write(log_entry)
                if score >= 0.8:
                    print(f"Coherent text found with score {score}:")
                    print(f"Text: {text}")
                    print(f"Explanation: {explanation}")
                    print("-" * 50)
    return pd.DataFrame(coherent_texts)


def main(log_file, output_file, output_file2):
    # Defined before the try block so the `finally` clause can always use it.
    start = datetime.datetime.now()
    print(start)
    try:
        loader = DataLoader()
        analyzer = ConversationAnalyzer()
        file_name = 'df_for_dori2.pkl'
        results = find_coherent_texts(loader, analyzer, file_name, log_file)
        results.to_csv(output_file, index=False)
        print("Saved results")

        # Load the original dataframe to pull in additional columns;
        # the DataLoader cache means the pickle is only read once.
        original_df = loader.load_data(file_name)
        results['path'] = results['index'].map(original_df['path'])
        results['start_cd'] = results['index'].map(original_df['start_cd'])
        results['end_cd'] = results['index'].map(original_df['end_cd'])
        results['times'] = results['index'].map(original_df['times'])

        # Reorder columns and save in both CSV and pickle form.
        results = results[['index', 'path', 'text', 'start_cd', 'end_cd', 'times', 'score', 'explanation']]
        results.to_csv(f"{output_file2}.csv", index=False)
        results.to_pickle(f"{output_file2}.pkl")
        print(f"Found {len(results)} coherent texts. Results saved to '{output_file2}'")
    except Exception as e:
        print(f"Failed to run! Error: {e}")
    finally:
        end = datetime.datetime.now()
        print(end)
        print(f"Elapsed time: {end - start}")


if __name__ == "__main__":
    log_file = "logsClaude.txt"
    output_file = 'resultsClaude.csv'
    output_file2 = "coherent_texts_results_data_Claude"
    main(log_file, output_file, output_file2)
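
A quick way to sanity-check the score parsing in analyze_text without calling the Groq API, using a canned response string (the sample text below is made up for illustration, not real model output):

# Minimal sketch of how analyze_text's regex behaves on a typical response.
# Note the pattern grabs the *first* number in the reply, so a response that
# opens with a list marker like "1." instead of the score would be mis-parsed
# as 1.0; the parser assumes the model leads with the bare score.
import re

sample = "0.9\nThe snippet is a single greeting (guideline 1).\nLanguage: English."
match = re.search(r"\b\d+(\.\d+)?\b", sample)
print(float(match.group(0)))             # 0.9
print(sample.split("\n", 1)[1].strip())  # explanation and language lines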