import datetime
import re

import pandas as pd
from tqdm import tqdm

from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
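
# Assumed dependencies for this script (the langchain.* import paths above
# match the pre-1.0 langchain package layout):
#   pip install langchain langchain-groq pandas tqdm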

class ConversationAnalyzer:
    """Scores text snippets for conversational naturalness via an LLM."""

    def __init__(self):
        # "?" is a placeholder; supply a real Groq API key here.
        self.model = ChatGroq(model_name="mixtral-8x7b-32768", api_key="?")
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert at analyzing conversations. Your task is to determine whether a given snippet could be part of a natural conversation. Follow these guidelines:
1. Single sentences: Always give a high score (0.8-1.0) to snippets containing only one sentence, regardless of language.
2. Repeated sentences in the same language: Give a high score (0.8-1.0) to snippets where the same sentence is repeated twice in the same language. This simulates a conversation where one person repeats what was said.
3. Same sentence in two different languages: Give a high score (0.8-1.0) to snippets where the same sentence appears in two different languages. This simulates a conversation with an interpreter or translator.
4. For all other cases: Analyze the naturalness of the conversation based on context, coherence, and flow.
Conversations may be multilingual, with sentences in different languages."""),
            ("human", """Text: {text}
Analyze the text and respond with:
1. A score between 0 (not at all part of a natural conversation) and 1 (very likely to be part of a natural conversation).
2. A brief explanation of your reasoning, referencing the guidelines if applicable.
3. Identification of the language(s) used in the snippet."""),
        ])
        # LCEL pipeline: prompt -> model -> plain-string output.
        self.chain = self.prompt | self.model | StrOutputParser()
    def analyze_text(self, text):
        """Return a (score, explanation) tuple for a single snippet."""
        result = self.chain.invoke({"text": text})
        try:
            # Prefer a number that follows the word "score", falling back to
            # the first value in [0, 1]. Searching for any bare number would
            # instead match the "1." numeral of the model's numbered response.
            score_match = (re.search(r"(?i)score\D*([01](?:\.\d+)?)", result)
                           or re.search(r"\b(0(?:\.\d+)?|1(?:\.0+)?)\b", result))
            if score_match is None:
                raise ValueError("No numeric score found in the result")
            score = float(score_match.group(1))
            # Treat everything after the first line as the explanation.
            explanation = result.split("\n", 1)[1].strip() if "\n" in result else result.strip()
        except ValueError as ve:
            print(f"Error parsing result: {result}\nError: {ve}")
            score = 0.0
            explanation = "Failed to parse model output: no numeric score found."
        except Exception as e:
            print(f"Unexpected error parsing result: {result}\nError: {e}")
            score = 0.0
            explanation = "Failed to parse model output: unexpected error."
        return score, explanation
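
# A minimal usage sketch (assumes a real API key was supplied above; the
# bilingual snippet is a made-up example of guideline 3):
#   analyzer = ConversationAnalyzer()
#   score, explanation = analyzer.analyze_text(
#       "Where is the station? Où est la gare ?")
#   print(score, explanation)  # expected: a score near 0.8-1.0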

class DataLoader:
    """Loads and caches a pickled DataFrame so the file is only read once."""

    def __init__(self):
        self.dataframe = None

    def load_data(self, file_name):
        if self.dataframe is None:
            self.dataframe = pd.read_pickle(file_name)
            self.dataframe = self.dataframe.reset_index(drop=True)
            print(f"Loaded {file_name}. Shape: {self.dataframe.shape}")
        return self.dataframe
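
# Assumed input schema: the pickle is expected to hold a DataFrame with a
# 'sentence' column (read by find_coherent_texts) plus 'path', 'start_cd',
# 'end_cd' and 'times' columns (merged back into the results in main()).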

def find_coherent_texts(loader, analyzer, file_name, log_file):
    """Score every row's 'sentence' column, log all positive scores, and
    return them as a DataFrame of index, text, score and explanation."""
    coherent_texts = []
    df = loader.load_data(file_name)
    total_texts = len(df)
    with open(log_file, 'w') as f:
        for idx in tqdm(range(total_texts), desc=f"Processing {file_name}"):
            text = df.loc[idx, 'sentence']
            score, explanation = analyzer.analyze_text(text)
            if score > 0:
                coherent_texts.append({
                    'index': idx,
                    'text': text,
                    'score': score,
                    'explanation': explanation,
                })
                log_entry = (f"Coherent text found with score {score}:\n"
                             f"Text: {text}\n"
                             f"Explanation: {explanation}\n"
                             f"{'-' * 50}\n")
                f.write(log_entry)
            # Echo only high-confidence hits (the 0.8 guideline threshold) to stdout.
            if score >= 0.8:
                print(f"Coherent text found with score {score}:")
                print(f"Text: {text}")
                print(f"Explanation: {explanation}")
                print("-" * 50)
    return pd.DataFrame(coherent_texts)

def main(log_file, output_file, output_file2):
    # Set before the try block so the finally clause can never hit a NameError.
    start = datetime.datetime.now()
    print(f"Start time: {start}")
    try:
        loader = DataLoader()
        analyzer = ConversationAnalyzer()
        file_name = 'df_for_dori2.pkl'
        results = find_coherent_texts(loader, analyzer, file_name, log_file)
        # Save an intermediate copy before enrichment.
        results.to_csv(output_file, index=False)
        print("Saved results")
        # Reload the original dataframe to pull in its metadata columns.
        original_df = loader.load_data(file_name)
        results['path'] = results['index'].map(original_df['path'])
        results['start_cd'] = results['index'].map(original_df['start_cd'])
        results['end_cd'] = results['index'].map(original_df['end_cd'])
        results['times'] = results['index'].map(original_df['times'])
        # Reorder columns and save the enriched results.
        results = results[['index', 'path', 'text', 'start_cd', 'end_cd', 'times', 'score', 'explanation']]
        results.to_csv(f"{output_file2}.csv", index=False)
        results.to_pickle(f"{output_file2}.pkl")
        print(f"Found {len(results)} coherent texts. Results saved to '{output_file2}'")
    except Exception as e:
        print(f"Failed to run! Error: {e}")
    finally:
        end = datetime.datetime.now()
        print(f"End time: {end}")
        print(f"Elapsed time: {end - start}")

if __name__ == "__main__":
    # Output locations for the Claude run.
    log_file = "logsClaude.txt"
    output_file = 'resultsClaude.csv'
    output_file2 = "coherent_texts_results_data_Claude"
    main(log_file, output_file, output_file2)