Untitled
unknown
python
a year ago
1.7 kB
2
Indexable
import math
from itertools import zip_longest

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Shown in the diff report when one file has no line at a given position.
_MISSING_LINE = "<missing>"


def read_file(file_path):
    """Return the lines of the UTF-8 text file at *file_path*.

    Each returned line keeps its trailing newline, as produced by
    ``file.readlines()``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()


def _lines_match(line1, line2, threshold):
    """Return True when the TF-IDF cosine similarity of two lines reaches *threshold*.

    The comparison is a bag-of-words one: the vectorizer is fitted on just
    these two lines, so lines with the same tokens in a different order
    still score 1.
    """
    # Identical text always matches; this also covers pairs of blank lines,
    # which the vectorizer cannot handle (see below).
    if line1 == line2:
        return True

    try:
        vectors = TfidfVectorizer().fit_transform([line1, line2])
    except ValueError:
        # Raised for an empty vocabulary, i.e. neither line contains a token
        # the vectorizer recognises (whitespace or punctuation only).
        # Fall back to comparing the visible text.
        return line1.strip() == line2.strip()

    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    # Normalised vectors can score 0.9999999999999998 for equal token sets,
    # so allow for floating-point rounding instead of an exact comparison.
    return similarity >= threshold or math.isclose(
        similarity, threshold, abs_tol=1e-9
    )


def calculate_similarity_percentage(file_path1, file_path2, output_file_path,
                                    threshold=1.0):
    """Compare two text files line by line and report how similar they are.

    Lines are paired by position. A pair counts as different when its TF-IDF
    cosine similarity is below *threshold*, or when only one of the files has
    a line at that position. Every differing pair is written to
    *output_file_path* as ``File 1: ... | File 2: ...``.

    Args:
        file_path1: Path of the first UTF-8 text file.
        file_path2: Path of the second UTF-8 text file.
        output_file_path: Path the differing lines are written to
            (overwritten if it exists).
        threshold: Minimum cosine similarity, between 0 and 1, for two lines
            to count as the same. The default of 1.0 requires the same
            weighted bag of words.

    Returns:
        The percentage (0-100) of line positions that match, relative to the
        length of the longer file. Two empty files are reported as 100.0.

    Raises:
        OSError: If an input file cannot be read or the output file cannot be
            written.
    """
    content1 = read_file(file_path1)
    content2 = read_file(file_path2)
    total_lines = max(len(content1), len(content2))

    different_lines_list = []
    # zip_longest (not zip) so the surplus lines of the longer file are
    # reported as differences instead of being silently ignored.
    for line1, line2 in zip_longest(content1, content2):
        both_present = line1 is not None and line2 is not None
        if both_present and _lines_match(line1, line2, threshold):
            continue
        shown1 = _MISSING_LINE if line1 is None else line1.strip()
        shown2 = _MISSING_LINE if line2 is None else line2.strip()
        different_lines_list.append(f"File 1: {shown1} | File 2: {shown2}")

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(different_lines_list))

    # Two empty files have nothing that differs; avoid dividing by zero.
    if total_lines == 0:
        return 100.0

    different_lines = len(different_lines_list)
    return ((total_lines - different_lines) / total_lines) * 100


file_path1 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\demo-title.txt"
file_path2 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ten_tep_tin_nguoc_lai.txt"
output_file_path = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ydiff_lines.txt"

# Run the comparison only when executed as a script, so importing this module
# does not touch the hard-coded paths above.
if __name__ == "__main__":
    percentage = calculate_similarity_percentage(file_path1, file_path2, output_file_path)
    print(f"Similarity Percentage: {percentage}%")
    print(f"Different lines saved to {output_file_path}")