# Paste-site page header captured together with the code (not part of the program):
#   Untitled / unknown / python / 2 years ago / 1.7 kB / 12 / Indexable
from itertools import zip_longest

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def read_file(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        One string per line, each still carrying its trailing newline
        (the last line has none if the file does not end with one).
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
    return content
# Cosine similarity of two identical TF-IDF vectors can land a hair below 1.0
# after floating-point normalisation, so threshold comparisons allow this slack.
_SIMILARITY_TOLERANCE = 1e-9


def _line_similarity(line1, line2):
    """Return the TF-IDF cosine similarity between two single lines of text."""
    try:
        vectors = TfidfVectorizer().fit_transform([line1, line2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when neither
        # line contains a token the default tokenizer keeps, e.g. blank or
        # punctuation-only lines. Fall back to comparing the stripped text.
        return 1.0 if line1.strip() == line2.strip() else 0.0
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])


def calculate_similarity_percentage(file_path1, file_path2, output_file_path,
                                    threshold=1.0):
    """Compare two text files line by line and report how similar they are.

    Lines are paired by position. A pair counts as different when its TF-IDF
    cosine similarity is below ``threshold``. A line that has no counterpart
    because the other file is shorter always counts as different.

    Args:
        file_path1: Path of the first UTF-8 text file.
        file_path2: Path of the second UTF-8 text file.
        output_file_path: Path the differing line pairs are written to, one
            ``File 1: ... | File 2: ...`` entry per line. The file is
            overwritten on every call.
        threshold: Minimum cosine similarity for a pair to count as the same.
            Defaults to 1.0, i.e. both lines must have the same TF-IDF vector.

    Returns:
        Percentage (0-100) of line positions considered the same, measured
        against the longer file. Two empty files are reported as 100.0.
    """
    content1 = read_file(file_path1)
    content2 = read_file(file_path2)
    total_lines = max(len(content1), len(content2))

    different_lines_list = []
    # zip_longest (not zip) so surplus lines of the longer file are visited;
    # None marks "no line at this position", distinct from an empty line.
    for line1, line2 in zip_longest(content1, content2):
        if line1 is None or line2 is None:
            is_different = True
        else:
            similarity = _line_similarity(line1, line2)
            is_different = similarity < threshold - _SIMILARITY_TOLERANCE

        if is_different:
            text1 = "<missing>" if line1 is None else line1.strip()
            text2 = "<missing>" if line2 is None else line2.strip()
            different_lines_list.append(f"File 1: {text1} | File 2: {text2}")

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(different_lines_list))

    if total_lines == 0:
        # Nothing to compare: avoid dividing by zero and call the files equal.
        return 100.0

    different_lines = len(different_lines_list)
    similarity_percentage = ((total_lines - different_lines) / total_lines) * 100
    return similarity_percentage
# Script entry point: compare two hard-coded sample files and report the result.
# Guarded so that importing this module does not trigger any file I/O.
if __name__ == "__main__":
    file_path1 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\demo-title.txt"
    file_path2 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ten_tep_tin_nguoc_lai.txt"
    output_file_path = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ydiff_lines.txt"
    percentage = calculate_similarity_percentage(file_path1, file_path2, output_file_path)
    print(f"Similarity Percentage: {percentage}%")
    print(f"Different lines saved to {output_file_path}")