# Paste-site page header (not part of the program):
# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 1.7 kB
# 2
# Indexable
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_file(file_path):
    """Return the lines of the UTF-8 text file at ``file_path`` as a list.

    Each element keeps its line ending, exactly as the file object yields it.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # Iterating a text file yields the same lines ``readlines()`` returns.
        lines = list(handle)
    return lines

def _lines_match(vectorizer, line1, line2, threshold):
    """Return True when the TF-IDF cosine similarity of two lines reaches ``threshold``."""
    if line1 == line2:
        # Identical text is trivially similar. This also avoids vectorizing
        # pairs of blank lines, which give the vectorizer no tokens to work with.
        return True
    try:
        vectors = vectorizer.fit_transform([line1, line2])
    except ValueError:
        # scikit-learn reports an empty vocabulary (neither line produced a
        # token, e.g. punctuation-only lines) as ValueError; fall back to
        # comparing the visible text instead of crashing.
        return line1.strip() == line2.strip()
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    # Small tolerance so floating-point rounding (e.g. 0.9999999999999998)
    # does not push equivalent lines below a threshold of exactly 1.
    return similarity >= threshold - 1e-9


def calculate_similarity_percentage(file_path1, file_path2, output_file_path, threshold=1.0):
    """Compare two text files line by line and report how similar they are.

    Lines are paired by position and each pair is scored with the cosine
    similarity of its TF-IDF vectors. A pair scoring below ``threshold``
    counts as different. Lines that have no counterpart because one file is
    longer than the other also count as different.

    Args:
        file_path1: Path of the first UTF-8 text file.
        file_path2: Path of the second UTF-8 text file.
        output_file_path: Path of the report file; it receives one
            ``File 1: ... | File 2: ...`` entry per differing line and is
            overwritten if it already exists.
        threshold: Minimum cosine similarity for a pair of lines to count as
            the same. The default of 1.0 keeps the original strictness.

    Returns:
        The percentage (0-100) of line positions, measured against the longer
        file, whose lines are considered the same. Two empty files yield 100.0.

    Raises:
        OSError: If an input file cannot be opened or the report cannot be
            written.
    """
    content1 = read_file(file_path1)
    content2 = read_file(file_path2)

    total_lines = max(len(content1), len(content2))
    paired_lines = min(len(content1), len(content2))

    # One vectorizer is enough: fit_transform() refits it for every pair.
    vectorizer = TfidfVectorizer()
    different_lines_list = []

    for line1, line2 in zip(content1, content2):
        if not _lines_match(vectorizer, line1, line2, threshold):
            different_lines_list.append(f"File 1: {line1.strip()} | File 2: {line2.strip()}")

    # zip() stops at the shorter file; the leftover lines of the longer file
    # have nothing to match against, so each one is a difference.
    for line1 in content1[paired_lines:]:
        different_lines_list.append(f"File 1: {line1.strip()} | File 2: <missing>")
    for line2 in content2[paired_lines:]:
        different_lines_list.append(f"File 1: <missing> | File 2: {line2.strip()}")

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(different_lines_list))

    if total_lines == 0:
        # Two empty files are identical; avoid dividing by zero.
        return 100.0

    different_lines = len(different_lines_list)
    return ((total_lines - different_lines) / total_lines) * 100

# Hard-coded input and output locations used when this file is run as a script.
file_path1 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\demo-title.txt"
file_path2 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ten_tep_tin_nguoc_lai.txt"
output_file_path = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ydiff_lines.txt"

if __name__ == "__main__":
    # Run the comparison only when executed directly, so importing this module
    # for its functions does not read or write files as a side effect.
    percentage = calculate_similarity_percentage(file_path1, file_path2, output_file_path)
    print(f"Similarity Percentage: {percentage}%")
    print(f"Different lines saved to {output_file_path}")