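# Paste metadata (appears to be page-header residue from pastecode.io; not part of the program):
# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a month ago
# 896 B
# 1
# Indexable
# Never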
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_file(file_path):
    """Return the full text of the file at *file_path*, decoded as UTF-8.

    The file is opened in text mode and closed again before returning.
    Any error raised by ``open`` or by decoding propagates to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def similarity_ratio(file_path1, file_path2):
    """Return the TF-IDF cosine similarity of two text files as a percentage.

    Both files are read with ``read_file`` (first ``file_path1``, then
    ``file_path2``), vectorized together by a ``TfidfVectorizer`` created
    with its default settings, and the cosine similarity between the two
    resulting row vectors is multiplied by 100.
    """
    documents = [read_file(path) for path in (file_path1, file_path2)]
    # Fit on both documents at once so they share a single vocabulary.
    tfidf = TfidfVectorizer().fit_transform(documents)
    # cosine_similarity returns a 2-D array; the only entry is the score.
    score = cosine_similarity(tfidf[0], tfidf[1])[0][0]
    return score * 100

# Hard-coded locations of the two text files to compare
# (Windows-style paths with escaped backslashes).
file_path1 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\demo-title.txt"
file_path2 = "C:\\Users\\Admin\\Desktop\\AdPython\\crawl\\dấu\\ten_tep_tin_nguoc_lai.txt"

# Runs at module level: compute the similarity percentage for the two files
# and print it to standard output.
ratio = similarity_ratio(file_path1, file_path2)
print(f"Cosine Similarity Ratio: {ratio}%")