Text Comparison Program

mail@pastecode.io avatar
unknown
plain_text
7 months ago
10 kB
0
Indexable
Never
"""
Description:
    Computes the similarity between two texts using two different metrics:
    (1) shared words, and (2) term frequency-inverse document
    frequency (TF-IDF).
"""

import string
import math
import re

def load_file(filename):
    """
    Args:
        filename: string, name of file to read
    Returns:
        string, contains file contents
    """
    # print("Loading file %s" % filename)
    inFile = open(filename, 'r')
    line = inFile.read().strip()
    for char in string.punctuation:
        line = line.replace(char, "")
    inFile.close()
    return line.lower()


### Prep Data ###
def prep_data(input_text):
    """
    Args:
        input_text: string representation of text from file,
                    assume the string is made of lowercase characters
    Returns:
        list representation of input_text, where each word is a different element in the list
    """

    rep = []
    word = ""

    # for each character in input text, add it to word until there is a space
    # when there is a space, add the full word and start over
    for i in input_text:
        if i == " ":
            rep.append(word)
            word = ""
        else:
            word += i
    # appends last word(since there wont be a space)
    rep.append(word)
    return rep



### Get Frequency ###
def get_frequencies(word_list):
    dict = {}
    freq = 0


    """
    Args:
        word_list: list of strings, all are made of lowercase characters
    Returns:
        dictionary that maps string:int where each string
        is a word in l and the corresponding int
        is the frequency of the word in l
    """

    # if string is a new string, create new key of i with a value of 1
    # if string is already in dict, add 1 to the frequency of the current string

    for i in word_list:
        if i not in dict:
            dict[i] = 1
        else:
            dict[i] += 1
    return dict






### Get Words Sorted by Frequency
def get_words_sorted_by_frequency(frequencies_dict):

    """
    Args:
        frequencies_dict: dictionary that maps a word to its frequency
    Returns:
        list of words sorted by decreasing frequency with ties broken
        by alphabetical order
    """

    # Sorts dictionary by alphabet first, then by value, then returns the now list of tuples back to a dictionary
    sortList = sorted(frequencies_dict.items(),key=lambda x:x[0],reverse=False)
    sortList = sorted(sortList,key=lambda x:x[1],reverse=True)
    sortDict = dict(sortList)

    returnList = []

    # map the sorted dictionary keys onto a list
    for i in sortDict.keys():
        returnList.append(i)

    return returnList



### Most Frequent Word(s) ###
def get_most_frequent_words(dict1, dict2):
    """
    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        dict1: frequency dictionary for one text
        dict2: frequency dictionary for another text
    Returns:
        list of the most frequent word(s) in the input dictionaries

    The most frequent word:
        * is based on the combined word frequencies across both dictionaries.
          If a word occurs in both dictionaries, consider the sum the
          frequencies as the combined word frequency.
        * need not be in both dictionaries, i.e it can be exclusively in
          dict1, dict2, or shared by dict1 and dict2.
    If multiple words are tied (i.e. share the same highest frequency),
    return an alphabetically ordered list of all these words.
    """

    newDict = {}

    # adds every value and key of dict1 to newDict
    for i in dict1.keys():
        newDict[i] = dict1[i]

    # interates through dict2, if dict2 key is already in newDict, add frequencies
    # otherwise, create a new key and value in newDict
    for i in dict2.keys():
        if i in newDict:
            newDict[i] += dict2[i]
        else:
            newDict[i] = dict2[i]

    # sort newDict
    sortList = sorted(newDict.items(),key=lambda x:x[1],reverse=True)
    finalDict = dict(sortList)


    newList = []
    val = 0

    # get highest number from sorted finalDict, and add all keys with value equal to it to newList
    for i in finalDict.keys():

        if (finalDict[i] >= val):
            val = finalDict[i]
            newList.append(i)

    return newList



### Similarity ###
def calculate_similarity_score(dict1, dict2):
    """
    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        dict1: frequency dictionary of words of text1
        dict2: frequency dictionary of words of text2
    Returns:
        float, a number between 0 and 1, inclusive
        representing how similar the words/texts are to each other

        The difference in words/text frequencies = DIFF sums "frequencies"
        over all unique elements from dict1 and dict2 combined
        based on which of these three scenarios applies:
        * If an element occurs in dict1 and dict2 then
          get the difference in frequencies
        * If an element occurs only in dict1 then take the
          frequency from dict1
        * If an element occurs only in dict2 then take the
          frequency from dict2
         The total frequencies = ALL is calculated by summing
         all frequencies in both dict1 and dict2.
        Return 1-(DIFF/ALL) rounded to 2 decimal places
    """
    U = {}

    # adds every value and key of dict1 to U
    for i in dict1.keys():
        U[i] = dict1[i]

    # interates through dict2, if dict2 key is already in U, add frequencies
    # otherwise, create a new key and value in U
    for i in dict2.keys():
        if i in U:
            U[i] += dict2[i]
        else:
            U[i] = dict2[i]

    deltaTotal = 0
    sigmaTotal = 0
    sim = 0.0

    # for each value of U, check if it is in dict1 and or dict2
    # if in dict1, add to tempDelt, if in dict2, subtract to tempDelt
    # add the absolute value of the tempDelt to the totalDelt
    # Add the value of the key i in U

    for i in U:
        tempDelt = 0
        if i in dict1:
            tempDelt += dict1[i]
        if i in dict2:
            tempDelt -= dict2[i]

        deltaTotal += abs(tempDelt)
        sigmaTotal += U[i]

    # calculate similarity
    sim = (1-(deltaTotal/sigmaTotal))
    return round(sim,2)




### Finding TF-IDF ###
def get_tf(text_file):

    """
    Args:
        text_file: name of file in the form of a string
    Returns:
        a dictionary mapping each word to its TF

    * TF is calculated as TF(i) = (number times word *i* appears
        in the document) / (total number of words in the document)
    * Think about how we can use get_frequencies from earlier
    """

    # prepare file
    myFile = load_file(text_file)
    myList = prep_data(myFile)

    # myList is a list representation of text_file,
    # where each word is a different element in the list

    myDict = {}
    totalLength = len(myList)

    # gives each word in myList a key and val of 1 if never seen before
    # if seen before, add 1 to the value
    for i in myList:
        if i in myDict:
            myDict[i] += 1
        else:
            myDict[i] = 1

    # mutates each value in myDict to its TF instead of frequency
    for i in myDict.keys():
        myDict[i] = (myDict[i]/totalLength)

    return myDict



def get_idf(text_files):
    """
    Args:
        text_files: list of names of files, where each file name is a string
    Returns:
       a dictionary mapping each word to its IDF

    * IDF is calculated as IDF(i) = log_10(total number of documents / number of
    documents with word *i* in it), where log_10 is log base 10 and can be called
    with math.log10()

    """

    totalFiles = len(text_files)

    myDict = {}

    for i in text_files:
        # loads file word list to myList, for each word file
        myFile = load_file(i)
        myList = prep_data(myFile)
        tempList = []

    # for each word in the word file, add it to myDict as appearing in this word file
    # if this word is new, set the value at the key to 1, if not new, add one
    # if the word has already been counted for this file, then pass this word
        for j in myList:
            if j in tempList:
                pass
            else:
                if j in myDict:
                    myDict[j] += 1
                    tempList.append(j)
                elif j not in myDict:
                    myDict[j] = 1
                    tempList.append(j)

    # mutates key i value to IDF from cross-document frequency of word
    for i in myDict.keys():
        myDict[i] = math.log10(totalFiles/myDict[i])

    return myDict






def get_tfidf(text_file, text_files):
    """
    Args:
        text_file: name of file in the form of a string (used to calculate TF)
        text_files: list of names of files, where each file name is a string
        (used to calculate IDF)
    Returns:
       a sorted list of tuples (in increasing TF-IDF score), where each tuple is
       of the form (word, TF-IDF). In case of words with the same TF-IDF, the
       words should be sorted in increasing alphabetical order.

    * TF-IDF(i) = TF(i) * IDF(i)
    """
    tlist = []

    tf = get_tf(text_file)
    idf = get_idf(text_files)

    # a dictionary mapping each word to its IDF

    for i in tf.keys():
        for j in idf.keys():
            if i == j:
                tlist.append((i,tf[i]*idf[j]))
    if i not in idf:
        tlist.append((i,0))

    # sort list of tuples by key first, then by value
    tlist = sorted(tlist,key=lambda x:x[0],reverse=False)
    tlist = sorted(tlist,key=lambda x:x[1],reverse=False)

    return tlist
Leave a Comment