Text Comparison Program
unknown
plain_text
2 years ago
10 kB
4
Indexable
""" Description: Computes the similarity between two texts using two different metrics: (1) shared words, and (2) term frequency-inverse document frequency (TF-IDF). """ import string import math import re def load_file(filename): """ Args: filename: string, name of file to read Returns: string, contains file contents """ # print("Loading file %s" % filename) inFile = open(filename, 'r') line = inFile.read().strip() for char in string.punctuation: line = line.replace(char, "") inFile.close() return line.lower() ### Prep Data ### def prep_data(input_text): """ Args: input_text: string representation of text from file, assume the string is made of lowercase characters Returns: list representation of input_text, where each word is a different element in the list """ rep = [] word = "" # for each character in input text, add it to word until there is a space # when there is a space, add the full word and start over for i in input_text: if i == " ": rep.append(word) word = "" else: word += i # appends last word(since there wont be a space) rep.append(word) return rep ### Get Frequency ### def get_frequencies(word_list): dict = {} freq = 0 """ Args: word_list: list of strings, all are made of lowercase characters Returns: dictionary that maps string:int where each string is a word in l and the corresponding int is the frequency of the word in l """ # if string is a new string, create new key of i with a value of 1 # if string is already in dict, add 1 to the frequency of the current string for i in word_list: if i not in dict: dict[i] = 1 else: dict[i] += 1 return dict ### Get Words Sorted by Frequency def get_words_sorted_by_frequency(frequencies_dict): """ Args: frequencies_dict: dictionary that maps a word to its frequency Returns: list of words sorted by decreasing frequency with ties broken by alphabetical order """ # Sorts dictionary by alphabet first, then by value, then returns the now list of tuples back to a dictionary sortList = sorted(frequencies_dict.items(),key=lambda x:x[0],reverse=False) sortList = sorted(sortList,key=lambda x:x[1],reverse=True) sortDict = dict(sortList) returnList = [] # map the sorted dictionary keys onto a list for i in sortDict.keys(): returnList.append(i) return returnList ### Most Frequent Word(s) ### def get_most_frequent_words(dict1, dict2): """ The keys of dict1 and dict2 are all lowercase, you will NOT need to worry about case sensitivity. Args: dict1: frequency dictionary for one text dict2: frequency dictionary for another text Returns: list of the most frequent word(s) in the input dictionaries The most frequent word: * is based on the combined word frequencies across both dictionaries. If a word occurs in both dictionaries, consider the sum the frequencies as the combined word frequency. * need not be in both dictionaries, i.e it can be exclusively in dict1, dict2, or shared by dict1 and dict2. If multiple words are tied (i.e. share the same highest frequency), return an alphabetically ordered list of all these words. """ newDict = {} # adds every value and key of dict1 to newDict for i in dict1.keys(): newDict[i] = dict1[i] # interates through dict2, if dict2 key is already in newDict, add frequencies # otherwise, create a new key and value in newDict for i in dict2.keys(): if i in newDict: newDict[i] += dict2[i] else: newDict[i] = dict2[i] # sort newDict sortList = sorted(newDict.items(),key=lambda x:x[1],reverse=True) finalDict = dict(sortList) newList = [] val = 0 # get highest number from sorted finalDict, and add all keys with value equal to it to newList for i in finalDict.keys(): if (finalDict[i] >= val): val = finalDict[i] newList.append(i) return newList ### Similarity ### def calculate_similarity_score(dict1, dict2): """ The keys of dict1 and dict2 are all lowercase, you will NOT need to worry about case sensitivity. Args: dict1: frequency dictionary of words of text1 dict2: frequency dictionary of words of text2 Returns: float, a number between 0 and 1, inclusive representing how similar the words/texts are to each other The difference in words/text frequencies = DIFF sums "frequencies" over all unique elements from dict1 and dict2 combined based on which of these three scenarios applies: * If an element occurs in dict1 and dict2 then get the difference in frequencies * If an element occurs only in dict1 then take the frequency from dict1 * If an element occurs only in dict2 then take the frequency from dict2 The total frequencies = ALL is calculated by summing all frequencies in both dict1 and dict2. Return 1-(DIFF/ALL) rounded to 2 decimal places """ U = {} # adds every value and key of dict1 to U for i in dict1.keys(): U[i] = dict1[i] # interates through dict2, if dict2 key is already in U, add frequencies # otherwise, create a new key and value in U for i in dict2.keys(): if i in U: U[i] += dict2[i] else: U[i] = dict2[i] deltaTotal = 0 sigmaTotal = 0 sim = 0.0 # for each value of U, check if it is in dict1 and or dict2 # if in dict1, add to tempDelt, if in dict2, subtract to tempDelt # add the absolute value of the tempDelt to the totalDelt # Add the value of the key i in U for i in U: tempDelt = 0 if i in dict1: tempDelt += dict1[i] if i in dict2: tempDelt -= dict2[i] deltaTotal += abs(tempDelt) sigmaTotal += U[i] # calculate similarity sim = (1-(deltaTotal/sigmaTotal)) return round(sim,2) ### Finding TF-IDF ### def get_tf(text_file): """ Args: text_file: name of file in the form of a string Returns: a dictionary mapping each word to its TF * TF is calculated as TF(i) = (number times word *i* appears in the document) / (total number of words in the document) * Think about how we can use get_frequencies from earlier """ # prepare file myFile = load_file(text_file) myList = prep_data(myFile) # myList is a list representation of text_file, # where each word is a different element in the list myDict = {} totalLength = len(myList) # gives each word in myList a key and val of 1 if never seen before # if seen before, add 1 to the value for i in myList: if i in myDict: myDict[i] += 1 else: myDict[i] = 1 # mutates each value in myDict to its TF instead of frequency for i in myDict.keys(): myDict[i] = (myDict[i]/totalLength) return myDict def get_idf(text_files): """ Args: text_files: list of names of files, where each file name is a string Returns: a dictionary mapping each word to its IDF * IDF is calculated as IDF(i) = log_10(total number of documents / number of documents with word *i* in it), where log_10 is log base 10 and can be called with math.log10() """ totalFiles = len(text_files) myDict = {} for i in text_files: # loads file word list to myList, for each word file myFile = load_file(i) myList = prep_data(myFile) tempList = [] # for each word in the word file, add it to myDict as appearing in this word file # if this word is new, set the value at the key to 1, if not new, add one # if the word has already been counted for this file, then pass this word for j in myList: if j in tempList: pass else: if j in myDict: myDict[j] += 1 tempList.append(j) elif j not in myDict: myDict[j] = 1 tempList.append(j) # mutates key i value to IDF from cross-document frequency of word for i in myDict.keys(): myDict[i] = math.log10(totalFiles/myDict[i]) return myDict def get_tfidf(text_file, text_files): """ Args: text_file: name of file in the form of a string (used to calculate TF) text_files: list of names of files, where each file name is a string (used to calculate IDF) Returns: a sorted list of tuples (in increasing TF-IDF score), where each tuple is of the form (word, TF-IDF). In case of words with the same TF-IDF, the words should be sorted in increasing alphabetical order. * TF-IDF(i) = TF(i) * IDF(i) """ tlist = [] tf = get_tf(text_file) idf = get_idf(text_files) # a dictionary mapping each word to its IDF for i in tf.keys(): for j in idf.keys(): if i == j: tlist.append((i,tf[i]*idf[j])) if i not in idf: tlist.append((i,0)) # sort list of tuples by key first, then by value tlist = sorted(tlist,key=lambda x:x[0],reverse=False) tlist = sorted(tlist,key=lambda x:x[1],reverse=False) return tlist
Editor is loading...
Leave a Comment