Untitled
unknown
plain_text
a year ago
5.0 kB
8
Indexable
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Command-line entry point: indexes a corpus file and prints one TF-IDF score per query.
 *
 * <p>Usage: {@code java TFIDFCalculator <corpus-file> <test-case-file>}. The test-case file
 * holds the query words on its first line and the matching document indices on its second line.
 */
class TFIDFCalculator {

    /**
     * Indexes the corpus named by {@code args[0]} and prints the TF-IDF value of every
     * (word, document) pair listed in the test-case file named by {@code args[1]}.
     *
     * @param args corpus file path followed by test-case file path
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: java TFIDFCalculator <corpus-file> <test-case-file>");
            System.exit(1);
        }
        String filename = args[0];
        String testCase = args[1];

        TF_IDF.Store_Words_To_Trie(filename);
        List<Double> tf = TF_IDF.TF(testCase);
        List<Double> idf = TF_IDF.IDF(testCase);

        // Both lists are built from the same query file; only pair up entries that exist in both
        // so a failure in one of the passes can never index past the end of the other.
        int pairs = Math.min(tf.size(), idf.size());
        for (int i = 0; i < pairs; i++) {
            System.out.println(tf.get(i) * idf.get(i));
        }
    }
}

/**
 * Holds the corpus index (all state is static) and computes term frequency and inverse
 * document frequency from it.
 *
 * <p>The corpus file is read as consecutive documents of {@code LINES_PER_DOCUMENT} lines each.
 */
class TF_IDF {
    /** Number of corpus lines that make up one document. */
    private static final int LINES_PER_DOCUMENT = 5;

    /** Matches every character that is not a lower-case ASCII letter. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-z]");

    /** Matches a run of whitespace; used as the field separator. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Per-document word counts: {@code trie[d]} maps a word to its occurrences in document d. */
    public static Trie[] trie = new Trie[60000];

    /** Maps a word to the number of documents that contain it at least once. */
    public static Trie word_for_text_count = new Trie();

    /** {@code TotalNumber[d]} is the total number of words in document d. */
    public static int[] TotalNumber = new int[60000];

    /** Number of documents indexed so far. */
    public static int Text_Number = 0;

    /**
     * Reads the corpus file and adds its documents to the index.
     *
     * <p>Each line is lower-cased and every character outside {@code a-z} is treated as a
     * separator. A trailing document with fewer than {@code LINES_PER_DOCUMENT} lines is still
     * counted as a document. I/O and runtime failures are reported on stderr and indexing stops;
     * whatever was indexed before the failure is kept.
     *
     * @param fileName path of the corpus file
     */
    public static void Store_Words_To_Trie(String fileName) {
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            Set<String> seenInDocument = new HashSet<>();
            int linesInDocument = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                ensureDocumentSlot(Text_Number);
                // Locale.ROOT keeps the lower-casing independent of the machine's default locale.
                String cleaned =
                        NON_LETTERS.matcher(line.toLowerCase(Locale.ROOT)).replaceAll(" ");
                for (String token : WHITESPACE.split(cleaned)) {
                    // Blank lines and lines starting with a separator yield an empty token;
                    // it is not a word and must not inflate the document length.
                    if (token.isEmpty()) {
                        continue;
                    }
                    trie[Text_Number].insert(token);
                    TotalNumber[Text_Number]++;
                    // Count each word at most once per document for the document frequency.
                    if (seenInDocument.add(token)) {
                        word_for_text_count.insert(token);
                    }
                }
                linesInDocument++;
                if (linesInDocument == LINES_PER_DOCUMENT) {
                    linesInDocument = 0;
                    Text_Number++;
                    seenInDocument.clear();
                }
            }
            // A trailing partial document has already been indexed above, so count it too;
            // otherwise a word's document frequency could exceed the document total.
            if (linesInDocument > 0) {
                Text_Number++;
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the term frequency of every query in the test-case file.
     *
     * <p>Query words are lower-cased to match the corpus normalisation. A query that names a
     * document that was never indexed, or an empty document, yields {@code 0.0}. If the file
     * cannot be read or a document index is not an integer, the failure is reported on stderr
     * and the values computed so far are returned.
     *
     * @param testCase path of the test-case file (words on line 1, document indices on line 2)
     * @return occurrences of the word in the document divided by the document's word count,
     *     one entry per query, in file order
     */
    public static ArrayList<Double> TF(String testCase) {
        ArrayList<Double> output = new ArrayList<>();
        try {
            String[][] query = readQuery(testCase);
            String[] words = query[0];
            String[] documents = query[1];
            int queries = Math.min(words.length, documents.length);
            for (int i = 0; i < queries; i++) {
                int document = Integer.parseInt(documents[i]);
                output.add(termFrequency(words[i].toLowerCase(Locale.ROOT), document));
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return output;
    }

    /**
     * Computes the inverse document frequency of every query word in the test-case file.
     *
     * <p>The value is {@code ln(documents / documentsContainingWord)}. A word that occurs in no
     * document yields {@code 0.0} instead of infinity, so that multiplying by its term frequency
     * (also zero) gives zero rather than NaN. Only words that have a matching document index on
     * the second line are evaluated, which keeps the result aligned with {@link #TF(String)}.
     *
     * @param testCase path of the test-case file (words on line 1, document indices on line 2)
     * @return one IDF value per query, in file order
     */
    public static ArrayList<Double> IDF(String testCase) {
        ArrayList<Double> output = new ArrayList<>();
        try {
            String[][] query = readQuery(testCase);
            String[] words = query[0];
            int queries = Math.min(words.length, query[1].length);
            for (int i = 0; i < queries; i++) {
                int containing =
                        word_for_text_count.searchNum(words[i].toLowerCase(Locale.ROOT));
                if (containing == 0 || Text_Number == 0) {
                    output.add(0.0);
                } else {
                    output.add(Math.log((double) Text_Number / containing));
                }
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return output;
    }

    /** Makes sure the index arrays can hold {@code document} and that its trie exists. */
    private static void ensureDocumentSlot(int document) {
        if (document >= trie.length) {
            int capacity = Math.max(trie.length * 2, document + 1);
            trie = Arrays.copyOf(trie, capacity);
        }
        if (document >= TotalNumber.length) {
            int capacity = Math.max(TotalNumber.length * 2, document + 1);
            TotalNumber = Arrays.copyOf(TotalNumber, capacity);
        }
        if (trie[document] == null) {
            trie[document] = new Trie();
        }
    }

    /** Returns the share of {@code document}'s words equal to {@code word}, or 0 if unknown. */
    private static double termFrequency(String word, int document) {
        boolean indexed = document >= 0
                && document < trie.length
                && document < TotalNumber.length
                && trie[document] != null;
        if (!indexed || TotalNumber[document] == 0) {
            return 0.0;
        }
        return (double) trie[document].searchNum(word) / TotalNumber[document];
    }

    /** Reads the first two lines of the test-case file as {@code {words, documentIndices}}. */
    private static String[][] readQuery(String testCase) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(testCase))) {
            String[] words = splitFields(reader.readLine());
            String[] documents = splitFields(reader.readLine());
            return new String[][] {words, documents};
        }
    }

    /** Splits a line on whitespace; a missing or blank line gives an empty array. */
    private static String[] splitFields(String line) {
        if (line == null) {
            return new String[0];
        }
        String trimmed = line.trim();
        return trimmed.isEmpty() ? new String[0] : WHITESPACE.split(trimmed);
    }
}

/** One node of a {@link Trie} over the lower-case ASCII letters {@code a}-{@code z}. */
class TrieNode {
    /** Child for each letter; index 0 is {@code 'a'}, index 25 is {@code 'z'}. */
    TrieNode[] children = new TrieNode[26];

    /** True when at least one inserted word ends at this node. */
    boolean isEndOfWord = false;

    /** Number of times a word ending at this node has been inserted. */
    int count = 0;
}

/** A counting trie: stores words made of {@code a}-{@code z} and how often each was inserted. */
class Trie {
    TrieNode root = new TrieNode();

    /**
     * Inserts a word into the trie and increments its occurrence count.
     *
     * @param word word consisting only of the letters {@code a}-{@code z}
     * @throws IllegalArgumentException if the word contains any other character; the trie is
     *     left unchanged in that case
     */
    public void insert(String word) {
        // Validate first so that a bad word cannot leave a half-built path behind.
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (c < 'a' || c > 'z') {
                throw new IllegalArgumentException(
                        "Trie words may only contain a-z, got: " + word);
            }
        }
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            int slot = word.charAt(i) - 'a';
            if (node.children[slot] == null) {
                node.children[slot] = new TrieNode();
            }
            node = node.children[slot];
        }
        node.isEndOfWord = true;
        node.count += 1;
    }

    /**
     * Checks whether the word has been inserted into the trie.
     *
     * @param word word to look up
     * @return true if the exact word was inserted; false otherwise, including when the word
     *     contains characters outside {@code a}-{@code z}
     */
    public boolean search(String word) {
        TrieNode node = find(word);
        return node != null && node.isEndOfWord;
    }

    /**
     * Returns how many times the word has been inserted.
     *
     * @param word word to look up
     * @return the insertion count, or 0 if the word is absent or contains characters outside
     *     {@code a}-{@code z}
     */
    public int searchNum(String word) {
        TrieNode node = find(word);
        return node == null ? 0 : node.count;
    }

    /** Walks the trie along {@code word}; returns the final node or null if the path is absent. */
    private TrieNode find(String word) {
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            // A character outside a-z can never have been inserted, so the word is absent.
            if (c < 'a' || c > 'z') {
                return null;
            }
            node = node.children[c - 'a'];
            if (node == null) {
                return null;
            }
        }
        return node;
    }
}
Editor is loading...
Leave a Comment