/*
 * Untitled
 * unknown
 * plain_text
 * a year ago
 * 5.0 kB
 * 9
 * Indexable
 */
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Command-line entry point that prints one TF-IDF score per query.
 *
 * <p>Usage: {@code java TFIDFCalculator <corpusFile> <testCaseFile>}.
 */
class TFIDFCalculator{
    /**
     * Indexes the corpus file, computes the TF and IDF value of every query in the test-case
     * file and prints their product, one score per line, in query order.
     *
     * @param args {@code args[0]} is the corpus file path, {@code args[1]} the test-case file path
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: java TFIDFCalculator <corpusFile> <testCaseFile>");
            System.exit(1);
        }
        String filename = args[0];
        String testCase = args[1];

        // All TF_IDF members are static, so no instance is needed.
        TF_IDF.Store_Words_To_Trie(filename);
        List<Double> tf = TF_IDF.TF(testCase);
        List<Double> idf = TF_IDF.IDF(testCase);

        // Either list may be cut short if its computation failed part-way; only print scores
        // for which both factors exist rather than indexing past the shorter list.
        if (tf.size() != idf.size()) {
            System.err.println("Warning: got " + tf.size() + " TF values but " + idf.size()
                    + " IDF values; printing only complete scores");
        }
        int scores = Math.min(tf.size(), idf.size());
        for (int i = 0; i < scores; i++) {
            System.out.println(tf.get(i) * idf.get(i));
        }
    }
}
/**
 * Corpus index used to answer TF-IDF queries.
 *
 * <p>The corpus file is read as consecutive documents of {@code LINES_PER_DOCUMENT} lines each,
 * numbered from 0. Every document gets its own {@link Trie} of word counts, and a shared trie
 * records in how many documents each word occurs. All state is static, so indexing another file
 * adds to the existing index instead of replacing it.
 */
class TF_IDF{
    /** Per-document word counts; slot {@code d} is created when document {@code d} is first read. */
    public static Trie[] trie = new Trie[60000];
    /** For each word, the number of documents that contain it at least once. */
    public static Trie word_for_text_count = new Trie();
    /** Total number of words in each document. */
    public static int[] TotalNumber = new int[60000];
    /** Number of documents indexed so far; also the index of the document being filled. */
    public static int Text_Number = 0;

    /** Number of corpus lines that make up one document. */
    private static final int LINES_PER_DOCUMENT = 5;

    /**
     * Reads the corpus file and adds its documents to the index.
     *
     * <p>Each line is lower-cased and every character outside {@code a-z} is treated as a word
     * separator. A trailing document with fewer than {@code LINES_PER_DOCUMENT} lines is counted
     * as a document when it contains at least one word. I/O and runtime failures are reported
     * on standard error; whatever was indexed before the failure is kept.
     *
     * @param fileName path of the corpus file
     */
    public static void Store_Words_To_Trie(String fileName){
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            Set<String> seenInDocument = new HashSet<>();
            int linesInDocument = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                int capacity = Math.min(trie.length, TotalNumber.length);
                if (Text_Number >= capacity) {
                    throw new IllegalStateException(
                            "Corpus holds more than " + capacity + " documents: " + fileName);
                }
                // The array starts out full of nulls, so the slot must be created before use.
                if (trie[Text_Number] == null) {
                    trie[Text_Number] = new Trie();
                }

                String lettersOnly = line.toLowerCase().replaceAll("[^a-z]", " ");
                for (String token : lettersOnly.split("\\s+")) {
                    // split() yields "" for blank lines and leading separators; not a word.
                    if (token.isEmpty()) {
                        continue;
                    }
                    trie[Text_Number].insert(token);
                    TotalNumber[Text_Number]++;
                    // add() is true only the first time the word shows up in this document.
                    if (seenInDocument.add(token)) {
                        word_for_text_count.insert(token);
                    }
                }

                linesInDocument++;
                if (linesInDocument == LINES_PER_DOCUMENT) {
                    linesInDocument = 0;
                    Text_Number++;
                    seenInDocument.clear();
                }
            }
            // The words of a short final document are already in the index; count the document
            // too so the document total used by IDF stays consistent with them.
            if (linesInDocument > 0 && TotalNumber[Text_Number] > 0) {
                Text_Number++;
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the term frequency of every query in the test-case file.
     *
     * <p>The term frequency is the number of occurrences of the word in the queried document
     * divided by that document's total word count. It is {@code 0.0} when the document was never
     * indexed or holds no words. On failure the error is reported on standard error and the
     * values computed so far are returned.
     *
     * @param testCase path of the test-case file (line 1: words, line 2: document numbers)
     * @return one term frequency per query, in file order
     */
    public static ArrayList<Double> TF(String testCase){
        ArrayList<Double> output = new ArrayList<>();
        try {
            for (String[] query : readQueries(testCase)) {
                String word = query[0];
                int document = Integer.parseInt(query[1]);
                boolean indexed = document >= 0
                        && document < trie.length
                        && document < TotalNumber.length
                        && trie[document] != null
                        && TotalNumber[document] > 0;
                if (indexed) {
                    output.add((double) trie[document].searchNum(word) / TotalNumber[document]);
                } else {
                    output.add(0.0);
                }
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return output;
    }

    /**
     * Computes the inverse document frequency of every query in the test-case file.
     *
     * <p>The value is {@code ln(documents / documentsContainingWord)}. It is {@code 0.0} when no
     * indexed document contains the word, so that the TF-IDF product is 0 rather than NaN.
     * On failure the error is reported on standard error and the values computed so far are
     * returned.
     *
     * @param testCase path of the test-case file (line 1: words, line 2: document numbers)
     * @return one inverse document frequency per query, in file order
     */
    public static ArrayList<Double> IDF(String testCase){
        ArrayList<Double> output = new ArrayList<>();
        try {
            for (String[] query : readQueries(testCase)) {
                int documentsWithWord = word_for_text_count.searchNum(query[0]);
                if (documentsWithWord > 0 && Text_Number > 0) {
                    output.add(Math.log((double) Text_Number / documentsWithWord));
                } else {
                    output.add(0.0);
                }
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return output;
    }

    /**
     * Reads the test-case file into {@code {word, documentNumber}} pairs.
     *
     * <p>If the two lines hold different numbers of entries, only the leading entries that have
     * a partner are returned and a warning is printed on standard error.
     */
    private static List<String[]> readQueries(String testCase) throws IOException {
        List<String[]> queries = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(testCase))) {
            String wordLine = reader.readLine();
            String numberLine = reader.readLine();
            if (wordLine == null || numberLine == null) {
                throw new IOException(
                        "Test-case file needs a word line and a document-number line: " + testCase);
            }
            wordLine = wordLine.trim();
            numberLine = numberLine.trim();
            if (wordLine.isEmpty() || numberLine.isEmpty()) {
                return queries;
            }
            String[] words = wordLine.split("\\s+");
            String[] numbers = numberLine.split("\\s+");
            if (words.length != numbers.length) {
                System.err.println("Warning: " + words.length + " words but " + numbers.length
                        + " document numbers in " + testCase);
            }
            int pairs = Math.min(words.length, numbers.length);
            for (int i = 0; i < pairs; i++) {
                queries.add(new String[] {words[i], numbers[i]});
            }
        }
        return queries;
    }
}
/** One trie node: a child slot per lowercase letter plus the data of the word ending here. */
class TrieNode {
    /** Child for each letter, indexed by {@code letter - 'a'}; {@code null} when absent. */
    TrieNode[] children = new TrieNode[26];
    /** Whether some inserted word ends at this node. */
    boolean isEndOfWord = false;
    /** How many times the word ending at this node has been inserted. */
    int count = 0;
}

/**
 * Trie over words made of the letters {@code a}-{@code z} that also counts how often each word
 * was inserted. The empty word is stored at the root.
 */
class Trie {
    TrieNode root = new TrieNode();

    /**
     * Inserts a word into the trie and increments its occurrence count.
     *
     * @param word the word to add; every character must be in {@code a}-{@code z}
     * @throws IllegalArgumentException if the word contains any other character; the trie is
     *     left unchanged in that case
     */
    public void insert(String word) {
        // Validate up front so a rejected word cannot leave a half-built path behind.
        for (int i = 0; i < word.length(); i++) {
            if (!isLetter(word.charAt(i))) {
                throw new IllegalArgumentException(
                        "Only the letters a-z are supported, got: \"" + word + "\"");
            }
        }
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            int slot = word.charAt(i) - 'a';
            if (node.children[slot] == null) {
                node.children[slot] = new TrieNode();
            }
            node = node.children[slot];
        }
        node.isEndOfWord = true;
        node.count += 1;
    }

    /**
     * Checks whether the word has been inserted into the trie.
     *
     * @param word the word to look up
     * @return {@code true} if the word was inserted at least once; {@code false} otherwise,
     *     including when the word contains a character outside {@code a}-{@code z}
     */
    public boolean search(String word) {
        TrieNode node = find(word);
        return node != null && node.isEndOfWord;
    }

    /**
     * Returns how many times the word has been inserted.
     *
     * @param word the word to look up
     * @return the insertion count, or 0 if the word is absent or contains a character outside
     *     {@code a}-{@code z}
     */
    public int searchNum(String word){
        TrieNode node = find(word);
        return node == null ? 0 : node.count;
    }

    /** Walks the path spelled by {@code word}; returns its last node or null if it breaks off. */
    private TrieNode find(String word) {
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            // A word with an unsupported character can never have been stored.
            if (!isLetter(c)) {
                return null;
            }
            node = node.children[c - 'a'];
            if (node == null) {
                return null;
            }
        }
        return node;
    }

    /** Tells whether {@code c} maps to one of the 26 child slots. */
    private static boolean isLetter(char c) {
        return c >= 'a' && c <= 'z';
    }
}
// Leave a Comment