Untitled

 avatar
unknown
plain_text
a year ago
5.0 kB
8
Indexable
import java.io.BufferedReader;
import java.io.FileReader;

import java.util.ArrayList;
import java.util.List;

import java.util.Set;
import java.util.HashSet;


/**
 * Command-line entry point: indexes a corpus file, then prints one TF-IDF
 * score per query listed in a test-case file.
 */
class TFIDFCalculator{

    /**
     * Builds the index from {@code args[0]}, evaluates the queries in
     * {@code args[1]} and prints each TF-IDF score on its own line.
     *
     * @param args {@code args[0]} is the corpus file path, {@code args[1]} is
     *             the test-case file path
     */
    public static void main(String[] args) {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: java TFIDFCalculator <corpus-file> <test-case-file>");
            System.exit(1);
        }
        String filename = args[0];
        String testCase = args[1];

        // The TF_IDF API is static, so no instance is needed.
        TF_IDF.Store_Words_To_Trie(filename);

        List<Double> tf = TF_IDF.TF(testCase);
        List<Double> idf = TF_IDF.IDF(testCase);

        // Both lists are produced from the same query file; if one of them
        // was cut short by an error, only score the queries both cover
        // rather than indexing past the end of the shorter one.
        int queryCount = Math.min(tf.size(), idf.size());
        for (int i = 0; i < queryCount; i++) {
            System.out.println(tf.get(i) * idf.get(i));
        }
    }
}

/**
 * Static TF-IDF index over a corpus file in which every
 * {@link #LINES_PER_TEXT} consecutive lines form one text.
 *
 * <p>All state is held in static fields, so repeated calls to
 * {@link #Store_Words_To_Trie(String)} keep appending to the same index.
 */
class TF_IDF{

    /** Capacity of the per-text arrays; indexing more texts than this fails. */
    private static final int MAX_TEXTS = 60000;

    /** Number of consecutive input lines that make up one text. */
    private static final int LINES_PER_TEXT = 5;

    /**
     * Word tries indexed by text number. Slots are created on first use and
     * remain {@code null} for texts that contain no words.
     */
    public static Trie[] trie = new Trie[MAX_TEXTS];

    /** Counts, per word, how many texts contain that word at least once. */
    public static Trie word_for_text_count = new Trie();

    /** Number of words stored for each text, indexed by text number. */
    public static int[] TotalNumber = new int[MAX_TEXTS];

    /** Number of texts indexed so far; also the index of the text being filled. */
    public static int Text_Number = 0;

    /**
     * Reads the corpus file and indexes every word of every text.
     *
     * <p>Each line is lower-cased and every character outside {@code a-z} is
     * treated as a separator. Errors are reported on standard error and stop
     * the indexing; whatever was indexed up to that point is kept.
     *
     * @param fileName path of the corpus file
     */
    public static void Store_Words_To_Trie(String fileName){
        // try-with-resources so the file handle is released even on failure.
        try (BufferedReader bf = new BufferedReader(new FileReader(fileName))) {
            // Words already seen in the current text, so that each text
            // contributes at most once to a word's document frequency.
            Set<String> used = new HashSet<>();
            int lineCount = 0;
            String line;
            while ((line = bf.readLine()) != null) {
                // Locale.ROOT keeps the lower-casing independent of the
                // platform's default locale.
                String normalized = line.toLowerCase(java.util.Locale.ROOT)
                        .replaceAll("[^a-z]", " ");
                for (String token : normalized.split("\\s+")) {
                    // split() yields an empty string for blank lines and for
                    // lines that start with a separator; those are not words.
                    if (token.isEmpty()) {
                        continue;
                    }
                    // The array starts out full of nulls, so the trie for a
                    // text has to be created before its first insert.
                    if (trie[Text_Number] == null) {
                        trie[Text_Number] = new Trie();
                    }
                    trie[Text_Number].insert(token);
                    TotalNumber[Text_Number]++;

                    if (used.add(token)) {
                        word_for_text_count.insert(token);
                    }
                }
                lineCount++;
                if (lineCount == LINES_PER_TEXT) {
                    lineCount = 0;
                    Text_Number++;
                    used.clear();
                }
            }
            // A trailing text shorter than LINES_PER_TEXT has already been
            // indexed above; count it too, otherwise a word's document
            // frequency could exceed the number of texts. Trailing blank
            // lines alone do not make a text.
            if (lineCount > 0 && Text_Number < MAX_TEXTS && TotalNumber[Text_Number] > 0) {
                Text_Number++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the term frequency of each query in the test-case file.
     *
     * <p>The first line of the file lists the words and the second line lists,
     * position by position, the text number each word is looked up in.
     * Errors are reported on standard error; results computed before the
     * error are still returned.
     *
     * @param testCase path of the test-case file
     * @return one value per query: occurrences of the word in its text divided
     *         by the text's word count, or {@code 0.0} for a text without words
     */
    public static ArrayList<Double> TF(String testCase){
        ArrayList<Double> output = new ArrayList<>();
        try {
            String[][] query = readTestCase(testCase);
            String[] words = query[0];
            String[] numbers = query[1];
            for (int i = 0; i < words.length; i++) {
                int textIndex = Integer.parseInt(numbers[i]);

                int count = lookup(trie[textIndex], words[i]);
                int totalCount = TotalNumber[textIndex];

                // Avoid 0/0 = NaN for a text that holds no words.
                output.add(totalCount == 0 ? 0.0 : (double) count / totalCount);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return output;
    }

    /**
     * Computes the inverse document frequency of each word in the test-case
     * file.
     *
     * <p>Errors are reported on standard error; results computed before the
     * error are still returned.
     *
     * @param testCase path of the test-case file
     * @return one value per query word: {@code ln(texts / textsContainingWord)},
     *         or {@code 0.0} when no text contains the word
     */
    public static ArrayList<Double> IDF(String testCase){
        ArrayList<Double> output = new ArrayList<>();
        try {
            String[] words = readTestCase(testCase)[0];
            for (String word : words) {
                int textsWithWord = lookup(word_for_text_count, word);

                // A word found in no text would divide by zero and later turn
                // the TF-IDF product into NaN; score it as 0 instead.
                output.add(textsWithWord == 0
                        ? 0.0
                        : Math.log((double) Text_Number / textsWithWord));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return output;
    }

    /**
     * Reads the two query lines of a test-case file.
     *
     * @return a two-element array: the words, then the text numbers
     */
    private static String[][] readTestCase(String testCase) throws java.io.IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(testCase))) {
            String wordLine = reader.readLine();
            String numberLine = reader.readLine();
            if (wordLine == null || numberLine == null) {
                throw new IllegalArgumentException(
                        "Test case file needs a word line and a text-number line: " + testCase);
            }
            // trim() so leading whitespace does not produce an empty first entry.
            return new String[][] {
                wordLine.trim().split("\\s+"),
                numberLine.trim().split("\\s+")
            };
        }
    }

    /**
     * Returns the count stored for {@code word} in {@code index}, normalising
     * the word the same way the corpus was normalised.
     */
    private static int lookup(Trie index, String word) {
        if (index == null) {
            return 0;
        }
        String key = word.toLowerCase(java.util.Locale.ROOT);
        for (int i = 0; i < key.length(); i++) {
            char c = key.charAt(i);
            // Only a-z is ever stored, so anything else cannot match.
            if (c < 'a' || c > 'z') {
                return 0;
            }
        }
        return index.searchNum(key);
    }
}


/**
 * One node of a {@code Trie} over the lowercase letters {@code a-z}.
 */
class TrieNode {
    /** One child slot per letter, {@code 'a'} through {@code 'z'}. */
    private static final int ALPHABET_SIZE = 26;

    /** Child nodes indexed by {@code letter - 'a'}; unused slots are {@code null}. */
    TrieNode[] children = new TrieNode[ALPHABET_SIZE];

    /** Whether a word ends at this node; starts out {@code false}. */
    boolean isEndOfWord;

    /** Counter attached to this node; starts out at zero. */
    int count;
}

/**
 * Prefix tree over words made of the lowercase letters {@code a-z} that also
 * records how many times each word was inserted.
 */
class Trie {
    TrieNode root = new TrieNode();

    /**
     * Inserts a word into the trie and increments its insertion count.
     *
     * @param word the word to insert; every character must be in {@code a-z}
     * @throws IllegalArgumentException if the word contains any other
     *         character; the trie is left unchanged in that case
     */
    public void insert(String word) {
        // Validate before touching the trie so a bad word cannot leave a
        // half-built path behind.
        for (int i = 0; i < word.length(); i++) {
            if (!isLowercaseLetter(word.charAt(i))) {
                throw new IllegalArgumentException(
                        "Trie only stores a-z, got '" + word.charAt(i) + "' in \"" + word + "\"");
            }
        }

        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            int slot = word.charAt(i) - 'a';
            if (node.children[slot] == null) {
                node.children[slot] = new TrieNode();
            }
            node = node.children[slot];
        }

        node.isEndOfWord = true;
        node.count += 1;
    }

    /**
     * Checks whether the word has been inserted into the trie.
     *
     * @param word the word to look for
     * @return {@code true} if the word was inserted; {@code false} otherwise,
     *         including when it contains characters outside {@code a-z}
     */
    public boolean search(String word) {
        TrieNode node = find(word);
        return node != null && node.isEndOfWord;
    }

    /**
     * Returns how many times the word has been inserted.
     *
     * @param word the word to look for
     * @return the insertion count, or {@code 0} if the word is absent or
     *         contains characters outside {@code a-z}
     */
    public int searchNum(String word){
        TrieNode node = find(word);
        return node == null ? 0 : node.count;
    }

    /** Walks the path spelled by {@code word}; returns {@code null} if it does not exist. */
    private TrieNode find(String word) {
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            // A character outside a-z has no child slot, so it cannot match;
            // report "not found" instead of indexing out of bounds.
            if (!isLowercaseLetter(c)) {
                return null;
            }
            node = node.children[c - 'a'];
            if (node == null) {
                return null;
            }
        }
        return node;
    }

    /** Tells whether {@code c} maps to one of the 26 child slots. */
    private static boolean isLowercaseLetter(char c) {
        return c >= 'a' && c <= 'z';
    }
}
Editor is loading...
Leave a Comment