Language Modelling

 avatar
mdaalam22
java
2 years ago
5.5 kB
20
Indexable
Never
import static java.util.stream.Collectors.*;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class LanguageModelling {

    /**
     * Reads "shakespear.txt" from the working directory and returns its
     * space-separated tokens, keeping only tokens longer than one character
     * (same filter as before).
     *
     * @return list of tokens; empty if the file cannot be read
     */
    public ArrayList<String> readFile() {
        ArrayList<String> tokens = new ArrayList<String>();
        // try-with-resources closes the reader even if readLine() throws
        // (the original leaked the reader on error and swallowed the exception)
        try (BufferedReader br = new BufferedReader(new FileReader("shakespear.txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String word : line.split(" ")) {
                    if (word.length() > 1) {
                        tokens.add(word);
                    }
                }
            }
        } catch (IOException e) {
            // Report instead of silently returning an empty list; callers
            // still get a usable (empty) result, preserving best-effort intent.
            System.err.println("Could not read shakespear.txt: " + e);
        }
        return tokens;
    }

    /**
     * Sorts a frequency map by value in descending order and returns ten
     * entries as an insertion-ordered map.
     *
     * NOTE(review): skip(10) means this returns the 11th-20th most frequent
     * entries, not the top 10 that the callers' "Highest frequency ..." labels
     * claim. Possibly intentional (e.g. skipping stop words) — confirm before
     * changing; behavior is preserved here.
     *
     * @param word_freq token -> occurrence count
     * @return entries ranked 11-20 by count, highest first
     */
    public Map<String, Integer> getSortedMap(Map<String, Integer> word_freq) {
        return word_freq
                .entrySet()
                .stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .skip(10)   // drop the 10 most frequent entries
                .limit(10)  // keep the next 10
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (e1, e2) -> e2, LinkedHashMap::new));
    }

    /**
     * Conditional probability P(nextWord | prevWord): the fraction of
     * occurrences of prevWord immediately followed by nextWord
     * (case-insensitive).
     *
     * Fixes two defects in the original:
     * - word_list.get(i + 1) threw IndexOutOfBoundsException when the last
     *   token matched prevWord; the bigram lookup is now bounds-guarded.
     * - returned NaN (0.0/0) when prevWord never occurs; now returns 0.0.
     *
     * @param prevWord  conditioning word
     * @param nextWord  following word
     * @param word_list token sequence to scan
     * @return P(nextWord | prevWord), or 0.0 if prevWord is absent
     */
    public double cpOfWord(String prevWord, String nextWord, ArrayList<String> word_list) {
        int andCount = 0, count = 0;
        for (int i = 0; i < word_list.size(); i++) {
            if (word_list.get(i).equalsIgnoreCase(prevWord)) {
                count++;
                if (i + 1 < word_list.size()
                        && word_list.get(i + 1).equalsIgnoreCase(nextWord)) {
                    andCount++;
                }
            }
        }
        return count == 0 ? 0.0 : (double) andCount / count;
    }

    /** Counts occurrences of each distinct token in O(n); the original main()
     *  used O(n^2) nested loops producing the identical map. */
    private static Map<String, Integer> countFrequencies(List<String> tokens) {
        Map<String, Integer> freq = new HashMap<String, Integer>();
        for (String token : tokens) {
            freq.merge(token, 1, Integer::sum);
        }
        return freq;
    }

    /**
     * Builds the n-gram strings the original main() built inline: position k
     * holds the n consecutive words starting at k, joined by single spaces.
     * Quirk preserved from the original: the last n-1 positions hold just the
     * single word at k (they have too few following words for a full n-gram).
     *
     * @param words token sequence
     * @param n     gram size (2 for bigrams, 3 for trigrams)
     * @return list of n-gram strings, one per input position
     */
    private static ArrayList<String> buildNgrams(List<String> words, int n) {
        ArrayList<String> ngrams = new ArrayList<String>();
        for (int k = 0; k < words.size(); k++) {
            if (k > words.size() - n) {
                ngrams.add(words.get(k));
            } else {
                StringBuilder gram = new StringBuilder(words.get(k));
                for (int j = 1; j < n; j++) {
                    gram.append(' ').append(words.get(k + j));
                }
                ngrams.add(gram.toString());
            }
        }
        return ngrams;
    }

    public static void main(String[] args) {
        LanguageModelling lng = new LanguageModelling();
        ArrayList<String> words_list = lng.readFile();

        // Unigram frequency table (identical map to the original nested loops,
        // computed in O(n) instead of O(n^2)).
        Map<String, Integer> unigram = countFrequencies(words_list);

        System.out.println("Total no. of word = " + Integer.toString(words_list.size()));
        // printing 10 highest frequency unigram word
        System.out.println("Highest frequency unigram\n" + lng.getSortedMap(unigram));

        // two word counting
        Map<String, Integer> bigram = countFrequencies(buildNgrams(words_list, 2));
        System.out.println("Highest frequency bigram\n" + lng.getSortedMap(bigram));

        // three word counting
        Map<String, Integer> trigram = countFrequencies(buildNgrams(words_list, 3));
        System.out.println("Highest frequency trigram\n" + lng.getSortedMap(trigram));

        // relative frequency of some words (0 when the word is absent,
        // matching the original null-check ternaries)
        System.out.println(unigram.getOrDefault("become", 0) * 1.0 / words_list.size());
        System.out.println(unigram.getOrDefault("brave", 0) * 1.0 / words_list.size());
        System.out.println(unigram.getOrDefault("treasure", 0) * 1.0 / words_list.size());

        // conditional probability of "land" given "fairy"
        System.out.println(lng.cpOfWord("fairy", "land", words_list));
    }
}