Language Modelling
mdaalam22
java
2 years ago
5.5 kB
20
Indexable
Never
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static java.util.stream.Collectors.toMap;

/**
 * Builds unigram/bigram/trigram frequency tables from a text file
 * ("shakespear.txt" in the working directory), prints the most frequent
 * n-grams, a few relative word frequencies, and one conditional probability.
 */
public class LanguageModelling {

    /**
     * Reads "shakespear.txt" line by line and returns every space-separated
     * token longer than one character, in file order.
     *
     * NOTE(review): the {@code length() > 1} filter also discards real
     * one-letter words ("a", "I") — kept as-is, confirm it is intentional.
     * NOTE(review): FileReader uses the platform default charset (pre-18);
     * specify a charset explicitly if the corpus is not in that encoding.
     *
     * @return list of tokens; partial (possibly empty) if the read fails
     */
    public ArrayList<String> readFile() {
        ArrayList<String> tokens = new ArrayList<>();
        // try-with-resources guarantees the reader is closed even on error
        try (BufferedReader br = new BufferedReader(new FileReader("shakespear.txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String word : line.split(" ")) {
                    if (word.length() > 1) {
                        tokens.add(word);
                    }
                }
            }
        } catch (IOException e) {
            // BUG FIX: the original empty catch swallowed failures silently
            System.err.println("Failed to read shakespear.txt: " + e.getMessage());
        }
        return tokens;
    }

    /**
     * Returns the 10 highest-count entries of the given frequency map,
     * in descending order of count (insertion-ordered LinkedHashMap).
     *
     * BUG FIX: the original chained {@code .skip(10).limit(10)}, silently
     * returning ranks 11-20 while its comment and every caller's message
     * claim "highest frequency" — the skip is removed.
     *
     * @param word_freq token-to-count map (not modified)
     * @return up to 10 entries with the largest counts, largest first
     */
    public Map<String, Integer> getSortedMap(Map<String, Integer> word_freq) {
        return word_freq.entrySet().stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .limit(10)
                .collect(toMap(Map.Entry::getKey, Map.Entry::getValue,
                        (e1, e2) -> e2, LinkedHashMap::new));
    }

    /**
     * Estimates the conditional probability P(nextWord | prevWord) from the
     * token list: count(prevWord followed by nextWord) / count(prevWord),
     * compared case-insensitively.
     *
     * BUG FIX: the original indexed {@code word_list.get(i + 1)} with i
     * running to the last element (ArrayIndexOutOfBoundsException whenever
     * the final token matched prevWord) and divided by zero when prevWord
     * never occurs; this version guards both and returns 0.0 when prevWord
     * is absent.
     *
     * @param prevWord  conditioning word
     * @param nextWord  following word
     * @param word_list corpus tokens in order
     * @return estimated probability in [0, 1]; 0.0 if prevWord is absent
     */
    public double cpOfWord(String prevWord, String nextWord, ArrayList<String> word_list) {
        int and_count = 0;
        int count = 0;
        for (int i = 0; i < word_list.size(); i++) {
            if (word_list.get(i).equalsIgnoreCase(prevWord)) {
                count++;
                // only look ahead when a following token actually exists
                if (i + 1 < word_list.size()
                        && word_list.get(i + 1).equalsIgnoreCase(nextWord)) {
                    and_count++;
                }
            }
        }
        return count == 0 ? 0.0 : (double) and_count / count;
    }

    /**
     * Counts every full n-gram (n consecutive tokens joined by single spaces).
     *
     * BUG FIX: the original appended the trailing &lt;n leftover tokens as
     * bare words, so single words were counted among the bigrams/trigrams.
     * PERF: counting uses Map.merge in O(tokens) instead of the original
     * O(tokens^2) nested re-counting loops.
     *
     * @param words corpus tokens in order
     * @param n     gram size (1 = unigram, 2 = bigram, ...)
     * @return map from n-gram string to its occurrence count
     */
    private static Map<String, Integer> buildNgramCounts(List<String> words, int n) {
        Map<String, Integer> counts = new HashMap<>();
        for (int k = 0; k + n <= words.size(); k++) {
            counts.merge(String.join(" ", words.subList(k, k + n)), 1, Integer::sum);
        }
        return counts;
    }

    public static void main(String[] args) {
        LanguageModelling lng = new LanguageModelling();
        ArrayList<String> words_list = lng.readFile();

        System.out.println("Total no. of word = " + words_list.size());

        // 10 highest-frequency unigrams
        Map<String, Integer> unigram = buildNgramCounts(words_list, 1);
        System.out.println("Highest frequency unigram\n" + lng.getSortedMap(unigram));

        // 10 highest-frequency bigrams
        Map<String, Integer> bigram = buildNgramCounts(words_list, 2);
        System.out.println("Highest frequency bigram\n" + lng.getSortedMap(bigram));

        // 10 highest-frequency trigrams
        Map<String, Integer> trigram = buildNgramCounts(words_list, 3);
        System.out.println("Highest frequency trigram\n" + lng.getSortedMap(trigram));

        // relative frequency of a few words (0.0 when absent or corpus empty,
        // instead of the original's 0/0 -> NaN on an empty corpus)
        for (String w : Arrays.asList("become", "brave", "treasure")) {
            int c = unigram.getOrDefault(w, 0);
            System.out.println(words_list.isEmpty() ? 0.0 : (double) c / words_list.size());
        }

        // conditional probability P("land" | "fairy")
        System.out.println(lng.cpOfWord("fairy", "land", words_list));
    }
}