Language Modelling
mdaalam22
java
3 years ago
5.5 kB
29
Indexable
import static java.util.stream.Collectors.*;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Simple n-gram language-model statistics over a plain-text corpus
 * ("shakespear.txt" in the working directory): unigram/bigram/trigram
 * frequency tables, relative frequencies of selected words, and the
 * conditional probability P(nextWord | prevWord).
 */
public class LanguageModelling {

    /**
     * Reads "shakespear.txt" and returns its space-separated tokens.
     * Tokens of length 1 or 0 are discarded (original behavior preserved).
     *
     * @return the token list; empty if the file cannot be read
     */
    public ArrayList<String> readFile() {
        ArrayList<String> tokens = new ArrayList<String>();
        // try-with-resources closes the reader even on failure; the original
        // leaked the reader on exception and swallowed the error silently.
        try (BufferedReader br = Files.newBufferedReader(Paths.get("shakespear.txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String word : line.split(" ")) {
                    if (word.length() > 1) {
                        tokens.add(word);
                    }
                }
            }
        } catch (IOException e) {
            // Best-effort load kept (callers expect a list, possibly empty),
            // but the failure is no longer invisible.
            System.err.println("Could not read shakespear.txt: " + e);
        }
        return tokens;
    }

    /**
     * Returns the 10 highest-frequency entries of {@code word_freq},
     * sorted by descending count, in a LinkedHashMap that preserves rank order.
     *
     * @param word_freq n-gram -> occurrence count
     * @return up to 10 entries, most frequent first
     */
    public Map<String, Integer> getSortedMap(Map<String, Integer> word_freq) {
        return word_freq
                .entrySet()
                .stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                // BUG FIX: the original also did .skip(10), returning ranks 11-20
                // even though every call site prints "Highest frequency ...".
                .limit(10)
                .collect(
                        toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2,
                                LinkedHashMap::new));
    }

    /**
     * Conditional probability P(nextWord | prevWord): the fraction of
     * (case-insensitive) occurrences of {@code prevWord} that are immediately
     * followed by {@code nextWord}.
     *
     * @param prevWord  conditioning word
     * @param nextWord  following word
     * @param word_list token sequence to scan
     * @return the conditional probability, or 0.0 if prevWord never occurs
     */
    public double cpOfWord(String prevWord, String nextWord, ArrayList<String> word_list) {
        int pairCount = 0, prevCount = 0;
        for (int i = 0; i < word_list.size(); i++) {
            if (word_list.get(i).equalsIgnoreCase(prevWord)) {
                prevCount++;
                // BUG FIX: the original read word_list.get(i+1) unconditionally,
                // throwing IndexOutOfBoundsException when prevWord was the last token.
                if (i + 1 < word_list.size()
                        && word_list.get(i + 1).equalsIgnoreCase(nextWord)) {
                    pairCount++;
                }
            }
        }
        // BUG FIX: guard the 0/0 case when prevWord is absent from the corpus.
        return prevCount == 0 ? 0.0 : (double) pairCount / prevCount;
    }

    /** Counts occurrences of each gram in O(n) (replaces three O(n^2) nested loops). */
    private static Map<String, Integer> countFrequencies(ArrayList<String> grams) {
        Map<String, Integer> freq = new HashMap<String, Integer>();
        for (String g : grams) {
            freq.merge(g, 1, Integer::sum);
        }
        return freq;
    }

    /**
     * Builds the list of space-joined n-grams over {@code words}.
     * Positions too close to the end to form a full n-gram contribute the
     * single word instead — this mirrors the original tail handling exactly.
     */
    private static ArrayList<String> buildNgrams(ArrayList<String> words, int n) {
        ArrayList<String> grams = new ArrayList<String>(words.size());
        for (int k = 0; k < words.size(); k++) {
            if (k > words.size() - n) {
                grams.add(words.get(k));
            } else {
                StringBuilder sb = new StringBuilder(words.get(k));
                for (int j = 1; j < n; j++) {
                    sb.append(' ').append(words.get(k + j));
                }
                grams.add(sb.toString());
            }
        }
        return grams;
    }

    public static void main(String[] args) {
        LanguageModelling lng = new LanguageModelling();
        ArrayList<String> words_list = lng.readFile();

        // Unigram counts (previously an O(n^2) nested loop over the whole corpus).
        Map<String, Integer> unigram = countFrequencies(words_list);
        System.out.println("Total no. of word = " + Integer.toString(words_list.size()));
        // printing 10 highest frequency unigram word
        System.out.println("Highest frequency unigram\n" + lng.getSortedMap(unigram));

        // two word counting
        Map<String, Integer> bigram = countFrequencies(buildNgrams(words_list, 2));
        System.out.println("Highest frequency bigram\n" + lng.getSortedMap(bigram));

        // three word counting
        Map<String, Integer> trigram = countFrequencies(buildNgrams(words_list, 3));
        System.out.println("Highest frequency trigram\n" + lng.getSortedMap(trigram));

        // relative frequency of some words (0 when the word never occurs)
        System.out.println(unigram.getOrDefault("become", 0) * 1.0 / words_list.size());
        System.out.println(unigram.getOrDefault("brave", 0) * 1.0 / words_list.size());
        System.out.println(unigram.getOrDefault("treasure", 0) * 1.0 / words_list.size());

        // conditional probability of a word pair
        System.out.println(lng.cpOfWord("fairy", "land", words_list));
    }
}
Editor is loading...