1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
/**
 * Lookup table of base-10 log-likelihoods for quad-grams, built from the
 * frequency counts stored in {@code english_quadgrams.txt}.
 *
 * <p>Each likelihood is {@code log10(count / total)}, where {@code total} is
 * the sum of all counts plus the number of distinct quad-grams in the file.
 * Quad-grams that are absent from the file are scored as
 * {@code log10(1 / total)}.
 */
public class QuadGramLikelihoods {
    /** Name of the frequency file, resolved against the working directory. */
    private static final String QUADGRAM_FILE = "english_quadgrams.txt";

    /** Maps each known quad-gram to its base-10 log-likelihood. */
    private final Map<String, Double> likelihoods;

    /** Log-likelihood returned for any quad-gram missing from the table. */
    private final double notFoundLikelihood;

    /**
     * Loads the quad-gram frequencies from disk and precomputes the
     * log-likelihood of every quad-gram listed there.
     *
     * @throws FileNotFoundException if the frequency file cannot be opened
     */
    public QuadGramLikelihoods() throws FileNotFoundException {
        Map<String, Integer> freqs = getQuadFrequencies();

        // Accumulate in a long so that summing many int counts cannot overflow.
        long total = 0;
        for (int count : freqs.values()) {
            total += count;
        }
        // The denominator is additionally enlarged by one per distinct quad-gram.
        total += freqs.size();

        this.likelihoods = new HashMap<>();
        for (Map.Entry<String, Integer> entry : freqs.entrySet()) {
            double probability = (double) entry.getValue() / (double) total;
            this.likelihoods.put(entry.getKey(), Math.log10(probability));
        }
        this.notFoundLikelihood = Math.log10(1.0 / total);
    }

    /**
     * Reads the quad-gram frequency file into a map.
     *
     * <p>Each non-blank line must hold a quad-gram followed by an integer
     * count, separated by one or more whitespace characters. Blank lines are
     * skipped. If a quad-gram appears more than once, the last count wins.
     *
     * @return a map from quad-gram to its count as listed in the file
     * @throws FileNotFoundException if the frequency file cannot be opened
     * @throws IllegalArgumentException if a non-blank line has no count field;
     *         a count that is not a valid integer raises the
     *         {@link NumberFormatException} subclass
     */
    public static Map<String, Integer> getQuadFrequencies() throws FileNotFoundException {
        Map<String, Integer> freqs = new HashMap<>();
        // try-with-resources guarantees the underlying file handle is released,
        // even when a malformed line aborts parsing part-way through.
        try (Scanner quadgrams = new Scanner(new File(QUADGRAM_FILE))) {
            while (quadgrams.hasNextLine()) {
                String line = quadgrams.nextLine().trim();
                if (line.isEmpty()) {
                    // Tolerate blank lines, e.g. a trailing newline at end of file.
                    continue;
                }
                // "\\s+" collapses runs of spaces/tabs so column-aligned files parse.
                String[] pieces = line.split("\\s+");
                if (pieces.length < 2) {
                    throw new IllegalArgumentException(
                            "Malformed quad-gram line (expected \"<quadgram> <count>\"): " + line);
                }
                freqs.put(pieces[0], Integer.parseInt(pieces[1]));
            }
        }
        return freqs;
    }

    /**
     * Returns the log-likelihood of the provided quad-gram.
     *
     * @param quadgram the quad-gram to get the likelihood of
     * @return the base-10 log-likelihood as calculated by our data set, or the
     *         fixed "not found" likelihood when the quad-gram is not in the table
     */
    public double get(String quadgram) {
        Double likelihood = likelihoods.get(quadgram);
        return likelihood != null ? likelihood : this.notFoundLikelihood;
    }
}