// QuadGramLikelihoods.java 1.73 KB
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

/**
 * Lookup table of base-10 log-likelihoods for quad-grams, built from the
 * frequency counts stored in {@code english_quadgrams.txt} in the current
 * working directory.
 *
 * <p>Each known quad-gram is scored as {@code log10(count / total)}, where
 * {@code total} is the sum of all counts plus the number of distinct
 * quad-grams in the file. Any quad-gram that is not in the file is scored as
 * {@code log10(1 / total)}.
 */
public class QuadGramLikelihoods {
    /** Name of the frequency file, resolved against the working directory. */
    private static final String QUADGRAM_FILE = "english_quadgrams.txt";

    private final Map<String, Double> likelihoods;
    private final double notFoundLikelihood;

    /**
     * Loads the frequency file and precomputes the log-likelihood of every
     * quad-gram it contains.
     *
     * <p>Note: if the file contains no entries, {@code total} is zero and the
     * fallback score evaluates to positive infinity.
     *
     * @throws FileNotFoundException if the frequency file cannot be opened
     */
    public QuadGramLikelihoods() throws FileNotFoundException {
        Map<String, Integer> freqs = getQuadFrequencies();

        // Accumulate in a long so that many int counts cannot overflow.
        long total = 0;
        for (int freq : freqs.values()) {
            total += freq;
        }

        // The denominator also includes one unit per distinct quad-gram.
        total += freqs.size();

        Map<String, Double> table = new HashMap<>();
        for (Map.Entry<String, Integer> entry : freqs.entrySet()) {
            table.put(entry.getKey(), Math.log10((double) entry.getValue() / (double) total));
        }
        this.likelihoods = table;
        this.notFoundLikelihood = Math.log10(1.0 / total);
    }

    /**
     * Reads the quad-gram frequency file into a map.
     *
     * <p>Each non-blank line must hold at least two whitespace-separated
     * fields: the quad-gram followed by its integer count. Leading and
     * trailing whitespace, and runs of whitespace between the fields, are
     * tolerated; blank lines are skipped; any fields after the second are
     * ignored. If a quad-gram appears more than once, the last count wins.
     *
     * @return a map from quad-gram to its count
     * @throws FileNotFoundException if the frequency file cannot be opened
     * @throws IllegalArgumentException if a non-blank line has fewer than two
     *         fields, or (as {@link NumberFormatException}) if the count is
     *         not a valid integer
     */
    public static Map<String, Integer> getQuadFrequencies() throws FileNotFoundException {
        Map<String, Integer> freqs = new HashMap<>();

        // try-with-resources guarantees the file handle is released even if
        // a line fails to parse.
        try (Scanner quadgrams = new Scanner(new File(QUADGRAM_FILE))) {
            while (quadgrams.hasNextLine()) {
                String line = quadgrams.nextLine().trim();
                if (line.isEmpty()) {
                    continue;
                }

                // "\\s+" collapses repeated separators so that neither field
                // comes out as an empty string.
                String[] pieces = line.split("\\s+");
                if (pieces.length < 2) {
                    throw new IllegalArgumentException(
                            "Malformed line in " + QUADGRAM_FILE + ": \"" + line + "\"");
                }
                freqs.put(pieces[0], Integer.parseInt(pieces[1]));
            }
        }

        return freqs;
    }

    /**
     * Returns the log-likelihood of the provided quad-gram.
     *
     * @param quadgram the quad-gram to get the likelihood of
     * @return the base-10 log-likelihood as calculated by our data set, or
     *         the fallback score if the quad-gram is not in the data set
     */
    public double get(String quadgram) {
        Double likelihood = likelihoods.get(quadgram);
        return (likelihood != null) ? likelihood : this.notFoundLikelihood;
    }
}