Fill in the TODO blanks in ModelMatcher.java and MatcherController.java
// MarkovModel.java
import java.util.Set;
/**
* Construct a Markov model of order /k/ based on an input string.
*
* @author
* @version
*/
public class MarkovModel
{
/** Markov model order parameter */
int k;
/** ngram model of order k */
NgramAnalyser ngram;
/** ngram model of order k+1 */
NgramAnalyser n1gram;
/**
* Construct an order-k Markov model from string s
* @param k int order of the Markov model
* @param s String input to be modelled
*/
public MarkovModel(int k, String s)
{
ngram = new NgramAnalyser(k, s);
n1gram = new NgramAnalyser((k+1), s);
}
/**
* @return order of this Markov model
*/
public int getK()
{
return k;
}
/** Estimate the probability of a sequence appearing in the text
* using simple estimate of freq seq / frequency front(seq).
* @param sequence String of length k+1
* @return double probability of the last letter occuring in the
* context of the first ones or 0 if front(seq) does not occur.
*/
public double simpleEstimate(String sequence) {
double prob;
String seqNotLast = sequence.substring(0, sequence.length()-1);
if (ngram.getDistinctNgrams().contains(seqNotLast))
{
double n1g = n1gram.getNgramFrequency(sequence);
double ng = ngram.getNgramFrequency(seqNotLast);
try{
prob = (n1g/ng);
}
catch(ArithmeticException e){
return 0.0;
}
return prob;
}
else
{
return 0.0;
}
}
/**
* Calculate the Laplacian probability of string obs given this Markov model
* @input sequence String of length k+1
*/
public double laplaceEstimate(String sequence)
{
String context = sequence.substring(0, sequence.length()-1);
double npc = n1gram.getNgramFrequency(sequence);
double np = ngram.getNgramFrequency(context);
double laplace;
laplace = (npc + 1)/(np + ngram.getAlphabetSize());
return laplace;
}
/**
* @return String representing this Markov model
*/
public String toString()
{
String toRet = “”;
String k = Integer.toString(getK());
toRet += (k + “n”);
toRet += (Integer.toString(ngram.getAlphabetSize()) + “n”);
toRet += ngram.toString() + n1gram.toString();
return toRet;
}
}
————————————————————————————————————————–
————————————————————————————————————————–
//ModelMatcher.java
import java.util.HashMap;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Report the average log likelihood of a test String occuring in a
* given Markov model and detail the calculated values behind this statistic.
*
* @author
* @version
*/
public class ModelMatcher
{
    /** log likelihoods for a teststring under a given model */
    private HashMap<String,Double> logLikelihoodMap;
    /** summary statistic for this setting */
    private double averageLogLikelihood;

    /**
     * Constructor to initialise the fields for the log likelihood map for
     * a test string and a given Markov model, and the average log
     * likelihood summary statistic.
     * @param model a given Markov model object
     * @param testString the string to match against the model
     */
    public ModelMatcher(MarkovModel model, String testString)
    {
        logLikelihoodMap = new HashMap<String,Double>();
        int k = model.getK();
        int n = testString.length();
        // Slide a (k+1)-length window over the test string; when the window
        // runs past the end, wrap around to the start of the string.
        for (int i = 0; i < n; i++)
        {
            String seq;
            if (i + k + 1 <= n)
            {
                seq = testString.substring(i, i + k + 1);
            }
            else
            {
                String tail = testString.substring(i);
                seq = tail + testString.substring(0, (k + 1) - tail.length());
            }
            double logProb = Math.log10(model.laplaceEstimate(seq));
            // An ngram that occurs more than once accumulates its log
            // likelihood, so the map totals match the whole-string product.
            Double existing = logLikelihoodMap.get(seq);
            logLikelihoodMap.put(seq, existing == null ? logProb : existing + logProb);
        }
        // One ngram per character position (thanks to wraparound).
        averageLogLikelihood = averageLogLikelihood(logLikelihoodMap, n);
    }

    /** Helper method that calculates the average log likelihood statistic
     * given a HashMap of strings and their Laplace probabilities
     * and the total number of ngrams in the model.
     *
     * @param logs map of ngram strings and their log likelihood
     * @param ngramCount int number of ngrams in the original test string
     * @return average log likelihood: the total of loglikelihoods
     *         divided by the ngramCount
     */
    private double averageLogLikelihood(HashMap<String,Double> logs, int ngramCount)
    {
        // Delegate the summation so the two statistics cannot disagree.
        return totalLogLikelihood(logs) / ngramCount;
    }

    /** Helper method to calculate the total log likelihood statistic
     * given a HashMap of strings and their Laplace probabilities
     * and the total number of ngrams in the model.
     *
     * @param logs map of ngram strings and their log likelihood
     * @return total log likelihood: the sum of loglikelihoods in logs
     */
    private double totalLogLikelihood(HashMap<String,Double> logs)
    {
        double total = 0.0;
        for (Double value : logs.values())
        {
            total += value;
        }
        return total;
    }

    /**
     * @return the average log likelihood statistic
     */
    public double getAverageLogLikelihood()
    {
        return averageLogLikelihood;
    }

    /**
     * @param ngram the ngram to look up
     * @return the log likelihood value for a given ngram from the input string
     */
    public double getLogLikelihood(String ngram)
    {
        return (logLikelihoodMap.get(ngram));
    }

    /**
     * Make a String summarising the log likelihood map and its statistics.
     * @return String of ngrams and their log likelihoods,
     *         ordered from highest to lowest likelihood
     */
    public String toString()
    {
        ArrayList<String> keys = new ArrayList<String>(logLikelihoodMap.keySet());
        // Spec requires highest-to-lowest likelihood ordering.
        keys.sort((a, b) -> Double.compare(logLikelihoodMap.get(b), logLikelihoodMap.get(a)));
        StringBuilder result = new StringBuilder();
        result.append(averageLogLikelihood).append("\n");
        for (String key : keys)
        {
            result.append(key).append(" ").append(logLikelihoodMap.get(key)).append("\n");
        }
        return result.toString();
    }
}
————————————————————————————————————————–
————————————————————————————————————————–
//MatcherController.java
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.*;
/** Create and manipulate Markov models and model matchers for lists of training data
* a test data String and generate output from it for convenient display.
*
* @author
* @version
*
*/
public class MatcherController {
    /** list of training data string used to generate markov models */
    ArrayList<String> trainingDataList;
    /** test data to be matched with the models */
    String testData;
    /** order of the markov models*/
    int k;
    /** generated list of markov models for the given training data*/
    ArrayList<MarkovModel> modelList;
    /** generated list of matchers for the given markov models and test data*/
    ArrayList<ModelMatcher> matcherList;

    /** Generate models for analysis.
     * @param k order of the markov models to be used
     * @param trainingDataList strings used to build one model each
     * @param testData String to check against different models
     * @throws IllegalArgumentException if the order or data inputs are invalid
     */
    public MatcherController(int k, ArrayList<String> trainingDataList, String testData)
    {
        if (k < 0) {
            throw new IllegalArgumentException("Model order k cannot be negative");
        }
        if (testData == null || testData.isEmpty()) {
            throw new IllegalArgumentException("Test data must be a non-empty string");
        }
        if (trainingDataList == null || trainingDataList.isEmpty()) {
            throw new IllegalArgumentException("Training data list must be non-empty");
        }
        this.k = k;
        this.trainingDataList = trainingDataList;
        this.testData = testData;
        modelList = new ArrayList<MarkovModel>();
        matcherList = new ArrayList<ModelMatcher>();
        // One Markov model per training string...
        for (String training : trainingDataList) {
            modelList.add(new MarkovModel(k, training));
        }
        // ...and one matcher per model, all sharing the same test string.
        for (MarkovModel model : modelList) {
            matcherList.add(new ModelMatcher(model, testData));
        }
    }

    /** Read a whole file into a string.
     * @param filename the name of the file to read
     * @return a string containing all lines from the file if the contents
     *         can be read, otherwise null; exceptions are handled here
     */
    private static String getFileContents(String filename) {
        StringBuilder contents = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(filename));
            String line;
            while ((line = reader.readLine()) != null) {
                contents.append(line).append("\n");
            }
            return contents.toString();
        } catch (IOException e) {
            // Contract: swallow the exception and signal failure with null.
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // best effort: nothing useful to do if close fails
                }
            }
        }
    }

    /**
     * @param candidates the matchers to compare
     * @return the ModelMatcher object that has the highest average
     *         loglikelihood (where all candidates are trained for the
     *         same test string), or null if candidates is null/empty
     */
    public ModelMatcher getBestMatch(ArrayList<ModelMatcher> candidates)
    {
        if (candidates == null || candidates.isEmpty()) {
            return null;
        }
        ModelMatcher best = candidates.get(0);
        for (ModelMatcher candidate : candidates) {
            if (candidate.getAverageLogLikelihood() > best.getAverageLogLikelihood()) {
                best = candidate;
            }
        }
        return best;
    }

    /** @return String an *explanation* of
     * why the test string is the match from the candidate models
     */
    public String explainBestMatch(ModelMatcher best) {
        if (best == null) {
            return null;
        }
        return "This model is the best match because it gives the test string the "
            + "highest average log likelihood (" + best.getAverageLogLikelihood()
            + ") of all the candidate models.";
    }

    /** Display an error to the user in a manner appropriate
     * for the interface being used.
     *
     * @param message
     */
    public void displayError(String message) {
        // LEAVE THIS METHOD EMPTY
    }
}
Task 3 - Matching test strings to a model (ModelMatcher). For this task, you will need to complete the ModelMatcher class, which determines which of two Markov models better matches a test string. Given two Markov models built from text samples taken from two different sources, we can use them to estimate which source it is more likely a test string was drawn from. Even using only zero-th order models constructed using English and Russian text, we should be able to tell, for instance, that the string "bopLLI oe3EKyceH" is more likely to be Russian than English. We will compute a measure of fit of test strings against models, called the likelihood of a sequence under the model. The likelihood of a sequence under a k-th order Markov model is calculated as follows: for each symbol c in the sequence, compute the probability of observing c under the model, given its k-letter context p (assuming the sequence to "wrap around", as described above), using the Laplace-smoothed estimate of probability we used for the MarkovModel class. Then compute the likelihood of the entire sequence as the product of the likelihoods of each character. In mathematical notation: let s be an input sequence of length n, and let M be a k-th order Markov model. In order to calculate the likelihood of the sequence s under the model M, for each symbol $c_i$ in s (where $1 \le i \le n$), let $p_i$ be the k-length context of the symbol $c_i$ (assuming wraparound). The likelihood of the sequence s under the model is $\prod_{i=1}^{n} \mathrm{laplace}(c_i)$, where laplace is the Laplace-smoothed probability of $c_i$ occurring (given its context) as described in the previous task. The probability we obtain from this calculation may be very small — in fact, potentially so small as to be indistinguishable from zero when using Java's built-in floating-point arithmetic. Therefore we will calculate and express the likelihood using log probabilities, which do not suffer from this problem.
(A weakness of log probabilities is that they cannot straightforwardly represent probabilities of zero, but our use of Laplace smoothing allows us to avoid this problem.) The product of two probabilities p and q can be calculated by adding their log probabilities, log p and log q. By way of example, suppose we have constructed a 2nd-order Markov model using the input string "aabcabaacaac", as described in the example for Task 2. If we are then given a test string "aabbcaac", we can compute its log likelihood as follows: for each character in the test string, obtain its length-2 context (assuming wraparound). Note that the tri-gram "caa" occurs twice in the (wrapped) test string. [Table garbled in extraction — its columns were Context, Character, Frequency, with rows for contexts such as "bb", "bc", "ca".]
Expert Answer
public class MarkovModel
{
/** Markov model order parameter */
int k;
/** ngram model of order k */
NgramAnalyser ngram;
/** ngram model of order k+1 */
NgramAnalyser n1gram;
/**
* Construct an order-k Markov model from string s
* @param k int order of the Markov model
* @param s String input to be modelled
*/
public MarkovModel(int k, String s)
{
this.k = k;
ngram = new NgramAnalyser(k, s);
n1gram = new NgramAnalyser((k+1), s);
}
/**
* @return order of this Markov model
*/
public int getK()
{
return k;
}
/** Estimate the probability of a sequence appearing in the text
* using simple estimate of freq seq / frequency front(seq).
* @param sequence String of length k+1
* @return double probability of the last letter occurring in the
* context of the first ones or 0 if front(seq) does not occur.
*/
public double simpleEstimate(String sequence) {
double prob;
String seqNotLast = sequence.substring(0, sequence.length()-1);
if (ngram.getDistinctNgrams().contains(seqNotLast))
{
double n1g = n1gram.getNgramFrequency(sequence);
double ng = ngram.getNgramFrequency(seqNotLast);
try{
prob = (n1g/ng);
}
catch(ArithmeticException e){
return 0.0;
}
return prob;
}
else
{
return 0.0;
}
}
/**
* Calculate the Laplacian probability of string obs given this Markov model
* @input sequence String of length k+1
* @return Laplacian Probability
*/
public double laplaceEstimate(String sequence)
{
String context = sequence.substring(0, (sequence.length()-1));
double npc = n1gram.getNgramFrequency(sequence);
double np = ngram.getNgramFrequency(context);
double laplace;
laplace = (npc + 1)/(np + n1gram.getAlphabetSize());
return laplace;
}
/**
* @return String representing this Markov model
*/
public String toString()
{
String toRet = “”;
String k = Integer.toString(getK());
toRet += (k + “n”);
toRet += (Integer.toString(ngram.getAlphabetSize()) + “n”);
toRet += ngram.toString() + n1gram.toString();
return toRet;
}
}
MatcherController.java
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
public class MatcherController {
/** list of training data string used to generate markov models */
ArrayList<String> trainingDataList;
/** test data to be matched with the models */
String testData;
/** order of the markov models*/
int k;
/** generated list of markov models for the given training data*/
ArrayList<MarkovModel> modelList;
/** generated list of matchers for the given markov models and test data*/
ArrayList<ModelMatcher> matcherList;
/** Generate models for analysis
* @param k order of the markov models to be used
* @param trainingDataList List of strings used to generate Markov Models
* @param testData String to check against different models
* @throw unchecked exceptions if the input order or data inputs are invalid
*/
public MatcherController(int k, ArrayList<String> trainingDataList, String testData)
{
if (k < 0)
{
throw new IllegalArgumentException(“K cannot be less than zero”);
}
if (testData == “” || testData == null)
{
throw new IllegalArgumentException(“The data to test cannot be blank”);
}
modelList = new ArrayList<MarkovModel>();
matcherList = new ArrayList<ModelMatcher>();
this.trainingDataList = trainingDataList;
//Create a markov model for each entry in trainingDataList
for (String i : trainingDataList)
{
modelList.add(new MarkovModel(k, i));
}
// For each Markov Model a Model Matcher
for (MarkovModel i : modelList)
{
matcherList.add(new ModelMatcher(i, testData));
}
//Find the best modelmatcher
ModelMatcher best = getBestMatch(matcherList);
String bestMatch = explainBestMatch(best);
}
/** @return a string containing all lines from a file
* @param the name of the file to import
*
* ff file contents can be got, otherwise null
* This method should process any exceptions that arise.
* @throws IOException
*/
private static String getFileContents(String filename) throws IOException
{
byte[] encoded = Files.readAllBytes(Paths.get(filename));
return new String(encoded);
}
/**
* @param an arraylist of ModelMatchers
* @return the ModelMatcher object that has the highest average loglikelihood
* (where all candidates are trained for the same test string
*/
public ModelMatcher getBestMatch(ArrayList<ModelMatcher> candidates)
{
ModelMatcher currentHighest = null;
for (ModelMatcher i : candidates)
{
ModelMatcher thisLog = i;
if (currentHighest == null)
{
currentHighest = i;
}
if (thisLog.getAverageLogLikelihood() > currentHighest.getAverageLogLikelihood())
{
currentHighest = thisLog;
}
}
return currentHighest;
}
/** @return String an *explanation* of
* why the test string is the match from the candidate models
*/
public String explainBestMatch(ModelMatcher best)
{
double normalProb = Math.pow(10, best.getAverageLogLikelihood());
String toRet = “This model was the best as the average likelihood was ” + normalProb + “n”;
return null;
}
/** Display an error to the user in a manner appropriate
* for the interface being used.
*
* @param message
*/
public void displayError(String message) {
// LEAVE THIS METHOD EMPTY
}
}
ModelMatcher.java
import java.util.HashMap;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.SortedSet;
import java.util.TreeSet;
public class ModelMatcher
{
/** log likelihoods for a teststring under a given model */
private HashMap<String,Double> logLikelihoodMap;
/** summary statistic for this setting */
private double averageLogLikelihood;
/**
* Constructor to initialise the fields for the log likelihood map for
* a test string and a given Markov model and
* the average log likelihood summary statistic
* @param MarkovModel model a given Markov model object
* @param String teststring the string to check compatability with the model
*/
public ModelMatcher(MarkovModel model, String testString)
{
logLikelihoodMap = new HashMap<String, Double>();
int kVal = model.getK();
String seq;
for (int curPos = 0; (curPos) < testString.length(); curPos++)
{
if (curPos + (kVal + 1) <= testString.length())
{
seq = testString.substring(curPos, (curPos + kVal + 1));
}
else
{
String fromEnd = testString.substring(curPos, (testString.length()));
String fromStart = testString.substring(0, ((kVal + 1) – fromEnd.length()));
seq = fromEnd + fromStart;
}
// seq should be of length k+1
// context should be the first k chars of seq
String context = seq.substring(0, kVal);
// impChar should be the last character of seq
String impChar = seq.substring(kVal, seq.length());
double loggedProb = Math.log10(model.laplaceEstimate(seq));
if (logLikelihoodMap.containsKey(seq))
{
logLikelihoodMap.put(seq, (logLikelihoodMap.get(seq) + loggedProb));
}
else
{
logLikelihoodMap.put(seq, loggedProb);
}
}
averageLogLikelihood = averageLogLikelihood(logLikelihoodMap, testString.length());
}
/** Helper method that calculates the average log likelihood statistic
* given a HashMap of strings and their Laplace probabilities
* and the total number of ngrams in the model.
*
* @param logs map of ngram strings and their log likelihood
* @param ngramCount int number of ngrams in the original test string
* @return average log likelihood: the total of loglikelihoods
* divided by the ngramCount
*/
private double averageLogLikelihood(HashMap<String,Double> logs, int ngramCount)
{
double totalLogs = 0.0;
for (String i : logs.keySet())
{
totalLogs += logs.get(i);
}
double avg = totalLogs/ngramCount;
return avg;
}
/** Helper method to calculate the total log likelihood statistic
* given a HashMap of strings and their Laplace probabilities
* and the total number of ngrams in the model.
*
* @param logs map of ngram strings and their log likelihood
* @return total log likelihood: the sum of loglikelihoods in logs
*/
private double totalLogLikelihood(HashMap<String,Double> logs)
{
double totalLogs = 0.0;
for (String i : logs.keySet())
{
totalLogs += logs.get(i);
}
return totalLogs;
}
/**
* @return the average log likelihood statistic
*/
public double getAverageLogLikelihood()
{
return averageLogLikelihood;
}
/**
* @param ngram String the ngram to find the log likelihood of.
* @return the log likelihood value for a given ngram from the input string
*/
public double getLogLikelihood(String ngram)
{
return (logLikelihoodMap.get(ngram));
}
/**
* Make a String summarising the log likelihood map and its statistics
* @return String of ngrams and their loglikeihood differences between the models
* The likelihood table should be ordered from highest to lowest likelihood
*/
public String toString()
{
SortedSet<String> keysArray = new TreeSet<String>(logLikelihoodMap.keySet());
String toRet = “”;
for (String key : keysArray)
{
double logLike = logLikelihoodMap.get(key);
String logLikeS = Double.toString(logLike);
String thisKey = (key + ” ” + logLikeS + “n”);
toRet += thisKey;
}
return toRet;
}
}