Predictive models of text: performing text analysis
<<MarkovModel.java>>
import java.util.Set;
/**
 * Skeleton of a Markov model of order k based on an input string.
 * The constructor, both estimate methods and toString are placeholders
 * (marked TODO) that still have to be implemented; only getK is functional.
 *
 * @author
 * @version
 */
public class MarkovModel
{
/** Markov model order parameter */
int k;
/** ngram model of order k */
NgramAnalyser ngram;
/** ngram model of order k+1 */
NgramAnalyser n1gram;
/**
 * Construct an order-k Markov model from string s.
 * Placeholder: the body is empty, so none of the fields are assigned yet.
 * @param k int order of the Markov model
 * @param s String input to be modelled
 */
public MarkovModel(int k, String s)
{
//TODO replace this line with your code
}
/**
 * @return order of this Markov model (the current value of the k field)
 */
public int getK()
{
return k;
}
/**
 * Estimate the probability of a sequence appearing in the text
 * using the simple estimate of freq(seq) / freq(front(seq)).
 * Placeholder: currently always returns -1.0.
 * @param sequence String of length k+1
 * @return double probability of the last letter occurring in the
 * context of the first ones, or 0 if front(seq) does not occur
 */
public double simpleEstimate(String sequence) {
//TODO replace this line with your code
return -1.0;
}
/**
 * Calculate the Laplacian probability of a string given this Markov model.
 * Placeholder: currently always returns -1.0.
 * @param sequence String of length k+1
 * @return the Laplace-smoothed probability estimate once implemented
 */
public double laplaceEstimate(String sequence)
{
//TODO replace this line with your code
return -1.0;
}
/**
 * Placeholder: currently always returns null.
 * @return String representing this Markov model
 */
public String toString()
{
//TODO replace this line with your code
return null;
}
}
————————————————————————————————————————–
————————————————————————————————————————–
<<ModelMatcher.java>>
import java.util.HashMap;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Report the average log likelihood of a test String occurring in a
 * given Markov model and detail the calculated values behind this statistic.
 * Skeleton: the constructor, both helper methods and toString are
 * placeholders (marked TODO) that still have to be implemented.
 *
 * @author
 * @version
 */
public class ModelMatcher
{
/** log likelihoods for a test string under a given model */
private HashMap<String,Double> logLikelihoodMap;
/** summary statistic for this setting */
private double averageLogLikelihood;
/**
 * Constructor to initialise the fields for the log likelihood map for
 * a test string and a given Markov model and
 * the average log likelihood summary statistic.
 * Placeholder: the body is empty, so logLikelihoodMap is still null and
 * averageLogLikelihood is still 0.0 after construction.
 * @param model a given Markov model object
 * @param testString the test string to score against the model
 */
public ModelMatcher(MarkovModel model, String testString)
{
//TODO
}
/**
 * Helper method that calculates the average log likelihood statistic
 * given a HashMap of strings and their Laplace probabilities
 * and the total number of ngrams in the model.
 * Placeholder: currently always returns 0.1.
 *
 * @param logs map of ngram strings and their log likelihood
 * @param ngramCount int number of ngrams in the original test string
 * @return average log likelihood: the total of log likelihoods
 * divided by the ngramCount
 */
private double averageLogLikelihood(HashMap<String,Double> logs, int ngramCount)
{
//TODO
return 0.1;
}
/**
 * Helper method to calculate the total log likelihood statistic
 * given a HashMap of strings and their Laplace probabilities.
 * Placeholder: currently always returns 0.1.
 *
 * @param logs map of ngram strings and their log likelihood
 * @return total log likelihood: the sum of log likelihoods in logs
 */
private double totalLogLikelihood(HashMap<String,Double> logs)
{
//TODO
return 0.1;
}
/**
 * @return the average log likelihood statistic
 */
public double getAverageLogLikelihood()
{
return averageLogLikelihood;
}
/**
 * Look up the stored log likelihood of one ngram.
 * NOTE: the map value is a Double that is unboxed on return, so this throws
 * a NullPointerException when the ngram has no entry (or the map is unset).
 * @param ngram the ngram to look up
 * @return the log likelihood value for a given ngram from the input string
 */
public double getLogLikelihood(String ngram)
{
return (logLikelihoodMap.get(ngram));
}
/**
 * Make a String summarising the log likelihood map and its statistics.
 * Placeholder: currently always returns null.
 * @return String of ngrams and their log likelihood differences between the models.
 * The likelihood table should be ordered from highest to lowest likelihood
 */
public String toString()
{
//TODO
return null;
}
}
————————————————————————————————————————–
————————————————————————————————————————–
<<MatcherController.java>>
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.*;
/**
 * Create and manipulate Markov models and model matchers for lists of training data and
 * a test data String, and generate output from them for convenient display.
 * Skeleton: every method body is a placeholder (marked TODO) except
 * displayError, which is meant to stay empty.
 *
 * @author
 * @version
 *
 */
public class MatcherController {
/** list of training data strings used to generate Markov models */
ArrayList<String> trainingDataList;
/** test data to be matched with the models */
String testData;
/** order of the Markov models */
int k;
/** generated list of Markov models for the given training data */
ArrayList<MarkovModel> modelList;
/** generated list of matchers for the given Markov models and test data */
ArrayList<ModelMatcher> matcherList;
/**
 * Generate models for analysis.
 * Placeholder: the body is empty, so no field is assigned yet.
 * The finished constructor is expected to throw an unchecked exception
 * if the input order or data inputs are invalid.
 * @param k order of the Markov models to be used
 * @param trainingDataList training strings, one model per string
 * @param testData String to check against different models
 */
public MatcherController(int k, ArrayList<String> trainingDataList, String testData)
{
//TODO
}
/**
 * Placeholder: currently always returns null.
 * This method should process any exceptions that arise.
 * @param filename name of the file to read
 * @return a string containing all lines from a file
 * if the file contents can be read, otherwise null
 */
private static String getFileContents(String filename) {
//TODO
return null;
}
/**
 * Placeholder: currently always returns null.
 * @param candidates matchers to choose from
 * @return the ModelMatcher object that has the highest average log likelihood
 * (where all candidates are trained for the same test string)
 */
public ModelMatcher getBestMatch(ArrayList<ModelMatcher> candidates)
{
//TODO
return null;
}
/**
 * Placeholder: currently always returns null.
 * @param best the matcher regarded as the best match
 * @return String an *explanation* of
 * why the test string is the match from the candidate models
 */
public String explainBestMatch(ModelMatcher best) {
//TODO
return null;
}
/**
 * Display an error to the user in a manner appropriate
 * for the interface being used.
 *
 * @param message the error text to show
 */
public void displayError(String message) {
// LEAVE THIS METHOD EMPTY
}
}
————————————————————————————————————————–
————————————————————————————————————————–
<<ProjectTest.java>>
import static org.junit.Assert.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
* The test class ProjectTest for student test cases.
* Add all new test cases to this task.
*
* @author
* @version
*/
public class ProjectTest
{
/**
* Default constructor for test class ProjectTest
*/
public ProjectTest()
{
}
/**
* Sets up the test fixture.
*
* Called before every test case method.
*/
@Before
public void setUp()
{
}
/**
* Tears down the test fixture.
*
* Called after every test case method.
*/
@After
public void tearDown()
{
}
//TODO add new test cases from here include brief documentation
@Test(timeout=1000)
public void testLaplaceExample() {
assertEquals(0,1); //TODO replace with test code
}
@Test(timeout=1000)
public void testSimpleExample() {
assertEquals(0,1); //TODO replace with test code
}
@Test
public void testTask3example()
{
MarkovModel model = new MarkovModel(2,”aabcabaacaac”);
ModelMatcher match = new ModelMatcher(model,”aabbcaac”);
assertEquals(0,1); //TODO replace with test code
}
}
Expert Answer
=========================================================================
import java.util.Set;
/**
* Construct a Markov model of order /k/ based on an input string.
*
*/
public class MarkovModel
{
/** Markov model order parameter */
int k;
/** ngram model of order k */
NgramAnalyser ngram;
/** ngram model of order k+1 */
NgramAnalyser n1gram;
// probability of the next ngram occuring
double probability;
//the size of the alphabet
int alphSize;
/**
* Construct an order-k Markov model from string s
* @param k int order of the Markov model
* @param s String input to be modelled
*/
public MarkovModel(int k, String s)
{
this.k = k;
ngram = new NgramAnalyser (k, s);
n1gram = new NgramAnalyser (k+1, s);
}
/**
* @return order of this Markov model
*/
public int getK()
{
return this.k;
}
/**
* Estimate the probability of a sequence appearing in the text
* using simple estimate of freq seq / frequency front(seq).
* @param sequence String of length k+1
* @return double probability of the last letter occuring in the
* context of the first ones or 0 if front(seq) does not occur.
*/
public double simpleEstimate(String sequence) {
int sequenceFreq = n1gram.getNgramFrequency(sequence);
int preFreq = ngram.getNgramFrequency(sequence.substring(0,sequence.length()-1));
if (sequenceFreq == 0){
probability = 0;
} else{
probability = (double) sequenceFreq / (double) preFreq;
}
return probability;
}
/**
* Calculate the Laplacian probability of string obs given this Markov model
* @input sequence String of length k+1
*/
public double laplaceEstimate(String sequence)
{
int sequenceFreq = n1gram.getNgramFrequency(sequence);
int preFreq = ngram.getNgramFrequency(sequence.substring(0,sequence.length()-1));
alphSize = ngram.getAlphabetSize();
probability = ((double) (sequenceFreq + 1)) / ((double) (preFreq + alphSize));
return probability;
}
/**
* @return String representing this Markov model
*/
public String toString()
{
String result = “The ” + k + ” order of the Markov Modeln” + “alphabet size of ” + alphSize + “n”;
result += ngram.toString() + n1gram.toString();
System.out.println(result);
return result;
}
}
============================================================================================================
//MatcherController.java
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.*;
/**
* Create and manipulate Markov models and model matchers for lists of training data
* a test data String and generate output from it for convenient display.
*
*
*/
public class MatcherController {
/** list of training data string used to generate markov models */
ArrayList<String> trainingDataList;
/** test data to be matched with the models */
String testData;
/** order of the markov models */
int k;
/** generated list of markov models for the given training data */
ArrayList<MarkovModel> modelList;
/** generated list of matchers for the given markov models and test data */
ArrayList<ModelMatcher> matcherList;
/** best ModelMatcher for the given testData string */
ModelMatcher bestModelMatcher;
/**
 * Build one Markov model and one model matcher per training string and
 * remember the best matcher for the test data.
 * The inputs are first passed to checkInputs (defined outside this view;
 * presumably it rejects invalid input with an unchecked exception - confirm).
 * @param k order of the markov models to be used
 * @param trainingDataList training strings, one model is generated for each
 * @param testData String to check against different models
 */
public MatcherController(int k, ArrayList<String> trainingDataList, String testData) {
    this.checkInputs(k, trainingDataList, testData);

    this.k = k;
    this.testData = testData;
    this.trainingDataList = trainingDataList;
    this.modelList = new ArrayList<MarkovModel>();
    this.matcherList = new ArrayList<ModelMatcher>();

    // modelList and matcherList stay index-aligned with trainingDataList
    for (int index = 0; index < this.trainingDataList.size(); index++) {
        MarkovModel model = new MarkovModel(k, this.trainingDataList.get(index));
        ModelMatcher matcher = new ModelMatcher(model, testData);
        this.modelList.add(model);
        this.matcherList.add(matcher);
    }

    this.bestModelMatcher = this.getBestMatch(this.matcherList);
}
/**
* @return a string containing all lines from a file
* if file contents can be got, otherwise null
* This method should process any exceptions that arise.
*/
private static String getFileContents(String filename) {
try {
String outputString = “”;
ArrayList<String> fileLines = FileIO.readFile(filename);
for (String fileLine : fileLines) {
outputString = outputString + fileLine;
}
return outputString;
} catch(FileNotFoundException e) {
//TODO
} catch(IOException e) {
//TODO
}
return null;
}
/**
 * Pick the matcher with the highest average log likelihood.
 * When several candidates share the highest value, the first of them wins.
 * @param candidates matchers to choose from
 * (where all candidates are trained for the same test string)
 * @return the ModelMatcher object that has the highest average loglikelihood
 * @throws IndexOutOfBoundsException if candidates is empty
 */
public ModelMatcher getBestMatch(ArrayList<ModelMatcher> candidates) {
    // Seed with the first candidate's real value. Using 0 as an "unset"
    // marker would wrongly discard a candidate whose average really is 0.0,
    // which is the best score a log likelihood can reach.
    ModelMatcher bestMatcher = candidates.get(0);
    double bestLikelihood = bestMatcher.getAverageLogLikelihood();
    for (int i = 1; i < candidates.size(); i++) {
        ModelMatcher matcher = candidates.get(i);
        double likelihood = matcher.getAverageLogLikelihood();
        if (likelihood > bestLikelihood) {
            bestLikelihood = likelihood;
            bestMatcher = matcher;
        }
    }
    return bestMatcher;
}
/**
* @return String an *explanation* of
* why the test string is the match from the candidate models
* Prints a bar chart of each loglikelihood relative to the lowest loglikelihood
* Table rows are based on the negative inverse of each loglikelihood
* to produce a proportional and increasing group of values.
* Loglikelihoods are first modified as previously stated (inverse and sign changed)
* These modified bar likelihoods are then converted into rations (barNumbers)
* Also shows numerical values next to each table with best one labelled
*/
public String explainBestMatch(ModelMatcher best) {
ArrayList<Double> modifiedLoglikelihoods = new ArrayList<Double>();
ArrayList<Double> normalLoglikelihoods = new ArrayList<Double>();
ArrayList<Long> barNumbers = new ArrayList<Long>();
Double lowestLikelihood = 0.0;
Double loglikelihood = 0.0;
String outputString = “”;
//Retrieve loglikelihoods from matcher array
for (ModelMatcher matcher : this.matcherList) {
loglikelihood = matcher.getAverageLogLikelihood();
normalLoglikelihoods.add(loglikelihood);
//Modify likelihoods and add them to modified array
loglikelihood = -(1 / loglikelihood);
modifiedLoglikelihoods.add(loglikelihood);
}
//Find the lowest likelihood to scale bar numbers against
for (Double scaledLikelihood : modifiedLoglikelihoods) {
if (lowestLikelihood == 0.0 || scaledLikelihood < lowestLikelihood) {
lowestLikelihood = scaledLikelihood;
}
}
Double relativeLikelihood;
Long barNumber;
//Get ratio of modified likelihoods to lowest likelihood and add to barNumbers
for (int i = 0; i < modifiedLoglikelihoods.size(); i++) {
relativeLikelihood = modifiedLoglikelihoods.get(i);
barNumber = Math.round(relativeLikelihood / lowestLikelihood);
barNumbers.add(i, barNumber);
}
outputString += “Table of Average Likelihoods for each Text Souce:”;
Double actualLikelihood;
Long barScaleValue = 1l;
//Ensure the bar lengths aren’t too large
//barScaleValue scales back each barnumber by a whole factor
Long largestBarLength =
Math.round((-(1/best.getAverageLogLikelihood()))/lowestLikelihood);
while (largestBarLength > 25) {
barScaleValue += 1;
largestBarLength = (long)Math.round(largestBarLength / barScaleValue);
}
//Format and create chart
for (int i = 0; i < barNumbers.size(); i++) {
barNumber = barNumbers.get(i);
actualLikelihood = normalLoglikelihoods.get(i);
outputString += “n”;
outputString += String.format(“%.5g”, actualLikelihood);
if (this.matcherList.get(i) == best) {
outputString += “##|”;
} else {
outputString += ” |”;
}
//Prints out barNumber many ‘-‘ characters for rows of chart.
int numberOfDashes = ((int) (long) barNumber)/((int) (long)barScaleValue);
outputString += new String(new char[numberOfDashes]).replace(“