Question & Answer: Task 1 – Analysing n-grams in a sample text (NgramAnalyser). For this task, you will need to complete the NgramAnalyser class, and add code to the ProjectTest class…

Task 1 – Analysing n-grams in a sample text (NgramAnalyser). For this task, you will need to complete the NgramAnalyser class, and add code to the ProjectTest class. The NgramAnalyser class analyses an input string, passed to it in the constructor, and counts all the n-grams of letters that occur in the string. An n-gram is simply a (contiguous) sequence of n items from a piece of text; the items we will be considering for this class are characters. (One could also analyse n-grams of words, syllables, or even sentences.) For instance, a 2-gram (also called a bigram) is a pair of characters, a 3-gram is a triple of characters, and so on. By way of example, consider the following string: "the rain in Spain". The alphabet size is 10 (unique characters, including the space) and the 2-grams in the string are "th", "he", "e ", " r", "ra", "ai", "in", "n ", " i", "in", "n ", " S", "Sp", "pa", "ai", "in", "nt" (the final 2-gram wraps around from the end of the string to its start). If we remove duplicates, they are: "th", "he", "e ", " r", "ra", "ai", "in", "n ", " i", " S", "Sp", "pa", "nt". And if we count how often each 2-gram appears in the input string, we get the following results (2-gram: frequency): " S": 1, " i": 1, " r": 1, "Sp": 1, "ai": 2, "e ": 1, "he": 1, "in": 3, "n ": 2, "nt": 1, "pa": 1, "ra": 1, "th": 1 (Two-gram frequencies). The NgramAnalyser class is given a string as input to its constructor, and optionally an n-gram size n. It should analyse the n-grams in the input string, and record their frequencies in the hash-map ngram. It should also record the total number of distinct characters that appear in the input string (i.e., the "alphabet" used by the input string), and store this count in the field alphabetSize.

Expert Answer

ProjectTest.java

import static org.junit.Assert.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.util.Set;

public class ProjectTest
{
/**
* Default constructor for test class ProjectTest
*/
public ProjectTest() {

}

/**
* Sets up the test fixture.
*
* Called before every test case method.
*/
@Before
public void setUp() {
}

/**
* Tears down the test fixture.
*
* Called after every test case method.
*/
@After
public void tearDown() {

}

//TODO add new test cases from here include brief documentation

/**
* Test if the number of lines in a string output from Ngram.toString()
* is valid (i.e equal to the size of the alphabet of that Ngram)
* Also ensures that the sort, splice and constructor functions work
* as required to produce the required comparison
*/
@Test(timeout=1000)
public void testSensibleToStringSize() {

String[] stringsToTest = {“Hello my friend”,
“be”,
“Have a nice day you filthy animal”,
“asdfghjkl$$sdfghj%%”,
“2”,
“adadadadaaaaa”,
”    “};

Integer[] ngramSizesToTest = {1, 2, 3, 4, 5};

NgramAnalyser analysis;
String analysisString;

int i = ngramSizesToTest[0];
String s = stringsToTest[5];

if (i > s.length()) {
try {
analysis = new NgramAnalyser(i, s);
} catch (IllegalArgumentException e) {
assertEquals(0, 0);
}
} else {
analysis = new NgramAnalyser(i, s);
analysisString = analysis.toString();

//Number of lines is equal to the number of n’s plus 1
int numberofLines = analysisString.length() –
analysisString.replace(“n”, “”).length() + 1;

assert(numberofLines >= analysis.getAlphabetSize());

}
}

/**
* Tests various aspects of the getDistinctNgrams function
* inlcuding set length with comparison to basic boundaries
*/

@Test(timeout=1000)
public void testGetDistinctNgrams() {
String[] stringsToTest = {
“123!@#123!@#”,
“adadadadadadadad”,
“cadadcdaadcdbed”,
“aaaaaa”,
“HOWWEYVUFXBINEF”
};

String stringToTest = stringsToTest[0];
int ngramSize = 2;
NgramAnalyser analysis = new NgramAnalyser(ngramSize, stringToTest);

Set<String> distinctNgrams = analysis.getDistinctNgrams();
int distinctNgramCount = analysis.getDistinctNgramCount();
int totalNgramCount = analysis.getNgramCount();

//Test that there are fewer or equal distinct Ngrams than total Ngrams
assert(distinctNgramCount <= totalNgramCount);

//Test that there are fewer or equal distinct Ngrams than the size
//of the analysed string
assert(distinctNgramCount <= stringToTest.length());

//Test that the alphabet size is smaller than
//or equal to the number of distinct NGrams
assert(analysis.getAlphabetSize() <= distinctNgramCount);

}

/**
* Tests the NgramAnalyser function for more complicated and longer ngrams
*
*/
@Test(timeout=1000)
public void testNgramAnalyser() {

String stringToTest = “baaaaaaaaaamsdbfajeduhgtraaaab”;
int ngramSize = 16;
NgramAnalyser analysis = new NgramAnalyser(ngramSize, stringToTest);

//Test toString method
String toString = analysis.toString();
//System.out.println(toString); //REMOVE BEFORE SUBMITTING!!!!!

//Test that ngramCount = length of the string
assert(analysis.getNgramCount() == stringToTest.length());

}

@Test(timeout=1000)
public void testLaplaceExample() {
assertEquals(0,1); //TODO replace with test code
}

@Test(timeout=1000)
public void testSimpleExample() {
assertEquals(0,1); //TODO replace with test code
}

@Test
public void testTask3example()
{
MarkovModel model = new MarkovModel(2,”aabcabaacaac”);
ModelMatcher match = new ModelMatcher(model,”aabbcaac”);
assertEquals(0,1); //TODO replace with test code
}
}

MarkovModel.java

public class MarkovModel
{

/** Markov model order parameter */
int k;
/** ngram model of order k */
NgramAnalyser ngram;
/** ngram model of order k+1 */
NgramAnalyser n1gram;

/**
* Construct an order-k Markov model from string s
* @param k int order of the Markov model
* @param s String input to be modelled
*/
public MarkovModel(int k, String s)
{
ngram = new NgramAnalyser(k, s);
n1gram = new NgramAnalyser((k+1), s);
}

/**
* @return order of this Markov model
*/
public int getK()
{
return k;
}

/** Estimate the probability of a sequence appearing in the text
* using simple estimate of freq seq / frequency front(seq).
* @param sequence String of length k+1
* @return double probability of the last letter occurring in the
* context of the first ones or 0 if front(seq) does not occur.
*/
public double simpleEstimate(String sequence) {
double prob;
String seqNotLast = sequence.substring(0, sequence.length()-1);

if (ngram.getDistinctNgrams().contains(seqNotLast))
{
double n1g = n1gram.getNgramFrequency(sequence);
double ng = ngram.getNgramFrequency(seqNotLast);
try{
prob = (n1g/ng);
}
catch(ArithmeticException e){
return 0.0;
}
return prob;
}
else
{
return 0.0;
}
}
/**
* Calculate the Laplacian probability of string obs given this Markov model
* @input sequence String of length k+1
* @return Laplacian Probability
*/
public double laplaceEstimate(String sequence)
{
//TODO replace this line with your code
String context = sequence.substring(0, sequence.length()-1);
double npc = n1gram.getNgramFrequency(sequence);
double np = ngram.getNgramFrequency(context);
double laplace;
laplace = (npc + 1)/(np + ngram.getAlphabetSize());
return laplace;
}

/**
* @return String representing this Markov model
*/
public String toString()
{
//TODO replace this line with your code
String toRet = “”;
String k = Integer.toString(getK());
toRet += (k + “n”);
toRet += (Integer.toString(ngram.getAlphabetSize()) + “n”);
toRet += ngram.toString() + n1gram.toString();
return toRet;
}

}

NgramAnalyser.java

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;

import java.util.HashSet;
import java.util.Arrays;

//packages for iterating through hashmap
import java.util.Map;
import java.util.Iterator;

public class NgramAnalyser
{
/** dictionary of all distinct n-grams and their frequencies */
private HashMap<String,Integer> ngram;

/** number of distinct characters in the input */
private int alphabetSize;

/** n-gram size for this object (new field) */
private int ngramSize;

/** input length for this object (new field) */
private int inputLength;

/**
* Analyse the frequency with which distinct n-grams, of length n,
* appear in an input string.
* n-grams at the end of the string wrap to the front
* e.g. “abbbbc” includes “bca” and “cab” in its 3-grams
* @param int n size of n-grams to create
* @param String inp input string to be modelled
*/
public NgramAnalyser(int n, String inp) {
if(inp != null && inp != “” && n > 0 && n <= inp.length()) {
this.ngramSize = n;
this.inputLength = inp.length();
this.ngram = new HashMap<>(inp.length(), inp.length());
for (int i = 0; i < inp.length(); i++) { //loops through each character in inp
String currentNGram = “”; //new nGram starting at ith position
for (int j = i; j-i < n ; j++) { //starting from the ith character, loop n characters after this
currentNGram = currentNGram.concat(inp.substring(j%inp.length(), j%inp.length()+1)); //concatonates the jth char to currNGram
}
if (ngram.containsKey(currentNGram)) { //if the ngram exists, add one to its frequency
ngram.put(currentNGram, ngram.get(currentNGram) +1);
} else {
ngram.put(currentNGram, 1); //otherwise create a key for this ngram
}
}

//Prints the ngram
/*
Set set = ngram.entrySet();
Iterator iterator = set.iterator();
while(iterator.hasNext()) {
Map.Entry mentry = (Map.Entry)iterator.next();
System.out.print(“key is: “+ mentry.getKey() + ” & Value is: “);
System.out.println(mentry.getValue());
}
*/

//Alphabet size calculation
if (n != 1) {
NgramAnalyser alpha = new NgramAnalyser(inp);
this.alphabetSize = alpha.getDistinctNgramCount(); //find alphabet size by getting number of distinct 1-grams
} else {
this.alphabetSize = this.getDistinctNgramCount(); // 1-grams are simply a list of distinct characters, also bottoms recursion.
}
} else {
throw new IllegalArgumentException(“ngram size must be between 1 and the length of the input string. Input string must not be null or empty.”);
}
}

/**
* Analyses the input text for n-grams of size 1.
*/
public NgramAnalyser(String inp) {
this(1,inp);
}

/**
* @return int the size of the alphabet of a given input
*/
public int getAlphabetSize() {
return alphabetSize;
}

/**
* @return the total number of distinct n-grams appearing
*         in the input text.
*/
public int getDistinctNgramCount() {
//TODO replace this line with your code
return ngram.size();
}

/**
* @return Return a set containing all the distinct n-grams
*         in the input string.
*/
public Set<String> getDistinctNgrams() {
//TODO replace this line with your code
return ngram.keySet();
}

/**
* @return the total number of n-grams appearing
*         in the input text (not requiring them to be distinct)
*/
public int getNgramCount() {
//TODO replace this line with your code
return this.inputLength;
}

/** Return the frequency with which a particular n-gram appears
* in the text. If it does not appear at all, return 0.
*
* @param ngram The n-gram to get the frequency of
* @return The frequency with which the n-gram appears.
*/
public int getNgramFrequency(String ngram) {
//TODO replace this line with your code
return this.ngram.get(ngram);
}

/**
* Generate a summary of the ngrams for this object.
* @return a string representation of the n-grams in the input text
* comprising the ngram size and then each ngram and its frequency
* where ngrams are presented in alphabetical order.
*/
public String toString() {
//TODO replace this line with your code
String[] keys = ngram.keySet().toArray(new String[0]);
Arrays.sort(keys);
Integer a = ngramSize;
String answer = a.toString();
for (int i =0; i < ngram.keySet().size();i++) {
answer = answer.concat(“n” + keys[i] + ” “);
answer = answer.concat(this.getNgramFrequency(keys[i]) + “”);
}
return answer;
}

}

Still stressed from student homework?
Get quality assistance from academic writers!