Analysing n-grams in a sample text (NgramAnalyser)
<<NgramAnalyser.java>>
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
/**
* Perform n-gram analysis of a string.
*
* Analyses the frequency with which distinct n-grams, of length n,
* appear in an input string. For the purposes of all analyses of the input
* string, the final n-1 n-grams appearing in the string should be
* “filled out” to a length of n characters, by adding
* a sequence of contiguous characters from the start of the string.
* e.g. “abbc” includes “bca” and “cab” in its 3-grams
*
* @author
* @version
*/
public class NgramAnalyser
{
/** Dictionary of all distinct n-grams and their frequencies. */
private HashMap<String,Integer> ngram;
/** Number of distinct characters in the input. */
private int alphabetSize;
/** n-gram size for this object (new field). */
private int ngramSize;
/**
* Analyse the frequency with which distinct n-grams, of length n,
* appear in an input string.
* n-grams at the end of the string wrap to the front,
* e.g. "abbbbc" includes "bca" and "cab" in its 3-grams.
* <p>
* Exercise stub: the body is currently empty, so none of the fields
* above is initialised by this constructor yet.
* @param n size of n-grams to create
* @param inp input string to be modelled
*/
public NgramAnalyser(int n, String inp)
{
//TODO replace this line with your code
}
/**
* Analyses the input text for n-grams of size 1.
* Delegates to {@link #NgramAnalyser(int, String)} with n = 1.
* @param inp input string to be modelled
*/
public NgramAnalyser(String inp)
{
this(1,inp);
}
/**
* Exercise stub: currently returns the placeholder value -1.
* @return int the size of the alphabet of a given input
*/
public int getAlphabetSize() {
//TODO replace this line with your code
return -1;
}
/**
* Exercise stub: currently returns the placeholder value -1.
* @return the total number of distinct n-grams appearing
* in the input text.
*/
public int getDistinctNgramCount() {
//TODO replace this line with your code
return -1;
}
/**
* Exercise stub: currently returns the placeholder value null.
* @return Return a set containing all the distinct n-grams
* in the input string.
*/
public Set<String> getDistinctNgrams() {
//TODO replace this line with your code
return null;
}
/**
* Exercise stub: currently returns the placeholder value -1.
* @return the total number of n-grams appearing
* in the input text (not requiring them to be distinct)
*/
public int getNgramCount() {
//TODO replace this line with your code
return -1;
}
/** Return the frequency with which a particular n-gram appears
* in the text. If it does not appear at all, return 0.
* Exercise stub: currently returns the placeholder value -1.
*
* @param ngram The n-gram to get the frequency of
* @return The frequency with which the n-gram appears.
*/
public int getNgramFrequency(String ngram) {
//TODO replace this line with your code
return -1;
}
/**
* Generate a summary of the ngrams for this object.
* Exercise stub: currently returns the placeholder value null.
* @return a string representation of the n-grams in the input text
* comprising the ngram size and then each ngram and its frequency
* where ngrams are presented in alphabetical order.
*/
public String toString()
{
//TODO replace this line with your code
return null;
}
}
————————————————————————————————————————–
————————————————————————————————————————–
<<ProjectTest.java>>
import static org.junit.Assert.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
* The test class ProjectTest for student test cases.
* Add all new test cases to this task.
*
* @author
* @version
*/
public class ProjectTest
{
/**
* Default constructor for test class ProjectTest
*/
public ProjectTest()
{
}
/**
* Sets up the test fixture.
*
* Called before every test case method.
* Currently empty: no shared fixture is created.
*/
@Before
public void setUp()
{
}
/**
* Tears down the test fixture.
*
* Called after every test case method.
* Currently empty: there is nothing to release.
*/
@After
public void tearDown()
{
}
//TODO add new test cases from here include brief documentation
/**
* Placeholder test: assertEquals(0,1) always fails until it is
* replaced with real test code. Limited to 1000 ms by the timeout.
*/
@Test(timeout=1000)
public void testSensibleToStringSize() {
assertEquals(0,1); //TODO replace with test code
}
/**
* Placeholder test: assertEquals(0,1) always fails until it is
* replaced with real test code. Limited to 1000 ms by the timeout.
*/
@Test(timeout=1000)
public void testGetDistinctNgrams() {
assertEquals(0,1); //TODO replace with test code
}
}
Expert Answer
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Map;
import java.util.TreeMap;
/**
* Perform n-gram analysis of a string.
*
* Analyses the frequency with which distinct n-grams, of length n,
* appear in an input string. For the purposes of all analyses of the input
* string, the final n-1 n-grams appearing in the string should be
* “filled out” to a length of n characters, by adding
* a sequence of contiguous characters from the start of the string.
* e.g. “abbc” includes “bca” and “cab” in its 3-grams
*
* @author your_full_name
* @version current_version
*/
public class NgramAnalyser
{
/** dictionary of all distinct n-grams and their frequencies */
private HashMap<String,Integer> ngram;
/** number of distinct characters in the input */
private int alphabetSize;
/** n-gram size for this object (new field) */
private int ngramSize;
/**
* Analyse the frequency with which distinct n-grams, of length n,
* appear in an input string.
* n-grams at the end of the string wrap to the front
* e.g. “abbbbc” includes “bca” and “cab” in its 3-grams
* @param int n size of n-grams to create
* @param String inp input string to be modelled
*/
public NgramAnalyser(int n, String inp)
{
if(inp==null || inp.length()==0 || n==0 || n>inp.length()){
throw new IllegalArgumentException();
}
ngram = new HashMap<String,Integer>();
int inpLength = inp.length();
//Adding extra wrap at the end of string so that substrings for
//later starting positions can be found easily
inp = inp + inp.substring(0,n-1);
ArrayList<Character> unique_chars = new ArrayList<Character>();
for (int i=0; i<inpLength; i++) {
String temp = inp.substring(i,i+n);
Integer freq = ngram.get(temp);
if(freq != null){
//String already in the Hashmap then increase frequency by 1
ngram.put(temp,freq+1);
}else{
//String not already present in Hashmap then initiazlize with 1 as frequency
ngram.put(temp,1);
}
if( !unique_chars.contains( inp.charAt(i) ) ){
//Check if this character is not seen then add it to unique_chars list
unique_chars.add( inp.charAt(i) );
}
}
alphabetSize = unique_chars.size();
ngramSize = ngram.size();
}
/**
* Analyses the input text for n-grams of size 1.
*/
public NgramAnalyser(String inp)
{
this(1,inp);
}
/**
* @return int the size of the alphabet of a given input
*/
public int getAlphabetSize() {
return alphabetSize;
}
/**
* @return the total number of distinct n-grams appearing
* in the input text.
*/
public int getDistinctNgramCount() {
return ngramSize;
}
/**
* @return Return a set containing all the distinct n-grams
* in the input string.
*/
public Set<String> getDistinctNgrams() {
//TODO replace this line with your code
return null;
}
/**
* @return the total number of n-grams appearing
* in the input text (not requiring them to be distinct)
*/
public int getNgramCount() {
int count = 0;
for(String key: ngram.keySet()){
count = count + ngram.get(key);
}
return count;
}
/** Return the frequency with which a particular n-gram appears
* in the text. If it does not appear at all, return 0.
*
* @param ngram The n-gram to get the frequency of
* @return The frequency with which the n-gram appears.
*/
public int getNgramFrequency(String ngram) {
Integer freq = this.ngram.get(ngram);
if(freq!=null){
return freq;
}else{
return 0;
}
}
/**
* Generate a summary of the ngrams for this object.
* @return a string representation of the n-grams in the input text
* comprising the ngram size and then each ngram and its frequency
* where ngrams are presented in alphabetical order.
*/
public String toString()
{
Map<String, Integer> treeMap = new TreeMap<String, Integer>(ngram);
String return_string = Integer.toString(ngramSize) + “n”;
for(String key: treeMap.keySet()){
return_string = return_string + key + ” ” + Integer.toString(ngram.get(key)) + “n”;
}
return return_string;
}
}
<<ProjectTest.java>>
import static org.junit.Assert.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
* The test class ProjectTest for student test cases.
* Add all new test cases to this task.
*
* @author
* @version
*/
public class ProjectTest
{
/**
* Default constructor for test class ProjectTest
*/
public ProjectTest()
{
}
/**
* Sets up the test fixture.
*
* Called before every test case method.
*/
@Before
public void setUp()
{
}
/**
* Tears down the test fixture.
*
* Called after every test case method.
*/
@After
public void tearDown()
{
}
//TODO add new test cases from here include brief documentation
@Test(timeout=1000)
public void testSensibleToStringSize() {
String input_string = “abbc”;
int input_n = 3;
NgramAnalyser ng = new NgramAnalyser(input_n,input_string);
int num_lines = ng.toString().split(“n”).length;
assertTrue(num_lines>=1+ng.getAlphabetSize());
}
@Test(timeout=1000)
public void testGetDistinctNgrams() {
assertEquals(0,1); //TODO replace with test code
}
}