2024-08-10 01:24:54 -05:00
|
|
|
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Class used for training the text-generator, and a main method with a basic demo.
|
|
|
|
*/
|
|
|
|
public class MarkovWordGen {
|
|
|
|
private ArrayList<WordNode> wordNodes;
|
|
|
|
private int nGram;
|
|
|
|
|
|
|
|
public static void main(String abcd[]) {
|
|
|
|
|
|
|
|
/* Create a bot as a bigram model. */
|
2024-08-10 10:44:15 -05:00
|
|
|
String data = readFile("./Poe.txt");
|
2024-08-10 01:24:54 -05:00
|
|
|
MarkovWordGen bot = new MarkovWordGen(2);
|
|
|
|
bot.train(data);
|
|
|
|
System.out.println(bot.generate("The thousand", 1000));
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Simple constructor.
|
|
|
|
*/
|
|
|
|
public MarkovWordGen(int n) {
|
|
|
|
wordNodes = new ArrayList<>();
|
|
|
|
nGram = n;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Trains using the parameter "input" as the training data. The parameter "nGram"
|
|
|
|
* represents whether it is a unigram, bigram, trigram, etc. Anything more than
|
|
|
|
* trigram will probably not work well, though. The method returns an ArrayList of
|
|
|
|
* WordNodes that can be used to generate text based on a seed (a starting value
|
|
|
|
* like "The" for a unigram, or "The dog" for a bigram).
|
|
|
|
*/
|
|
|
|
public void train(String input) {
|
|
|
|
|
|
|
|
// Convert the one large string input into an array of tokens.
|
|
|
|
ArrayList<String> tokens = new ArrayList<>(Arrays.asList(input.split("\\s+")));
|
|
|
|
|
|
|
|
for(int i = 0; i < tokens.size(); i++) {
|
|
|
|
tokens.get(i).trim();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Iterate through the tokens to train.
|
|
|
|
for(int i = 0; i < tokens.size() - nGram; i++) {
|
|
|
|
|
|
|
|
// Combine nGram tokens to create a key.
|
|
|
|
String key = "";
|
|
|
|
for(int j = 0; j < nGram - 1; j++) {
|
|
|
|
key += tokens.get(i+j) + " ";
|
|
|
|
}
|
|
|
|
key += tokens.get(i + nGram-1);
|
|
|
|
|
|
|
|
String next = tokens.get(i+nGram); // The token after the key value.
|
2024-08-10 10:44:15 -05:00
|
|
|
|
2024-08-10 11:00:16 -05:00
|
|
|
// Skip over ##STOP##. This is used to separate independent pieces of training
|
2024-08-10 10:44:15 -05:00
|
|
|
// data. If you are training with Edgar Poe stories for example, you can put
|
|
|
|
// ##STOP## between stories, and it won't consider the transition from one
|
|
|
|
// to the next as a natural flow, and will train on each story independantly.
|
2024-08-10 01:24:54 -05:00
|
|
|
if(next.equals("##STOP##")) {
|
|
|
|
i += nGram+1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find the index of that key in out, if it exists. If not, leave index -1.
|
|
|
|
int index = -1;
|
|
|
|
for(int j = 0; j < wordNodes.size(); j++) {
|
|
|
|
WordNode n = wordNodes.get(j);
|
|
|
|
|
|
|
|
// If this key has appeared before...
|
|
|
|
if(n.getKey().equals(key)) {
|
|
|
|
|
|
|
|
// ...then record the index and stop looking for more.
|
|
|
|
index = j;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If this key hasn't been seen before...
|
|
|
|
if(index == -1) {
|
|
|
|
wordNodes.add(new WordNode(key, next)); // ...then add the key and value.
|
|
|
|
} else { // And if it has...
|
|
|
|
wordNodes.get(index).add(next); // ...then add this value to that WordNode.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-10 11:00:16 -05:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns a String generated from the markov chain.
|
|
|
|
*/
|
2024-08-10 01:24:54 -05:00
|
|
|
public String generate(String seed, int tokens) {
|
|
|
|
String[] context = seed.split(" ");
|
|
|
|
String out = seed + " ";
|
|
|
|
|
|
|
|
for(int i = 0; i < tokens; i++) {
|
|
|
|
// Make strContext, combining the array context into a single string.
|
|
|
|
String strContext = "";
|
|
|
|
for(int j = 0; j < context.length - 1; j++) {
|
|
|
|
strContext += context[j] + " ";
|
|
|
|
}
|
|
|
|
strContext += context[context.length - 1];
|
2024-08-10 11:00:16 -05:00
|
|
|
|
2024-08-10 01:24:54 -05:00
|
|
|
// See if strContext is the key to any WordNode.
|
|
|
|
for (int j = 0; j < wordNodes.size(); j++) {
|
|
|
|
if(wordNodes.get(j).getKey().equals(strContext)) {
|
|
|
|
// Make room for the new word in the context array.
|
|
|
|
for(int k = 0; k < context.length - 1; k++) {
|
|
|
|
context[k] = context[k+1];
|
|
|
|
}
|
|
|
|
|
|
|
|
String newWord = wordNodes.get(j).generateWord(); // Find the new word.
|
|
|
|
|
|
|
|
|
|
|
|
// Add the newly generated word to the context
|
|
|
|
context[context.length - 1] = newWord;
|
|
|
|
out += newWord + " ";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
2024-08-10 11:00:16 -05:00
|
|
|
/**
|
|
|
|
* Simple printout of wordNodes in a readable format. Useful for debugging.
|
|
|
|
*/
|
2024-08-10 01:24:54 -05:00
|
|
|
public void printNodes() {
|
|
|
|
for (WordNode n : wordNodes) {
|
|
|
|
System.out.println(n.getKey() + " - ");
|
|
|
|
for(int i = 0; i < n.getWordVals().size(); i++) {
|
|
|
|
System.out.println(" " + n.getWordVals().get(i) + " -- " + n.getNumVals().get(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-10 11:00:16 -05:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Reads a text file into one String and returns it.
|
|
|
|
*/
|
2024-08-10 01:24:54 -05:00
|
|
|
public static String readFile(String filePath) {
|
|
|
|
String out = "";
|
|
|
|
try {
|
|
|
|
Scanner scn = new Scanner(new File(filePath));
|
|
|
|
while(scn.hasNextLine()) {
|
|
|
|
out += scn.nextLine() + " ";
|
|
|
|
}
|
|
|
|
scn.close();
|
|
|
|
} catch(FileNotFoundException e) {
|
2024-08-10 10:44:15 -05:00
|
|
|
System.err.println("File not found.");
|
2024-08-10 01:24:54 -05:00
|
|
|
}
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
}
|