MarkovTextGen/MarkovWordGen.java

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Word-level Markov chain text generator.
 *
 * <p>An instance is created for a fixed n-gram size, trained on whitespace-separated
 * text with {@link #train(String)}, and then asked to produce text with
 * {@link #generate(String, int)}. The {@code main} method is a basic demo.
 */
public class MarkovWordGen {
    /** Token that separates independent pieces of training data. */
    private static final String STOP_TOKEN = "##STOP##";

    /** Learned nodes indexed by their key, kept in order of first appearance. */
    private final Map<String, WordNode> wordNodes;

    /** Number of consecutive tokens that make up one key. */
    private final int nGram;

    /**
     * Demo: trains a bigram model on {@code ./Poe.txt} and prints generated text.
     */
    public static void main(String[] args) {
        /* Create a bot as a bigram model. */
        String data = readFile("./Poe.txt");
        MarkovWordGen bot = new MarkovWordGen(2);
        bot.train(data);
        System.out.println(bot.generate("The thousand", 1000));
    }

    /**
     * Creates an untrained generator.
     *
     * @param n number of tokens per key (1 = unigram, 2 = bigram, ...)
     * @throws IllegalArgumentException if {@code n} is less than 1
     */
    public MarkovWordGen(int n) {
        if (n < 1) {
            throw new IllegalArgumentException("n-gram size must be at least 1, got " + n);
        }
        wordNodes = new LinkedHashMap<>();
        nGram = n;
    }

    /**
     * Trains the model on {@code input}. May be called repeatedly; every call adds
     * to what has already been learned.
     *
     * <p>The input is split on whitespace. For every run of {@code nGram} consecutive
     * tokens (joined by single spaces to form a key) the token that follows is
     * recorded in the key's {@code WordNode}.
     *
     * <p>The token {@code ##STOP##} separates independent pieces of training data
     * (for example, individual stories). Each piece is trained on its own: the
     * marker is never part of a key, is never recorded as a following token, and no
     * transition is learned from the end of one piece to the start of the next.
     *
     * @param input the training text
     */
    public void train(String input) {
        List<String> segment = new ArrayList<>();

        // trim() removes leading whitespace so split() does not yield a leading
        // empty token; the isEmpty() check covers empty or all-whitespace input.
        for (String token : input.trim().split("\\s+")) {
            if (token.equals(STOP_TOKEN)) {
                trainSegment(segment);
                segment.clear();
            } else if (!token.isEmpty()) {
                segment.add(token);
            }
        }
        trainSegment(segment); // Whatever follows the last marker (or everything).
    }

    /** Records every (key, following token) pair found in one marker-free segment. */
    private void trainSegment(List<String> segment) {
        for (int i = 0; i + nGram < segment.size(); i++) {
            String key = joinWithSpaces(segment.subList(i, i + nGram));
            String next = segment.get(i + nGram); // The token after the key.

            WordNode node = wordNodes.get(key);
            if (node == null) {
                wordNodes.put(key, new WordNode(key, next)); // First time this key is seen.
            } else {
                node.add(next);
            }
        }
    }

    /** Joins the given words with single spaces, without a trailing space. */
    private static String joinWithSpaces(List<String> words) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < words.size(); i++) {
            if (i > 0) {
                joined.append(' ');
            }
            joined.append(words.get(i));
        }
        return joined.toString();
    }

    /**
     * Generates text starting from {@code seed}.
     *
     * <p>The seed is split on single spaces and used as the initial context, so it
     * must consist of exactly {@code nGram} words separated by single spaces for it
     * to match a trained key. Each step looks up the current context, asks the
     * matching {@code WordNode} for a word, appends it to the output and slides the
     * context forward by one word. Generation ends early once the context matches
     * no trained key, because the context can no longer change after that.
     *
     * @param seed   the starting words
     * @param tokens the maximum number of words to generate
     * @return the seed followed by the generated words; each word, including the
     *         last one, is followed by a single space
     */
    public String generate(String seed, int tokens) {
        String[] context = seed.split(" ");
        StringBuilder out = new StringBuilder(seed).append(' ');

        for (int i = 0; i < tokens; i++) {
            WordNode node = wordNodes.get(joinWithSpaces(Arrays.asList(context)));
            if (node == null) {
                break; // Unknown context: no later iteration could match either.
            }

            String newWord = node.generateWord();

            // Slide the context window: drop the oldest word, append the new one.
            System.arraycopy(context, 1, context, 0, context.length - 1);
            context[context.length - 1] = newWord;
            out.append(newWord).append(' ');
        }
        return out.toString();
    }

    /**
     * Prints every learned key to standard output, followed by one indented line per
     * entry pairing {@code getWordVals().get(i)} with {@code getNumVals().get(i)}.
     * NOTE(review): presumably these are the recorded following words and their
     * counts - confirm against WordNode.
     */
    public void printNodes() {
        for (WordNode n : wordNodes.values()) {
            System.out.println(n.getKey() + " - ");
            for (int i = 0; i < n.getWordVals().size(); i++) {
                System.out.println("    " + n.getWordVals().get(i) + " -- " + n.getNumVals().get(i));
            }
        }
    }

    /**
     * Reads a whole text file into one string, replacing each line break with a
     * single space.
     *
     * @param filePath path of the file to read
     * @return the file contents, or an empty string if the file does not exist (a
     *         message is printed to standard error in that case)
     */
    public static String readFile(String filePath) {
        StringBuilder out = new StringBuilder();
        // try-with-resources closes the scanner even if reading fails part-way.
        try (Scanner scn = new Scanner(new File(filePath))) {
            while (scn.hasNextLine()) {
                out.append(scn.nextLine()).append(' ');
            }
        } catch (FileNotFoundException e) {
            System.err.println("File not found.");
        }
        return out.toString();
    }
}
Initial commit 2024-08-10 01:24:54 -05:00			`import java.util.ArrayList;`
			`import java.util.Arrays;`
			`import java.io.File;`
			`import java.io.FileNotFoundException;`
			`import java.util.Scanner;`

			`/**`
			`* Class used for training the text-generator, and a main method with a basic demo.`
			`*/`
			`public class MarkovWordGen {`
			`private ArrayList<WordNode> wordNodes;`
			`private int nGram;`

			`public static void main(String abcd[]) {`

			`/* Create a bot as a bigram model. */`
Added training data to git repo. 2024-08-10 10:44:15 -05:00			`String data = readFile("./Poe.txt");`
Initial commit 2024-08-10 01:24:54 -05:00			`MarkovWordGen bot = new MarkovWordGen(2);`
			`bot.train(data);`
			`System.out.println(bot.generate("The thousand", 1000));`
			`}`

			`/**`
			`* Simple constructor.`
			`*/`
			`public MarkovWordGen(int n) {`
			`wordNodes = new ArrayList<>();`
			`nGram = n;`
			`}`

			`/**`
			`* Trains using the parameter "input" as the training data. The parameter "nGram"`
			`* represents whether it is a unigram, bigram, trigram, etc. Anything more than`
			`* trigram will probably not work well, though. The method returns an ArrayList of`
			`* WordNodes that can be used to generate text based on a seed (a starting value`
			`* like "The" for a unigram, or "The dog" for a bigram).`
			`*/`
			`public void train(String input) {`

			`// Convert the one large string input into an array of tokens.`
			`ArrayList<String> tokens = new ArrayList<>(Arrays.asList(input.split("\\s+")));`

			`for(int i = 0; i < tokens.size(); i++) {`
			`tokens.get(i).trim();`
			`}`

			`// Iterate through the tokens to train.`
			`for(int i = 0; i < tokens.size() - nGram; i++) {`

			`// Combine nGram tokens to create a key.`
			`String key = "";`
			`for(int j = 0; j < nGram - 1; j++) {`
			`key += tokens.get(i+j) + " ";`
			`}`
			`key += tokens.get(i + nGram-1);`

			`String next = tokens.get(i+nGram); // The token after the key value.`
Added training data to git repo. 2024-08-10 10:44:15 -05:00
			`// Skip over stop. This is used to separate independant pieces of training`
			`// data. If you are training with Edgar Poe stories for example, you can put`
			`// ##STOP## between stories, and it won't consider the transition from one`
			`// to the next as a natural flow, and will train on each story independantly.`
Initial commit 2024-08-10 01:24:54 -05:00			`if(next.equals("##STOP##")) {`
			`i += nGram+1;`
			`}`

			`// Find the index of that key in out, if it exists. If not, leave index -1.`
			`int index = -1;`
			`for(int j = 0; j < wordNodes.size(); j++) {`
			`WordNode n = wordNodes.get(j);`

			`// If this key has appeared before...`
			`if(n.getKey().equals(key)) {`

			`// ...then record the index and stop looking for more.`
			`index = j;`
			`}`
			`}`

			`// If this key hasn't been seen before...`
			`if(index == -1) {`
			`wordNodes.add(new WordNode(key, next)); // ...then add the key and value.`
			`} else { // And if it has...`
			`wordNodes.get(index).add(next); // ...then add this value to that WordNode.`
			`}`
			`}`
			`}`

			`public String generate(String seed, int tokens) {`
			`String[] context = seed.split(" ");`
			`String out = seed + " ";`

			`for(int i = 0; i < tokens; i++) {`
			`// Make strContext, combining the array context into a single string.`
			`String strContext = "";`
			`for(int j = 0; j < context.length - 1; j++) {`
			`strContext += context[j] + " ";`
			`}`
			`strContext += context[context.length - 1];`
			`// See if strContext is the key to any WordNode.`
			`for (int j = 0; j < wordNodes.size(); j++) {`
			`if(wordNodes.get(j).getKey().equals(strContext)) {`
			`// Make room for the new word in the context array.`
			`for(int k = 0; k < context.length - 1; k++) {`
			`context[k] = context[k+1];`
			`}`

			`String newWord = wordNodes.get(j).generateWord(); // Find the new word.`


			`// Add the newly generated word to the context`
			`context[context.length - 1] = newWord;`
			`out += newWord + " ";`
			`}`
			`}`
			`}`
			`return out;`
			`}`

			`public void printNodes() {`
			`for (WordNode n : wordNodes) {`
			`System.out.println(n.getKey() + " - ");`
			`for(int i = 0; i < n.getWordVals().size(); i++) {`
			`System.out.println(" " + n.getWordVals().get(i) + " -- " + n.getNumVals().get(i));`
			`}`
			`}`
			`}`

			`public static String readFile(String filePath) {`
			`String out = "";`
			`try {`
			`Scanner scn = new Scanner(new File(filePath));`
			`while(scn.hasNextLine()) {`
			`out += scn.nextLine() + " ";`
			`}`
			`scn.close();`
			`} catch(FileNotFoundException e) {`
Added training data to git repo. 2024-08-10 10:44:15 -05:00			`System.err.println("File not found.");`
Initial commit 2024-08-10 01:24:54 -05:00			`}`
			`return out;`
			`}`
			`}`