# based on markov.py by Allison Parrish
# https://github.com/aparrish/rwet-examples/blob/master/ngrams/markov.py
import random
def build_model(tokens, n):
    """Build a Markov model from the list of tokens, using n-grams of length n.

    Returns a dict mapping each n-gram (a tuple of n tokens) to the list of
    tokens observed to follow it. One list entry is kept per occurrence, so
    follower frequencies are preserved for weighted random choice.
    """
    model = dict()
    # Not enough tokens to form a single n-gram: return an empty model.
    if len(tokens) < n:
        return model
    for i in range(len(tokens) - n):
        gram = tuple(tokens[i:i + n])
        next_token = tokens[i + n]
        # Create the follower list on first sight, then append.
        model.setdefault(gram, []).append(next_token)
    # NOTE(review): the original computed a `final_gram` here that was never
    # used (its consumers were commented out); that dead code is removed.
    # To mark line ends with a None sentinel, map the final n-gram
    # (tuple(tokens[len(tokens)-n:])) to [None] here.
    return model
def generate(model, n, seed=None, max_iterations=100):
    """Generate a list of tokens from information in model, using n as the
    length of n-grams in the model.

    Starts the generation with the n-gram given as seed: either a sequence
    of n tokens (list or tuple) or a single bare token (meaningful only for
    n == 1). If seed is None, a random n-gram from the model is chosen.
    Generation stops when the current n-gram has no known followers, when a
    None follower (end-of-line sentinel) is drawn, or after max_iterations
    iterations. (This is to prevent infinite loops.)
    """
    if seed is None:
        seed = random.choice(list(model.keys()))
    elif isinstance(seed, (list, tuple)):
        # BUG FIX: callers such as generate_from_token_lists() pass a *list*
        # n-gram as the seed. The old code wrapped it as (seed,), creating a
        # tuple that contains a list — unhashable, so the `in model` lookup
        # below raised TypeError. Convert the sequence to a proper tuple.
        seed = tuple(seed)
    else:
        # A single bare token: wrap it into a 1-gram.
        seed = (seed,)
    output = list(seed)
    current = tuple(seed)
    for _ in range(max_iterations):
        if current not in model:
            # Dead end: this n-gram was never followed by anything.
            break
        next_token = random.choice(model[current])
        if next_token is None:
            # End-of-line sentinel (see the commented-out code in build_model).
            break
        output.append(next_token)
        # Slide the window: the last n generated tokens form the next n-gram.
        current = tuple(output[-n:])
    return output
def merge_models(models):
    """Merge two or more Markov models (as built by build_model).

    Follower lists for n-grams that appear in several models are
    concatenated, so observation frequencies are preserved.
    """
    merged_model = dict()
    for model in models:
        for gram, followers in model.items():
            if gram in merged_model:
                merged_model[gram].extend(followers)
            else:
                # BUG FIX: copy the follower list rather than storing a
                # reference to it. The old code aliased the input model's
                # list, so a later extend() (for the same gram in another
                # model) mutated the source model in place.
                merged_model[gram] = list(followers)
    return merged_model
def generate_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Generates text from a list of lists of tokens. This function is intended
    for input text where each line forms a distinct unit (e.g., poetry), and
    where the desired output is to recreate lines in that form. It does this
    by keeping track of the n-gram that comes at the beginning of each line,
    and then only generating lines that begin with one of these "beginnings."
    It also builds a separate Markov model for each line, and then merges
    those models together, to ensure that lines end with n-grams statistically
    likely to end lines in the original text."""
    # Record each line's opening n-gram and build a per-line model.
    beginnings = [line[:n] for line in token_lines]
    per_line_models = [build_model(line, n) for line in token_lines]
    combined_model = merge_models(per_line_models)
    # Seed every generated line with a randomly chosen real line opening.
    return [
        generate(combined_model, n, random.choice(beginnings), max_iterations)
        for _ in range(count)
    ]
# def char_level_generate(lines, n, count=14, max_iterations=100):
#     """Generates Markov chain text from the given lines, using character-level
#     n-grams of length n. Returns a list of count items."""
#     token_lines = [list(line) for line in lines]
#     generated = generate_from_token_lists(token_lines, n, count, max_iterations)
#     return [''.join(item) for item in generated]


# def word_level_generate(lines, n, count=14, max_iterations=100):
#     """Generates Markov chain text from the given lines, using word-level
#     n-grams of length n. Returns a list of count items."""
#     token_lines = [line.split() for line in lines]
#     generated = generate_from_token_lists(token_lines, n, count, max_iterations)
#     return [' '.join(item) for item in generated]
def generate_model_from_token_lists(token_lines, n, count=14, max_iterations=100):
    """Build and return a combined Markov model from a list of lists of tokens.

    A separate model is built for each token line and the models are then
    merged, so that follower statistics reflect line boundaries in the
    original text (no n-gram spans two lines). Unlike
    generate_from_token_lists, this function only returns the model; it does
    not generate any text.

    `count` and `max_iterations` are unused here; they are kept so the
    signature stays interchangeable with generate_from_token_lists.
    """
    # (Dead commented-out `beginnings` bookkeeping from the original removed.)
    per_line_models = [build_model(token_line, n) for token_line in token_lines]
    return merge_models(per_line_models)
# if __name__ == '__main__':
#     import sys
#     n = int(sys.argv[1])
#     lines = list()
#     for line in sys.stdin:
#         line = line.strip()
#         lines.append(line)
#     for generated in char_level_generate(lines, n):
#         print(generated)