diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4b173a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,152 @@ +.vscode/ + +# 模型文件 +model/* + +# ckpt +ckpt* + +# 临时目录 +tmp/* + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + + +# Debug +Debug + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# Temp files +*.dump \ No newline at end of file diff --git a/.idea/dictionaries/liuchong.xml b/.idea/dictionaries/liuchong.xml deleted file mode 100644 index 7e15e5b..0000000 --- 
a/.idea/dictionaries/liuchong.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index a86fbb0..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - ApexVCS - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 6ec234d..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/seq2seq_chatbot.iml b/.idea/seq2seq_chatbot.iml deleted file mode 100644 index 6f63a63..0000000 --- a/.idea/seq2seq_chatbot.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8941107 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.pythonPath": "C:\\Program Files\\Python37\\python.exe", + "python.linting.pylintEnabled": true, + "python.linting.enabled": true +} \ No newline at end of file diff --git a/README.md b/README.md index 642604c..3428663 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,14 @@ -=================================================更新=========================================================== -训练好的模型已经上传到百度云网盘,如果大家有需要可以前去下载。模型训练速度的话,CPU,16G内存,一天即刻训练完成~~~ - -链接:https://pan.baidu.com/s/1hrNxaSk 密码:d2sn +# seq2seq_chatbot =================================================分割线,下面是正文=============================================== -本文是一个简单的基于seq2seq模型的chatbot对话系统的tensorflow实现。 +本文是一个简单的基于 seq2seq 模型的 chatbot 对话系统的 tensorflow 实现。 代码的讲解可以参考我的知乎专栏文章: -[从头实现深度学习的对话系统--简单chatbot代码实现](https://zhuanlan.zhihu.com/p/32455898) +[从头实现深度学习的对话系统--简单 chatbot 代码实现](https://zhuanlan.zhihu.com/p/32455898) -代码参考了DeepQA,在其基础上添加了beam search的功能和attention的机制, +代码参考了 DeepQA,在其基础上添加了 beam search 的功能和 attention 的机制, 最终的效果如下图所示: @@ -19,23 +16,22 @@ ![](https://i.imgur.com/RnvBDwO.png) -测试效果,根据用户输入回复概率最大的前beam_size个句子: 
def beam_search(sess, sentence, word2id, id2word, model, beam_size=5):
    """Return the distinct beam-search replies for one user sentence.

    The sentence is encoded with ``sentence2enco``, fed through
    ``model.step`` and the resulting back-pointer table is unrolled into
    ``beam_size`` token sequences, which are then mapped back to words.

    :param sess: session object handed to ``model.step``.
    :param sentence: raw user input.
    :param word2id: mapping used by ``sentence2enco`` to encode the input.
    :param id2word: mapping from token id to word used to render replies.
    :param model: object exposing ``en_de_seq_len`` and ``step``.
    :param beam_size: number of beams to unroll.
    :return: list of unique reply strings, in beam order; an empty list when
        the sentence is empty or cannot be encoded.
    """
    if not sentence:
        return []

    batch = sentence2enco(sentence, word2id, model.en_de_seq_len)
    if batch is None:
        # sentence2enco returns None for empty input and for sentences with
        # more tokens than the encoder length; there is nothing to decode.
        return []

    beam_path, beam_symbol = model.step(
        sess, batch.encoderSeqs, batch.decoderSeqs, batch.targetSeqs,
        batch.weights, goToken)

    # Follow the back-pointers from the last decoding step to the first.
    # paths[k] is therefore built in reverse order and flipped below.
    paths = [[] for _ in range(beam_size)]
    indices = list(range(beam_size))
    for step in reversed(range(len(beam_path))):
        for kk in range(beam_size):
            paths[kk].append(beam_symbol[step][indices[kk]])
            indices[kk] = beam_path[step][indices[kk]]

    recos = []
    for path in paths:
        token_ids = [int(token) for token in reversed(path)]
        if eosToken in token_ids:
            # Cut the reply at the first end-of-sentence token.
            token_ids = token_ids[:token_ids.index(eosToken)]
        rec = " ".join(tf.compat.as_str(id2word[token_id])
                       for token_id in token_ids if token_id in id2word)
        # Keep first occurrence only so the beam ranking is preserved.
        if rec not in recos:
            recos.append(rec)
    return recos
{}'.format(dataset_path)) with open(dataset_path, 'rb') as handle: - data = pickle.load(handle) # Warning: If adding something here, also modifying saveDataset + # Warning: If adding something here, also modifying saveDataset + data = pickle.load(handle) word2id = data['word2id'] id2word = data['id2word'] trainingSamples = data['trainingSamples'] return word2id, id2word, trainingSamples + def createBatch(samples, en_de_seq_len): ''' 根据给出的samples(就是一个batch的数据),进行padding并构造成placeholder所需要的数据形式 @@ -42,21 +53,27 @@ def createBatch(samples, en_de_seq_len): :return: 处理完之后可以直接传入feed_dict的数据格式 ''' batch = Batch() - #根据样本长度获得batch size大小 + # 根据样本长度获得batch size大小 batchSize = len(samples) - #将每条数据的问题和答案分开传入到相应的变量中 + # 将每条数据的问题和答案分开传入到相应的变量中 for i in range(batchSize): sample = samples[i] batch.encoderSeqs.append(list(reversed(sample[0]))) # 将输入反序,可提高模型效果 - batch.decoderSeqs.append([goToken] + sample[1] + [eosToken]) # Add the and tokens - batch.targetSeqs.append(batch.decoderSeqs[-1][1:]) # Same as decoder, but shifted to the left (ignore the ) + # Add the and tokens + batch.decoderSeqs.append([goToken] + sample[1] + [eosToken]) + # Same as decoder, but shifted to the left (ignore the ) + batch.targetSeqs.append(batch.decoderSeqs[-1][1:]) # 将每个元素PAD到指定长度,并构造weights序列长度mask标志 - batch.encoderSeqs[i] = [padToken] * (en_de_seq_len[0] - len(batch.encoderSeqs[i])) + batch.encoderSeqs[i] - batch.weights.append([1.0] * len(batch.targetSeqs[i]) + [0.0] * (en_de_seq_len[1] - len(batch.targetSeqs[i]))) - batch.decoderSeqs[i] = batch.decoderSeqs[i] + [padToken] * (en_de_seq_len[1] - len(batch.decoderSeqs[i])) - batch.targetSeqs[i] = batch.targetSeqs[i] + [padToken] * (en_de_seq_len[1] - len(batch.targetSeqs[i])) + batch.encoderSeqs[i] = [ + padToken] * (en_de_seq_len[0] - len(batch.encoderSeqs[i])) + batch.encoderSeqs[i] + batch.weights.append([1.0] * len(batch.targetSeqs[i]) + + [0.0] * (en_de_seq_len[1] - len(batch.targetSeqs[i]))) + batch.decoderSeqs[i] = batch.decoderSeqs[i] + 
[padToken] * \ + (en_de_seq_len[1] - len(batch.decoderSeqs[i])) + batch.targetSeqs[i] = batch.targetSeqs[i] + [padToken] * \ + (en_de_seq_len[1] - len(batch.targetSeqs[i])) - #--------------------接下来就是将数据进行reshape操作,变成序列长度*batch_size格式的数据------------------------ + # --------------------接下来就是将数据进行reshape操作,变成序列长度*batch_size格式的数据------------------------ encoderSeqsT = [] # Corrected orientation for i in range(en_de_seq_len[0]): encoderSeqT = [] @@ -85,6 +102,7 @@ def createBatch(samples, en_de_seq_len): return batch + def getBatches(data, batch_size, en_de_seq_len): ''' 根据读取出来的所有数据和batch_size将原始数据分成不同的小batch。对每个batch索引的样本调用createBatch函数进行处理 @@ -93,10 +111,11 @@ def getBatches(data, batch_size, en_de_seq_len): :param en_de_seq_len: 列表,第一个元素表示source端序列的最大长度,第二个元素表示target端序列的最大长度 :return: 列表,每个元素都是一个batch的样本数据,可直接传入feed_dict进行训练 ''' - #每个epoch之前都要进行样本的shuffle + # 每个epoch之前都要进行样本的shuffle random.shuffle(data) batches = [] data_len = len(data) + def genNextSamples(): for i in range(0, data_len, batch_size): yield data[i:min(i + batch_size, data_len)] @@ -106,6 +125,7 @@ def genNextSamples(): batches.append(batch) return batches + def sentence2enco(sentence, word2id, en_de_seq_len): ''' 测试的时候将用户输入的句子转化为可以直接feed进模型的数据,现将句子转化成id,然后调用createBatch处理 @@ -116,14 +136,14 @@ def sentence2enco(sentence, word2id, en_de_seq_len): ''' if sentence == '': return None - #分词 + # 分词 tokens = nltk.word_tokenize(sentence) if len(tokens) > en_de_seq_len[0]: return None - #将每个单词转化为id + # 将每个单词转化为id wordIds = [] for token in tokens: wordIds.append(word2id.get(token, unknownToken)) - #调用createBatch构造batch + # 调用createBatch构造batch batch = createBatch([[wordIds, []]], en_de_seq_len) return batch diff --git a/decode.py b/decode.py new file mode 100644 index 0000000..7f31702 --- /dev/null +++ b/decode.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# @Date : Jan-06-20 11:47 +# @Author : Your Name (you@example.org) +# @Link : http://example.org + +import os +import sys + +import 
tensorflow as tf + +from data_utils import * +from seq2seq_model import * + +tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") +tf.app.flags.DEFINE_integer( + "batch_size", 64, "Batch size to use during training.") # 32 64 256 大小根据机器选择 +tf.app.flags.DEFINE_integer( + "numEpochs", 30, "Batch size to use during training.") +tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.") +tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.") +tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.") +tf.app.flags.DEFINE_integer("en_de_seq_len", 20, "English vocabulary size.") +tf.app.flags.DEFINE_integer( + "max_train_data_size", 0, "Limit on the size of training data (0: no limit).") +tf.app.flags.DEFINE_integer( + "steps_per_checkpoint", 100, "How many training steps to do per checkpoint.") +tf.app.flags.DEFINE_string( + "train_dir", './model', "How many training steps to do per checkpoint.") +tf.app.flags.DEFINE_string( + "tmp", './tmp', "How many training steps to do per checkpoint.") +tf.app.flags.DEFINE_integer( + "beam_size", 5, "How many training steps to do per checkpoint.") +tf.app.flags.DEFINE_boolean( + "beam_search", True, "Set to True for beam_search.") +tf.app.flags.DEFINE_boolean( + "decode", True, "Set to True for interactive decoding.") +FLAGS = tf.app.flags.FLAGS + + +def create_model(session, forward_only, beam_search, beam_size=5): + """Create translation model and initialize or load parameters in session.""" + model = Seq2SeqModel( + FLAGS.en_vocab_size, FLAGS.en_vocab_size, [10, 10], + FLAGS.size, FLAGS.num_layers, FLAGS.batch_size, + FLAGS.learning_rate, forward_only=forward_only, beam_search=beam_search, beam_size=beam_size) + ckpt = tf.train.latest_checkpoint(FLAGS.train_dir) + model_path = os.path.join( + FLAGS.tmp, "chat_bot.ckpt-0") + if forward_only: + model.saver.restore(session, model_path) + elif ckpt and tf.gfile.Exists(ckpt + ".meta"): + print("Reading model 
def decode():
    """Run an interactive beam-search chat loop on stdin/stdout.

    Builds the model in forward-only mode, loads the vocabulary from the
    pickled dataset under the current working directory, then reads one
    sentence per line from stdin until EOF and prints the distinct replies
    found by beam search. Nothing is read from stdin when
    ``FLAGS.beam_search`` is false.
    """
    with tf.Session() as sess:  # opening a TensorFlow session takes a while
        beam_size = FLAGS.beam_size
        beam_search = FLAGS.beam_search
        model = create_model(
            sess, forward_only=True, beam_search=beam_search, beam_size=beam_size)
        model.batch_size = 1
        data_path = 'data/dataset-cornell-length10-filter1-vocabSize40000.pkl'
        data_path = os.path.join(os.path.abspath("."), data_path)
        # data_utils exposes the loader as load_dataset; the training samples
        # are not needed for decoding.
        word2id, id2word, _ = load_dataset(data_path)

        if beam_search:
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            while sentence:
                batch = sentence2enco(sentence, word2id, model.en_de_seq_len)
                if batch is None:
                    # sentence2enco returns None for empty or over-long
                    # input; ask again instead of crashing on batch.*.
                    print("Sentence is empty or too long, please try again.")
                else:
                    beam_path, beam_symbol = model.step(
                        sess, batch.encoderSeqs, batch.decoderSeqs,
                        batch.targetSeqs, batch.weights, goToken)
                    # Unroll the back-pointers from the last step to the
                    # first; each path is collected in reverse order.
                    paths = [[] for _ in range(beam_size)]
                    curr = list(range(beam_size))
                    for i in reversed(range(len(beam_path))):
                        for kk in range(beam_size):
                            paths[kk].append(beam_symbol[i][curr[kk]])
                            curr[kk] = beam_path[i][curr[kk]]
                    recos = set()
                    print("Replies --------------------------------------->")
                    for path in paths:
                        foutputs = [int(logit) for logit in reversed(path)]
                        if eosToken in foutputs:
                            foutputs = foutputs[:foutputs.index(eosToken)]
                        rec = " ".join(tf.compat.as_str(id2word[output])
                                       for output in foutputs if output in id2word)
                        # Print each distinct reply once, in beam order.
                        if rec not in recos:
                            recos.add(rec)
                            print(rec)
                # Same prompt style as the first one (no trailing newline).
                sys.stdout.write("> ")
                sys.stdout.flush()
                sentence = sys.stdin.readline()
2019-06-14 20:51:26 +# @Author : Your Name (you@example.org) +# @Link : http://example.org +# @Version : $Id$ + """Most of the code comes from seq2seq tutorial. Binary for training conversation models and decoding from them. Running this program without --decode will tokenize it in a very basic way, @@ -15,99 +22,118 @@ import math import sys import time +import tensorflow as tf from data_utils import * -from seq2seq_model import * +from beam_search import beam_search +from model import * from tqdm import tqdm +DATA_PATH = "D:\\DeepLearningData\\bronya-bot-data\\data\\dataset-cornell-length10-filter1-vocabSize40000.pkl" +TRAIN_DIR = "D:\\DeepLearningData\\bronya-bot-data\\model" + tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") -tf.app.flags.DEFINE_integer("batch_size", 256, "Batch size to use during training.") -tf.app.flags.DEFINE_integer("numEpochs", 30, "Batch size to use during training.") +tf.app.flags.DEFINE_integer( + "batch_size", 64, "Batch size to use during training.") # 32 64 256 大小根据机器选择 +tf.app.flags.DEFINE_integer( + "numEpochs", 30, "Batch size to use during training.") tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.") tf.app.flags.DEFINE_integer("num_layers", 3, "Number of layers in the model.") tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.") tf.app.flags.DEFINE_integer("en_de_seq_len", 20, "English vocabulary size.") -tf.app.flags.DEFINE_integer("max_train_data_size", 0, "Limit on the size of training data (0: no limit).") -tf.app.flags.DEFINE_integer("steps_per_checkpoint", 100, "How many training steps to do per checkpoint.") -tf.app.flags.DEFINE_string("train_dir", './tmp', "How many training steps to do per checkpoint.") -tf.app.flags.DEFINE_integer("beam_size", 5, "How many training steps to do per checkpoint.") -tf.app.flags.DEFINE_boolean("beam_search", True, "Set to True for beam_search.") -tf.app.flags.DEFINE_boolean("decode", True, "Set to True for interactive 
def create_model(session, forward_only, beam_search, beam_size=5):
    """Create the seq2seq model and restore or initialize its parameters.

    :param session: TensorFlow session in which variables are restored or
        initialized.
    :param forward_only: True for decoding; a checkpoint is then required.
    :param beam_search: forwarded to ``Seq2SeqModel``.
    :param beam_size: forwarded to ``Seq2SeqModel``.
    :return: the constructed ``Seq2SeqModel``.
    :raises ValueError: if ``forward_only`` is set and ``FLAGS.train_dir``
        holds no checkpoint.
    """
    # Encoder/decoder lengths are fixed at [10, 10] here;
    # FLAGS.en_de_seq_len is not consulted.
    model = Seq2SeqModel(
        FLAGS.en_vocab_size, FLAGS.en_vocab_size, [10, 10],
        FLAGS.size, FLAGS.num_layers, FLAGS.batch_size,
        FLAGS.learning_rate, forward_only=forward_only,
        beam_search=beam_search, beam_size=beam_size)
    ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)

    if forward_only:
        if not ckpt:
            # Decoding with untrained weights is meaningless; fail with a
            # message that names the directory instead of restoring None.
            raise ValueError(
                "No checkpoint found in %s; train a model before decoding."
                % FLAGS.train_dir)
        print("Reading model parameters from checkpoint %s" % ckpt)
        model.saver.restore(session, ckpt)
    elif ckpt and tf.gfile.Exists(ckpt + ".meta"):
        # Resume training from the most recent checkpoint.
        print("Reading model parameters from checkpoint %s" % ckpt)
        model.saver.restore(session, ckpt)
    else:
        print("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
    return model
prepare dataset - data_path = 'E:\PycharmProjects\Seq-to-Seq\seq2seq_chatbot\data\dataset-cornell-length10-filter1-vocabSize40000.pkl' - word2id, id2word, trainingSamples = loadDataset(data_path) + data_path = 'data/dataset-cornell-length10-filter1-vocabSize40000.pkl' + data_path = os.path.join(os.path.abspath("."), data_path) + word2id, id2word, trainingSamples = load_dataset(data_path) with tf.Session() as sess: - print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) + print("Creating %d layers of %d units." % + (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, False, beam_search=False, beam_size=5) current_step = 0 for e in range(FLAGS.numEpochs): print("----- Epoch {}/{} -----".format(e + 1, FLAGS.numEpochs)) - batches = getBatches(trainingSamples, FLAGS.batch_size, model.en_de_seq_len) + batches = getBatches( + trainingSamples, FLAGS.batch_size, model.en_de_seq_len) for nextBatch in tqdm(batches, desc="Training"): _, step_loss = model.step(sess, nextBatch.encoderSeqs, nextBatch.decoderSeqs, nextBatch.targetSeqs, nextBatch.weights, goToken) current_step += 1 if current_step % FLAGS.steps_per_checkpoint == 0: - perplexity = math.exp(float(step_loss)) if step_loss < 300 else float('inf') - tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % (current_step, step_loss, perplexity)) - checkpoint_path = os.path.join(FLAGS.train_dir, "chat_bot.ckpt") - model.saver.save(sess, checkpoint_path, global_step=model.global_step) + perplexity = math.exp( + float(step_loss)) if step_loss < 300 else float('inf') + tqdm.write("----- Step %d -- Loss %.2f -- Perplexity %.2f" % + (current_step, step_loss, perplexity)) + current_time = time.time() + current_time = time.localtime(current_time) + time_str = time.strftime("%Y%m%d%H%M%S", current_time) + checkpoint_path = os.path.join( + FLAGS.train_dir, "chat_bot.ckpt-" + time_str) # 八位日期 六位时间 + model.saver.save(sess, checkpoint_path, + global_step=model.global_step) + def decode(): with 
tf.Session() as sess: beam_size = FLAGS.beam_size - beam_search = FLAGS.beam_search - model = create_model(sess, True, beam_search=beam_search, beam_size=beam_size) + if_beam_search = FLAGS.if_beam_search + model = create_model( + sess, True, beam_search=if_beam_search, beam_size=beam_size) model.batch_size = 1 - data_path = 'E:\PycharmProjects\Seq-to-Seq\seq2seq_chatbot\data\dataset-cornell-length10-filter1-vocabSize40000.pkl' - word2id, id2word, trainingSamples = loadDataset(data_path) + data_path = DATA_PATH + word2id, id2word, trainingSamples = load_dataset(data_path) - if beam_search: + if if_beam_search: sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: - batch = sentence2enco(sentence, word2id, model.en_de_seq_len) - beam_path, beam_symbol = model.step(sess, batch.encoderSeqs, batch.decoderSeqs, batch.targetSeqs, - batch.weights, goToken) - paths = [[] for _ in range(beam_size)] - curr = [i for i in range(beam_size)] - num_steps = len(beam_path) - for i in range(num_steps-1, -1, -1): - for kk in range(beam_size): - paths[kk].append(beam_symbol[i][curr[kk]]) - curr[kk] = beam_path[i][curr[kk]] - recos = set() + recos = beam_search(sess, sentence=sentence, word2id=word2id, + id2word=id2word, model=model) print("Replies --------------------------------------->") - for kk in range(beam_size): - foutputs = [int(logit) for logit in paths[kk][::-1]] - if eosToken in foutputs: - foutputs = foutputs[:foutputs.index(eosToken)] - rec = " ".join([tf.compat.as_str(id2word[output]) for output in foutputs if output in id2word]) - if rec not in recos: - recos.add(rec) - print(rec) - print("> ", "") + print(recos) + sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() # else: @@ -141,11 +167,13 @@ def decode(): # sys.stdout.flush() # sentence = sys.stdin.readline() + def main(_): - if FLAGS.decode: - decode() - else: - train() + if FLAGS.decode: + decode() + else: + train() + if __name__ == "__main__": - 
tf.app.run() + tf.app.run() diff --git a/seq2seq_model.py b/model.py similarity index 78% rename from seq2seq_model.py rename to model.py index 8a979a4..9bdea74 100644 --- a/seq2seq_model.py +++ b/model.py @@ -1,5 +1,14 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# @Date : 2019-06-14 20:51:26 +# @Author : Your Name (you@example.org) +# @Link : http://example.org +# @Version : $Id$ + import tensorflow as tf from seq2seq import embedding_attention_seq2seq + + class Seq2SeqModel(): def __init__(self, source_vocab_size, target_vocab_size, en_de_seq_len, hidden_size, num_layers, @@ -36,11 +45,13 @@ def __init__(self, source_vocab_size, target_vocab_size, en_de_seq_len, hidden_s softmax_loss_function = None # 定义采样loss函数,传入后面的sequence_loss_by_example函数 if num_samples > 0 and num_samples < self.target_vocab_size: - w = tf.get_variable('proj_w', [hidden_size, self.target_vocab_size]) + w = tf.get_variable( + 'proj_w', [hidden_size, self.target_vocab_size]) w_t = tf.transpose(w) b = tf.get_variable('proj_b', [self.target_vocab_size]) output_projection = (w, b) - #调用sampled_softmax_loss函数计算sample loss,这样可以节省计算时间 + # 调用sampled_softmax_loss函数计算sample loss,这样可以节省计算时间 + def sample_loss(logits, labels): labels = tf.reshape(labels, [-1, 1]) return tf.nn.sampled_softmax_loss(w_t, b, labels=labels, inputs=logits, num_sampled=num_samples, num_classes=self.target_vocab_size) @@ -48,11 +59,14 @@ def sample_loss(logits, labels): self.keep_drop = tf.placeholder(tf.float32) # 定义encoder和decoder阶段的多层dropout RNNCell + def create_rnn_cell(): encoDecoCell = tf.contrib.rnn.BasicLSTMCell(hidden_size) - encoDecoCell = tf.contrib.rnn.DropoutWrapper(encoDecoCell, input_keep_prob=1.0, output_keep_prob=self.keep_drop) + encoDecoCell = tf.contrib.rnn.DropoutWrapper( + encoDecoCell, input_keep_prob=1.0, output_keep_prob=self.keep_drop) return encoDecoCell - encoCell = tf.contrib.rnn.MultiRNNCell([create_rnn_cell() for _ in range(num_layers)]) + encoCell = tf.contrib.rnn.MultiRNNCell( + 
[create_rnn_cell() for _ in range(num_layers)]) # 定义输入的placeholder,采用了列表的形式 self.encoder_inputs = [] @@ -60,15 +74,19 @@ def create_rnn_cell(): self.decoder_targets = [] self.target_weights = [] for i in range(en_de_seq_len[0]): - self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None, ], name="encoder{0}".format(i))) + self.encoder_inputs.append(tf.placeholder( + tf.int32, shape=[None, ], name="encoder{0}".format(i))) for i in range(en_de_seq_len[1]): - self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None, ], name="decoder{0}".format(i))) - self.decoder_targets.append(tf.placeholder(tf.int32, shape=[None, ], name="target{0}".format(i))) - self.target_weights.append(tf.placeholder(tf.float32, shape=[None, ], name="weight{0}".format(i))) + self.decoder_inputs.append(tf.placeholder( + tf.int32, shape=[None, ], name="decoder{0}".format(i))) + self.decoder_targets.append(tf.placeholder( + tf.int32, shape=[None, ], name="target{0}".format(i))) + self.target_weights.append(tf.placeholder( + tf.float32, shape=[None, ], name="weight{0}".format(i))) # test模式,将上一时刻输出当做下一时刻输入传入 if forward_only: - if beam_search:#如果是beam_search的话,则调用自己写的embedding_attention_seq2seq函数,而不是legacy_seq2seq下面的 + if beam_search: # 如果是beam_search的话,则调用自己写的embedding_attention_seq2seq函数,而不是legacy_seq2seq下面的 self.beam_outputs, _, self.beam_path, self.beam_symbol = embedding_attention_seq2seq( self.encoder_inputs, self.decoder_inputs, encoCell, num_encoder_symbols=source_vocab_size, num_decoder_symbols=target_vocab_size, embedding_size=hidden_size, @@ -80,7 +98,8 @@ def create_rnn_cell(): output_projection=output_projection, feed_previous=True) # 因为seq2seq模型中未指定output_projection,所以需要在输出之后自己进行output_projection if output_projection is not None: - self.outputs = tf.matmul(decoder_outputs, output_projection[0]) + output_projection[1] + self.outputs = tf.matmul( + decoder_outputs, output_projection[0]) + output_projection[1] else: # 因为不需要将output作为下一时刻的输入,所以不用output_projection 
decoder_outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( @@ -91,13 +110,14 @@ def create_rnn_cell(): decoder_outputs, self.decoder_targets, self.target_weights, softmax_loss_function=softmax_loss_function) # Initialize the optimizer - opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) + opt = tf.train.AdamOptimizer( + learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08) self.optOp = opt.minimize(self.loss) self.saver = tf.train.Saver(tf.all_variables()) def step(self, session, encoder_inputs, decoder_inputs, decoder_targets, target_weights, go_token_id): - #传入一个batch的数据,并训练性对应的模型 + # 传入一个batch的数据,并训练性对应的模型 # 构建sess.run时的feed_inpits feed_dict = {} if not self.forward_only: @@ -124,4 +144,4 @@ def step(self, session, encoder_inputs, decoder_inputs, decoder_targets, target_ return None, outputs[1] else: if self.beam_search: - return outputs[0], outputs[1] \ No newline at end of file + return outputs[0], outputs[1] diff --git a/seq2seq.py b/seq2seq.py index 59fdaf6..78a686d 100644 --- a/seq2seq.py +++ b/seq2seq.py @@ -1,3 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# @Date : 2019-06-14 20:51:26 +# @Author : Your Name (you@example.org) +# @Link : http://example.org +# @Version : $Id$ + from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -22,13 +29,17 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.util import nest -Linear = rnn_cell_impl._Linear # pylint: disable=protected-access,invalid-name + +# Linear = rnn_cell_impl._Linear # pylint: disable=protected-access,invalid-name +Linear = core_rnn_cell._linear + def _extract_beam_search(embedding, beam_size, num_symbols, embedding_size, output_projection=None): def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols): if output_projection is not None: - prev = nn_ops.xw_plus_b(prev, output_projection[0], 
output_projection[1]) + prev = nn_ops.xw_plus_b( + prev, output_projection[0], output_projection[1]) # 对输出概率进行归一化和取log,这样序列概率相乘就可以变成概率相加 probs = tf.log(tf.nn.softmax(prev)) if i == 1: @@ -36,7 +47,8 @@ def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols): if i > 1: # 将当前序列的概率与之前序列概率相加得到结果之前有beam_szie个序列,本次产生num_symbols个结果, # 所以reshape成这样的tensor - probs = tf.reshape(probs + log_beam_probs[-1], [-1, beam_size * num_symbols]) + probs = tf.reshape( + probs + log_beam_probs[-1], [-1, beam_size * num_symbols]) # 选出概率最大的前beam_size个序列,从beam_size * num_symbols个元素中选出beam_size个 best_probs, indices = tf.nn.top_k(probs, beam_size) indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1]))) @@ -56,21 +68,23 @@ def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols): return loop_function + def beam_attention_decoder(decoder_inputs, - initial_state, - attention_states, - cell, + initial_state, + attention_states, + cell, embedding, - output_size=None, - num_heads=1, - loop_function=None, - dtype=None, - scope=None, - initial_state_attention=False, output_projection=None, beam_size=10): + output_size=None, + num_heads=1, + loop_function=None, + dtype=None, + scope=None, + initial_state_attention=False, output_projection=None, beam_size=10): if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") + raise ValueError( + "With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) @@ -86,14 +100,18 @@ def beam_attention_decoder(decoder_inputs, attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. 
- hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size]) + hidden = array_ops.reshape( + attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) + k = variable_scope.get_variable( + "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) + hidden_features.append(nn_ops.conv2d( + hidden, k, [1, 1, 1, 1], "SAME")) + v.append(variable_scope.get_variable( + "AttnV_%d" % a, [attention_vec_size])) state = [] # 将encoder的最后一个隐层状态扩展成beam_size维,因为decoder阶段的batch_size是beam_size。 @@ -124,13 +142,16 @@ def attention(query): query = array_ops.concat(query_list, 1) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): - y = Linear(query, attention_vec_size, True)(query) + # y = Linear(query, attention_vec_size, True)(query) + y = Linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). - s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) + s = math_ops.reduce_sum( + v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. 
- d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) + d = math_ops.reduce_sum(array_ops.reshape( + a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) ds.append(array_ops.reshape(d, [-1, attn_size])) return ds @@ -138,7 +159,8 @@ def attention(query): prev = None # attention也要定义成beam_size为的tensor batch_attn_size = array_ops.stack([beam_size, attn_size]) - attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] + attns = [array_ops.zeros(batch_attn_size, dtype=dtype) + for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: @@ -150,18 +172,23 @@ def attention(query): variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if i == 0: - #i=0时,输入时一个batch_szie=beam_size的tensor,且里面每个元素的值都是相同的,都是标志 - inp = tf.nn.embedding_lookup(embedding, tf.constant(1, dtype=tf.int32, shape=[beam_size])) + # i=0时,输入时一个batch_szie=beam_size的tensor,且里面每个元素的值都是相同的,都是标志 + inp = tf.nn.embedding_lookup(embedding, tf.constant( + 1, dtype=tf.int32, shape=[beam_size])) if loop_function is not None and prev is not None: with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) + inp = loop_function( + prev, i, log_beam_probs, beam_path, beam_symbols) # Merge input and previous attentions into one vector of the right size. input_size = inp.get_shape().with_rank(2)[1] if input_size.value is None: - raise ValueError("Could not infer input size from input: %s" % inp.name) + raise ValueError( + "Could not infer input size from input: %s" % inp.name) inputs = [inp] + attns - x = Linear(inputs, input_size, True)(inputs) + # print(inputs) + # x = Linear(inputs, output_size=input_size, bias=True)(inputs) + x = Linear(inputs, output_size=input_size, bias=True) # Run the RNN. 
cell_output, state = cell(x, state) @@ -174,14 +201,17 @@ def attention(query): with variable_scope.variable_scope("AttnOutputProjection"): inputs = [cell_output] + attns - output = Linear(inputs, output_size, True)(inputs) + # output = Linear(inputs, output_size, True)(inputs) + output = Linear(inputs, output_size, True) if loop_function is not None: prev = output - outputs.append(tf.argmax(nn_ops.xw_plus_b(output, output_projection[0], output_projection[1]), axis=1)) + outputs.append(tf.argmax(nn_ops.xw_plus_b( + output, output_projection[0], output_projection[1]), axis=1)) return outputs, state, tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]), tf.reshape(tf.concat(beam_symbols, 0), [-1, beam_size]) + def embedding_attention_decoder(decoder_inputs, initial_state, attention_states, @@ -203,9 +233,12 @@ def embedding_attention_decoder(decoder_inputs, proj_biases.get_shape().assert_is_compatible_with([num_symbols]) with variable_scope.variable_scope(scope or "embedding_attention_decoder", dtype=dtype) as scope: - embedding = variable_scope.get_variable("embedding", [num_symbols, embedding_size]) - emb_inp = [embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs] - loop_function = _extract_beam_search(embedding, beam_size, num_symbols, embedding_size, output_projection) + embedding = variable_scope.get_variable( + "embedding", [num_symbols, embedding_size]) + emb_inp = [embedding_ops.embedding_lookup( + embedding, i) for i in decoder_inputs] + loop_function = _extract_beam_search( + embedding, beam_size, num_symbols, embedding_size, output_projection) return beam_attention_decoder( emb_inp, initial_state, attention_states, cell, embedding, output_size=output_size, num_heads=num_heads, loop_function=loop_function, @@ -229,17 +262,21 @@ def embedding_attention_seq2seq(encoder_inputs, dtype = scope.dtype # Encoder. 
encoder_cell = copy.deepcopy(cell) - encoder_cell = core_rnn_cell.EmbeddingWrapper(encoder_cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) - encoder_outputs, encoder_state = rnn.static_rnn(encoder_cell, encoder_inputs, dtype=dtype) + encoder_cell = core_rnn_cell.EmbeddingWrapper( + encoder_cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) + encoder_outputs, encoder_state = rnn.static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. - top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs] + top_states = [array_ops.reshape( + e, [-1, 1, cell.output_size]) for e in encoder_outputs] attention_states = array_ops.concat(top_states, 1) # Decoder. output_size = None if output_projection is None: - cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + cell = core_rnn_cell.OutputProjectionWrapper( + cell, num_decoder_symbols) output_size = num_decoder_symbols return embedding_attention_decoder( @@ -254,4 +291,3 @@ def embedding_attention_seq2seq(encoder_inputs, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention, beam_search=beam_search, beam_size=beam_size) - diff --git a/update.sh b/update.sh new file mode 100644 index 0000000..9e38df5 --- /dev/null +++ b/update.sh @@ -0,0 +1,4 @@ +#!/bin/sh +git add -A +git commit -am "update `date`" +git push