Skip to content

Commit cb97b64

Browse files
author
Vithu Thangarasa
committed
Add Dockerized DeepQA Chatbot with YAML files
Add support for parsing different corpora. Add embedding_attention_seq2seq. Add the ability to specify the corpus file on the command line. Fix directory names/paths containing corpus data, pickle dataset and model.
1 parent ef26903 commit cb97b64

File tree

19 files changed

+471
-388309
lines changed

19 files changed

+471
-388309
lines changed

Dockerfile

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
## Dockerfile to build the DeepQ&A container image

FROM python:3.5.2

# MAINTAINER is deprecated; LABEL is the supported replacement.
LABEL maintainer="rbi"

# CPU-only TensorFlow 0.9.0 wheel for Python 3.5 (overridable at build time).
ARG TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl

# SECURITY(review): a secret baked into the image is recoverable by anyone who
# can pull the image — prefer supplying it at run time (docker run -e ...).
ENV CHATBOT_SECRET_KEY="my-secret-key"

## Dependencies (single layer: apt + pip + NLTK tokenizer data)
RUN apt-get -qq -y update && apt-get -y install \
        unzip && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install -U $TF_BINARY_URL \
        nltk \
        tqdm \
        django \
        asgi_redis \
        channels && \
    python3 -m nltk.downloader punkt

COPY ./ /root/DeepQA

## Run config
EXPOSE 8000
COPY docker/settings.py /root/DeepQA/chatbot_website/chatbot_website/
COPY docker/chatbot.sh /root/DeepQA/chatbot_website/
WORKDIR /root/DeepQA/chatbot_website
# Exec form avoids wrapping the command in an extra /bin/sh process.
CMD ["bash", "chatbot.sh"]

chatbot/chatbot.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -62,7 +64,7 @@ def __init__(self):
6264
self.sess = None
6365

6466
# Filename and directories constants
65-
self.MODEL_DIR_BASE = 'save/model'
67+
self.MODEL_DIR_BASE = 'model'
6668
self.MODEL_NAME_BASE = 'model'
6769
self.MODEL_EXT = '.ckpt'
6870
self.CONFIG_FILENAME = 'params.ini'
@@ -222,12 +224,20 @@ def mainTrain(self, sess):
222224

223225
batches = self.textData.getBatches()
224226

227+
#print ("Printing batches: \n")
228+
#print (batches)
229+
#exit()
225230
# TODO: Also update learning parameters eventually
226231

227232
tic = datetime.datetime.now()
228233
for nextBatch in tqdm(batches, desc="Training"):
229234
# Training pass
230235
ops, feedDict = self.model.step(nextBatch)
236+
#print ("Printing ops: \n")
237+
#print (ops)
238+
#print ("Printing feedDict: \n")
239+
#print (feedDict)
240+
231241
assert len(ops) == 2 # training, loss
232242
_, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
233243
self.writer.add_summary(summary, self.globStep)
@@ -310,11 +320,11 @@ def mainTestInteractive(self, sess):
310320
continue # Back to the beginning, try again
311321

312322
print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))
313-
323+
314324
if self.args.verbose:
315325
print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
316326
print(self.textData.sequence2str(answer))
317-
327+
318328
print()
319329

320330
def singlePredict(self, question, questionSeq=None):
@@ -395,8 +405,11 @@ def managePreviousModel(self, sess):
395405
if self.args.reset:
396406
fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
397407
for f in fileList:
398-
print('Removing {}'.format(f))
399-
os.remove(f)
408+
if f.endswith(".pkl"):
409+
continue
410+
else:
411+
print('Removing {}'.format(f))
412+
os.remove(f)
400413

401414
else:
402415
print('No previous model found, starting from clean directory: {}'.format(self.modelDir))
@@ -487,7 +500,7 @@ def saveModelParams(self):
487500
config['Network']['hiddenSize'] = str(self.args.hiddenSize)
488501
config['Network']['numLayers'] = str(self.args.numLayers)
489502
config['Network']['embeddingSize'] = str(self.args.embeddingSize)
490-
503+
491504
# Keep track of the learning params (but without restoring them)
492505
config['Training (won\'t be restored)'] = {}
493506
config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
@@ -529,4 +542,4 @@ def getDevice(self):
529542
return None
530543
else:
531544
print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
532-
return None
545+
return None

chatbot/cornelldata.py

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,28 +22,31 @@
2022
http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
2123
2224
"""
25+
import os
26+
import tqdm
2327

2428
class CornellData:
2529
"""
2630
2731
"""
28-
29-
def __init__(self, dirName, corpus):
    """Load the conversations of a single corpus file.

    Args:
        dirName (str): directory where to load the corpus from. The corpus
            file name is concatenated, not joined, so dirName is expected to
            end with a path separator — TODO confirm against callers.
        corpus (str): name of the conversation file inside dirName.
    """
    # NOTE(review): loadLines is no longer called, so self.lines stays empty;
    # the attribute is kept for interface compatibility with code reading it.
    self.lines = {}
    self.conversations = []

    MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"]
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"]

    self.conversations = self.loadConversations(dirName + corpus, MOVIE_CONVERSATIONS_FIELDS)

    # TODO: Cleaner program (merge copy-paste) !!
4550
def loadLines(self, fileName, fields):
4651
"""
4752
Args:
@@ -51,7 +56,7 @@ def loadLines(self, fileName, fields):
5156
dict<dict<str>>: the extracted fields for each line
5257
"""
5358
lines = {}
54-
59+
5560
with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb !
5661
for line in f:
5762
values = line.split(" +++$+++ ")
@@ -60,11 +65,14 @@ def loadLines(self, fileName, fields):
6065
lineObj = {}
6166
for i, field in enumerate(fields):
6267
lineObj[field] = values[i]
63-
68+
6469
lines[lineObj['lineID']] = lineObj
6570

71+
#print ("Printing lines (loadLines): \n")
72+
#print (lines)
73+
#exit()
6674
return lines
67-
75+
6876
def loadConversations(self, fileName, fields):
6977
"""
7078
Args:
@@ -73,32 +81,72 @@ def loadConversations(self, fileName, fields):
7381
Return:
7482
dict<dict<str>>: the extracted fields for each line
7583
"""
84+
7685
conversations = []
86+
collected_lines = []
87+
num_convos = 0;
7788

89+
with open(fileName,"r", encoding='iso-8859-1') as f:
90+
for line in f:
91+
if "Start of Convo" in line:
92+
num_convos = num_convos + 1;
93+
94+
with open(fileName, 'r', encoding='iso-8859-1') as fp:
95+
#all_lines = fp.readlines()
96+
for j in tqdm.tqdm(range(num_convos)):
97+
for line in fp:
98+
if "Start of Convo" in line:
99+
#print ("started at line", i)
100+
continue
101+
if "End of Convo" in line:
102+
#print ("end at line", i)
103+
conversations.append(collected_lines)
104+
break
105+
106+
collected_lines.append(line.strip())
107+
108+
if(len(collected_lines) % 2 != 0):
109+
collected_lines.pop()
110+
111+
if (os.path.split(fileName)[1] == "switchboard.txt" or os.path.split(fileName)[1] == "watson_pii.txt"):
112+
for i in range(0, len(collected_lines), 2):
113+
collected_lines[i], collected_lines[i+1] = collected_lines[i+1], collected_lines[i]
114+
115+
#for convo in conversations:
116+
# for sent in convo:
117+
# print (sent)
118+
#exit()
119+
'''
120+
conversations = []
78121
with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb !
79122
for line in f:
80123
values = line.split(" +++$+++ ")
81-
124+
82125
# Extract fields
83126
convObj = {}
84127
for i, field in enumerate(fields):
85128
convObj[field] = values[i]
86-
129+
87130
lineIds = convObj["utteranceIDs"][2:-3].split("', '")
88-
131+
89132
#print(convObj["utteranceIDs"])
90133
#for lineId in lineIds:
91134
#print(lineId, end=' ')
92135
#print()
93-
136+
94137
# Reassemble lines
95138
convObj["lines"] = []
96139
for lineId in lineIds:
97140
convObj["lines"].append(self.lines[lineId])
98-
141+
99142
conversations.append(convObj)
100143
144+
#print ("Printing conversations: \n")
145+
#print (conversations)
146+
#exit()
147+
'''
101148
return conversations
102149

103150
def getConversations(self):
    """Return the list of conversations loaded from the corpus file."""
    return self.conversations

chatbot/model.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,7 +31,7 @@ class Model:
2931
Achitecture:
3032
2 LTSM layers
3133
"""
32-
34+
3335
def __init__(self, args, textData):
3436
"""
3537
Args:
@@ -68,6 +70,7 @@ def buildNetwork(self):
6870
#encoDecoCell = tf.nn.rnn_cell.DropoutWrapper(encoDecoCell, input_keep_prob=1.0, output_keep_prob=1.0) # TODO: Custom values (WARNING: No dropout when testing !!!)
6971
encoDecoCell = tf.nn.rnn_cell.MultiRNNCell([encoDecoCell] * self.args.numLayers, state_is_tuple=True)
7072

73+
7174
# Network input (placeholders)
7275

7376
with tf.name_scope('placeholder_encoder'):
@@ -81,7 +84,8 @@ def buildNetwork(self):
8184
# Define the network
8285
# Here we use an embedding model, it takes integer as input and convert them into word vector for
8386
# better word representation
84-
decoderOutputs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(
87+
# decoderOutputs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(
88+
decoderOutputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
8589
self.encoderInputs, # List<[batch=?, inputDim=1]>, list of size args.maxLength
8690
self.decoderInputs, # For training, we force the correct output (feed_previous=False)
8791
encoDecoCell,
@@ -111,7 +115,7 @@ def buildNetwork(self):
111115
epsilon=1e-08
112116
)
113117
self.optOp = opt.minimize(self.lossFct)
114-
118+
115119
def step(self, batch):
116120
""" Forward/training step operation.
117121
Does not perform run on itself but just return the operators to do so. Those have then to be run
@@ -142,4 +146,4 @@ def step(self, batch):
142146
ops = (self.outputs,)
143147

144148
# Return one pass operator
145-
return ops, feedDict
149+
return ops, feedDict

0 commit comments

Comments
 (0)