Skip to content

Commit cb97b64

Browse files
author
Vithu Thangarasa
committed
Add Dockerized DeepQA Chatbot with YAML files
Add support for parsing different corpora. Add embedding_attention_seq2seq. Add the ability to specify the corpus file on the command line. Fix directory names/paths containing corpus data, pickle dataset and model.
1 parent ef26903 commit cb97b64

File tree

19 files changed

+471
-388309
lines changed

19 files changed

+471
-388309
lines changed

Dockerfile

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
## Dockerfile to build the DeepQ&A container image

FROM python:3.5.2

# MAINTAINER is deprecated; LABEL is the supported replacement.
LABEL maintainer="rbi"

# CPU-only TensorFlow 0.9.0 wheel for Python 3.5 (overridable at build time).
ARG TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl

# SECURITY(review): a secret baked into the image is recoverable by anyone who
# can pull the image — prefer supplying it at run time (docker run -e ...).
ENV CHATBOT_SECRET_KEY="my-secret-key"

## Dependencies (single layer: apt + pip + NLTK tokenizer data)
RUN apt-get -qq -y update && apt-get -y install \
        unzip && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install -U $TF_BINARY_URL \
        nltk \
        tqdm \
        django \
        asgi_redis \
        channels && \
    python3 -m nltk.downloader punkt

COPY ./ /root/DeepQA

## Run config
EXPOSE 8000
COPY docker/settings.py /root/DeepQA/chatbot_website/chatbot_website/
COPY docker/chatbot.sh /root/DeepQA/chatbot_website/
WORKDIR /root/DeepQA/chatbot_website
# Exec form avoids wrapping the command in an extra /bin/sh process.
CMD ["bash", "chatbot.sh"]

chatbot/chatbot.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -62,7 +64,7 @@ def __init__(self):
6264
self.sess = None
6365

6466
# Filename and directories constants
65-
self.MODEL_DIR_BASE = 'save/model'
67+
self.MODEL_DIR_BASE = 'model'
6668
self.MODEL_NAME_BASE = 'model'
6769
self.MODEL_EXT = '.ckpt'
6870
self.CONFIG_FILENAME = 'params.ini'
@@ -222,12 +224,20 @@ def mainTrain(self, sess):
222224

223225
batches = self.textData.getBatches()
224226

227+
#print ("Printing batches: \n")
228+
#print (batches)
229+
#exit()
225230
# TODO: Also update learning parameters eventually
226231

227232
tic = datetime.datetime.now()
228233
for nextBatch in tqdm(batches, desc="Training"):
229234
# Training pass
230235
ops, feedDict = self.model.step(nextBatch)
236+
#print ("Printing ops: \n")
237+
#print (ops)
238+
#print ("Printing feedDict: \n")
239+
#print (feedDict)
240+
231241
assert len(ops) == 2 # training, loss
232242
_, loss, summary = sess.run(ops + (mergedSummaries,), feedDict)
233243
self.writer.add_summary(summary, self.globStep)
@@ -310,11 +320,11 @@ def mainTestInteractive(self, sess):
310320
continue # Back to the beginning, try again
311321

312322
print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))
313-
323+
314324
if self.args.verbose:
315325
print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
316326
print(self.textData.sequence2str(answer))
317-
327+
318328
print()
319329

320330
def singlePredict(self, question, questionSeq=None):
@@ -395,8 +405,11 @@ def managePreviousModel(self, sess):
395405
if self.args.reset:
396406
fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
397407
for f in fileList:
398-
print('Removing {}'.format(f))
399-
os.remove(f)
408+
if f.endswith(".pkl"):
409+
continue
410+
else:
411+
print('Removing {}'.format(f))
412+
os.remove(f)
400413

401414
else:
402415
print('No previous model found, starting from clean directory: {}'.format(self.modelDir))
@@ -487,7 +500,7 @@ def saveModelParams(self):
487500
config['Network']['hiddenSize'] = str(self.args.hiddenSize)
488501
config['Network']['numLayers'] = str(self.args.numLayers)
489502
config['Network']['embeddingSize'] = str(self.args.embeddingSize)
490-
503+
491504
# Keep track of the learning params (but without restoring them)
492505
config['Training (won\'t be restored)'] = {}
493506
config['Training (won\'t be restored)']['learningRate'] = str(self.args.learningRate)
@@ -529,4 +542,4 @@ def getDevice(self):
529542
return None
530543
else:
531544
print('Warning: Error in the device name: {}, use the default device'.format(self.args.device))
532-
return None
545+
return None

chatbot/cornelldata.py

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,28 +22,31 @@
2022
http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
2123
2224
"""
25+
import os
26+
import tqdm
2327

2428
class CornellData:
2529
"""
2630
2731
"""
28-
29-
def __init__(self, dirName, corpus):
    """Load the conversations of a single corpus file.

    Args:
        dirName (str): directory where to load the corpus from. The corpus
            file name is concatenated, not joined, so dirName is expected to
            end with a path separator — TODO confirm against callers.
        corpus (str): name of the conversation file inside dirName.
    """
    # NOTE(review): loadLines is no longer called, so self.lines stays empty;
    # the attribute is kept for interface compatibility with code reading it.
    self.lines = {}
    self.conversations = []

    MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"]
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"]

    self.conversations = self.loadConversations(dirName + corpus, MOVIE_CONVERSATIONS_FIELDS)

    # TODO: Cleaner program (merge copy-paste) !!
4550
def loadLines(self, fileName, fields):
4651
"""
4752
Args:
@@ -51,7 +56,7 @@ def loadLines(self, fileName, fields):
5156
dict<dict<str>>: the extracted fields for each line
5257
"""
5358
lines = {}
54-
59+
5560
with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb !
5661
for line in f:
5762
values = line.split(" +++$+++ ")
@@ -60,11 +65,14 @@ def loadLines(self, fileName, fields):
6065
lineObj = {}
6166
for i, field in enumerate(fields):
6267
lineObj[field] = values[i]
63-
68+
6469
lines[lineObj['lineID']] = lineObj
6570

71+
#print ("Printing lines (loadLines): \n")
72+
#print (lines)
73+
#exit()
6674
return lines
67-
75+
6876
def loadConversations(self, fileName, fields):
6977
"""
7078
Args:
@@ -73,32 +81,72 @@ def loadConversations(self, fileName, fields):
7381
Return:
7482
dict<dict<str>>: the extracted fields for each line
7583
"""
84+
7685
conversations = []
86+
collected_lines = []
87+
num_convos = 0;
7788

89+
with open(fileName,"r", encoding='iso-8859-1') as f:
90+
for line in f:
91+
if "Start of Convo" in line:
92+
num_convos = num_convos + 1;
93+
94+
with open(fileName, 'r', encoding='iso-8859-1') as fp:
95+
#all_lines = fp.readlines()
96+
for j in tqdm.tqdm(range(num_convos)):
97+
for line in fp:
98+
if "Start of Convo" in line:
99+
#print ("started at line", i)
100+
continue
101+
if "End of Convo" in line:
102+
#print ("end at line", i)
103+
conversations.append(collected_lines)
104+
break
105+
106+
collected_lines.append(line.strip())
107+
108+
if(len(collected_lines) % 2 != 0):
109+
collected_lines.pop()
110+
111+
if (os.path.split(fileName)[1] == "switchboard.txt" or os.path.split(fileName)[1] == "watson_pii.txt"):
112+
for i in range(0, len(collected_lines), 2):
113+
collected_lines[i], collected_lines[i+1] = collected_lines[i+1], collected_lines[i]
114+
115+
#for convo in conversations:
116+
# for sent in convo:
117+
# print (sent)
118+
#exit()
119+
'''
120+
conversations = []
78121
with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb !
79122
for line in f:
80123
values = line.split(" +++$+++ ")
81-
124+
82125
# Extract fields
83126
convObj = {}
84127
for i, field in enumerate(fields):
85128
convObj[field] = values[i]
86-
129+
87130
lineIds = convObj["utteranceIDs"][2:-3].split("', '")
88-
131+
89132
#print(convObj["utteranceIDs"])
90133
#for lineId in lineIds:
91134
#print(lineId, end=' ')
92135
#print()
93-
136+
94137
# Reassemble lines
95138
convObj["lines"] = []
96139
for lineId in lineIds:
97140
convObj["lines"].append(self.lines[lineId])
98-
141+
99142
conversations.append(convObj)
100143
144+
#print ("Printing conversations: \n")
145+
#print (conversations)
146+
#exit()
147+
'''
101148
return conversations
102149

103150
def getConversations(self):
    """Return the list of conversations loaded from the corpus file."""
    return self.conversations

chatbot/model.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python3
2+
13
# Copyright 2015 Conchylicultor. All Rights Reserved.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,7 +31,7 @@ class Model:
2931
Achitecture:
3032
2 LTSM layers
3133
"""
32-
34+
3335
def __init__(self, args, textData):
3436
"""
3537
Args:
@@ -68,6 +70,7 @@ def buildNetwork(self):
6870
#encoDecoCell = tf.nn.rnn_cell.DropoutWrapper(encoDecoCell, input_keep_prob=1.0, output_keep_prob=1.0) # TODO: Custom values (WARNING: No dropout when testing !!!)
6971
encoDecoCell = tf.nn.rnn_cell.MultiRNNCell([encoDecoCell] * self.args.numLayers, state_is_tuple=True)
7072

73+
7174
# Network input (placeholders)
7275

7376
with tf.name_scope('placeholder_encoder'):
@@ -81,7 +84,8 @@ def buildNetwork(self):
8184
# Define the network
8285
# Here we use an embedding model, it takes integer as input and convert them into word vector for
8386
# better word representation
84-
decoderOutputs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(
87+
# decoderOutputs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(
88+
decoderOutputs, states = tf.nn.seq2seq.embedding_attention_seq2seq(
8589
self.encoderInputs, # List<[batch=?, inputDim=1]>, list of size args.maxLength
8690
self.decoderInputs, # For training, we force the correct output (feed_previous=False)
8791
encoDecoCell,
@@ -111,7 +115,7 @@ def buildNetwork(self):
111115
epsilon=1e-08
112116
)
113117
self.optOp = opt.minimize(self.lossFct)
114-
118+
115119
def step(self, batch):
116120
""" Forward/training step operation.
117121
Does not perform run on itself but just return the operators to do so. Those have then to be run
@@ -142,4 +146,4 @@ def step(self, batch):
142146
ops = (self.outputs,)
143147

144148
# Return one pass operator
145-
return ops, feedDict
149+
return ops, feedDict

0 commit comments

Comments
 (0)