Commit 346d5f1

Limit vocabulary size
1 parent 4364b39 commit 346d5f1

2 files changed: +44 -11 lines

chatbot/chatbot.py

Lines changed: 12 additions & 1 deletion
@@ -114,6 +114,9 @@ def parseArgs(args):
         datasetArgs.add_argument('--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset')  # Not implemented, useless ?
         datasetArgs.add_argument('--maxLength', type=int, default=10, help='maximum length of the sentence (for input and output), define number of maximum step of the RNN')
         datasetArgs.add_argument('--filterVocab', type=int, default=1, help='remove rarelly used words (by default words used only once). 0 to keep all words.')
+        datasetArgs.add_argument('--increaseTrainingPairs', type=bool, default=False, help='Use every line in the dataset as both input and target, thus multiplying by two the training set.')
+        datasetArgs.add_argument('--vocabularySize', type=int, default=40000, help='Limit the number of words in the vocabulary')
+

         # Network options (Warning: if modifying something here, also make the change on save/loadParams() )
         nnArgs = parser.add_argument_group('Network options', 'architecture related option')
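The two new dataset options follow the existing argparse pattern. Below is a minimal, self-contained sketch (not the project's actual parser) of how such flags parse; it uses action='store_true' for the boolean switch because argparse's type=bool converts any non-empty string, including 'False', to True:

import argparse

# Standalone sketch only; the option names mirror the diff, the parser itself is illustrative.
parser = argparse.ArgumentParser()
datasetArgs = parser.add_argument_group('Dataset options')
datasetArgs.add_argument('--vocabularySize', type=int, default=40000,
                         help='limit the number of words kept in the vocabulary')
datasetArgs.add_argument('--increaseTrainingPairs', action='store_true',
                         help='use every line as both input and target')

args = parser.parse_args(['--vocabularySize', '20000', '--increaseTrainingPairs'])
print(args.vocabularySize, args.increaseTrainingPairs)  # 20000 True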
@@ -543,6 +546,9 @@ def loadModelParams(self):
         self.args.datasetTag = config['Dataset'].get('datasetTag')
         self.args.maxLength = config['Dataset'].getint('maxLength')  # We need to restore the model length because of the textData associated and the vocabulary size (TODO: Compatibility mode between different maxLength)
         self.args.filterVocab = config['Dataset'].getint('filterVocab')
+        self.args.increaseTrainingPairs = config['Dataset'].getboolean('increaseTrainingPairs')
+        self.args.vocabularySize = config['Dataset'].getint('vocabularySize')
+

         self.args.hiddenSize = config['Network'].getint('hiddenSize')
         self.args.numLayers = config['Network'].getint('numLayers')
@@ -564,6 +570,8 @@ def loadModelParams(self):
         print('datasetTag: {}'.format(self.args.datasetTag))
         print('maxLength: {}'.format(self.args.maxLength))
         print('filterVocab: {}'.format(self.args.filterVocab))
+        print('increaseTrainingPairs: {}'.format(self.args.increaseTrainingPairs))
+        print('vocabularySize: {}'.format(self.args.vocabularySize))
         print('hiddenSize: {}'.format(self.args.hiddenSize))
         print('numLayers: {}'.format(self.args.numLayers))
         print('softmaxSamples: {}'.format(self.args.softmaxSamples))
@@ -596,7 +604,10 @@ def saveModelParams(self):
         config['Dataset']['datasetTag'] = str(self.args.datasetTag)
         config['Dataset']['maxLength'] = str(self.args.maxLength)
         config['Dataset']['filterVocab'] = str(self.args.filterVocab)
-
+        config['Dataset']['increaseTrainingPairs'] = str(self.args.increaseTrainingPairs)
+        config['Dataset']['vocabularySize'] = str(self.args.vocabularySize)
+
+
         config['Network'] = {}
         config['Network']['hiddenSize'] = str(self.args.hiddenSize)
         config['Network']['numLayers'] = str(self.args.numLayers)
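saveModelParams() persists the new values as strings and loadModelParams() restores them through configparser's typed getters. A minimal round-trip sketch, assuming an illustrative params.ini file name:

import configparser

# Write the parameters the same way saveModelParams() does: everything as str().
config = configparser.ConfigParser()
config['Dataset'] = {}
config['Dataset']['vocabularySize'] = str(40000)
config['Dataset']['increaseTrainingPairs'] = str(False)
with open('params.ini', 'w') as configFile:  # hypothetical file name
    config.write(configFile)

# Read them back with the typed getters used in loadModelParams().
restored = configparser.ConfigParser()
restored.read('params.ini')
print(restored['Dataset'].getint('vocabularySize'))             # 40000
print(restored['Dataset'].getboolean('increaseTrainingPairs'))  # False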

chatbot/textdata.py

File mode changed from 100644 to 100755
Lines changed: 32 additions & 10 deletions
@@ -26,6 +26,7 @@
 import random
 import string
 from collections import OrderedDict
+import collections

 from chatbot.corpus.cornelldata import CornellData
 from chatbot.corpus.opensubsdata import OpensubsData
@@ -77,9 +78,10 @@ def __init__(self, args):
         self.corpusDir = os.path.join(self.args.rootDir, 'data', self.args.corpus)
         basePath = self._constructBasePath()
         self.fullSamplesPath = basePath + '.pkl'  # Full sentences length/vocab
-        self.filteredSamplesPath = basePath + '-lenght{}-filter{}.pkl'.format(
+        self.filteredSamplesPath = basePath + '-lenght{}-filter{}-vocabSize{}.pkl'.format(
             self.args.maxLength,
             self.args.filterVocab,
+            self.args.vocabularySize,
         )  # Sentences/vocab filtered for this model

         self.padToken = -1  # Padding
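Adding the vocabulary size to the filtered-samples file name keys the preprocessing cache on the new parameter, so runs with different --vocabularySize values do not reuse each other's pickle. A toy illustration with a made-up base path:

basePath = 'data/samples/dataset-cornell'  # made-up base path
maxLength, filterVocab, vocabularySize = 10, 1, 40000
filteredSamplesPath = basePath + '-lenght{}-filter{}-vocabSize{}.pkl'.format(
    maxLength, filterVocab, vocabularySize)
print(filteredSamplesPath)  # data/samples/dataset-cornell-lenght10-filter1-vocabSize40000.pkl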
@@ -366,20 +368,25 @@ def mergeSentences(sentences, fromEnd=False):
         }
         new_mapping = {}  # Map the full words ids to the new one (TODO: Should be a list)
         newId = 0
+
+        print("Filtering dataset with vocabSize={} and wordCount > {}".format(self.args.vocabularySize, self.args.filterVocab))
+        word_counter = collections.Counter(self.idCount)
+        selected_word_ids = word_counter.most_common(self.args.vocabularySize)
+        selected_word_ids = {k: v for k, v in selected_word_ids if v > self.args.filterVocab}
+
         for wordId, count in [(i, self.idCount[i]) for i in range(len(self.idCount))]:  # Iterate in order
-            if (count <= self.args.filterVocab and
-                    wordId not in specialTokens):  # Cadidate to filtering (Warning: don't filter special token)
-                new_mapping[wordId] = self.unknownToken
-                del self.word2id[self.id2word[wordId]]  # The word isn't used anymore
-                del self.id2word[wordId]
-            else:  # Update the words ids
+            if wordId in selected_word_ids or wordId in specialTokens:  # Update the word id
                 new_mapping[wordId] = newId
                 word = self.id2word[wordId]  # The new id has changed, update the dictionaries
                 del self.id2word[wordId]  # Will be recreated if newId == wordId
                 self.word2id[word] = newId
                 self.id2word[newId] = word
                 newId += 1
-
+            else:  # Not in our list nor special, map it to unknownToken
+                new_mapping[wordId] = self.unknownToken
+                del self.word2id[self.id2word[wordId]]  # The word isn't used anymore
+                del self.id2word[wordId]
+
         # Last step: replace old ids by new ones and filters empty sentences
         def replace_words(words):
             valid = False  # Filter empty sequences
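The selection step above boils down to collections.Counter.most_common() plus the existing count threshold. A standalone sketch with toy stand-ins for idCount, vocabularySize, filterVocab and unknownToken (special tokens left out for brevity):

import collections

idCount = {0: 50, 1: 3, 2: 1, 3: 7, 4: 1}  # wordId -> occurrence count (toy data)
vocabularySize = 3
filterVocab = 1
unknownToken = -2

# Keep the N most frequent ids, then drop those seen filterVocab times or fewer.
selected = dict(collections.Counter(idCount).most_common(vocabularySize))
selected = {k: v for k, v in selected.items() if v > filterVocab}

# Remap kept ids to a compact range; everything else maps to the unknown token.
new_mapping = {}
newId = 0
for wordId in sorted(idCount):
    if wordId in selected:
        new_mapping[wordId] = newId
        newId += 1
    else:
        new_mapping[wordId] = unknownToken

print(new_mapping)  # {0: 0, 1: 1, 2: -2, 3: 2, 4: -2}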
@@ -390,15 +397,25 @@ def replace_words(words):
             return valid

         self.trainingSamples.clear()
+        self.idCount.clear()  # Let's recreate idCount
+
         for inputWords, targetWords in tqdm(newSamples, desc='Replace ids:', leave=False):
             valid = True
             valid &= replace_words(inputWords)
             valid &= replace_words(targetWords)
+            valid &= targetWords.count(self.unknownToken) == 0  # Filter target with out-of-vocabulary target words

             if valid:
                 self.trainingSamples.append([inputWords, targetWords])  # TODO: Could replace list by tuple
+                # Recreate idCount
+                for wordId in inputWords + targetWords:
+                    if wordId in self.idCount:
+                        self.idCount[wordId] = self.idCount[wordId] + 1
+                    else:
+                        self.idCount[wordId] = 1
+        print("Final vocabulary size of", len(self.word2id) - len(specialTokens))
+

-        self.idCount.clear()  # Not usefull anymore. Free data


     def createFullCorpus(self, conversations):
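The second pass above drops any sample whose target still contains the unknown token and rebuilds the word counts from the sentences that survive. A toy illustration (unknownToken and the sample list are made up):

import collections

unknownToken = -2
samples = [
    ([1, 2], [3, 4]),   # kept
    ([1, -2], [3]),     # kept: unknown word only on the input side
    ([1], [2, -2]),     # dropped: unknown word in the target
]

trainingSamples = []
idCount = collections.defaultdict(int)
for inputWords, targetWords in samples:
    if targetWords.count(unknownToken) == 0:
        trainingSamples.append([inputWords, targetWords])
        for wordId in inputWords + targetWords:
            idCount[wordId] += 1

print(len(trainingSamples))  # 2
print(dict(idCount))         # {1: 2, 2: 1, 3: 2, 4: 1, -2: 1}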
@@ -424,9 +441,14 @@ def extractConversation(self, conversation):
         Args:
             conversation (Obj): a conversation object containing the lines to extract
         """
+
+        if self.args.increaseTrainingPairs:
+            step = 1
+        else:
+            step = 2

         # Iterate over all the lines of the conversation
-        for i in tqdm_wrap(range(len(conversation['lines']) - 1),  # We ignore the last line (no answer for it)
+        for i in tqdm_wrap(range(0, len(conversation['lines']) - 1, step),  # We ignore the last line (no answer for it)
                            desc='Conversation', leave=False):
             inputLine = conversation['lines'][i]
             targetLine = conversation['lines'][i+1]
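The step logic in extractConversation pairs line i with line i+1; with --increaseTrainingPairs the window advances one line at a time so each line serves as both a target and the next input, roughly doubling the number of pairs. A toy illustration of the two modes:

def extract_pairs(lines, increaseTrainingPairs):
    # Mirrors the step selection above on a plain list of lines (toy stand-in).
    step = 1 if increaseTrainingPairs else 2
    return [(lines[i], lines[i + 1]) for i in range(0, len(lines) - 1, step)]

lines = ['A', 'B', 'C', 'D', 'E']
print(extract_pairs(lines, increaseTrainingPairs=True))   # [('A', 'B'), ('B', 'C'), ('C', 'D'), ('D', 'E')]
print(extract_pairs(lines, increaseTrainingPairs=False))  # [('A', 'B'), ('C', 'D')]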
