Commit 8ed57c1

Merge branch 'master' of github.com:fchollet/keras
2 parents: f6eda66 + 35e10e9

3 files changed, +83 -69 lines

keras/engine/training.py

Lines changed: 50 additions & 69 deletions
@@ -62,88 +62,69 @@ def _standardize_input_data(data, names, shapes=None,
         return []
     if data is None:
         return [None for _ in range(len(names))]
+
     if isinstance(data, dict):
         try:
-            arrays = [data[name].values if data[name].__class__.__name__ == 'DataFrame' else data[name]
-                      for name in names]
-
+            data = [data[x].values if data[x].__class__.__name__ == 'DataFrame' else data[x] for x in names]
+            data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data]
         except KeyError as e:
-            raise ValueError('No data provided for "' +
-                             e.args[0] + '". Need data for each key in: ' +
-                             str(names))
-
+            raise ValueError(
+                'No data provided for "' + e.args[0] + '". Need data '
+                'for each key in: ' + str(names))
     elif isinstance(data, list):
-        arrays = [x.values if x.__class__.__name__ == 'DataFrame' else x for x in data]
-        if len(arrays) != len(names):
-            if arrays and hasattr(arrays[0], 'shape'):
-                raise ValueError('Error when checking model ' +
-                                 exception_prefix +
-                                 ': the list of Numpy arrays '
-                                 'that you are passing to your model '
-                                 'is not the size the model expected. '
-                                 'Expected to see ' + str(len(names)) +
-                                 ' array(s), but instead got '
-                                 'the following list of ' + str(len(arrays)) +
-                                 ' arrays: ' + str(arrays)[:200] +
-                                 '...')
-            else:
-                if len(names) == 1:
-                    arrays = [np.asarray(arrays)]
-                else:
-                    raise ValueError(
-                        'Error when checking model ' +
-                        exception_prefix +
-                        ': you are passing a list as '
-                        'input to your model, '
-                        'but the model expects '
-                        'a list of ' + str(len(names)) +
-                        ' Numpy arrays instead. '
-                        'The list you passed was: ' +
-                        str(arrays)[:200])
+        data = [x.values if x.__class__.__name__ == 'DataFrame' else x for x in data]
+        data = [np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data]
     else:
-        if data.__class__.__name__ == 'DataFrame':
-            # test if data is a DataFrame, without pandas installed
-            data = data.values
-        if not hasattr(data, 'shape'):
-            raise TypeError('Error when checking model ' +
-                            exception_prefix +
-                            ': data should be a Numpy array, '
-                            'or list/dict of Numpy arrays. '
-                            'Found: ' + str(data)[:200] + '...')
-        if len(names) > 1:
-            # Case: model expects multiple inputs but only received
-            # a single Numpy array.
-            raise ValueError('The model expects ' + str(len(names)) + ' ' +
-                             exception_prefix +
-                             ' arrays, but only received one array. '
-                             'Found: array with shape ' + str(data.shape))
-        arrays = [data]
-
-    # Make arrays at least 2D.
-    arrays = [np.expand_dims(array, 1) if array.ndim == 1 else array for array in arrays]
+        data = data.values if data.__class__.__name__ == 'DataFrame' else data
+        data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data]
+
+    if len(data) != len(names):
+        if data and hasattr(data[0], 'shape'):
+            raise ValueError(
+                'Error when checking model ' + exception_prefix +
+                ': the list of Numpy arrays that you are passing to '
+                'your model is not the size the model expected. '
+                'Expected to see ' + str(len(names)) + ' array(s), '
+                'but instead got the following list of ' +
+                str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
+        elif len(names) > 1:
+            raise ValueError(
+                'Error when checking model ' + exception_prefix +
+                ': you are passing a list as input to your model, '
+                'but the model expects a list of ' + str(len(names)) +
+                ' Numpy arrays instead. The list you passed was: ' +
+                str(data)[:200])
+        elif len(data) == 1 and not hasattr(data[0], 'shape'):
+            raise TypeError(
+                'Error when checking model ' + exception_prefix +
+                ': data should be a Numpy array, or list/dict of '
+                'Numpy arrays. Found: ' + str(data)[:200] + '...')
+        elif len(names) == 1:
+            data = [np.asarray(data)]
 
     # Check shapes compatibility.
     if shapes:
-        start = 0 if check_batch_axis else 1
         for i in range(len(names)):
             if shapes[i] is not None:
-                array_shape = arrays[i].shape
-                if arrays[i].ndim != len(shapes[i]):
-                    raise ValueError('Error when checking ' + exception_prefix +
-                                     ': expected ' + names[i] +
-                                     ' to have ' + str(len(shapes[i])) +
-                                     ' dimensions, but got array with shape ' +
-                                     str(array_shape))
-
-                for dim, ref_dim in zip(array_shape[start:], shapes[i][start:]):
+                data_shape = data[i].shape
+                shape = shapes[i]
+                if data[i].ndim != len(shape):
+                    raise ValueError(
+                        'Error when checking ' + exception_prefix +
+                        ': expected ' + names[i] + ' to have ' +
+                        str(len(shape)) + ' dimensions, but got array '
+                        'with shape ' + str(data_shape))
+                if not check_batch_axis:
+                    data_shape = data_shape[1:]
+                    shape = shape[1:]
+                for dim, ref_dim in zip(data_shape, shape):
                     if ref_dim != dim and ref_dim:
                         raise ValueError(
                             'Error when checking ' + exception_prefix +
-                            ': expected ' + names[i] +
-                            ' to have shape ' + str(shapes[i]) +
-                            ' but got array with shape ' +
-                            str(array_shape))
-    return arrays
+                            ': expected ' + names[i] + ' to have shape ' +
+                            str(shape) + ' but got array with shape ' +
+                            str(data_shape))
+    return data
 
 
 def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
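
For orientation, a minimal sketch of what the consolidated helper now does (an illustration, not part of the commit: _standardize_input_data is a private Keras helper at this point in the tree, and the input name 'dense_input' is an assumption). A dict, a list, or a bare 1-D array all come back as the same list of 2-D Numpy arrays:

    import numpy as np
    from keras.engine.training import _standardize_input_data  # private helper, as of this commit

    x = np.array([1.0, 2.0, 3.0])  # a 1-D input vector
    names = ['dense_input']        # hypothetical model input name

    # Each accepted payload type now funnels through the same list-of-arrays path:
    for payload in (x, [x], {'dense_input': x}):
        out = _standardize_input_data(payload, names)
        assert out[0].shape == (3, 1)  # 1-D arrays are expanded to 2-D in every branch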

keras/preprocessing/text.py

Lines changed: 13 additions & 0 deletions
@@ -124,6 +124,8 @@ class Tokenizer(object):
         lower: boolean. Whether to convert the texts to lowercase.
         split: character or string to use for token splitting.
         char_level: if True, every character will be treated as a token.
+        oov_token: if given, it will be added to word_index and used to
+            replace out-of-vocabulary words during texts_to_sequences calls
 
     By default, all punctuation is removed, turning the texts into
     space-separated sequences of words
@@ -138,6 +140,7 @@ def __init__(self, num_words=None,
                  lower=True,
                  split=' ',
                  char_level=False,
+                 oov_token=None,
                  **kwargs):
         # Legacy support
         if 'nb_words' in kwargs:
@@ -155,6 +158,7 @@ def __init__(self, num_words=None,
         self.num_words = num_words
         self.document_count = 0
         self.char_level = char_level
+        self.oov_token = oov_token
 
     def fit_on_texts(self, texts):
         """Updates internal vocabulary based on a list of texts.
@@ -189,6 +193,11 @@ def fit_on_texts(self, texts):
         # note that index 0 is reserved, never assigned to an existing word
         self.word_index = dict(list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
 
+        if self.oov_token is not None:
+            i = self.word_index.get(self.oov_token)
+            if i is None:
+                self.word_index[self.oov_token] = len(self.word_index) + 1
+
         self.index_docs = {}
         for w, c in list(self.word_docs.items()):
             self.index_docs[self.word_index[w]] = c
@@ -256,6 +265,10 @@ def texts_to_sequences_generator(self, texts):
                     continue
                 else:
                     vect.append(i)
+            elif self.oov_token is not None:
+                i = self.word_index.get(self.oov_token)
+                if i is not None:
+                    vect.append(i)
             yield vect
 
     def texts_to_matrix(self, texts, mode='binary'):
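
A short usage sketch of the new flag (the toy corpus is an assumption, not from the commit; behavior is as of this change, where the OOV token is appended after the known vocabulary). Unseen words now map to the token's index instead of silently disappearing:

    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(['the cat sat on the mat'])   # assumed toy corpus

    oov_id = tokenizer.word_index['<unk>']               # appended after the known words
    seqs = tokenizer.texts_to_sequences(['the dog sat']) # 'dog' was never seen

    assert len(seqs[0]) == 3     # nothing is dropped from the sequence
    assert seqs[0][1] == oov_id  # 'dog' is replaced by the OOV index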

tests/keras/preprocessing/text_test.py

Lines changed: 20 additions & 0 deletions
@@ -67,5 +67,25 @@ def test_tokenizer_unicode():
     assert len(tokenizer.word_counts) == 5
 
 
+def test_tokenizer_oov_flag():
+    """
+    Test of Out of Vocabulary (OOV) flag in Tokenizer
+    """
+    x_train = ['This text has only known words']
+    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
+
+    # Default, without OOV flag
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+
+    # With OOV feature
+    tokenizer = Tokenizer(oov_token='<unk>')
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
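
Tying the test back to the fit_on_texts change above, a quick check of where the token lands (same corpus as the test; note this placement is specific to this commit, as later Keras releases moved the OOV token to a reserved low index):

    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(['This text has only known words'])  # 6 distinct words

    # The known vocabulary occupies indices 1-6, so the OOV token is
    # appended at len(word_index) + 1 = 7.
    assert tokenizer.word_index['<unk>'] == 7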
