
Commit d5ec5c1

Author: Raul Puri
Commit message: starting licensing update
Parent: 185847b

8 files changed: +216 −28 lines

LICENSE

Lines changed: 36 additions & 1 deletion

@@ -22,4 +22,39 @@
 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------ LICENSE FOR fairseq(transformer) repository --------------------
+
+
+BSD License
+
+For fairseq software
+
+Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

arguments.py

Lines changed: 8 additions & 0 deletions

@@ -1,3 +1,11 @@
+###############################################################################
+# BSD 3-Clause License
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Author & Contact: Raul Puri ([email protected])
+###############################################################################
+
 from configure_data import configure_data

 def add_general_args(parser):

configure_data.py

Lines changed: 24 additions & 22 deletions

@@ -120,48 +120,50 @@ def get_split(opt):
 def configure_data(parser):
     """add cmdline flags for configuring datasets"""
     main_parser = parser
-    parser = parser.add_argument_group('data options')
-    parser.add_argument('--data', nargs='+', default=['./data/imdb/unsup.json'],
+    group = parser.add_argument_group('data options')
+    group.add_argument('--data', nargs='+', default=['./data/imdb/unsup.json'],
                         help="""Filename for training""")
-    parser.add_argument('--valid', nargs='*', default=None,
+    group.add_argument('--valid', nargs='*', default=None,
                         help="""Filename for validation""")
-    parser.add_argument('--test', nargs='*', default=None,
+    group.add_argument('--test', nargs='*', default=None,
                         help="""Filename for testing""")
-    parser.add_argument('--process-fn', type=str, default='process_str', choices=['process_str', 'process_tweet'],
+    group.add_argument('--process-fn', type=str, default='process_str', choices=['process_str', 'process_tweet'],
                         help='what preprocessing function to use to process text. One of [process_str, process_tweet].')
-    parser.add_argument('--batch-size', type=int, default=128,
+    group.add_argument('--batch-size', type=int, default=128,
                         help='Data Loader batch size')
-    parser.add_argument('--eval-batch-size', type=int, default=0,
+    group.add_argument('--eval-batch-size', type=int, default=0,
                         help='Data Loader batch size for evaluation datasets')
-    parser.add_argument('--data-size', type=int, default=256,
+    group.add_argument('--data-size', type=int, default=256,
                         help='number of tokens in data')
-    parser.add_argument('--loose-json', action='store_true',
+    group.add_argument('--loose-json', action='store_true',
                         help='Use loose json (one json-formatted string per newline), instead of tight json (data file is one json string)')
-    parser.add_argument('--preprocess', action='store_true',
+    group.add_argument('--preprocess', action='store_true',
                         help='force preprocessing of datasets')
-    parser.add_argument('--delim', default=',',
+    group.add_argument('--delim', default=',',
                         help='delimiter used to parse csv testfiles')
-    parser.add_argument('--non-binary-cols', nargs='*', default=None,
+    group.add_argument('--non-binary-cols', nargs='*', default=None,
                         help='labels for columns to non-binary dataset [only works for csv datasets]')
-    parser.add_argument('--split', default='1.',
+    group.add_argument('--split', default='1.',
                         help='comma-separated list of proportions for training, validation, and test split')
-    parser.add_argument('--text-key', default='sentence',
+    group.add_argument('--text-key', default='sentence',
                         help='key to use to extract text from json/csv')
-    parser.add_argument('--label-key', default='label',
+    group.add_argument('--label-key', default='label',
                         help='key to use to extract labels from json/csv')
-    parser.add_argument('--eval-text-key', default=None,
+    group.add_argument('--eval-text-key', default=None,
                         help='key to use to extract text from json/csv evaluation datasets')
-    parser.add_argument('--eval-label-key', default=None,
+    group.add_argument('--eval-label-key', default=None,
                         help='key to use to extract labels from json/csv evaluation datasets')
     # tokenizer arguments
-    parser.add_argument('--tokenizer-type', type=str, default='CharacterLevelTokenizer', choices=['CharacterLevelTokenizer', 'SentencePieceTokenizer'],
+    group.add_argument('--tokenizer-type', type=str, default='CharacterLevelTokenizer', choices=['CharacterLevelTokenizer', 'SentencePieceTokenizer'],
                         help='what type of tokenizer to use')
-    parser.add_argument('--tokenizer-model-type', type=str, default='bpe', choices=['bpe', 'char', 'unigram', 'word'],
+    group.add_argument('--tokenizer-model-type', type=str, default='bpe', choices=['bpe', 'char', 'unigram', 'word'],
                         help='Model type to use for sentencepiece tokenization')
-    parser.add_argument('--vocab-size', type=int, default=256,
+    group.add_argument('--vocab-size', type=int, default=256,
                         help='vocab size to use for non-character-level tokenization')
-    parser.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
+    group.add_argument('--tokenizer-path', type=str, default='tokenizer.model',
                         help='path used to save/load sentencepiece tokenization models')
+    # These are options that are relevant to data loading functionality, but are not meant to be exposed to the command line user.
+    # These options are intended to be set in code by specific scripts.
     defaults = {
         'world_size': 1,
         'rank': -1,

@@ -174,4 +176,4 @@ def configure_data(parser):
         'eval_seq_length': 256,
         'samples_per_shard': 1000
     }
-    return DataConfig(main_parser, defaults=defaults), parser
+    return DataConfig(main_parser, defaults=defaults), group
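
The rename from parser to group in this diff fixes a subtle shadowing bug: add_argument_group returns a group object, not a parser, so the old code's final `return ..., parser` handed back the group under a misleading name. A minimal standalone sketch of the pattern, with hypothetical flags rather than this repository's real options:

import argparse

parser = argparse.ArgumentParser()
# A group only affects --help layout; parsed values still land in one namespace.
group = parser.add_argument_group('data options')
group.add_argument('--data', default='unsup.json', help='training file')
group.add_argument('--batch-size', type=int, default=128, help='batch size')

args = parser.parse_args(['--batch-size', '64'])
print(args.data, args.batch_size)  # unsup.json 64

Returning the group, as the new code does, lets callers keep attaching related flags to the same section of the help text.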

model/checkpoint.py

Lines changed: 8 additions & 0 deletions

@@ -1,3 +1,11 @@
+###############################################################################
+# BSD 3-Clause License
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Author & Contact: Raul Puri ([email protected])
+###############################################################################
+
 from __future__ import absolute_import, division, print_function, unicode_literals
 import torch
 import warnings

model/sentiment_classifier_old.py

Lines changed: 107 additions & 0 deletions

@@ -0,0 +1,107 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+import numpy as np
+from model import RNNFeaturizer
+
+class BinaryClassifier(nn.Module):
+    def __init__(self, num_features=4096):
+        super().__init__()
+
+        self.dense0 = nn.Linear(num_features, 1)
+        self.neurons = None
+
+    def forward(self, X, **kwargs):
+        return torch.sigmoid(self.linear(X)).float()
+        #return F.sigmoid(self.linear(X), dim=-1).float()
+
+    def linear(self, X):
+        weight = self.dense0.weight
+        if self.neurons is not None:
+            #weight = weight[torch.arange(weight.size(0)).unsqueeze(1), self.neurons].contiguous()
+            weight = weight[:, self.neurons].contiguous()
+            if X.size(-1) == self.dense0.weight.size(-1):
+                X = X[:, self.neurons].contiguous()
+            torch.cuda.synchronize()
+        return F.linear(X, weight, self.dense0.bias)
+
+    def set_neurons(self, num_neurons=None):
+        if num_neurons is None:
+            self.neurons = None
+            return self.get_neurons()
+        neurons, values = self.get_neurons(num_neurons=num_neurons)
+        self.neurons = neurons
+        return neurons, values
+
+    def get_neurons(self, num_neurons=None):
+        if num_neurons is None:
+            return self.dense0.weight
+        values, neurons = torch.topk(self.dense0.weight.abs().float(), num_neurons, 1)
+        neurons = neurons[0]
+        values = self.dense0.weight[:, neurons]
+        return neurons, values
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        sd = self.dense0.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
+        sd['neurons'] = self.neurons
+        return sd
+
+    def load_state_dict(self, state_dict, strict=True):
+        if 'neurons' in state_dict:
+            self.neurons = state_dict['neurons']
+
+        sd = {}
+        for k, v in state_dict.items():
+            if k != 'neurons':
+                sd[k] = v
+
+        self.dense0.load_state_dict(sd, strict=strict)
+
+
+class SentimentClassifier(nn.Module):
+    """Container module with an encoder, a recurrent module, and a decoder."""
+
+    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, all_layers=False):
+        super().__init__()
+        self.encoder = RNNFeaturizer(rnn_type, ntoken, ninp, nhid, nlayers, dropout=dropout, all_layers=all_layers)
+        self.classifier = BinaryClassifier(num_features=self.encoder.output_size)
+
+        self.neurons_ = None
+
+    def forward(self, input, seq_len=None, get_hidden=False):
+        self.encoder.rnn.reset_hidden(input.size(1))
+        hidden = self.encoder(input, seq_len=seq_len, get_hidden=get_hidden)
+        if get_hidden:
+            hidden = hidden[0]
+        if self.neurons is not None:
+            hidden = hidden[:, self.neurons].contiguous()
+        return self.classifier(hidden)
+
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        sd = {}
+        sd['encoder'] = self.encoder.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
+        sd['classifier'] = self.classifier.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
+        return sd
+
+    def load_state_dict(self, state_dict, strict=True):
+        self.encoder.load_state_dict(state_dict['encoder'], strict=strict)
+        self.classifier.load_state_dict(state_dict['classifier'], strict=strict)
+        self.neurons = self.classifier.neurons
+
+    def get_neurons(self, **kwargs):
+        return self.classifier.get_neurons(**kwargs)
+
+    def set_neurons(self, num_neurons=None):
+        rtn = self.classifier.set_neurons(num_neurons=num_neurons)
+        self.neurons_ = self.classifier.neurons
+        return rtn
+
+    @property
+    def neurons(self):
+        return self.neurons_
+
+    @neurons.setter
+    def neurons(self, val):
+        self.neurons_ = val
+        self.classifier.neurons = val
model/transformer.py

Lines changed: 13 additions & 0 deletions

@@ -1,3 +1,16 @@
+###############################################################################
+# BSD 3-Clause License
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Copyright (c) 2017, Facebook, Inc. All rights reserved.
+###############################################################################
+'''
+Code adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/transformer.py
+Introduced optimal gradient checkpointing for intermediate layers
+'''
+
+
 import math
 import torch
 import torch.nn as nn
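
The "optimal gradient checkpointing" noted in this header trades compute for memory: activations of intermediate layers are dropped in the forward pass and recomputed during backward. A minimal sketch of the general technique using stock torch.utils.checkpoint (illustrative only; not necessarily the exact scheme this file implements):

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedStack(nn.Module):
    def __init__(self, dim=512, nlayers=6):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(nlayers))

    def forward(self, x):
        for layer in self.layers:
            # Activations inside `layer` are recomputed on backward instead of stored.
            x = checkpoint(layer, x)
        return x

x = torch.randn(4, 512, requires_grad=True)  # input must require grad for gradients to flow through checkpoints
CheckpointedStack()(x).sum().backward()

Checkpointing every sqrt(n)-th layer rather than every layer is the classic O(sqrt(n))-memory optimum, which is presumably what "optimal" refers to here.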

model/transformer_utils.py

Lines changed: 12 additions & 0 deletions

@@ -1,3 +1,15 @@
+###############################################################################
+# BSD 3-Clause License
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Copyright (c) 2017, Facebook, Inc. All rights reserved.
+###############################################################################
+'''
+Code adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/transformer.py
+Introduced optimal gradient checkpointing for intermediate layers in ./transformer.py
+'''
+
 import math
 import torch
 import torch.nn as nn

transfer.py

Lines changed: 8 additions & 5 deletions

@@ -68,14 +68,17 @@ def get_model(args):
     sd = sd['sd']
     if 'lm_encoder' in sd:
         sd = sd['lm_encoder']
-
     try:
         model.load_state_dict(sd)
     except:
         # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
-        apply_weight_norm(model)
+        if hasattr(model, 'rnn'):
+            apply_weight_norm(model.rnn)
+        else:
+            apply_weight_norm(model)
         model.load_state_dict(sd)
         remove_weight_norm(model)
+
     return model

 def transform(model, text, args):

@@ -109,7 +112,7 @@ def get_outs(text_batch, length_batch):
     if args.model.lower() == 'transformer' or args.model.lower() == 'bert':
         cell_out, lm_or_encoder_out = model(text_batch, length_batch, args.get_hidden)
     else:
-        model.lm_encoder.rnn.reset_hidden(args.batch_size)
+        model.rnn.reset_hidden(args.batch_size)
         for _ in range(1 + args.num_hidden_warmup):
             cell_out, lm_or_encoder_out = model(text_batch, length_batch, args.get_hidden)
     return cell_out, lm_or_encoder_out

@@ -369,11 +372,11 @@ def main():
     clf_sd = {'weight': torch.from_numpy(logreg_model.coef_).half(), 'bias': torch.from_numpy(logreg_model.intercept_).half()}
     sd['classifier'] = clf_sd
     model.float().cpu()
-    sd['encoder'] = model.state_dict()
+    sd['lm_encoder'] = model.state_dict()
     with open(os.path.join(save_root, 'classifier.pt'), 'wb') as f:
         torch.save(sd, f)
     model.half()
-    sd['encoder'] = model.state_dict()
+    sd['lm_encoder'] = model.state_dict()
     with open(os.path.join(save_root, 'classifier.pt.16'), 'wb') as f:
         torch.save(sd, f)
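
The fallback in the first hunk exists because a checkpoint saved from a weight-normalized model stores weight_g/weight_v parameters instead of weight, so a plain load_state_dict raises; the fix additionally scopes the reparameterization to model.rnn when that submodule exists. A small sketch of the underlying pattern with PyTorch's built-in utilities (this repository's apply_weight_norm/remove_weight_norm helpers are presumably recursive wrappers over these):

import torch.nn as nn

model = nn.Linear(4, 4)
# Simulate a checkpoint saved under weight norm: its keys are bias, weight_g, weight_v.
sd = nn.utils.weight_norm(nn.Linear(4, 4)).state_dict()

try:
    model.load_state_dict(sd)            # fails: 'weight' missing, weight_g/v unexpected
except RuntimeError:
    nn.utils.weight_norm(model)          # reparameterize so the key sets match,
    model.load_state_dict(sd)            # load the normalized parameters,
    nn.utils.remove_weight_norm(model)   # then fold them back into a plain weight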
