Added samples & docs for BinaryClassification.StochasticGradientDescent #2688


Merged: 6 commits, Feb 25, 2019
Commit: Added samples & docs for BinaryClassification.StochasticGradientDescent, plus a bunch of typo fixes.
Shahab Moradi committed Feb 21, 2019
commit c3aebc75022e1b01d93a6328917883359f28551a
@@ -0,0 +1,47 @@
using Microsoft.ML;

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
Ivanidzo4ka (Contributor), Feb 22, 2019

> .Trainers.BinaryClassification

It's much easier to call StochasticGradientDescent.Example in Program.cs if the namespace is truncated. #Resolved

Author

The pattern I've been using is to have the class name be the same as the API name, in this case StochasticGradientDescent. In many cases we have the same trainer in multiple catalogs, so we either have to keep the namespaces distinct or change the class names here to StochasticGradientDescentBinary or StochasticGradientDescentBinaryClassification. I prefer to use the namespaces and mirror the catalog structure. Does that make sense?

In reply to: 259442659
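The namespace-mirroring pattern described in this reply can be sketched as follows. This is an illustrative, standalone example; the namespace and method bodies here are assumptions, not code from the PR.

```csharp
// Sketch: the same class name can live in namespaces that mirror the
// catalog structure, so call sites disambiguate by namespace rather
// than by suffixed class names like StochasticGradientDescentBinary.
namespace Samples.Trainers.BinaryClassification
{
    public static class StochasticGradientDescent
    {
        public static string Example() => "binary classification sample";
    }
}

namespace Samples.Trainers.Regression
{
    public static class StochasticGradientDescent
    {
        public static string Example() => "regression sample";
    }
}

public static class Program
{
    public static void Main()
    {
        // Fully qualified names keep the two samples distinct.
        System.Console.WriteLine(
            Samples.Trainers.BinaryClassification.StochasticGradientDescent.Example());
        System.Console.WriteLine(
            Samples.Trainers.Regression.StochasticGradientDescent.Example());
    }
}
```

The trade-off is exactly the one debated above: callers in Program.cs must use (or import) the longer namespace, but class names stay short and match the catalog API.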

{
public static class StochasticGradientDescent
{
// In this example we will use the adult income dataset. The goal is to predict
// if a person's income is above $50K or not, based on different pieces of information about that person.
rogancarr (Contributor), Feb 22, 2019

> based on different pieces

"demographic information" #Resolved

Author

That's better wording; I changed it to that.

In reply to: 259450893

// For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
rogancarr (Contributor), Feb 22, 2019

> https://archive.ics.uci.edu/ml/datasets/adult

Broken link. Or the site is down right now... #Resolved

Author

I think they're just temporarily down. This is a prominent link; it's the first Google result for "uci adult".

In reply to: 259450232

public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);

// Leave out 10% of data for testing.
var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent();

// Fit this pipeline to the training data.
var model = pipeline.Fit(trainTestData.TrainSet);

// Evaluate how the model is doing on the test data.
var dataWithPredictions = model.Transform(trainTestData.TestSet);
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// Accuracy: 0.85
// AUC: 0.90
// F1 Score: 0.67
// Negative Precision: 0.90
// Negative Recall: 0.91
// Positive Precision: 0.68
// Positive Recall: 0.65
// LogLoss: 0.48
// LogLossReduction: 38.31
// Entropy: 0.78
}
}
}
@@ -0,0 +1,56 @@
using Microsoft.ML;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
{
public static class StochasticGradientDescentWithOptions
{
// In this example we will use the adult income dataset. The goal is to predict
// if a person's income is above $50K or not, based on different pieces of information about that person.
rogancarr (Contributor), Feb 22, 2019

> or not, based on different pieces of information

Ditto from the other sample. #Resolved

Author

Changed all samples to use "demographic information" instead.

In reply to: 259451108

// For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
wschin (Member), Feb 22, 2019

Please add examples using in-memory data structures if possible, and show how to inspect those in-memory examples' predictions. If you search for "InMemory" in VS Text Explorer, you will find a few examples. #Resolved

Contributor

Since these are in the API docs on the website, I would actually prefer the data-loading part to be terse. Let's make other samples focused on the data-loading aspects, and keep this one spare.

In reply to: 259176526

Author

I agree with Rogan. That would be a scenario tutorial rather than an API sample.

In reply to: 259451392

wschin (Member), Feb 23, 2019

So this is API doc for the documentation website? If yes, that makes me feel even more strongly: ideally, to fit this example into their own scenario, users should be able to make only minor changes. Having a text loader decreases the flexibility of this example and forces users to go outside Visual Studio, because they need to prepare a text file and make sure it can be loaded correctly. Thinking about scikit-learn, I don't find many of their examples using text files, and it's super easy to start working with their modules. #Pending

Author

I'm not following this comment. LoadFeaturizedAdultDataset downloads the dataset and loads it into memory. It's functionally similar to sklearn's datasets module, which is used in many of the sklearn examples:

from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()

And we have SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset().

Users are able to copy-paste these samples and run them as-is. Please IM me if you want to discuss this further. :)

In reply to: 259558223

Author

I'll be using this same template for all the new API samples. If you think there's a better template, please let me know soon.

In reply to: 259947957
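For readers following this thread, wschin's in-memory alternative would look roughly like the sketch below. This is a hedged illustration: the IncomeExample type is hypothetical, and LoadFromEnumerable is the API shipped in ML.NET 1.0 and later, which may differ from the preview surface at the time of this PR.

```csharp
using System;
using System.Collections.Generic;
using Microsoft.ML;

// Hypothetical input type for illustration; not part of the PR.
public class IncomeExample
{
    public float Age { get; set; }
    public bool Label { get; set; }
}

public static class InMemorySketch
{
    public static void Main()
    {
        var mlContext = new MLContext(seed: 0);

        // In-memory examples instead of a downloaded text file.
        var samples = new List<IncomeExample>
        {
            new IncomeExample { Age = 25, Label = false },
            new IncomeExample { Age = 52, Label = true },
        };

        // Wrap the objects in an IDataView; no text loader involved.
        IDataView data = mlContext.Data.LoadFromEnumerable(samples);
        Console.WriteLine(data.Schema.Count);
    }
}
```

This is the style the reviewer had in mind; the merged samples instead kept the terse LoadFeaturizedAdultDataset call, as discussed above.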


// Leave out 10% of data for testing.
var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

// Define the trainer options.
var options = new SgdBinaryTrainer.Options()
{
MaxIterations = 30,
rogancarr (Contributor), Feb 22, 2019

> MaxIterations = 30,

Can you give a quick // comment on what the options do? Not as in-depth as the API docs, but conversational, noting what would change about the calculation relative to the defaults. #Resolved

Author

I added some descriptive comments. I don't want to give too many specifics about defaults or behavior, because those could change and we'd end up with stale comments. Better to keep most details in the API docs, where they'll stay in sync with the code.

In reply to: 259452561

ConvergenceTolerance = 5e-5,
PositiveInstanceWeight = 1.2f,
};

// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent(options);

// Fit this pipeline to the training data.
var model = pipeline.Fit(trainTestData.TrainSet);

// Evaluate how the model is doing on the test data.
var dataWithPredictions = model.Transform(trainTestData.TestSet);
wschin (Member), Feb 22, 2019

Printing out metrics is a bit far from practical production use, where we create a prediction per example and then make decisions based on those prediction values. #Resolved

Contributor

I think the purpose of these samples is to show how you can call a specific trainer and set different options. What you want is a slightly different thing and should be covered by high-level documentation and the cookbook.

In reply to: 259176767

Author

Ivan is correct. The API samples have a narrow scope: to showcase how to use a single API. We have tutorials and an end-to-end samples repo that cover the practical cases, which involve using multiple APIs.

In reply to: 259443273

var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// Accuracy: 0.85
// AUC: 0.90
// F1 Score: 0.67
// Negative Precision: 0.91
// Negative Recall: 0.89
// Positive Precision: 0.65
// Positive Recall: 0.70
// LogLoss: 0.48
// LogLossReduction: 37.52
// Entropy: 0.78
}
}
}
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/EntryPoints/InputBase.cs
@@ -95,7 +95,7 @@ public abstract class LearnerInputBaseWithLabel : LearnerInputBase
public abstract class LearnerInputBaseWithWeight : LearnerInputBaseWithLabel
{
/// <summary>
/// Column to use for example weight.
/// The name of the example weight column.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Column to use for example weight", ShortName = "weight", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)]
public Optional<string> WeightColumn = Optional<string>.Implicit(DefaultColumnNames.Weight);
12 changes: 12 additions & 0 deletions src/Microsoft.ML.SamplesUtils/ConsoleUtils.cs
@@ -23,6 +23,18 @@ public static void PrintMetrics(BinaryClassificationMetrics metrics)
Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
}

/// <summary>
/// Pretty-print CalibratedBinaryClassificationMetrics objects.
/// </summary>
/// <param name="metrics">Calibrated binary classification metrics.</param>
rogancarr (Contributor), Feb 22, 2019

> binary classification metrics

Use a <see> tag so it can be discovered by the tooling. #Resolved

Author

Done.

In reply to: 259453147

public static void PrintMetrics(CalibratedBinaryClassificationMetrics metrics)
{
PrintMetrics(metrics as BinaryClassificationMetrics);
Console.WriteLine($"LogLoss: {metrics.LogLoss:F2}");
Console.WriteLine($"LogLossReduction: {metrics.LogLossReduction:F2}");
Console.WriteLine($"Entropy: {metrics.Entropy:F2}");
}

/// <summary>
/// Pretty-print RegressionMetrics objects.
/// </summary>
@@ -60,7 +60,7 @@ public abstract class AveragedLinearOptions : OnlineLinearOptions
public bool DoLazyUpdates = true;

/// <summary>
/// L2 weight for <a href='tmpurl_regularization'>regularization</a>.
/// The L2 weight for <a href='tmpurl_regularization'>regularization</a>.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization Weight", ShortName = "reg", SortOrder = 50)]
[TGUI(Label = "L2 Regularization Weight")]
@@ -54,7 +54,7 @@ public sealed class AveragedPerceptronTrainer : AveragedLinearTrainer<BinaryPred
private readonly Options _args;

/// <summary>
/// Options for the averaged perceptron trainer.
/// Options for the <see cref="AveragedPerceptronTrainer"/>.
/// </summary>
public sealed class Options : AveragedLinearOptions
{
@@ -24,7 +24,7 @@ public abstract class OnlineLinearOptions : LearnerInputBaseWithLabel
/// <summary>
/// Number of passes through the training dataset.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter, numIterations", SortOrder = 50)]
[Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter,numIterations", SortOrder = 50)]
[TGUI(Label = "Number of Iterations", Description = "Number of training iterations through data", SuggestedSweeps = "1,10,100")]
[TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize: 10, isLogScale: true)]
public int NumberOfIterations = OnlineDefault.NumIterations;
@@ -43,7 +43,7 @@ public abstract class OnlineLinearOptions : LearnerInputBaseWithLabel
/// This property is only used if the provided value is positive and <see cref="InitialWeights"/> is not specified.
/// The weights and bias will be randomly selected from InitialWeights * [-0.5,0.5] interval with uniform distribution.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Init weights diameter", ShortName = "initwts, initWtsDiameter", SortOrder = 140)]
[Argument(ArgumentType.AtMostOnce, HelpText = "Init weights diameter", ShortName = "initwts,initWtsDiameter", SortOrder = 140)]
[TGUI(Label = "Initial Weights Scale", SuggestedSweeps = "0,0.1,0.5,1")]
[TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0.0f, 1.0f, numSteps: 5)]
public float InitialWeightsDiameter = 0;
53 changes: 50 additions & 3 deletions src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs
@@ -1723,36 +1723,77 @@ public abstract class SgdBinaryTrainerBase<TModel> :
{
public class OptionsBase : LearnerInputBaseWithWeight
{
/// <summary>
/// The L2 weight for <a href='tmpurl_regularization'>regularization</a>.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "L2 Regularization constant", ShortName = "l2", SortOrder = 50)]
[TGUI(Label = "L2 Regularization Constant", SuggestedSweeps = "1e-7,5e-7,1e-6,5e-6,1e-5")]
[TlcModule.SweepableDiscreteParam("L2Const", new object[] { 1e-7f, 5e-7f, 1e-6f, 5e-6f, 1e-5f })]
public float L2Weight = Defaults.L2Weight;

/// <summary>
/// The degree of lock-free parallelism used by SGD.
/// </summary>
/// <value>
/// Defaults to automatic depending on data sparseness. Determinism is not guaranteed.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.", ShortName = "nt,t,threads", SortOrder = 50)]
[TGUI(Label = "Number of threads", SuggestedSweeps = "1,2,4")]
public int? NumThreads;

/// <summary>
/// The convergence tolerance. If the exponential moving average of loss reductions falls below this tolerance,
/// the algorithm is deemed to have converged and will stop.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Exponential moving averaged improvement tolerance for convergence", ShortName = "tol")]
[TGUI(SuggestedSweeps = "1e-2,1e-3,1e-4,1e-5")]
[TlcModule.SweepableDiscreteParam("ConvergenceTolerance", new object[] { 1e-2f, 1e-3f, 1e-4f, 1e-5f })]
public double ConvergenceTolerance = 1e-4;

/// <summary>
/// The maximum number of passes through the training dataset.
/// </summary>
/// <value>
/// Set to 1 to simulate online learning.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of iterations; set to 1 to simulate online learning.", ShortName = "iter")]
[TGUI(Label = "Max number of iterations", SuggestedSweeps = "1,5,10,20")]
[TlcModule.SweepableDiscreteParam("MaxIterations", new object[] { 1, 5, 10, 20 })]
public int MaxIterations = Defaults.MaxIterations;

/// <summary>
/// The initial <a href="tmpurl_lr">learning rate</a> used by SGD.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Initial learning rate (only used by SGD)", ShortName = "ilr,lr")]
[TGUI(Label = "Initial Learning Rate (for SGD)")]
public double InitLearningRate = Defaults.InitLearningRate;

/// <summary>
/// Determines whether to shuffle data for each training iteration.
/// </summary>
/// <value>
/// <see langword="true" /> to shuffle data for each training iteration; otherwise, <see langword="false" />.
/// Default is <see langword="true" />.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Shuffle data every epoch?", ShortName = "shuf")]
[TlcModule.SweepableDiscreteParam("Shuffle", null, isBool: true)]
public bool Shuffle = true;

/// <summary>
/// The weight to be applied to the positive class. This is useful for training with imbalanced data.
/// </summary>
/// <value>
/// Default value is 1, which means no extra weight.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Apply weight to the positive class, for imbalanced data", ShortName = "piw")]
public float PositiveInstanceWeight = 1;

/// <summary>
/// Determines the frequency of checking for convergence in terms of number of iterations.
/// </summary>
/// <value>
/// Default equals <see cref="NumThreads"/>.
/// </value>
[Argument(ArgumentType.AtMostOnce, HelpText = "Convergence check frequency (in terms of number of iterations). Default equals number of threads", ShortName = "checkFreq")]
public int? CheckFrequency;

@@ -1802,7 +1843,7 @@ internal static class Defaults
/// <param name="env">The environment to use.</param>
/// <param name="featureColumn">The name of the feature column.</param>
/// <param name="labelColumn">The name of the label column.</param>
/// <param name="weightColumn">The name for the example weight column.</param>
/// <param name="weightColumn">The name of the example weight column.</param>
/// <param name="maxIterations">The maximum number of iterations; set to 1 to simulate online learning.</param>
/// <param name="initLearningRate">The initial learning rate used by SGD.</param>
/// <param name="l2Weight">The L2 regularizer constant.</param>
@@ -2077,13 +2118,19 @@ private protected override void CheckLabel(RoleMappedData examples, out int weig
}

/// <summary>
/// Train logistic regression using a parallel stochastic gradient method.
/// The <see cref="IEstimator{TTransformer}"/> for training logistic regression using a parallel stochastic gradient method.
/// </summary>
/// <remarks>
/// Stochastic Gradient Descent (SGD) is a popular stochastic optimization procedure that can be integrated
/// into several machine learning tasks to achieve state-of-the-art performance. This trainer implements SGD for binary classification
/// that supports multi-threading without any locking. If the associated optimization problem is sparse, it achieves a nearly optimal
/// rate of convergence. For more details, please refer to http://arxiv.org/pdf/1106.5730v2.pdf.
rogancarr (Contributor), Feb 22, 2019

> For more details

Call out that this uses the Hogwild method. #Resolved

shmoradims (Author), Feb 22, 2019

I thought we were removing Hogwild from our API names; this used to be called HogwildSgdTrainer. So I didn't mention Hogwild here, and the only mention is in the reference. Do you want Hogwild mentioned in the text here?

In reply to: 259456602

rogancarr (Contributor), Feb 22, 2019

I think we don't want it in the API name so that we're free to change implementations, but it's good to be explicit in the docs about what style of SGD this performs, so that advanced users can take it into account. Also, just pointing to arXiv is a bit confusing, because it looks like "for information about SGD, look here" and then it's a paper on a very specific multithreaded implementation.

But in the bigger picture, I think it's better if we nerd out a bit in the remarks.

In reply to: 259480809

Author

Mentioned Hogwild.

In reply to: 259495627
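For readers of this thread, the SGD procedure the remarks describe can be illustrated with a minimal single-threaded sketch. This is illustrative only and is an assumption on my part, not the trainer's code: the actual SgdBinaryTrainer adds the Hogwild-style lock-free multi-threading from the linked paper, convergence checks, and calibration.

```csharp
using System;

// Minimal single-threaded SGD for logistic regression on log-loss
// with L2 regularization (a sketch, not ML.NET's implementation).
public static class SgdSketch
{
    public static double[] Train(double[][] x, int[] y,
        double learningRate = 0.1, double l2 = 1e-6, int maxIterations = 30)
    {
        var w = new double[x[0].Length];
        for (int iter = 0; iter < maxIterations; iter++)
        {
            for (int i = 0; i < x.Length; i++)
            {
                // Predicted probability via the logistic function.
                double dot = 0;
                for (int j = 0; j < w.Length; j++) dot += w[j] * x[i][j];
                double p = 1.0 / (1.0 + Math.Exp(-dot));

                // Per-example gradient step on log-loss plus L2 penalty.
                double g = p - y[i];
                for (int j = 0; j < w.Length; j++)
                    w[j] -= learningRate * (g * x[i][j] + l2 * w[j]);
            }
        }
        return w;
    }

    public static void Main()
    {
        // Toy separable data: label is 1 when the first feature is
        // positive; the second feature acts as a bias term.
        var x = new[] { new[] { 1.0, 1 }, new[] { 2.0, 1 },
                        new[] { -1.0, 1 }, new[] { -2.0, 1 } };
        var y = new[] { 1, 1, 0, 0 };
        var w = Train(x, y, maxIterations: 200);
        Console.WriteLine(w[0] > 0);
    }
}
```

Hogwild's contribution, per the paper cited in the remarks, is that multiple threads can run this inner update loop concurrently without locks and still converge at a near-optimal rate when the data is sparse.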

/// </remarks>
public sealed class SgdBinaryTrainer :
SgdBinaryTrainerBase<CalibratedModelParametersBase<LinearBinaryModelParameters, PlattCalibrator>>
{
/// <summary>
/// Options available to training logistic regression using the implemented stochastic gradient method.
/// Options for the <see cref="SgdBinaryTrainer"/>.
/// </summary>
public sealed class Options : OptionsBase
{