-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Update samples #238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update samples #238
Changes from all commits
c690212
560293c
3b1e5eb
27439e8
5658af3
dc15aef
1c55277
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<OutputType>Exe</OutputType> | ||
<TargetFramework>netcoreapp2.0</TargetFramework> | ||
<LangVersion>latest</LangVersion> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<Content Include="..\..\datasets\imdb_labeled.txt" Link="datasets\imdb_labeled.txt"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</Content> | ||
<Content Include="..\..\datasets\yelp_labeled.txt" Link="datasets\yelp_labeled.txt"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</Content> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.ML" Version="0.1.0" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<Folder Include="datasets\" /> | ||
</ItemGroup> | ||
|
||
</Project> | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
using System; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
using Microsoft.ML; | ||
using Microsoft.ML.Models; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Trainers; | ||
using Microsoft.ML.Transforms; | ||
|
||
namespace BinaryClassification_SentimentAnalysis | ||
{ | ||
/// <summary>
/// Entry point of the sentiment analysis sample: trains a binary classification model,
/// evaluates it against a separate data set and prints predictions for a few sample sentences.
/// </summary>
internal static class Program
{
    // Directory the application runs from. AppContext.BaseDirectory is used rather than
    // Environment.GetCommandLineArgs()[0], because the first command-line argument is not
    // guaranteed to contain a directory part.
    private static string AppPath => AppContext.BaseDirectory;

    // The project file copies both dataset files into a "datasets" folder next to the binaries.
    private static string TrainDataPath => Path.Combine(AppPath, "datasets", "imdb_labeled.txt");
    private static string TestDataPath => Path.Combine(AppPath, "datasets", "yelp_labeled.txt");

    // Location the trained model is written to.
    private static string ModelPath => Path.Combine(AppPath, "SentimentModel.zip");

    /// <summary>
    /// Runs the three steps of the sample: training, evaluation and prediction.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    private static async Task Main(string[] args)
    {
        // ML task includes 3 steps: training a ML model, evaluating how good it is,
        // and if the quality is acceptable, using this model for predictions.
        var model = await TrainAsync();

        Evaluate(model);

        var predictions = model.Predict(TestSentimentData.Sentiments);

        // Pair every input sentence with the prediction produced for it.
        var sentimentsAndPredictions =
            TestSentimentData.Sentiments.Zip(predictions, (sentiment, prediction) => (sentiment, prediction));
        foreach (var item in sentimentsAndPredictions)
        {
            Console.WriteLine(
                $"Sentiment: {item.sentiment.SentimentText} | Prediction: {(item.prediction.Sentiment ? "Positive" : "Negative")} sentiment");
        }

        // Wait for Enter so the output stays visible.
        Console.ReadLine();
    }

    /// <summary>
    /// Builds the learning pipeline, trains the model on the training data set
    /// and saves the trained model to <see cref="ModelPath"/>.
    /// </summary>
    /// <returns>The trained prediction model.</returns>
    public static async Task<PredictionModel<SentimentData, SentimentPrediction>> TrainAsync()
    {
        // LearningPipeline holds all steps of the learning process: data, transforms, learners.
        var pipeline = new LearningPipeline();

        // The TextLoader loads a dataset. The schema of the dataset is specified by passing a class containing
        // all the column names and their types. This will be used to create the model, and train it.
        pipeline.Add(new TextLoader<SentimentData>(TrainDataPath, useHeader: false, separator: "tab"));

        // TextFeaturizer is a transform that will be used to featurize an input column to format and clean the data.
        pipeline.Add(new TextFeaturizer("Features", "SentimentText"));

        // FastTreeBinaryClassifier is an algorithm that will be used to train the model.
        // Three of its hyperparameters are set here to tune decision tree performance.
        pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });

        Console.WriteLine("=============== Training model ===============");
        // The pipeline is trained on the dataset that has been loaded and transformed.
        var model = pipeline.Train<SentimentData, SentimentPrediction>();

        // Saving the model as a .zip file.
        await model.WriteAsync(ModelPath);

        Console.WriteLine("=============== End training ===============");
        Console.WriteLine($"The model is saved to {ModelPath}");

        return model;
    }

    /// <summary>
    /// Evaluates the model against the test data set and prints the resulting quality metrics.
    /// </summary>
    /// <param name="model">The trained model to evaluate.</param>
    private static void Evaluate(PredictionModel<SentimentData, SentimentPrediction> model)
    {
        // To evaluate how well the model predicts values, the model is run against a new set
        // of data (test data) that was not involved in training.
        // The test file is loaded with the same settings as the training file so that its
        // first row is not skipped as a header.
        // NOTE(review): assumes yelp_labeled.txt has no header row, like imdb_labeled.txt - confirm.
        var testData = new TextLoader<SentimentData>(TestDataPath, useHeader: false, separator: "tab");

        // BinaryClassificationEvaluator performs evaluation for Binary Classification type of ML problems.
        var evaluator = new BinaryClassificationEvaluator();

        Console.WriteLine("=============== Evaluating model ===============");

        var metrics = evaluator.Evaluate(model, testData);
        // BinaryClassificationMetrics contains the overall metrics computed by binary classification evaluators.
        // The Accuracy metric gets the accuracy of a classifier which is the proportion
        // of correct predictions in the test set.

        // The Auc metric gets the area under the ROC curve.
        // The area under the ROC curve is equal to the probability that the classifier ranks
        // a randomly chosen positive instance higher than a randomly chosen negative one
        // (assuming 'positive' ranks higher than 'negative').

        // The F1Score metric gets the classifier's F1 score.
        // The F1 score is the harmonic mean of precision and recall:
        // 2 * precision * recall / (precision + recall).

        Console.WriteLine($"Accuracy: {metrics.Accuracy:P2}");
        Console.WriteLine($"Auc: {metrics.Auc:P2}");
        Console.WriteLine($"F1Score: {metrics.F1Score:P2}");
        Console.WriteLine("=============== End evaluating ===============");
        Console.WriteLine();
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
## Goal | ||
This is a getting-started example that shows the simplest way of using ML.NET APIs to solve a binary classification problem, using sentiment analysis as the example. | ||
|
||
## Problem | ||
The task is to build and train an ML model (machine learning model) that predicts whether a text has positive or negative sentiment. For training and evaluating the model we used IMDb and Yelp comments with known sentiments. | ||
|
||
## Problem Class - Binary Classification | ||
The described task is an example of a binary classification problem. | ||
> In machine learning, `binary classification` is the problem of classifying instances into one of two classes. (Classifying instances into more than two classes is called `multiclass classification`.) | ||
|
||
The machine learning engineering process includes three steps: training an ML model, evaluating how good it is, and, if the quality is acceptable, using this model for predictions. If the quality of the model is not good enough, different algorithms and/or additional data transformations can be applied, and the model should be trained and evaluated again. | ||
|
||
1. **Training** the ML model is implemented in the `TrainAsync()` method, which constructs a `LearningPipeline`, trains it and saves the trained model as a .zip file. | ||
2. **Evaluating** the ML model is implemented in the `Evaluate()` method, which runs the model against test data (new data with known answers that was not involved in training). As a result it produces a set of metrics describing the quality of the model. | ||
3. **Predicting** the sentiment is performed in the `Main()` method: | ||
```CSharp | ||
var predictions = model.Predict(TestSentimentData.Sentiments); | ||
``` | ||
where you send a text as a `SentimentData` object. As a result you receive a `SentimentPrediction` object that contains a boolean field `Sentiment`: true for positive, false for negative sentiment. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
using Microsoft.ML.Runtime.Api; | ||
|
||
namespace BinaryClassification_SentimentAnalysis | ||
{ | ||
/// <summary>
/// Input row for the sentiment model: a piece of text together with its label value.
/// </summary>
public class SentimentData
{
    /// <summary>The text of the sample; bound to column "0" of the input.</summary>
    [Column("0")]
    public string SentimentText;

    /// <summary>The value bound to column "1" of the input, exposed under the column name "Label".</summary>
    [Column("1", name: "Label")]
    public float Sentiment;
}
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this class needs its own file? You can keep SentimentData, SentimentPrediction and TestSentimentData in Program.cs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The general .NET coding guidelines use one class per file and we want our samples to use idiomatic C#. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @OliaG Can you please point me to this guideline? Vast majority of ML.Net code has multiple class definitions in one file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't find the written up rule, but based on my conversation with @terrajobst that's what the .NET team is usually doing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If there isn't a written rule then we should keep things consistent with the current code base. Here is an example from .Net Corefx repo that shows multiple classes defined in one file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I disagree. Samples aren't part of the ML.NET codebase. They are for our customers and the vast majority doesn't need to know nor care about the ML.NET coding conventions. That's why we try to follow the conventions that the vast majority of our customers are using. That being said, I don't have a problem if we were to merge all these classes into one file as it might even help to keep the sample easier to understand. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The guideline is interesting, and one we generally try to obey ourselves, but not when separating definitions would lead to less comprehensible code. So we tend to declare types meant to be understood together as a unit, in the same file. (This isn't Java after all. :) ) This clearly falls into that bucket. Especially in this case where the intent is pedagogical, we would "bend" any rules we usually follow about style (even if this was an actual policy, which, as Zeeshan points out, it is not). In this case what I'd rather do is have a (private?) nested type, declared right next to their usages in the method. This will structure the example so that the method and class can be understood. Then you can get rid of these three files, and have only one, more easily understood example. In reply to: 190942775 [](ancestors = 190942775) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
using Microsoft.ML.Runtime.Api; | ||
|
||
namespace BinaryClassification_SentimentAnalysis | ||
{ | ||
/// <summary>
/// Output of the sentiment model for a single input.
/// </summary>
public class SentimentPrediction
{
    /// <summary>The value read from the "PredictedLabel" column.</summary>
    [ColumnName("PredictedLabel")]
    public bool Sentiment;
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
using System.Collections.Generic; | ||
|
||
namespace BinaryClassification_SentimentAnalysis | ||
{ | ||
/// <summary>
/// Hard-coded sample sentences used as prediction inputs.
/// </summary>
internal class TestSentimentData
{
    // NOTE(review): every sample carries Sentiment = 0, including the positive-sounding ones.
    // Presumably the value is only a placeholder because these rows are used as prediction
    // inputs - confirm, or set the real labels.
    internal static readonly IEnumerable<SentimentData> Sentiments = new SentimentData[]
    {
        new SentimentData { SentimentText = "Contoso's 11 is a wonderful experience", Sentiment = 0 },
        new SentimentData { SentimentText = "The acting in this movie is very bad", Sentiment = 0 },
        new SentimentData { SentimentText = "Joe versus the Volcano Coffee Company is a great film.", Sentiment = 0 }
    };
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
| ||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio 15 | ||
VisualStudioVersion = 15.0.27703.2000 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Regression_TaxiFarePrediction", "Regression_TaxiFarePrediction\Regression_TaxiFarePrediction.csproj", "{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}" | ||
EndProject | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BinaryClassification_SentimentAnalysis", "BinaryClassification_SentimentAnalysis\BinaryClassification_SentimentAnalysis.csproj", "{ED877F56-5304-4F0D-A75C-4C77219C8D0E}" | ||
EndProject | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MulticlassClassification_Iris", "MulticlassClassification_Iris\MulticlassClassification_Iris.csproj", "{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{C7301D08-10E3-4A51-A70D-7C0BCB39F6E6}.Release|Any CPU.Build.0 = Release|Any CPU | ||
{ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{ED877F56-5304-4F0D-A75C-4C77219C8D0E}.Release|Any CPU.Build.0 = Release|Any CPU | ||
{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{EEC2E07E-7482-4F37-8F7A-135EBDEC75B4}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {B84E804C-06CA-45C8-9B9F-8F69CA930535} | ||
EndGlobalSection | ||
EndGlobal |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
using Microsoft.ML.Runtime.Api; | ||
|
||
namespace MulticlassClassification_Iris | ||
{ | ||
/// <summary>
/// Input row for the iris model: a label value followed by four measurements.
/// </summary>
public class IrisData
{
    /// <summary>The label value; bound to column "0" of the input.</summary>
    [Column("0")]
    public float Label;

    /// <summary>Sepal length; bound to column "1" of the input.</summary>
    [Column("1")]
    public float SepalLength;

    /// <summary>Sepal width; bound to column "2" of the input.</summary>
    [Column("2")]
    public float SepalWidth;

    /// <summary>Petal length; bound to column "3" of the input.</summary>
    [Column("3")]
    public float PetalLength;

    /// <summary>Petal width; bound to column "4" of the input.</summary>
    [Column("4")]
    public float PetalWidth;
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
using Microsoft.ML.Runtime.Api; | ||
|
||
namespace MulticlassClassification_Iris | ||
{ | ||
/// <summary>
/// Output of the iris model for a single input.
/// </summary>
public class IrisPrediction
{
    /// <summary>The values read from the "Score" column.</summary>
    [ColumnName("Score")]
    public float[] Score;
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<OutputType>Exe</OutputType> | ||
<TargetFramework>netcoreapp2.0</TargetFramework> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<Content Include="..\..\datasets\iris_test.txt" Link="datasets\iris_test.txt"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</Content> | ||
<Content Include="..\..\datasets\iris_train.txt" Link="datasets\iris_train.txt"> | ||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> | ||
</Content> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.ML" Version="0.1.0" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<Folder Include="datasets\" /> | ||
</ItemGroup> | ||
|
||
</Project> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the rationale behind creating a separate project for "Binary Classification sentiment analysis"? Same for MC_Iris, TaxiFarePrediction.
What I'm seeing is you have taken the scenario test and broke it down into many files. What does this accomplish?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We want to have separate project for different problem types so it aligns with our docs