Skip to content

Commit 13aec05

Browse files
MaxAkbar authored and CESARDELATORRE committed
Uploading to Master New Text Sample (dotnet#726)
* Merging latest sync.
* Adding news articles trainer project.
* Adding the prediction project for news articles.
* Added case for word embedding.
* Adding a README to the solution.
* Cleaned the news data.
* Removed CsvHelper.
1 parent eecbf6f commit 13aec05

File tree

16 files changed

+33400
-0
lines changed

16 files changed

+33400
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<!-- Console application project: .NET Core 2.1, C# 7.2, Microsoft.ML 0.6.0 preview packages. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>netcoreapp2.1</TargetFramework>
    <LangVersion>7.2</LangVersion>
  </PropertyGroup>

  <!-- Debug|AnyCPU builds are forced to x64. -->
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
    <PlatformTarget>x64</PlatformTarget>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.ML" Version="0.6.0-preview-26912-3" />
    <PackageReference Include="Microsoft.ML.CpuMath" Version="0.6.0-preview-26912-3" />
    <PackageReference Include="Microsoft.ML.ImageAnalytics" Version="0.6.0-preview-26912-3" />
    <PackageReference Include="Microsoft.ML.TensorFlow" Version="0.6.0-preview-26912-3" />
  </ItemGroup>

  <!-- Asset files copied next to the built binaries whenever they are newer than the existing copy. -->
  <ItemGroup>
    <None Update="assets\data\tags.tsv" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\banana.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\broccoli.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\bucket.png" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\canoe.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\snail.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\teddy1.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\teddy2.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\teddy3.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\teddy4.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\teddy5.jpg" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\images\violin.png" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\model\imagenet_comp_graph_label_strings.txt" CopyToOutputDirectory="PreserveNewest" />
    <None Update="assets\model\tensorflow_inception_graph.pb" CopyToOutputDirectory="PreserveNewest" />
  </ItemGroup>

</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Diagnostics;
4+
using System.IO;
5+
using System.Linq;
6+
using ClusteringNewsArticles.Perdict.DataStructures;
7+
using Common;
8+
using Microsoft.ML;
9+
using Microsoft.ML.Data;
10+
using OxyPlot;
11+
using OxyPlot.Series;
12+
13+
namespace ClusteringNewsArticles.Perdict
14+
{
15+
/// <summary>
/// Scores news articles with a previously trained clustering model and saves the
/// resulting cluster assignments as a CSV file and as an SVG scatter plot.
/// </summary>
public class ClusteringModelScorer
{
    // Characters that force a CSV field to be wrapped in double quotes.
    private static readonly char[] CsvSpecialCharacters = { ',', '"', '\r', '\n' };

    private readonly string _newsDataLocation;
    private readonly string _plotLocation;
    private readonly string _csvLocation;
    private readonly MLContext _mlContext;
    private ITransformer _trainedModel;

    /// <summary>
    /// Creates a scorer bound to the given input and output locations.
    /// </summary>
    /// <param name="mlContext">ML.NET context used to load the model and the data.</param>
    /// <param name="newsDataLocation">Path of the comma-separated news articles file to score.</param>
    /// <param name="plotLocation">Path the SVG scatter plot is written to.</param>
    /// <param name="csvLocation">Path the CSV with the cluster assignments is written to.</param>
    /// <exception cref="ArgumentNullException"><paramref name="mlContext"/> is null.</exception>
    public ClusteringModelScorer(MLContext mlContext, string newsDataLocation, string plotLocation, string csvLocation)
    {
        _mlContext = mlContext ?? throw new ArgumentNullException(nameof(mlContext));
        _newsDataLocation = newsDataLocation;
        _plotLocation = plotLocation;
        _csvLocation = csvLocation;
    }

    /// <summary>
    /// Loads the trained model from <paramref name="modelPath"/> and keeps it for later scoring.
    /// </summary>
    /// <param name="modelPath">Path of the saved model file.</param>
    /// <returns>The loaded model.</returns>
    public ITransformer LoadModel(string modelPath)
    {
        // The input schema returned by Load is not needed here, so it is discarded.
        _trainedModel = _mlContext.Model.Load(modelPath, out _);

        return _trainedModel;
    }

    /// <summary>
    /// Scores every article in the news data file, writes the CSV and SVG outputs,
    /// and opens the SVG with the operating system's default handler.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <see cref="LoadModel"/> has not been called yet.
    /// </exception>
    public void CreateNewsArticlesCluster()
    {
        if (_trainedModel == null)
        {
            throw new InvalidOperationException(
                $"No model is loaded. Call {nameof(LoadModel)} before {nameof(CreateNewsArticlesCluster)}.");
        }

        var columns = new[]
        {
            new TextLoader.Column("news_articles", DataKind.String, 0),
            new TextLoader.Column("category", DataKind.String, 1)
        };
        var data = _mlContext.Data.LoadFromTextFile(
            path: _newsDataLocation,
            columns: columns,
            separatorChar: ',',
            hasHeader: true);

        var transformedDataView = _trainedModel.Transform(data);

        // Materialize once: the predictions are consumed by both the CSV and the plot writer.
        var predictions = _mlContext.Data
            .CreateEnumerable<ClusteringPrediction>(transformedDataView, reuseRowObject: false)
            .ToArray();

        SaveNewsArticlesClusterCsv(predictions, _csvLocation);
        SaveNewsArticlesClusterPlotChart(predictions, _plotLocation);
        OpenChartInDefaultWindow(_plotLocation);
    }

    /// <summary>
    /// Writes one CSV row per prediction: article text, selected cluster id and category.
    /// </summary>
    private static void SaveNewsArticlesClusterCsv(IEnumerable<ClusteringPrediction> predictions, string csvLocation)
    {
        ConsoleHelper.ConsoleWriteHeader("CSV News Articles Cluster");

        EnsureParentDirectoryExists(csvLocation);

        // Disposing the writer flushes it, so no per-line Flush is required.
        using (var writer = new StreamWriter(csvLocation))
        {
            // The header lists all three columns that each row contains.
            writer.WriteLine("news_articles,SelectedClusterId,category");

            foreach (var prediction in predictions)
            {
                writer.WriteLine(
                    $"{EscapeCsvField(prediction.NewsArticles)},{prediction.SelectedClusterId},{EscapeCsvField(prediction.Category)}");
            }
        }

        Console.WriteLine("CSV location: " + csvLocation);
    }

    /// <summary>
    /// Renders one scatter series per cluster, using the first two feature values of each
    /// prediction as the x/y coordinates, and exports the chart as SVG.
    /// </summary>
    private static void SaveNewsArticlesClusterPlotChart(IEnumerable<ClusteringPrediction> predictions, string plotLocation)
    {
        ConsoleHelper.ConsoleWriteHeader("Plot News Articles Clusters");

        var plot = new PlotModel
        {
            Title = "News Articles Clusters",
            IsLegendVisible = true
        };

        // A single grouping pass replaces re-filtering the whole sequence once per cluster.
        var clusters = predictions
            .GroupBy(p => p.SelectedClusterId)
            .OrderBy(g => g.Key);

        foreach (var cluster in clusters)
        {
            var scatter = new ScatterSeries
            {
                MarkerType = MarkerType.Circle,
                MarkerStrokeThickness = 2,
                Title = $"Cluster: {cluster.Key}",
                RenderInLegend = true
            };
            scatter.Points.AddRange(cluster.Select(p => new ScatterPoint(p.Location[0], p.Location[1])));
            plot.Series.Add(scatter);
        }

        plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;

        EnsureParentDirectoryExists(plotLocation);

        var exporter = new SvgExporter { Width = 600, Height = 400 };
        using (var stream = new FileStream(plotLocation, FileMode.Create))
        {
            exporter.Export(plot, stream);
        }

        Console.WriteLine($"Plot location: {plotLocation}");
    }

    /// <summary>
    /// Opens the exported chart through the operating system shell.
    /// </summary>
    private static void OpenChartInDefaultWindow(string plotLocation)
    {
        Console.WriteLine("Showing chart...");

        var startInfo = new ProcessStartInfo(plotLocation)
        {
            UseShellExecute = true
        };

        // Disposing only releases this Process object's resources; the launched viewer keeps running.
        using (var process = new Process { StartInfo = startInfo })
        {
            process.Start();
        }
    }

    /// <summary>
    /// Creates the directory that will contain <paramref name="filePath"/> if it does not exist yet.
    /// </summary>
    private static void EnsureParentDirectoryExists(string filePath)
    {
        var directory = Path.GetDirectoryName(Path.GetFullPath(filePath));
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
    }

    /// <summary>
    /// Quotes a CSV field when it contains a comma, a double quote or a line break;
    /// embedded double quotes are doubled. Null becomes an empty field.
    /// </summary>
    private static string EscapeCsvField(string field)
    {
        if (string.IsNullOrEmpty(field))
        {
            return string.Empty;
        }

        if (field.IndexOfAny(CsvSpecialCharacters) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}
118+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<!-- Console application project: .NET Core 3.1, Microsoft.ML 1.3.1, OxyPlot.Core 2.0.0. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>netcoreapp3.1</TargetFramework>
  </PropertyGroup>

  <!-- NuGet dependencies. -->
  <ItemGroup>
    <PackageReference Include="Microsoft.ML" Version="1.3.1" />
    <PackageReference Include="OxyPlot.Core" Version="2.0.0" />
  </ItemGroup>

  <!-- Console helper source file linked in from the shared "common" folder. -->
  <ItemGroup>
    <Compile Include="..\..\..\common\ConsoleHelper.cs" Link="Common\ConsoleHelper.cs" />
  </ItemGroup>

  <!-- Folder items declared for the project tree. -->
  <ItemGroup>
    <Folder Include="Common\" />
    <Folder Include="assets\outputs\" />
  </ItemGroup>

</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
using Microsoft.ML.Data;
2+
3+
namespace ClusteringNewsArticles.Perdict.DataStructures
4+
{
5+
/// <summary>
/// Row type holding the scored output for one news article. Each field is bound
/// to a data view column through its <c>ColumnName</c> attribute.
/// </summary>
public class ClusteringPrediction
{
    /// <summary>Value of the "PredictedLabel" column.</summary>
    [ColumnName("PredictedLabel")] public uint SelectedClusterId;

    /// <summary>Values of the "Score" column (presumably one distance per cluster; confirm against the trainer).</summary>
    [ColumnName("Score")] public float[] Distance;

    /// <summary>Values of the "Features" column.</summary>
    [ColumnName("Features")] public float[] Location;

    /// <summary>Value of the "news_articles" column.</summary>
    [ColumnName("news_articles")] public string NewsArticles;

    /// <summary>Value of the "category" column.</summary>
    [ColumnName("category")] public string Category;
}
22+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
using System;
2+
using System.IO;
3+
using Common;
4+
using Microsoft.ML;
5+
6+
namespace ClusteringNewsArticles.Perdict
7+
{
8+
/// <summary>
/// Console entry point: loads the saved clustering model, scores the news
/// articles file and writes the CSV and SVG outputs under the assets folder.
/// </summary>
class Program
{
    private static void Main(string[] args)
    {
        // Resolve the input and output files relative to the "assets" folder.
        string assetsDirectory = GetAbsolutePath("assets");
        string inputsDirectory = Path.Combine(assetsDirectory, "inputs");
        string outputsDirectory = Path.Combine(assetsDirectory, "outputs");

        string newsDataFile = Path.Combine(inputsDirectory, "newsArticles.csv");
        string modelFile = Path.Combine(inputsDirectory, "newsArticlesClustering.zip");
        string plotFile = Path.Combine(outputsDirectory, "newsArticlesClusters.svg");
        string clustersFile = Path.Combine(outputsDirectory, "newsArticlesClusters.csv");

        try
        {
            var context = new MLContext(seed: null);
            var scorer = new ClusteringModelScorer(context, newsDataFile, plotFile, clustersFile);

            scorer.LoadModel(modelFile);
            scorer.CreateNewsArticlesCluster();
        }
        catch (Exception error)
        {
            // Any failure is reported on the console instead of crashing the process.
            ConsoleHelper.ConsoleWriteException(error.ToString());
        }

        ConsoleHelper.ConsolePressAnyKey();
    }

    /// <summary>
    /// Combines <paramref name="relativePath"/> with the directory located three
    /// levels above the folder that contains this assembly.
    /// </summary>
    /// <param name="relativePath">Path relative to that base directory.</param>
    /// <returns>The combined path.</returns>
    public static string GetAbsolutePath(string relativePath)
    {
        var assemblyFile = new FileInfo(typeof(Program).Assembly.Location);

        // Assembly folder -> parent -> parent -> parent.
        DirectoryInfo baseDirectory = assemblyFile.Directory.Parent.Parent.Parent;

        return Path.Combine(baseDirectory.FullName, relativePath);
    }
}
40+
}

0 commit comments

Comments
 (0)