From f3fe5bad29c0b4f2d3e6d05a4fd47bfa7d21325f Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Thu, 7 Jan 2021 18:02:28 +0800 Subject: [PATCH 1/4] update --- .../SrCnnEntireAnomalyDetector.cs | 26 ++++ .../TimeSeriesDirectApi.cs | 56 +++++++- test/data/Timeseries/big_spike_data.csv | 136 ++++++++++++++++++ 3 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 test/data/Timeseries/big_spike_data.csv diff --git a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs index 45505d5ece..608fcbc61b 100644 --- a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs @@ -362,6 +362,9 @@ internal sealed class SrCnnEntireModeler private static readonly double _unitForZero = 0.3; private static readonly double _minimumScore = 0.0; private static readonly double _maximumScore = 1.0; + // Use this threshold to correct false anomalies + private static readonly double _zscoreThreshold = 1.5; + // If the score window is smaller than this value, the anomaly score is tend to be small. // Proof: For each point, the SR anomaly score is calculated as (w is average window size): // (mag - avg_mag) / avg_mag @@ -426,6 +429,8 @@ internal sealed class SrCnnEntireModeler //used in all modes private double _minimumOriginValue; private double _maximumOriginValue; + private double _std; + private double _mean; private readonly double[] _predictArray; private double[] _backAddArray; private double[] _fftRe; @@ -490,6 +495,7 @@ public void Train(double[] values, ref double[][] results) _minimumOriginValue = Double.MaxValue; _maximumOriginValue = Double.MinValue; + _mean = 0.0; Array.Resize(ref _seriesToDetect, values.Length); for (int i = 0; i < values.Length; ++i) @@ -497,7 +503,16 @@ public void Train(double[] values, ref double[][] results) _seriesToDetect[i] = values[i]; _minimumOriginValue = Math.Min(_minimumOriginValue, values[i]); _maximumOriginValue = Math.Max(_maximumOriginValue, values[i]); + _mean += values[i]; + } + + _mean /= values.Length; + _std = 0.0; + for (int i = 0; i < values.Length; ++i) + { + _std += (values[i] - _mean) * (values[i] - _mean); } + _std = Math.Sqrt(_std / values.Length - 1); if (_period > 0) { @@ -612,9 +627,20 @@ private void SpectralResidual(double[] values, double[][] results, double thresh var detres = score > threshold ? 1 : 0; + // Anomalies correction by zscore + if (detres > 0) + { + if (_std < _eps || Math.Abs(values[i] - _mean) / _std < _zscoreThreshold) + { + detres = 0; + score = 0.0; + } + } + results[i][0] = detres; results[i][1] = score; results[i][2] = _ifftMagList[i]; + } } diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index 2877c3150f..925c238aa8 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; -using System.Data; using System.IO; using System.Linq; using Microsoft.ML.Data; @@ -717,6 +716,61 @@ public void TestSrCnnAnomalyDetectorWithSeasonalData( } } + [Theory, CombinatorialData] + public void TestSrCnnAnomalyDetectorBigSpike( + [CombinatorialValues(SrCnnDetectMode.AnomalyOnly, SrCnnDetectMode.AnomalyAndExpectedValue, SrCnnDetectMode.AnomalyOnly)] SrCnnDetectMode mode + ) + { + var ml = new MLContext(1); + IDataView dataView; + List data; + + var dataPath = GetDataPath("Timeseries", "big_spike_data.csv"); + + // Load data from file into the dataView + dataView = ml.Data.LoadFromTextFile(dataPath, hasHeader: true); + data = ml.Data.CreateEnumerable(dataView, reuseRowObject: false).ToList(); + + // Setup the detection arguments + string outputColumnName = nameof(SrCnnAnomalyDetection.Prediction); + string inputColumnName = nameof(TimeSeriesDataDouble.Value); + + // Do batch anomaly detection + var options = new SrCnnEntireAnomalyDetectorOptions() + { + Threshold = 0.3, + BatchSize = -1, + Sensitivity = 80.0, + DetectMode = mode, + Period = 0, + DeseasonalityMode = SrCnnDeseasonalityMode.Stl + }; + + var outputDataView = ml.AnomalyDetection.DetectEntireAnomalyBySrCnn(dataView, outputColumnName, inputColumnName, options); + + // Getting the data of the newly created column as an IEnumerable of SrCnnAnomalyDetection. + var predictionColumn = ml.Data.CreateEnumerable( + outputDataView, reuseRowObject: false); + + var anomalyIndex = 26; + + int k = 0; + foreach (var prediction in predictionColumn) + { + if (anomalyIndex == k) + { + Assert.Equal(1, prediction.Prediction[0]); + } + else + { + Assert.Equal(0, prediction.Prediction[0]); + } + + ++k; + } + + } + [Theory, CombinatorialData] public void TestSrCnnAnomalyDetectorWithSeasonalAnomalyData( [CombinatorialValues(SrCnnDeseasonalityMode.Stl, SrCnnDeseasonalityMode.Mean, SrCnnDeseasonalityMode.Median)] SrCnnDeseasonalityMode mode diff --git a/test/data/Timeseries/big_spike_data.csv b/test/data/Timeseries/big_spike_data.csv new file mode 100644 index 0000000000..43b1491ea0 --- /dev/null +++ b/test/data/Timeseries/big_spike_data.csv @@ -0,0 +1,136 @@ +Value +0.333061106 +2.198203303 +1.705836778 +1.861708215 +1.085050871 +0.548409541 +0.365537211 +0.433823922 +0.450379649 +0.485662867 +0.59162219 +0.678494031 +0.735315015 +0.780228908 +0.779309892 +0.71637311 +0.783369345 +0.829129842 +0.769519564 +0.74230352 +0.914116686 +0.970162226 +0.964537878 +0.983059421 +1.009637074 +1.054769667 +48232.24413 +4739.675242 +4963.982698 +8555.732913 +75.25537709 +11.2742621 +4.388301951 +2.584960796 +2.273629928 +1.972334276 +1.811987528 +1.854365004 +1.581860355 +1.478895939 +1.447799312 +1.406460886 +1.333295368 +1.282260475 +1.345933543 +1.264431234 +1.235222153 +1.204307109 +1.133533648 +1.110515351 +1.017397262 +1.103902775 +1.099039227 +1.061479438 +1.063725177 +1.072777829 +1.044107263 +0.981847451 +1.038324454 +1.033883341 +1.004416487 +1.017918007 +0.345233269 +1.092365812 +1.078005286 +1.033142227 +1.024832225 +1.098672969 +1.092767871 +1.095272293 +1.139357768 +1.0711793 +1.119012071 +1.11906761 +1.131538563 +1.113967769 +1.141610905 +1.14317559 +1.108130866 +1.083645413 +1.147460394 +1.177086603 +1.153490106 +1.145660569 +1.132464809 +1.106364602 +1.003350151 +1.099011524 +1.109557478 +1.065336146 +1.081590334 +1.075768021 +0.986278889 +1.001219623 +1.080312553 +1.075076345 +1.057146027 +1.106862867 +1.084433852 +0.975639541 +0.944182773 +1.088712253 +1.067152572 +1.107507855 +1.069142173 +1.036247939 +0.995907308 +0.932153379 +1.074865283 +1.065780376 +1.05063751 +1.077263172 +1.033459106 +0.985960758 +0.981842413 +1.032862035 +1.005063722 +0.862145269 +0.491629016 +0.473904777 +0.777874357 +0.945595834 +1.020180047 +1.025171701 +1.031632464 +1.02571454 +0.950313827 +0.935412116 +0.991591559 +1.013279894 +0.991734823 +1.007466737 +1.019160801 +0.919227208 +0.977617794 From 71451ccf48bc8cc3292203efbafbe6c14ef0eff0 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Fri, 8 Jan 2021 12:28:44 +0800 Subject: [PATCH 2/4] refine codes --- .../SrCnnEntireAnomalyDetector.cs | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs index 608fcbc61b..0c0b64f1ca 100644 --- a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs @@ -495,24 +495,23 @@ public void Train(double[] values, ref double[][] results) _minimumOriginValue = Double.MaxValue; _maximumOriginValue = Double.MinValue; - _mean = 0.0; + + var sum = 0.0; + var squareSum = 0.0; Array.Resize(ref _seriesToDetect, values.Length); for (int i = 0; i < values.Length; ++i) { + var value = values[i]; _seriesToDetect[i] = values[i]; - _minimumOriginValue = Math.Min(_minimumOriginValue, values[i]); - _maximumOriginValue = Math.Max(_maximumOriginValue, values[i]); - _mean += values[i]; + _minimumOriginValue = Math.Min(_minimumOriginValue, value); + _maximumOriginValue = Math.Max(_maximumOriginValue, value); + sum += value; + squareSum += value * value; } - _mean /= values.Length; - _std = 0.0; - for (int i = 0; i < values.Length; ++i) - { - _std += (values[i] - _mean) * (values[i] - _mean); - } - _std = Math.Sqrt(_std / values.Length - 1); + _mean = sum / values.Length; + _std = Math.Sqrt((squareSum - (sum * sum) / values.Length) / values.Length); if (_period > 0) { @@ -630,7 +629,9 @@ private void SpectralResidual(double[] values, double[][] results, double thresh // Anomalies correction by zscore if (detres > 0) { - if (_std < _eps || Math.Abs(values[i] - _mean) / _std < _zscoreThreshold) + // Use zscore to filter out those points lie in the dense region. + var zscore = Math.Abs(values[i] - _mean) / _std; + if (_std < _eps || zscore < _zscoreThreshold) { detres = 0; score = 0.0; From 571fb25eb22d5c7500bbea1a1af01a5ab75889cd Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Fri, 8 Jan 2021 14:16:05 +0800 Subject: [PATCH 3/4] update comments --- src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs index 0c0b64f1ca..b9571e6178 100644 --- a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs @@ -629,7 +629,7 @@ private void SpectralResidual(double[] values, double[][] results, double thresh // Anomalies correction by zscore if (detres > 0) { - // Use zscore to filter out those points lie in the dense region. + // Use zscore to filter out those false anomalies that lie within 1.5 sigma region. var zscore = Math.Abs(values[i] - _mean) / _std; if (_std < _eps || zscore < _zscoreThreshold) { From 5477b57d290f1078b92e127beb0a744d2ab6a065 Mon Sep 17 00:00:00 2001 From: "yuyi@microsoft.com" Date: Tue, 9 Feb 2021 18:32:20 +0800 Subject: [PATCH 4/4] update for nit --- src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs | 2 +- test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs index b9571e6178..12a3f8c9ed 100644 --- a/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs +++ b/src/Microsoft.ML.TimeSeries/SrCnnEntireAnomalyDetector.cs @@ -503,7 +503,7 @@ public void Train(double[] values, ref double[][] results) for (int i = 0; i < values.Length; ++i) { var value = values[i]; - _seriesToDetect[i] = values[i]; + _seriesToDetect[i] = value; _minimumOriginValue = Math.Min(_minimumOriginValue, value); _maximumOriginValue = Math.Max(_maximumOriginValue, value); sum += value; diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs index 925c238aa8..1dcd2f52c3 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesDirectApi.cs @@ -721,7 +721,7 @@ public void TestSrCnnAnomalyDetectorBigSpike( [CombinatorialValues(SrCnnDetectMode.AnomalyOnly, SrCnnDetectMode.AnomalyAndExpectedValue, SrCnnDetectMode.AnomalyOnly)] SrCnnDetectMode mode ) { - var ml = new MLContext(1); + var ml = new MLContext(1); IDataView dataView; List data;