Add continuous resource monitoring to AutoML.IMonitor #6520

Merged
Changes from 1 commit
Commits (26)
9647027
Fix a typo
andrasfuchs Nov 25, 2022
269b1bd
Fix trial cancellation bug
andrasfuchs Nov 25, 2022
a2c5781
Move performance related properties to TrialPerformanceMetrics and ad…
andrasfuchs Nov 25, 2022
e3fd992
Add new class and property explanations
andrasfuchs Nov 25, 2022
88fdefa
Revert "Fix trial cancellation bug"
andrasfuchs Dec 2, 2022
b03e46a
Remove pipeline info from the IMonitor Running event
andrasfuchs Dec 6, 2022
bf69dd2
Remove FreeSpaceOnDrives from TrialPerformanceMetrics
andrasfuchs Dec 6, 2022
38cf838
Change the default resource check interval to 5 seconds
andrasfuchs Dec 6, 2022
7f40df5
Remove StartedAtUtc property from TrialSettings
andrasfuchs Dec 22, 2022
8aa0ad8
move ReportTrialResourceUsage to IPerformanceMonitor
LittleLittleCloud Jan 3, 2023
739d865
Update AutoMLExperimentExtension.cs
LittleLittleCloud Jan 3, 2023
aeb651c
Merge pull request #2 from LittleLittleCloud/u/xiaoyun/add-cancellati…
andrasfuchs Jan 5, 2023
fc82c4c
Pause the performance monitor if the trial is not running
andrasfuchs Jan 6, 2023
d0ce0cd
Add StartedAtUtc and EndedAtUtc to TrialSettings
andrasfuchs Jan 8, 2023
4149a4b
cancel trial when as is
LittleLittleCloud Feb 6, 2023
7d3257a
fix tests
LittleLittleCloud Feb 6, 2023
c5c2d83
Merge branch 'main' into add-cancellation-and-resource-monitoring-to-…
LittleLittleCloud Feb 7, 2023
3919324
fix tests
LittleLittleCloud Feb 7, 2023
13ba949
fix tests
LittleLittleCloud Feb 7, 2023
488ff20
use workingset to evaluate memory usage
LittleLittleCloud Feb 8, 2023
49ac8ae
remove handler
LittleLittleCloud Feb 8, 2023
3722dcb
add handler back
LittleLittleCloud Feb 8, 2023
ff55857
add more logging
LittleLittleCloud Feb 8, 2023
509f963
add more logger
LittleLittleCloud Feb 8, 2023
1240335
add logging
LittleLittleCloud Feb 9, 2023
5a27af4
fix tests
LittleLittleCloud Feb 9, 2023
move ReportTrialResourceUsage to IPerformanceMonitor
LittleLittleCloud committed Jan 3, 2023
commit 8aa0ad88c7b73d5a73936a00cb88252787bd68fd
6 changes: 0 additions & 6 deletions src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs
@@ -24,8 +24,6 @@ public class NotebookMonitor : IMonitor
public List<TrialResult> CompletedTrials { get; set; }
public DataFrame TrialData { get; set; }

public int ResourceUsageCheckInterval => 5000;

public NotebookMonitor(SweepablePipeline pipeline)
{
CompletedTrials = new List<TrialResult>();
@@ -86,9 +84,5 @@ public void SetUpdate(DisplayedValue valueToUpdate)
_valueToUpdate = valueToUpdate;
ThrottledUpdate();
}

public void ReportTrialResourceUsage(TrialSettings setting)
{
}
}
}
41 changes: 39 additions & 2 deletions src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs
@@ -149,18 +149,55 @@ public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, Swe
return experiment;
}

/// <summary>
/// Set <see cref="DefaultPerformanceMonitor"/> as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
/// </summary>
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
/// <param name="checkIntervalInMilliseconds">the interval in milliseconds for <see cref="DefaultPerformanceMonitor"/> to sample <see cref="TrialPerformanceMetrics"/></param>
/// <returns></returns>
public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000)
{
experiment.SetPerformanceMonitor((service) =>
{
var channel = service.GetService<IChannel>();

return new DefaultPerformanceMonitor(channel, checkIntervalInMilliseconds);
var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds);
});

return experiment;
}

/// <summary>
/// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
/// </summary>
/// <typeparam name="TPerformanceMonitor"></typeparam>
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
/// <param name="factory"></param>
/// <returns></returns>
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment, Func<IServiceProvider, TPerformanceMonitor> factory)
where TPerformanceMonitor : class, IPerformanceMonitor

{
experiment.ServiceCollection.AddTransient<IPerformanceMonitor>(factory);

return experiment;
}

/// <summary>
/// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
/// </summary>
/// <typeparam name="TPerformanceMonitor"></typeparam>
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
/// <returns></returns>
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment)
where TPerformanceMonitor : class, IPerformanceMonitor

{
experiment.ServiceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();

return experiment;
}

/// <summary>
/// Set <see cref="SmacTuner"/> as tuner for hyper-parameter optimization. The performance of smac is to a large extent determined
/// by <paramref name="numberOfTrees"/>, <paramref name="nMinForSpit"/> and <paramref name="splitRatio"/>, which are used to fit smac's inner
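
For reference, a minimal usage sketch of the three SetPerformanceMonitor overloads introduced above. The names mlContext, experiment, MyPerformanceMonitor and its constructor signature are illustrative placeholders rather than part of this change:

var experiment = mlContext.Auto().CreateExperiment();

// Built-in monitor: sample TrialPerformanceMetrics every 2 seconds.
experiment.SetPerformanceMonitor(checkIntervalInMilliseconds: 2000);

// Custom monitor resolved from the experiment's service collection.
experiment.SetPerformanceMonitor<MyPerformanceMonitor>();

// Custom monitor built by a factory that can pull registered services.
experiment.SetPerformanceMonitor(service =>
{
    var channel = service.GetService<IChannel>();
    var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
    return new MyPerformanceMonitor(settings, channel);
});
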
44 changes: 1 addition & 43 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs
@@ -194,22 +194,6 @@ public AutoMLExperiment SetTuner<TTuner>()
return this;
}

internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>()
where TPerformanceMonitor : class, IPerformanceMonitor
{
_serviceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();

return this;
}

internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(Func<IServiceProvider, TPerformanceMonitor> factory)
where TPerformanceMonitor : class, IPerformanceMonitor
{
_serviceCollection.AddTransient<IPerformanceMonitor>(factory);

return this;
}

/// <summary>
/// Run experiment and return the best trial result synchronously.
/// </summary>
@@ -257,29 +241,14 @@ public async Task<TrialResult> RunAsync(CancellationToken ct = default)
{
TrialId = trialNum++,
Parameter = Parameter.CreateNestedParameter(),
CancellationTokenSource = null,
PerformanceMetrics = new TrialPerformanceMetrics(),
};
var parameter = tuner.Propose(trialSettings);
trialSettings.Parameter = parameter;

using (var trialCancellationTokenSource = new CancellationTokenSource())
{
trialSettings.CancellationTokenSource = trialCancellationTokenSource;
monitor?.ReportRunningTrial(trialSettings);

System.Timers.Timer resourceUsageTimer = null;
if ((monitor != null) && (monitor?.ResourceUsageCheckInterval > 0))
{
resourceUsageTimer = new System.Timers.Timer(monitor.ResourceUsageCheckInterval);
resourceUsageTimer.Elapsed += (o, e) =>
{
monitor?.ReportTrialResourceUsage(trialSettings);
};
resourceUsageTimer.AutoReset = true;
resourceUsageTimer.Enabled = false;
}

void handler(object o, EventArgs e)
{
// only force-cancel running trials when there are completed trials.
@@ -296,21 +265,11 @@ void handler(object o, EventArgs e)

performanceMonitor.PerformanceMetricsUpdated += (o, metrics) =>
{
trialSettings.PerformanceMetrics = metrics;

if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested)
{
logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}");
trialCancellationTokenSource.Cancel();

GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024);
GC.Collect();
}
performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource);
};

var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token);
performanceMonitor.Start();
resourceUsageTimer?.Start();
logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}");
var trialResult = await trialTask;

@@ -365,7 +324,6 @@ void handler(object o, EventArgs e)
finally
{
aggregateTrainingStopManager.OnStopTraining -= handler;
resourceUsageTimer?.Stop();
}
}
}
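
Condensed from the RunAsync hunks above, the per-trial wiring now looks roughly like this. The method name, the ITrialRunner parameter type and the omission of tuner updates, the stop manager and error handling are simplifications for illustration:

private static async Task<TrialResult> RunTrialSketchAsync(
    ITrialRunner runner,
    IPerformanceMonitor performanceMonitor,
    IMonitor monitor,
    TrialSettings trialSettings)
{
    using (var trialCancellationTokenSource = new CancellationTokenSource())
    {
        monitor?.ReportRunningTrial(trialSettings);

        // The resource policy now lives in the performance monitor: every metrics
        // sample is forwarded together with the trial's CancellationTokenSource,
        // so the monitor (not the experiment loop) decides whether to cancel.
        performanceMonitor.PerformanceMetricsUpdated += (o, metrics) =>
            performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource);

        var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token);
        performanceMonitor.Start();
        var trialResult = await trialTask;

        monitor?.ReportCompletedTrial(trialResult);
        return trialResult;
    }
}
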
16 changes: 4 additions & 12 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs
@@ -13,15 +13,13 @@ namespace Microsoft.ML.AutoML
/// </summary>
public interface IMonitor
{
/// <summary>
/// Interval in milliseconds to report resource usage.
/// </summary>
int ResourceUsageCheckInterval { get; }
void ReportCompletedTrial(TrialResult result);

void ReportBestTrial(TrialResult result);

void ReportFailTrial(TrialSettings settings, Exception exception = null);

void ReportRunningTrial(TrialSettings settings);
void ReportTrialResourceUsage(TrialSettings settings);
}

/// <summary>
@@ -32,14 +30,12 @@ internal class MLContextMonitor : IMonitor
private readonly IChannel _logger;
private readonly List<TrialResult> _completedTrials;
private readonly SweepablePipeline _pipeline;
public int ResourceUsageCheckInterval { get; private set; }

public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 5000)
public MLContextMonitor(IChannel logger, SweepablePipeline pipeline)
{
_logger = logger;
_completedTrials = new List<TrialResult>();
_pipeline = pipeline;
ResourceUsageCheckInterval = resourceUsageCheckInterval;
}

public virtual void ReportBestTrial(TrialResult result)
@@ -62,10 +58,6 @@ public virtual void ReportRunningTrial(TrialSettings setting)
{
_logger.Info($"Update Running Trial - Id: {setting.TrialId}");
}

public void ReportTrialResourceUsage(TrialSettings setting)
{
}
}

internal class TrialResultMonitor<TMetrics> : MLContextMonitor
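
With ResourceUsageCheckInterval and ReportTrialResourceUsage removed, a custom IMonitor only has the four trial callbacks left to implement. A minimal sketch follows; ConsoleMonitor is a hypothetical example and the TrialResult properties used are assumed from the existing AutoML API:

public class ConsoleMonitor : IMonitor
{
    public void ReportBestTrial(TrialResult result)
        => Console.WriteLine($"Best trial so far: {result.TrialSettings.TrialId} (metric {result.Metric})");

    public void ReportCompletedTrial(TrialResult result)
        => Console.WriteLine($"Trial {result.TrialSettings.TrialId} completed");

    public void ReportFailTrial(TrialSettings settings, Exception exception = null)
        => Console.WriteLine($"Trial {settings.TrialId} failed: {exception?.Message}");

    public void ReportRunningTrial(TrialSettings settings)
        => Console.WriteLine($"Trial {settings.TrialId} running");
}
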
28 changes: 25 additions & 3 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs
@@ -8,13 +8,15 @@
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Timers;
using Microsoft.ML.Runtime;
using Timer = System.Timers.Timer;

namespace Microsoft.ML.AutoML
{
internal interface IPerformanceMonitor : IDisposable
public interface IPerformanceMonitor : IDisposable
{
void Start();

@@ -24,20 +26,28 @@ internal interface IPerformanceMonitor : IDisposable

double? GetPeakCpuUsage();

/// <summary>
/// The handler function invoked every time <see cref="PerformanceMetricsUpdated"/> is fired.
/// </summary>
void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource);


public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;
}

internal class DefaultPerformanceMonitor : IPerformanceMonitor
public class DefaultPerformanceMonitor : IPerformanceMonitor
{
private readonly IChannel _logger;
private readonly AutoMLExperiment.AutoMLExperimentSettings _settings;
private Timer _timer;
private double? _peakCpuUsage;
private double? _peakMemoryUsage;
private readonly int _checkIntervalInMilliseconds;
private TimeSpan _totalCpuProcessorTime;

public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMilliseconds)
public DefaultPerformanceMonitor(AutoMLExperiment.AutoMLExperimentSettings settings, IChannel logger, int checkIntervalInMilliseconds)
{
_settings = settings;
_logger = logger;
_checkIntervalInMilliseconds = checkIntervalInMilliseconds;
}
@@ -122,5 +132,17 @@ private void SampleCpuAndMemoryUsage()
PerformanceMetricsUpdated?.Invoke(this, metrics);
}
}

public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource)
{
if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested)
{
_logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}");
trialCancellationTokenSource.Cancel();

GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024);
GC.Collect();
}
}
}
}
9 changes: 0 additions & 9 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs
@@ -23,14 +23,5 @@ public class TrialSettings
/// Parameters for the pipeline used in this trial
/// </summary>
public Parameter Parameter { get; set; }
/// <summary>
/// Cancellation token source to have the ability to cancel the trial
/// </summary>
[JsonIgnore]
public CancellationTokenSource CancellationTokenSource { get; set; }
/// <summary>
/// Performance metrics of the trial
/// </summary>
public TrialPerformanceMetrics PerformanceMetrics { get; internal set; }
}
}
7 changes: 5 additions & 2 deletions test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs
@@ -81,8 +81,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async(
return new DummyTrialRunner(settings, 5, channel);
})
.SetTuner<RandomSearchTuner>()
.SetMaximumMemoryUsageInMegaByte(0.01)
.SetPerformanceMonitor<DummyPeformanceMonitor>();
.SetMaximumMemoryUsageInMegaByte(0.01);

var runExperimentAction = async () => await experiment.RunAsync();
await runExperimentAction.Should().ThrowExactlyAsync<TimeoutException>();
@@ -423,6 +422,10 @@ public void Dispose()
return 1000;
}

public void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource)
{
}

public void Start()
{
if (_timer == null)