Skip to content

Commit d239fda

Browse files
Add continuous resource monitoring to AutoML.IMonitor (#6520)
* Fix a typo * Fix trial cancellation bug * Move performance related properties to TrialPerformanceMetrics and add ReportTrialResourceUsage event to IMonitor * Add new class and property explanations * Revert "Fix trial cancellation bug" This reverts commit 269b1bd. * Remove pipeline info from the IMonitor Running event * Remove FreeSpaceOnDrives from TrialPerformanceMetrics * Change the default resource check interval to 5 seconds * Remove StartedAtUtc property from TrialSettings * move ReportTrialResourceUsage to IPerformanceMonitor * Update AutoMLExperimentExtension.cs * Pause the performance monitor if the trial is not running * Add StartedAtUtc and EndedAtUtc to TrialSettings * cancel trial when as is * fix tests * fix tests * fix tests * use workingset to evaluate memory usage * remove handler * add handler back * add more logging * add more logger * add logging * fix tests --------- Co-authored-by: XiaoYun Zhang <[email protected]> Co-authored-by: Xiaoyun Zhang <[email protected]>
1 parent 8c0ceaf commit d239fda

File tree

10 files changed

+209
-76
lines changed

10 files changed

+209
-76
lines changed

src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs

+37-2
Original file line numberDiff line numberDiff line change
@@ -149,18 +149,53 @@ public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, Swe
149149
return experiment;
150150
}
151151

152+
/// <summary>
153+
/// Set <see cref="DefaultPerformanceMonitor"/> as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
154+
/// </summary>
155+
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
156+
/// <param name="checkIntervalInMilliseconds">the interval in milliseconds for <see cref="DefaultPerformanceMonitor"/> to sample <see cref="TrialPerformanceMetrics"/></param>
157+
/// <returns></returns>
152158
public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000)
153159
{
154160
experiment.SetPerformanceMonitor((service) =>
155161
{
156162
var channel = service.GetService<IChannel>();
157-
158-
return new DefaultPerformanceMonitor(channel, checkIntervalInMilliseconds);
163+
var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
164+
return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds);
159165
});
160166

161167
return experiment;
162168
}
163169

170+
/// <summary>
171+
/// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
172+
/// </summary>
173+
/// <typeparam name="TPerformanceMonitor"></typeparam>
174+
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
175+
/// <param name="factory"></param>
176+
/// <returns></returns>
177+
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment, Func<IServiceProvider, TPerformanceMonitor> factory)
178+
where TPerformanceMonitor : class, IPerformanceMonitor
179+
{
180+
experiment.ServiceCollection.AddTransient<IPerformanceMonitor>(factory);
181+
182+
return experiment;
183+
}
184+
185+
/// <summary>
186+
/// Set a custom performance monitor as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
187+
/// </summary>
188+
/// <typeparam name="TPerformanceMonitor"></typeparam>
189+
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
190+
/// <returns></returns>
191+
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment)
192+
where TPerformanceMonitor : class, IPerformanceMonitor
193+
{
194+
experiment.ServiceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();
195+
196+
return experiment;
197+
}
198+
164199
/// <summary>
165200
/// Set <see cref="SmacTuner"/> as tuner for hyper-parameter optimization. The performance of smac is to a large extent determined
166201
/// by <paramref name="numberOfTrees"/>, <paramref name="nMinForSpit"/> and <paramref name="splitRatio"/>, which are used to fit smac's inner

src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs

+31-42
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Diagnostics;
67
using System.Linq;
78
using System.Text.Json;
89
using System.Threading;
@@ -193,22 +194,6 @@ public AutoMLExperiment SetTuner<TTuner>()
193194
return this;
194195
}
195196

196-
internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>()
197-
where TPerformanceMonitor : class, IPerformanceMonitor
198-
{
199-
_serviceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();
200-
201-
return this;
202-
}
203-
204-
internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(Func<IServiceProvider, TPerformanceMonitor> factory)
205-
where TPerformanceMonitor : class, IPerformanceMonitor
206-
{
207-
_serviceCollection.AddTransient<IPerformanceMonitor>(factory);
208-
209-
return this;
210-
}
211-
212197
/// <summary>
213198
/// Run experiment and return the best trial result synchronously.
214199
/// </summary>
@@ -249,54 +234,48 @@ public async Task<TrialResult> RunAsync(CancellationToken ct = default)
249234
var trialNum = trialResultManager?.GetAllTrialResults().Max(t => t.TrialSettings?.TrialId) + 1 ?? 0;
250235
var tuner = serviceProvider.GetService<ITuner>();
251236
Contracts.Assert(tuner != null, "tuner can't be null");
237+
252238
while (!aggregateTrainingStopManager.IsStopTrainingRequested())
253239
{
254-
var setting = new TrialSettings()
240+
var trialSettings = new TrialSettings()
255241
{
256242
TrialId = trialNum++,
257243
Parameter = Parameter.CreateNestedParameter(),
244+
StartedAtUtc = DateTime.UtcNow,
258245
};
259-
var parameter = tuner.Propose(setting);
260-
setting.Parameter = parameter;
246+
var parameter = tuner.Propose(trialSettings);
247+
trialSettings.Parameter = parameter;
261248

262-
monitor?.ReportRunningTrial(setting);
263249
using (var trialCancellationTokenSource = new CancellationTokenSource())
264250
{
251+
monitor?.ReportRunningTrial(trialSettings);
252+
265253
void handler(object o, EventArgs e)
266254
{
267-
// only force-canceling running trials when there's completed trials.
268-
// otherwise, wait for the current running trial to be completed.
269-
if (_bestTrialResult != null)
270-
trialCancellationTokenSource.Cancel();
255+
trialCancellationTokenSource.Cancel();
271256
}
272257
try
273258
{
274259
using (var performanceMonitor = serviceProvider.GetService<IPerformanceMonitor>())
275260
using (var runner = serviceProvider.GetRequiredService<ITrialRunner>())
276261
{
277262
aggregateTrainingStopManager.OnStopTraining += handler;
278-
279-
performanceMonitor.MemoryUsageInMegaByte += (o, m) =>
263+
performanceMonitor.PerformanceMetricsUpdated += (o, metrics) =>
280264
{
281-
if (_settings.MaximumMemoryUsageInMegaByte is double d && m > d && !trialCancellationTokenSource.IsCancellationRequested)
282-
{
283-
logger.Trace($"cancel current trial {setting.TrialId} because it uses {m} mb memory and the maximum memory usage is {d}");
284-
trialCancellationTokenSource.Cancel();
285-
286-
GC.AddMemoryPressure(Convert.ToInt64(m) * 1024 * 1024);
287-
GC.Collect();
288-
}
265+
performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource);
289266
};
290267

291268
performanceMonitor.Start();
292-
logger.Trace($"trial setting - {JsonSerializer.Serialize(setting)}");
293-
var trialResult = await runner.RunAsync(setting, trialCancellationTokenSource.Token);
269+
logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}");
270+
var trialResult = await runner.RunAsync(trialSettings, trialCancellationTokenSource.Token);
294271

295272
var peakCpu = performanceMonitor?.GetPeakCpuUsage();
296273
var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte();
297274
trialResult.PeakCpu = peakCpu;
298275
trialResult.PeakMemoryInMegaByte = peakMemoryInMB;
276+
trialResult.TrialSettings.EndedAtUtc = DateTime.UtcNow;
299277

278+
performanceMonitor.Pause();
300279
monitor?.ReportCompletedTrial(trialResult);
301280
tuner.Update(trialResult);
302281
trialResultManager?.AddOrUpdateTrialResult(trialResult);
@@ -313,26 +292,37 @@ void handler(object o, EventArgs e)
313292
}
314293
catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false)
315294
{
316-
monitor?.ReportFailTrial(setting, ex);
317-
var result = new TrialResult
295+
logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, continue training");
296+
trialSettings.EndedAtUtc = DateTime.UtcNow;
297+
monitor?.ReportFailTrial(trialSettings, ex);
298+
var trialResult = new TrialResult
318299
{
319-
TrialSettings = setting,
300+
TrialSettings = trialSettings,
320301
Loss = double.MaxValue,
321302
};
322303

323-
tuner.Update(result);
304+
tuner.Update(trialResult);
305+
trialResultManager?.AddOrUpdateTrialResult(trialResult);
306+
aggregateTrainingStopManager.Update(trialResult);
324307
continue;
325308
}
326309
catch (OperationCanceledException) when (aggregateTrainingStopManager.IsStopTrainingRequested())
327310
{
311+
logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, stop training");
312+
328313
break;
329314
}
330315
catch (Exception ex)
331316
{
332-
monitor?.ReportFailTrial(setting, ex);
317+
logger.Trace($"trial failed - {JsonSerializer.Serialize(trialSettings)}, stop training");
318+
319+
trialSettings.EndedAtUtc = DateTime.UtcNow;
320+
monitor?.ReportFailTrial(trialSettings, ex);
333321

334322
if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null)
335323
{
324+
logger.Trace($"trial fatal error - {JsonSerializer.Serialize(trialSettings)}, stop training");
325+
336326
// TODO
337327
// it's questionable whether to abort the entire training process
338328
// for a single failed trial. We should make it an option and only exit
@@ -343,7 +333,6 @@ void handler(object o, EventArgs e)
343333
finally
344334
{
345335
aggregateTrainingStopManager.OnStopTraining -= handler;
346-
347336
}
348337
}
349338
}

src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public interface IMonitor
1919

2020
void ReportFailTrial(TrialSettings settings, Exception exception = null);
2121

22-
void ReportRunningTrial(TrialSettings setting);
22+
void ReportRunningTrial(TrialSettings settings);
2323
}
2424

2525
/// <summary>
@@ -30,6 +30,7 @@ internal class MLContextMonitor : IMonitor
3030
private readonly IChannel _logger;
3131
private readonly List<TrialResult> _completedTrials;
3232
private readonly SweepablePipeline _pipeline;
33+
3334
public MLContextMonitor(IChannel logger, SweepablePipeline pipeline)
3435
{
3536
_logger = logger;
@@ -55,7 +56,7 @@ public virtual void ReportFailTrial(TrialSettings settings, Exception exception
5556

5657
public virtual void ReportRunningTrial(TrialSettings setting)
5758
{
58-
_logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}");
59+
_logger.Info($"Update Running Trial - Id: {setting.TrialId}");
5960
}
6061
}
6162

src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs

+53-12
Original file line numberDiff line numberDiff line change
@@ -5,47 +5,57 @@
55
using System;
66
using System.Collections.Generic;
77
using System.Diagnostics;
8+
using System.IO;
9+
using System.Linq;
810
using System.Text;
11+
using System.Threading;
912
using System.Threading.Tasks;
1013
using System.Timers;
1114
using Microsoft.ML.Runtime;
15+
using Timer = System.Timers.Timer;
1216

1317
namespace Microsoft.ML.AutoML
1418
{
15-
internal interface IPerformanceMonitor : IDisposable
19+
public interface IPerformanceMonitor : IDisposable
1620
{
1721
void Start();
1822

23+
void Pause();
24+
1925
void Stop();
2026

2127
double? GetPeakMemoryUsageInMegaByte();
2228

2329
double? GetPeakCpuUsage();
2430

25-
public event EventHandler<double> CpuUsage;
31+
/// <summary>
32+
/// The handler function invoked every time <see cref="PerformanceMetricsUpdated"/> is fired.
33+
/// </summary>
34+
void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource);
35+
2636

27-
public event EventHandler<double> MemoryUsageInMegaByte;
37+
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;
2838
}
2939

30-
internal class DefaultPerformanceMonitor : IPerformanceMonitor
40+
public class DefaultPerformanceMonitor : IPerformanceMonitor
3141
{
3242
private readonly IChannel _logger;
43+
private readonly AutoMLExperiment.AutoMLExperimentSettings _settings;
3344
private Timer _timer;
3445
private double? _peakCpuUsage;
3546
private double? _peakMemoryUsage;
3647
private readonly int _checkIntervalInMilliseconds;
3748
private TimeSpan _totalCpuProcessorTime;
3849

39-
public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMilliseconds)
50+
public DefaultPerformanceMonitor(AutoMLExperiment.AutoMLExperimentSettings settings, IChannel logger, int checkIntervalInMilliseconds)
4051
{
52+
_settings = settings;
4153
_logger = logger;
4254
_checkIntervalInMilliseconds = checkIntervalInMilliseconds;
4355
}
4456

4557

46-
public event EventHandler<double> CpuUsage;
47-
48-
public event EventHandler<double> MemoryUsageInMegaByte;
58+
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;
4959

5060

5161
public void Dispose()
@@ -71,9 +81,18 @@ public void Start()
7181
_totalCpuProcessorTime = Process.GetCurrentProcess().TotalProcessorTime;
7282
_timer.Elapsed += OnCheckCpuAndMemoryUsage;
7383
_timer.AutoReset = true;
74-
_timer.Enabled = true;
7584
_logger?.Trace($"{typeof(DefaultPerformanceMonitor)} has been started");
7685
}
86+
87+
// trigger the PerformanceMetricsUpdated event and (re)start the timer
88+
_timer.Enabled = false;
89+
SampleCpuAndMemoryUsage();
90+
_timer.Enabled = true;
91+
}
92+
93+
public void Pause()
94+
{
95+
_timer.Enabled = false;
7796
}
7897

7998
public void Stop()
@@ -108,11 +127,33 @@ private void SampleCpuAndMemoryUsage()
108127
_peakCpuUsage = Math.Max(cpuUsageInTotal, _peakCpuUsage ?? 0);
109128

110129
// calculate Memory Usage in MB
111-
var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024);
130+
var memoryUsage = process.WorkingSet64 * 1.0 / (1024 * 1024);
112131
_peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0);
132+
133+
var metrics = new TrialPerformanceMetrics()
134+
{
135+
CpuUsage = cpuUsageInTotal,
136+
MemoryUsage = memoryUsage,
137+
PeakCpuUsage = _peakCpuUsage,
138+
PeakMemoryUsage = _peakMemoryUsage
139+
};
140+
113141
_logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}");
114-
MemoryUsageInMegaByte?.Invoke(this, memoryUsage);
115-
CpuUsage?.Invoke(this, cpuUsageInTotal);
142+
143+
PerformanceMetricsUpdated?.Invoke(this, metrics);
144+
}
145+
}
146+
147+
public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource)
148+
{
149+
_logger.Trace($"maximum memory usage: {_settings.MaximumMemoryUsageInMegaByte}, PeakMemoryUsage: {metrics.PeakMemoryUsage} trialIsCancelled: {trialCancellationTokenSource.IsCancellationRequested}");
150+
if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested)
151+
{
152+
_logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}");
153+
trialCancellationTokenSource.Cancel();
154+
155+
GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024);
156+
GC.Collect();
116157
}
117158
}
118159
}

src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs

+12-1
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,21 @@ public void AddTrainingStopManager(IStopTrainingManager manager)
140140
_managers.Add(manager);
141141
manager.OnStopTraining += (o, e) =>
142142
{
143-
OnStopTraining?.Invoke(this, e);
143+
if (_managers.Exists(manager.Equals))
144+
{
145+
OnStopTraining?.Invoke(this, e);
146+
}
144147
};
145148
}
146149

150+
public void RemoveTrainingStopManagerIfExist(IStopTrainingManager manager)
151+
{
152+
if (_managers.Exists(manager.Equals))
153+
{
154+
_managers.RemoveAll(manager.Equals);
155+
}
156+
}
157+
147158
public void Update(TrialResult result)
148159
{
149160
foreach (var manager in _managers)

0 commit comments

Comments
 (0)