2 releases
| Version | Released |
|---|---|
| 0.1.0-alpha.2 (new) | Dec 22, 2025 |
| 0.1.0-alpha.1 | Sep 30, 2025 |
#1980 in Machine learning
Used in 3 crates
5.5MB
120K SLoC
torsh-profiler
Performance profiling and analysis tools for ToRSh applications.
Overview
This crate provides comprehensive profiling capabilities for deep learning workloads:
- Performance Profiling: CPU/GPU time, memory usage, operation counts
- Memory Profiling: Allocation tracking, peak usage, memory leaks
- Operation Analysis: Kernel timing, FLOPS counting, bottleneck detection
- Visualization: Chrome tracing, TensorBoard integration, custom views
- Integration: Works with CUDA profiler, Intel VTune, Apple Instruments
Usage
Basic Profiling
use torsh_profiler::prelude::*;
// Profile a model
let profiler = Profiler::new()
.record_shapes(true)
.with_stack(true);
with_profiler(&profiler, || {
for _ in 0..100 {
let output = model.forward(&input)?;
loss = criterion(&output, &target)?;
loss.backward()?;
optimizer.step()?;
}
})?;
// Get results
let report = profiler.report();
println!("{}", report);
Detailed Operation Profiling
// Profile with categories
let profiler = Profiler::new()
.activities(&[ProfilerActivity::CPU, ProfilerActivity::CUDA])
.record_shapes(true)
.profile_memory(true)
.with_stack(true);
// Profile specific operations
profiler.start();
profiler.step("data_loading");
let batch = dataloader.next()?;
profiler.step("forward");
let output = model.forward(&batch)?;
profiler.step("loss");
let loss = criterion(&output, &target)?;
profiler.step("backward");
loss.backward()?;
profiler.step("optimizer");
optimizer.step()?;
profiler.stop();
// Export trace
profiler.export_chrome_trace("trace.json")?;
Memory Profiling
use torsh_profiler::memory::*;
// Track memory allocations
let memory_profiler = MemoryProfiler::new()
.track_allocations(true)
.include_stacktraces(true);
memory_profiler.start();
// Your code here
let tensors = (0..1000).map(|_| {
randn(&[1024, 1024])
}).collect::<Vec<_>>();
memory_profiler.stop();
// Analyze memory usage
let snapshot = memory_profiler.snapshot()?;
println!("Peak memory: {} MB", snapshot.peak_memory_mb());
println!("Current memory: {} MB", snapshot.current_memory_mb());
// Find memory leaks
let leaks = memory_profiler.find_leaks()?;
for leak in leaks {
println!("Potential leak: {} bytes at {}", leak.size, leak.stack_trace);
}
FLOPS Counting
use torsh_profiler::flops::*;
// Count FLOPS for a model
let flop_counter = FlopCounter::new(&model);
let input_shape = vec![1, 3, 224, 224];
let total_flops = flop_counter.count(&input_shape)?;
println!("Total FLOPs: {}", format_flops(total_flops));
// Detailed breakdown
let breakdown = flop_counter.breakdown(&input_shape)?;
for (module_name, flops) in breakdown {
println!("{}: {}", module_name, format_flops(flops));
}
Custom Profiling Regions
use torsh_profiler::profile;
// Profile specific code regions
profile!("data_preprocessing", {
let normalized = normalize(&data)?;
let augmented = augment(&normalized)?;
augmented
});
// Or with explicit profiler
let profiler = Profiler::current();
let _guard = profiler.record("critical_section");
// Critical code here
// _guard automatically stops profiling when dropped
TensorBoard Integration
use torsh_profiler::tensorboard::*;
// Export to TensorBoard format
let tb_profiler = TensorBoardProfiler::new("./runs/profile");
tb_profiler.add_scalar("loss", loss.item(), step)?;
tb_profiler.add_histogram("weights", &model.weight, step)?;
tb_profiler.add_graph(&model, &example_input)?;
// Profile and export
with_profiler(&tb_profiler, || {
// Training loop
})?;
Advanced Analysis
use torsh_profiler::analysis::*;
// Analyze bottlenecks
let analyzer = ProfileAnalyzer::new(&profiler.events());
let bottlenecks = analyzer.find_bottlenecks()?;
for bottleneck in bottlenecks.iter().take(10) {
println!("{}: {:.2}% of total time", bottleneck.name, bottleneck.percentage);
}
// Find inefficient operations
let inefficiencies = analyzer.find_inefficiencies()?;
for issue in inefficiencies {
println!("Inefficiency: {}", issue.description);
println!("Suggestion: {}", issue.suggestion);
}
// Memory access patterns
let memory_patterns = analyzer.analyze_memory_access()?;
println!("Cache efficiency: {:.2}%", memory_patterns.cache_efficiency * 100.0);
Multi-GPU Profiling
// Profile distributed training
let profiler = DistributedProfiler::new()
.rank(rank)
.world_size(world_size)
.sync_enabled(true);
with_profiler(&profiler, || {
// Distributed training
})?;
// Aggregate results from all ranks
if rank == 0 {
let aggregated = profiler.aggregate_results()?;
println!("Total communication time: {:?}", aggregated.comm_time);
println!("Load imbalance: {:.2}%", aggregated.load_imbalance * 100.0);
}
Integration with External Profilers
// NVIDIA Nsight Systems
#[cfg(feature = "cuda")]
{
use torsh_profiler::cuda::*;
nvtx::range_push("model_forward");
let output = model.forward(&input)?;
nvtx::range_pop();
}
// Intel VTune
#[cfg(feature = "vtune")]
{
use torsh_profiler::vtune::*;
let domain = vtune::Domain::new("torsh_app");
let task = domain.begin_task("inference");
let output = model.forward(&input)?;
task.end();
}
Profiling Configuration
// Configure via environment variables
// TORSH_PROFILER_ENABLED=1
// TORSH_PROFILER_OUTPUT=trace.json
// TORSH_PROFILER_ACTIVITIES=cpu,cuda
// Or programmatically
ProfilerConfig::default()
.enabled(true)
.output_path("profile.json")
.activities(vec![Activity::CPU, Activity::CUDA])
.record_shapes(true)
.profile_memory(true)
.with_stack(true)
.with_flops(true)
.with_modules(true)
.export_format(ExportFormat::Chrome)
.apply()?;
Visualization
The profiler can export data in various formats:
- Chrome Tracing: View in chrome://tracing
- TensorBoard: Integrated with TensorBoard profiler plugin
- Perfetto: Modern trace viewer
- Custom JSON: For custom analysis tools
Performance Tips
- Profile representative workloads
- Warm up before profiling (exclude first iterations)
- Profile both training and inference
- Look for memory allocation patterns
- Check for unnecessary synchronizations
License
Licensed under either of
- Apache License, Version 2.0 (LICENSE-APACHE)
- MIT license (LICENSE-MIT)
at your option.
Dependencies
~168MB
~2.5M SLoC