Performance Optimization
Performance optimization in Rust involves understanding how the compiler works, measuring performance accurately, and applying targeted optimizations. Rust's zero-cost abstractions and fine-grained control over memory make it excellent for high-performance applications.
Performance Measurement and Profiling
Benchmarking with Criterion
First, add to Cargo.toml:
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
[[bench]]
name = "my_benchmark"
harness = false
Basic benchmarking:
// benches/my_benchmark.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn fibonacci_recursive(n: u64) -> u64 {
match n {
0 => 1,
1 => 1,
n => fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2),
}
}
fn fibonacci_iterative(n: u64) -> u64 {
if n == 0 || n == 1 {
return 1;
}
let mut a = 1;
let mut b = 1;
for _ in 2..=n {
let temp = a + b;
a = b;
b = temp;
}
b
}
fn fibonacci_lookup(n: u64) -> u64 {
const FIBONACCI: [u64; 21] = [
1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946
];
if n < 21 {
FIBONACCI[n as usize]
} else {
fibonacci_iterative(n)
}
}
fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("fibonacci");
for i in [10u64, 15, 20].iter() {
group.bench_with_input(format!("recursive/{}", i), i, |b, i| {
b.iter(|| fibonacci_recursive(black_box(*i)))
});
group.bench_with_input(format!("iterative/{}", i), i, |b, i| {
b.iter(|| fibonacci_iterative(black_box(*i)))
});
group.bench_with_input(format!("lookup/{}", i), i, |b, i| {
b.iter(|| fibonacci_lookup(black_box(*i)))
});
}
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Run benchmarks:
cargo bench
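Criterion accepts a name filter and can store named baselines for before/after comparisons (the flags after `--` are Criterion's own):
# Run only benchmarks whose names contain "fibonacci"
cargo bench fibonacci
# Save a baseline, then compare a later run against it
cargo bench -- --save-baseline before
cargo bench -- --baseline before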
Built-in Benchmarking (Nightly)
#![feature(test)]
extern crate test;
use test::Bencher;
#[bench]
fn bench_vector_sum_loop(b: &mut Bencher) {
let data: Vec<i32> = (0..1000).collect();
b.iter(|| {
let mut sum = 0;
for &item in &data {
sum += item;
}
sum
});
}
#[bench]
fn bench_vector_sum_iterator(b: &mut Bencher) {
let data: Vec<i32> = (0..1000).collect();
b.iter(|| data.iter().sum::<i32>());
}
#[bench]
fn bench_vector_sum_fold(b: &mut Bencher) {
let data: Vec<i32> = (0..1000).collect();
b.iter(|| data.iter().fold(0, |acc, &x| acc + x));
}
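These built-in benchmarks only compile on the nightly toolchain:
cargo +nightly bench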
Profiling with perf and flamegraph
Install dependencies:
cargo install flamegraph
# On Linux: install perf
Profile your application:
# Generate flamegraph
cargo flamegraph --bin my_app
# Profile with perf
perf record --call-graph=dwarf target/release/my_app
perf report
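Profiles are far more readable when the release binary keeps debug symbols; a common tweak in Cargo.toml is:
[profile.release]
debug = true # keep symbol names and line info for the profiler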
Example application to profile:
// src/main.rs
use std::collections::HashMap;
fn expensive_computation() -> HashMap<String, u64> {
let mut map = HashMap::new();
for i in 0..1_000_000 {
let key = format!("key_{}", i);
let value = (i as u64).pow(2);
map.insert(key, value);
}
map
}
fn process_data(map: &HashMap<String, u64>) -> u64 {
map.values().filter(|&&v| v % 2 == 0).sum()
}
fn main() {
let map = expensive_computation();
let result = process_data(&map);
println!("Result: {}", result);
}
Compiler Optimizations
Release Profile Configuration
# Cargo.toml
[profile.release]
opt-level = 3 # Maximum optimization
debug = false # No debug info
debug-assertions = false
overflow-checks = false
lto = true # Link-time optimization
codegen-units = 1 # Better optimization, slower compile
panic = 'abort' # Smaller binary size
[profile.release-lto]
inherits = "release"
lto = "fat" # More aggressive LTO
codegen-units = 1
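Custom profiles are selected with Cargo's --profile flag:
cargo build --profile release-lto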
Target-Specific Optimizations
# .cargo/config.toml
[build]
rustflags = [
"-C", "target-cpu=native", # Optimize for current CPU
"-C", "target-feature=+crt-static", # Static linking
]
[target.x86_64-unknown-linux-gnu]
rustflags = [
"-C", "link-arg=-fuse-ld=lld", # Use faster linker
"-C", "target-cpu=haswell", # Target specific CPU
]
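Note that Cargo picks a single rustflags source at a time (the RUSTFLAGS environment variable, then target-specific flags, then [build]), so the target-specific list above replaces the [build] list rather than extending it. To list the CPU names and features your rustc accepts:
rustc --print target-cpus
rustc --print target-features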
Profile-Guided Optimization (PGO)
# Step 1: Build an instrumented binary
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" \
cargo build --release --target-dir=/tmp/pgo
# Step 2: Run a typical workload to collect profile data
/tmp/pgo/release/my_app
# Step 3: Merge the raw profiles with llvm-profdata
# (available via LLVM or rustup's llvm-tools-preview component)
llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data
# Step 4: Rebuild using the merged profile
RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \
cargo build --release
Memory Optimization
Reducing Allocations
// Bad: Many allocations
fn process_strings_bad(input: &[&str]) -> Vec<String> {
let mut result = Vec::new();
for s in input {
let processed = s.to_uppercase();
let formatted = format!("Processed: {}", processed);
result.push(formatted);
}
result
}
// Good: Pre-allocate and reuse
fn process_strings_good(input: &[&str]) -> Vec<String> {
let mut result = Vec::with_capacity(input.len());
let mut buffer = String::new();
for s in input {
buffer.clear();
buffer.push_str("Processed: ");
for ch in s.chars() {
buffer.extend(ch.to_uppercase());
}
result.push(buffer.clone());
}
result
}
// Better: Avoid cloning
fn process_strings_better(input: &[&str], output: &mut Vec<String>) {
output.clear();
output.reserve(input.len());
for s in input {
let formatted = format!("Processed: {}", s.to_uppercase());
output.push(formatted);
}
}
// Best: Use iterators for zero-allocation processing when possible
fn process_strings_best(input: &[&str]) -> impl Iterator<Item = String> + '_ {
input.iter().map(|s| format!("Processed: {}", s.to_uppercase()))
}
Memory Pool Pattern
use std::collections::VecDeque;
struct ObjectPool<T> {
objects: VecDeque<T>,
factory: fn() -> T,
}
impl<T> ObjectPool<T> {
fn new(factory: fn() -> T) -> Self {
ObjectPool {
objects: VecDeque::new(),
factory,
}
}
fn get(&mut self) -> T {
self.objects.pop_front().unwrap_or_else(self.factory)
}
fn return_object(&mut self, obj: T) {
if self.objects.len() < 100 { // Limit pool size
self.objects.push_back(obj);
}
}
}
// Usage example
fn with_pool_optimization() {
let mut string_pool = ObjectPool::new(String::new);
for i in 0..1000 {
let mut s = string_pool.get();
s.clear();
s.push_str(&format!("Item {}", i));
// Process string...
println!("Processing: {}", s);
string_pool.return_object(s);
}
}
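The format! call above still allocates a temporary String on every iteration; a variant of the same loop (using the ObjectPool defined above) formats directly into the pooled buffer via std::fmt::Write:
use std::fmt::Write;

fn with_pool_no_temp_alloc() {
    let mut string_pool = ObjectPool::new(String::new);
    for i in 0..1000 {
        let mut s = string_pool.get();
        s.clear();
        // Formats straight into the pooled String, no temporary allocation
        write!(s, "Item {}", i).unwrap();
        println!("Processing: {}", s);
        string_pool.return_object(s);
    }
}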
Stack vs Heap Allocation
// Stack allocation (faster)
fn stack_allocation() -> [i32; 1000] {
[0; 1000] // Allocated on stack
}
// Heap allocation (more flexible but slower)
fn heap_allocation() -> Vec<i32> {
vec![0; 1000] // Allocated on heap
}
// "Adaptive" sizing: note that Box<[i32]> always lives on the heap, so both
// branches allocate; for genuinely stack-backed small buffers, see SmallVec below
fn adaptive_allocation(size: usize) -> Box<[i32]> {
    if size <= 1000 {
        // Small: exact-capacity allocation, no over-allocation
        let mut vec = Vec::with_capacity(size);
        vec.resize(size, 0);
        vec.into_boxed_slice()
    } else {
        // Large: direct heap allocation
        vec![0; size].into_boxed_slice()
    }
}
// SmallVec for stack optimization
use smallvec::{SmallVec, smallvec};
fn using_smallvec() {
// Store up to 8 elements on stack, then heap
let mut vec: SmallVec<[i32; 8]> = smallvec![1, 2, 3, 4];
vec.push(5);
println!("SmallVec: {:?}", vec);
}
Data Structure Optimization
Choose Efficient Data Structures
use std::collections::{BTreeMap, HashMap};
use indexmap::IndexMap;
// Different data structures for different use cases
fn data_structure_comparison() {
// HashMap: O(1) average access, no ordering
let mut hash_map = HashMap::new();
hash_map.insert("key1", "value1");
// BTreeMap: O(log n) access, sorted order
let mut btree_map = BTreeMap::new();
btree_map.insert("key1", "value1");
// IndexMap: O(1) access, insertion order preserved
let mut index_map = IndexMap::new();
index_map.insert("key1", "value1");
// Vector: O(1) indexed access, O(n) search
let mut vec = Vec::new();
vec.push(("key1", "value1"));
}
// Custom data structure for specific use case
struct CompactStringSet {
data: Vec<u8>,
offsets: Vec<usize>,
}
impl CompactStringSet {
fn new() -> Self {
CompactStringSet {
data: Vec::new(),
offsets: Vec::new(),
}
}
fn insert(&mut self, s: &str) {
self.offsets.push(self.data.len());
self.data.extend_from_slice(s.as_bytes());
self.data.push(0); // Null terminator
}
fn get(&self, index: usize) -> Option<&str> {
if index >= self.offsets.len() {
return None;
}
let start = self.offsets[index];
let end = if index + 1 < self.offsets.len() {
self.offsets[index + 1] - 1 // -1 for null terminator
} else {
self.data.len() - 1
};
std::str::from_utf8(&self.data[start..end]).ok()
}
fn len(&self) -> usize {
self.offsets.len()
}
}
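A brief usage sketch of CompactStringSet, confirming that indices map back to the original strings:
fn compact_set_usage() {
    let mut set = CompactStringSet::new();
    set.insert("hello");
    set.insert("world");
    assert_eq!(set.len(), 2);
    assert_eq!(set.get(0), Some("hello"));
    assert_eq!(set.get(1), Some("world"));
    assert_eq!(set.get(2), None);
}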
Bit Manipulation Optimizations
// Bit-packed boolean array
struct BitSet {
data: Vec<u64>,
len: usize,
}
impl BitSet {
fn new(len: usize) -> Self {
let words = (len + 63) / 64; // Round up
BitSet {
data: vec![0; words],
len,
}
}
fn set(&mut self, index: usize, value: bool) {
if index >= self.len {
return;
}
let word_index = index / 64;
let bit_index = index % 64;
if value {
self.data[word_index] |= 1u64 << bit_index;
} else {
self.data[word_index] &= !(1u64 << bit_index);
}
}
fn get(&self, index: usize) -> bool {
if index >= self.len {
return false;
}
let word_index = index / 64;
let bit_index = index % 64;
(self.data[word_index] & (1u64 << bit_index)) != 0
}
fn count_ones(&self) -> u32 {
self.data.iter().map(|word| word.count_ones()).sum()
}
}
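// Usage sketch: 128 flags packed into two u64 words
fn bitset_usage() {
    let mut bits = BitSet::new(128);
    bits.set(3, true);
    bits.set(64, true);
    assert!(bits.get(3) && bits.get(64));
    assert!(!bits.get(4));
    assert_eq!(bits.count_ones(), 2);
}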
// Bit manipulation tricks
fn bit_tricks() {
let mut x = 42u32;
// Check if power of 2
let is_power_of_2 = x != 0 && (x & (x - 1)) == 0;
// Next power of 2
x -= 1;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x += 1;
// Count trailing zeros (fast)
let trailing_zeros = 42u32.trailing_zeros();
// Population count (number of 1 bits)
let pop_count = 42u32.count_ones();
println!("Bit tricks: power_of_2={}, next_pow2={}, trailing_zeros={}, pop_count={}",
is_power_of_2, x, trailing_zeros, pop_count);
}
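The standard library already exposes most of these patterns as optimized integer methods, which are usually the clearer choice:
fn bit_tricks_std() {
    let x = 42u32;
    assert!(!x.is_power_of_two()); // power-of-two test
    assert_eq!(x.next_power_of_two(), 64); // next power of two
    assert_eq!(x.trailing_zeros(), 1);
    assert_eq!(x.count_ones(), 3);
}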
Algorithm Optimization
Cache-Friendly Algorithms
// Cache-unfriendly: the innermost loop walks down a column of b, jumping between row allocations
fn matrix_multiply_bad(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
let n = a.len();
let m = b[0].len();
let p = b.len();
let mut result = vec![vec![0.0; m]; n];
for i in 0..n {
for j in 0..m {
for k in 0..p {
result[i][j] += a[i][k] * b[k][j]; // Poor cache locality
}
}
}
result
}
// Cache-friendly: blocked/tiled multiplication
fn matrix_multiply_good(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
let n = a.len();
let m = b[0].len();
let p = b.len();
let mut result = vec![vec![0.0; m]; n];
let block_size = 64; // Tile size chosen so working blocks stay resident in cache
for ii in (0..n).step_by(block_size) {
for jj in (0..m).step_by(block_size) {
for kk in (0..p).step_by(block_size) {
let i_end = (ii + block_size).min(n);
let j_end = (jj + block_size).min(m);
let k_end = (kk + block_size).min(p);
for i in ii..i_end {
for j in jj..j_end {
for k in kk..k_end {
result[i][j] += a[i][k] * b[k][j];
}
}
}
}
}
}
result
}
// SIMD-optimized operations
fn simd_sum(data: &[f32]) -> f32 {
    // Manual SIMD with AVX intrinsics (stable std::arch since Rust 1.27).
    // Calling them on a CPU without AVX is undefined behavior, so check first.
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx") {
            use std::arch::x86_64::*;
            unsafe {
                let mut sum_vec = _mm256_setzero_ps();
                let chunks = data.chunks_exact(8);
                let remainder = chunks.remainder();
                for chunk in chunks {
                    let chunk_vec = _mm256_loadu_ps(chunk.as_ptr());
                    sum_vec = _mm256_add_ps(sum_vec, chunk_vec);
                }
                // Horizontal sum of the 8 lanes
                let high = _mm256_extractf128_ps::<1>(sum_vec);
                let low = _mm256_castps256_ps128(sum_vec);
                let sum128 = _mm_add_ps(high, low);
                let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
                let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps::<1>(sum64, sum64));
                // Add the elements that did not fill a whole 8-lane chunk
                return _mm_cvtss_f32(sum32) + remainder.iter().sum::<f32>();
            }
        }
        data.iter().sum()
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        data.iter().sum()
    }
}
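A quick check (illustrative main) that the SIMD and scalar paths agree; for these small integer values the sums match exactly, though reordered floating-point addition can differ slightly in general:
fn main() {
    let data: Vec<f32> = (0..1024).map(|i| i as f32).collect();
    let simd = simd_sum(&data);
    let scalar: f32 = data.iter().sum();
    println!("simd = {simd}, scalar = {scalar}");
    assert_eq!(simd, scalar);
}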
Parallel Processing
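Rayon is an external crate; a typical dependency entry (version shown is illustrative) is:
# Cargo.toml
[dependencies]
rayon = "1"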
use rayon::prelude::*;
// Sequential processing
fn process_sequential(data: &[i32]) -> Vec<i32> {
data.iter().map(|&x| x * x + 1).collect()
}
// Parallel processing
fn process_parallel(data: &[i32]) -> Vec<i32> {
data.par_iter().map(|&x| x * x + 1).collect()
}
// Parallel reduction
fn parallel_sum(data: &[i32]) -> i64 {
data.par_iter().map(|&x| x as i64).sum()
}
// Custom parallel algorithm
fn parallel_quicksort<T: Ord + Send + Sync>(mut data: Vec<T>) -> Vec<T> {
if data.len() <= 1 {
return data;
}
let pivot_index = data.len() / 2;
let pivot = data.remove(pivot_index);
let (left, right): (Vec<_>, Vec<_>) = data.into_par_iter()
.partition(|item| item < &pivot);
if left.len() > 1000 && right.len() > 1000 {
// Parallel recursive calls for large partitions
let (mut sorted_left, sorted_right) = rayon::join(
|| parallel_quicksort(left),
|| parallel_quicksort(right),
);
sorted_left.push(pivot);
sorted_left.extend(sorted_right);
sorted_left
} else {
// Sequential for small partitions
let mut result = parallel_quicksort(left);
result.push(pivot);
result.extend(parallel_quicksort(right));
result
}
}
String and Text Optimization
Efficient String Processing
// Avoiding allocations in string processing
fn count_words_efficient(text: &str) -> usize {
text.split_whitespace().count() // No allocations
}
fn extract_numbers_efficient(text: &str) -> Vec<i32> {
text.split_whitespace()
.filter_map(|s| s.parse().ok())
.collect()
}
// String interning for repeated strings
use std::collections::HashMap;
struct StringInterner {
strings: Vec<String>,
map: HashMap<String, usize>,
}
impl StringInterner {
fn new() -> Self {
StringInterner {
strings: Vec::new(),
map: HashMap::new(),
}
}
fn intern(&mut self, s: &str) -> usize {
if let Some(&id) = self.map.get(s) {
id
} else {
let id = self.strings.len();
self.strings.push(s.to_string());
self.map.insert(s.to_string(), id);
id
}
}
fn get(&self, id: usize) -> Option<&str> {
self.strings.get(id).map(|s| s.as_str())
}
}
// Byte string processing (faster than UTF-8)
fn process_ascii_bytes(data: &[u8]) -> usize {
data.iter().filter(|&&b| b.is_ascii_alphabetic()).count()
}
// SIMD string search
fn find_byte_simd(haystack: &[u8], needle: u8) -> Option<usize> {
// Use memchr crate for production code
haystack.iter().position(|&b| b == needle)
}
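For byte search in production, the memchr crate provides SIMD-accelerated search behind the same simple interface (sketch assuming memchr = "2" in Cargo.toml):
fn find_byte_fast(haystack: &[u8], needle: u8) -> Option<usize> {
    // memchr picks the fastest implementation available at runtime
    memchr::memchr(needle, haystack)
}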
I/O Optimization
Buffered I/O and Batch Processing
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::fs::File;
// Efficient file reading
fn read_lines_efficient(filename: &str) -> Result<Vec<String>, std::io::Error> {
let file = File::open(filename)?;
let reader = BufReader::new(file);
reader.lines().collect()
}
// Efficient file writing
fn write_lines_efficient(filename: &str, lines: &[String]) -> Result<(), std::io::Error> {
let file = File::create(filename)?;
let mut writer = BufWriter::new(file);
for line in lines {
writeln!(writer, "{}", line)?;
}
writer.flush()?;
Ok(())
}
// Memory-mapped files for large data
use memmap2::MmapOptions;
fn process_large_file_mmap(filename: &str) -> Result<usize, Box<dyn std::error::Error>> {
let file = File::open(filename)?;
let mmap = unsafe { MmapOptions::new().map(&file)? };
// Process memory-mapped data directly
let line_count = mmap.split(|&b| b == b'\n').count();
Ok(line_count)
}
// Async I/O for high concurrency
async fn async_file_processing(filenames: Vec<&str>) -> Result<Vec<String>, tokio::io::Error> {
use tokio::fs;
use futures::future::try_join_all;
let tasks: Vec<_> = filenames.into_iter()
.map(|filename| async move {
fs::read_to_string(filename).await
})
.collect();
try_join_all(tasks).await
}
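The examples above rely on external crates; illustrative dependency entries (pin versions as appropriate):
# Cargo.toml
[dependencies]
memmap2 = "0.9"
tokio = { version = "1", features = ["fs", "rt-multi-thread", "macros"] }
futures = "0.3"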
Compilation and Build Optimization
Cargo Configuration
# .cargo/config.toml
[build]
rustflags = ["-C", "target-cpu=native"]
# Profile settings below are more commonly placed in Cargo.toml
[profile.release]
debug = true # Keep symbols for profiling
debug-assertions = false
overflow-checks = false
lto = "thin" # Balance between compile time and performance
codegen-units = 16 # Parallel compilation
incremental = false # Disable for release builds
panic = "abort" # Smaller binaries
# Custom profile for maximum performance
[profile.max-perf]
inherits = "release"
opt-level = 3
lto = "fat"
codegen-units = 1
panic = "abort"
Link-Time Optimization
# LTO itself is enabled through the profile settings above (lto = "thin" or "fat");
# a faster linker such as lld shortens the long LTO link step
export RUSTFLAGS="-C link-arg=-fuse-ld=lld"
cargo build --release
Cross-Compilation Optimization
# Target-specific optimizations
cargo build --release --target x86_64-unknown-linux-musl
cargo build --release --target x86_64-pc-windows-gnu
# CPU-specific targeting
export RUSTFLAGS="-C target-cpu=skylake"
cargo build --release
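Cross-compilation targets must be installed first:
rustup target add x86_64-unknown-linux-musl
rustup target add x86_64-pc-windows-gnu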
Advanced Optimization Techniques
Const Evaluation and Compile-Time Computation
// Compile-time computation
const fn factorial(n: u64) -> u64 {
if n == 0 { 1 } else { n * factorial(n - 1) }
}
const FACTORIAL_10: u64 = factorial(10); // Computed at compile time
// Const generics for optimization
struct Matrix<const N: usize, const M: usize> {
data: [[f64; M]; N],
}
impl<const N: usize, const M: usize> Matrix<N, M> {
const fn zeros() -> Self {
Matrix { data: [[0.0; M]; N] }
}
// Optimized matrix multiplication for known sizes
fn multiply<const P: usize>(&self, other: &Matrix<M, P>) -> Matrix<N, P> {
let mut result = Matrix::<N, P>::zeros();
for i in 0..N {
for j in 0..P {
for k in 0..M {
result.data[i][j] += self.data[i][k] * other.data[k][j];
}
}
}
result
}
}
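// Usage sketch: dimensions are checked at compile time, and the fixed-size
// loops can be fully unrolled by the compiler
fn matrix_usage() {
    let a = Matrix::<2, 3> { data: [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] };
    let b = Matrix::<3, 2> { data: [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]] };
    let c = a.multiply(&b); // Matrix<2, 2>
    assert_eq!(c.data, [[4.0, 5.0], [10.0, 11.0]]);
}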
// Lookup tables: f32::sin is not a const fn, so this table cannot be built in
// a const initializer; compute it once at first use with std::sync::LazyLock
// (Rust 1.80+), or generate it with a build script for a true compile-time table
use std::sync::LazyLock;
static SINE_TABLE: LazyLock<[f32; 360]> = LazyLock::new(|| {
    let mut table = [0.0f32; 360];
    for (i, entry) in table.iter_mut().enumerate() {
        *entry = (i as f32 * std::f32::consts::PI / 180.0).sin();
    }
    table
});
fn fast_sine(degrees: usize) -> f32 {
    SINE_TABLE[degrees % 360]
}
Branch Prediction Optimization
// Help branch predictor with likely/unlikely
fn process_with_hints(data: &[i32]) -> i32 {
let mut sum = 0;
for &value in data {
if likely(value > 0) {
sum += value;
} else {
sum -= value;
}
}
sum
}
// Branchless programming
fn branchless_max(a: i32, b: i32) -> i32 {
let diff = a - b;
a - (diff & (diff >> 31))
}
fn branchless_abs(x: i32) -> i32 {
let mask = x >> 31;
(x + mask) ^ mask
}
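// Sanity checks for the branchless helpers above (note: branchless_max
// assumes a - b does not overflow an i32)
fn branchless_sanity_checks() {
    assert_eq!(branchless_max(3, 7), 7);
    assert_eq!(branchless_max(-2, -9), -2);
    assert_eq!(branchless_abs(-5), 5);
    assert_eq!(branchless_abs(5), 5);
}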
// Jump table instead of long if-else chain
fn dispatch_operation(op: u8, a: i32, b: i32) -> i32 {
const OPERATIONS: [fn(i32, i32) -> i32; 4] = [
|a, b| a + b, // 0: add
|a, b| a - b, // 1: subtract
|a, b| a * b, // 2: multiply
|a, b| a / b, // 3: divide
];
if (op as usize) < OPERATIONS.len() {
OPERATIONS[op as usize](a, b)
} else {
0
}
}
// Compiler hints: std::intrinsics::likely requires nightly plus
// #![feature(core_intrinsics)] and is not architecture-specific.
// On stable, fall back to the plain condition and arrange code so the
// common case is the fall-through branch.
#[inline(always)]
fn likely(b: bool) -> bool {
    // Nightly version: std::intrinsics::likely(b)
    b
}
Performance Testing and Validation
Comprehensive Benchmarking Suite
// benches/comprehensive.rs
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
fn bench_algorithms(c: &mut Criterion) {
let mut group = c.benchmark_group("sorting");
for size in [100, 1000, 10000].iter() {
let data: Vec<i32> = (0..*size).rev().collect();
group.throughput(Throughput::Elements(*size as u64));
group.bench_with_input(
BenchmarkId::new("std_sort", size),
size,
|b, _size| {
b.iter_batched(
|| data.clone(),
|mut data| data.sort(),
criterion::BatchSize::SmallInput,
)
},
);
group.bench_with_input(
BenchmarkId::new("custom_sort", size),
size,
|b, _size| {
b.iter_batched(
|| data.clone(),
|data| parallel_quicksort(data),
criterion::BatchSize::SmallInput,
)
},
);
}
group.finish();
}
criterion_group!(benches, bench_algorithms);
criterion_main!(benches);
Memory Usage Profiling
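The tracking below uses the jemallocator and jemalloc-ctl crates (versions shown are illustrative):
# Cargo.toml
[dependencies]
jemallocator = "0.5"
jemalloc-ctl = "0.5"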
// Memory allocation tracking
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
fn track_memory_usage() {
use jemalloc_ctl::{stats, epoch};
epoch::advance().unwrap();
let allocated = stats::allocated::read().unwrap();
println!("Allocated: {} bytes", allocated);
// Your code here
let _data: Vec<i32> = (0..1000000).collect();
epoch::advance().unwrap();
let allocated_after = stats::allocated::read().unwrap();
println!("Allocated after: {} bytes", allocated_after);
println!("Difference: {} bytes", allocated_after - allocated);
}
Best Practices Summary
- Measure First: Always profile before optimizing
- Target Bottlenecks: Focus on the critical path
- Cache-Friendly Code: Consider memory access patterns
- Minimize Allocations: Reuse memory when possible
- Choose Right Data Structures: Match structure to access pattern
- Leverage Parallelism: Use rayon for CPU-bound tasks
- Optimize Compilation: Use appropriate release flags
- SIMD When Applicable: For data-parallel operations
- Avoid Premature Optimization: Keep code readable
- Validate Optimizations: Ensure correctness is maintained
Performance optimization in Rust is about understanding the system, measuring carefully, and applying targeted improvements. The language's zero-cost abstractions and control over memory layout provide excellent opportunities for optimization while maintaining safety.