
Performance Optimization

Performance optimization in Rust involves understanding how the compiler works, measuring performance accurately, and applying targeted optimizations. Rust's zero-cost abstractions and fine-grained control over memory make it excellent for high-performance applications.

Performance Measurement and Profiling

Benchmarking with Criterion

First, add to Cargo.toml:

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }

[[bench]]
name = "my_benchmark"
harness = false

Basic benchmarking:

// benches/my_benchmark.rs
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use std::hint::black_box; // criterion::black_box is deprecated since 0.5

fn fibonacci_recursive(n: u64) -> u64 {
    match n {
        0 => 1,
        1 => 1,
        n => fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2),
    }
}

fn fibonacci_iterative(n: u64) -> u64 {
    if n == 0 || n == 1 {
        return 1;
    }
    
    let mut a = 1;
    let mut b = 1;
    
    for _ in 2..=n {
        let temp = a + b;
        a = b;
        b = temp;
    }
    
    b
}

fn fibonacci_lookup(n: u64) -> u64 {
    const FIBONACCI: [u64; 21] = [
        1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946
    ];
    
    if n < 21 {
        FIBONACCI[n as usize]
    } else {
        fibonacci_iterative(n)
    }
}

fn criterion_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("fibonacci");
    
    for i in [10u64, 15, 20].iter() {
        group.bench_with_input(format!("recursive/{}", i), i, |b, i| {
            b.iter(|| fibonacci_recursive(black_box(*i)))
        });
        
        group.bench_with_input(format!("iterative/{}", i), i, |b, i| {
            b.iter(|| fibonacci_iterative(black_box(*i)))
        });
        
        group.bench_with_input(format!("lookup/{}", i), i, |b, i| {
            b.iter(|| fibonacci_lookup(black_box(*i)))
        });
    }
    
    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Run benchmarks:

cargo bench
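
Criterion's command line supports filtering and saved baselines, which makes before/after comparisons straightforward (these are standard Criterion flags):

# Run only benchmarks whose name matches "fibonacci"
cargo bench -- fibonacci

# Save a baseline, then compare a later run against it
cargo bench -- --save-baseline before
cargo bench -- --baseline before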

Built-in Benchmarking (Nightly)

#![feature(test)]
extern crate test;
use test::Bencher;

#[bench]
fn bench_vector_sum_loop(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| {
        let mut sum = 0;
        for &item in &data {
            sum += item;
        }
        sum
    });
}

#[bench]
fn bench_vector_sum_iterator(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| data.iter().sum::<i32>());
}

#[bench]
fn bench_vector_sum_fold(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| data.iter().fold(0, |acc, &x| acc + x));
}

Profiling with perf and flamegraph

Install dependencies:

cargo install flamegraph
# On Linux: install perf

Profile your application:

# Generate flamegraph
cargo flamegraph --bin my_app

# Profile with perf
perf record --call-graph=dwarf target/release/my_app
perf report

Example application to profile:

// src/main.rs
use std::collections::HashMap;

fn expensive_computation() -> HashMap<String, u64> {
    let mut map = HashMap::new();
    
    for i in 0..1_000_000 {
        let key = format!("key_{}", i);
        let value = (i as u64).pow(2);
        map.insert(key, value);
    }
    
    map
}

fn process_data(map: &HashMap<String, u64>) -> u64 {
    map.values().filter(|&&v| v % 2 == 0).sum()
}

fn main() {
    let map = expensive_computation();
    let result = process_data(&map);
    println!("Result: {}", result);
}

Compiler Optimizations

Release Profile Configuration

# Cargo.toml
[profile.release]
opt-level = 3           # Maximum optimization
debug = false           # No debug info
debug-assertions = false
overflow-checks = false
lto = true             # Link-time optimization
codegen-units = 1      # Better optimization, slower compile
panic = 'abort'        # Smaller binary size

[profile.release-lto]
inherits = "release"
lto = "fat"           # More aggressive LTO
codegen-units = 1
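
Custom profiles are selected with --profile; the output lands in target/release-lto/ in this case:

cargo build --profile release-lto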

Target-Specific Optimizations

# .cargo/config.toml
[build]
rustflags = [
    "-C", "target-cpu=native",    # Optimize for current CPU
    "-C", "target-feature=+crt-static",  # Static linking
]

[target.x86_64-unknown-linux-gnu]
rustflags = [
    "-C", "link-arg=-fuse-ld=lld",  # Use faster linker
    "-C", "target-cpu=haswell",      # Target specific CPU
]

Profile-Guided Optimization (PGO)

# Step 1: Build instrumented binary
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" \
    cargo build --release --target-dir=/tmp/pgo

# Step 2: Run typical workload to collect .profraw files
/tmp/pgo/release/my_app

# Step 3: Merge the raw profiles (llvm-profdata ships with the
# llvm-tools-preview rustup component)
llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data

# Step 4: Build optimized binary using the merged profile
RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \
    cargo build --release

Memory Optimization

Reducing Allocations

// Bad: Many allocations
fn process_strings_bad(input: &[&str]) -> Vec<String> {
    let mut result = Vec::new();
    for s in input {
        let processed = s.to_uppercase();
        let formatted = format!("Processed: {}", processed);
        result.push(formatted);
    }
    result
}

// Good: Pre-allocate and reuse
fn process_strings_good(input: &[&str]) -> Vec<String> {
    let mut result = Vec::with_capacity(input.len());
    let mut buffer = String::new();
    
    for s in input {
        buffer.clear();
        buffer.push_str("Processed: ");
        
        for ch in s.chars() {
            buffer.extend(ch.to_uppercase());
        }
        
        result.push(buffer.clone());
    }
    result
}

// Better: let the caller own and reuse the output buffer
fn process_strings_better(input: &[&str], output: &mut Vec<String>) {
    output.clear();
    output.reserve(input.len());
    
    for s in input {
        let formatted = format!("Processed: {}", s.to_uppercase());
        output.push(formatted);
    }
}

// Best: return a lazy iterator so allocation happens only as items are consumed
fn process_strings_best(input: &[&str]) -> impl Iterator<Item = String> + '_ {
    input.iter().map(|s| format!("Processed: {}", s.to_uppercase()))
}

Memory Pool Pattern

use std::collections::VecDeque;

struct ObjectPool<T> {
    objects: VecDeque<T>,
    factory: fn() -> T,
}

impl<T> ObjectPool<T> {
    fn new(factory: fn() -> T) -> Self {
        ObjectPool {
            objects: VecDeque::new(),
            factory,
        }
    }
    
    fn get(&mut self) -> T {
        self.objects.pop_front().unwrap_or_else(self.factory)
    }
    
    fn return_object(&mut self, obj: T) {
        if self.objects.len() < 100 { // Limit pool size
            self.objects.push_back(obj);
        }
    }
}

// Usage example
fn with_pool_optimization() {
    use std::fmt::Write;
    
    let mut string_pool = ObjectPool::new(String::new);
    
    for i in 0..1000 {
        let mut s = string_pool.get();
        s.clear();
        write!(s, "Item {}", i).unwrap(); // Format in place, no temporary String
        
        // Process string...
        println!("Processing: {}", s);
        
        string_pool.return_object(s);
    }
}

Stack vs Heap Allocation

// Stack allocation (faster)
fn stack_allocation() -> [i32; 1000] {
    [0; 1000] // Allocated on stack
}

// Heap allocation (more flexible but slower)
fn heap_allocation() -> Vec<i32> {
    vec![0; 1000] // Allocated on heap
}

// Note: a Box<[T]> cannot own stack memory, so any function returning
// one allocates on the heap regardless of size; for a genuine
// stack/heap hybrid use SmallVec (below) or arrayvec
fn adaptive_allocation(size: usize) -> Box<[i32]> {
    vec![0; size].into_boxed_slice()
}

// SmallVec for stack optimization
use smallvec::{SmallVec, smallvec};

fn using_smallvec() {
    // Store up to 8 elements on stack, then heap
    let mut vec: SmallVec<[i32; 8]> = smallvec![1, 2, 3, 4];
    vec.push(5);
    
    println!("SmallVec: {:?}", vec);
}

Data Structure Optimization

Choose Efficient Data Structures

use std::collections::{HashMap, BTreeMap};
use indexmap::IndexMap;

// Different data structures for different use cases
fn data_structure_comparison() {
    // HashMap: O(1) average access, no ordering
    let mut hash_map = HashMap::new();
    hash_map.insert("key1", "value1");
    
    // BTreeMap: O(log n) access, sorted order
    let mut btree_map = BTreeMap::new();
    btree_map.insert("key1", "value1");
    
    // IndexMap: O(1) access, insertion order preserved
    let mut index_map = IndexMap::new();
    index_map.insert("key1", "value1");
    
    // Vector: O(1) indexed access, O(n) search
    let mut vec = Vec::new();
    vec.push(("key1", "value1"));
}
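
HashMap's default SipHash hasher is DoS-resistant but comparatively slow. When hash flooding is not a concern, a faster hasher such as the one in the ahash crate (used here as an assumed external dependency) can noticeably speed up hot maps:

use ahash::AHashMap; // external crate: ahash

fn fast_hashing() {
    // Drop-in HashMap alias using a faster, non-DoS-resistant hash
    let mut map: AHashMap<&str, u64> = AHashMap::new();
    map.insert("key1", 1);
    
    assert_eq!(map.get("key1"), Some(&1));
}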

// Custom data structure for specific use case
struct CompactStringSet {
    data: Vec<u8>,
    offsets: Vec<usize>,
}

impl CompactStringSet {
    fn new() -> Self {
        CompactStringSet {
            data: Vec::new(),
            offsets: Vec::new(),
        }
    }
    
    fn insert(&mut self, s: &str) {
        self.offsets.push(self.data.len());
        self.data.extend_from_slice(s.as_bytes());
        self.data.push(0); // Null terminator
    }
    
    fn get(&self, index: usize) -> Option<&str> {
        if index >= self.offsets.len() {
            return None;
        }
        
        let start = self.offsets[index];
        let end = if index + 1 < self.offsets.len() {
            self.offsets[index + 1] - 1 // -1 for null terminator
        } else {
            self.data.len() - 1
        };
        
        std::str::from_utf8(&self.data[start..end]).ok()
    }
    
    fn len(&self) -> usize {
        self.offsets.len()
    }
}
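
A quick usage sketch: all strings share one contiguous buffer, so there is one growing allocation instead of one allocation per string:

fn compact_set_demo() {
    let mut set = CompactStringSet::new();
    set.insert("hello");
    set.insert("world");
    
    assert_eq!(set.get(0), Some("hello"));
    assert_eq!(set.get(1), Some("world"));
    assert_eq!(set.len(), 2);
}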

Bit Manipulation Optimizations

// Bit-packed boolean array
struct BitSet {
    data: Vec<u64>,
    len: usize,
}

impl BitSet {
    fn new(len: usize) -> Self {
        let words = (len + 63) / 64; // Round up
        BitSet {
            data: vec![0; words],
            len,
        }
    }
    
    fn set(&mut self, index: usize, value: bool) {
        if index >= self.len {
            return;
        }
        
        let word_index = index / 64;
        let bit_index = index % 64;
        
        if value {
            self.data[word_index] |= 1u64 << bit_index;
        } else {
            self.data[word_index] &= !(1u64 << bit_index);
        }
    }
    
    fn get(&self, index: usize) -> bool {
        if index >= self.len {
            return false;
        }
        
        let word_index = index / 64;
        let bit_index = index % 64;
        
        (self.data[word_index] & (1u64 << bit_index)) != 0
    }
    
    fn count_ones(&self) -> u32 {
        self.data.iter().map(|word| word.count_ones()).sum()
    }
}
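
As a practical application, a Sieve of Eratosthenes built on this BitSet stores one flag per bit rather than one byte per bool:

fn sieve_primes(limit: usize) -> BitSet {
    let mut composite = BitSet::new(limit + 1);
    
    let mut i = 2;
    while i * i <= limit {
        if !composite.get(i) {
            // Mark every multiple of i starting from i * i
            let mut multiple = i * i;
            while multiple <= limit {
                composite.set(multiple, true);
                multiple += i;
            }
        }
        i += 1;
    }
    
    composite // an unset bit at index >= 2 means the index is prime
}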

// Bit manipulation tricks
fn bit_tricks() {
    let mut x = 42u32;
    
    // Check if power of 2
    let is_power_of_2 = x != 0 && (x & (x - 1)) == 0;
    
    // Next power of 2 (std also provides u32::next_power_of_two)
    x -= 1;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x += 1;
    
    // Count trailing zeros (fast)
    let trailing_zeros = 42u32.trailing_zeros();
    
    // Population count (number of 1 bits)
    let pop_count = 42u32.count_ones();
    
    println!("Bit tricks: power_of_2={}, next_pow2={}, trailing_zeros={}, pop_count={}", 
             is_power_of_2, x, trailing_zeros, pop_count);
}

Algorithm Optimization

Cache-Friendly Algorithms

// Cache-unfriendly: the innermost loop strides down a column of b,
// touching a new cache line on every iteration
fn matrix_multiply_bad(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let n = a.len();
    let m = b[0].len();
    let p = b.len();
    let mut result = vec![vec![0.0; m]; n];
    
    for i in 0..n {
        for j in 0..m {
            for k in 0..p {
                result[i][j] += a[i][k] * b[k][j]; // Poor cache locality
            }
        }
    }
    
    result
}

// Cache-friendly: blocked/tiled multiplication
fn matrix_multiply_good(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let n = a.len();
    let m = b[0].len();
    let p = b.len();
    let mut result = vec![vec![0.0; m]; n];
    let block_size = 64; // Tune so the working set of a block fits in L1 cache
    
    for ii in (0..n).step_by(block_size) {
        for jj in (0..m).step_by(block_size) {
            for kk in (0..p).step_by(block_size) {
                let i_end = (ii + block_size).min(n);
                let j_end = (jj + block_size).min(m);
                let k_end = (kk + block_size).min(p);
                
                for i in ii..i_end {
                    for j in jj..j_end {
                        for k in kk..k_end {
                            result[i][j] += a[i][k] * b[k][j];
                        }
                    }
                }
            }
        }
    }
    
    result
}
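
Vec<Vec<f64>> scatters rows across the heap; storing the matrix in one flat buffer indexed as row * cols + col keeps it contiguous and is often a bigger win than blocking alone (a minimal sketch):

struct FlatMatrix {
    data: Vec<f64>,
    cols: usize,
}

impl FlatMatrix {
    #[inline]
    fn get(&self, row: usize, col: usize) -> f64 {
        self.data[row * self.cols + col]
    }
    
    #[inline]
    fn set(&mut self, row: usize, col: usize, value: f64) {
        self.data[row * self.cols + col] = value;
    }
}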

// SIMD-optimized sum: std::arch intrinsics are stable, but AVX code
// must only execute on CPUs that support it, so the fast path is
// gated behind a runtime feature check
fn simd_sum(data: &[f32]) -> f32 {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx") {
            // Safety: we just verified AVX is available
            return unsafe { simd_sum_avx(data) };
        }
    }
    
    data.iter().sum()
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn simd_sum_avx(data: &[f32]) -> f32 {
    use std::arch::x86_64::*;
    
    let mut sum_vec = _mm256_setzero_ps();
    
    // Sum eight lanes at a time
    for chunk in data.chunks_exact(8) {
        let chunk_vec = _mm256_loadu_ps(chunk.as_ptr());
        sum_vec = _mm256_add_ps(sum_vec, chunk_vec);
    }
    
    // Horizontal sum of the eight lanes
    let high = _mm256_extractf128_ps(sum_vec, 1);
    let low = _mm256_castps256_ps128(sum_vec);
    let sum128 = _mm_add_ps(high, low);
    let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
    let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
    let mut result = _mm_cvtss_f32(sum32);
    
    // Handle elements that didn't fill a complete chunk
    result += data.chunks_exact(8).remainder().iter().sum::<f32>();
    
    result
}

Parallel Processing

use rayon::prelude::*;

// Sequential processing
fn process_sequential(data: &[i32]) -> Vec<i32> {
    data.iter().map(|&x| x * x + 1).collect()
}

// Parallel processing
fn process_parallel(data: &[i32]) -> Vec<i32> {
    data.par_iter().map(|&x| x * x + 1).collect()
}

// Parallel reduction
fn parallel_sum(data: &[i32]) -> i64 {
    data.par_iter().map(|&x| x as i64).sum()
}
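
When per-element work is very cheap, scheduling overhead can eat the gains; par_chunks hands rayon coarser work items (the chunk size below is an arbitrary starting point to tune):

fn parallel_sum_chunked(data: &[i32]) -> i64 {
    data.par_chunks(4096)
        .map(|chunk| chunk.iter().map(|&x| x as i64).sum::<i64>())
        .sum()
}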

// Custom parallel algorithm
fn parallel_quicksort<T: Ord + Send + Sync>(mut data: Vec<T>) -> Vec<T> {
    if data.len() <= 1 {
        return data;
    }
    
    let pivot_index = data.len() / 2;
    let pivot = data.remove(pivot_index);
    
    let (left, right): (Vec<_>, Vec<_>) = data.into_par_iter()
        .partition(|item| item < &pivot);
    
    if left.len() > 1000 && right.len() > 1000 {
        // Parallel recursive calls for large partitions
        let (mut sorted_left, sorted_right) = rayon::join(
            || parallel_quicksort(left),
            || parallel_quicksort(right),
        );
        
        sorted_left.push(pivot);
        sorted_left.extend(sorted_right);
        sorted_left
    } else {
        // Sequential for small partitions
        let mut result = parallel_quicksort(left);
        result.push(pivot);
        result.extend(parallel_quicksort(right));
        result
    }
}

String and Text Optimization

Efficient String Processing

// Avoiding allocations in string processing
fn count_words_efficient(text: &str) -> usize {
    text.split_whitespace().count() // No allocations
}

fn extract_numbers_efficient(text: &str) -> Vec<i32> {
    text.split_whitespace()
        .filter_map(|s| s.parse().ok())
        .collect()
}
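
When most inputs need no changes, Cow<str> returns the borrowed input as-is and allocates only for the exceptions:

use std::borrow::Cow;

fn normalize(text: &str) -> Cow<'_, str> {
    if text.chars().any(|c| c.is_uppercase()) {
        Cow::Owned(text.to_lowercase()) // Allocate only when needed
    } else {
        Cow::Borrowed(text) // Common case: zero allocations
    }
}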

// String interning for repeated strings
use std::collections::HashMap;

struct StringInterner {
    strings: Vec<String>,
    map: HashMap<String, usize>,
}

impl StringInterner {
    fn new() -> Self {
        StringInterner {
            strings: Vec::new(),
            map: HashMap::new(),
        }
    }
    
    fn intern(&mut self, s: &str) -> usize {
        if let Some(&id) = self.map.get(s) {
            id
        } else {
            let id = self.strings.len();
            // Two copies kept for simplicity; production interners share one allocation
            self.strings.push(s.to_string());
            self.map.insert(s.to_string(), id);
            id
        }
    }
    
    fn get(&self, id: usize) -> Option<&str> {
        self.strings.get(id).map(|s| s.as_str())
    }
}
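
Interning pays off when the same strings recur many times, since comparison and storage then operate on small integer ids:

fn interner_demo() {
    let mut interner = StringInterner::new();
    
    let a = interner.intern("hello");
    let b = interner.intern("hello"); // Second call hits the map, no new entry
    
    assert_eq!(a, b);
    assert_eq!(interner.get(a), Some("hello"));
}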

// Byte string processing (faster than UTF-8)
fn process_ascii_bytes(data: &[u8]) -> usize {
    data.iter().filter(|&&b| b.is_ascii_alphabetic()).count()
}

// Byte search: the scalar version below is the portable fallback;
// see the memchr-based version after it for a SIMD-accelerated search
fn find_byte_scalar(haystack: &[u8], needle: u8) -> Option<usize> {
    haystack.iter().position(|&b| b == needle)
}
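
With the memchr crate (an external dependency) the same search is SIMD-accelerated under the hood:

fn find_byte_fast(haystack: &[u8], needle: u8) -> Option<usize> {
    memchr::memchr(needle, haystack)
}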

I/O Optimization

Buffered I/O and Batch Processing

use std::io::{BufRead, BufReader, BufWriter, Write};
use std::fs::File;

// Efficient file reading
fn read_lines_efficient(filename: &str) -> Result<Vec<String>, std::io::Error> {
    let file = File::open(filename)?;
    let reader = BufReader::new(file);
    reader.lines().collect()
}

// Efficient file writing
fn write_lines_efficient(filename: &str, lines: &[String]) -> Result<(), std::io::Error> {
    let file = File::create(filename)?;
    let mut writer = BufWriter::new(file);
    
    for line in lines {
        writeln!(writer, "{}", line)?;
    }
    
    writer.flush()?;
    Ok(())
}
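
Collecting every line into its own String allocates once per line; when lines can be processed on the fly, reusing a single buffer with read_line avoids that:

fn count_long_lines(filename: &str) -> Result<usize, std::io::Error> {
    let file = File::open(filename)?;
    let mut reader = BufReader::new(file);
    let mut buffer = String::new();
    let mut count = 0;
    
    // read_line appends to the buffer, so clear it each iteration
    while reader.read_line(&mut buffer)? > 0 {
        if buffer.trim_end().len() > 80 {
            count += 1;
        }
        buffer.clear();
    }
    
    Ok(count)
}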

// Memory-mapped files for large data
use memmap2::MmapOptions;

fn process_large_file_mmap(filename: &str) -> Result<usize, Box<dyn std::error::Error>> {
    let file = File::open(filename)?;
    let mmap = unsafe { MmapOptions::new().map(&file)? };
    
    // Process memory-mapped data directly
    let line_count = mmap.split(|&b| b == b'\n').count();
    Ok(line_count)
}

// Async I/O for high concurrency
async fn async_file_processing(filenames: Vec<&str>) -> Result<Vec<String>, tokio::io::Error> {
    use tokio::fs;
    use futures::future::try_join_all;
    
    let tasks: Vec<_> = filenames.into_iter()
        .map(|filename| async move {
            fs::read_to_string(filename).await
        })
        .collect();
    
    try_join_all(tasks).await
}

Compilation and Build Optimization

Cargo Configuration

# .cargo/config.toml
[build]
rustflags = ["-C", "target-cpu=native"]

# Cargo.toml
[profile.release]
debug = true           # Keep symbols for profiling
debug-assertions = false
overflow-checks = false
lto = "thin"          # Balance between compile time and performance
codegen-units = 16    # Parallel compilation
incremental = false   # Disable for release builds
panic = "abort"       # Smaller binaries

# Custom profile for maximum performance
[profile.max-perf]
inherits = "release"
opt-level = 3
lto = "fat"
codegen-units = 1
panic = "abort"

Build with a faster linker and select the custom profile:

export RUSTFLAGS="-C link-arg=-fuse-ld=lld"
cargo build --profile max-perf

Cross-compilation optimization

# Target-specific optimizations
cargo build --release --target x86_64-unknown-linux-musl
cargo build --release --target x86_64-pc-windows-gnu

# CPU-specific targeting
export RUSTFLAGS="-C target-cpu=skylake"
cargo build --release

Advanced Optimization Techniques

Const Evaluation and Compile-Time Computation

// Compile-time computation
const fn factorial(n: u64) -> u64 {
    if n == 0 { 1 } else { n * factorial(n - 1) }
}

const FACTORIAL_10: u64 = factorial(10); // Computed at compile time
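
Because the value exists at compile time, it can also be validated at compile time; a failed assertion here becomes a build error rather than a runtime panic:

// Compile-time sanity check
const _: () = assert!(FACTORIAL_10 == 3_628_800);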

// Const generics for optimization
struct Matrix<const N: usize, const M: usize> {
    data: [[f64; M]; N],
}

impl<const N: usize, const M: usize> Matrix<N, M> {
    const fn zeros() -> Self {
        Matrix { data: [[0.0; M]; N] }
    }
    
    // Optimized matrix multiplication for known sizes
    fn multiply<const P: usize>(&self, other: &Matrix<M, P>) -> Matrix<N, P> {
        let mut result = Matrix::<N, P>::zeros();
        
        for i in 0..N {
            for j in 0..P {
                for k in 0..M {
                    result.data[i][j] += self.data[i][k] * other.data[k][j];
                }
            }
        }
        
        result
    }
}

// Lookup tables: f32::sin is not a const fn, so the table cannot be
// built in a const initializer; LazyLock (Rust 1.80+, or
// once_cell::sync::Lazy on older toolchains) builds it once at first use
use std::sync::LazyLock;

static SINE_TABLE: LazyLock<[f32; 360]> = LazyLock::new(|| {
    let mut table = [0.0f32; 360];
    for (i, entry) in table.iter_mut().enumerate() {
        *entry = (i as f32).to_radians().sin();
    }
    table
});

fn fast_sine(degrees: usize) -> f32 {
    SINE_TABLE[degrees % 360]
}

Branch Prediction Optimization

// Help branch predictor with likely/unlikely
fn process_with_hints(data: &[i32]) -> i32 {
    let mut sum = 0;
    
    for &value in data {
        if likely(value > 0) {
            sum += value;
        } else {
            sum -= value;
        }
    }
    
    sum
}

// Branchless programming (caveat: a - b can overflow for extreme
// inputs; use wrapping_sub where that matters)
fn branchless_max(a: i32, b: i32) -> i32 {
    let diff = a - b;
    a - (diff & (diff >> 31))
}

fn branchless_abs(x: i32) -> i32 {
    let mask = x >> 31;
    (x + mask) ^ mask
}

// Jump table instead of long if-else chain
fn dispatch_operation(op: u8, a: i32, b: i32) -> i32 {
    const OPERATIONS: [fn(i32, i32) -> i32; 4] = [
        |a, b| a + b,    // 0: add
        |a, b| a - b,    // 1: subtract
        |a, b| a * b,    // 2: multiply
        |a, b| a / b,    // 3: divide
    ];
    
    if (op as usize) < OPERATIONS.len() {
        OPERATIONS[op as usize](a, b)
    } else {
        0
    }
}

// Compiler hints: std::intrinsics::likely needs a nightly toolchain
// and #![feature(core_intrinsics)] at the crate root; it is not
// architecture-specific
#[inline(always)]
fn likely(b: bool) -> bool {
    std::intrinsics::likely(b)
}

Performance Testing and Validation

Comprehensive Benchmarking Suite

// benches/comprehensive.rs
// (assumes the library crate exports parallel_quicksort,
// e.g. `use my_crate::parallel_quicksort;` with your crate name)
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

fn bench_algorithms(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");
    
    for size in [100, 1000, 10000].iter() {
        let data: Vec<i32> = (0..*size).rev().collect();
        
        group.throughput(Throughput::Elements(*size as u64));
        
        group.bench_with_input(
            BenchmarkId::new("std_sort", size),
            size,
            |b, _size| {
                b.iter_batched(
                    || data.clone(),
                    |mut data| data.sort(),
                    criterion::BatchSize::SmallInput,
                )
            },
        );
        
        group.bench_with_input(
            BenchmarkId::new("custom_sort", size),
            size,
            |b, _size| {
                b.iter_batched(
                    || data.clone(),
                    |data| parallel_quicksort(data),
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }
    
    group.finish();
}

criterion_group!(benches, bench_algorithms);
criterion_main!(benches);

Memory Usage Profiling

// Memory allocation tracking via jemalloc (requires the jemallocator
// and jemalloc-ctl crates as dependencies)
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn track_memory_usage() {
    use jemalloc_ctl::{stats, epoch};
    
    epoch::advance().unwrap();
    let allocated = stats::allocated::read().unwrap();
    println!("Allocated: {} bytes", allocated);
    
    // Your code here
    let _data: Vec<i32> = (0..1000000).collect();
    
    epoch::advance().unwrap();
    let allocated_after = stats::allocated::read().unwrap();
    println!("Allocated after: {} bytes", allocated_after);
    println!("Difference: {} bytes", allocated_after - allocated);
}
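
The corresponding dependencies might look like this (versions are indicative; check crates.io for current releases):

# Cargo.toml
[dependencies]
jemallocator = "0.5"
jemalloc-ctl = "0.5"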

Best Practices Summary

  1. Measure First: Always profile before optimizing
  2. Target Bottlenecks: Focus on the critical path
  3. Cache-Friendly Code: Consider memory access patterns
  4. Minimize Allocations: Reuse memory when possible
  5. Choose Right Data Structures: Match structure to access pattern
  6. Leverage Parallelism: Use rayon for CPU-bound tasks
  7. Optimize Compilation: Use appropriate release flags
  8. SIMD When Applicable: For data-parallel operations
  9. Avoid Premature Optimization: Keep code readable
  10. Validate Optimizations: Ensure correctness is maintained

Performance optimization in Rust is about understanding the system, measuring carefully, and applying targeted improvements. The language's zero-cost abstractions and control over memory layout provide excellent opportunities for optimization while maintaining safety.