Skip to content

C Performance Tips

This guide covers C programming performance optimization techniques, best practices, and tools for writing efficient C code.

🎯 Core Performance Principles

C Performance Characteristics

  • Compiled Language: Fast execution, direct hardware access
  • Manual Memory Management: Precise control over memory allocation
  • Low-Level Operations: Bit manipulation, pointer arithmetic
  • Minimal Runtime Overhead: no garbage collector or managed runtime — the only costs are those your code explicitly incurs

Measurement First

  • Profile Before Optimizing: Use profilers to identify bottlenecks
  • Benchmark Properly: Use timing functions and repeat measurements
  • Measure in Context: Profile realistic workloads
  • Set Performance Goals: Define measurable targets

🚀 Memory Optimization

Efficient Memory Allocation

// Bad: Frequent small allocations
void process_data_bad() {
    int iteration = 0;
    while (iteration < 10000) {
        // One heap allocation per trip: 10,000 malloc/free pairs in total.
        int* buffer = malloc(sizeof(int) * 100);
        // Process buffer
        free(buffer);
        iteration++;
    }
}

// Good: Single large allocation
void process_data_good() {
    // One up-front allocation sized for all 10,000 chunks of 100 ints each.
    int* data = malloc(sizeof(int) * 100 * 10000);
    if (data == NULL) {
        return;
    }

    for (int chunk_idx = 0; chunk_idx < 10000; chunk_idx++) {
        // Carve the big buffer into per-iteration chunks by pointer offset.
        int* chunk = &data[chunk_idx * 100];
        // Process chunk
        (void)chunk;
    }

    free(data);
}

// Better: Use stack allocation for small data
void process_small_data() {
    // Automatic (stack) storage: allocation is just a stack-pointer bump,
    // and the storage is reclaimed automatically on return — no free().
    int data[100];
    (void)data;  // Process data here
}

Memory Pool Pattern

// Memory pool (arena) for frequent allocations: one large malloc up front,
// then O(1) bump-pointer sub-allocations. Individual allocations cannot be
// freed; the whole pool is released at once by destroy_memory_pool().
typedef struct {
    void* pool;       // backing buffer
    size_t used;      // bytes handed out so far (kept 8-byte aligned)
    size_t capacity;  // total size of the backing buffer
} MemoryPool;

// Create a pool backed by `capacity` bytes.
// Returns NULL on allocation failure; otherwise the caller owns the pool.
MemoryPool* create_memory_pool(size_t capacity) {
    MemoryPool* pool = malloc(sizeof *pool);
    if (!pool) return NULL;

    pool->pool = malloc(capacity);
    if (!pool->pool) {
        free(pool);
        return NULL;
    }

    pool->used = 0;
    pool->capacity = capacity;
    return pool;
}

// Hand out `size` bytes from the pool, rounded up to an 8-byte boundary.
// Returns NULL if pool is NULL, the request overflows, or the pool is full.
void* pool_alloc(MemoryPool* pool, size_t size) {
    if (!pool) return NULL;

    // Align to 8-byte boundary; ~(size_t)7 avoids sign surprises of ~7.
    size_t aligned_size = (size + 7) & ~(size_t)7;
    if (aligned_size < size) {
        return NULL;  // size + 7 wrapped around SIZE_MAX
    }

    // Equivalent to used + aligned_size > capacity, but cannot overflow
    // because used <= capacity is an invariant of this pool.
    if (aligned_size > pool->capacity - pool->used) {
        return NULL;  // Pool exhausted
    }

    void* ptr = (char*)pool->pool + pool->used;
    pool->used += aligned_size;
    return ptr;
}

// Release the pool and everything allocated from it. Safe on NULL.
void destroy_memory_pool(MemoryPool* pool) {
    if (!pool) return;
    free(pool->pool);
    free(pool);
}

// Usage
void use_memory_pool() {
    MemoryPool* pool = create_memory_pool(1024 * 1024);  // 1MB pool
    if (!pool) return;  // creation can fail — don't dereference NULL

    for (int i = 0; i < 1000; i++) {
        int* data = pool_alloc(pool, sizeof(int) * 100);
        if (data) {
            // Use data
        }
    }

    destroy_memory_pool(pool);
}

Cache-Friendly Data Structures

// Bad: Poor cache locality
// AoS ("array of structures") layout used by the "bad" example below:
// each record is ~264 bytes, so a loop that touches only one field still
// drags the whole record through the cache.
typedef struct {
    int id;
    char name[50];
    double score;          // the only field the hot loop below updates
    char description[200]; // cold payload that pollutes the cache anyway
} Student;

void process_students_bad(Student* students, int count) {
    // Each iteration pulls a large Student record into cache just to
    // rewrite one double — most of every fetched cache line is wasted.
    for (int idx = 0; idx < count; idx++) {
        Student* s = &students[idx];
        s->score = calculate_score(s);
    }
}

// Good: Structure of Arrays (SoA) for better cache usage
// SoA ("structure of arrays") layout: each field lives in its own
// contiguous array, so a loop over one field streams through memory.
typedef struct {
    int* ids;                  // count entries
    char (*names)[50];         // pointer to arrays of 50 chars each
    double* scores;            // hot data: one dense array of doubles
    char (*descriptions)[200]; // cold data kept out of the hot arrays
    int count;                 // number of students in every array
} StudentsSoA;

void process_students_good(StudentsSoA* students) {
    // scores[] is a dense array of doubles, so this loop is a sequential
    // sweep over contiguous memory instead of striding over big records.
    for (int idx = 0; idx < students->count; idx++) {
        students->scores[idx] = calculate_score_soa(students, idx);
    }
}

// Even better: Separate frequently accessed data
// Hot/cold splitting: the 16 bytes the inner loop needs live apart from
// the ~250 bytes of rarely-read text.
typedef struct {
    int id;
    double score;  // updated on every pass
} StudentCore;

// Cold data: consulted only when displaying a student, never in hot loops.
typedef struct {
    char name[50];
    char description[200];
} StudentDetails;

void process_students_optimized(StudentCore* cores, StudentDetails* details, int count) {
    (void)details;  // cold data: deliberately untouched on the hot path
    for (int idx = 0; idx < count; idx++) {
        cores[idx].score = calculate_score_core(&cores[idx]);
    }
}

📊 Algorithm Optimization

Loop Optimization

// Bad: Inefficient loop
void process_array_bad(int* array, int size) {
    // Re-reads and re-writes array[i] repeatedly; note the second test
    // sees the value already modified by the first one.
    int i = 0;
    while (i < size) {
        if (array[i] % 2 == 0) {
            array[i] = array[i] * 2;
        }
        if (array[i] > 100) {
            array[i] = array[i] + 10;
        }
        i++;
    }
}

// Good: Reduced operations
void process_array_good(int* array, int size) {
    // One load and one store per element; all the work happens in a local.
    for (int i = 0; i < size; i++) {
        int v = array[i];
        v = (v % 2 == 0) ? v * 2 : v;
        v = (v > 100) ? v + 10 : v;
        array[i] = v;
    }
}

// Better: Loop unrolling (compiler may do this automatically)
void process_array_unrolled(int* array, int size) {
    int i;
    // Main unrolled loop: four elements per trip cuts loop overhead.
    for (i = 0; i + 4 <= size; i += 4) {
        array[i + 0] = process_element(array[i + 0]);
        array[i + 1] = process_element(array[i + 1]);
        array[i + 2] = process_element(array[i + 2]);
        array[i + 3] = process_element(array[i + 3]);
    }
    // Scalar tail for the final size % 4 elements.
    while (i < size) {
        array[i] = process_element(array[i]);
        i++;
    }
}

Branch Prediction Optimization

// Bad: Unpredictable branches
void process_data_branchy(int* data, int size) {
    for (int idx = 0; idx < size; idx++) {
        int v = data[idx];
        // Data-dependent branch: mispredicts heavily on random input.
        if (v % 2 != 0) {
            data[idx] = v + 1;
        } else {
            data[idx] = v * 2;
        }
    }
}

// Good: Branchless code
void process_data_branchless(int* data, int size) {
    for (int i = 0; i < size; i++) {
        // even_flag is 1 for even, 0 for odd, turning the branch into
        // arithmetic: even -> x*2 + 0, odd -> x*1 + 1.
        int even_flag = (data[i] % 2 == 0) ? 1 : 0;
        data[i] = data[i] * (1 + even_flag) + (1 - even_flag);
    }
}

// Better: Separate processing
// Pass 1 helper: double every even element in place.
static void double_evens(int* data, int size) {
    for (int i = 0; i < size; i++) {
        if (data[i] % 2 == 0) {
            data[i] *= 2;
        }
    }
}

void process_data_separated(int* data, int size) {
    // Pass 1: evens. Doubling an even keeps it even, so pass 2 skips it.
    double_evens(data, size);

    // Pass 2: increment the odd elements.
    for (int i = 0; i < size; i++) {
        if (data[i] % 2 != 0) {
            data[i] += 1;
        }
    }
}

Bit Manipulation

// Bad: Slow operations
// O(log n): strip factors of two one division at a time; a power of two
// reduces exactly to 1.
bool is_power_of_two_slow(int n) {
    if (n <= 0) {
        return false;
    }

    for (; (n & 1) == 0; n /= 2) {
        // keep halving while the low bit is clear
    }

    return n == 1;
}

// Good: Bit manipulation
bool is_power_of_two_fast(int n) {
    if (n <= 0) {
        return false;
    }
    // A power of two has exactly one bit set, so clearing the lowest set
    // bit (n & (n - 1)) must leave zero.
    return (n & (n - 1)) == 0;
}

// Fast absolute value (avoiding branching)
// mask is 0 when n >= 0 and -1 (all bits set) when n < 0, relying on the
// common arithmetic right shift of negative values (implementation-defined
// in ISO C — true on GCC/Clang two's-complement targets).
// (n + mask) ^ mask is n when mask is 0, and ~(n - 1) == -n when mask is -1.
// NOTE(review): abs_fast(INT_MIN) overflows, exactly like abs() — avoid it.
int abs_fast(int n) {
    int mask = n >> (sizeof(int) * 8 - 1);
    return (n + mask) ^ mask;
}

// Fast min/max (avoiding branching)
// -(a < b) is -1 (all bits set) when a < b and 0 otherwise; the mask picks
// which operand survives the XOR selection.
int min_fast(int a, int b) {
    int pick_a = -(a < b);  // all-ones exactly when a is the smaller value
    return b ^ ((a ^ b) & pick_a);
}

int max_fast(int a, int b) {
    int swap_in_b = -(a < b);  // all-ones exactly when b is the larger value
    return a ^ ((a ^ b) & swap_in_b);
}

🔄 Compiler Optimization

Compiler Flags

# Basic optimization
gcc -O2 program.c -o program

# Maximum optimization
gcc -O3 program.c -o program

# Optimization with architecture-specific tuning
gcc -O3 -march=native program.c -o program

# Link Time Optimization (LTO)
gcc -O3 -flto program.c -o program

# Profile-Guided Optimization (PGO)
# 1. Compile with profiling
gcc -O3 -fprofile-generate program.c -o program
# 2. Run with representative workload
./program
# 3. Rebuild with profile data
gcc -O3 -fprofile-use program.c -o program

Inline Functions

// Small, hot helper: `static inline` lets the compiler substitute the body
// at each call site (inline is a hint, not a guarantee).
static inline int square(int x) {
    const int result = x * x;
    return result;
}

// Macro for very small operations
// WARNING: each argument may be evaluated twice — never pass expressions
// with side effects (e.g. MIN(i++, j)). Prefer a static inline function.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Inline assembly for critical sections.
// FIX: the original constraints never wrote the "=r" output — `imul` was
// applied to input operand %1, so `result` was returned uninitialized.
// The destination must be the output register itself, declared read-write
// with "+r". (In practice, plain `a * b` compiles to the same imul; prefer C.)
static inline int fast_multiply(int a, int b) {
#if defined(__x86_64__) || defined(__i386__)
    int result = a;
    // AT&T syntax: imul src, dst  =>  result *= b
    __asm__("imul %1, %0" : "+r" (result) : "r" (b));
    return result;
#else
    // Portable fallback for non-x86 targets.
    return a * b;
#endif
}

Restrict Keyword

// `restrict` promises the compiler that a, b and c never alias, so it can
// vectorize without emitting runtime overlap checks.
void process_arrays(int* restrict a, int* restrict b, int* restrict c, int size) {
    int idx = 0;
    while (idx < size) {
        c[idx] = a[idx] + b[idx];
        idx++;
    }
}

// Byte-wise copy with restrict-qualified pointers: the compiler may assume
// dest and src never overlap (the same contract as the real memcpy).
void* memcpy_optimized(void* restrict dest, const void* restrict src, size_t n) {
    char* out = dest;
    const char* in = src;

    while (n--) {
        *out++ = *in++;
    }

    return dest;
}

🔍 Profiling and Measurement

Timing Functions

#include <time.h>
#include <sys/time.h>

// High-resolution timestamp in seconds. CLOCK_MONOTONIC is immune to
// system clock adjustments, so only differences between calls are
// meaningful (requires POSIX; link with -lrt on older glibc).
double get_time() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    return now.tv_sec + now.tv_nsec / 1e9;
}

// Time a single invocation of `func` on (data, size) and print the result.
// Measures one run's wall-clock only — repeat and average for stable numbers.
void benchmark_function(void (*func)(int*, int), int* data, int size, const char* name) {
    const double begin = get_time();
    func(data, size);
    const double finish = get_time();

    printf("%s: %.6f seconds\n", name, finish - begin);
}

// Usage
// FIX: the original declared `int data[1000000]` (4 MB) as a local, which
// overflows the default stack on many systems, and ran the second benchmark
// on data already mutated by the first (unfair comparison).
void test_performance() {
    enum { N = 1000000 };

    int* data = malloc(N * sizeof *data);  // heap, not stack
    if (!data) {
        return;
    }

    for (int i = 0; i < N; i++) {
        data[i] = i;
    }
    benchmark_function(process_array_bad, data, N, "Bad version");

    // Re-initialize so both versions see identical input.
    for (int i = 0; i < N; i++) {
        data[i] = i;
    }
    benchmark_function(process_array_good, data, N, "Good version");

    free(data);
}

CPU Performance Counters

#include <stdio.h>
#include <stdint.h>

// Read CPU timestamp counter (x86)
// x86-only: `rdtsc` returns the 64-bit cycle counter split across EDX:EAX,
// which the "=a"/"=d" constraints capture and the return recombines.
// NOTE(review): the CPU may reorder rdtsc relative to nearby instructions;
// for precise measurements pair it with a serializing instruction (cpuid)
// or use rdtscp — confirm against the Intel SDM for the target CPU.
static inline uint64_t rdtsc() {
    unsigned int low, high;
    asm volatile("rdtsc" : "=a" (low), "=d" (high));
    return ((uint64_t)high << 32) | low;
}

// Measure CPU cycles
// FIX: printing a uint64_t with %lu is a format-specifier mismatch (UB) on
// platforms where uint64_t is unsigned long long (32-bit Linux, Windows);
// cast to unsigned long long and use %llu (or use PRIu64 from <inttypes.h>).
void measure_cycles() {
    uint64_t start = rdtsc();

    // Code to measure
    int result = 0;
    for (int i = 0; i < 1000; i++) {
        result += i * i;
    }

    uint64_t end = rdtsc();

    printf("CPU cycles: %llu\n", (unsigned long long)(end - start));
    printf("Result: %d\n", result);
}

🛠️ System-Level Optimization

SIMD Instructions

#include <immintrin.h>  // For SSE/AVX

// Vectorized addition using SSE
// FIX: _mm_load_ps/_mm_store_ps require 16-byte-aligned pointers and fault
// on unaligned ones; plain float* parameters carry no alignment guarantee,
// so use the unaligned loadu/storeu variants.
void vector_add_sse(float* a, float* b, float* result, int size) {
    int i = 0;

    // Process 4 floats at a time
    for (; i + 4 <= size; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_loadu_ps(&b[i]);
        __m128 vresult = _mm_add_ps(va, vb);
        _mm_storeu_ps(&result[i], vresult);
    }

    // Scalar tail for the remaining size % 4 elements
    for (; i < size; i++) {
        result[i] = a[i] + b[i];
    }
}

// Vectorized multiplication using AVX
void vector_multiply_avx(double* a, double* b, double* result, int size) {
    int i = 0;

    // Process 4 doubles at a time
    for (; i <= size - 4; i += 4) {
        __m256d va = _mm256_load_pd(&a[i]);
        __m256d vb = _mm256_load_pd(&b[i]);
        __m256d vresult = _mm256_mul_pd(va, vb);
        _mm256_store_pd(&result[i], vresult);
    }

    // Handle remaining elements
    for (; i < size; i++) {
        result[i] = a[i] * b[i];
    }
}

Multi-threading with OpenMP

#include <omp.h>

// Parallel processing with OpenMP: iterations are independent, so the loop
// is split across the thread team (runs serially if built without -fopenmp).
void process_parallel(int* data, int size) {
    #pragma omp parallel for
    for (int idx = 0; idx < size; idx++) {
        data[idx] = expensive_computation(data[idx]);
    }
}

// Parallel reduction: each thread accumulates a private partial sum and
// OpenMP combines them with + at the end — no data race on the accumulator.
int sum_array_parallel(int* data, int size) {
    int total = 0;

    #pragma omp parallel for reduction(+:total)
    for (int idx = 0; idx < size; idx++) {
        total += data[idx];
    }

    return total;
}

// Usage
// FIX: the original declared `int data[1000000]` (4 MB) as a local, which
// overflows the default stack on many systems; static storage avoids that.
// (static makes this demo non-reentrant, which is fine for a test harness.)
void test_openmp() {
    enum { N = 1000000 };
    static int data[N];

    // Initialize data
    for (int i = 0; i < N; i++) {
        data[i] = i;
    }

    // Set number of threads
    omp_set_num_threads(4);

    double start = get_time();
    process_parallel(data, N);
    double end = get_time();

    printf("Parallel processing: %.6f seconds\n", end - start);
}

📈 Memory Access Patterns

Optimal Memory Layout

// Bad: column-major walk over row-major storage — consecutive accesses land
// in different rows, so nearly every access touches a new cache line.
void process_strided(int** matrix, int rows, int cols) {
    for (int c = 0; c < cols; c++) {
        for (int r = 0; r < rows; r++) {
            matrix[r][c] *= 2;
        }
    }
}

// Good: row-major traversal matches the memory layout, so accesses within
// each row are sequential and cache-friendly.
void process_sequential(int** matrix, int rows, int cols) {
    for (int r = 0; r < rows; r++) {
        int* row = matrix[r];
        for (int c = 0; c < cols; c++) {
            row[c] *= 2;
        }
    }
}

// Better: a flat 1D buffer removes the per-row pointer indirection entirely
// and turns the whole matrix into one sequential sweep.
void process_1d(int* matrix, int rows, int cols) {
    const int total = rows * cols;
    for (int i = 0; i < total; i++) {
        matrix[i] *= 2;
    }
}

// Access with 1D indexing
#define MATRIX_ELEM(matrix, row, col, cols) matrix[(row) * (cols) + (col)]

// Row-major 2D indexing over the flat buffer via the helper macro.
void process_1d_indexed(int* matrix, int rows, int cols) {
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            MATRIX_ELEM(matrix, r, c, cols) *= 2;
        }
    }
}

Prefetching

#include <xmmintrin.h>  // For prefetch instructions

// Manual prefetching
// Issues a software prefetch a fixed distance ahead of the current index so
// the data is (hopefully) already in cache when the loop reaches it.
// _MM_HINT_T0 requests the line into all cache levels (x86 SSE intrinsic).
// NOTE(review): the right distance depends on per-element work vs memory
// latency — 8 is a starting point to tune, not a universal constant.
void process_with_prefetch(int* data, int size) {
    const int prefetch_distance = 8;  // Prefetch 8 elements ahead

    for (int i = 0; i < size; i++) {
        // Prefetch future data (guard keeps the address in bounds)
        if (i + prefetch_distance < size) {
            _mm_prefetch(&data[i + prefetch_distance], _MM_HINT_T0);
        }

        // Process current data
        data[i] = expensive_operation(data[i]);
    }
}

// Block processing for better cache usage: work through the array in
// fixed-size chunks so each chunk stays cache-resident while processed.
void process_blocks(int* data, int size) {
    const int block_size = 64;  // elements per chunk (64 ints = 256 bytes)

    for (int start = 0; start < size; start += block_size) {
        int end = start + block_size;
        if (end > size) {
            end = size;  // clamp the final partial chunk
        }

        // Process one chunk
        for (int i = start; i < end; i++) {
            data[i] = expensive_operation(data[i]);
        }
    }
}

🎯 Optimization Best Practices

Performance Checklist

// Optimization checklist implementation

// Walking commentary of the checklist items.
// FIX: the original loop assigned to `result` and read `data[i]` without
// declaring either, so the snippet did not compile; both are now locals.
void optimization_checklist() {
    // 1. Use appropriate data types
    //    - int instead of long when the value range allows
    //    - float instead of double when the precision allows

    // 2. Minimize function calls in loops
    int data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int result = 0;
    for (int i = 0; i < 8; i++) {
        // Bad:  result = expensive_function(data[i]);
        // Good: inline the work or hoist the call out of the loop
        result = data[i] * 2;  // Simple operation
    }
    (void)result;

    // 3. Use bit operations when possible
    //    Bad:  x % 2 == 0      Good: (x & 1) == 0

    // 4. Avoid unnecessary branches — prefer branchless forms when clearer.

    // 5. Optimize memory access patterns
    //    Access memory sequentially; keep hot data small enough to cache.
}

Compiler Hints

// Tell compiler about likely branches: __builtin_expect(cond, 1) marks the
// positive branch as the hot path so its code is laid out fall-through
// (GCC/Clang extension).
void process_with_hints(int* data, int size) {
    for (int idx = 0; idx < size; idx++) {
        int value = data[idx];
        if (__builtin_expect(value > 0, 1)) {
            data[idx] = value * 2;  // expected fast path
        } else {
            data[idx] = value + 1;  // rare path
        }
    }
}

// Function attributes for optimization
// `hot` asks the compiler to optimize this function aggressively for speed
// and co-locate it with other hot code for better i-cache behavior.
__attribute__((hot))
void frequently_called_function(int* data, int size) {
    int i = 0;
    while (i < size) {
        data[i] = data[i] * 2;
        i++;
    }
}

__attribute__((cold))
void error_handling_function() {
    // Rarely executed: `cold` biases codegen toward size over speed and
    // moves this body away from hot code paths.
    printf("An error occurred\n");
}

🔗 Language-Specific Performance