C Performance Tips¶
This guide covers C programming performance optimization techniques, best practices, and tools for writing efficient C code.
🎯 Core Performance Principles¶
C Performance Characteristics¶
- Compiled Language: Fast execution, direct hardware access
- Manual Memory Management: Precise control over memory allocation
- Low-Level Operations: Bit manipulation, pointer arithmetic
- Minimal Runtime Overhead: No garbage collector, JIT, or managed runtime sits between your code and the hardware
Measurement First¶
- Profile Before Optimizing: Use profilers to identify bottlenecks
- Benchmark Properly: Use timing functions and repeat measurements
- Measure in Context: Profile realistic workloads
- Set Performance Goals: Define measurable targets
🚀 Memory Optimization¶
Efficient Memory Allocation¶
// Bad: Frequent small allocations - malloc/free inside a hot loop makes
// the allocator itself the bottleneck.
void process_data_bad() {
    for (int i = 0; i < 10000; i++) {
        int* data = malloc(sizeof(int) * 100); // 10,000 allocations!
        // NOTE(review): malloc's result is deliberately unchecked - this is
        // the "bad" teaching example; real code must test for NULL.
        // Process data
        free(data);
    }
}
// Good: one large allocation up front, carved into fixed-size chunks.
// The allocator is hit once instead of 10,000 times.
void process_data_good() {
    enum { CHUNK_COUNT = 10000, CHUNK_LEN = 100 };
    int* buffer = malloc(sizeof(int) * CHUNK_LEN * CHUNK_COUNT); // Single allocation
    if (buffer == NULL) return; // nothing to clean up on failure
    for (int chunk_idx = 0; chunk_idx < CHUNK_COUNT; chunk_idx++) {
        int* chunk = &buffer[chunk_idx * CHUNK_LEN];
        // Process chunk
        (void)chunk;
    }
    free(buffer);
}
// Better: Use stack allocation for small data.
// Appropriate only for small, compile-time-bounded sizes; large arrays
// on the stack risk overflowing it.
void process_small_data() {
    int data[100]; // Stack allocation - very fast
    // Process data
    // No free needed
}
Memory Pool Pattern¶
// Memory pool for frequent allocations: one large malloc up front, then
// cheap bump-pointer sub-allocation with no per-allocation bookkeeping.
typedef struct {
    void* pool;      // backing buffer (single malloc)
    size_t used;     // bytes handed out so far (kept 8-byte aligned)
    size_t capacity; // total size of the backing buffer
} MemoryPool;

// Create a pool with `capacity` bytes of backing storage.
// Returns NULL on allocation failure; no partial state is leaked.
MemoryPool* create_memory_pool(size_t capacity) {
    MemoryPool* pool = malloc(sizeof *pool);
    if (!pool) return NULL;
    pool->pool = malloc(capacity);
    if (!pool->pool) {
        free(pool);
        return NULL;
    }
    pool->used = 0;
    pool->capacity = capacity;
    return pool;
}

// Bump-allocate `size` bytes from the pool, 8-byte aligned.
// Returns NULL if pool is NULL, the pool is exhausted, or `size` is so
// large that rounding it up would wrap around size_t (the original
// `(size + 7)` could overflow and hand out a bogus pointer).
void* pool_alloc(MemoryPool* pool, size_t size) {
    if (!pool) return NULL;
    if (size > SIZE_MAX - 7) return NULL; // alignment round-up would wrap
    // Align to 8-byte boundary
    size_t aligned_size = (size + 7) & ~(size_t)7;
    // used <= capacity is an invariant, so this subtraction cannot underflow
    if (aligned_size > pool->capacity - pool->used) {
        return NULL; // Pool exhausted
    }
    void* ptr = (char*)pool->pool + pool->used;
    pool->used += aligned_size;
    return ptr;
}

// Release the pool and its backing buffer. Safe to call with NULL
// (the original dereferenced `pool` unconditionally and crashed).
void destroy_memory_pool(MemoryPool* pool) {
    if (!pool) return;
    free(pool->pool);
    free(pool);
}

// Usage: one pool allocation amortized over many small requests.
void use_memory_pool() {
    MemoryPool* pool = create_memory_pool(1024 * 1024); // 1MB pool
    if (!pool) return; // original used the pool without checking for NULL
    for (int i = 0; i < 1000; i++) {
        int* data = pool_alloc(pool, sizeof(int) * 100);
        if (data) {
            // Use data
        }
    }
    destroy_memory_pool(pool);
}
Cache-Friendly Data Structures¶
// Bad: Poor cache locality - each Student is roughly 264 bytes, so a pass
// that only needs `score` still drags names and descriptions through cache.
typedef struct {
    int id;
    char name[50];
    double score;
    char description[200];
} Student;
// Recompute every student's score in place.
// NOTE(review): calculate_score is defined elsewhere - presumably it reads
// the record and returns a new score; confirm against its definition.
void process_students_bad(Student* students, int count) {
    for (int i = 0; i < count; i++) {
        // Accessing large structure - poor cache usage: each iteration
        // pulls in a new ~264-byte record to update 8 bytes of score.
        students[i].score = calculate_score(&students[i]);
    }
}
// Good: Structure of Arrays (SoA) for better cache usage - each field lives
// in its own contiguous array, so a pass over one field streams through memory.
typedef struct {
    int* ids;                  // parallel arrays: element i of each array
    char (*names)[50];         // describes the same student
    double* scores;
    char (*descriptions)[200];
    int count;                 // number of valid elements in every array
} StudentsSoA;
// Good: Structure of Arrays - the score pass touches only the dense,
// contiguous scores array instead of whole student records.
void process_students_good(StudentsSoA* students) {
    const int n = students->count;
    double* scores = students->scores;
    for (int idx = 0; idx < n; idx++) {
        // Contiguous memory access - better cache usage
        scores[idx] = calculate_score_soa(students, idx);
    }
}
// Even better: Separate frequently accessed data (hot/cold splitting) -
// the hot fields stay dense while bulky text lives in a second array.
typedef struct {
    int id;
    double score;
} StudentCore;    // hot: 16 bytes per student, four per 64-byte cache line
typedef struct {
    char name[50];
    char description[200];
} StudentDetails; // cold: only touched when the text is actually needed
// Hot-path score update: reads only the compact StudentCore array; the
// cold details array is passed through for interface symmetry but unused.
void process_students_optimized(StudentCore* cores, StudentDetails* details, int count) {
    (void)details; // cold data deliberately untouched on the hot path
    for (int idx = 0; idx < count; idx++) {
        cores[idx].score = calculate_score_core(&cores[idx]);
    }
}
📊 Algorithm Optimization¶
Loop Optimization¶
// Bad: Inefficient loop - re-reads and re-writes array[i] for every test.
void process_array_bad(int* array, int size) {
    for (int i = 0; i < size; i++) {
        if (array[i] % 2 == 0) {
            array[i] = array[i] * 2;
        }
        // NOTE: this second test sees the value already doubled above
        if (array[i] > 100) {
            array[i] = array[i] + 10;
        }
    }
}
// Good: load the element once into a local, transform it, store it once.
// One read and one write per element instead of repeated array accesses.
void process_array_good(int* array, int size) {
    for (int idx = 0; idx < size; idx++) {
        int value = array[idx];
        if (value % 2 == 0) {
            value *= 2;
        }
        if (value > 100) {
            value += 10;
        }
        array[idx] = value;
    }
}
// Better: manual 4-way loop unrolling (optimizing compilers often do this
// automatically at -O2/-O3).
void process_array_unrolled(int* array, int size) {
    int idx = 0;
    // Main unrolled loop: four elements per iteration
    while (idx + 4 <= size) {
        array[idx]     = process_element(array[idx]);
        array[idx + 1] = process_element(array[idx + 1]);
        array[idx + 2] = process_element(array[idx + 2]);
        array[idx + 3] = process_element(array[idx + 3]);
        idx += 4;
    }
    // Scalar tail for the remaining 0-3 elements
    while (idx < size) {
        array[idx] = process_element(array[idx]);
        idx++;
    }
}
Branch Prediction Optimization¶
// Bad: Unpredictable branches - with random parity the branch predictor
// is wrong about half the time, stalling the pipeline on each miss.
void process_data_branchy(int* data, int size) {
    for (int i = 0; i < size; i++) {
        if (data[i] % 2 == 0) { // Unpredictable branch
            data[i] *= 2;
        } else {
            data[i] += 1;
        }
    }
}
// Good: branchless form - blend the two updates arithmetically so there is
// no data-dependent jump. even: x*2, odd: x+1, selected by the 0/1 flag.
void process_data_branchless(int* data, int size) {
    for (int idx = 0; idx < size; idx++) {
        const int even_flag = (data[idx] % 2 == 0) ? 1 : 0;
        data[idx] = data[idx] * (1 + even_flag) + (1 - even_flag);
    }
}
// Better: two passes with highly predictable branches instead of one
// unpredictable one. Pass 1 doubles evens (they stay even), so pass 2's
// odd test still matches exactly the originally-odd elements.
void process_data_separated(int* data, int size) {
    // Pass 1: double every even element
    for (int idx = 0; idx < size; idx++) {
        if ((data[idx] % 2) == 0) {
            data[idx] *= 2;
        }
    }
    // Pass 2: increment every odd element
    for (int idx = 0; idx < size; idx++) {
        if ((data[idx] % 2) != 0) {
            data[idx] += 1;
        }
    }
}
Bit Manipulation¶
// Bad: Slow operations - a division loop, O(log n) iterations.
// (bool/true/false require <stdbool.h> prior to C23.)
bool is_power_of_two_slow(int n) {
    if (n <= 0) return false;
    while (n % 2 == 0) {
        n /= 2;
    }
    return n == 1; // only a power of two reduces all the way to 1
}
// Good: bit trick - a power of two has exactly one set bit, so clearing
// the lowest set bit with n & (n - 1) must yield zero.
bool is_power_of_two_fast(int n) {
    if (n <= 0) {
        return false; // zero and negatives are never powers of two
    }
    return (n & (n - 1)) == 0;
}
// Fast absolute value (avoiding branching).
// CAVEATS: right-shifting a negative int is implementation-defined
// (arithmetic shift on mainstream compilers), and abs_fast(INT_MIN)
// overflows - the same limitation as abs().
int abs_fast(int n) {
    int mask = n >> (sizeof(int) * 8 - 1); // 0 when n >= 0, all-ones when n < 0
    return (n + mask) ^ mask;              // negative: (n - 1) ^ ~0 == -n
}
// Fast min/max (avoiding branching).
// -(a < b) is all-ones when a < b and zero otherwise, so the AND/XOR
// selects a or b without a conditional jump. NOTE(review): modern
// compilers usually emit a branch-free cmov for a plain ternary anyway -
// prefer the readable form unless profiling says otherwise.
int min_fast(int a, int b) {
    return b ^ ((a ^ b) & -(a < b));
}
int max_fast(int a, int b) {
    return a ^ ((a ^ b) & -(a < b));
}
🔄 Compiler Optimization¶
Compiler Flags¶
# Basic optimization
gcc -O2 program.c -o program
# Maximum optimization
gcc -O3 program.c -o program
# Optimization with architecture-specific tuning
gcc -O3 -march=native program.c -o program
# Link Time Optimization (LTO)
gcc -O3 -flto program.c -o program
# Profile-Guided Optimization (PGO)
# 1. Compile with profiling
gcc -O3 -fprofile-generate program.c -o program
# 2. Run with representative workload
./program
# 3. Rebuild with profile data
gcc -O3 -fprofile-use program.c -o program
Inline Functions¶
// Tiny hot helper: static inline lets the compiler eliminate the call
// entirely at any optimization level where it inlines.
static inline int square(int value) {
    return value * value;
}
// Macro for very small operations.
// WARNING: each argument is evaluated twice - never pass expressions
// with side effects (e.g. MIN(i++, j) increments i twice). Prefer a
// static inline function when type checking and single evaluation matter.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// Inline assembly example for critical sections.
// FIX: the original constraints were wrong - `result` was declared as an
// output ("=r") but the asm never wrote to it; imul wrote into the input
// register holding `a`, leaving `result` uninitialized (UB). The "+r"
// read-write constraint makes `result` both the source and destination.
// NOTE: the compiler already emits imul for `a * b`; this is illustrative.
static inline int fast_multiply(int a, int b) {
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    int result = a;
    // imul %1, %0  ->  result *= b
    asm("imul %1, %0" : "+r" (result) : "r" (b));
    return result;
#else
    return a * b; // portable fallback for non-x86 targets
#endif
}
Restrict Keyword¶
// restrict promises the compiler that a, b and c never alias, so it can
// vectorize the loop without emitting runtime overlap checks.
void process_arrays(int* restrict a, int* restrict b, int* restrict c, int size) {
    for (int idx = 0; idx < size; idx++) {
        const int sum = a[idx] + b[idx];
        c[idx] = sum;
    }
}
// memcpy-style byte copy; restrict asserts the regions do not overlap,
// so the compiler is free to reorder and vectorize the loop.
// Returns dest, matching the memcpy contract.
void* memcpy_optimized(void* restrict dest, const void* restrict src, size_t n) {
    char* out = dest;
    const char* in = src;
    size_t i = 0;
    while (i < n) {
        out[i] = in[i];
        i++;
    }
    return dest;
}
🔍 Profiling and Measurement¶
Timing Functions¶
#include <time.h>
#include <sys/time.h>
// High-resolution timestamp in seconds from a monotonic clock
// (immune to system clock adjustments; suitable for measuring intervals).
double get_time() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    return (double)now.tv_sec + (double)now.tv_nsec / 1e9;
}
// Time one invocation of `func` over (data, size) and print the wall-clock
// duration labeled with `name`.
void benchmark_function(void (*func)(int*, int), int* data, int size, const char* name) {
    const double started = get_time();
    func(data, size);
    const double elapsed = get_time() - started;
    printf("%s: %.6f seconds\n", name, elapsed);
}
// Usage: compare the two implementations on a large input.
// FIX: the original declared `int data[1000000]` (~4 MB) on the stack,
// which risks overflowing the default thread stack - heap-allocate instead.
void test_performance() {
    const int n = 1000000;
    int* data = malloc(sizeof(int) * (size_t)n);
    if (data == NULL) return;
    // Initialize data
    for (int i = 0; i < n; i++) {
        data[i] = i;
    }
    benchmark_function(process_array_bad, data, n, "Bad version");
    benchmark_function(process_array_good, data, n, "Good version");
    free(data);
}
CPU Performance Counters¶
#include <stdio.h>
#include <stdint.h>
// Read CPU timestamp counter (x86/x86-64 only; the asm will not assemble
// on other architectures).
// NOTE: rdtsc is not a serializing instruction - the CPU may reorder it
// relative to surrounding code; pair with cpuid/lfence for precise timing.
static inline uint64_t rdtsc() {
    unsigned int low, high;
    asm volatile("rdtsc" : "=a" (low), "=d" (high)); // counter returned in EDX:EAX
    return ((uint64_t)high << 32) | low;
}
// Measure elapsed timestamp-counter ticks around a small workload.
// FIX: "%lu" is the wrong format for uint64_t on 32-bit and LLP64
// platforms (UB); cast to unsigned long long and use "%llu".
// NOTE: ticks are reference-clock ticks, not necessarily core cycles.
void measure_cycles() {
    uint64_t start = rdtsc();
    // Code to measure
    int result = 0;
    for (int i = 0; i < 1000; i++) {
        result += i * i;
    }
    uint64_t end = rdtsc();
    printf("CPU cycles: %llu\n", (unsigned long long)(end - start));
    printf("Result: %d\n", result);
}
🛠️ System-Level Optimization¶
SIMD Instructions¶
#include <immintrin.h> // For SSE/AVX
// Vectorized addition using SSE.
// FIX: _mm_load_ps/_mm_store_ps require 16-byte-aligned pointers and
// fault on arbitrary heap or stack arrays; the unaligned variants
// (_mm_loadu_ps/_mm_storeu_ps) are safe for any pointer and cost little
// on modern CPUs.
void vector_add_sse(float* a, float* b, float* result, int size) {
    int i = 0;
    // Process 4 floats at a time
    for (; i <= size - 4; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_loadu_ps(&b[i]);
        __m128 vresult = _mm_add_ps(va, vb);
        _mm_storeu_ps(&result[i], vresult);
    }
    // Handle remaining elements
    for (; i < size; i++) {
        result[i] = a[i] + b[i];
    }
}
// Vectorized multiplication using AVX (compile with -mavx or -march=native).
// FIX: _mm256_load_pd/_mm256_store_pd require 32-byte-aligned pointers and
// fault on ordinary malloc'd or stack arrays; use the unaligned variants.
void vector_multiply_avx(double* a, double* b, double* result, int size) {
    int i = 0;
    // Process 4 doubles at a time
    for (; i <= size - 4; i += 4) {
        __m256d va = _mm256_loadu_pd(&a[i]);
        __m256d vb = _mm256_loadu_pd(&b[i]);
        __m256d vresult = _mm256_mul_pd(va, vb);
        _mm256_storeu_pd(&result[i], vresult);
    }
    // Handle remaining elements
    for (; i < size; i++) {
        result[i] = a[i] * b[i];
    }
}
Multi-threading with OpenMP¶
#include <omp.h>
// Parallel processing with OpenMP: the pragma splits the loop's iterations
// across threads (compile with -fopenmp; without it the pragma is ignored
// and the loop runs serially). Safe here because each iteration writes a
// distinct element - no shared mutable state.
// NOTE(review): expensive_computation is defined elsewhere; it must be
// thread-safe for the parallel version to be correct - confirm.
void process_parallel(int* data, int size) {
    #pragma omp parallel for
    for (int i = 0; i < size; i++) {
        data[i] = expensive_computation(data[i]);
    }
}
// Parallel reduction: each thread accumulates a private partial sum and
// OpenMP combines them with + when the loop ends. Serial semantics are
// identical when compiled without -fopenmp.
int sum_array_parallel(int* data, int size) {
    int total = 0;
    #pragma omp parallel for reduction(+:total)
    for (int idx = 0; idx < size; idx++) {
        total += data[idx];
    }
    return total;
}
// Usage: time the parallel loop over 1,000,000 ints.
// FIX: the original declared `int data[1000000]` (~4 MB) on the stack,
// which risks overflowing the default thread stack - heap-allocate instead.
void test_openmp() {
    const int n = 1000000;
    int* data = malloc(sizeof(int) * (size_t)n);
    if (data == NULL) return;
    // Initialize data
    for (int i = 0; i < n; i++) {
        data[i] = i;
    }
    // Set number of threads
    omp_set_num_threads(4);
    double start = get_time();
    process_parallel(data, n);
    double end = get_time();
    printf("Parallel processing: %.6f seconds\n", end - start);
    free(data);
}
📈 Memory Access Patterns¶
Optimal Memory Layout¶
// Bad: Strided memory access - iterating column-first jumps to a different
// row (and typically a different cache line) on every single access.
void process_strided(int** matrix, int rows, int cols) {
    for (int col = 0; col < cols; col++) {
        for (int row = 0; row < rows; row++) {
            matrix[row][col] *= 2; // Poor cache locality
        }
    }
}
// Good: row-first iteration - the inner loop walks each row contiguously,
// so consecutive accesses hit the same cache lines.
void process_sequential(int** matrix, int rows, int cols) {
    for (int r = 0; r < rows; r++) {
        int* row = matrix[r];
        for (int c = 0; c < cols; c++) {
            row[c] *= 2; // sequential within the row - good cache locality
        }
    }
}
// Better: one flat row-major array - a single linear pass over contiguous
// memory, with no per-row pointer chasing at all.
void process_1d(int* matrix, int rows, int cols) {
    const int total = rows * cols;
    for (int idx = 0; idx < total; idx++) {
        matrix[idx] *= 2; // Sequential access
    }
}
// Access with 1D indexing: row-major offset (row * cols + col) computed by
// the macro, keeping 2D-style code over a flat array.
#define MATRIX_ELEM(matrix, row, col, cols) matrix[(row) * (cols) + (col)]
void process_1d_indexed(int* matrix, int rows, int cols) {
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            MATRIX_ELEM(matrix, r, c, cols) *= 2;
        }
    }
}
Prefetching¶
#include <xmmintrin.h> // For prefetch instructions
// Manual prefetching: ask the cache hierarchy to start fetching data we
// will touch a few iterations from now, hiding memory latency behind the
// current element's work.
void process_with_prefetch(int* data, int size) {
    const int prefetch_distance = 8; // Prefetch 8 elements ahead
    for (int idx = 0; idx < size; idx++) {
        const int ahead = idx + prefetch_distance;
        if (ahead < size) {
            // T0 hint: fetch into all cache levels
            _mm_prefetch(&data[ahead], _MM_HINT_T0);
        }
        // Process current data
        data[idx] = expensive_operation(data[idx]);
    }
}
// Block processing for better cache usage: operate on one bounded chunk at
// a time so the working set stays cache-resident.
void process_blocks(int* data, int size) {
    // NOTE(review): 64 ints = 256 bytes, i.e. four 64-byte cache lines -
    // the original comment calling this "cache line size" conflated element
    // count with bytes. The block still fits comfortably in L1.
    const int block_size = 64;
    for (int block_start = 0; block_start < size; block_start += block_size) {
        int block_end = block_start + block_size;
        if (block_end > size) block_end = size; // clamp the final partial block
        // Process block
        for (int i = block_start; i < block_end; i++) {
            data[i] = expensive_operation(data[i]);
        }
    }
}
🎯 Optimization Best Practices¶
Performance Checklist¶
// Optimization checklist implementation.
// FIX: the original referenced undeclared identifiers `result` and `data`
// (the snippet did not compile), and `data[i]` with i up to 1,000,000
// would have been out of bounds for any small demo array; both are now
// declared locally and indexed modulo the array length.
void optimization_checklist() {
    // 1. Use appropriate data types
    //    - int instead of long when the value range allows
    //    - float instead of double when precision allows
    int data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int result = 0;
    // 2. Minimize function calls in loops
    for (int i = 0; i < 1000000; i++) {
        // Bad: function call in loop
        // result = expensive_function(data[i]);
        // Good: inline or move outside loop
        result = data[i % 8] * 2; // Simple operation
    }
    (void)result; // demo value is intentionally unused
    // 3. Use bit operations when possible
    //    Bad: x % 2 == 0   Good: (x & 1) == 0
    // 4. Avoid unnecessary branches - use branchless code when possible
    // 5. Optimize memory access patterns
    //    - access memory sequentially
    //    - use data structures that fit in cache
}
Compiler Hints¶
// __builtin_expect tells GCC/Clang which branch dominates, so the code
// for the likely case is laid out on the fall-through path.
void process_with_hints(int* data, int size) {
    for (int idx = 0; idx < size; idx++) {
        const long likely_positive = __builtin_expect(data[idx] > 0, 1);
        if (likely_positive) {
            data[idx] *= 2; // Likely case
        } else {
            data[idx] += 1; // Unlikely case
        }
    }
}
// __attribute__((hot)) biases the optimizer toward speed for this function
// and groups it with other hot code for better instruction-cache locality.
__attribute__((hot))
void frequently_called_function(int* data, int size) {
    int idx = 0;
    while (idx < size) {
        data[idx] *= 2;
        idx++;
    }
}
// __attribute__((cold)) marks rarely executed code: the compiler optimizes
// it for size rather than speed and moves it away from hot code paths.
__attribute__((cold))
void error_handling_function() {
    // This function is rarely called
    // Compiler will optimize for size, not speed
    printf("An error occurred\n");
}
📚 Related Resources¶
- C Best Practices - Write efficient C code
- C Common Mistakes - Avoid performance pitfalls
- C Debugging Techniques - Performance debugging
- C Resources - Performance tools and documentation
🔗 Related Performance Guides¶
- Time Complexity - Algorithm analysis
- Space Complexity - Memory optimization
- Algorithm Analysis - Performance measurement
- Optimization Techniques - General optimization strategies
🔗 Language-Specific Performance¶
- Java Performance Tips - Java optimization
- Python Performance Tips - Python optimization
- Oracle Performance Tips - Oracle optimization