Skip to content

Python Performance Tips

This guide covers Python performance optimization techniques, best practices, and tools for writing efficient Python code.

🎯 Core Performance Principles

Python Performance Characteristics

  • Interpreted Language: Slower than compiled languages
  • Dynamic Typing: Runtime type checking overhead
  • GIL Limitation: Global Interpreter Lock limits true parallelism
  • Rich Ecosystem: Many optimized libraries available

Measurement First

  • Profile Before Optimizing: Use profilers to identify bottlenecks
  • Benchmark Properly: Use timeit and pytest-benchmark
  • Measure in Context: Profile realistic workloads
  • Set Performance Goals: Define measurable targets

🚀 Algorithm Optimization

Choose Right Data Structures

# Bad: List for membership testing (O(n))
def has_permission_list(user_permissions, permission):
    """Return True if *permission* appears in the list (linear scan)."""
    return any(candidate == permission for candidate in user_permissions)

# Good: Set for membership testing (O(1))
def has_permission_set(user_permissions, permission):
    """Return True if *permission* is granted.

    For the advertised O(1) lookup, *user_permissions* must already be a
    set built once by the caller.  FIX: the original called
    ``set(user_permissions)`` inside the function, paying an O(n)
    conversion on every single call — which defeats the entire point of
    using a set.  The membership result is identical either way.
    """
    return permission in user_permissions

# Performance comparison
import timeit

permissions = [f"perm_{i}" for i in range(10000)]
# FIX: build the set ONCE, outside the timed lambda.  The original called
# set(permissions) inside the timed code, so "set lookup" actually measured
# 1000 full O(n) set constructions instead of 1000 O(1) lookups.
permission_set = set(permissions)
test_permission = "perm_5000"

# List lookup: O(n) scan per test
list_time = timeit.timeit(
    lambda: test_permission in permissions, 
    number=1000
)

# Set lookup: O(1) hash probe per test (construction excluded from timing)
set_time = timeit.timeit(
    lambda: test_permission in permission_set, 
    number=1000
)

print(f"List: {list_time:.6f}s, Set: {set_time:.6f}s")

Use Built-in Functions

# Bad: Manual implementation
def sum_manual(numbers):
    """Add up *numbers* with an explicit Python-level loop (slow on purpose)."""
    running = 0
    for value in numbers:
        running = running + value
    return running

# Good: Built-in sum function
def sum_builtin(numbers):
    """Sum *numbers* via the C-implemented builtin."""
    total = sum(numbers)
    return total

# Bad: Manual max finding
def max_manual(numbers):
    """Scan for the largest element (raises IndexError on an empty sequence)."""
    best = numbers[0]
    for idx in range(1, len(numbers)):
        if numbers[idx] > best:
            best = numbers[idx]
    return best

# Good: Built-in max function
def max_builtin(numbers):
    """Return the largest element via the C-implemented builtin."""
    largest = max(numbers)
    return largest

List Comprehensions and Generators

# Bad: Loop with append
def squares_bad(numbers):
    """Build a list of squares via an explicit append loop (the slow way)."""
    squares = []
    for value in numbers:
        squares.append(value * value)
    return squares

# Good: List comprehension
def squares_good(numbers):
    """Return the squares of *numbers* as a new list (comprehension form)."""
    return [value * value for value in numbers]

# Better: Generator for large datasets
def squares_generator(numbers):
    """Return a lazy generator of squares — O(1) memory regardless of input size."""
    return (value * value for value in numbers)

# Performance comparison
import sys

large_list = list(range(1000000))

# List comprehension materializes all 1M results up front; sys.getsizeof is
# SHALLOW — it reports only the list's own pointer array (~8MB here), not
# the int objects the pointers reference.
list_comp = [x * x for x in large_list]
print(f"List comprehension: {sys.getsizeof(list_comp)} bytes")

# A generator object is a small constant size (~100-200 bytes) regardless of
# input length; each square is produced on demand during iteration.
gen_comp = (x * x for x in large_list)
print(f"Generator: {sys.getsizeof(gen_comp)} bytes")

📊 Memory Optimization

Memory-Efficient Data Structures

# Baseline: list of plain tuples (compact, but fields are accessed by index)
def create_user_list_bad():
    """Create 100k (username, email, id) tuples."""
    return [(f"user_{i}", f"email_{i}@example.com", i)
            for i in range(100000)]

# Good: objects with __slots__ (named attribute access; far smaller than a
# regular __dict__-backed class, and comparable in memory to tuples)
class User:
    """Lightweight user record; __slots__ drops the per-instance __dict__."""

    __slots__ = ['username', 'email', 'id']  # Reduces memory usage

    def __init__(self, username, email, user_id):
        # assignment order is irrelevant; grouped identifier-first for readability
        self.id = user_id
        self.email = email
        self.username = username

def create_user_list_good():
    """Create 100k slotted User objects (one per synthetic user)."""
    return [User(f"user_{i}", f"email_{i}@example.com", i)
            for i in range(100000)]

# Better: Use namedtuple for simple data structures
from collections import namedtuple

UserTuple = namedtuple('UserTuple', ['username', 'email', 'id'])

def create_user_list_tuple():
    """Build 100k UserTuple records (tuple memory footprint, named fields)."""
    records = [
        UserTuple(f"user_{i}", f"email_{i}@example.com", i)
        for i in range(100000)
    ]
    return records

Generator Usage

# Bad: Loading entire file into memory
def process_large_file_bad(filename):
    """Read the whole file at once, then return stripped/uppercased lines."""
    with open(filename, 'r') as fh:
        all_lines = fh.readlines()  # Loads entire file
    return [ln.strip().upper() for ln in all_lines]

# Good: Process line by line
def process_large_file_good(filename):
    """Yield each line stripped and uppercased, reading lazily."""
    with open(filename, 'r') as fh:
        for raw in fh:  # One line at a time
            yield raw.strip().upper()

# Better: keep the function itself a generator so the file stays open
def process_large_file_generator(filename):
    """Lazily yield each line of *filename* stripped and uppercased.

    BUG FIX: the original RETURNED a generator expression from inside the
    ``with`` block.  The function returned immediately, the ``with`` exited,
    and the file was closed — so consuming the returned generator raised
    "ValueError: I/O operation on closed file".  ``yield from`` makes this
    function itself a generator, keeping the file open until iteration
    finishes (or the generator is closed).
    """
    with open(filename, 'r') as file:
        yield from (line.strip().upper() for line in file)

# Usage
# NOTE(review): assumes "large_file.txt" exists in the working directory;
# the first loop iteration raises FileNotFoundError otherwise — confirm.
for processed_line in process_large_file_good("large_file.txt"):
    # Process each line without loading entire file
    pass

Memory Profiling

import tracemalloc
import sys
from memory_profiler import profile

# Memory profiling with tracemalloc
def memory_intensive_function():
    """Allocate a 1M-int list while reporting tracemalloc statistics.

    Returns the allocated list so the caller can see the traced object.
    """
    tracemalloc.start()  # begin tracking every allocation from here on

    large_data = list(range(1000000))

    # (current, peak) are byte counts since start()
    current, peak = tracemalloc.get_traced_memory()
    print(f"Current memory usage: {current / 1024 / 1024:.2f} MB")
    print(f"Peak memory usage: {peak / 1024 / 1024:.2f} MB")

    tracemalloc.stop()
    return large_data

# Memory profiling with memory_profiler
@profile
def memory_profile_function():
    """Build 100k small dicts so memory_profiler can report per-line costs."""
    return [
        {'id': n, 'name': f"item_{n}", 'value': n * 2}
        for n in range(100000)
    ]

🔄 Concurrency and Parallelism

Multiprocessing for CPU-Bound Tasks

import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor

# CPU-bound task
def fibonacci(n):
    """Naive exponential-time Fibonacci — deliberately CPU-heavy."""
    return n if n <= 1 else fibonacci(n - 1) + fibonacci(n - 2)

# Bad: Sequential execution
def fibonacci_sequential(numbers):
    return [fibonacci(n) for n in numbers]

# Good: Parallel execution with multiprocessing
def fibonacci_parallel(numbers):
    """Fan fibonacci() out across one worker process per CPU core."""
    workers = multiprocessing.cpu_count()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        results = pool.map(fibonacci, numbers)
        return list(results)

# Performance comparison
if __name__ == "__main__":
    # NOTE(review): naive fibonacci(35..40) takes minutes in total — this is
    # a demonstration benchmark, not something to run casually.
    numbers = [35, 36, 37, 38, 39, 40]

    # Sequential execution: one call after another on a single core
    start_time = time.time()
    result_sequential = fibonacci_sequential(numbers)
    sequential_time = time.time() - start_time

    # Parallel execution: the __main__ guard above is required because
    # ProcessPoolExecutor re-imports this module in each worker on
    # spawn-based platforms (Windows, macOS).
    start_time = time.time()
    result_parallel = fibonacci_parallel(numbers)
    parallel_time = time.time() - start_time

    print(f"Sequential: {sequential_time:.2f}s")
    print(f"Parallel: {parallel_time:.2f}s")
    print(f"Speedup: {sequential_time / parallel_time:.2f}x")

Threading for I/O-Bound Tasks

import threading
import time
import requests
from concurrent.futures import ThreadPoolExecutor

# I/O-bound task
def fetch_url(url):
    """Return the response body size in bytes, or 0 on any request error."""
    try:
        resp = requests.get(url, timeout=5)
    except requests.RequestException:
        return 0
    return len(resp.content)

# Bad: Sequential execution
def fetch_urls_sequential(urls):
    """Fetch every URL one at a time; total time is the sum of all requests."""
    return [fetch_url(url) for url in urls]

# Good: Parallel execution with threading
def fetch_urls_parallel(urls):
    with ThreadPoolExecutor(max_workers=10) as executor:
        return list(executor.map(fetch_url, urls))

# Usage
# NOTE(review): this snippet performs real network requests at import time —
# confirm that is intended before reusing it.
urls = [
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1"
]

# Sequential execution: total time ~ sum of the individual request times
start_time = time.time()
results_sequential = fetch_urls_sequential(urls)
sequential_time = time.time() - start_time

# Parallel execution: total time ~ the single slowest request
start_time = time.time()
results_parallel = fetch_urls_parallel(urls)
parallel_time = time.time() - start_time

print(f"Sequential: {sequential_time:.2f}s")
print(f"Parallel: {parallel_time:.2f}s")

Asyncio for Asynchronous Programming

import asyncio
import aiohttp
import time

# Async I/O-bound task
async def fetch_url_async(session, url):
    """Return the response body length for *url*, or 0 on any error.

    FIX: aiohttp's ``timeout=`` parameter expects an ``aiohttp.ClientTimeout``
    object; passing a bare number has been deprecated since aiohttp 3.x and
    is rejected by newer releases.
    """
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=5)) as response:
            return len(await response.read())
    except Exception:
        return 0

# Async execution
async def fetch_urls_async(urls):
    """Fetch all URLs concurrently through one shared client session."""
    async with aiohttp.ClientSession() as session:
        coros = (fetch_url_async(session, u) for u in urls)
        return await asyncio.gather(*coros)

# Usage
async def main():
    """Time three concurrent fetches of a 1-second-delay endpoint."""
    urls = ["https://httpbin.org/delay/1"] * 3

    started = time.time()
    results = await fetch_urls_async(urls)
    async_time = time.time() - started

    print(f"Async: {async_time:.2f}s")
    return results

# Run async function
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion,
    # and closes the loop (Python 3.7+).
    asyncio.run(main())

🛠️ Caching and Memoization

Function Caching

from functools import lru_cache
import time

# Bad: Recalculating expensive operations
def expensive_function(x):
    """Square *x* after a 1-second artificial delay — no caching at all."""
    time.sleep(1)  # Simulate expensive computation
    result = x * x
    return result

# Good: Use lru_cache for memoization
@lru_cache(maxsize=128)
def cached_expensive_function(x):
    """Square *x* with a 1-second delay; repeat calls hit the LRU cache."""
    time.sleep(1)  # Simulate expensive computation
    result = x * x
    return result

# Performance comparison
def test_performance():
    """Print wall time of five uncached vs five cached calls.

    Each expensive_function call sleeps 1s, so the uncached loop takes ~5s.
    The cached loop ALSO takes ~5s here, because all five arguments are
    distinct first-time calls; repeating the same arguments would show the
    cache benefit.  NOTE(review): the test_ name prefix makes pytest collect
    this function as a test — consider renaming.
    """
    # Without cache
    start_time = time.time()
    for i in range(5):
        result = expensive_function(i)
    no_cache_time = time.time() - start_time

    # With cache
    start_time = time.time()
    for i in range(5):
        result = cached_expensive_function(i)
    with_cache_time = time.time() - start_time

    print(f"No cache: {no_cache_time:.2f}s")
    print(f"With cache: {with_cache_time:.2f}s")

# Custom cache implementation
class SimpleCache:
    """Small bounded mapping with true least-recently-used eviction.

    FIX: the original claimed "simple LRU" but evicted in pure insertion
    (FIFO) order — ``get`` never refreshed an entry's recency, and ``set``
    could evict even when overwriting an existing key.  Here ``get`` moves a
    hit to the most-recently-used position and ``set`` refreshes overwritten
    keys, relying on dict insertion-order preservation (Python 3.7+).
    """

    def __init__(self, max_size=128):
        self.cache = {}          # insertion order == recency order (oldest first)
        self.max_size = max_size

    def get(self, key):
        """Return the cached value for *key* (refreshing recency) or None."""
        if key in self.cache:
            value = self.cache.pop(key)
            self.cache[key] = value  # re-insert at the most-recently-used end
            return value
        return None

    def set(self, key, value):
        """Store *key* -> *value*, evicting the least-recently-used entry if full."""
        if key in self.cache:
            del self.cache[key]      # refresh position on overwrite, no eviction
        elif len(self.cache) >= self.max_size:
            lru_key = next(iter(self.cache))  # first key == least recently used
            del self.cache[lru_key]
        self.cache[key] = value

# Use custom cache
cache = SimpleCache()

def custom_cached_function(x):
    """Return x*x, memoizing results in the module-level SimpleCache."""
    hit = cache.get(x)
    if hit is not None:
        return hit

    computed = x * x  # Expensive computation
    cache.set(x, computed)
    return computed

Data Caching

import redis
import json
import pickle
from datetime import timedelta

# Redis caching
class RedisCache:
    """Best-effort Redis cache that pickles values.

    Every operation swallows exceptions so an unreachable or broken Redis
    server degrades to cache misses / failed writes instead of raising.
    """

    def __init__(self, host='localhost', port=6379):
        # decode_responses=False: values are raw pickle bytes, not text
        self.redis_client = redis.Redis(host=host, port=port, decode_responses=False)

    def get(self, key):
        """Return the unpickled value for *key*, or None on miss or any error."""
        try:
            raw = self.redis_client.get(key)
            if not raw:
                return None
            return pickle.loads(raw)
        except Exception:
            return None

    def set(self, key, value, expire_seconds=3600):
        """Store *value* under *key* with a TTL; returns False on any error."""
        try:
            payload = pickle.dumps(value)
            return self.redis_client.setex(key, expire_seconds, payload)
        except Exception:
            return False

    def delete(self, key):
        """Remove *key*; returns False on any error."""
        try:
            return self.redis_client.delete(key)
        except Exception:
            return False

# Usage
cache = RedisCache()

def get_user_data(user_id):
    """Return the user record for *user_id*, read-through-cached in Redis.

    FIX: the hit check is now ``is not None`` — the original ``if
    cached_data:`` treated any legitimately-falsy cached value (e.g. an
    empty dict) as a miss and re-fetched it on every call.
    """
    cache_key = f"user:{user_id}"

    # Try cache first
    cached_data = cache.get(cache_key)
    if cached_data is not None:
        return cached_data

    # Fetch from database (simulated)
    user_data = {
        'id': user_id,
        'name': f"User {user_id}",
        'email': f"user{user_id}@example.com"
    }

    # Cache the result with a 5-minute TTL
    cache.set(cache_key, user_data, expire_seconds=300)
    return user_data

📈 Profiling and Optimization

Performance Profiling

import cProfile
import pstats
import timeit
from line_profiler import LineProfiler

# Time-based profiling
def profile_function():
    """Benchmark a 1M-iteration sum-of-squares with timeit and print the result."""
    def expensive_operation():
        # identical total to the original accumulate loop
        return sum(i * i for i in range(1000000))

    # Time the function
    time_taken = timeit.timeit(expensive_operation, number=10)
    print(f"Function took {time_taken:.4f} seconds for 10 runs")

# Profile-based analysis
def profile_with_cprofile():
    """Profile a small branching loop and print the top-10 cumulative stats."""
    def complex_function():
        values = []
        for i in range(10000):
            values.append(i * 2 if i % 2 == 0 else i * 3)
        return sum(values)

    # Collect profile data around the call
    profiler = cProfile.Profile()
    profiler.enable()
    complex_function()
    profiler.disable()

    # Report the hottest entries by cumulative time
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    stats.print_stats(10)  # Top 10 functions

# Line-by-line profiling
def profile_lines():
    """Run a toy loop under line_profiler and print per-line timings.

    FIX: a LineProfiler instance is itself the decorator — the original used
    ``@lp.profile``, an attribute that does not exist on LineProfiler.
    ``@lp`` (equivalent to ``lp(func)``) wraps the function so call counts
    and per-line timings are recorded, per the line_profiler documentation.
    """
    lp = LineProfiler()

    @lp
    def line_intensive_function():
        total = 0
        for i in range(10000):
            if i % 2 == 0:
                total += i * 2
            else:
                total += i * 3
        return total

    line_intensive_function()
    lp.print_stats()

Memory Profiling

import gc
import sys
import tracemalloc
from pympler import asizeof
from pympler import muppy
from pympler import summary

# Object size measurement
def measure_object_sizes():
    """Print deep (pympler.asizeof) vs shallow (sys.getsizeof) list sizes.

    asizeof follows references and counts the contained int objects too;
    sys.getsizeof reports only the list object itself (its pointer array).
    """
    small_list = [1, 2, 3]
    large_list = list(range(10000))

    print(f"Small list size: {asizeof.asizeof(small_list)} bytes")
    print(f"Large list size: {asizeof.asizeof(large_list)} bytes")
    print(f"Large list sys.sizeof: {sys.getsizeof(large_list)} bytes")

# Memory leak detection
def detect_memory_leaks():
    """Take two tracemalloc snapshots around a partial deletion and print
    the top-10 allocation differences.

    FIX: the original never called tracemalloc.stop(), leaving allocation
    tracing enabled (and slowing every later allocation in the process)
    after the function returned.  The try/finally guarantees tracing is
    turned off even if snapshotting raises.
    """
    tracemalloc.start()
    try:
        # Create objects (1000 small dicts)
        objects = [{'id': i, 'data': 'x' * 100} for i in range(1000)]

        # Snapshot with everything alive
        snapshot1 = tracemalloc.take_snapshot()

        # Drop half the objects and force a collection pass
        del objects[:500]
        gc.collect()

        # Snapshot after the deletion
        snapshot2 = tracemalloc.take_snapshot()

        # Report the biggest per-line allocation deltas
        for stat in snapshot2.compare_to(snapshot1, 'lineno')[:10]:
            print(stat)
    finally:
        tracemalloc.stop()

# Memory summary
def memory_summary():
    """Print a by-type summary of every object currently tracked by muppy."""
    live_objects = muppy.get_objects()
    type_summary = summary.summarize(live_objects)
    summary.print_(type_summary)

🎯 Optimization Best Practices

Code Optimization Checklist

# Optimization checklist implementation

class PerformanceChecker:
    """Micro-benchmarks for common Python optimization choices.

    FIXES vs the original:
    - the set-membership benchmark builds the set ONCE outside the timed
      lambda, so 'set_time' measures lookups rather than set construction;
    - check_comprehension_vs_loop now really times a loop AND a
      comprehension (the original timed a comprehension twice, as its own
      inline comment admitted);
    - check_string_operations times both ``+=`` concatenation and
      ``''.join`` (the original labelled a join as concatenation).
    All previously returned dict keys are preserved; new keys are additions.
    """

    @staticmethod
    def check_list_vs_set_membership():
        """Compare list vs prebuilt-set membership; return timings and speedup."""
        data = list(range(10000))
        data_set = set(data)  # built once — excluded from the timed work
        test_item = 5000

        # List membership: O(n) scan per probe
        list_time = timeit.timeit(lambda: test_item in data, number=1000)

        # Set membership: O(1) hash probe
        set_time = timeit.timeit(lambda: test_item in data_set, number=1000)

        return {
            'list_time': list_time,
            'set_time': set_time,
            'improvement': list_time / set_time
        }

    @staticmethod
    def check_comprehension_vs_loop():
        """Compare an explicit append loop against a list comprehension."""
        numbers = range(10000)

        def with_loop():
            out = []
            for x in numbers:
                out.append(x * x)
            return out

        loop_time = timeit.timeit(with_loop, number=100)
        comprehension_time = timeit.timeit(
            lambda: [x * x for x in numbers],
            number=100
        )

        return {'comprehension_time': comprehension_time, 'loop_time': loop_time}

    @staticmethod
    def check_string_operations():
        """Compare ``+=`` string concatenation against ``''.join``."""
        strings = [f"string_{i}" for i in range(1000)]

        def with_concat():
            acc = ''
            for s in strings:
                acc += s
            return acc

        concat_time = timeit.timeit(with_concat, number=100)
        join_time = timeit.timeit(lambda: ''.join(strings), number=100)

        return {'concat_time': concat_time, 'join_time': join_time}

# Usage
# NOTE: check_list_vs_set_membership is a @staticmethod, so
# PerformanceChecker.check_list_vs_set_membership() would work without
# creating an instance.
checker = PerformanceChecker()
results = checker.check_list_vs_set_membership()
print(f"Set vs List improvement: {results['improvement']:.2f}x faster")

Performance Monitoring

import psutil
import time
from contextlib import contextmanager

@contextmanager
def performance_monitor():
    """Context manager printing wall time, RSS delta, and CPU% delta for its body."""
    # Snapshot process stats before the body runs
    proc = psutil.Process()
    mem_before = proc.memory_info().rss
    cpu_before = proc.cpu_percent()
    started = time.time()

    try:
        yield
    finally:
        # Snapshot again and report the deltas, even if the body raised
        execution_time = time.time() - started
        mem_after = proc.memory_info().rss
        cpu_after = proc.cpu_percent()

        memory_used = (mem_after - mem_before) / 1024 / 1024  # MB
        cpu_usage = cpu_after - cpu_before

        print(f"Execution time: {execution_time:.4f}s")
        print(f"Memory used: {memory_used:.2f} MB")
        print(f"CPU usage: {cpu_usage:.2f}%")

# Usage
with performance_monitor():
    # Your code here — the monitor prints time, memory delta, and CPU% when
    # the block exits (including on exception, via its finally clause).
    result = sum(i * i for i in range(100000))

🔗 Language-Specific Performance