Python Performance Tips¶
This guide covers Python performance optimization techniques, best practices, and tools for writing efficient Python code.
🎯 Core Performance Principles¶
Python Performance Characteristics¶
- Interpreted Language: Slower than compiled languages
- Dynamic Typing: Runtime type checking overhead
- GIL Limitation: the Global Interpreter Lock prevents CPU-bound threads from executing Python bytecode in parallel; I/O-bound threads still overlap, because the GIL is released during blocking I/O
- Rich Ecosystem: Many optimized libraries available
Measurement First¶
- Profile Before Optimizing: Use profilers to identify bottlenecks
- Benchmark Properly: Use timeit and pytest-benchmark
- Measure in Context: Profile realistic workloads
- Set Performance Goals: Define measurable targets
🚀 Algorithm Optimization¶
Choose Right Data Structures¶
# Bad: List for membership testing (O(n))
def has_permission_list(user_permissions, permission):
    """Return True if *permission* is present (linear scan — O(n) per call)."""
    return permission in user_permissions  # O(n) lookup


# Good: Set for membership testing (O(1)).
# NOTE: build the set ONCE and reuse it — converting list -> set on every
# call costs O(n) and erases the benefit of the O(1) lookup (this was the
# bug in the original version of this example).
def has_permission_set(user_permissions, permission):
    """Return True if *permission* is present (hashed lookup — O(1)).

    Pass a pre-built set for best performance; any other iterable is
    converted once as a convenience fallback (which costs O(n)).
    """
    if not isinstance(user_permissions, set):
        user_permissions = set(user_permissions)  # fallback conversion, O(n)
    return permission in user_permissions


# Performance comparison
import timeit

permissions = [f"perm_{i}" for i in range(10000)]
permission_set = set(permissions)  # built once, OUTSIDE the timed loop
test_permission = "perm_5000"

# List lookup: O(n) per membership test
list_time = timeit.timeit(
    lambda: test_permission in permissions,
    number=1000
)

# Set lookup: O(1) per membership test. The set is pre-built above —
# calling set(permissions) inside the lambda would time the O(n)
# conversion instead of the lookup.
set_time = timeit.timeit(
    lambda: test_permission in permission_set,
    number=1000
)

print(f"List: {list_time:.6f}s, Set: {set_time:.6f}s")
Use Built-in Functions¶
# Bad: hand-rolled accumulation loop
def sum_manual(numbers):
    """Add up *numbers* with an explicit Python-level loop."""
    running = 0
    for value in numbers:
        running = running + value
    return running


# Good: Built-in sum function
def sum_builtin(numbers):
    """Add up *numbers* using the C-implemented builtin."""
    return sum(numbers)  # Optimized C implementation


# Bad: hand-rolled maximum search
def max_manual(numbers):
    """Return the largest element by scanning (IndexError on empty input)."""
    best = numbers[0]
    for candidate in numbers[1:]:
        if candidate > best:
            best = candidate
    return best


# Good: Built-in max function
def max_builtin(numbers):
    """Return the largest element using the C-implemented builtin."""
    return max(numbers)  # Optimized C implementation
List Comprehensions and Generators¶
# Bad: imperative append loop for a pure construction
def squares_bad(numbers):
    """Build a list of squares with an explicit loop + append."""
    squared = []
    for value in numbers:
        squared.append(value * value)
    return squared


# Good: List comprehension
def squares_good(numbers):
    """Build a list of squares with a comprehension."""
    return [value * value for value in numbers]  # More efficient


# Better: Generator for large datasets
def squares_generator(numbers):
    """Return a lazy iterator of squares — O(1) memory for any input size."""
    return (value * value for value in numbers)  # Memory efficient


# Memory comparison. NOTE: sys.getsizeof is shallow — it measures the
# container object only, not the integers it references.
import sys

large_list = list(range(1000000))

# The comprehension materializes all one million results at once
list_comp = [x * x for x in large_list]
print(f"List comprehension: {sys.getsizeof(list_comp)} bytes")

# A generator object is a small fixed-size handle, independent of length
gen_comp = (x * x for x in large_list)
print(f"Generator: {sys.getsizeof(gen_comp)} bytes")
📊 Memory Optimization¶
Memory-Efficient Data Structures¶
# Option 1: bare tuples — compact, but access is positional only
def create_user_list_bad():
    """Build 100k (username, email, id) tuples."""
    return [(f"user_{i}", f"email_{i}@example.com", i) for i in range(100000)]


# Option 2: a class with __slots__ — named attributes without the
# per-instance __dict__ a regular class would carry
class User:
    """Lightweight user record with slotted attributes."""

    __slots__ = ['username', 'email', 'id']  # suppresses per-instance __dict__

    def __init__(self, username, email, user_id):
        self.username = username
        self.email = email
        self.id = user_id


def create_user_list_good():
    """Build 100k slotted User objects."""
    return [User(f"user_{i}", f"email_{i}@example.com", i) for i in range(100000)]


# Option 3: namedtuple — tuple storage plus named field access
from collections import namedtuple

UserTuple = namedtuple('UserTuple', ['username', 'email', 'id'])


def create_user_list_tuple():
    """Build 100k UserTuple records."""
    records = []
    for i in range(100000):
        records.append(UserTuple(f"user_{i}", f"email_{i}@example.com", i))
    return records
Generator Usage¶
# Bad: Loading entire file into memory
def process_large_file_bad(filename):
    """Read the whole file at once; return stripped, upper-cased lines."""
    with open(filename, 'r') as file:
        lines = file.readlines()  # Loads entire file
        return [line.strip().upper() for line in lines]


# Good: Process line by line
def process_large_file_good(filename):
    """Yield stripped, upper-cased lines one at a time (constant memory)."""
    with open(filename, 'r') as file:
        for line in file:  # One line at a time
            yield line.strip().upper()


# Fixed: the original returned a generator EXPRESSION from inside the
# `with` block, so the file was closed the moment the function returned
# and consuming the generator raised "I/O operation on closed file".
# `yield from` turns this into a generator function, which keeps the
# file open until iteration finishes.
def process_large_file_generator(filename):
    """Yield stripped, upper-cased lines lazily; file stays open while iterating."""
    with open(filename, 'r') as file:
        yield from (line.strip().upper() for line in file)
# Usage: stream the file through the generator — only one line is ever
# resident in memory, regardless of file size.
# NOTE(review): assumes "large_file.txt" exists in the working directory.
for processed_line in process_large_file_good("large_file.txt"):
    # Process each line without loading entire file
    pass
Memory Profiling¶
import tracemalloc
import sys
from memory_profiler import profile
# Memory profiling with tracemalloc (stdlib allocation tracker)
def memory_intensive_function():
    """Allocate a million-int list and print tracemalloc statistics."""
    tracemalloc.start()  # begin recording allocations from this point

    payload = [i for i in range(1000000)]

    # current = live traced bytes, peak = high-water mark since start()
    current, peak = tracemalloc.get_traced_memory()
    print(f"Current memory usage: {current / 1024 / 1024:.2f} MB")
    print(f"Peak memory usage: {peak / 1024 / 1024:.2f} MB")

    tracemalloc.stop()
    return payload
# Memory profiling with memory_profiler (third-party package: the
# @profile decorator prints a line-by-line memory report when the
# function runs)
@profile
def memory_profile_function():
    """Build 100k small dicts so the profiler has allocations to report."""
    # Create large data structures
    data = []
    for i in range(100000):
        data.append({
            'id': i,
            'name': f"item_{i}",
            'value': i * 2
        })
    return data
🔄 Concurrency and Parallelism¶
Multiprocessing for CPU-Bound Tasks¶
import multiprocessing
import time
from concurrent.futures import ProcessPoolExecutor
# CPU-bound task: deliberately slow exponential recursion
def fibonacci(n):
    """Return the n-th Fibonacci number (naive O(2^n) recursion)."""
    if n <= 1:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)


# Bad: one number at a time on a single core
def fibonacci_sequential(numbers):
    """Compute fibonacci(n) for each n, sequentially."""
    results = []
    for n in numbers:
        results.append(fibonacci(n))
    return results


# Good: fan the work out across CPU cores — separate processes each
# have their own interpreter, so the GIL is not a bottleneck
def fibonacci_parallel(numbers):
    """Compute fibonacci(n) for each n using a process pool."""
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        return list(executor.map(fibonacci, numbers))


# Performance comparison (guarded: multiprocessing requires an
# importable __main__ module, and this benchmark takes many seconds)
if __name__ == "__main__":
    numbers = [35, 36, 37, 38, 39, 40]

    # Sequential execution
    start = time.time()
    result_sequential = fibonacci_sequential(numbers)
    sequential_time = time.time() - start

    # Parallel execution
    start = time.time()
    result_parallel = fibonacci_parallel(numbers)
    parallel_time = time.time() - start

    print(f"Sequential: {sequential_time:.2f}s")
    print(f"Parallel: {parallel_time:.2f}s")
    print(f"Speedup: {sequential_time / parallel_time:.2f}x")
Threading for I/O-Bound Tasks¶
import threading
import time
import requests
from concurrent.futures import ThreadPoolExecutor
# I/O-bound task
def fetch_url(url):
    """Return the response body length for *url*, or 0 on any request error."""
    try:
        response = requests.get(url, timeout=5)
        return len(response.content)
    except requests.RequestException:
        # Best-effort: timeouts / connection errors count as empty responses
        return 0

# Bad: Sequential execution — each request waits for the previous one
def fetch_urls_sequential(urls):
    """Fetch every URL one after another; returns a list of content lengths."""
    results = []
    for url in urls:
        results.append(fetch_url(url))
    return results

# Good: Parallel execution with threading — the GIL is released while a
# thread blocks on network I/O, so the waits overlap
def fetch_urls_parallel(urls):
    """Fetch all URLs concurrently on up to 10 worker threads."""
    with ThreadPoolExecutor(max_workers=10) as executor:
        return list(executor.map(fetch_url, urls))

# Usage — each endpoint delays ~1s server-side, so the sequential run
# takes ~3s and the threaded run ~1s.
# NOTE(review): this benchmark hits the live network at import time.
urls = [
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1"
]

# Sequential execution
start_time = time.time()
results_sequential = fetch_urls_sequential(urls)
sequential_time = time.time() - start_time

# Parallel execution
start_time = time.time()
results_parallel = fetch_urls_parallel(urls)
parallel_time = time.time() - start_time

print(f"Sequential: {sequential_time:.2f}s")
print(f"Parallel: {parallel_time:.2f}s")
Asyncio for Asynchronous Programming¶
import asyncio
import aiohttp
import time
# Async I/O-bound task
async def fetch_url_async(session, url):
    """Return the response body length for *url*, or 0 on any error."""
    try:
        async with session.get(url, timeout=5) as response:
            return len(await response.read())
    except Exception:
        # Best-effort: swallow network/timeout errors and report 0 bytes
        return 0

# Async execution: one event loop, all requests in flight at once
async def fetch_urls_async(urls):
    """Fetch all URLs concurrently inside a single aiohttp session."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url_async(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Usage
async def main():
    """Benchmark the async fetch against three ~1s-delay endpoints."""
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/1"
    ]
    start_time = time.time()
    results = await fetch_urls_async(urls)
    async_time = time.time() - start_time
    print(f"Async: {async_time:.2f}s")
    return results

# Run async function (guard keeps the benchmark from running on import)
if __name__ == "__main__":
    asyncio.run(main())
🛠️ Caching and Memoization¶
Function Caching¶
from functools import lru_cache
import time
# Bad: Recalculating expensive operations on every call
def expensive_function(x):
    """Square *x* after a simulated 1-second computation."""
    time.sleep(1)  # Simulate expensive computation
    return x * x

# Good: lru_cache memoizes results — repeat calls with an argument seen
# before return instantly from the (bounded, 128-entry) cache
@lru_cache(maxsize=128)
def cached_expensive_function(x):
    """Square *x*; results are memoized by lru_cache."""
    time.sleep(1)  # Simulate expensive computation
    return x * x

# Performance comparison
def test_performance():
    """Time five calls with and without the cache and print both totals.

    NOTE(review): both loops use five DISTINCT arguments, so the cached
    version only wins when its cache is already warm (e.g. on a second
    run of this function).
    """
    # Without cache
    start_time = time.time()
    for i in range(5):
        result = expensive_function(i)
    no_cache_time = time.time() - start_time
    # With cache
    start_time = time.time()
    for i in range(5):
        result = cached_expensive_function(i)
    with_cache_time = time.time() - start_time
    print(f"No cache: {no_cache_time:.2f}s")
    print(f"With cache: {with_cache_time:.2f}s")
# Custom cache implementation with true LRU eviction.
# Fixed: the original evicted the oldest-INSERTED key (FIFO) while its
# comment claimed LRU — a `get` never refreshed an entry's recency.
class SimpleCache:
    """Bounded key/value cache that evicts the least-recently-used entry."""

    def __init__(self, max_size=128):
        # dicts preserve insertion order (guaranteed since Python 3.7);
        # we keep the most-recently-used key at the END of the dict.
        self.cache = {}
        self.max_size = max_size

    def get(self, key):
        """Return the cached value (refreshing its recency), or None on miss."""
        if key not in self.cache:
            return None
        value = self.cache.pop(key)
        self.cache[key] = value  # re-insert -> now most recently used
        return value

    def set(self, key, value):
        """Store *key* -> *value*, evicting the LRU entry when full."""
        if key in self.cache:
            self.cache.pop(key)  # overwrite also refreshes recency
        elif len(self.cache) >= self.max_size:
            lru_key = next(iter(self.cache))  # front of dict = least recent
            del self.cache[lru_key]
        self.cache[key] = value
# Use custom cache (module-level instance shared by all calls)
cache = SimpleCache()

def custom_cached_function(x):
    """Square *x*, consulting the module-level cache first.

    NOTE(review): a cached value of None would be indistinguishable
    from a miss here; fine for x*x (never None), but not a general
    caching pattern.
    """
    cached_result = cache.get(x)
    if cached_result is not None:
        return cached_result
    result = x * x  # Expensive computation
    cache.set(x, result)
    return result
Data Caching¶
import redis
import json
import pickle
from datetime import timedelta
# Redis caching: a process-external cache shared across workers/hosts
class RedisCache:
    """Thin pickle-based wrapper around a Redis connection."""

    def __init__(self, host='localhost', port=6379):
        # decode_responses=False: values are raw pickled bytes, not text
        self.redis_client = redis.Redis(host=host, port=port, decode_responses=False)

    def get(self, key):
        """Return the unpickled value for *key*, or None on miss/error."""
        try:
            data = self.redis_client.get(key)
            # SECURITY NOTE: pickle.loads executes arbitrary code during
            # deserialization — only safe while the Redis instance and
            # everything that writes to it are fully trusted.
            return pickle.loads(data) if data else None
        except Exception:
            return None

    def set(self, key, value, expire_seconds=3600):
        """Store *value* under *key* with a TTL; returns False on error."""
        try:
            data = pickle.dumps(value)
            return self.redis_client.setex(key, expire_seconds, data)
        except Exception:
            return False

    def delete(self, key):
        """Delete *key*; returns False on error."""
        try:
            return self.redis_client.delete(key)
        except Exception:
            return False
# Usage: cache-aside pattern — read through the cache, fall back to the
# data source, then populate the cache
cache = RedisCache()

def get_user_data(user_id):
    """Return user data, serving from the cache when possible (5-min TTL)."""
    # Try cache first
    # NOTE(review): truthiness test — an empty cached dict would count
    # as a miss; `is not None` would be the stricter check.
    cached_data = cache.get(f"user:{user_id}")
    if cached_data:
        return cached_data
    # Fetch from database (simulated)
    user_data = {
        'id': user_id,
        'name': f"User {user_id}",
        'email': f"user{user_id}@example.com"
    }
    # Cache the result
    cache.set(f"user:{user_id}", user_data, expire_seconds=300)
    return user_data
📈 Profiling and Optimization¶
Performance Profiling¶
import cProfile
import pstats
import timeit
from line_profiler import LineProfiler
# Time-based profiling: timeit runs the callable repeatedly and reports
# total wall-clock time
def profile_function():
    """Benchmark a sum-of-squares loop with timeit and print the result."""
    def expensive_operation():
        acc = 0
        for k in range(1000000):
            acc += k * k
        return acc

    # 10 timed repetitions of the whole operation
    elapsed = timeit.timeit(expensive_operation, number=10)
    print(f"Function took {elapsed:.4f} seconds for 10 runs")
# Profile-based analysis: cProfile records per-function call counts and
# cumulative times
def profile_with_cprofile():
    """Profile a small workload and print the ten hottest functions."""
    def complex_function():
        values = []
        for i in range(10000):
            multiplier = 2 if i % 2 == 0 else 3
            values.append(i * multiplier)
        return sum(values)

    profiler = cProfile.Profile()
    profiler.enable()
    complex_function()
    profiler.disable()

    # 'cumulative' = time in a function including everything it calls
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    stats.print_stats(10)  # Top 10 functions
# Line-by-line profiling (third-party line_profiler package: reports the
# time spent on each individual source line of the profiled function)
def profile_lines():
    """Run a small loop under LineProfiler and print per-line timings."""
    lp = LineProfiler()

    @lp.profile
    def line_intensive_function():
        total = 0
        for i in range(10000):
            if i % 2 == 0:
                total += i * 2
            else:
                total += i * 3
        return total

    line_intensive_function()
    lp.print_stats()
Memory Profiling¶
import gc
import sys
import tracemalloc
from pympler import asizeof
from pympler import muppy
from pympler import summary
# Object size measurement: pympler's asizeof follows references (deep
# size); sys.getsizeof measures only the container itself (shallow)
def measure_object_sizes():
    """Print deep vs shallow sizes of a small and a large list."""
    small_list = [1, 2, 3]
    large_list = list(range(10000))
    print(f"Small list size: {asizeof.asizeof(small_list)} bytes")
    print(f"Large list size: {asizeof.asizeof(large_list)} bytes")
    print(f"Large list sys.sizeof: {sys.getsizeof(large_list)} bytes")
# Memory leak detection: diff two tracemalloc snapshots to see which
# source lines gained or released memory between them
def detect_memory_leaks():
    """Allocate, partially free, then print the top-10 allocation deltas."""
    tracemalloc.start()

    payloads = []
    for idx in range(1000):
        payloads.append({'id': idx, 'data': 'x' * 100})

    snapshot_before = tracemalloc.take_snapshot()

    del payloads[:500]   # drop half the objects
    gc.collect()         # make the release visible immediately

    snapshot_after = tracemalloc.take_snapshot()

    # Positive deltas = lines still growing; negative = memory released
    for stat in snapshot_after.compare_to(snapshot_before, 'lineno')[:10]:
        print(stat)
# Memory summary (third-party pympler): snapshot every live object and
# print a per-type count/size table
def memory_summary():
    """Print a type-by-type summary of all objects the GC can see."""
    # Get all objects currently tracked
    all_objects = muppy.get_objects()
    # Summarize by type (count and total size per type)
    sum1 = summary.summarize(all_objects)
    # Print summary table
    summary.print_(sum1)
🎯 Optimization Best Practices¶
Code Optimization Checklist¶
# Optimization checklist implementation
class PerformanceChecker:
    """Micro-benchmarks demonstrating common Python optimization wins."""

    @staticmethod
    def check_list_vs_set_membership():
        """Compare list vs set membership cost.

        Returns a dict with 'list_time', 'set_time', and 'improvement'
        (how many times faster the set lookup is). Fixed: the set is now
        built ONCE outside the timed lambda — the original converted
        list -> set on every timed call, measuring the O(n) conversion
        instead of the O(1) lookup.
        """
        data = list(range(10000))
        data_set = set(data)  # built once, outside the timed code
        test_item = 5000

        # List membership test: O(n) scan per call
        list_time = timeit.timeit(
            lambda: test_item in data,
            number=1000
        )
        # Set membership test: O(1) hash lookup per call
        set_time = timeit.timeit(
            lambda: test_item in data_set,
            number=1000
        )
        return {
            'list_time': list_time,
            'set_time': set_time,
            'improvement': list_time / set_time
        }

    @staticmethod
    def check_comprehension_vs_loop():
        """Compare an explicit append loop against a list comprehension.

        Fixed: the original timed a comprehension while labelling it a
        loop. Returns both timings; the 'comprehension_time' key is kept
        for backward compatibility.
        """
        numbers = range(10000)

        def squares_loop():
            out = []
            for x in numbers:
                out.append(x * x)
            return out

        loop_time = timeit.timeit(squares_loop, number=100)
        comprehension_time = timeit.timeit(
            lambda: [x * x for x in numbers],
            number=100
        )
        return {
            'loop_time': loop_time,
            'comprehension_time': comprehension_time
        }

    @staticmethod
    def check_string_operations():
        """Compare '+='-style concatenation against a single str.join.

        Fixed: the original labelled a ''.join call as '+' concatenation.
        Returns both timings; 'concat_time' is kept for backward
        compatibility and now genuinely measures '+=' concatenation.
        """
        strings = [f"string_{i}" for i in range(1000)]

        def concat_plus():
            out = ""
            for s in strings:
                out += s
            return out

        concat_time = timeit.timeit(concat_plus, number=100)
        join_time = timeit.timeit(lambda: ''.join(strings), number=100)
        return {
            'concat_time': concat_time,
            'join_time': join_time
        }
# Usage: report how much faster set membership is on this machine.
# NOTE: this benchmark runs at import time.
checker = PerformanceChecker()
results = checker.check_list_vs_set_membership()
print(f"Set vs List improvement: {results['improvement']:.2f}x faster")
Performance Monitoring¶
import psutil
import time
from contextlib import contextmanager
@contextmanager
def performance_monitor():
    """Context manager that prints wall time, RSS delta, and CPU usage
    for the wrapped code block (requires third-party psutil)."""
    # Get initial stats
    process = psutil.Process()
    initial_memory = process.memory_info().rss
    # First cpu_percent() call primes psutil's counter (returns 0.0)
    initial_cpu = process.cpu_percent()
    start_time = time.time()
    try:
        yield
    finally:
        # Get final stats — runs even if the wrapped block raises
        end_time = time.time()
        final_memory = process.memory_info().rss
        final_cpu = process.cpu_percent()
        # Calculate differences
        execution_time = end_time - start_time
        memory_used = (final_memory - initial_memory) / 1024 / 1024  # MB
        # NOTE(review): cpu_percent() reports utilization since the last
        # call, so subtracting the primed value is approximate at best.
        cpu_usage = final_cpu - initial_cpu
        print(f"Execution time: {execution_time:.4f}s")
        print(f"Memory used: {memory_used:.2f} MB")
        print(f"CPU usage: {cpu_usage:.2f}%")
# Usage: wrap any code in the monitor to get timing/memory/CPU output
with performance_monitor():
    # Your code here (replace with the workload you want measured)
    result = sum(i * i for i in range(100000))
📚 Related Resources¶
- Python Best Practices - Write efficient Python code
- Python Common Mistakes - Avoid performance pitfalls
- Python Testing Frameworks - Performance testing
- Python Resources - Performance tools and documentation
🔗 Related Performance Guides¶
- Time Complexity - Algorithm analysis
- Space Complexity - Memory optimization
- Algorithm Analysis - Performance measurement
- Optimization Techniques - General optimization strategies
🔗 Language-Specific Performance¶
- Java Performance Tips - Java optimization
- C Performance Tips - C optimization
- Oracle Performance Tips - Oracle optimization