PostgreSQL Best Practices

Official PostgreSQL guidelines - Schema Design, Indexing, Query Optimization, Transactions, Advanced SQL

Core Principles

•ALWAYS Enable Constraints - Use PRIMARY KEY, FOREIGN KEY, UNIQUE, CHECK, NOT NULL for data integrity
•Index Strategically - Choose the right index type (B-tree, GIN, GiST, Hash) based on query patterns
•Use EXPLAIN ANALYZE - Analyze query plans to identify performance bottlenecks
•Leverage MVCC - Understand Multi-Version Concurrency Control for optimal transaction management
•Apply Normalization - Use 3NF for OLTP, denormalize selectively for performance
•Connection Pooling - Use PgBouncer or Supavisor to manage connections efficiently
•Monitor Performance - Track slow queries with pg_stat_statements

Schema Design - Complete Guide

Data Types

✅ Good: Use Appropriate Data Types

sql

-- Use specific data types for better storage and performance
CREATE TABLE users (
  id BIGSERIAL PRIMARY KEY,                    -- Auto-incrementing 64-bit integer
  email VARCHAR(255) NOT NULL UNIQUE,          -- Variable-length string with limit
  username VARCHAR(50) NOT NULL UNIQUE,        -- Shorter string for usernames
  age INTEGER CHECK (age >= 0 AND age <= 150), -- Integer with range constraint
  balance NUMERIC(10, 2),                      -- Exact decimal for money (10 digits, 2 decimal places)
  is_active BOOLEAN DEFAULT true,              -- Boolean for flags
  metadata JSONB,                               -- Binary JSON for flexible data
  tags TEXT[],                                  -- Array of strings
  created_at TIMESTAMPTZ DEFAULT NOW(),        -- Timestamp with timezone
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Use UUID for distributed systems
CREATE TABLE sessions (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id BIGINT NOT NULL REFERENCES users(id),
  token TEXT NOT NULL,
  expires_at TIMESTAMPTZ NOT NULL
);

-- Use ENUM for fixed sets of values
CREATE TYPE order_status AS ENUM ('pending', 'processing', 'shipped', 'delivered', 'cancelled');

CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  user_id BIGINT NOT NULL REFERENCES users(id),
  status order_status DEFAULT 'pending',
  total NUMERIC(10, 2) NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

❌ Bad: Inappropriate Data Types

sql

-- ❌ Using TEXT for everything (wastes storage, slower comparisons)
CREATE TABLE users (
  id TEXT,
  email TEXT,
  age TEXT,        -- Should be INTEGER
  balance TEXT,    -- Should be NUMERIC
  is_active TEXT,  -- Should be BOOLEAN
  created_at TEXT  -- Should be TIMESTAMPTZ
);

-- ❌ Using FLOAT for money (precision errors)
CREATE TABLE transactions (
  amount FLOAT  -- ❌ NEVER use FLOAT for money, use NUMERIC
);

-- ❌ Using VARCHAR without limit
CREATE TABLE posts (
  content VARCHAR  -- ❌ Specify limit or use TEXT
);

-- ❌ Using TIMESTAMP instead of TIMESTAMPTZ
CREATE TABLE events (
  occurred_at TIMESTAMP  -- ❌ No timezone info, use TIMESTAMPTZ
);

Primary Keys and Foreign Keys

✅ Good: Proper Primary and Foreign Keys

sql

-- Use BIGSERIAL for auto-incrementing IDs
CREATE TABLE customers (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  email VARCHAR(255) NOT NULL UNIQUE,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Foreign key with proper referential actions
CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT NOT NULL,
  total NUMERIC(10, 2) NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  
  -- Foreign key with ON DELETE CASCADE (delete orders when customer is deleted)
  CONSTRAINT fk_customer
    FOREIGN KEY (customer_id)
    REFERENCES customers(id)
    ON DELETE CASCADE
    ON UPDATE CASCADE
);

-- Composite primary key
CREATE TABLE order_items (
  order_id BIGINT NOT NULL,
  product_id BIGINT NOT NULL,
  quantity INTEGER NOT NULL CHECK (quantity > 0),
  price NUMERIC(10, 2) NOT NULL,
  
  PRIMARY KEY (order_id, product_id),
  
  FOREIGN KEY (order_id) REFERENCES orders(id) ON DELETE CASCADE,
  FOREIGN KEY (product_id) REFERENCES products(id) ON DELETE RESTRICT
);

-- Use UUID for distributed systems or multi-tenant apps
CREATE TABLE tenants (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  name VARCHAR(255) NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE TABLE tenant_users (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  tenant_id UUID NOT NULL REFERENCES tenants(id) ON DELETE CASCADE,
  email VARCHAR(255) NOT NULL,
  UNIQUE (tenant_id, email)  -- Unique per tenant
);

❌ Bad: Missing or Improper Keys

sql

-- ❌ No primary key
CREATE TABLE logs (
  user_id BIGINT,
  action TEXT,
  created_at TIMESTAMPTZ
);  -- ❌ Should have id or composite key

-- ❌ No foreign key constraint (orphaned records possible)
CREATE TABLE comments (
  id BIGSERIAL PRIMARY KEY,
  post_id BIGINT,  -- ❌ No REFERENCES constraint
  content TEXT
);

-- ❌ Wrong ON DELETE action
CREATE TABLE invoices (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT REFERENCES customers(id) ON DELETE CASCADE
);  -- ❌ CASCADE may not be appropriate for invoices

-- ❌ Using INTEGER for large tables (will overflow at 2.1B)
CREATE TABLE events (
  id SERIAL PRIMARY KEY  -- ❌ Use BIGSERIAL for tables that grow large
);

Constraints

✅ Good: Comprehensive Constraints

sql

CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  sku VARCHAR(50) NOT NULL UNIQUE,
  name VARCHAR(255) NOT NULL,
  description TEXT,
  price NUMERIC(10, 2) NOT NULL CHECK (price >= 0),
  cost NUMERIC(10, 2) CHECK (cost >= 0),
  stock INTEGER NOT NULL DEFAULT 0 CHECK (stock >= 0),
  category VARCHAR(50) NOT NULL,
  is_active BOOLEAN DEFAULT true,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW(),
  
  -- Table-level constraint
  CONSTRAINT price_greater_than_cost CHECK (price >= cost)
);

-- Email validation constraint
CREATE TABLE subscribers (
  id BIGSERIAL PRIMARY KEY,
  email VARCHAR(255) NOT NULL UNIQUE,
  
  CONSTRAINT email_format CHECK (email ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$')
);

-- Date range constraints
CREATE TABLE bookings (
  id BIGSERIAL PRIMARY KEY,
  room_id BIGINT NOT NULL REFERENCES rooms(id),
  check_in DATE NOT NULL,
  check_out DATE NOT NULL,
  
  CONSTRAINT valid_date_range CHECK (check_out > check_in)
);

-- Mutually exclusive columns (use CHECK constraint)
CREATE TABLE payments (
  id BIGSERIAL PRIMARY KEY,
  amount NUMERIC(10, 2) NOT NULL,
  credit_card_id BIGINT REFERENCES credit_cards(id),
  paypal_email VARCHAR(255),
  
  -- Ensure exactly one payment method is provided
  CONSTRAINT one_payment_method CHECK (
    (credit_card_id IS NOT NULL AND paypal_email IS NULL) OR
    (credit_card_id IS NULL AND paypal_email IS NOT NULL)
  )
);

❌ Bad: Missing or Weak Constraints

sql

-- ❌ No constraints, allows invalid data
CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255),        -- ❌ Should be NOT NULL
  price NUMERIC(10, 2),     -- ❌ Should be NOT NULL and CHECK >= 0
  stock INTEGER             -- ❌ Should have CHECK >= 0
);

-- ❌ Duplicate emails allowed
CREATE TABLE users (
  id BIGSERIAL PRIMARY KEY,
  email VARCHAR(255) NOT NULL  -- ❌ Missing UNIQUE constraint
);

-- ❌ No validation on enum-like columns
CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  status VARCHAR(50)  -- ❌ Should use ENUM or CHECK constraint
);

Normalization

✅ Good: Properly Normalized Schema (3NF)

sql

-- Third Normal Form (3NF): No transitive dependencies

-- 1. Customers table
CREATE TABLE customers (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  email VARCHAR(255) NOT NULL UNIQUE,
  phone VARCHAR(20),
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- 2. Addresses table (separate from customers)
CREATE TABLE addresses (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT NOT NULL REFERENCES customers(id) ON DELETE CASCADE,
  street VARCHAR(255) NOT NULL,
  city VARCHAR(100) NOT NULL,
  state VARCHAR(50) NOT NULL,
  zip_code VARCHAR(20) NOT NULL,
  country VARCHAR(100) NOT NULL,
  is_default BOOLEAN DEFAULT false
);

-- 3. Products table
CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  sku VARCHAR(50) NOT NULL UNIQUE,
  price NUMERIC(10, 2) NOT NULL CHECK (price >= 0),
  category_id BIGINT NOT NULL REFERENCES categories(id)
);

-- 4. Categories table (no redundancy)
CREATE TABLE categories (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(100) NOT NULL UNIQUE,
  description TEXT
);

-- 5. Orders table
CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT NOT NULL REFERENCES customers(id),
  shipping_address_id BIGINT NOT NULL REFERENCES addresses(id),
  total NUMERIC(10, 2) NOT NULL CHECK (total >= 0),
  status order_status DEFAULT 'pending',
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- 6. Order items (many-to-many relationship)
CREATE TABLE order_items (
  order_id BIGINT NOT NULL REFERENCES orders(id) ON DELETE CASCADE,
  product_id BIGINT NOT NULL REFERENCES products(id),
  quantity INTEGER NOT NULL CHECK (quantity > 0),
  price NUMERIC(10, 2) NOT NULL,  -- Snapshot price at order time
  
  PRIMARY KEY (order_id, product_id)
);

❌ Bad: Denormalized Schema with Redundancy

sql

-- ❌ First Normal Form violation (repeating groups)
CREATE TABLE orders_bad (
  id BIGSERIAL PRIMARY KEY,
  customer_name VARCHAR(255),
  product1_name VARCHAR(255),
  product1_quantity INTEGER,
  product1_price NUMERIC(10, 2),
  product2_name VARCHAR(255),     -- ❌ Repeating groups
  product2_quantity INTEGER,
  product2_price NUMERIC(10, 2)
);

-- ❌ Second Normal Form violation (partial dependencies)
CREATE TABLE order_items_bad (
  order_id BIGINT,
  product_id BIGINT,
  customer_name VARCHAR(255),     -- ❌ Depends on order_id, not full key
  product_name VARCHAR(255),      -- ❌ Depends on product_id, not full key
  quantity INTEGER,
  
  PRIMARY KEY (order_id, product_id)
);

-- ❌ Third Normal Form violation (transitive dependency)
CREATE TABLE employees_bad (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255),
  department_id BIGINT,
  department_name VARCHAR(255),   -- ❌ Depends on department_id, not id
  department_budget NUMERIC(10, 2)  -- ❌ Transitive dependency
);

Strategic Denormalization

✅ Good: Denormalization for Performance

sql

-- Add calculated/cached columns for frequently accessed data
CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT NOT NULL REFERENCES customers(id),
  
  -- Denormalized: cache customer name for faster queries
  customer_name VARCHAR(255) NOT NULL,
  
  -- Denormalized: cache total to avoid SUM on order_items
  item_count INTEGER NOT NULL DEFAULT 0,
  total NUMERIC(10, 2) NOT NULL DEFAULT 0,
  
  status order_status DEFAULT 'pending',
  created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Update denormalized fields with triggers
CREATE OR REPLACE FUNCTION update_order_totals()
RETURNS TRIGGER AS $$
BEGIN
  UPDATE orders
  SET 
    item_count = (SELECT COUNT(*) FROM order_items WHERE order_id = NEW.order_id),
    total = (SELECT COALESCE(SUM(quantity * price), 0) FROM order_items WHERE order_id = NEW.order_id)
  WHERE id = NEW.order_id;
  
  RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER order_items_update_totals
AFTER INSERT OR UPDATE OR DELETE ON order_items
FOR EACH ROW
EXECUTE FUNCTION update_order_totals();

-- Materialized views for complex aggregations
CREATE MATERIALIZED VIEW product_sales_summary AS
SELECT 
  p.id,
  p.name,
  p.category_id,
  COUNT(oi.order_id) AS order_count,
  SUM(oi.quantity) AS total_quantity_sold,
  SUM(oi.quantity * oi.price) AS total_revenue
FROM products p
LEFT JOIN order_items oi ON p.id = oi.product_id
GROUP BY p.id, p.name, p.category_id;

-- Create index on materialized view
CREATE INDEX idx_product_sales_category ON product_sales_summary(category_id);

-- Refresh periodically (e.g., via cron job)
REFRESH MATERIALIZED VIEW CONCURRENTLY product_sales_summary;

❌ Bad: Over-Denormalization

sql

-- ❌ Too much denormalization, hard to maintain consistency
CREATE TABLE orders_bad (
  id BIGSERIAL PRIMARY KEY,
  customer_id BIGINT REFERENCES customers(id),
  customer_name VARCHAR(255),
  customer_email VARCHAR(255),
  customer_phone VARCHAR(20),
  customer_address TEXT,        -- ❌ Too many duplicated fields
  customer_city VARCHAR(100),
  customer_state VARCHAR(50),
  customer_country VARCHAR(100)
);  -- ❌ Nightmare to keep in sync when customer updates profile

Indexing Strategies - Complete Guide

B-Tree Indexes (Default)

✅ Good: B-Tree Index Usage

sql

-- Single-column index for equality and range queries
CREATE INDEX idx_users_email ON users(email);

-- Query benefits from index
SELECT * FROM users WHERE email = 'user@example.com';
SELECT * FROM users WHERE email LIKE 'user%';  -- Prefix search
SELECT * FROM users WHERE created_at > '2024-01-01';
SELECT * FROM users ORDER BY email;  -- Index scan for sorting

-- Multi-column index (order matters!)
CREATE INDEX idx_orders_customer_created ON orders(customer_id, created_at DESC);

-- ✅ Uses index efficiently
SELECT * FROM orders WHERE customer_id = 123 ORDER BY created_at DESC;
SELECT * FROM orders WHERE customer_id = 123 AND created_at > '2024-01-01';

-- Partial index (index subset of rows)
CREATE INDEX idx_active_users ON users(email) WHERE is_active = true;

-- ✅ Uses partial index
SELECT * FROM users WHERE email = 'user@example.com' AND is_active = true;

-- Index on expression
CREATE INDEX idx_users_lower_email ON users(LOWER(email));

-- ✅ Uses expression index
SELECT * FROM users WHERE LOWER(email) = 'user@example.com';

-- Unique index
CREATE UNIQUE INDEX idx_users_username_unique ON users(LOWER(username));

-- Covering index (includes extra columns to avoid table lookup)
CREATE INDEX idx_orders_customer_covering ON orders(customer_id) INCLUDE (total, created_at);

-- ✅ Index-only scan (no table access needed)
SELECT customer_id, total, created_at FROM orders WHERE customer_id = 123;

❌ Bad: Inefficient B-Tree Index Usage

sql

-- ❌ Index on low-cardinality column (few distinct values)
CREATE INDEX idx_users_is_active ON users(is_active);  -- Only 2 values: true/false

-- ❌ Wrong column order in multi-column index
CREATE INDEX idx_orders_created_customer ON orders(created_at, customer_id);

-- ❌ Doesn't use index efficiently (customer_id not first)
SELECT * FROM orders WHERE customer_id = 123 AND created_at > '2024-01-01';

-- ❌ Index not used for wildcard search
SELECT * FROM users WHERE email LIKE '%@example.com';  -- ❌ Leading wildcard

-- ❌ Index not used for case-insensitive search
SELECT * FROM users WHERE email = 'USER@EXAMPLE.COM';  -- ❌ Case mismatch

-- ❌ Over-indexing (too many indexes slow down writes)
CREATE INDEX idx1 ON products(name);
CREATE INDEX idx2 ON products(sku);
CREATE INDEX idx3 ON products(category);
CREATE INDEX idx4 ON products(price);
CREATE INDEX idx5 ON products(created_at);
-- ❌ Each INSERT/UPDATE now updates 6 indexes!

GIN Indexes (Arrays, JSONB, Full-Text Search)

✅ Good: GIN Index for JSONB and Arrays

sql

-- GIN index on JSONB column
CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  attributes JSONB
);

CREATE INDEX idx_products_attributes ON products USING GIN (attributes);

-- ✅ Efficient JSONB queries
SELECT * FROM products WHERE attributes @> '{"color": "red"}';
SELECT * FROM products WHERE attributes ? 'size';  -- Key exists
SELECT * FROM products WHERE attributes ?| ARRAY['color', 'size'];  -- Any key exists
SELECT * FROM products WHERE attributes ?& ARRAY['color', 'size'];  -- All keys exist

-- GIN index on specific JSONB path
CREATE INDEX idx_products_attributes_color ON products USING GIN ((attributes -> 'color'));

-- GIN index on array column
CREATE TABLE posts (
  id BIGSERIAL PRIMARY KEY,
  title TEXT NOT NULL,
  tags TEXT[]
);

CREATE INDEX idx_posts_tags ON posts USING GIN (tags);

-- ✅ Efficient array queries
SELECT * FROM posts WHERE tags @> ARRAY['postgresql', 'database'];  -- Contains all
SELECT * FROM posts WHERE tags && ARRAY['postgresql', 'sql'];       -- Contains any
SELECT * FROM posts WHERE 'postgresql' = ANY(tags);                 -- Contains element

-- Full-text search with GIN
CREATE TABLE articles (
  id BIGSERIAL PRIMARY KEY,
  title TEXT NOT NULL,
  content TEXT NOT NULL,
  search_vector TSVECTOR GENERATED ALWAYS AS (
    to_tsvector('english', title || ' ' || content)
  ) STORED
);

CREATE INDEX idx_articles_search ON articles USING GIN (search_vector);

-- ✅ Fast full-text search
SELECT * FROM articles WHERE search_vector @@ to_tsquery('english', 'postgresql & performance');

-- Rank results by relevance
SELECT 
  id,
  title,
  ts_rank(search_vector, to_tsquery('english', 'postgresql & performance')) AS rank
FROM articles
WHERE search_vector @@ to_tsquery('english', 'postgresql & performance')
ORDER BY rank DESC;

❌ Bad: Missing or Improper GIN Indexes

sql

-- ❌ No GIN index on JSONB column
CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  attributes JSONB
);
-- ❌ Slow queries on attributes

SELECT * FROM products WHERE attributes @> '{"color": "red"}';  -- Seq scan

-- ❌ Using B-tree index on array (doesn't work well)
CREATE TABLE posts (
  id BIGSERIAL PRIMARY KEY,
  tags TEXT[]
);

CREATE INDEX idx_posts_tags ON posts(tags);  -- ❌ B-tree, not GIN

SELECT * FROM posts WHERE tags @> ARRAY['postgresql'];  -- Slow

GiST Indexes (Geometric, Range Types)

✅ Good: GiST Index for Spatial and Range Data

sql

-- GiST for geometric data (requires PostGIS extension)
CREATE EXTENSION IF NOT EXISTS postgis;

CREATE TABLE locations (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  coordinates GEOGRAPHY(POINT, 4326)
);

CREATE INDEX idx_locations_coords ON locations USING GIST (coordinates);

-- ✅ Efficient spatial queries
SELECT * FROM locations 
WHERE ST_DWithin(coordinates, ST_MakePoint(-122.4194, 37.7749)::geography, 5000);  -- Within 5km

-- GiST for range types
CREATE TABLE reservations (
  id BIGSERIAL PRIMARY KEY,
  room_id BIGINT NOT NULL,
  period TSTZRANGE NOT NULL,
  EXCLUDE USING GIST (room_id WITH =, period WITH &&)  -- Prevent overlapping reservations
);

-- ✅ Prevent double-booking
INSERT INTO reservations (room_id, period)
VALUES (1, '[2024-01-01 14:00, 2024-01-01 16:00)');

-- ❌ This will fail due to overlap
INSERT INTO reservations (room_id, period)
VALUES (1, '[2024-01-01 15:00, 2024-01-01 17:00)');  -- ERROR: conflicting key

-- Query overlapping ranges
SELECT * FROM reservations 
WHERE room_id = 1 AND period && '[2024-01-01, 2024-01-02)'::tstzrange;

❌ Bad: Missing GiST for Range Data

sql

-- ❌ No exclusion constraint, allows double-booking
CREATE TABLE reservations (
  id BIGSERIAL PRIMARY KEY,
  room_id BIGINT NOT NULL,
  check_in TIMESTAMPTZ NOT NULL,
  check_out TIMESTAMPTZ NOT NULL
);  -- ❌ Can have overlapping reservations

-- Manual check required (race conditions possible)

Hash Indexes

✅ Good: Hash Index for Equality Queries

sql

-- Hash index for exact equality (since PostgreSQL 10: crash-safe, replicable)
CREATE TABLE sessions (
  id UUID PRIMARY KEY,
  token TEXT NOT NULL,
  user_id BIGINT NOT NULL,
  expires_at TIMESTAMPTZ NOT NULL
);

CREATE INDEX idx_sessions_token ON sessions USING HASH (token);

-- ✅ Fast equality lookup
SELECT * FROM sessions WHERE token = 'abc123def456';

-- ✅ Use hash index for large text columns with equality queries
CREATE INDEX idx_files_hash ON files USING HASH (sha256_hash);

❌ Bad: Hash Index Misuse

sql

-- ❌ Hash index doesn't support range queries
CREATE INDEX idx_users_created ON users USING HASH (created_at);

-- ❌ Index not used
SELECT * FROM users WHERE created_at > '2024-01-01';  -- Seq scan

-- ❌ Hash index on low-cardinality column
CREATE INDEX idx_users_active ON users USING HASH (is_active);  -- Use B-tree or skip

Index Maintenance

✅ Good: Regular Index Maintenance

sql

-- Reindex to rebuild bloated indexes (offline operation)
REINDEX INDEX idx_users_email;
REINDEX TABLE users;  -- Reindex all indexes on table
REINDEX DATABASE mydb;  -- Reindex entire database

-- Concurrent reindex (online, no locks)
REINDEX INDEX CONCURRENTLY idx_users_email;

-- Vacuum to reclaim space and update statistics
VACUUM ANALYZE users;  -- Vacuum and update stats
VACUUM FULL users;     -- Aggressive vacuum (locks table)

-- Monitor index usage
SELECT 
  schemaname,
  tablename,
  indexname,
  idx_scan,  -- Number of index scans
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan ASC;  -- Find unused indexes

-- Drop unused indexes
SELECT 
  schemaname || '.' || tablename AS table,
  indexname,
  pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
  AND indexrelname NOT LIKE '%_pkey'  -- Keep primary keys
ORDER BY pg_relation_size(indexrelid) DESC;

-- Check index bloat
SELECT 
  schemaname,
  tablename,
  indexname,
  pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY pg_relation_size(indexrelid) DESC;

-- Auto-vacuum configuration (postgresql.conf)
-- autovacuum = on  (default)
-- autovacuum_max_workers = 3
-- autovacuum_naptime = 1min

❌ Bad: No Index Maintenance

sql

-- ❌ Never reindexing or vacuuming
-- Results in:
-- - Index bloat (wasted disk space)
-- - Slower queries (bloated indexes)
-- - Outdated statistics (poor query plans)
-- - Dead tuples accumulating

-- ❌ Disabling autovacuum
ALTER TABLE large_table SET (autovacuum_enabled = false);  -- ❌ Don't do this

Query Optimization - Complete Guide

EXPLAIN and EXPLAIN ANALYZE

✅ Good: Using EXPLAIN ANALYZE

sql

-- EXPLAIN shows the query plan (doesn't execute)
EXPLAIN
SELECT * FROM users WHERE email = 'user@example.com';

/*
Output:
Index Scan using idx_users_email on users  (cost=0.42..8.44 rows=1 width=100)
  Index Cond: (email = 'user@example.com')
*/

-- EXPLAIN ANALYZE executes and shows actual performance
EXPLAIN ANALYZE
SELECT * FROM orders 
WHERE customer_id = 123 
  AND created_at > '2024-01-01'
ORDER BY created_at DESC;

/*
Output:
Index Scan using idx_orders_customer_created on orders  
  (cost=0.42..123.45 rows=50 width=100) 
  (actual time=0.012..0.234 rows=42 loops=1)
  Index Cond: ((customer_id = 123) AND (created_at > '2024-01-01'))
Planning Time: 0.123 ms
Execution Time: 0.456 ms
*/

-- EXPLAIN with additional options
EXPLAIN (ANALYZE, BUFFERS, VERBOSE, COSTS, TIMING)
SELECT 
  o.id,
  o.total,
  c.name
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL '30 days';

-- Interpret key metrics:
-- - cost: estimated startup/total cost (lower is better)
-- - rows: estimated/actual row count
-- - width: average row size in bytes
-- - actual time: actual startup/total time in ms
-- - loops: number of times node was executed

❌ Bad: Ignoring Query Plans

sql

-- ❌ Writing queries without checking execution plan
SELECT * FROM orders WHERE EXTRACT(YEAR FROM created_at) = 2024;  
-- ❌ Function on column prevents index usage

-- ✅ Better: Use range query
SELECT * FROM orders WHERE created_at >= '2024-01-01' AND created_at < '2025-01-01';

-- ❌ Slow query without investigation
SELECT * FROM large_table WHERE status = 'active';  -- Seq scan on 10M rows
-- ✅ Check with EXPLAIN ANALYZE and add index if needed

Reading Query Plans

✅ Good: Understanding Scan Types

sql

-- Sequential Scan (reads entire table)
EXPLAIN ANALYZE
SELECT * FROM users WHERE last_login < NOW() - INTERVAL '1 year';
/*
Seq Scan on users  (cost=0.00..1234.56 rows=100 width=100)
  Filter: (last_login < (now() - '1 year'))
  Rows Removed by Filter: 9900
-- ✅ Seq scan OK for small tables or when returning most rows
*/

-- Index Scan (uses index to find rows, then fetches from table)
EXPLAIN ANALYZE
SELECT * FROM users WHERE email = 'user@example.com';
/*
Index Scan using idx_users_email on users  (cost=0.42..8.44 rows=1 width=100)
  Index Cond: (email = 'user@example.com')
-- ✅ Efficient for selective queries
*/

-- Index Only Scan (all data in index, no table access)
EXPLAIN ANALYZE
SELECT email FROM users WHERE email LIKE 'user%';
/*
Index Only Scan using idx_users_email on users  (cost=0.42..12.34 rows=10 width=50)
  Index Cond: (email >= 'user' AND email < 'usfs')
  Heap Fetches: 0
-- ✅ Most efficient: no table access needed
*/

-- Bitmap Index Scan (combines multiple indexes)
EXPLAIN ANALYZE
SELECT * FROM products 
WHERE category_id = 5 AND price > 100;
/*
Bitmap Heap Scan on products  (cost=12.34..234.56 rows=50 width=100)
  Recheck Cond: ((category_id = 5) AND (price > 100))
  ->  BitmapAnd  (cost=12.34..12.34 rows=50 width=0)
        ->  Bitmap Index Scan on idx_products_category  (cost=0.00..4.56 rows=100 width=0)
        ->  Bitmap Index Scan on idx_products_price  (cost=0.00..7.78 rows=200 width=0)
-- ✅ Combines two indexes efficiently
*/

❌ Bad: Ignoring Plan Warnings

sql

-- ❌ Large estimated vs actual row mismatch (outdated statistics)
EXPLAIN ANALYZE
SELECT * FROM orders WHERE status = 'pending';
/*
Seq Scan on orders  (cost=0.00..1234.56 rows=100 width=100) 
                    (actual time=0.123..45.678 rows=50000 loops=1)
-- ❌ Estimated 100 rows, actual 50000! Run ANALYZE
*/

-- ✅ Update statistics
ANALYZE orders;

-- ❌ High "Rows Removed by Filter" (missing index)
EXPLAIN ANALYZE
SELECT * FROM users WHERE username = 'john';
/*
Seq Scan on users  (cost=0.00..1234.56 rows=1 width=100)
  Filter: (username = 'john')
  Rows Removed by Filter: 99999
-- ❌ Scanned 100K rows to find 1! Add index on username
*/

Join Optimization

✅ Good: Efficient Joins

sql

-- Nested Loop Join (small result sets, indexed join column)
EXPLAIN ANALYZE
SELECT 
  o.id,
  c.name,
  o.total
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > NOW() - INTERVAL '7 days';
/*
Nested Loop  (cost=0.42..123.45 rows=10 width=100)
  ->  Index Scan using idx_orders_created on orders o  (cost=0.42..45.67 rows=10 width=100)
  ->  Index Scan using customers_pkey on customers c  (cost=0.42..7.78 rows=1 width=50)
-- ✅ Efficient for small result sets
*/

-- Hash Join (large result sets, equality conditions)
EXPLAIN ANALYZE
SELECT 
  o.id,
  c.name,
  SUM(oi.quantity * oi.price) AS total
FROM orders o
JOIN customers c ON o.customer_id = c.id
JOIN order_items oi ON o.id = oi.order_id
WHERE o.created_at > '2024-01-01'
GROUP BY o.id, c.name;
/*
Hash Join  (cost=1234.56..5678.90 rows=1000 width=100)
  Hash Cond: (o.customer_id = c.id)
  ->  Seq Scan on orders o  (cost=0.00..1234.56 rows=1000 width=100)
  ->  Hash  (cost=234.56..234.56 rows=5000 width=50)
        ->  Seq Scan on customers c  (cost=0.00..234.56 rows=5000 width=50)
-- ✅ Builds hash table in memory for fast lookups
*/

-- Merge Join (both inputs sorted, equality conditions)
EXPLAIN ANALYZE
SELECT 
  a.id,
  b.value
FROM large_table_a a
JOIN large_table_b b ON a.key = b.key
ORDER BY a.key;
/*
Merge Join  (cost=1234.56..5678.90 rows=10000 width=100)
  Merge Cond: (a.key = b.key)
  ->  Index Scan using idx_a_key on large_table_a a  (cost=0.42..2345.67 rows=10000 width=100)
  ->  Index Scan using idx_b_key on large_table_b b  (cost=0.42..2345.67 rows=10000 width=50)
-- ✅ Efficient when both sides are sorted by join key
*/

-- Use JOIN instead of subquery for better optimization
-- ✅ Good: JOIN
SELECT 
  c.name,
  COUNT(o.id) AS order_count
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id
GROUP BY c.id, c.name;

-- ❌ Bad: Correlated subquery (executes once per row)
SELECT 
  c.name,
  (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count
FROM customers c;

❌ Bad: Inefficient Joins

sql

-- ❌ Cross join (Cartesian product)
SELECT * FROM customers, orders;  -- ❌ Returns customers * orders rows

-- ❌ Missing join condition
SELECT * FROM customers c
JOIN orders o ON true  -- ❌ Cross join
WHERE c.id = 123;

-- ❌ Join on non-indexed column
SELECT * FROM orders o
JOIN customers c ON LOWER(o.customer_email) = LOWER(c.email);  
-- ❌ Function prevents index usage

-- ✅ Better: Index expression
CREATE INDEX idx_customers_email_lower ON customers(LOWER(email));

Subqueries vs CTEs vs JOINs

✅ Good: Choosing the Right Approach

sql

-- ✅ CTE for readability and reuse
WITH active_customers AS (
  SELECT id, name, email
  FROM customers
  WHERE is_active = true
),
recent_orders AS (
  SELECT customer_id, COUNT(*) AS order_count, SUM(total) AS total_spent
  FROM orders
  WHERE created_at > NOW() - INTERVAL '30 days'
  GROUP BY customer_id
)
SELECT 
  ac.name,
  ac.email,
  COALESCE(ro.order_count, 0) AS order_count,
  COALESCE(ro.total_spent, 0) AS total_spent
FROM active_customers ac
LEFT JOIN recent_orders ro ON ac.id = ro.customer_id
ORDER BY ro.total_spent DESC NULLS LAST;

-- ✅ JOIN for simple relationships
SELECT 
  o.id,
  c.name,
  o.total
FROM orders o
JOIN customers c ON o.customer_id = c.id
WHERE o.created_at > '2024-01-01';

-- ✅ IN with subquery (for small result sets)
SELECT * FROM products
WHERE category_id IN (
  SELECT id FROM categories WHERE name LIKE 'Electronics%'
);

-- ✅ EXISTS for checking existence (stops at first match)
SELECT c.name
FROM customers c
WHERE EXISTS (
  SELECT 1 FROM orders o WHERE o.customer_id = c.id AND o.total > 1000
);

-- ✅ NOT EXISTS (more efficient than NOT IN with NULLs)
SELECT c.name
FROM customers c
WHERE NOT EXISTS (
  SELECT 1 FROM orders o WHERE o.customer_id = c.id
);

-- ✅ Recursive CTE for hierarchical data
WITH RECURSIVE org_chart AS (
  -- Base case: top-level managers
  SELECT id, name, manager_id, 1 AS level
  FROM employees
  WHERE manager_id IS NULL
  
  UNION ALL
  
  -- Recursive case: employees under managers
  SELECT e.id, e.name, e.manager_id, oc.level + 1
  FROM employees e
  JOIN org_chart oc ON e.manager_id = oc.id
)
SELECT * FROM org_chart ORDER BY level, name;

❌ Bad: Inefficient Subqueries

sql

-- ❌ Correlated subquery in SELECT (executes for each row)
SELECT 
  c.name,
  (SELECT COUNT(*) FROM orders WHERE customer_id = c.id) AS order_count,
  (SELECT SUM(total) FROM orders WHERE customer_id = c.id) AS total_spent
FROM customers c;

-- ✅ Better: Single JOIN
SELECT 
  c.name,
  COUNT(o.id) AS order_count,
  COALESCE(SUM(o.total), 0) AS total_spent
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id
GROUP BY c.id, c.name;

-- ❌ NOT IN with nullable column (performance issue + wrong results)
SELECT * FROM customers
WHERE id NOT IN (SELECT customer_id FROM orders);  
-- ❌ Returns no rows if any customer_id IS NULL

-- ✅ Better: NOT EXISTS or LEFT JOIN
SELECT c.*
FROM customers c
WHERE NOT EXISTS (SELECT 1 FROM orders o WHERE o.customer_id = c.id);

Transactions & Concurrency - Complete Guide

ACID Properties

✅ Good: Understanding ACID

sql

-- Atomicity: All or nothing
BEGIN;

-- Transfer money between accounts
UPDATE accounts SET balance = balance - 100 WHERE id = 1;
UPDATE accounts SET balance = balance + 100 WHERE id = 2;

-- Both updates succeed or both fail
COMMIT;  -- Or ROLLBACK if error

-- Consistency: Constraints enforced
BEGIN;

INSERT INTO orders (customer_id, total) 
VALUES (999, -100);  -- ❌ Violates CHECK constraint

-- Transaction automatically rolled back on error

-- Isolation: Concurrent transactions don't interfere (see isolation levels)

-- Durability: Committed data survives crashes (Write-Ahead Logging)

Transaction Isolation Levels

✅ Good: Choosing the Right Isolation Level

sql

-- Read Committed (PostgreSQL default)
-- - Prevents dirty reads
-- - Allows non-repeatable reads and phantom reads
BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;

SELECT balance FROM accounts WHERE id = 1;  -- Returns 100
-- Another transaction commits: UPDATE accounts SET balance = 200 WHERE id = 1
SELECT balance FROM accounts WHERE id = 1;  -- Returns 200 (non-repeatable read)

COMMIT;

-- Repeatable Read (most common for application logic)
-- - Prevents dirty reads and non-repeatable reads
-- - Still allows phantom reads
-- - Uses snapshot isolation (sees snapshot at transaction start)
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;

SELECT * FROM products WHERE price > 100;  -- Returns 10 rows
-- Another transaction inserts: INSERT INTO products VALUES (..., 150)
SELECT * FROM products WHERE price > 100;  -- Still returns 10 rows (phantom prevented)

COMMIT;

-- Serializable (strictest, may cause serialization failures)
-- - Prevents all anomalies (dirty, non-repeatable, phantom reads)
-- - May abort transactions to prevent write skew
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;

-- Transfer logic
SELECT balance FROM accounts WHERE id = 1;  -- 100
SELECT balance FROM accounts WHERE id = 2;  -- 50

-- If another transaction modifies these rows concurrently:
-- ERROR: could not serialize access due to concurrent update

UPDATE accounts SET balance = balance - 50 WHERE id = 1;
UPDATE accounts SET balance = balance + 50 WHERE id = 2;

COMMIT;  -- May fail with serialization error

-- ✅ Set default isolation level in postgresql.conf
-- default_transaction_isolation = 'repeatable read'

❌ Bad: Wrong Isolation Level

sql

-- ❌ Using Read Committed for financial transactions (race conditions)
BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;

-- Check balance
SELECT balance FROM accounts WHERE id = 1;  -- Returns 100

-- Another transaction withdraws $50 (balance now 50)

-- Withdraw $60 (overdraft!)
UPDATE accounts SET balance = balance - 60 WHERE id = 1;  -- Balance now -10

COMMIT;  -- ❌ Overdraft!

-- ✅ Better: Use Repeatable Read or Serializable with constraint
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;

UPDATE accounts SET balance = balance - 60 WHERE id = 1;

-- ❌ If concurrent transaction modified balance, this will fail
-- Or use CHECK constraint: CHECK (balance >= 0)

COMMIT;

MVCC (Multi-Version Concurrency Control)

✅ Good: Understanding MVCC

sql

-- PostgreSQL creates new row versions instead of locking
-- Each transaction sees a snapshot of data

-- Transaction 1 (reads don't block writes)
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SELECT * FROM products WHERE id = 1;  -- Version 1, price = 100
-- ... long-running transaction

-- Transaction 2 (writes don't block reads)
BEGIN;
UPDATE products SET price = 120 WHERE id = 1;  -- Creates version 2
COMMIT;

-- Transaction 1 continues
SELECT * FROM products WHERE id = 1;  -- Still sees version 1, price = 100
COMMIT;

-- ✅ No locks needed for reads (except SERIALIZABLE)
-- ✅ Reads don't block writes, writes don't block reads
-- ✅ Each transaction sees consistent snapshot

-- Dead tuples cleanup (old versions)
VACUUM products;  -- Reclaims space from old row versions

❌ Bad: Ignoring MVCC Implications

sql

-- ❌ Long-running transactions prevent VACUUM
BEGIN;
SELECT * FROM large_table WHERE id = 1;
-- ... transaction open for hours
COMMIT;

-- ❌ Table bloat: old row versions can't be reclaimed
-- ❌ Performance degrades over time

-- ✅ Keep transactions short
-- ✅ Use connection pooling to avoid idle transactions

Locking

✅ Good: Explicit Locking When Needed

sql

-- SELECT FOR UPDATE (locks rows for update)
BEGIN;

SELECT * FROM inventory 
WHERE product_id = 123 
FOR UPDATE;  -- Locks row, prevents other updates

-- Check stock
UPDATE inventory SET quantity = quantity - 1 
WHERE product_id = 123 AND quantity > 0;

COMMIT;

-- SELECT FOR SHARE (locks rows for read, allows other readers)
BEGIN;

SELECT * FROM orders WHERE id = 456 FOR SHARE;  -- Shared lock

-- Other transactions can read but not update

COMMIT;

-- SKIP LOCKED (skip locked rows, useful for job queues)
BEGIN;

SELECT * FROM jobs 
WHERE status = 'pending' 
ORDER BY created_at 
LIMIT 1 
FOR UPDATE SKIP LOCKED;  -- Skip if locked by another worker

-- Process job
UPDATE jobs SET status = 'processing' WHERE id = ...;

COMMIT;

-- NOWAIT (fail immediately if locked)
BEGIN;

SELECT * FROM accounts WHERE id = 1 FOR UPDATE NOWAIT;
-- ERROR: could not obtain lock on row in relation "accounts"

COMMIT;

-- Advisory locks (application-level locking)
-- Lock on arbitrary ID (e.g., user ID)
SELECT pg_advisory_lock(12345);  -- Blocks until lock acquired

-- Do work...

SELECT pg_advisory_unlock(12345);  -- Release lock

-- Try lock (non-blocking)
SELECT pg_try_advisory_lock(12345);  -- Returns true/false

❌ Bad: Deadlocks and Lock Contention

sql

-- ❌ Deadlock scenario

-- Transaction 1
BEGIN;
UPDATE accounts SET balance = balance - 10 WHERE id = 1;  -- Locks row 1
-- ... waiting
UPDATE accounts SET balance = balance + 10 WHERE id = 2;  -- Waits for row 2 lock
COMMIT;

-- Transaction 2 (concurrent)
BEGIN;
UPDATE accounts SET balance = balance - 5 WHERE id = 2;   -- Locks row 2
UPDATE accounts SET balance = balance + 5 WHERE id = 1;   -- Waits for row 1 lock (DEADLOCK!)
-- ERROR: deadlock detected

-- ✅ Prevention: Always acquire locks in the same order
BEGIN;
UPDATE accounts SET balance = balance - 10 WHERE id = LEAST(1, 2);
UPDATE accounts SET balance = balance + 10 WHERE id = GREATEST(1, 2);
COMMIT;

-- ❌ Too coarse-grained locking
LOCK TABLE products IN EXCLUSIVE MODE;  -- ❌ Locks entire table
-- ✅ Use row-level locking instead

Best Practices

✅ Good: Transaction Best Practices

sql

-- ✅ Keep transactions short
BEGIN;
-- Quick operations only
UPDATE users SET last_login = NOW() WHERE id = 123;
COMMIT;

-- ❌ Don't do this
BEGIN;
-- Slow external API call
-- Long computation
-- User input
COMMIT;  -- ❌ Transaction open too long

-- ✅ Use appropriate isolation level
-- Read Committed: default, good for most cases
-- Repeatable Read: financial transactions, reports
-- Serializable: strict consistency requirements

-- ✅ Handle serialization failures
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;

-- Retry logic
DECLARE
  max_retries INT := 3;
  retry_count INT := 0;
BEGIN
  LOOP
    BEGIN
      -- Transaction logic
      UPDATE accounts SET balance = balance - 100 WHERE id = 1;
      EXIT;  -- Success
    EXCEPTION
      WHEN serialization_failure THEN
        retry_count := retry_count + 1;
        IF retry_count >= max_retries THEN
          RAISE;
        END IF;
    END;
  END LOOP;
END;

COMMIT;

-- ✅ Use savepoints for partial rollback
BEGIN;

INSERT INTO orders (customer_id, total) VALUES (123, 100);

SAVEPOINT before_items;

INSERT INTO order_items (order_id, product_id, quantity) VALUES (1, 1, 5);
-- Error occurs
ROLLBACK TO SAVEPOINT before_items;  -- Undo items, keep order

INSERT INTO order_items (order_id, product_id, quantity) VALUES (1, 2, 3);

COMMIT;

Advanced SQL Features - Complete Guide

Common Table Expressions (CTEs)

✅ Good: Using CTEs Effectively

sql

-- Basic CTE for readability
WITH high_value_customers AS (
  SELECT 
    customer_id,
    SUM(total) AS total_spent
  FROM orders
  WHERE created_at > NOW() - INTERVAL '1 year'
  GROUP BY customer_id
  HAVING SUM(total) > 10000
)
SELECT 
  c.name,
  c.email,
  hvc.total_spent
FROM high_value_customers hvc
JOIN customers c ON hvc.customer_id = c.id
ORDER BY hvc.total_spent DESC;

-- Multiple CTEs
WITH 
category_sales AS (
  SELECT 
    c.id AS category_id,
    c.name AS category_name,
    COUNT(DISTINCT o.id) AS order_count,
    SUM(oi.quantity * oi.price) AS revenue
  FROM categories c
  JOIN products p ON c.id = p.category_id
  JOIN order_items oi ON p.id = oi.product_id
  JOIN orders o ON oi.order_id = o.id
  WHERE o.created_at > NOW() - INTERVAL '30 days'
  GROUP BY c.id, c.name
),
top_categories AS (
  SELECT category_id
  FROM category_sales
  ORDER BY revenue DESC
  LIMIT 5
)
SELECT 
  cs.category_name,
  cs.order_count,
  cs.revenue
FROM category_sales cs
JOIN top_categories tc ON cs.category_id = tc.category_id
ORDER BY cs.revenue DESC;

-- Recursive CTE for hierarchical data
WITH RECURSIVE category_tree AS (
  -- Base: top-level categories
  SELECT 
    id,
    name,
    parent_id,
    1 AS level,
    name::TEXT AS path
  FROM categories
  WHERE parent_id IS NULL
  
  UNION ALL
  
  -- Recursive: child categories
  SELECT 
    c.id,
    c.name,
    c.parent_id,
    ct.level + 1,
    ct.path || ' > ' || c.name
  FROM categories c
  JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT 
  id,
  REPEAT('  ', level - 1) || name AS indented_name,
  level,
  path
FROM category_tree
ORDER BY path;

-- Recursive CTE for graph traversal (find all dependencies)
WITH RECURSIVE dependency_tree AS (
  -- Base: direct dependencies
  SELECT 
    package_id,
    dependency_id,
    1 AS depth
  FROM package_dependencies
  WHERE package_id = 123
  
  UNION ALL
  
  -- Recursive: transitive dependencies
  SELECT 
    pd.package_id,
    pd.dependency_id,
    dt.depth + 1
  FROM package_dependencies pd
  JOIN dependency_tree dt ON pd.package_id = dt.dependency_id
  WHERE dt.depth < 10  -- Prevent infinite loops
)
SELECT DISTINCT
  p.name,
  dt.depth
FROM dependency_tree dt
JOIN packages p ON dt.dependency_id = p.id
ORDER BY dt.depth, p.name;

❌ Bad: CTE Misuse

sql

-- ❌ Unnecessary CTE (adds overhead)
WITH all_users AS (
  SELECT * FROM users
)
SELECT * FROM all_users WHERE id = 123;

-- ✅ Better: Direct query
SELECT * FROM users WHERE id = 123;

-- ❌ CTE used only once (no benefit)
WITH recent_orders AS (
  SELECT * FROM orders WHERE created_at > NOW() - INTERVAL '7 days'
)
SELECT COUNT(*) FROM recent_orders;

-- ✅ Better: Subquery or direct query
SELECT COUNT(*) FROM orders WHERE created_at > NOW() - INTERVAL '7 days';

Window Functions

✅ Good: Window Function Patterns

sql

-- ROW_NUMBER: Assign unique sequential numbers
SELECT 
  customer_id,
  order_id,
  total,
  ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY created_at DESC) AS row_num
FROM orders;

-- RANK and DENSE_RANK: Handle ties
SELECT 
  name,
  score,
  RANK() OVER (ORDER BY score DESC) AS rank,        -- 1, 2, 2, 4
  DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank  -- 1, 2, 2, 3
FROM students;

-- LAG and LEAD: Access previous/next row
SELECT 
  date,
  revenue,
  LAG(revenue, 1) OVER (ORDER BY date) AS prev_day_revenue,
  LEAD(revenue, 1) OVER (ORDER BY date) AS next_day_revenue,
  revenue - LAG(revenue, 1) OVER (ORDER BY date) AS day_over_day_change
FROM daily_sales
ORDER BY date;

-- FIRST_VALUE and LAST_VALUE
SELECT 
  product_id,
  date,
  price,
  FIRST_VALUE(price) OVER (PARTITION BY product_id ORDER BY date) AS initial_price,
  LAST_VALUE(price) OVER (
    PARTITION BY product_id 
    ORDER BY date 
    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
  ) AS final_price
FROM price_history;

-- Running totals and moving averages
SELECT 
  date,
  revenue,
  SUM(revenue) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_revenue,
  AVG(revenue) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg_7day
FROM daily_sales
ORDER BY date;

-- Percentiles and NTile
SELECT 
  name,
  salary,
  NTILE(4) OVER (ORDER BY salary) AS quartile,
  PERCENT_RANK() OVER (ORDER BY salary) AS percent_rank,
  CUME_DIST() OVER (ORDER BY salary) AS cumulative_dist
FROM employees;

-- Top N per group
SELECT *
FROM (
  SELECT 
    category_id,
    product_id,
    name,
    sales,
    ROW_NUMBER() OVER (PARTITION BY category_id ORDER BY sales DESC) AS rank
  FROM products
) ranked
WHERE rank <= 3;  -- Top 3 products per category

❌ Bad: Window Function Misuse

sql

-- ❌ Using window function when simple aggregate suffices
SELECT 
  customer_id,
  SUM(total) OVER (PARTITION BY customer_id) AS total_spent
FROM orders;

-- ✅ Better: GROUP BY
SELECT 
  customer_id,
  SUM(total) AS total_spent
FROM orders
GROUP BY customer_id;

-- ❌ Missing PARTITION BY (operates on entire result set)
SELECT 
  product_id,
  date,
  price,
  AVG(price) OVER (ORDER BY date)  -- ❌ Global average, not per product
FROM prices;

-- ✅ Better: Add PARTITION BY
SELECT 
  product_id,
  date,
  price,
  AVG(price) OVER (PARTITION BY product_id ORDER BY date) AS avg_price
FROM prices;

JSONB Operations

✅ Good: Working with JSONB

sql

CREATE TABLE events (
  id BIGSERIAL PRIMARY KEY,
  event_type VARCHAR(50) NOT NULL,
  data JSONB NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_events_data ON events USING GIN (data);

-- Insert JSONB data
INSERT INTO events (event_type, data) VALUES
('page_view', '{"user_id": 123, "page": "/products", "duration": 45}'),
('purchase', '{"user_id": 123, "product_id": 456, "amount": 99.99, "currency": "USD"}');

-- Query JSONB fields
SELECT * FROM events WHERE data @> '{"user_id": 123}';  -- Contains
SELECT * FROM events WHERE data ? 'product_id';          -- Key exists
SELECT * FROM events WHERE data ->> 'page' = '/products';  -- Extract text

-- Extract JSONB values
SELECT 
  id,
  data -> 'user_id' AS user_id_json,        -- Returns JSONB
  data ->> 'user_id' AS user_id_text,       -- Returns text
  (data ->> 'user_id')::BIGINT AS user_id,  -- Cast to integer
  data -> 'metadata' -> 'ip' AS ip_address  -- Nested access
FROM events;

-- Update JSONB fields
UPDATE events
SET data = jsonb_set(data, '{amount}', '149.99')
WHERE id = 1;

-- Add new field
UPDATE events
SET data = data || '{"processed": true}'::jsonb
WHERE event_type = 'purchase';

-- Remove field
UPDATE events
SET data = data - 'temporary_field'
WHERE id = 1;

-- Array operations
SELECT * FROM events WHERE data -> 'tags' @> '["urgent"]';  -- Array contains

-- Aggregate JSONB
SELECT 
  event_type,
  jsonb_agg(data) AS all_events,
  jsonb_object_agg(id::text, data) AS events_by_id
FROM events
GROUP BY event_type;

Array Operations

✅ Good: Working with Arrays

sql

CREATE TABLE products (
  id BIGSERIAL PRIMARY KEY,
  name VARCHAR(255) NOT NULL,
  tags TEXT[],
  prices NUMERIC[]
);

CREATE INDEX idx_products_tags ON products USING GIN (tags);

-- Insert array data
INSERT INTO products (name, tags, prices) VALUES
('Laptop', ARRAY['electronics', 'computers', 'portable'], ARRAY[999.99, 1299.99, 1599.99]);

-- Query arrays
SELECT * FROM products WHERE 'electronics' = ANY(tags);        -- Contains element
SELECT * FROM products WHERE tags @> ARRAY['electronics'];     -- Contains all
SELECT * FROM products WHERE tags && ARRAY['computers', 'mobile'];  -- Overlaps

-- Array functions
SELECT 
  name,
  array_length(tags, 1) AS tag_count,
  array_upper(prices, 1) AS price_count,
  prices[1] AS lowest_price,
  prices[array_upper(prices, 1)] AS highest_price
FROM products;

-- Unnest array to rows
SELECT 
  p.id,
  p.name,
  unnest(p.tags) AS tag
FROM products p;

-- Array aggregation
SELECT 
  category_id,
  array_agg(name ORDER BY name) AS product_names,
  array_agg(DISTINCT tag) AS all_tags
FROM products
CROSS JOIN unnest(tags) AS tag
GROUP BY category_id;

Performance Tuning - Complete Guide

Connection Pooling

✅ Good: PgBouncer Configuration

ini

# /etc/pgbouncer/pgbouncer.ini

[databases]
mydb = host=localhost port=5432 dbname=mydb

[pgbouncer]
listen_addr = 0.0.0.0
listen_port = 6432
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt

# Pool mode
pool_mode = transaction  # transaction (recommended) | session | statement

# Connection limits
max_client_conn = 1000   # Max client connections
default_pool_size = 25   # Connections per user/database
reserve_pool_size = 5    # Extra connections for emergencies
reserve_pool_timeout = 3  # Seconds

# Timeouts
server_idle_timeout = 600    # Close idle server connections after 10 min
server_lifetime = 3600       # Close server connections after 1 hour
query_timeout = 0            # No query timeout

typescript

// Application connection string
const DATABASE_URL = 'postgresql://user:password@localhost:6432/mydb';

✅ Good: Supavisor (Supabase Connection Pooler)

typescript

// Transaction mode (recommended for serverless)
const DATABASE_URL = 'postgresql://postgres.xxxxx.supabase.co:6543/postgres';

// Session mode (for long-lived connections)
const DATABASE_URL = 'postgresql://postgres.xxxxx.supabase.co:5432/postgres';

❌ Bad: No Connection Pooling

typescript

// ❌ Each request creates new connection (slow, exhausts connections)
async function handler(req, res) {
  const client = new Client({
    connectionString: 'postgresql://localhost:5432/mydb'
  });
  await client.connect();  // ❌ Expensive operation
  const result = await client.query('SELECT * FROM users');
  await client.end();
  res.json(result.rows);
}

// ✅ Better: Use connection pool
const pool = new Pool({
  connectionString: 'postgresql://localhost:6432/mydb',  // PgBouncer
  max: 20  // Application-side pool
});

async function handler(req, res) {
  const result = await pool.query('SELECT * FROM users');
  res.json(result.rows);
}

PostgreSQL Configuration

✅ Good: Performance-Oriented postgresql.conf

ini

# Memory Configuration
shared_buffers = 4GB             # 25% of RAM for dedicated server
effective_cache_size = 12GB      # 75% of RAM (OS + PG cache)
work_mem = 64MB                  # Per operation (sort, hash)
maintenance_work_mem = 1GB       # VACUUM, CREATE INDEX

# Query Planner
random_page_cost = 1.1           # For SSD (default 4 for HDD)
effective_io_concurrency = 200   # For SSD (default 1 for HDD)

# Write-Ahead Log (WAL)
wal_buffers = 16MB
min_wal_size = 2GB
max_wal_size = 8GB
checkpoint_completion_target = 0.9

# Autovacuum (critical for MVCC)
autovacuum = on
autovacuum_max_workers = 4
autovacuum_naptime = 1min
autovacuum_vacuum_scale_factor = 0.1
autovacuum_analyze_scale_factor = 0.05

# Connection Settings
max_connections = 200

❌ Bad: Default Configuration for Production

ini

# ❌ Default settings (designed for minimal systems)
shared_buffers = 128MB           # ❌ Too small for production
work_mem = 4MB                   # ❌ Causes disk sorts
effective_cache_size = 4GB       # ❌ Underestimates available cache
random_page_cost = 4.0           # ❌ Wrong for SSD

Monitoring and Analysis

✅ Good: Essential Monitoring Queries

sql

-- Enable pg_stat_statements extension
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;

-- Top 10 slowest queries
SELECT 
  query,
  calls,
  total_exec_time,
  mean_exec_time,
  max_exec_time,
  rows
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- Most frequently executed queries
SELECT 
  query,
  calls,
  total_exec_time,
  mean_exec_time
FROM pg_stat_statements
ORDER BY calls DESC
LIMIT 10;

-- Table sizes
SELECT 
  schemaname,
  tablename,
  pg_size_pretty(pg_total_relation_size(schemaname || '.' || tablename)) AS total_size,
  pg_size_pretty(pg_relation_size(schemaname || '.' || tablename)) AS table_size,
  pg_size_pretty(pg_indexes_size(schemaname || '.' || tablename)) AS indexes_size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname || '.' || tablename) DESC;

-- Index usage statistics
SELECT 
  schemaname,
  tablename,
  indexname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch,
  pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan ASC;

-- Cache hit ratio (should be > 99%)
SELECT 
  'index hit rate' AS name,
  (sum(idx_blks_hit)) / nullif(sum(idx_blks_hit + idx_blks_read),0) AS ratio
FROM pg_statio_user_indexes
UNION ALL
SELECT 
  'table hit rate' AS name,
  sum(heap_blks_hit) / nullif(sum(heap_blks_hit) + sum(heap_blks_read),0) AS ratio
FROM pg_statio_user_tables;

-- Long-running queries
SELECT 
  pid,
  now() - query_start AS duration,
  state,
  query
FROM pg_stat_activity
WHERE state != 'idle'
  AND now() - query_start > interval '5 minutes'
ORDER BY duration DESC;

-- Blocking queries
SELECT 
  blocked_locks.pid AS blocked_pid,
  blocked_activity.usename AS blocked_user,
  blocking_locks.pid AS blocking_pid,
  blocking_activity.usename AS blocking_user,
  blocked_activity.query AS blocked_statement,
  blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks 
  ON blocking_locks.locktype = blocked_locks.locktype
  AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
  AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
  AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
  AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
  AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
  AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
  AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
  AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
  AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
  AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;

Best Practices - Complete Guide

Naming Conventions

✅ Good: Consistent Naming

sql

-- Tables: plural, snake_case
CREATE TABLE customers (...);
CREATE TABLE order_items (...);

-- Columns: snake_case
CREATE TABLE users (
  id BIGSERIAL PRIMARY KEY,
  first_name VARCHAR(100),
  last_name VARCHAR(100),
  email_address VARCHAR(255),
  created_at TIMESTAMPTZ,
  updated_at TIMESTAMPTZ
);

-- Indexes: idx_table_column(s)
CREATE INDEX idx_users_email ON users(email_address);
CREATE INDEX idx_orders_customer_created ON orders(customer_id, created_at);

-- Foreign keys: fk_table_referenced_table
ALTER TABLE orders ADD CONSTRAINT fk_orders_customers 
  FOREIGN KEY (customer_id) REFERENCES customers(id);

-- Unique constraints: uq_table_column(s)
ALTER TABLE users ADD CONSTRAINT uq_users_email UNIQUE (email_address);

-- Check constraints: ck_table_description
ALTER TABLE products ADD CONSTRAINT ck_products_price_positive 
  CHECK (price >= 0);

❌ Bad: Inconsistent Naming

sql

-- ❌ Mixed naming conventions
CREATE TABLE Customer (...);           -- ❌ Uppercase
CREATE TABLE order_item (...);         -- ❌ Singular
CREATE TABLE "user-profiles" (...);    -- ❌ Hyphens require quotes

-- ❌ Unclear abbreviations
CREATE TABLE usr (id BIGINT, nm VARCHAR(100), em VARCHAR(255));

Migration Strategies

✅ Good: Safe Migrations

sql

-- Add column with default (safe in PostgreSQL 11+)
ALTER TABLE users ADD COLUMN status VARCHAR(20) DEFAULT 'active' NOT NULL;

-- Add index concurrently (no table lock)
CREATE INDEX CONCURRENTLY idx_users_status ON users(status);

-- Add constraint with NOT VALID (fast), then validate (can be interrupted)
ALTER TABLE orders ADD CONSTRAINT ck_orders_total_positive 
  CHECK (total >= 0) NOT VALID;
  
ALTER TABLE orders VALIDATE CONSTRAINT ck_orders_total_positive;

-- Rename column (atomic, but breaks dependent code)
ALTER TABLE users RENAME COLUMN email TO email_address;

-- Drop column (use after code deployed)
ALTER TABLE users DROP COLUMN deprecated_field;

-- Large data migration in batches
DO $$
DECLARE
  batch_size INT := 1000;
  rows_updated INT;
BEGIN
  LOOP
    UPDATE products
    SET normalized_name = LOWER(name)
    WHERE normalized_name IS NULL
    LIMIT batch_size;
    
    GET DIAGNOSTICS rows_updated = ROW_COUNT;
    EXIT WHEN rows_updated = 0;
    
    COMMIT;  -- Release locks between batches
  END LOOP;
END $$;

❌ Bad: Dangerous Migrations

sql

-- ❌ Adding NOT NULL without default (locks table, fails if NULLs exist)
ALTER TABLE users ADD COLUMN status VARCHAR(20) NOT NULL;

-- ❌ Creating index without CONCURRENTLY (locks table for reads/writes)
CREATE INDEX idx_users_email ON users(email);  -- ❌ Locks table

-- ❌ Changing column type (rewrites entire table)
ALTER TABLE users ALTER COLUMN id TYPE BIGINT;  -- ❌ Long lock

Security

✅ Good: Row-Level Security (RLS)

sql

-- Enable RLS on table
ALTER TABLE documents ENABLE ROW LEVEL SECURITY;

-- Users can only see their own documents
CREATE POLICY documents_select_own ON documents
  FOR SELECT
  TO authenticated_users
  USING (user_id = current_user_id());

-- Users can only update their own documents
CREATE POLICY documents_update_own ON documents
  FOR UPDATE
  TO authenticated_users
  USING (user_id = current_user_id())
  WITH CHECK (user_id = current_user_id());

-- Admins can see all documents
CREATE POLICY documents_select_admin ON documents
  FOR SELECT
  TO admin_users
  USING (true);

-- Function to get current user ID (from Supabase auth)
CREATE OR REPLACE FUNCTION current_user_id()
RETURNS BIGINT AS $$
  SELECT COALESCE(
    current_setting('request.jwt.claims', true)::json->>'sub',
    '0'
  )::BIGINT;
$$ LANGUAGE SQL STABLE;

❌ Bad: No Access Control

sql

-- ❌ No RLS, relying on application code
CREATE TABLE documents (
  id BIGSERIAL PRIMARY KEY,
  user_id BIGINT NOT NULL,
  content TEXT
);  -- ❌ Anyone can query all documents

-- ✅ Enable RLS for defense in depth

Backup and Recovery

✅ Good: Regular Backups

bash

# Logical backup (pg_dump)
pg_dump -h localhost -U postgres -d mydb -F c -f mydb_backup.dump

# Restore
pg_restore -h localhost -U postgres -d mydb -c mydb_backup.dump

# Continuous archiving (WAL)
# postgresql.conf
archive_mode = on
archive_command = 'cp %p /path/to/archive/%f'

# Point-in-time recovery (PITR)
pg_basebackup -h localhost -U postgres -D /path/to/backup -Fp -Xs -P