PostgreSQL Advanced Guide

📖 18 min read | 📅 Updated: January 2025 | 🏷️ Other Technologies

Introduction

PostgreSQL is a powerful open-source database with advanced features. This guide covers advanced queries, indexing strategies, partitioning, replication, performance optimization, and production best practices.

1. Advanced Query Techniques

-- Window Functions
SELECT 
  employee_id,
  department,
  salary,
  AVG(salary) OVER (PARTITION BY department) as dept_avg,
  RANK() OVER (PARTITION BY department ORDER BY salary DESC) as dept_rank,
  ROW_NUMBER() OVER (ORDER BY salary DESC) as overall_rank
FROM employees;

-- Running totals
SELECT 
  date,
  amount,
  SUM(amount) OVER (ORDER BY date) as running_total
FROM transactions;

-- Moving average
SELECT 
  date,
  price,
  AVG(price) OVER (
    ORDER BY date 
    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
  ) as moving_avg_7day
FROM stock_prices;

-- Common Table Expressions (CTEs)
WITH regional_sales AS (
  SELECT region, SUM(amount) as total_sales
  FROM orders
  GROUP BY region
),
top_regions AS (
  SELECT region
  FROM regional_sales
  WHERE total_sales > (SELECT AVG(total_sales) FROM regional_sales)
)
SELECT 
  tr.region,
  p.product,
  SUM(o.amount) as product_sales
FROM orders o
JOIN top_regions tr ON o.region = tr.region
JOIN products p ON o.product_id = p.id
GROUP BY tr.region, p.product;

-- Recursive CTE (organizational hierarchy)
WITH RECURSIVE employee_hierarchy AS (
  -- Base case: top-level managers
  SELECT id, name, manager_id, 1 as level
  FROM employees
  WHERE manager_id IS NULL
  
  UNION ALL
  
  -- Recursive case: employees under managers
  SELECT e.id, e.name, e.manager_id, eh.level + 1
  FROM employees e
  JOIN employee_hierarchy eh ON e.manager_id = eh.id
)
SELECT * FROM employee_hierarchy ORDER BY level, name;

-- LATERAL joins (correlated subqueries)
SELECT 
  c.name as customer,
  o.order_date,
  o.total
FROM customers c
CROSS JOIN LATERAL (
  SELECT order_date, total
  FROM orders
  WHERE customer_id = c.id
  ORDER BY order_date DESC
  LIMIT 3
) o;

-- JSON operations
SELECT 
  id,
  data->>'name' as name,
  data->'address'->>'city' as city,
  jsonb_array_elements(data->'tags') as tag
FROM users
WHERE data @> '{"status": "active"}';

-- Full-text search
CREATE INDEX idx_documents_search ON documents 
USING GIN (to_tsvector('english', title || ' ' || content));

SELECT 
  title,
  ts_rank(
    to_tsvector('english', title || ' ' || content),
    to_tsquery('english', 'postgresql & performance')
  ) as rank
FROM documents
WHERE to_tsvector('english', title || ' ' || content) 
      @@ to_tsquery('english', 'postgresql & performance')
ORDER BY rank DESC;

2. Index Strategies

-- B-tree index (default, good for equality and range)
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_orders_date ON orders(created_at);

-- Partial index (index subset of rows)
CREATE INDEX idx_active_users ON users(email) 
WHERE status = 'active';

-- Composite index
CREATE INDEX idx_orders_customer_date ON orders(customer_id, created_at);

-- IMPORTANT: Column order matters
-- Good for queries:
-- - WHERE customer_id = ? AND created_at > ?
-- - WHERE customer_id = ?
-- NOT good for:
-- - WHERE created_at > ? (doesn't use customer_id)
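-- A quick way to check which predicates can use the index is to compare plans
-- (sample predicates; the values are illustrative):
EXPLAIN SELECT * FROM orders WHERE customer_id = 42 AND created_at > '2024-01-01';
-- expected: Index Scan / Bitmap Index Scan on idx_orders_customer_date
EXPLAIN SELECT * FROM orders WHERE created_at > '2024-01-01';
-- expected: Seq Scan, because the leading column customer_id is not constrained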

-- Expression index
CREATE INDEX idx_users_lower_email ON users(LOWER(email));

-- Now this query uses the index:
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';

-- GIN index for arrays, JSONB, full-text
CREATE INDEX idx_tags ON articles USING GIN(tags);
CREATE INDEX idx_data ON documents USING GIN(data);

-- GiST index for geometric data, full-text
CREATE INDEX idx_location ON places USING GIST(location);

-- Hash index (equality only, rarely used)
CREATE INDEX idx_hash_code ON products USING HASH(product_code);

-- Covering index (index-only scan)
CREATE INDEX idx_orders_covering ON orders(customer_id, created_at) 
INCLUDE (total, status);

-- This query can be satisfied by index alone
SELECT created_at, total, status 
FROM orders 
WHERE customer_id = 123;

-- Analyze index usage
SELECT 
  schemaname,
  tablename,
  indexname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan;

-- Find unused indexes
SELECT 
  schemaname,
  tablename,
  indexname
FROM pg_stat_user_indexes
WHERE idx_scan = 0 
  AND indexname NOT LIKE '%_pkey';

-- Index maintenance
REINDEX INDEX idx_users_email;
REINDEX TABLE users;

-- Concurrent reindex (doesn't lock table)
REINDEX INDEX CONCURRENTLY idx_users_email;

3. Table Partitioning

-- Range partitioning by date
CREATE TABLE orders (
  id SERIAL,
  customer_id INT,
  total DECIMAL(10,2),
  created_at TIMESTAMP NOT NULL
) PARTITION BY RANGE (created_at);

-- Create partitions
CREATE TABLE orders_2024_q1 PARTITION OF orders
  FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');

CREATE TABLE orders_2024_q2 PARTITION OF orders
  FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');

CREATE TABLE orders_2024_q3 PARTITION OF orders
  FOR VALUES FROM ('2024-07-01') TO ('2024-10-01');

CREATE TABLE orders_2024_q4 PARTITION OF orders
  FOR VALUES FROM ('2024-10-01') TO ('2025-01-01');

-- Indexes on partitions
CREATE INDEX idx_orders_2024_q1_customer ON orders_2024_q1(customer_id);
CREATE INDEX idx_orders_2024_q2_customer ON orders_2024_q2(customer_id);
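-- In PostgreSQL 11+, an index created on the partitioned parent is created on
-- every existing and future partition automatically (index name is illustrative):
CREATE INDEX idx_orders_customer ON orders(customer_id);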

-- List partitioning
CREATE TABLE users (
  id SERIAL,
  name VARCHAR(100),
  country VARCHAR(2)
) PARTITION BY LIST (country);

CREATE TABLE users_us PARTITION OF users FOR VALUES IN ('US');
CREATE TABLE users_uk PARTITION OF users FOR VALUES IN ('UK');
CREATE TABLE users_eu PARTITION OF users FOR VALUES IN ('DE', 'FR', 'IT', 'ES');
CREATE TABLE users_other PARTITION OF users DEFAULT;

-- Hash partitioning (for load distribution)
CREATE TABLE events (
  id BIGSERIAL,
  event_type VARCHAR(50),
  data JSONB
) PARTITION BY HASH (id);

CREATE TABLE events_0 PARTITION OF events FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE events_1 PARTITION OF events FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE events_2 PARTITION OF events FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE events_3 PARTITION OF events FOR VALUES WITH (MODULUS 4, REMAINDER 3);

-- Automatic partition creation function
CREATE OR REPLACE FUNCTION create_monthly_partition(table_name TEXT, start_date DATE)
RETURNS VOID AS $$
DECLARE
  partition_name TEXT;
  start_range DATE;
  end_range DATE;
BEGIN
  partition_name := table_name || '_' || TO_CHAR(start_date, 'YYYY_MM');
  start_range := start_date;
  end_range := start_date + INTERVAL '1 month';
  
  EXECUTE format(
    'CREATE TABLE IF NOT EXISTS %I PARTITION OF %I FOR VALUES FROM (%L) TO (%L)',
    partition_name, table_name, start_range, end_range
  );
END;
$$ LANGUAGE plpgsql;

-- Create partitions for next 12 months
DO $$
DECLARE
  month_date DATE;
BEGIN
  FOR i IN 0..11 LOOP
    month_date := DATE_TRUNC('month', CURRENT_DATE) + (i || ' months')::INTERVAL;
    PERFORM create_monthly_partition('orders', month_date);
  END LOOP;
END;
$$;

-- Detach and drop old partitions
ALTER TABLE orders DETACH PARTITION orders_2023_q1;
DROP TABLE orders_2023_q1;
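-- PostgreSQL 14+ can detach without blocking concurrent queries
-- (must be run outside an explicit transaction block):
ALTER TABLE orders DETACH PARTITION orders_2023_q1 CONCURRENTLY;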

4. Replication & High Availability

-- Streaming replication setup

# Primary server postgresql.conf
wal_level = replica
max_wal_senders = 5
wal_keep_size = 1GB
synchronous_commit = on

-- Create replication user (SQL, on the primary)
CREATE ROLE replicator WITH REPLICATION LOGIN PASSWORD 'strong_password';

# pg_hba.conf on primary
host replication replicator standby_ip/32 md5

# On standby server, create base backup
pg_basebackup -h primary_ip -D /var/lib/postgresql/data -U replicator -P -v

# standby.signal file
touch /var/lib/postgresql/data/standby.signal

# Standby postgresql.conf
primary_conninfo = 'host=primary_ip port=5432 user=replicator password=strong_password'
hot_standby = on

-- Check replication status on primary
SELECT 
  client_addr,
  state,
  sent_lsn,
  write_lsn,
  flush_lsn,
  replay_lsn,
  sync_state
FROM pg_stat_replication;

-- Logical replication (selective replication)

-- On the primary
CREATE PUBLICATION my_publication FOR TABLE users, orders;

-- On the subscriber
CREATE SUBSCRIPTION my_subscription
CONNECTION 'host=primary_ip dbname=mydb user=replicator password=password'
PUBLICATION my_publication;

-- Monitor logical replication
SELECT * FROM pg_stat_subscription;

-- Connection pooling with PgBouncer
# pgbouncer.ini
[databases]
mydb = host=localhost port=5432 dbname=mydb

[pgbouncer]
listen_addr = *
listen_port = 6432
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt
pool_mode = transaction
max_client_conn = 1000
default_pool_size = 20

# Application connects to PgBouncer instead of PostgreSQL
DATABASE_URL=postgresql://user:pass@localhost:6432/mydb

-- Health check query
SELECT 
  pg_is_in_recovery() as is_standby,
  CASE 
    WHEN pg_is_in_recovery() THEN pg_last_wal_receive_lsn()
    ELSE pg_current_wal_lsn()
  END as lsn;

5. Performance Optimization

-- Query performance analysis
EXPLAIN ANALYZE
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.customer_id
WHERE u.status = 'active'
GROUP BY u.id;

-- Look for:
-- - Sequential Scan (may need index)
-- - High cost numbers
-- - Slow execution time

-- Vacuum and analyze
VACUUM ANALYZE users;

-- Autovacuum settings (postgresql.conf)
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min

-- Connection pooling in Node.js
const { Pool } = require('pg');

const pool = new Pool({
  host: 'localhost',
  database: 'mydb',
  user: 'dbuser',
  password: 'password',
  max: 20,                    // Maximum connections
  idleTimeoutMillis: 30000,   // Close idle connections
  connectionTimeoutMillis: 2000,
});

async function getUser(id) {
  const client = await pool.connect();
  try {
    const result = await client.query('SELECT * FROM users WHERE id = $1', [id]);
    return result.rows[0];
  } finally {
    client.release();
  }
}

-- Prepared statements
const getUserQuery = {
  name: 'get-user',
  text: 'SELECT * FROM users WHERE id = $1'
};

async function getUser(id) {
  const result = await pool.query(getUserQuery, [id]);
  return result.rows[0];
}

-- Batch operations
async function insertUsers(users) {
  const values = users.map((u, i) => 
    `($${i*3+1}, $${i*3+2}, $${i*3+3})`
  ).join(',');
  
  const params = users.flatMap(u => [u.name, u.email, u.age]);
  
  await pool.query(
    `INSERT INTO users (name, email, age) VALUES ${values}`,
    params
  );
}

-- Materialized views for expensive queries
CREATE MATERIALIZED VIEW user_statistics AS
SELECT 
  u.id,
  u.name,
  COUNT(DISTINCT o.id) as order_count,
  SUM(o.total) as total_spent,
  MAX(o.created_at) as last_order
FROM users u
LEFT JOIN orders o ON u.id = o.customer_id
GROUP BY u.id;

-- A unique index is required for REFRESH ... CONCURRENTLY below
CREATE UNIQUE INDEX idx_user_stats_id ON user_statistics(id);

-- Refresh materialized view
REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics;

-- Schedule refresh
-- Use cron or pg_cron extension
SELECT cron.schedule('refresh-stats', '0 */6 * * *', 
  $$REFRESH MATERIALIZED VIEW CONCURRENTLY user_statistics$$
);

-- Monitoring queries
-- Active queries
SELECT 
  pid,
  now() - query_start as duration,
  state,
  query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;

-- Long-running queries
SELECT 
  pid,
  now() - query_start as duration,
  query
FROM pg_stat_activity
WHERE state = 'active' 
  AND now() - query_start > interval '5 minutes';

-- Kill long query
SELECT pg_cancel_backend(pid);  -- Cancel gracefully
SELECT pg_terminate_backend(pid);  -- Force kill
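-- A sketch for cancelling everything that has been active for over 5 minutes
-- (excludes the current session; use with care in production):
SELECT pg_cancel_backend(pid)
FROM pg_stat_activity
WHERE state = 'active'
  AND now() - query_start > interval '5 minutes'
  AND pid <> pg_backend_pid();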

-- Table sizes
SELECT 
  schemaname,
  tablename,
  pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Cache hit ratio (should be > 99%)
SELECT 
  sum(heap_blks_read) as heap_read,
  sum(heap_blks_hit) as heap_hit,
  sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) * 100 
    as cache_hit_ratio
FROM pg_statio_user_tables;

6. Advanced Data Types

-- JSONB (binary JSON, indexable)
CREATE TABLE products (
  id SERIAL PRIMARY KEY,
  name VARCHAR(100),
  attributes JSONB
);

INSERT INTO products (name, attributes) VALUES
('Laptop', '{"brand": "Dell", "ram": 16, "ssd": 512}'),
('Phone', '{"brand": "Apple", "model": "iPhone 15", "storage": 256}');

-- Query JSONB
SELECT * FROM products WHERE attributes->>'brand' = 'Dell';
SELECT * FROM products WHERE attributes @> '{"ram": 16}';

-- JSONB operators
-- -> returns JSON
-- ->> returns text
-- @> contains
-- ? key exists
-- ?| any key exists
-- ?& all keys exist
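-- Examples of the key-existence operators, using the products table above:
SELECT * FROM products WHERE attributes ? 'ram';                     -- has the 'ram' key
SELECT * FROM products WHERE attributes ?| ARRAY['ram', 'storage'];  -- has any of these keys
SELECT * FROM products WHERE attributes ?& ARRAY['brand', 'ram'];    -- has all of these keys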

-- Arrays
CREATE TABLE articles (
  id SERIAL PRIMARY KEY,
  title VARCHAR(200),
  tags TEXT[]
);

INSERT INTO articles (title, tags) VALUES
('PostgreSQL Guide', ARRAY['database', 'sql', 'tutorial']);

-- Array queries
SELECT * FROM articles WHERE 'database' = ANY(tags);
SELECT * FROM articles WHERE tags @> ARRAY['sql', 'tutorial'];
SELECT * FROM articles WHERE tags && ARRAY['database', 'mongodb'];

-- Range types
CREATE TABLE reservations (
  id SERIAL PRIMARY KEY,
  room_id INT,
  during TSTZRANGE
);

INSERT INTO reservations (room_id, during) VALUES
(101, '[2024-01-15 14:00, 2024-01-15 16:00)');

-- Check for overlaps
SELECT * FROM reservations 
WHERE during && '[2024-01-15 15:00, 2024-01-15 17:00)'::tstzrange;

-- Prevent overlapping bookings
CREATE EXTENSION btree_gist;
ALTER TABLE reservations 
ADD CONSTRAINT no_overlap 
EXCLUDE USING GIST (room_id WITH =, during WITH &&);

-- UUID
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE users (
  id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
  name VARCHAR(100)
);
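-- In PostgreSQL 13+, gen_random_uuid() is built in, so the extension is optional:
-- id UUID PRIMARY KEY DEFAULT gen_random_uuid()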

-- PostGIS for geographic data
CREATE EXTENSION postgis;

CREATE TABLE locations (
  id SERIAL PRIMARY KEY,
  name VARCHAR(100),
  location GEOGRAPHY(POINT)
);

INSERT INTO locations (name, location) VALUES
('New York', ST_MakePoint(-74.006, 40.7128));

-- Find nearby locations (within 10km)
SELECT name 
FROM locations
WHERE ST_DWithin(
  location,
  ST_MakePoint(-74.0, 40.7)::geography,
  10000  -- meters
);

7. Security & Best Practices

-- Role-based access control
CREATE ROLE readonly;
GRANT CONNECT ON DATABASE mydb TO readonly;
GRANT USAGE ON SCHEMA public TO readonly;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO readonly;
ALTER DEFAULT PRIVILEGES IN SCHEMA public 
GRANT SELECT ON TABLES TO readonly;

CREATE USER analyst WITH PASSWORD 'password';
GRANT readonly TO analyst;

-- Row-level security
CREATE TABLE documents (
  id SERIAL PRIMARY KEY,
  user_id INT,
  content TEXT
);

ALTER TABLE documents ENABLE ROW LEVEL SECURITY;

CREATE POLICY user_documents ON documents
FOR ALL
USING (user_id = current_setting('app.user_id')::INT);

-- Set user context
SET app.user_id = 123;
SELECT * FROM documents;  -- Only sees user 123's documents
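-- With pooled connections, scope the setting to one transaction with SET LOCAL
-- so it cannot leak into the next request (a sketch of the same pattern):
BEGIN;
SET LOCAL app.user_id = '123';
SELECT * FROM documents;
COMMIT;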

-- SSL connections
# postgresql.conf
ssl = on
ssl_cert_file = '/path/to/server.crt'
ssl_key_file = '/path/to/server.key'

# Require SSL
# pg_hba.conf
hostssl all all 0.0.0.0/0 md5

-- Connection encryption (node-postgres)
const fs = require('fs');

const pool = new Pool({
  ssl: {
    rejectUnauthorized: true,
    ca: fs.readFileSync('/path/to/ca.crt').toString(),
  }
});

-- Backup strategies
# Full backup
pg_dump mydb > backup.sql

# Compressed backup
pg_dump mydb | gzip > backup.sql.gz

# Parallel backup (faster)
pg_dump -Fd mydb -j 4 -f backup_dir/

# Restore
psql mydb < backup.sql
pg_restore -d mydb backup_dir/

# Continuous archiving (PITR)
# postgresql.conf
archive_mode = on
archive_command = 'cp %p /backup/archive/%f'

# Base backup
pg_basebackup -D /backup/base -Ft -z -P

# Point-in-time recovery (PostgreSQL 12+)
# Create an empty recovery.signal file in the data directory,
# then set these in postgresql.conf:
restore_command = 'cp /backup/archive/%f %p'
recovery_target_time = '2024-01-15 14:30:00'

-- Monitoring
# postgresql.conf (requires a server restart)
shared_preload_libraries = 'pg_stat_statements'

-- Then enable the extension in each database
CREATE EXTENSION pg_stat_statements;

-- Top slow queries
SELECT 
  query,
  calls,
  mean_exec_time,
  total_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

8. Best Practices

✓ PostgreSQL Best Practices:

✓ Use EXPLAIN ANALYZE to confirm that queries use the indexes you expect
✓ Index selectively: partial, expression, and covering indexes beat indexing everything
✓ Partition very large tables and drop old partitions instead of bulk-deleting rows
✓ Use connection pooling (PgBouncer or application-side pools) to cap connection counts
✓ Keep autovacuum enabled and run VACUUM ANALYZE after large data changes
✓ Set up replication for high availability and test failover before you need it
✓ Take regular backups (pg_dump or base backups with WAL archiving) and test restores
✓ Monitor pg_stat_statements, cache hit ratio, and long-running queries
✓ Lock down access with roles, row-level security, and SSL connections

Conclusion

PostgreSQL offers powerful features for scalable, high-performance applications. Master advanced queries, implement proper indexing strategies, use partitioning for large tables, set up replication for high availability, and continuously monitor performance. Regular optimization and maintenance ensure your database performs well under load.

💡 Pro Tip: Use pg_stat_statements extension to identify your slowest queries in production. Combined with auto_explain (logs slow query plans automatically), you can proactively optimize before performance becomes an issue. Set up automated alerts when cache hit ratio drops below 99% or query execution time exceeds thresholds. Consider using tools like pgBadger for comprehensive log analysis and Patroni for automated failover in production clusters.
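For instance, a minimal auto_explain setup might look like the following (the 500ms threshold is illustrative; changing shared_preload_libraries requires a restart, while LOAD applies only to the current session):

# postgresql.conf
shared_preload_libraries = 'pg_stat_statements,auto_explain'
auto_explain.log_min_duration = '500ms'
auto_explain.log_analyze = on

-- Or experiment in a single session
LOAD 'auto_explain';
SET auto_explain.log_min_duration = '500ms';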