Spatial Preprocessing
QC, filtering, normalization, and feature selection for spatial data.
Required Imports
python
import squidpy as sq import scanpy as sc import numpy as np import matplotlib.pyplot as plt
Calculate QC Metrics
python
# Calculate standard QC metrics sc.pp.calculate_qc_metrics(adata, inplace=True) # View QC columns print(adata.obs[['total_counts', 'n_genes_by_counts']].describe()) print(adata.var[['total_counts', 'n_cells_by_counts']].describe())
Calculate Mitochondrial Content
python
# Mark mitochondrial genes
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Calculate percent mitochondrial
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
print(f"Mean MT%: {adata.obs['pct_counts_mt'].mean():.1f}")
Visualize QC Metrics on Tissue
python
# Plot QC metrics spatially sq.pl.spatial_scatter(adata, color=['total_counts', 'n_genes_by_counts', 'pct_counts_mt'], ncols=3) # Or with Scanpy sc.pl.spatial(adata, color=['total_counts', 'n_genes_by_counts'], spot_size=1.5)
QC Metric Distributions
python
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
axes[0].hist(adata.obs['total_counts'], bins=50)
axes[0].set_xlabel('Total counts')
axes[1].hist(adata.obs['n_genes_by_counts'], bins=50)
axes[1].set_xlabel('Genes detected')
axes[2].hist(adata.obs['pct_counts_mt'], bins=50)
axes[2].set_xlabel('MT %')
plt.tight_layout()
Filter Spots
python
# Filter based on QC metrics
print(f'Before filtering: {adata.n_obs} spots')
# Minimum counts and genes
sc.pp.filter_cells(adata, min_counts=500)
sc.pp.filter_cells(adata, min_genes=200)
# Maximum mitochondrial content
adata = adata[adata.obs['pct_counts_mt'] < 20].copy()
print(f'After filtering: {adata.n_obs} spots')
Filter Genes
python
# Remove genes detected in few spots
print(f'Before filtering: {adata.n_vars} genes')
sc.pp.filter_genes(adata, min_cells=10)
print(f'After filtering: {adata.n_vars} genes')
Normalization
python
# Store raw counts adata.layers['counts'] = adata.X.copy() # Normalize to median total counts sc.pp.normalize_total(adata, target_sum=1e4) # Log transform sc.pp.log1p(adata)
SCTransform-like Normalization
python
# Pearson residuals normalization (similar to SCTransform) # Requires raw counts adata_raw = adata.copy() adata_raw.X = adata_raw.layers['counts'] sc.experimental.pp.normalize_pearson_residuals(adata_raw) adata.layers['pearson'] = adata_raw.X.copy()
Highly Variable Genes
python
# Find HVGs
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat_v3', layer='counts')
# View HVG stats
print(f"Found {adata.var['highly_variable'].sum()} HVGs")
sc.pl.highly_variable_genes(adata)
Spatially Variable Genes
python
# Compute spatial neighbors first
sq.gr.spatial_neighbors(adata, coord_type='generic', n_neighs=6)
# Find spatially variable genes using Moran's I
sq.gr.spatial_autocorr(adata, mode='moran', genes=adata.var_names[:1000])
# Get top spatially variable genes
svg = adata.uns['moranI'].sort_values('I', ascending=False)
print('Top spatially variable genes:')
print(svg.head(20))
Combine HVG and SVG
python
# Get union of highly variable and spatially variable genes
hvg = set(adata.var_names[adata.var['highly_variable']])
svg_top = set(adata.uns['moranI'].head(500).index)
selected_genes = hvg | svg_top
print(f'HVG: {len(hvg)}, SVG: {len(svg_top)}, Union: {len(selected_genes)}')
# Subset to selected genes for downstream
adata_subset = adata[:, list(selected_genes)].copy()
Scale Data
python
# Scale for PCA (use log-normalized data) sc.pp.scale(adata, max_value=10)
PCA
python
# Run PCA sc.tl.pca(adata, n_comps=50) # Variance explained sc.pl.pca_variance_ratio(adata, n_pcs=50)
Complete Preprocessing Pipeline
python
import squidpy as sq
import scanpy as sc
# Load data
adata = sq.read.visium('spaceranger_output/')
# QC
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
# Filter
sc.pp.filter_cells(adata, min_counts=1000)
sc.pp.filter_cells(adata, min_genes=500)
adata = adata[adata.obs['pct_counts_mt'] < 20].copy()
sc.pp.filter_genes(adata, min_cells=10)
# Normalize
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# HVGs
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat_v3', layer='counts')
# Scale and PCA
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50)
print(f'Preprocessed: {adata.n_obs} spots, {adata.n_vars} genes')
adata.write_h5ad('preprocessed.h5ad')
Related Skills
- •spatial-data-io - Load spatial data
- •spatial-neighbors - Build spatial graphs
- •single-cell/preprocessing - Non-spatial preprocessing