Overview
Analyzing CNVs across multiple samples enables:
- Cohort-level patterns: Identify recurrent CNV events
- Sample comparison: Compare CNV profiles between patients
- Batch integration: Combine samples with shared reference
This vignette demonstrates multi-sample workflows in fastCNV.
Running Multi-Sample Analysis
Pooled Reference Analysis
When samples share similar normal cell populations, use a pooled reference:
results <- fastCNV(
seuratObj = sample_list,
sampleName = sample_names,
referenceVar = "cell_type",
referenceLabel = c("Normal_epithelial", "Fibroblast"),
pooledReference = TRUE, # Combine references across samples
prepareCounts = TRUE,
getCNVPerChromosomeArm = TRUE,
getCNVClusters = TRUE
)When to use pooled reference:
| Scenario | Pooled Reference | Individual Reference |
|---|---|---|
| Same tissue type | ✓ Recommended | ✓ Also valid |
| Different tissues | ✗ Avoid | ✓ Recommended |
| Batch effects | ✗ May amplify | ✓ Reduces batch effects |
| Low reference cells | ✓ Increases power | ✗ May be noisy |
Cross-Sample Comparisons
Comparing CNV Patterns
library(dplyr)
library(ggplot2)
# Extract CNV fractions from all samples
cnv_summary <- lapply(seq_along(results), function(i) {
data.frame(
sample = sample_names[i],
cnv_fraction = results[[i]]$cnv_fraction,
cnv_cluster = results[[i]]$cnv_clusters
)
}) %>% bind_rows()
# Compare CNV burden across samples
ggplot(cnv_summary, aes(x = sample, y = cnv_fraction, fill = sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "CNV Burden Comparison Across Samples",
x = "Sample",
y = "CNV Fraction"
)Chromosome Arm Comparison
# Extract arm-level CNVs
arm_summary <- lapply(seq_along(results), function(i) {
arm_data <- results[[i]]@meta.data %>%
select(starts_with("arm_")) %>%
colMeans()
data.frame(
sample = sample_names[i],
arm = names(arm_data),
mean_cnv = arm_data
)
}) %>% bind_rows()
# Heatmap of arm-level CNVs
library(tidyr)
arm_matrix <- arm_summary %>%
pivot_wider(names_from = sample, values_from = mean_cnv) %>%
column_to_rownames("arm") %>%
as.matrix()
heatmap(arm_matrix, scale = "none", col = colorRampPalette(c("blue", "white", "red"))(100))Identifying Recurrent CNVs
Finding Common Events
# Define CNV presence threshold
cnv_threshold <- 0.15
# Function to identify recurrent CNVs
identify_recurrent_cnvs <- function(results, threshold = 0.15) {
# Get CNV calls per sample
cnv_calls <- lapply(results, function(obj) {
scores <- as.matrix(obj@assays$CNVScoresTrimmed@data)
# Fraction of cells with CNV per window
colMeans(abs(scores) > threshold)
})
# Combine across samples
cnv_matrix <- do.call(cbind, cnv_calls)
colnames(cnv_matrix) <- sample_names
# Identify windows with CNV in multiple samples
recurrent <- rowSums(cnv_matrix > 0.1) >= 2 # CNV in ≥2 samples
list(
matrix = cnv_matrix,
recurrent_windows = which(recurrent)
)
}
recurrent <- identify_recurrent_cnvs(results)
message("Recurrent CNV regions: ", length(recurrent$recurrent_windows))Visualizing Recurrent CNVs
# Create recurrence plot
recurrence_data <- data.frame(
window = 1:nrow(recurrent$matrix),
n_samples = rowSums(recurrent$matrix > 0.1)
)
ggplot(recurrence_data, aes(x = window, y = n_samples)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_hline(yintercept = 2, linetype = "dashed", color = "red") +
theme_minimal() +
labs(
title = "CNV Recurrence Across Samples",
x = "Genomic Window",
y = "Number of Samples with CNV"
)Batch Effect Considerations
Detecting Batch Effects
# PCA on CNV profiles
cnv_profiles <- lapply(results, function(obj) {
colMeans(as.matrix(obj@assays$CNVScores@data))
})
cnv_matrix <- do.call(rbind, cnv_profiles)
rownames(cnv_matrix) <- sample_names
# PCA
pca <- prcomp(cnv_matrix, scale = TRUE)
plot(pca$x[, 1:2], pch = 19, col = 1:length(sample_names))
text(pca$x[, 1:2], labels = sample_names, pos = 3)Mitigating Batch Effects
# Strategy 1: Use sample-specific references (default)
results_individual <- fastCNV(
seuratObj = sample_list,
sampleName = sample_names,
referenceVar = "cell_type",
referenceLabel = c("Normal"),
pooledReference = FALSE # Each sample uses its own reference
)
# Strategy 2: Regress out batch in downstream analysis
# After merging, use Seurat integration methodsCohort-Level Visualization
Combined Heatmap
# Merge samples for combined visualization
merged_results <- merge(results[[1]], results[2:3])
# Add sample labels
merged_results$sample_id <- factor(
merged_results$orig.ident,
levels = sample_names
)
# Plot combined heatmap (shows all samples)
plotCNVResults(
seuratObj = merged_results,
referenceVar = "cell_type",
tumorLabel = "Tumor",
splitPlotOnVar = "sample_id"
)Memory Management
Large Cohort Processing
# For large cohorts, process samples sequentially
process_sample <- function(seurat_path, sample_name, ref_var, ref_label) {
# Load sample
obj <- readRDS(seurat_path)
# Run CNV analysis
result <- fastCNV(
seuratObj = obj,
sampleName = sample_name,
referenceVar = ref_var,
referenceLabel = ref_label,
prepareCounts = TRUE,
getCNVPerChromosomeArm = TRUE,
getCNVClusters = TRUE,
doPlot = FALSE # Skip plotting to save memory
)
# Save result
saveRDS(result, paste0(sample_name, "_cnv_result.rds"))
# Clean up
rm(obj)
invisible(gc())
return(invisible(NULL))
}
# Process cohort
sample_paths <- c("patient_A.rds", "patient_B.rds", "patient_C.rds")
sample_names <- c("Patient_A", "Patient_B", "Patient_C")
for (i in seq_along(sample_paths)) {
message("Processing: ", sample_names[i])
process_sample(
sample_paths[i],
sample_names[i],
ref_var = "cell_type",
ref_label = c("Normal")
)
}Parallel Processing
library(future)
library(future.apply)
# Set up parallel backend
plan(multisession, workers = 4)
# Process samples in parallel
results <- future_lapply(seq_along(sample_list), function(i) {
fastCNV(
seuratObj = sample_list[[i]],
sampleName = sample_names[i],
referenceVar = "cell_type",
referenceLabel = c("Normal"),
doPlot = FALSE
)
})
# Reset to sequential
plan(sequential)Case Study: Tumor Cohort Analysis
Workflow Example
# 1. Load cohort data
cohort <- list(
tumor_1 = readRDS("GBM_patient_1.rds"),
tumor_2 = readRDS("GBM_patient_2.rds"),
tumor_3 = readRDS("GBM_patient_3.rds")
)
# 2. Run CNV analysis with pooled reference
cnv_results <- fastCNV(
seuratObj = cohort,
sampleName = names(cohort),
referenceVar = "cell_type",
referenceLabel = c("Oligodendrocyte", "Astrocyte"),
pooledReference = TRUE,
windowSize = 150,
getCNVClusters = TRUE
)
# 3. Identify recurrent GBM-associated CNVs
# Expected: chr7 gain, chr10 loss
# 4. Compare clonal composition across patients
clone_composition <- lapply(cnv_results, function(obj) {
table(obj$cnv_clusters) / ncol(obj)
})
# 5. Export results
saveRDS(cnv_results, "GBM_cohort_cnv_results.rds")Summary
Key points for multi-sample analysis:
- Consistent annotations: Use the same cell type labels across samples
- Reference selection: Choose appropriate reference strategy (pooled vs. individual)
- Batch awareness: Consider technical variation between samples
- Memory management: Process large cohorts sequentially or in parallel
- Recurrence analysis: Identify biologically meaningful recurrent CNVs
Session Info
sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS 15.6.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
#>
#> locale:
#> [1] C
#>
#> time zone: Asia/Shanghai
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.39 desc_1.4.3 R6_2.6.1 fastmap_1.2.0
#> [5] xfun_0.56 cachem_1.1.0 knitr_1.51 htmltools_0.5.9
#> [9] rmarkdown_2.30 lifecycle_1.0.5 cli_3.6.5 sass_0.4.10
#> [13] pkgdown_2.1.3 textshaping_1.0.4 jquerylib_0.1.4 systemfonts_1.3.1
#> [17] compiler_4.4.0 tools_4.4.0 ragg_1.5.0 bslib_0.9.0
#> [21] evaluate_1.0.5 yaml_2.3.12 otel_0.2.0 jsonlite_2.0.0
#> [25] rlang_1.1.7 fs_1.6.6 htmlwidgets_1.6.4