Multi-Sample Analysis

Overview

Analyzing CNVs across multiple samples enables:

Cohort-level patterns: Identify recurrent CNV events
Sample comparison: Compare CNV profiles between patients
Batch integration: Combine samples with shared reference

This vignette demonstrates multi-sample workflows in fastCNV.

Preparing Multiple Samples

Loading Samples

library(fastCNV)
library(Seurat)

# Load individual Seurat objects
sample1 <- readRDS("patient_A.rds")
sample2 <- readRDS("patient_B.rds")
sample3 <- readRDS("patient_C.rds")

# Verify cell type annotations exist
table(sample1$cell_type)
table(sample2$cell_type)
table(sample3$cell_type)

Sample List Preparation

# Create a list of Seurat objects
sample_list <- list(sample1, sample2, sample3)

# Define sample names
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

Running Multi-Sample Analysis

Basic Multi-Sample Workflow

# Run fastCNV on multiple samples
results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Fibroblast", "T_cell", "B_cell"),
  prepareCounts = TRUE,
  getCNVPerChromosomeArm = TRUE,
  getCNVClusters = TRUE,
  doPlot = TRUE
)

Pooled Reference Analysis

When samples share similar normal cell populations, use a pooled reference:

results <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Normal_epithelial", "Fibroblast"),
  pooledReference = TRUE,  # Combine references across samples
  prepareCounts = TRUE,
  getCNVPerChromosomeArm = TRUE,
  getCNVClusters = TRUE
)

When to use pooled reference:

Scenario	Pooled Reference	Individual Reference
Same tissue type	✓ Recommended	✓ Also valid
Different tissues	✗ Avoid	✓ Recommended
Batch effects	✗ May amplify	✓ Reduces batch effects
Low reference cells	✓ Increases power	✗ May be noisy

Handling Results

Accessing Individual Results

# Results are returned as a list
length(results)  # Number of samples

# Access individual sample
patient_a <- results[[1]]
patient_b <- results[[2]]
patient_c <- results[[3]]

# Check CNV clusters per sample
table(patient_a$cnv_clusters)
table(patient_b$cnv_clusters)
table(patient_c$cnv_clusters)

Merging Results

# Merge all samples for combined analysis
merged <- merge(results[[1]], results[2:3])

# Add sample identifier
merged$sample <- merged$orig.ident

# Visualize combined data
DimPlot(merged, group.by = "cnv_clusters", split.by = "sample")

Cross-Sample Comparisons

Comparing CNV Patterns

library(dplyr)
library(ggplot2)

# Extract CNV fractions from all samples
cnv_summary <- lapply(seq_along(results), function(i) {
  data.frame(
    sample = sample_names[i],
    cnv_fraction = results[[i]]$cnv_fraction,
    cnv_cluster = results[[i]]$cnv_clusters
  )
}) %>% bind_rows()

# Compare CNV burden across samples
ggplot(cnv_summary, aes(x = sample, y = cnv_fraction, fill = sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "CNV Burden Comparison Across Samples",
    x = "Sample",
    y = "CNV Fraction"
  )

Chromosome Arm Comparison

# Extract arm-level CNVs
arm_summary <- lapply(seq_along(results), function(i) {
  arm_data <- results[[i]]@meta.data %>%
    select(starts_with("arm_")) %>%
    colMeans()
  data.frame(
    sample = sample_names[i],
    arm = names(arm_data),
    mean_cnv = arm_data
  )
}) %>% bind_rows()

# Heatmap of arm-level CNVs
library(tidyr)
arm_matrix <- arm_summary %>%
  pivot_wider(names_from = sample, values_from = mean_cnv) %>%
  column_to_rownames("arm") %>%
  as.matrix()

heatmap(arm_matrix, scale = "none", col = colorRampPalette(c("blue", "white", "red"))(100))

Identifying Recurrent CNVs

Finding Common Events

# Define CNV presence threshold
cnv_threshold <- 0.15

# Function to identify recurrent CNVs
identify_recurrent_cnvs <- function(results, threshold = 0.15) {
  # Get CNV calls per sample
  cnv_calls <- lapply(results, function(obj) {
    scores <- as.matrix(obj@assays$CNVScoresTrimmed@data)
    # Fraction of cells with CNV per window
    colMeans(abs(scores) > threshold)
  })
  
  # Combine across samples
  cnv_matrix <- do.call(cbind, cnv_calls)
  colnames(cnv_matrix) <- sample_names
  
  # Identify windows with CNV in multiple samples
  recurrent <- rowSums(cnv_matrix > 0.1) >= 2  # CNV in ≥2 samples
  
  list(
    matrix = cnv_matrix,
    recurrent_windows = which(recurrent)
  )
}

recurrent <- identify_recurrent_cnvs(results)
message("Recurrent CNV regions: ", length(recurrent$recurrent_windows))

Visualizing Recurrent CNVs

# Create recurrence plot
recurrence_data <- data.frame(
  window = 1:nrow(recurrent$matrix),
  n_samples = rowSums(recurrent$matrix > 0.1)
)

ggplot(recurrence_data, aes(x = window, y = n_samples)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_hline(yintercept = 2, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(
    title = "CNV Recurrence Across Samples",
    x = "Genomic Window",
    y = "Number of Samples with CNV"
  )

Batch Effect Considerations

Detecting Batch Effects

# PCA on CNV profiles
cnv_profiles <- lapply(results, function(obj) {
  colMeans(as.matrix(obj@assays$CNVScores@data))
})
cnv_matrix <- do.call(rbind, cnv_profiles)
rownames(cnv_matrix) <- sample_names

# PCA
pca <- prcomp(cnv_matrix, scale = TRUE)
plot(pca$x[, 1:2], pch = 19, col = 1:length(sample_names))
text(pca$x[, 1:2], labels = sample_names, pos = 3)

Mitigating Batch Effects

# Strategy 1: Use sample-specific references (default)
results_individual <- fastCNV(
  seuratObj = sample_list,
  sampleName = sample_names,
  referenceVar = "cell_type",
  referenceLabel = c("Normal"),
  pooledReference = FALSE  # Each sample uses its own reference
)

# Strategy 2: Regress out batch in downstream analysis
# After merging, use Seurat integration methods

Cohort-Level Visualization

Combined Heatmap

# Merge samples for combined visualization
merged_results <- merge(results[[1]], results[2:3])

# Add sample labels
merged_results$sample_id <- factor(
  merged_results$orig.ident,
  levels = sample_names
)

# Plot combined heatmap (shows all samples)
plotCNVResults(
  seuratObj = merged_results,
  referenceVar = "cell_type",
  tumorLabel = "Tumor",
  splitPlotOnVar = "sample_id"
)

Sample-Wise CNV Tree

# Build tree for each sample
trees <- lapply(results, function(obj) {
  CNVTree(
    seuratObj = obj,
    referenceVar = "cell_type",
    tumorLabel = "Tumor",
    cnv_thresh = 0.15
  )
})

# Plot trees side by side
par(mfrow = c(1, 3))
for (i in seq_along(trees)) {
  plot(trees[[i]], main = sample_names[i])
}

Memory Management

Large Cohort Processing

# For large cohorts, process samples sequentially
process_sample <- function(seurat_path, sample_name, ref_var, ref_label) {
  # Load sample
  obj <- readRDS(seurat_path)
  
  # Run CNV analysis
  result <- fastCNV(
    seuratObj = obj,
    sampleName = sample_name,
    referenceVar = ref_var,
    referenceLabel = ref_label,
    prepareCounts = TRUE,
    getCNVPerChromosomeArm = TRUE,
    getCNVClusters = TRUE,
    doPlot = FALSE  # Skip plotting to save memory
  )
  
  # Save result
  saveRDS(result, paste0(sample_name, "_cnv_result.rds"))
  
  # Clean up
  rm(obj)
  invisible(gc())
  
  return(invisible(NULL))
}

# Process cohort
sample_paths <- c("patient_A.rds", "patient_B.rds", "patient_C.rds")
sample_names <- c("Patient_A", "Patient_B", "Patient_C")

for (i in seq_along(sample_paths)) {
  message("Processing: ", sample_names[i])
  process_sample(
    sample_paths[i],
    sample_names[i],
    ref_var = "cell_type",
    ref_label = c("Normal")
  )
}

Parallel Processing

library(future)
library(future.apply)

# Set up parallel backend
plan(multisession, workers = 4)

# Process samples in parallel
results <- future_lapply(seq_along(sample_list), function(i) {
  fastCNV(
    seuratObj = sample_list[[i]],
    sampleName = sample_names[i],
    referenceVar = "cell_type",
    referenceLabel = c("Normal"),
    doPlot = FALSE
  )
})

# Reset to sequential
plan(sequential)

Case Study: Tumor Cohort Analysis

Workflow Example

# 1. Load cohort data
cohort <- list(
  tumor_1 = readRDS("GBM_patient_1.rds"),
  tumor_2 = readRDS("GBM_patient_2.rds"),
  tumor_3 = readRDS("GBM_patient_3.rds")
)

# 2. Run CNV analysis with pooled reference
cnv_results <- fastCNV(
  seuratObj = cohort,
  sampleName = names(cohort),
  referenceVar = "cell_type",
  referenceLabel = c("Oligodendrocyte", "Astrocyte"),
  pooledReference = TRUE,
  windowSize = 150,
  getCNVClusters = TRUE
)

# 3. Identify recurrent GBM-associated CNVs
# Expected: chr7 gain, chr10 loss

# 4. Compare clonal composition across patients
clone_composition <- lapply(cnv_results, function(obj) {
  table(obj$cnv_clusters) / ncol(obj)
})

# 5. Export results
saveRDS(cnv_results, "GBM_cohort_cnv_results.rds")

Summary

Key points for multi-sample analysis:

Consistent annotations: Use the same cell type labels across samples
Reference selection: Choose appropriate reference strategy (pooled vs. individual)
Batch awareness: Consider technical variation between samples
Memory management: Process large cohorts sequentially or in parallel
Recurrence analysis: Identify biologically meaningful recurrent CNVs

Session Info

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS 15.6.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
#> 
#> locale:
#> [1] C
#> 
#> time zone: Asia/Shanghai
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.39     desc_1.4.3        R6_2.6.1          fastmap_1.2.0    
#>  [5] xfun_0.56         cachem_1.1.0      knitr_1.51        htmltools_0.5.9  
#>  [9] rmarkdown_2.30    lifecycle_1.0.5   cli_3.6.5         sass_0.4.10      
#> [13] pkgdown_2.1.3     textshaping_1.0.4 jquerylib_0.1.4   systemfonts_1.3.1
#> [17] compiler_4.4.0    tools_4.4.0       ragg_1.5.0        bslib_0.9.0      
#> [21] evaluate_1.0.5    yaml_2.3.12       otel_0.2.0        jsonlite_2.0.0   
#> [25] rlang_1.1.7       fs_1.6.6          htmlwidgets_1.6.4

Zaoqu Liu

2026-01-23