Skip to contents

Introduction

This case study demonstrates how to use scPAS to identify survival-associated cell subpopulations in cancer single-cell data by integrating bulk RNA-seq data with patient survival information.

Biological Question

Which cell subpopulations in the tumor microenvironment are associated with patient prognosis?

Analysis Overview

  1. Prepare single-cell RNA-seq data from tumor samples
  2. Prepare bulk RNA-seq data with survival outcomes
  3. Train scPAS model with Cox regression
  4. Identify cells associated with good/poor prognosis
  5. Characterize survival-associated subpopulations

Simulated Cancer Data

For this demonstration, we create realistic simulated data mimicking a tumor microenvironment study.

Single-Cell Data: Tumor Microenvironment

set.seed(123)

n_genes <- 800
n_cells <- 1000

# Create sparse count matrix
counts <- matrix(0, nrow = n_genes, ncol = n_cells)
for (i in 1:n_genes) {
  # Variable expression rates
  lambda <- sample(c(1, 3, 5, 10), 1, prob = c(0.3, 0.4, 0.2, 0.1))
  counts[i, ] <- rpois(n_cells, lambda)
}
rownames(counts) <- paste0("Gene", 1:n_genes)
colnames(counts) <- paste0("Cell", 1:n_cells)

# Create Seurat object
tumor_sc <- CreateSeuratObject(counts = counts, project = "TumorME")

# Define cell types (typical tumor microenvironment)
cell_types <- c(
  rep("Malignant", 300),       # Tumor cells
  rep("CD8_T_exhausted", 100), # Exhausted T cells
  rep("CD8_T_effector", 100),  # Effector T cells
  rep("Treg", 80),             # Regulatory T cells
  rep("TAM_M1", 80),           # M1 macrophages (anti-tumor)
  rep("TAM_M2", 120),          # M2 macrophages (pro-tumor)
  rep("CAF", 100),             # Cancer-associated fibroblasts
  rep("Endothelial", 70),      # Endothelial cells
  rep("DC", 50)                # Dendritic cells
)
tumor_sc$celltype <- cell_types

# Standard preprocessing
tumor_sc <- NormalizeData(tumor_sc, verbose = FALSE)
tumor_sc <- FindVariableFeatures(tumor_sc, nfeatures = 500, verbose = FALSE)
tumor_sc <- ScaleData(tumor_sc, verbose = FALSE)
tumor_sc <- RunPCA(tumor_sc, npcs = 30, verbose = FALSE)
tumor_sc <- RunUMAP(tumor_sc, dims = 1:20, verbose = FALSE)

# Visualize
DimPlot(tumor_sc, group.by = "celltype", label = TRUE, repel = TRUE) +
  ggtitle("Tumor Microenvironment Cell Types")

Bulk Data: TCGA-like Cohort with Survival

set.seed(456)

n_bulk_samples <- 80

# Create bulk expression matrix
bulk_data <- matrix(
  rnorm(n_genes * n_bulk_samples, mean = 8, sd = 2),
  nrow = n_genes,
  ncol = n_bulk_samples
)
rownames(bulk_data) <- paste0("Gene", 1:n_genes)
colnames(bulk_data) <- paste0("Patient", 1:n_bulk_samples)

# Simulate survival data
# Higher expression of certain genes → worse survival
prognostic_genes <- sample(1:n_genes, 50)
risk_score_bulk <- colMeans(bulk_data[prognostic_genes, ])

# Generate survival times (exponential with risk-dependent rate)
base_survival <- 1000  # days
survival_time <- rexp(n_bulk_samples, rate = 0.001 * exp(scale(risk_score_bulk)))
survival_time <- pmin(survival_time, 2000)  # Cap at 2000 days

# Generate censoring
censor_time <- runif(n_bulk_samples, 500, 2500)
observed_time <- pmin(survival_time, censor_time)
event_status <- as.integer(survival_time <= censor_time)

# Create survival object
survival_phenotype <- Surv(time = observed_time, event = event_status)

cat("Bulk samples:", n_bulk_samples, "\n")
#> Bulk samples: 80
cat("Events:", sum(event_status), "\n")
#> Events: 56
cat("Censored:", sum(1 - event_status), "\n")
#> Censored: 24
cat("Median follow-up:", median(observed_time), "days\n")
#> Median follow-up: 580.6118 days

Visualize Survival Data

# Split by median risk
risk_group <- ifelse(risk_score_bulk > median(risk_score_bulk), "High", "Low")

km_data <- data.frame(
  time = observed_time,
  status = event_status,
  risk_group = risk_group
)

fit <- survfit(Surv(time, status) ~ risk_group, data = km_data)

ggsurvplot(
  fit,
  data = km_data,
  pval = TRUE,
  risk.table = TRUE,
  palette = c("royalblue", "indianred"),
  title = "Bulk Sample Survival by Risk Score",
  xlab = "Time (days)",
  ylab = "Survival Probability"
)

Run scPAS with Cox Regression

# Run scPAS analysis
result <- scPAS(
  bulk_dataset = bulk_data,
  sc_dataset = tumor_sc,
  phenotype = survival_phenotype,
  family = "cox",
  nfeature = 300,
  permutation_times = 200,  # Use 1000+ in practice
  do_imputation = FALSE,
  n_cores = 1,
  FDR.threshold = 0.05
)

# Summary
cat("Total cells:", ncol(result), "\n")
#> Total cells: 1000
cat("scPAS+ (poor prognosis):", sum(result$scPAS == "scPAS+", na.rm = TRUE), "\n")
#> scPAS+ (poor prognosis): 1
cat("scPAS- (good prognosis):", sum(result$scPAS == "scPAS-", na.rm = TRUE), "\n")
#> scPAS- (good prognosis): 3
cat("Non-significant:", sum(result$scPAS == "0", na.rm = TRUE), "\n")
#> Non-significant: 996

Visualize Results

UMAP Overview

# Cell type colors
ct_colors <- c(
  "Malignant" = "#E41A1C",
  "CD8_T_exhausted" = "#377EB8",
  "CD8_T_effector" = "#4DAF4A",
  "Treg" = "#984EA3",
  "TAM_M1" = "#FF7F00",
  "TAM_M2" = "#FFFF33",
  "CAF" = "#A65628",
  "Endothelial" = "#F781BF",
  "DC" = "#999999"
)

class_colors <- c("scPAS-" = "royalblue", "0" = "gray80", "scPAS+" = "indianred")

p1 <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE) +
  ggtitle("Cell Types") + NoLegend()

p2 <- FeaturePlot(result, features = "scPAS_NRS") +
  scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", midpoint = 0) +
  ggtitle("Risk Score")

p3 <- DimPlot(result, group.by = "scPAS", cols = class_colors,
              order = c("0", "scPAS-", "scPAS+")) +
  ggtitle("Prognosis Association")

p1 | p2 | p3

Cell Type Enrichment

# Calculate enrichment
enrichment_table <- table(result$celltype, result$scPAS)
enrichment_df <- as.data.frame(enrichment_table)
colnames(enrichment_df) <- c("CellType", "scPAS", "Count")

# Calculate proportions
enrichment_df <- enrichment_df %>%
  dplyr::group_by(CellType) %>%
  dplyr::mutate(
    Total = sum(Count),
    Proportion = Count / Total * 100
  ) %>%
  dplyr::ungroup()

# Focus on scPAS+ (poor prognosis)
scpas_positive <- enrichment_df %>%
  dplyr::filter(scPAS == "scPAS+") %>%
  dplyr::arrange(desc(Proportion))

ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(values = ct_colors) +
  coord_flip() +
  labs(
    x = "",
    y = "% scPAS+ cells",
    title = "Cell Types Enriched for Poor Prognosis (scPAS+)"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

Violin Plot by Cell Type

# Order by median risk score
cell_order <- result@meta.data %>%
  dplyr::group_by(celltype) %>%
  dplyr::summarise(median_risk = median(scPAS_NRS, na.rm = TRUE)) %>%
  dplyr::arrange(desc(median_risk)) %>%
  dplyr::pull(celltype)

result$celltype <- factor(result$celltype, levels = cell_order)

VlnPlot(result, features = "scPAS_NRS", group.by = "celltype",
        cols = ct_colors[cell_order], pt.size = 0) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
  stat_summary(fun = median, geom = "point", size = 2, color = "black") +
  labs(
    x = "Cell Type (ordered by median risk)",
    y = "Normalized Risk Score",
    title = "Survival-Associated Risk Score by Cell Type"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

Biological Interpretation

Key Findings

Based on the analysis (with real data, findings would differ):

# Create interpretation summary
interpretation_data <- data.frame(
  CellType = c("TAM_M2", "Treg", "CAF", "CD8_T_exhausted", 
               "TAM_M1", "CD8_T_effector", "DC"),
  Association = c("Poor", "Poor", "Poor", "Poor",
                  "Good", "Good", "Good"),
  Mechanism = c(
    "Immunosuppressive",
    "Immunosuppressive",
    "TME remodeling",
    "Dysfunctional",
    "Anti-tumor",
    "Cytotoxic",
    "Antigen presentation"
  )
)

interpretation_data$Association <- factor(interpretation_data$Association, levels = c("Good", "Poor"))

ggplot(interpretation_data, aes(x = reorder(CellType, as.numeric(Association)), 
                                y = 1, fill = Association)) +
  geom_tile(color = "white", size = 1) +
  geom_text(aes(label = Mechanism), size = 3.5, color = "white", fontface = "bold") +
  scale_fill_manual(values = c("Good" = "royalblue", "Poor" = "indianred")) +
  coord_flip() +
  labs(
    x = "",
    y = "",
    title = "Biological Interpretation of Survival Associations",
    fill = "Prognosis"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

Publication Figure

# Create comprehensive figure
layout <- "
AABB
CCDD
EEEE
"

p_a <- DimPlot(result, group.by = "celltype", cols = ct_colors, label = TRUE, label.size = 3) +
  ggtitle("A. Tumor Microenvironment") + NoLegend()

p_b <- FeaturePlot(result, features = "scPAS_NRS", pt.size = 0.5) +
  scale_color_gradient2(low = "royalblue", mid = "white", high = "indianred", 
                        midpoint = 0, name = "Risk\nScore") +
  ggtitle("B. Survival Risk Score")

p_c <- DimPlot(result, group.by = "scPAS", cols = class_colors, pt.size = 0.5,
               order = c("0", "scPAS-", "scPAS+")) +
  ggtitle("C. Prognosis Classification")

p_d <- ggplot(scpas_positive, aes(x = reorder(CellType, Proportion), y = Proportion, fill = CellType)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = ct_colors) +
  coord_flip() +
  labs(x = "", y = "% Poor Prognosis (scPAS+)") +
  ggtitle("D. Cell Type Enrichment") +
  theme(legend.position = "none")

p_e <- VlnPlot(result, features = "scPAS_NRS", group.by = "celltype",
               cols = ct_colors[cell_order], pt.size = 0) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  ggtitle("E. Risk Score Distribution by Cell Type") +
  NoLegend() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_a + p_b + p_c + p_d + p_e + plot_layout(design = layout)

Model Application to New Data

The trained model can be applied to independent datasets:

# Get trained model parameters
model_params <- result@misc$scPAS_para
head(sort(model_params$Coefs, decreasing = TRUE))

# Apply to new bulk data
new_bulk_predictions <- scPAS.prediction(
  model = result,
  test.data = new_bulk_expression,
  do_imputation = FALSE
)

# Apply to spatial transcriptomics
spatial_predictions <- scPAS.prediction(
  model = result,
  test.data = spatial_seurat,
  assay = "Spatial",
  do_imputation = TRUE
)

Key Takeaways

  1. scPAS+ cells (associated with poor prognosis):
    • M2 tumor-associated macrophages (immunosuppressive)
    • Regulatory T cells (Tregs)
    • Cancer-associated fibroblasts
    • Exhausted CD8+ T cells
  2. scPAS- cells (associated with good prognosis):
    • M1 macrophages (anti-tumor)
    • Effector CD8+ T cells
    • Dendritic cells
  3. Clinical implications:
    • Target immunosuppressive populations for therapy
    • Enhance anti-tumor immune responses
    • Biomarker development for prognosis prediction

Session Information

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS 15.6.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
#> 
#> locale:
#> [1] C
#> 
#> time zone: Asia/Shanghai
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] patchwork_1.3.2    RColorBrewer_1.1-3 survminer_0.5.1    ggpubr_0.6.2      
#>  [5] survival_3.8-3     ggplot2_4.0.1      Matrix_1.7-4       SeuratObject_4.1.4
#>  [9] Seurat_4.4.0       scPAS_1.0.3       
#> 
#> loaded via a namespace (and not attached):
#>   [1] jsonlite_2.0.0         magrittr_2.0.4         ggbeeswarm_0.7.3      
#>   [4] spatstat.utils_3.2-1   farver_2.1.2           rmarkdown_2.30        
#>   [7] fs_1.6.6               ragg_1.5.0             vctrs_0.7.0           
#>  [10] ROCR_1.0-11            spatstat.explore_3.6-0 rstatix_0.7.3         
#>  [13] htmltools_0.5.9        broom_1.0.11           Formula_1.2-5         
#>  [16] sass_0.4.10            sctransform_0.4.3      parallelly_1.46.1     
#>  [19] KernSmooth_2.23-26     bslib_0.9.0            htmlwidgets_1.6.4     
#>  [22] desc_1.4.3             ica_1.0-3              plyr_1.8.9            
#>  [25] plotly_4.11.0          zoo_1.8-15             cachem_1.1.0          
#>  [28] commonmark_2.0.0       igraph_2.2.1           mime_0.13             
#>  [31] lifecycle_1.0.5        pkgconfig_2.0.3        R6_2.6.1              
#>  [34] fastmap_1.2.0          fitdistrplus_1.2-4     future_1.69.0         
#>  [37] shiny_1.12.1           digest_0.6.39          tensor_1.5.1          
#>  [40] RSpectra_0.16-2        irlba_2.3.5.1          textshaping_1.0.4     
#>  [43] labeling_0.4.3         progressr_0.18.0       km.ci_0.5-6           
#>  [46] spatstat.sparse_3.1-0  httr_1.4.7             polyclip_1.10-7       
#>  [49] abind_1.4-8            compiler_4.4.0         withr_3.0.2           
#>  [52] S7_0.2.1               backports_1.5.0        carData_3.0-5         
#>  [55] ggsignif_0.6.4         MASS_7.3-65            tools_4.4.0           
#>  [58] vipor_0.4.7            lmtest_0.9-40          otel_0.2.0            
#>  [61] beeswarm_0.4.0         httpuv_1.6.16          future.apply_1.20.1   
#>  [64] goftest_1.2-3          glue_1.8.0             nlme_3.1-168          
#>  [67] promises_1.5.0         gridtext_0.1.5         grid_4.4.0            
#>  [70] Rtsne_0.17             cluster_2.1.8.1        reshape2_1.4.5        
#>  [73] generics_0.1.4         gtable_0.3.6           spatstat.data_3.1-9   
#>  [76] KMsurv_0.1-6           preprocessCore_1.68.0  tidyr_1.3.2           
#>  [79] data.table_1.18.0      xml2_1.5.2             sp_2.2-0              
#>  [82] car_3.1-3              spatstat.geom_3.6-1    RcppAnnoy_0.0.23      
#>  [85] markdown_2.0           ggrepel_0.9.6          RANN_2.6.2            
#>  [88] pillar_1.11.1          stringr_1.6.0          later_1.4.5           
#>  [91] splines_4.4.0          ggtext_0.1.2           dplyr_1.1.4           
#>  [94] lattice_0.22-7         deldir_2.0-4           tidyselect_1.2.1      
#>  [97] miniUI_0.1.2           pbapply_1.7-4          knitr_1.51            
#> [100] gridExtra_2.3          litedown_0.9           scattermore_1.2       
#> [103] xfun_0.56              matrixStats_1.5.0      stringi_1.8.7         
#> [106] lazyeval_0.2.2         yaml_2.3.12            evaluate_1.0.5        
#> [109] codetools_0.2-20       tibble_3.3.1           cli_3.6.5             
#> [112] uwot_0.2.4             xtable_1.8-4           reticulate_1.44.1     
#> [115] systemfonts_1.3.1      jquerylib_0.1.4        survMisc_0.5.6        
#> [118] dichromat_2.0-0.1      Rcpp_1.1.1             globals_0.18.0        
#> [121] spatstat.random_3.4-3  png_0.1-8              ggrastr_1.0.2         
#> [124] spatstat.univar_3.1-6  parallel_4.4.0         pkgdown_2.1.3         
#> [127] listenv_0.10.0         viridisLite_0.4.2      scales_1.4.0          
#> [130] ggridges_0.5.7         leiden_0.4.3.1         purrr_1.2.1           
#> [133] rlang_1.1.7            cowplot_1.2.0