Skip to contents

Introduction

CytoSPACER is designed for high performance through C++ implementations and parallel processing. This vignette provides guidance on optimizing performance for large datasets.

Performance Architecture

CytoSPACER uses several strategies for optimal performance:

  1. C++ Backend: Core algorithms implemented in C++ via Rcpp
  2. Parallel Processing: Multi-core support via the future framework
  3. Chunked Processing: Memory-efficient handling of large datasets
  4. Sparse Matrix Support: Efficient handling of sparse data

Benchmarking

C++ vs R Implementation

The C++ implementations are optimized for large problems. Note that on small examples such as the one below, base R's BLAS-backed vectorized code can actually be faster — the C++ path pays off as the matrices grow:

# Create test data
set.seed(42)
n_genes <- 500
n_cells <- 1000
n_spots <- 200

sc_data <- matrix(rnorm(n_genes * n_cells), nrow = n_genes, ncol = n_cells)
st_data <- matrix(rnorm(n_genes * n_spots), nrow = n_genes, ncol = n_spots)
rownames(sc_data) <- rownames(st_data) <- paste0("Gene", seq_len(n_genes))
colnames(sc_data) <- paste0("Cell", seq_len(n_cells))
colnames(st_data) <- paste0("Spot", seq_len(n_spots))

# Benchmark correlation computation
time_cpp <- system.time({
  cost_cpp <- compute_cost_matrix(sc_data, st_data, method = "pearson", use_cpp = TRUE)
})

time_r <- system.time({
  cost_r <- compute_cost_matrix(sc_data, st_data, method = "pearson", use_cpp = FALSE)
})

cat("C++ implementation:", round(time_cpp["elapsed"], 3), "seconds\n")
#> C++ implementation: 1.222 seconds
cat("R implementation:", round(time_r["elapsed"], 3), "seconds\n")
#> R implementation: 0.048 seconds
cat("Speedup:", round(time_r["elapsed"] / time_cpp["elapsed"], 1), "x\n")
#> Speedup: 0 x

LAP Solver Performance

# Create cost matrices of different sizes
sizes <- c(100, 200, 500)
times <- numeric(length(sizes))

for (i in seq_along(sizes)) {
  n <- sizes[i]
  cost <- matrix(runif(n * n), nrow = n, ncol = n)
  
  times[i] <- system.time({
    assignment <- solve_lap(cost)
  })["elapsed"]
}

# Display results
results_df <- data.frame(
  Size = paste0(sizes, "x", sizes),
  Time_sec = round(times, 4)
)
print(results_df)
#>      Size Time_sec
#> 1 100x100    0.000
#> 2 200x200    0.003
#> 3 500x500    0.150

Memory Optimization

Sparse Matrix Handling

CytoSPACER efficiently handles sparse matrices:

library(Matrix)

# Create sparse data (typical for scRNA-seq)
sparse_data <- rsparsematrix(1000, 5000, density = 0.1)
rownames(sparse_data) <- paste0("Gene", 1:1000)
colnames(sparse_data) <- paste0("Cell", 1:5000)

# Check memory usage
dense_size <- object.size(as.matrix(sparse_data))
sparse_size <- object.size(sparse_data)

cat("Dense matrix size:", format(dense_size, units = "MB"), "\n")
#> Dense matrix size: 38.5 Mb
cat("Sparse matrix size:", format(sparse_size, units = "MB"), "\n")
#> Sparse matrix size: 6.1 Mb
cat("Memory savings:", round((1 - as.numeric(sparse_size)/as.numeric(dense_size)) * 100), "%\n")
#> Memory savings: 84 %

Downsampling

For very large scRNA-seq datasets, downsampling reduces memory and computation:

# Create large count matrix
large_counts <- matrix(rpois(500 * 10000, lambda = 10), nrow = 500, ncol = 10000)
rownames(large_counts) <- paste0("Gene", 1:500)
colnames(large_counts) <- paste0("Cell", 1:10000)

# Downsample to target count
time_downsample <- system.time({
  downsampled <- downsample_expression(large_counts, target_count = 1500, seed = 42)
})

cat("Downsampling time:", round(time_downsample["elapsed"], 3), "seconds\n")
#> Downsampling time: 0.75 seconds
cat("Original total counts:", sum(large_counts[,1]), "\n")
#> Original total counts: 5031
cat("Downsampled total counts:", sum(downsampled[,1]), "\n")
#> Downsampled total counts: 1500

Parallel Processing

Setting Up Workers

library(future)

# Check available cores
n_cores <- parallel::detectCores()
cat("Available cores:", n_cores, "\n")
#> Available cores: 12

# Recommended: use n_cores - 1 for parallel processing
n_workers <- max(1, n_cores - 1)
cat("Recommended workers:", n_workers, "\n")
#> Recommended workers: 11

Parallel LAP Solving

# Create multiple cost matrices
cost_list <- lapply(1:4, function(i) {
  matrix(runif(200 * 200), nrow = 200, ncol = 200)
})

# Sequential processing
time_seq <- system.time({
  results_seq <- lapply(cost_list, solve_lap)
})

# Parallel processing
time_par <- system.time({
  results_par <- solve_lap_parallel(cost_list, n_workers = 2, progress = FALSE)
})

cat("Sequential time:", round(time_seq["elapsed"], 3), "seconds\n")
#> Sequential time: 0.012 seconds
cat("Parallel time:", round(time_par["elapsed"], 3), "seconds\n")
#> Parallel time: 1.749 seconds
Note that for small cost matrices, the fixed cost of launching parallel workers dominates, which is why the parallel run above is slower than the sequential one. Parallel solving pays off when each matrix is large or there are many matrices to solve.

Using Future Plans

library(future)

# Multisession (recommended for most cases)
plan(multisession, workers = 4)

# Multicore (Unix/macOS only, more efficient)
plan(multicore, workers = 4)

# Sequential (for debugging)
plan(sequential)

# Run CytoSPACER with custom plan
results <- run_cytospace(
  sc_data = sc_data,
  cell_types = cell_types,
  st_data = st_data,
  coordinates = coordinates,
  n_workers = NULL  # Uses current plan
)

# Reset to default
plan(sequential)

Chunked Processing

For very large datasets, CytoSPACER uses chunked processing:

# Large dataset example
results <- run_cytospace(
  sc_data = sc_data,
  cell_types = cell_types,
  st_data = st_data,
  coordinates = coordinates,
  chunk_size = 5000,  # Process 5000 cells at a time
  n_workers = 4
)

How Chunking Works

  1. Partition: Cells are divided into chunks of chunk_size
  2. Subproblem Creation: Each chunk creates a subproblem with relevant spots
  3. Parallel Solving: Subproblems are solved independently
  4. Aggregation: Results are combined into final assignment
# Demonstrate partitioning
n_cells <- 15000
chunk_size <- 5000

n_chunks <- ceiling(n_cells / chunk_size)
cat("Total cells:", n_cells, "\n")
#> Total cells: 15000
cat("Chunk size:", chunk_size, "\n")
#> Chunk size: 5000
cat("Number of chunks:", n_chunks, "\n")
#> Number of chunks: 3

Optimization Guidelines

Dataset Size Recommendations

Dataset Size Recommended Settings
< 10,000 cells Default settings
10,000 - 50,000 cells n_workers = 4, chunk_size = 10000
50,000 - 200,000 cells n_workers = 8, chunk_size = 5000, downsample = TRUE
> 200,000 cells Subsample reference, n_workers = max, chunk_size = 5000

Memory Guidelines

# Estimate memory requirements
# Estimate approximate memory requirements for a CytoSPACER run.
#
# Assumes dense double-precision (8-byte) storage for the cost matrix
# (n_cells x n_spots) and both expression matrices; actual usage will be
# lower when sparse matrices are used.
#
# Args:
#   n_cells: Number of single cells in the scRNA-seq reference.
#   n_spots: Number of spatial spots/locations.
#   n_genes: Number of genes shared between the two matrices.
#
# Prints a human-readable breakdown and invisibly returns a named list
# with components `cost_matrix_mb`, `sc_matrix_mb`, `st_matrix_mb`, and
# `total_mb` (all in MiB) so the estimate can also be used
# programmatically. The printed output is unchanged from earlier
# versions, so existing interactive use is unaffected.
estimate_memory <- function(n_cells, n_spots, n_genes) {
  stopifnot(
    is.numeric(n_cells), length(n_cells) == 1L, n_cells >= 0,
    is.numeric(n_spots), length(n_spots) == 1L, n_spots >= 0,
    is.numeric(n_genes), length(n_genes) == 1L, n_genes >= 0
  )

  # 8 bytes per double; 1024^2 bytes per MiB.
  bytes_to_mb <- function(n_values) (n_values * 8) / (1024^2)

  # Cost matrix: n_cells x n_spots (double precision)
  cost_matrix_mb <- bytes_to_mb(n_cells * n_spots)

  # Expression matrices (genes x cells / genes x spots)
  sc_matrix_mb <- bytes_to_mb(n_genes * n_cells)
  st_matrix_mb <- bytes_to_mb(n_genes * n_spots)

  total_mb <- cost_matrix_mb + sc_matrix_mb + st_matrix_mb

  cat("Estimated memory requirements:\n")
  cat("  Cost matrix:", round(cost_matrix_mb), "MB\n")
  cat("  scRNA-seq matrix:", round(sc_matrix_mb), "MB\n")
  cat("  ST matrix:", round(st_matrix_mb), "MB\n")
  cat("  Total (approx):", round(total_mb), "MB\n")

  invisible(list(
    cost_matrix_mb = cost_matrix_mb,
    sc_matrix_mb = sc_matrix_mb,
    st_matrix_mb = st_matrix_mb,
    total_mb = total_mb
  ))
}

# Example: 50,000 cells, 5,000 spots, 20,000 genes
estimate_memory(50000, 5000, 20000)
#> Estimated memory requirements:
#>   Cost matrix: 1907 MB
#>   scRNA-seq matrix: 7629 MB
#>   ST matrix: 763 MB
#>   Total (approx): 10300 MB

Best Practices

  1. Use sparse matrices when possible (typical scRNA-seq data is >90% zeros)

  2. Downsample scRNA-seq to ~1500 counts per cell before analysis

  3. Subset genes to highly variable genes if memory is limited

  4. Monitor memory during analysis:

# Check current memory usage
gc_info <- gc()
mem_used <- sum(gc_info[, 2])
cat("Current memory used:", round(mem_used), "MB\n")
#> Current memory used: 217 MB
  5. Use appropriate chunk size:
    • Larger chunks = faster but more memory
    • Smaller chunks = slower but less memory

Profiling

Time Profiling

# Profile individual steps
set.seed(42)
n_cells <- 50
n_spots <- 100

sc_test <- matrix(rpois(100 * n_cells, 10), nrow = 100)
st_test <- matrix(rpois(100 * n_spots, 50), nrow = 100)
rownames(sc_test) <- rownames(st_test) <- paste0("Gene", 1:100)
colnames(sc_test) <- paste0("Cell", 1:n_cells)
colnames(st_test) <- paste0("Spot", 1:n_spots)

# Normalization
time_norm <- system.time({
  sc_norm <- normalize_expression(sc_test)
  st_norm <- normalize_expression(st_test)
})

# Cost matrix
time_cost <- system.time({
  cost <- compute_cost_matrix(sc_norm, st_norm, method = "pearson")
})

# LAP solving (rows must be <= cols)
time_lap <- system.time({
  assignment <- solve_lap(cost)
})

cat("Timing breakdown:\n")
#> Timing breakdown:
cat("  Normalization:", round(time_norm["elapsed"], 4), "s\n")
#>   Normalization: 0.001 s
cat("  Cost matrix:", round(time_cost["elapsed"], 4), "s\n")
#>   Cost matrix: 0.007 s
cat("  LAP solving:", round(time_lap["elapsed"], 4), "s\n")
#>   LAP solving: 0.004 s

Troubleshooting Performance Issues

Issue: Out of Memory

# Solution 1: Reduce chunk size
results <- run_cytospace(..., chunk_size = 2000)

# Solution 2: Downsample more aggressively
results <- run_cytospace(..., downsample_target = 1000)

# Solution 3: Subset to highly variable genes
hvg <- VariableFeatures(seurat_obj)[1:2000]
sc_subset <- sc_data[hvg, ]
st_subset <- st_data[hvg, ]

Issue: Slow Performance

# Solution 1: Increase workers
results <- run_cytospace(..., n_workers = 8)

# Solution 2: Use larger chunks (if memory allows)
results <- run_cytospace(..., chunk_size = 20000)

# Solution 3: Use C++ implementations (default)
cost <- compute_cost_matrix(..., use_cpp = TRUE)

Session Info

sessionInfo()
#> R version 4.4.0 (2024-04-24)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS 15.6.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
#> 
#> locale:
#> [1] C
#> 
#> time zone: Asia/Shanghai
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] future_1.69.0    Matrix_1.7-4     CytoSPACER_1.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6        future.apply_1.20.1 jsonlite_2.0.0     
#>  [4] dplyr_1.1.4         compiler_4.4.0      Rcpp_1.1.1         
#>  [7] tidyselect_1.2.1    parallel_4.4.0      dichromat_2.0-0.1  
#> [10] jquerylib_0.1.4     globals_0.18.0      systemfonts_1.3.1  
#> [13] scales_1.4.0        textshaping_1.0.4   yaml_2.3.12        
#> [16] fastmap_1.2.0       lattice_0.22-7      ggplot2_4.0.1      
#> [19] R6_2.6.1            generics_0.1.4      knitr_1.51         
#> [22] htmlwidgets_1.6.4   tibble_3.3.1        desc_1.4.3         
#> [25] pillar_1.11.1       bslib_0.9.0         RColorBrewer_1.1-3 
#> [28] rlang_1.1.7         cachem_1.1.0        xfun_0.56          
#> [31] S7_0.2.1            fs_1.6.6            sass_0.4.10        
#> [34] otel_0.2.0          cli_3.6.5           progressr_0.18.0   
#> [37] magrittr_2.0.4      pkgdown_2.1.3       digest_0.6.39      
#> [40] grid_4.4.0          lifecycle_1.0.5     vctrs_0.7.1        
#> [43] evaluate_1.0.5      glue_1.8.0          data.table_1.18.0  
#> [46] farver_2.1.2        listenv_0.10.0      codetools_0.2-20   
#> [49] ragg_1.5.0          parallelly_1.46.1   rmarkdown_2.30     
#> [52] pkgconfig_2.0.3     tools_4.4.0         htmltools_0.5.9