Skip to contents

Introduction

This vignette covers advanced features and best practices for optimal TorchDecon performance in real-world deconvolution tasks.

Author: Zaoqu Liu

GPU Acceleration

Checking GPU Availability

library(TorchDecon)
library(torch)

# Check if CUDA is available
# Report GPU status up front so long training runs are not accidentally
# started on CPU. Falls through to a plain CPU notice when no GPU is found.
if (torch::cuda_is_available()) {
  cat("CUDA GPU detected!\n")
  cat("Device count:", torch::cuda_device_count(), "\n")
  cat("Current device:", torch::cuda_current_device(), "\n")
} else {
  cat("No CUDA GPU available. Using CPU.\n")
}

Specifying Device

# Automatically detect best device
# Automatically detect best device
# "auto" picks the GPU when one is visible, otherwise CPU — the recommended
# default for portable scripts.
ensemble <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "auto"  # Auto-select GPU if available
)

# Force CPU usage (useful for debugging)
# CPU runs are deterministic across machines and easier to step through.
ensemble_cpu <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "cpu"
)

# Force GPU usage (will error if no GPU)
# Use this in pipelines where silently falling back to CPU would be worse
# than failing fast.
ensemble_gpu <- CreateTorchDeconEnsemble(
  n_features = 5000,
  n_classes = 10,
  device = "cuda"
)

Custom Model Architectures

Creating Custom Networks

# Define a custom architecture
# Define a custom architecture
# hidden_units and dropout_rates are parallel vectors: one entry per hidden
# layer, so both must have the same length.
custom_model <- CreateTorchDecon(
  n_features = 5000,
  n_classes = 10,
  architecture = "custom",
  hidden_units = c(512, 256, 128, 64),      # Custom layer sizes
  dropout_rates = c(0.1, 0.2, 0.15, 0.1),   # Custom dropout
  device = "auto"
)

# Inspect the resulting layer configuration
print(custom_model)

Architecture Selection Guidelines

Dataset Size Recommended Architecture Rationale
< 1000 genes M256 Prevents overfitting on small feature sets
1000-5000 genes M512 Balanced capacity
> 5000 genes M1024 or Custom Sufficient capacity for complex patterns
Limited samples Lower dropout Small datasets already impose implicit regularization
Large samples Higher dropout Prevent overfitting

Training Optimization

Early Stopping

# Enable early stopping with validation
# Enable early stopping with validation
# Training halts once the validation loss stops improving for `patience`
# consecutive steps, so num_steps acts as a ceiling rather than a target.
ensemble <- TrainModel(
  model = ensemble,
  data = processed_data,
  num_steps = 10000,           # Maximum steps
  validation_split = 0.1,       # 10% for validation
  early_stopping = TRUE,        # Enable early stopping
  patience = 500,               # Steps without improvement
  verbose = TRUE
)

Learning Rate Tuning

# Lower learning rate for stability
# Lower learning rate for stability
# Halving the default rate trades speed for smoother convergence — useful
# when the loss curve oscillates.
ensemble_stable <- TrainModel(
  model = ensemble,
  data = processed_data,
  learning_rate = 5e-5,  # Default is 1e-4
  num_steps = 10000
)

# Higher learning rate for faster convergence (risky)
# A 5x rate with fewer steps can work on easy problems but may diverge;
# monitor the training loss if you try this.
ensemble_fast <- TrainModel(
  model = ensemble,
  data = processed_data,
  learning_rate = 5e-4,
  num_steps = 3000
)

Batch Size Considerations

# Larger batch sizes: more stable gradients, faster (if GPU memory allows)
# Larger batch sizes: more stable gradients, faster (if GPU memory allows)
ensemble_large_batch <- TrainModel(
  model = ensemble,
  data = processed_data,
  batch_size = 256,  # Default is 128
  num_steps = 5000
)

# Smaller batch sizes: more noise, can help escape local minima
# Also the first knob to turn down when hitting out-of-memory errors.
ensemble_small_batch <- TrainModel(
  model = ensemble,
  data = processed_data,
  batch_size = 32,
  num_steps = 5000
)

Data Quality Optimization

Handling Unknown Cell Types

# Merge rare or unknown cell types
# Merge rare or unknown cell types
# Labels listed in unknown_celltypes are pooled rather than dropped, so the
# simulated fractions still sum over all cells in the reference.
simulation <- SimulateBulk(
  object = seurat_obj,
  n_samples = 2000,
  celltype_col = "cell_type",
  unknown_celltypes = c("Doublets", "Unknown", "LowQuality"),  # Merge these
  verbose = TRUE
)

# These will be combined into a single "Unknown" category

Optimal Sample Simulation

# High-quality simulation settings
# High-quality simulation settings
# sparse_fraction controls how many pseudo-bulk samples deliberately omit
# some cell types, which teaches the model to handle incomplete mixtures.
simulation <- SimulateBulk(
  object = seurat_obj,
  n_samples = 5000,              # More samples = better generalization
  cells_per_sample = 200,        # More cells = more realistic bulk
  sparse_fraction = 0.3,         # Include incomplete compositions
  min_celltypes = 2,             # At least 2 cell types per sparse sample
  seed = 42                      # Reproducibility
)

Gene Selection Strategies

# Stricter variance filtering
# Stricter variance filtering
# Only genes whose variance exceeds var_cutoff are kept, so raising the
# threshold shrinks the feature set.
processed_strict <- ProcessTrainingData(
  simulation = simulation,
  prediction_data = bulk_data,
  var_cutoff = 0.5,  # Higher threshold = fewer genes
  scaling = "log_min_max"
)

# More lenient (include more genes)
processed_lenient <- ProcessTrainingData(
  simulation = simulation,
  prediction_data = bulk_data,
  var_cutoff = 0.01,  # Lower threshold = more genes
  scaling = "log_min_max"
)

# Compare how many genes survive each threshold
cat("Strict filtering:", processed_strict$n_genes, "genes\n")
cat("Lenient filtering:", processed_lenient$n_genes, "genes\n")

Working with Multiple Datasets

Merging Simulations

# Create simulations from different tissues/conditions
# Create simulations from different tissues/conditions
sim_tissue1 <- SimulateBulk(seurat_tissue1, n_samples = 1000, verbose = FALSE)
sim_tissue2 <- SimulateBulk(seurat_tissue2, n_samples = 1000, verbose = FALSE)

# Merge simulations
# Combining references from multiple tissues broadens the training
# distribution the model sees.
combined_sim <- MergeSimulations(sim_tissue1, sim_tissue2)

print(combined_sim)

Cross-validation Strategy

# Implement k-fold cross-validation
# Implement k-fold cross-validation over the simulated training samples.
# For each fold: train a fresh model on the remaining samples, predict on
# the held-out fold, and record predicted vs. true fractions.
k <- 5
n_samples <- nrow(processed$X)
fold_size <- ceiling(n_samples / k)

# Preallocate instead of growing the list inside the loop
cv_results <- vector("list", k)

for (i in seq_len(k)) {
  # Define fold indices (the last fold may be smaller than fold_size)
  val_idx <- ((i - 1) * fold_size + 1):min(i * fold_size, n_samples)
  train_idx <- setdiff(seq_len(n_samples), val_idx)

  # Create training subset.
  # drop = FALSE keeps single-row subsets as matrices; without it a
  # one-sample fold would collapse to a vector and break ncol()/t() below.
  train_data <- list(
    X = processed$X[train_idx, , drop = FALSE],
    Y = processed$Y[train_idx, , drop = FALSE],
    genes = processed$genes,
    celltypes = processed$celltypes
  )
  class(train_data) <- c("TorchDeconProcessed", "list")

  # Train a new model on this fold (CPU keeps fold runs comparable)
  model <- CreateTorchDecon(
    n_features = ncol(train_data$X),
    n_classes = ncol(train_data$Y),
    architecture = "m256",
    device = "cpu"
  )

  model <- TrainModel(model, train_data, num_steps = 2000, verbose = FALSE)

  # Evaluate on validation fold.
  # scaling = NULL because processed$X is already scaled.
  val_pred <- PredictFractions(model, t(processed$X[val_idx, , drop = FALSE]),
                               scaling = NULL, verbose = FALSE)

  cv_results[[i]] <- list(
    predictions = val_pred,
    true = processed$Y[val_idx, , drop = FALSE]
  )
}

# Aggregate CV results: flatten predicted and true fraction matrices and
# correlate. vapply (not sapply) guarantees a numeric vector of length k.
cv_performance <- vapply(cv_results, function(r) {
  cor(as.vector(as.matrix(r$predictions)),
      as.vector(r$true))
}, numeric(1))
cat("Mean CV correlation:", mean(cv_performance), "\n")
cat("SD:", sd(cv_performance), "\n")

Model Persistence and Deployment

Saving Models with Metadata

# Save trained model
# Save trained model
# overwrite = TRUE replaces any existing directory of the same name.
SaveModel(ensemble, "production_model", overwrite = TRUE)

# The saved directory contains:
# - network.pt (or m256/, m512/, m1024/ for ensemble)
# - metadata.rds
# - genes.txt
# - celltypes.txt

Loading and Deploying

# Load model for prediction
# Load model for prediction
# device = "auto" lets a model trained on GPU be served from CPU and
# vice versa.
loaded_model <- LoadModel("production_model", device = "auto")

# Quick prediction pipeline
# One-call convenience wrapper: loads the saved model, reads the bulk
# matrix from disk, and writes the predicted fractions to output_file.
QuickPredict(
  model_path = "production_model",
  bulk_data = "new_bulk_data.txt",
  output_file = "predictions.txt"
)

Troubleshooting

Common Issues and Solutions

Issue Cause Solution
Out of memory Dataset too large Reduce batch_size, use CPU
Poor accuracy Insufficient training Increase num_steps, n_samples
Predictions sum ≠ 1 Numerical issues Automatic normalization applied
Slow training No GPU Install CUDA, use GPU
Gene mismatch Different gene sets Ensure same gene names in reference and bulk

Memory Management

# Clear GPU memory after training
# Clear GPU memory after training
# Releases cached CUDA allocations back to the device; guarded so the
# snippet is safe on CPU-only machines.
if (torch::cuda_is_available()) {
  torch::cuda_empty_cache()
}

# Force garbage collection
# Frees R-side objects so torch can drop the tensors they referenced.
gc()

Reproducibility

Setting Seeds

# For fully reproducible results
set.seed(42)
torch::torch_manual_seed(42)

# All TorchDecon functions support seed parameter
simulation <- SimulateBulk(seurat_obj, n_samples = 1000, seed = 42)
ensemble <- CreateTorchDeconEnsemble(n_features = 5000, n_classes = 10, seed = 42)
ensemble <- TrainModel(ensemble, processed_data, seed = 42)

Best Practices Summary

  1. Data Quality: Use high-quality scRNA-seq reference with accurate annotations
  2. Sample Size: Generate 2000-5000 simulated samples for training
  3. GPU Usage: Use GPU when available for faster training
  4. Validation: Use early stopping with validation split
  5. Reproducibility: Always set seeds for reproducible results
  6. Gene Filtering: Start with default var_cutoff (0.1), adjust if needed
  7. Model Selection: Use ensemble (default) for robust predictions
  8. Evaluation: Always evaluate on held-out data if ground truth available

Package Author: Zaoqu Liu
Contact: please open an issue on the GitHub repository below
GitHub: https://github.com/Zaoqu-Liu/TorchDecon