Introduction
This vignette demonstrates the core functionality of scClustEval with executable examples using simulated data.
Installation
# From R-universe (recommended).
# `repos` replaces the default repository list for this call, so CRAN is
# listed as well to let dependencies hosted there be resolved.
install.packages(
  "scClustEval",
  repos = c(
    "https://zaoqu-liu.r-universe.dev",
    "https://cloud.r-project.org"
  )
)
# From GitHub
remotes::install_github("Zaoqu-Liu/scClustEval")
Creating Example Data
Let’s create a synthetic dataset with known cluster structure to demonstrate the assessment workflow.
# Generate synthetic single-cell data: equally sized clusters whose
# per-feature means are centred on 2, 4, 6, ... (mean = 2 * cluster index).
set.seed(42)
n_cells <- 600
n_features <- 50
n_clusters <- 4

# Equal-sized clusters are assumed below; fail early instead of producing
# fractional row indices if the sizes are changed inconsistently.
stopifnot(n_cells %% n_clusters == 0)
cells_per_cluster <- n_cells %/% n_clusters

# Create expression matrix (cells in rows, features in columns)
X <- matrix(NA_real_, nrow = n_cells, ncol = n_features)
labels <- character(n_cells)

for (i in seq_len(n_clusters)) {
  rows <- seq_len(cells_per_cluster) + (i - 1) * cells_per_cluster

  # Each cluster has a distinct mean expression profile.
  # The profile is drawn before the noise so the random stream (and thus
  # the generated data) stays the same for a given seed.
  cluster_mean <- rnorm(n_features, mean = i * 2, sd = 0.5)
  noise <- rnorm(cells_per_cluster * n_features, sd = 1)

  # byrow = TRUE lays the repeated profile out one cell per row, so every
  # row is the cluster profile plus independent noise.
  X[rows, ] <- matrix(
    rep(cluster_mean, times = cells_per_cluster) + noise,
    nrow = cells_per_cluster,
    ncol = n_features,
    byrow = TRUE
  )
  labels[rows] <- paste0("Cluster_", i)
}

colnames(X) <- paste0("Gene_", seq_len(n_features))
cat("Data dimensions:", nrow(X), "cells x", ncol(X), "features\n")
cat("Clusters:", unique(labels), "\n")
Basic Assessment
Running Self-Projection
# Run the clustering assessment on the simulated data.
result <- sc_assessment(
  # Input: expression matrix and the cluster label of every cell
  X = X,
  labels = labels,
  # Model: logistic regression ("LR") with L1 (Lasso) regularization
  classifier = "LR",
  penalty = "l1",
  # Sampling: 50% of cells for testing, at most 100 training cells per
  # cluster, 5-fold cross-validation
  test_size = 0.5,
  n_per_class = 100,
  cv = 5,
  # Fixed seed and progress messages
  seed = 42,
  verbose = TRUE
)
# View summary
print(result)
Understanding Results
# Headline metrics: accuracies as percentages, confusion rates to 4 decimals
cat("\n=== Key Metrics ===\n")
cat("Test Accuracy:", sprintf("%.1f%%", 100 * result$accuracy), "\n")
cat("CV Accuracy:", sprintf("%.1f%%", 100 * result$cv_accuracy), "\n")
cat("Max R1 Confusion:", sprintf("%.4f", result$max_r1), "\n")
cat("Max R2 Confusion:", sprintf("%.4f", result$max_r2), "\n")

# One line per cluster, keyed by the names of the per-class accuracy vector
cat("\n=== Per-Cluster Accuracy ===\n")
per_class <- result$per_class_accuracy
for (cl in names(per_class)) {
  cat(sprintf(" %s: %.1f%%\n", cl, 100 * per_class[[cl]]))
}
Visualization
ROC Curves
# Plot ROC and Precision-Recall curves
plot_roc(result, plot_type = "both", show_auc = TRUE)
Confusion Matrix Heatmaps
library(gridExtra)

# Three heatmaps of the same assessment result, one per normalization
# option: raw counts, R1-normalized and R2-normalized.
p1 <- plot_confusion_heatmap(
  result,
  normalized = "raw",
  title = "Raw Counts"
)
p2 <- plot_confusion_heatmap(
  result,
  normalized = "R1",
  title = "R1 Normalized"
)
p3 <- plot_confusion_heatmap(
  result,
  normalized = "R2",
  title = "R2 Normalized"
)
grid.arrange(p1, p2, p3, ncol = 3)
Simulating Over-Clustering
Now let’s create an over-clustered scenario to demonstrate optimization.
# Simulate over-clustering by cutting two true clusters in half.
# Coerce up front so new label values can be assigned even if `labels`
# were supplied as a factor.
labels_over <- as.character(labels)

# First half of each listed cluster gets suffix "a", the rest suffix "b".
# Positions are derived from the actual cluster size instead of being
# hard-coded, so the split stays valid if the simulated sizes change.
for (cl in c("Cluster_1", "Cluster_2")) {
  members <- which(labels == cl)
  # Logical mask rather than a negative index: `members[-integer(0)]` would
  # select nothing when a cluster has fewer than two cells.
  first_half <- seq_along(members) <= length(members) %/% 2
  labels_over[members[first_half]] <- paste0(cl, "a")
  labels_over[members[!first_half]] <- paste0(cl, "b")
}

cat("Over-clustered labels:", unique(labels_over), "\n")
cat("Number of clusters:", length(unique(labels_over)), "\n")
Assessment of Over-Clustered Data
# Repeat the assessment, this time with the over-clustered labels
result_over <- sc_assessment(
  X = X,
  labels = labels_over,
  classifier = "LR",
  n_per_class = 50,
  cv = 5,
  seed = 42,
  verbose = TRUE
)

# Accuracy as a percentage, then the maximum R1 value (indicates confusion)
cat(
  "\nOver-clustering accuracy:",
  sprintf("%.1f%%", 100 * result_over$accuracy),
  "\n"
)
cat(
  "Max R1 (indicates confusion):",
  sprintf("%.4f", result_over$max_r1),
  "\n"
)
# R1-normalized heatmap, to show confusion between the artificial splits
plot_confusion_heatmap(
  result_over,
  normalized = "R1",
  title = "R1 Confusion (Over-clustered)"
)
Single Optimization Round
# One optimization round on the over-clustered labels
optim_round <- sc_optimize(
  X = X,
  labels = labels_over,
  classifier = "LR",
  # 3 iterations for the confusion matrix
  n_iter = 3,
  # Merge if R1 > 0.1, or if R2 > 0.05
  r1_cutoff = 0.1,
  r2_cutoff = 0.05,
  seed = 42,
  verbose = TRUE
)

# Number of clusters going into and coming out of the round
cat("\nClusters before:", optim_round$n_clusters_before, "\n")
cat("Clusters after:", optim_round$n_clusters_after, "\n")
cat("Accuracy:", sprintf("%.1f%%", optim_round$accuracy * 100), "\n")
Full Optimization Pipeline
# Full optimization: up to 10 rounds, with a 90% accuracy target
optim_result <- sc_optimize_all(
  X = X,
  labels = labels_over,
  # Target 90% accuracy, at most 10 rounds
  min_accuracy = 0.90,
  max_rounds = 10,
  classifier = "LR",
  # Start with a high R1 cutoff
  r1_cutoff = 0.5,
  r2_cutoff = 0.05,
  seed = 42,
  verbose = TRUE
)
# Summary
print(optim_result)
Optimization History
# Plot optimization progress
plot_optimization_history(optim_result, metric = "both")
Compare Before and After
# Cluster counts before and after optimization, plus the final accuracy
cat("\n=== Optimization Summary ===\n")
cat("Initial clusters:", length(unique(labels_over)), "\n")
cat("Final clusters:", length(unique(optim_result$final_labels)), "\n")
cat(
  "Final accuracy:",
  sprintf("%.1f%%", 100 * optim_result$final_accuracy),
  "\n"
)

# Final cluster sizes
cat("\n=== Final Cluster Sizes ===\n")
print(table(optim_result$final_labels))
Sankey Diagram
# Visualize cluster reassignment from the over-clustered labels to the
# final labels. The plot is drawn only when ggalluvial is installed.
if (requireNamespace("ggalluvial", quietly = TRUE)) {
  plot_cluster_sankey(
    labels_from = labels_over,
    labels_to = as.character(optim_result$final_labels),
    title = "Cluster Optimization Flow"
  )
}
Using Different Classifiers
# Show which classifiers can be used
get_available_classifiers()

# Same assessment as before, but with Random Forest ("RF")
result_rf <- sc_assessment(
  X = X,
  labels = labels,
  classifier = "RF",
  n_per_class = 100,
  # Skip CV for speed
  cv = 0,
  seed = 42,
  verbose = FALSE
)
cat("Random Forest accuracy:", sprintf("%.1f%%", result_rf$accuracy * 100), "\n")
Summary
This quick start guide demonstrated:
- Creating test data with known cluster structure
- Running assessment with sc_assessment()
- Visualizing results with ROC curves and confusion matrices
- Simulating over-clustering scenarios
- Optimizing clustering with sc_optimize_all()
- Comparing classifiers
For more advanced usage, see the other vignettes:
- Algorithm Principles - Mathematical foundations
- Seurat Integration - Working with Seurat objects
- Visualization Guide - Comprehensive plotting
Author: Zaoqu Liu (liuzaoqu@163.com)
Package: scClustEval v1.0.0