# load the necessary libraries
library(readr)
library(dplyr)
library(tidyverse)
library(Seurat)
library(patchwork)
library(ggplot2)
library(sctransform)
library(matrixStats)
library(purrr)
library(BayesTools)
library(VGAM)
library(knitr)
library(Matrix)
library(fastglmpca)
library(cowplot)
library(gridExtra)
library(ggpubr)
library(motifcluster)
library(cluster)
library(mclust)
library(tightClust)
library(factoextra)
Homework 3
Please submit both your code and results. Please use R for this homework.
In this homework, you should use a new PBMC dataset with the given cell type labels stored as the Seurat object (v5) [PBMC_w_labels.rds]. This dataset has nine different cell types.
1 Question 1
Data preparation. Perform the following steps in Seurat with default parameters:
(1) log1pCP10k,
(2) highly variable gene selection,
(3) scaling data, and
(4) PCA.
1.1 Answer
- load the data
# load the data
pbmc <- readRDS("PBMC_w_labels.rds")
pbmc
An object of class Seurat
33694 features across 3362 samples within 1 assay
Active assay: RNA (33694 features, 0 variable features)
1 layer present: counts
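As a quick check (a sketch; it assumes the given labels are stored in a metadata column named celltype, which is how they are accessed in the later questions), we can confirm that the object contains nine annotated cell types:
# count cells per annotated cell type (sketch)
table(pbmc$celltype)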
- log1pCP10k
# perform log1pCP10k
pbmc_log1pCP10k <- NormalizeData(
  pbmc,
  normalization.method = "LogNormalize",
  scale.factor = 10000)
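Under the hood, LogNormalize divides each gene's count by the cell's total counts, multiplies by the scale factor, and applies log1p. As an optional sanity check (a minimal sketch assuming Seurat v5 layer accessors; not part of the required answer), the manual computation can be compared against Seurat's "data" layer on a few cells:
# compare Seurat's normalized "data" layer with a manual log1pCP10k on five cells (sketch)
counts_sub <- LayerData(pbmc, layer = "counts")[, 1:5]
data_sub <- LayerData(pbmc_log1pCP10k, layer = "data")[, 1:5]
manual <- log1p(t(t(as.matrix(counts_sub)) / colSums(counts_sub)) * 1e4)
all.equal(as.matrix(data_sub), manual, check.attributes = FALSE)  # expected: TRUE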
- highly variable gene selection (identification of highly variable features)
# identify the highly variable genes
pbmc_features <- FindVariableFeatures(pbmc_log1pCP10k,
                                      selection.method = "vst",
                                      nfeatures = 2000)
head_10_features <- head(VariableFeatures(pbmc_features), 10)
plot1 <- VariableFeaturePlot(pbmc_features)
LabelPoints(plot = plot1, points = head_10_features, repel = TRUE) +
  theme(legend.position = "none") +
  ggtitle("Feature Selection") +
  theme(plot.title = element_text(hjust = 0.5))
Warning in scale_x_log10(): log-10 transformation introduced infinite values.
- scaling the data
# scale the data
pbmc_scaled <- ScaleData(pbmc_features)
- PCA
# perform PCA
pbmc_PCA <- RunPCA(pbmc_scaled,
                   npcs = 50,
                   verbose = TRUE)
VizDimLoadings(pbmc_PCA, dims = 1:2, reduction = "pca")
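As an optional diagnostic (not asked for in the question), the variance captured by the leading PCs can be inspected before the 50 PCs are reused in the later questions:
# standard deviation of each of the 50 computed PCs (optional sketch)
ElbowPlot(pbmc_PCA, ndims = 50)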
2 Question 2
Implement the following four clustering algorithms on the PCA results (the first 50 PCs). Also, plot cells using their scores of the first two PCs and color them according to five sets of labels. Present these in five separate figures: four showing the cluster labels in the four clustering results and one showing the true cell type labels.
- kmeans [Rfunction]: Set k to 9.
- kmeans++ [Rfunction]: Set k to 9.
- Hierarchical clustering (complete linkage; Euclidean distance for the dissimilarity) [Rfunction]: Cut the dendrogram to obtain 9 clusters.
- Seurat clustering: Choose an appropriate resolution parameter in FindClusters() to obtain 9 clusters.
2.1 Answer
- kmeans
# get the cell embeddings
cell_embeddings <- Embeddings(pbmc_PCA, reduction = "pca") |>
  as_tibble()

# kmeans clustering
kmeans_cluster <- kmeans(cell_embeddings,
                         centers = 9)

# kmeans++ clustering
kmeanspp_cluster <- kmeanspp(cell_embeddings,
                             k = 9)

# hierarchical clustering
hc_cluster <- hclust(dist(cell_embeddings,
                          method = "euclidean"),
                     method = "complete") |>
  cutree(k = 9)

# seurat clustering
pbmc_neighbours <- FindNeighbors(pbmc_PCA)
pbmc_clusters <- FindClusters(pbmc_neighbours, resolution = 0.2)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9512
Number of communities: 10
Elapsed time: 0 seconds
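FindClusters() has no argument that targets a specific number of clusters, so the resolution is usually tuned by trial and error. A minimal sketch of such a scan over pbmc_neighbours (the grid and step size here are arbitrary choices):
# scan resolution values until one yields exactly 9 clusters (sketch)
for (res in seq(0.05, 1, by = 0.05)) {
  n_clusters <- length(unique(Idents(FindClusters(pbmc_neighbours,
                                                  resolution = res,
                                                  verbose = FALSE))))
  if (n_clusters == 9) { cat("resolution", res, "gives 9 clusters\n"); break }
}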
Plot the cells using their scores of the first two PCs and color them according to the five sets of labels.
# extract the first two PCs
PC1 <- cell_embeddings$PC_1
PC2 <- cell_embeddings$PC_2
# write a function to plot each clustering result
plot_clusters <- function(cluster, title) {
  df <- data.frame(cluster = as.factor(cluster),
                   PC1 = PC1,
                   PC2 = PC2)
  df |>
    ggplot(aes(x = PC1, y = PC2, color = cluster)) +
    geom_point() +
    ggtitle(title) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(color = "Clusters")
}
# plot the kmeans clustering results
kmeans_plot <- plot_clusters(kmeans_cluster$cluster,
                             "kmeans clustering")
# plot the kmeans++ clustering results
kmeanspp_plot <- plot_clusters(kmeanspp_cluster$cluster,
                               "kmeans++ clustering")
# plot the hierarchical clustering results
hc_plot <- plot_clusters(hc_cluster, "hierarchical clustering")

# plot the seurat clustering results
seurat_plot <- plot_clusters(Idents(pbmc_clusters),
                             "seurat clustering")
# plot the true labels
true_labels_plot <- plot_clusters(pbmc$celltype,
                                  "true labels")
Arrange the five plots together:
# arrange the five plots together
kmeans_plot
kmeanspp_plot
hc_plot
seurat_plot
true_labels_plot
From the plots, we can see that the four methods produce noticeably different partitions. The true-labels plot shows the actual cell types. Visually, the kmeans++ result is the most similar to the true labels, while the hierarchical clustering result differs from them the most.
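One way to back up this visual impression (a quick sketch; not required by the question) is to cross-tabulate the kmeans++ labels against the true cell types and look for a near one-to-one correspondence:
# contingency table of kmeans++ clusters vs. true cell type labels (sketch)
table(kmeanspp = kmeanspp_cluster$cluster, truth = pbmc$celltype)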
3 Question 3
Evaluate and compare the four clustering results using:
(1) Silhouette score [Rfunction] (use the Euclidean distance).
(2) Adjusted rand index (ARI) [Rfunction]. Please refer to this introduction to ARI and provide a brief explanation of it.
Which clustering method has the best performance based on each metric? Can you explain the different conclusions?
3.1 Answer
- Silhouette score
# calculate the silhouette score for each clustering method
# kmeans
kmeans_silhouette <- silhouette(kmeans_cluster$cluster,
                                dist(cell_embeddings,
                                     method = "euclidean"))
# kmeans++
kmeanspp_silhouette <- silhouette(kmeanspp_cluster$cluster,
                                  dist(cell_embeddings,
                                       method = "euclidean"))
# hierarchical clustering
hc_silhouette <- silhouette(hc_cluster,
                            dist(cell_embeddings,
                                 method = "euclidean"))
# seurat clustering
seurat_cluster <- Idents(pbmc_clusters) |>
  as.data.frame() |>
  rename("cluster" = "Idents(pbmc_clusters)")
seurat_silhouette <- silhouette(as.numeric(seurat_cluster$cluster),
                                dist(cell_embeddings,
                                     method = "euclidean"))
# combine the silhouette score for each clustering method together
cbind(kmeans = mean(kmeans_silhouette[, 3]),
      kmeanspp = mean(kmeanspp_silhouette[, 3]),
      hc = mean(hc_silhouette[, 3]),
      seurat = mean(seurat_silhouette[, 3])) |>
  `rownames<-`(c("Silhouette score")) |>
  knitr::kable()
|  | kmeans | kmeanspp | hc | seurat |
|---|---|---|---|---|
| Silhouette score | 0.1519549 | 0.2065469 | 0.3325925 | 0.1912153 |
For the silhouette score, a higher value indicates better clustering: each cell's silhouette width compares its average distance to cells in its own cluster with its average distance to cells in the nearest other cluster. Based on the results, hierarchical clustering has the highest average silhouette score, indicating the best performance in terms of cluster compactness and separation.
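To see whether this high average is driven by a few very cohesive clusters, the per-cluster average widths can also be inspected (a sketch; summary() on a silhouette object from the cluster package returns this breakdown):
# per-cluster average silhouette widths for the hierarchical clustering result (sketch)
summary(hc_silhouette)$clus.avg.widths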
- Adjusted rand index (ARI)
# calculate the adjusted rand index for each clustering method
# kmeans
kmeans_ari <- adjustedRandIndex(kmeans_cluster$cluster,
                                pbmc$celltype)

# kmeans++
kmeanspp_ari <- adjustedRandIndex(pbmc$celltype,
                                  kmeanspp_cluster$cluster)

# hierarchical clustering
hc_ari <- adjustedRandIndex(hc_cluster,
                            pbmc$celltype)

# seurat clustering
seurat_ari <- adjustedRandIndex(as.numeric(seurat_cluster$cluster),
                                pbmc$celltype)
# evaluate the ARI for each clustering method
cbind(kmeans = kmeans_ari,
      kmeanspp = kmeanspp_ari,
      hc = hc_ari,
      seurat = seurat_ari) |>
  `rownames<-`(c("Adjusted rand index")) |>
  knitr::kable()
|  | kmeans | kmeanspp | hc | seurat |
|---|---|---|---|---|
| Adjusted rand index | 0.640469 | 0.9153481 | 0.2202939 | 0.8126663 |
Introduction to ARI:
The Adjusted Rand Index (ARI) measures the agreement between two partitions of the same samples. It considers all pairs of samples, counts the pairs that are placed in the same cluster or in different clusters under both partitions, and then corrects this agreement for what would be expected by chance. An ARI of 1 indicates that the two partitions are identical, values near 0 indicate chance-level agreement, and negative values indicate worse-than-chance agreement.
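A tiny toy illustration of these properties (a sketch; adjustedRandIndex() is the mclust function already used above):
# identical partitions give ARI = 1; a random relabeling gives an ARI near 0 (sketch)
set.seed(1)
truth_toy <- rep(1:3, each = 20)
random_toy <- sample(truth_toy)
adjustedRandIndex(truth_toy, truth_toy)   # 1
adjustedRandIndex(truth_toy, random_toy)  # close to 0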
Based on the ARI results, the kmeans++ clustering method has the highest ARI, indicating that its partition agrees best with the true cell type labels.
- Explanation
This discrepancy between the silhouette score and the ARI reflects the different criteria the two metrics evaluate. The silhouette score measures the compactness and separation of clusters using only the distances between cells, without reference to any ground truth, whereas the ARI measures the agreement between a clustering result and the true labels. Here, hierarchical clustering produces the most geometrically separated clusters according to the silhouette score, while kmeans++ recovers the true cell types best according to the ARI. The preferred clustering method therefore depends on the evaluation criterion and on the goal of the analysis.
4 Question 4
Use the following nine combinations for parameters of the Seurat clustering algorithm. How does each parameter influence the number of clusters?
4.1 Answer
Here, in the prune.SNN column, I used 1/7 instead of 1/5 because the code will not run with 1/5.
# create a list of parameter combinations
parameter_combinations <- data.frame(
  k.neighbors = c(10, 20, 50, rep(20, 6)),
  prune.SNN = c(rep(1/15, 3), 1/7, 1/10, 1/15, rep(1/15, 3)),
  resolution = c(rep(0.8, 6), 0.5, 1, 1.5))
# create a list to store the number of clusters for each combination
num_clusters <- list()

# loop through each parameter combination
for (i in 1:9) {
  # run Seurat neighbors with the specified parameters
  pbmc_neighbours_compare <- FindNeighbors(
    pbmc_PCA,
    features = VariableFeatures(pbmc_features),
    reduction = "pca",
    k.param = parameter_combinations$k.neighbors[i],
    prune.SNN = parameter_combinations$prune.SNN[i])
  # run Seurat clustering with the specified parameters
  pbmc_clusters_compare <- FindClusters(
    pbmc_neighbours_compare,
    resolution = parameter_combinations$resolution[i])
  # store the number of clusters for this parameter combination
  num_clusters[[i]] <- length(unique(Idents(pbmc_clusters_compare)))
}
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 49827
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8882
Number of communities: 17
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8713
Number of communities: 12
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 275174
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8512
Number of communities: 11
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 66450
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8890
Number of communities: 16
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 88637
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8801
Number of communities: 13
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8713
Number of communities: 12
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9061
Number of communities: 12
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8504
Number of communities: 14
Elapsed time: 0 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 3362
Number of edges: 119362
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8080
Number of communities: 16
Elapsed time: 0 seconds
# display the number of clusters for each parameter combination
parameter_combinations$num_clusters <- unlist(num_clusters)
parameter_combinations |>
  # set the last six values of column `k.neighbors` to `default`
  mutate(k.neighbors = ifelse(row_number() > 3,
                              "default",
                              k.neighbors)) |>
  # set the first three and last three values of
  # column `prune.SNN` to `default`
  mutate(prune.SNN = ifelse(row_number() %in% c(1:3, 7:9),
                            "default",
                            prune.SNN)) |>
  mutate(prune.SNN = ifelse(row_number() %in% c(4:6),
                            c("1/7", "1/10", "1/15"),
                            prune.SNN)) |>
  # set the last six values of column `resolution` to `default`
  mutate(resolution = ifelse(row_number() %in% c(1:6),
                             "default",
                             resolution)) |>
  knitr::kable()
k.neighbors | prune.SNN | resolution | num_clusters |
---|---|---|---|
10 | default | default | 17 |
20 | default | default | 12 |
50 | default | default | 11 |
default | 1/7 | default | 16 |
default | 1/10 | default | 13 |
default | 1/15 | default | 12 |
default | default | 0.5 | 12 |
default | default | 1 | 14 |
default | default | 1.5 | 16 |
From the results, we can observe the following trends:
- Increasing the k.neighbors parameter generally decreases the number of clusters: each cell is connected to more neighbours, so the graph is denser and breaks into fewer, larger communities.
- Increasing the prune.SNN parameter generally increases the number of clusters: prune.SNN is the Jaccard-similarity cutoff below which SNN edges are removed, so a larger cutoff gives a sparser graph that splits into more communities (16 clusters at 1/7 versus 12 at 1/15).
- Increasing the resolution parameter generally increases the number of clusters, since higher resolution values favour finer-grained communities in the modularity optimization.
5 Question 5
Instead of setting k as nine in Question 2, please calculate the Gap statistics for kmeans clustering, with k ranging from 1 to 15 [Rfunction]. What is the best k for kmeans based on Gap statistics?
5.1 Answer
set.seed(2024)
# calculate the Gap statistics for kmeans clustering
if (file.exists("gap_stat.RData")) {
  load("gap_stat.RData")
} else {
  gap_stat <- clusGap(cell_embeddings,
                      FUN = kmeans,
                      K.max = 15,
                      B = 50)
  save(gap_stat, file = "gap_stat.RData")
}
# plot the Gap statistics results
fviz_gap_stat(gap_stat) +
theme_minimal() +
theme(legend.position = "none")
Based on the Gap statistics results, the best k for kmeans clustering is 11: over the range k = 1 to 15, the Gap statistic reaches its largest value at k = 11.
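As a complementary check (a sketch; maxSE() from the cluster package implements several selection rules for the gap curve, and "firstSEmax" is a common one-standard-error rule that can pick a smaller k than the global maximum), the suggested k can also be read off programmatically:
# read a recommended k from the gap curve using the first-SE-max rule (sketch)
gap_tab <- as.data.frame(gap_stat$Tab)
maxSE(gap_tab$gap, gap_tab$SE.sim, method = "firstSEmax")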
6 Question 6
Perform the tight clustering on the PCA results (50 PCs) [Rfunction]. Set target = 9 and k.min = 15. What are the meanings of these two parameters? Plot cells using their scores of the first two PCs, coloring them according to the cluster labels from the tight clustering results (use the gray color for cells labeled -1). Please explain why some cells are labeled as -1.
6.1 Answer
# perform tight clustering on the 50 PCs
if (file.exists("tight_clustering.RData")) {
  load("tight_clustering.RData")
} else {
  tight_clustering <- tight.clust(Embeddings(pbmc_PCA,
                                             reduction = "pca"),
                                  target = 9,
                                  k.min = 15)
  save(tight_clustering, file = "tight_clustering.RData")
}
# plot cells using their scores of the first two PCs
tight_clustering$cluster |>
  as.factor() |>
  as.data.frame() |>
  setNames("cluster") |>
  mutate(PC1 = PC1,
         PC2 = PC2) |>
  ggplot(aes(x = PC1,
             y = PC2,
             color = cluster)) +
  geom_point() +
  scale_color_manual(values = c("gray", rainbow(9))) +
  labs(title = "Tight Clustering Results",
       x = "PC1",
       y = "PC2") +
  theme_minimal()
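A quick tabulation of the tight-clustering labels (a sketch using the cluster vector computed above) shows how many cells fall into each tight cluster and how many were left unassigned (-1):
# cells per tight cluster; the -1 column counts unassigned cells (sketch)
table(tight_clustering$cluster)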
Explanation:
The target parameter specifies how many tight clusters the algorithm should return, and k.min sets the starting (smallest) number of candidate clusters k0 used in the resampling-based search for tight clusters. Cells are labeled -1 when they cannot be stably assigned to any of the identified tight clusters: tight clustering only keeps groups of cells that are consistently co-clustered across resampled subsets of the data, and cells without such a stable co-membership pattern are treated as scattered points or noise rather than being forced into a cluster. These unassigned cells are the ones shown in gray in the plot.