# Load libraries
library(readr)
library(dplyr)
library(tidyverse)
library(Seurat) |> suppressWarnings()
library(patchwork)
library(ggplot2)
library(sctransform)
library(matrixStats)
library(purrr)
library(BayesTools)
library(VGAM) |> suppressWarnings()
library(knitr) |> suppressWarnings()
library(Matrix)
library(fastglmpca)
library(cowplot)
library(gridExtra)
library(ggpubr)
library(motifcluster)
library(cluster)
library(mclust)
library(tightClust)
library(devtools)
library(biomaRt) |> suppressWarnings()
library(presto)
library(writexl)
# library(SingleR)
set.seed(2024)
STAT M254 Final Project
BoneMarrow and Pancreas Data Analysis
1 Introduction
In this project, I will analyze two datasets: BoneMarrow and Pancreas. The BoneMarrow dataset contains two datasets: BoneMarrow1
and BoneMarrow2
. The Pancreas dataset contains one dataset: Pancreas
. The goal of this project is to identify cell types in the BoneMarrow and Pancreas datasets. I will perform the following analysis:
- Setup the Seurat Object
- Standard pre-processing workfow
- Normalizing the data
- Identification of highly variable features (feature selection)
- Scaling the data
- Perform linear dimensional reduction
- Determine the ‘dimensionality’ of the dataset
- Cluster the cells
- Run non-linear dimensional reduction (UMAP/tSNE)
- Finding differentially expressed features (cluster biomarkers)
- Assigning cell type identity to clusters
2 Part 1: BoneMarrow
2.1 Read the data
# Load the BoneMarrow dataset
<- readRDS(
BoneMarrow_data1 paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"BoneMarrow_dataset1.rds",
sep = "")) |>
as.data.frame()
<- readRDS(
BoneMarrow_data2 paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"BoneMarrow_dataset2.rds",
sep = "")) |>
as.data.frame()
2.2 Create Seurat and QC
# Create the BoneMarrow1 Seurat object
<- Seurat::CreateSeuratObject(
BoneMarrow1 counts = BoneMarrow_data1,
project = "BoneMarrow1",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for BoneMarrow1
"percent.mt"]] <- PercentageFeatureSet(
BoneMarrow1[[
BoneMarrow1,pattern = "^MT-") |>
suppressWarnings()
# Create the BoneMarrow2 Seurat object
<- Seurat::CreateSeuratObject(
BoneMarrow2 counts = BoneMarrow_data2,
project = "BoneMarrow2",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for BoneMarrow2
"percent.mt"]] <- PercentageFeatureSet(
BoneMarrow2[[
BoneMarrow2,pattern = "^MT-") |>
suppressWarnings()
VlnPlot(BoneMarrow1,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()
VlnPlot(BoneMarrow2,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()
2.3 Transformation & Feature selection & Scaling
I will try log1pCPM
, and SCTransform
to transform the data and compare the results.
# Transform the BoneMarrow1 data
# Seurat default log-normalization (log1pCP10k)
<- NormalizeData(
pbmc_log1pCP1k_1
BoneMarrow1,normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
<- FindVariableFeatures(
pbmc_features_log1pCP1k_1
pbmc_log1pCP1k_1,selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
<-
pbmc_scaled_log1pCP1k_1 ScaleData(pbmc_features_log1pCP1k_1,
features =
VariableFeatures(pbmc_features_log1pCP1k_1))
# Transform the BoneMarrow2 data
# Seurat default log-normalization (log1pCP10k)
<- NormalizeData(
pbmc_log1pCP1k_2
BoneMarrow2,normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
<- FindVariableFeatures(
pbmc_features_log1pCP1k_2
pbmc_log1pCP1k_2,selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
<-
pbmc_scaled_log1pCP1k_2 ScaleData(pbmc_features_log1pCP1k_2,
features =
VariableFeatures(pbmc_features_log1pCP1k_2))
Try SCTransform
to transform the data.
# SCTransform
# SCTransform the BoneMarrow1 data
<- SCTransform(
BoneMarrow1_SCTransform
BoneMarrow1,verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))
# SCTransform the BoneMarrow2 data
<- SCTransform(
BoneMarrow2_SCTransform
BoneMarrow2,verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))
Calculate the variance of transformed data and plot it against the mean.
# Calculate the gene variance of transformed data
# and plot it against the gene mean
# BoneMarrow1 Log1pCPM
<-
gene_var_log1pCPM_1 "DNA"]]@layers$scale.data |>
pbmc_scaled_log1pCP1k_1[[apply(1, var)
<-
gene_mean_log1pCPM_1 "DNA"]]@layers$scale.data |>
pbmc_scaled_log1pCP1k_1[[apply(1, mean)
# BoneMarrow2 Log1pCPM
<-
gene_var_log1pCPM_2 "DNA"]]@layers$scale.data |>
pbmc_scaled_log1pCP1k_2[[apply(1, var)
<-
gene_mean_log1pCPM_2 "DNA"]]@layers$scale.data |>
pbmc_scaled_log1pCP1k_2[[apply(1, mean)
# BoneMarrow1 SCTransform
<-
gene_var_SCTransform_1 "SCT"]]@scale.data |>
BoneMarrow1_SCTransform[[apply(1, var)
<-
gene_mean_SCTransform_1 "SCT"]]@scale.data |>
BoneMarrow1_SCTransform[[apply(1, mean)
# BoneMarrow2 SCTransform
<-
gene_var_SCTransform_2 "SCT"]]@scale.data |>
BoneMarrow2_SCTransform[[apply(1, var)
<-
gene_mean_SCTransform_2 "SCT"]]@scale.data |>
BoneMarrow2_SCTransform[[apply(1, mean)
# Plot the gene variance against the gene mean
# Create a data frame for plotting
<-
gene_var_mean_df data.frame(Gene_Variance =
c(gene_var_log1pCPM_1,
gene_var_log1pCPM_2,
gene_var_SCTransform_1,
gene_var_SCTransform_2),Gene_Mean =
c(gene_mean_log1pCPM_1,
gene_mean_log1pCPM_2,
gene_mean_SCTransform_1,
gene_mean_SCTransform_2),Method =
c(rep("Log1pCPM",
length(gene_var_log1pCPM_1) +
length(gene_var_log1pCPM_2)),
rep("SCTransform",
length(gene_var_SCTransform_1) +
length(gene_var_SCTransform_2))))
# Plot the gene means vs. different variances
ggplot(gene_var_mean_df, aes(x = Gene_Mean,
y = Gene_Variance,
color = Method)) +
geom_point(alpha = 0.3) +
theme_grey() +
facet_wrap(~ Method, scales = "free") +
labs(title = "Gene Variance vs. Gene Mean",
x = "Gene Mean",
y = "Gene Variance") +
theme(legend.position = "none") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5))
# Calculate the Spearman correlation
<-
correlation_log1pCPM_1 cor(gene_var_log1pCPM_1, gene_mean_log1pCPM_1,
method = "spearman")
<-
correlation_log1pCPM_2 cor(gene_var_log1pCPM_2, gene_mean_log1pCPM_2,
method = "spearman")
<-
correlation_SCTransform_1 cor(gene_var_SCTransform_1, gene_mean_SCTransform_1,
method = "spearman")
<-
correlation_SCTransform_2 cor(gene_var_SCTransform_2, gene_mean_SCTransform_2,
method = "spearman")
# print the Spearman correlation results
cbind(Method = c("Log1pCPM", "Log1pCPM",
"SCTransform", "SCTransform"),
Dataset = c("BoneMarrow1", "BoneMarrow2",
"BoneMarrow1", "BoneMarrow2"),
Correlation = c(correlation_log1pCPM_1,
correlation_log1pCPM_2,
correlation_SCTransform_1, |>
correlation_SCTransform_2)) ::kable() knitr
Method | Dataset | Correlation |
---|---|---|
Log1pCPM | BoneMarrow1 | 0.974603251887739 |
Log1pCPM | BoneMarrow2 | 0.961849201395136 |
SCTransform | BoneMarrow1 | -0.01872351035011 |
SCTransform | BoneMarrow2 | 0.00471352040124233 |
It can be seen from the resutls that the SCTransform method has a lower correlation between gene variance and gene mean compared to the Log1pCPM method. This indicates that the SCTransform method is better at normalizing the data. Therefore, I will use the SCTransform method for downstream analysis.
2.4 Linear dimensional reduction
# perform PCA
<- RunPCA(
BoneMarrow1_pbmc_PCA
BoneMarrow1_SCTransform,features = VariableFeatures(BoneMarrow1_SCTransform),
npcs = 100, # tune this parameter
verbose = FALSE)
ElbowPlot(BoneMarrow1_pbmc_PCA, ndims = 100)
<- RunPCA(
BoneMarrow2_pbmc_PCA
BoneMarrow2_SCTransform,features = VariableFeatures(BoneMarrow2_SCTransform),
npcs = 100, # tune this parameter
verbose = FALSE)
ElbowPlot(BoneMarrow2_pbmc_PCA, ndims = 100)
2.5 Get the gene names
Ensembl IDs to gene
= useMart(biomart = "ensembl",
mrt dataset = "hsapiens_gene_ensembl")
if (file.exists("BoneMarrow_data1_IDs.rds")) {
<- readRDS("BoneMarrow_data1_IDs.rds")
BoneMarrow_data1_IDs else {
} <- getBM(
BoneMarrow_data1_IDs attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(BoneMarrow_data1),
mart = mrt)
saveRDS(BoneMarrow_data1_IDs, "BoneMarrow_data1_IDs.rds")
}
if (file.exists("BoneMarrow_data2_IDs.rds")) {
<- readRDS("BoneMarrow_data2_IDs.rds")
BoneMarrow_data2_IDs else {
} <- getBM(
BoneMarrow_data2_IDs attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(BoneMarrow_data2),
mart = mrt)
saveRDS(BoneMarrow_data2_IDs, "BoneMarrow_data2_IDs.rds")
}
2.6 Clustering
Perform neighborhood-based clustering (I tried different parameters and the ones below gave the best results).
# Perform neighborhood-based clustering
<- FindNeighbors(
BoneMarrow1_pbmc_neighbor
BoneMarrow1_pbmc_PCA,reduction = "pca",
dims = 1:100, # tune this parameter
k.param = 30, # tune this parameter
prune.SNN = 1/10, # tune this parameter
verbose = FALSE)
<- FindClusters(
BoneMarrow1_pbmc_cluster
BoneMarrow1_pbmc_neighbor,resolution = 1.8, # tune this parameter
verbose = FALSE)
<- FindNeighbors(
BoneMarrow2_pbmc_neighbor
BoneMarrow2_pbmc_PCA,reduction = "pca",
dims = 1:100, # tune this parameter
k.param = 40, # tune this parameter
prune.SNN = 1/10, # tune this parameter
verbose = FALSE)
<- FindClusters(
BoneMarrow2_pbmc_cluster
BoneMarrow2_pbmc_neighbor,resolution = 0.8, # tune this parameter
verbose = FALSE)
Run UMAP and t-SNE
# Run UMAP
<- RunUMAP(
BoneMarrow1_pbmc_UMAP
BoneMarrow1_pbmc_cluster,min.dist = 0.3,
n.neighbors = 30,
dims = 1:79,
metric = "cosine",
verbose = FALSE) |>
suppressWarnings()
<- RunUMAP(
BoneMarrow2_pbmc_UMAP
BoneMarrow2_pbmc_cluster,min.dist = 0.3,
n.neighbors = 30,
dims = 1:87,
metric = "cosine",
verbose = FALSE)
# try t_SNE
<- RunTSNE(
BoneMarrow1_pbmc_tSNE
BoneMarrow1_pbmc_cluster,dims = 1:50,
verbose = FALSE)
<- RunTSNE(
BoneMarrow2_pbmc_tSNE
BoneMarrow2_pbmc_cluster,dims = 1:98,
verbose = FALSE)
Plot the UMAP
DimPlot(BoneMarrow1_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow1 UMAP")
DimPlot(BoneMarrow2_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow2 UMAP")
Plot the t-SNE
DimPlot(BoneMarrow1_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow1 t-SNE")
DimPlot(BoneMarrow2_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("BoneMarrow2 t-SNE")
By comparing the UMAP and t-SNE plots, I decided to use the UMAP for both datasets.
2.7 Annotation Results
2.7.1 BoneMarrow1 data
# Find gene markers for the BoneMarrow1 data
# use the UMAP for the BoneMarrow1 data to find the gene markers
if (file.exists("BoneMarrow1_pbmc_markers.rds")) {
<- readRDS("BoneMarrow1_pbmc_markers.rds")
BoneMarrow1_pbmc_markers else {
} <-
BoneMarrow1_pbmc_markers FindAllMarkers(
BoneMarrow1_pbmc_UMAP,min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(BoneMarrow1_pbmc_markers, "BoneMarrow1_pbmc_markers.rds")
}
# select the top 5 highly expressed genes in each cluster
# to roughly identify the cell types
<-
BoneMarrow1_pbmc_markers_top_5 |>
BoneMarrow1_pbmc_markers as_tibble() |>
filter(avg_log2FC > 4) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
~ BoneMarrow_data1_IDs$hgnc_symbol[
gene, match(.x, BoneMarrow_data1_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
dplyr
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow1_pbmc_markers_top_5.xlsx")) {
write_xlsx(BoneMarrow1_pbmc_markers_top_5,
"BoneMarrow1_pbmc_markers_top_5.xlsx")
}
|>
BoneMarrow1_pbmc_markers_top_5 ::kable() knitr
cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
---|---|---|---|---|---|---|---|
0 | CCR4 | 0 | 4.874914 | 0.382 | 0.014 | 0.0000000 | ENSG00000183813 |
0 | TTC39C-AS1 | 0 | 4.401808 | 0.091 | 0.004 | 0.0000000 | ENSG00000264745 |
0 | WNT7A | 0 | 4.294893 | 0.087 | 0.004 | 0.0000000 | ENSG00000154764 |
0 | FOXP3 | 0 | 4.169362 | 0.044 | 0.002 | 0.0000000 | ENSG00000049768 |
1 | MAFB | 0 | 4.238447 | 0.972 | 0.106 | 0.0000000 | ENSG00000204103 |
1 | THBS1 | 0 | 4.626654 | 0.816 | 0.069 | 0.0000000 | ENSG00000137801 |
1 | PID1 | 0 | 4.263354 | 0.624 | 0.039 | 0.0000000 | ENSG00000153823 |
1 | CYP1B1 | 0 | 5.438358 | 0.543 | 0.025 | 0.0000000 | ENSG00000138061 |
1 | CD300E | 0 | 4.320640 | 0.571 | 0.030 | 0.0000000 | ENSG00000186407 |
2 | KLRC2 | 0 | 4.734576 | 0.829 | 0.053 | 0.0000000 | ENSG00000205809 |
2 | TKTL1 | 0 | 5.867000 | 0.566 | 0.017 | 0.0000000 | ENSG00000007350 |
2 | KIR3DL2 | 0 | 4.176698 | 0.483 | 0.028 | 0.0000000 | ENSG00000240403 |
2 | KLRC3 | 0 | 4.646220 | 0.415 | 0.021 | 0.0000000 | ENSG00000205810 |
2 | LDB2 | 0 | 4.309103 | 0.102 | 0.005 | 0.0000000 | ENSG00000169744 |
3 | AZU1 | 0 | 7.113508 | 0.741 | 0.034 | 0.0000000 | ENSG00000172232 |
3 | MPO | 0 | 7.051123 | 0.615 | 0.030 | 0.0000000 | ENSG00000005381 |
3 | ELANE | 0 | 7.931680 | 0.498 | 0.014 | 0.0000000 | ENSG00000197561 |
3 | RNASE2 | 0 | 4.601070 | 0.678 | 0.054 | 0.0000000 | ENSG00000169385 |
3 | CTSG | 0 | 7.272826 | 0.390 | 0.007 | 0.0000000 | ENSG00000100448 |
4 | TNFRSF9 | 0 | 5.339711 | 0.261 | 0.012 | 0.0000000 | ENSG00000049249 |
4 | CCL3-AS1 | 0 | 4.315129 | 0.227 | 0.017 | 0.0000000 | ENSG00000277089 |
4 | CXCR6 | 0 | 4.409252 | 0.177 | 0.010 | 0.0000000 | ENSG00000172215 |
4 | DKK3 | 0 | 4.144354 | 0.148 | 0.010 | 0.0000000 | ENSG00000050165 |
4 | VCAM1 | 0 | 6.842211 | 0.025 | 0.000 | 0.0000000 | ENSG00000162692 |
5 | MS4A1 | 0 | 7.384164 | 0.866 | 0.015 | 0.0000000 | ENSG00000156738 |
5 | CD79A | 0 | 7.144223 | 0.985 | 0.034 | 0.0000000 | ENSG00000105369 |
5 | LINC00926 | 0 | 7.117973 | 0.738 | 0.012 | 0.0000000 | ENSG00000247982 |
5 | BANK1 | 0 | 6.714238 | 0.713 | 0.019 | 0.0000000 | ENSG00000153064 |
5 | FCER2 | 0 | 7.537949 | 0.554 | 0.006 | 0.0000000 | ENSG00000104921 |
6 | MYOM2 | 0 | 5.380428 | 0.885 | 0.050 | 0.0000000 | ENSG00000036448 |
6 | FGFBP2 | 0 | 4.273741 | 0.935 | 0.093 | 0.0000000 | ENSG00000137441 |
6 | SH2D1B | 0 | 4.317394 | 0.620 | 0.037 | 0.0000000 | ENSG00000198574 |
6 | SPON2 | 0 | 4.029231 | 0.975 | 0.208 | 0.0000000 | ENSG00000159674 |
6 | LINC00299 | 0 | 5.023763 | 0.370 | 0.016 | 0.0000000 | ENSG00000236790 |
7 | FSBP | 0 | 5.568025 | 0.012 | 0.000 | 0.0002713 | ENSG00000265817 |
7 | CELA1 | 0 | 5.568025 | 0.012 | 0.000 | 0.0002713 | ENSG00000139610 |
8 | CCL3L3 | 0 | 4.485081 | 0.781 | 0.057 | 0.0000000 | ENSG00000276085 |
8 | KCNE1 | 0 | 4.955907 | 0.323 | 0.014 | 0.0000000 | ENSG00000180509 |
8 | TNFAIP6 | 0 | 4.494003 | 0.303 | 0.022 | 0.0000000 | ENSG00000123610 |
8 | 0 | 4.503048 | 0.181 | 0.008 | 0.0000000 | ENSG00000287553 | |
8 | CXCL1 | 0 | 4.981095 | 0.168 | 0.008 | 0.0000000 | ENSG00000163739 |
9 | TRAV14DV4 | 0 | 4.364786 | 0.263 | 0.014 | 0.0000000 | ENSG00000211792 |
9 | TRDV2 | 0 | 6.154833 | 0.180 | 0.005 | 0.0000000 | ENSG00000211821 |
9 | ZNF683 | 0 | 4.564768 | 0.218 | 0.010 | 0.0000000 | ENSG00000176083 |
9 | TRBV5-1 | 0 | 5.172667 | 0.241 | 0.014 | 0.0000000 | ENSG00000211734 |
9 | TRGV2 | 0 | 4.735881 | 0.158 | 0.009 | 0.0000000 | ENSG00000233306 |
10 | SLC4A10 | 0 | 7.069240 | 0.250 | 0.002 | 0.0000000 | ENSG00000144290 |
10 | TRAV1-2 | 0 | 4.894153 | 0.189 | 0.006 | 0.0000000 | ENSG00000256553 |
10 | LINC01644 | 0 | 5.024152 | 0.144 | 0.004 | 0.0000000 | ENSG00000218357 |
10 | TRBV6-4 | 0 | 5.680197 | 0.091 | 0.002 | 0.0000000 | ENSG00000211713 |
10 | KIF5C | 0 | 4.225359 | 0.091 | 0.005 | 0.0000000 | ENSG00000168280 |
11 | LINC02812 | 0 | 9.882266 | 0.825 | 0.001 | 0.0000000 | ENSG00000230138 |
11 | LILRA4 | 0 | 8.612708 | 0.992 | 0.012 | 0.0000000 | ENSG00000239961 |
11 | PTPRS | 0 | 8.572198 | 0.775 | 0.003 | 0.0000000 | ENSG00000105426 |
11 | CLEC4C | 0 | 8.503754 | 0.850 | 0.004 | 0.0000000 | ENSG00000198178 |
11 | SCT | 0 | 8.283629 | 0.592 | 0.002 | 0.0000000 | ENSG00000070031 |
12 | SPTA1 | 0 | 6.572461 | 0.030 | 0.000 | 0.0000000 | ENSG00000163554 |
12 | SELP | 0 | 6.350069 | 0.030 | 0.000 | 0.0000000 | ENSG00000174175 |
12 | MFSD2B | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000205639 |
12 | POLQ | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000051341 |
12 | B3GALNT1 | 0 | 5.765106 | 0.030 | 0.000 | 0.0000000 | ENSG00000169255 |
13 | CRHBP | 0 | 11.350012 | 0.762 | 0.000 | 0.0000000 | ENSG00000145708 |
13 | CD34 | 0 | 8.263918 | 0.845 | 0.004 | 0.0000000 | ENSG00000174059 |
13 | CYTL1 | 0 | 7.493245 | 0.690 | 0.007 | 0.0000000 | ENSG00000170891 |
13 | NPR3 | 0 | 8.010162 | 0.560 | 0.003 | 0.0000000 | ENSG00000113389 |
13 | EGFL7 | 0 | 5.907402 | 0.833 | 0.019 | 0.0000000 | ENSG00000172889 |
14 | LINC02446 | 0 | 5.434920 | 0.836 | 0.037 | 0.0000000 | ENSG00000256039 |
14 | CD8B | 0 | 4.262326 | 0.970 | 0.076 | 0.0000000 | ENSG00000172116 |
14 | C20orf204 | 0 | 4.298116 | 0.209 | 0.011 | 0.0000000 | ENSG00000196421 |
14 | PCSK1N | 0 | 4.345422 | 0.224 | 0.015 | 0.0000000 | ENSG00000102109 |
14 | CD248 | 0 | 7.667350 | 0.045 | 0.000 | 0.0000000 | ENSG00000174807 |
15 | CLEC10A | 0 | 7.985711 | 0.578 | 0.004 | 0.0000000 | ENSG00000132514 |
15 | ENHO | 0 | 7.668228 | 0.356 | 0.002 | 0.0000000 | ENSG00000168913 |
15 | CD1C | 0 | 6.268298 | 0.578 | 0.013 | 0.0000000 | ENSG00000158481 |
15 | FCER1A | 0 | 5.012487 | 0.844 | 0.040 | 0.0000000 | ENSG00000179639 |
15 | AXL | 0 | 6.400748 | 0.289 | 0.004 | 0.0000000 | ENSG00000167601 |
16 | LYPD2 | 0 | 13.985578 | 0.543 | 0.000 | 0.0000000 | ENSG00000197353 |
16 | UICLM | 0 | 10.106432 | 0.629 | 0.000 | 0.0000000 | ENSG00000233392 |
16 | HES4 | 0 | 7.714115 | 0.514 | 0.003 | 0.0000000 | ENSG00000188290 |
16 | CDKN1C | 0 | 6.806037 | 0.571 | 0.007 | 0.0000000 | ENSG00000129757 |
16 | VMO1 | 0 | 9.205968 | 0.257 | 0.000 | 0.0000000 | ENSG00000182853 |
17 | SDC1 | 0 | 10.283611 | 0.548 | 0.000 | 0.0000000 | ENSG00000115884 |
17 | TNFRSF17 | 0 | 8.133368 | 0.645 | 0.003 | 0.0000000 | ENSG00000048462 |
17 | JSRP1 | 0 | 10.605539 | 0.452 | 0.000 | 0.0000000 | ENSG00000167476 |
17 | IGLV3-1 | 0 | 11.477532 | 0.581 | 0.005 | 0.0000000 | ENSG00000211673 |
17 | PARM1 | 0 | 7.476256 | 0.452 | 0.003 | 0.0000000 | ENSG00000169116 |
By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow1
data. I found the following cell types:
Cluster | Cell Type |
---|---|
0 | Regulatory T-cell |
1 | Macrophage Cell |
2 | Natural Killer Cell |
3 | Neutrophil Precursors |
4 | CD8 T Cell |
5 | Naïve B Cell |
6 | Natural Killer Cell |
7 | Megakaryocyte Cell |
8 | Macrophage Cell |
9 | CD8 T Cell |
10 | Mucosal-associated Invariant T Cell |
11 | Plasmacytoid Dendritic Cell |
12 | Erythroid Cells |
13 | Hematopoietic Stem Cells |
14 | CD8 T Cell |
15 | Dendritic Cell |
16 | Non-classical Monocyte Cell |
17 | Plasma Cell |
Since I already have the primary cell types for the BoneMarrow1
data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the BoneMarrow1
data:
Cell Type | Gene Markers |
---|---|
Regulatory T-cell | CD25, CD4, FOXP3, CCR4, CCR6 |
Macrophage Cell | ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14 |
Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
Neutrophil Precursors | S100A8, S100A9, S100A12, LCN2, LTF |
CD8 T Cell | CD3, CD5, CD8, CD27, CD28 |
Naïve B Cell | P2RX5, PTPRCAP |
Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
Megakaryocyte Cell | IL6ST |
Macrophage Cell | ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14 |
CD8 T Cell | CD3, CD5, CD8, CD27, CD28 |
Mucosal-associated Invariant T Cell | TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10 |
Plasmacytoid Dendritic Cell | BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1 |
Erythroid Cells | AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO |
Hematopoietic Stem Cells | CD34, MECOM, EGR1 |
CD8 T Cell | CD8A, CD8B, CD3D, CD3E, CD3G, CD4, CD5, CD6, CD7, CD8, CD8, CD8 |
Dendritic Cell | CD1C, CD86 |
Non-classical Monocyte Cell | CD14, CD16, CX3CR1, LYPD2 |
Plasma Cell | MZB1, IGHG1, JCHAIN, SPAG4, TGM5 |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow1
data.
- Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
# define a function to plot feature plots for the classic gene markers
<- function(cell_type, classic_gene_markers) {
plot_feature_plots # map the gene names back to the ENSEMBL IDs
<- c(
classic_gene_maker_names
classic_gene_markers)
<- getBM(
Ensembl attributes=c("ensembl_gene_id","hgnc_symbol"),
filters = 'hgnc_symbol', values = classic_gene_maker_names,
mart = mrt)
<- FeaturePlot(
feature_plot
cell_type,features = Ensembl$ensembl_gene_id,
pt.size = 0.1) |>
suppressWarnings()
<- VlnPlot(
violin_plot
cell_type,features = Ensembl$ensembl_gene_id,
pt.size = 0.1,
ncol = 2) |>
suppressWarnings()
<- Ensembl |>
gene_id_names # pivot_wider(names_from = hgnc_symbol,
# values_from = ensembl_gene_id) |>
kable() |>
suppressWarnings()
return(list(
gene_id_names = gene_id_names,
feature_plot = feature_plot,
violin_plot = violin_plot
))
}
# map the gene names back to the ENSEMBL IDs
<- c(
Regulatory_T_classic_gene_maker_names "CD25", "CD4", "FOXP3", "CCR4", "CCR6")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Regulatory_T_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4 |
|ENSG00000112486 |CCR6 |
|ENSG00000010610 |CD4 |
|ENSG00000049768 |FOXP3 |
$feature_plot
$violin_plot
- Macrophage Cell: ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206
<- c(
Macrophage_Cell_classic_gene_maker_names "ARG", "CCL2", "CD163", "CD206", "FIZZ1", "IL-10", "MRC1", "CD163", "CD206", "CD14")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Macrophage_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000108691 |CCL2 |
|ENSG00000170458 |CD14 |
|ENSG00000177575 |CD163 |
|ENSG00000260314 |MRC1 |
$feature_plot
$violin_plot
- NK Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
<- c(
NK_Cell_classic_gene_maker_names "TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
NK_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY |
|ENSG00000145649 |GZMA |
|ENSG00000171476 |HOPX |
|ENSG00000150045 |KLRF1 |
|ENSG00000275521 |NCR1 |
|ENSG00000278025 |NCR1 |
|ENSG00000277442 |NCR1 |
|ENSG00000277334 |NCR1 |
|ENSG00000276450 |NCR1 |
|ENSG00000277629 |NCR1 |
|ENSG00000273506 |NCR1 |
|ENSG00000275637 |NCR1 |
|ENSG00000277824 |NCR1 |
|ENSG00000275822 |NCR1 |
|ENSG00000274053 |NCR1 |
|ENSG00000278362 |NCR1 |
|ENSG00000273916 |NCR1 |
|ENSG00000273535 |NCR1 |
|ENSG00000275156 |NCR1 |
|ENSG00000189430 |NCR1 |
|ENSG00000211829 |TRDC |
$feature_plot
$violin_plot
- Neutrophil Precursors: S100A8, S100A9, S100A12, LCN2, LTF
<- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
Neutrophil_Precursors_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Neutrophil_Precursors_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2 |
|ENSG00000012223 |LTF |
|ENSG00000163221 |S100A12 |
|ENSG00000143546 |S100A8 |
|ENSG00000163220 |S100A9 |
$feature_plot
$violin_plot
- CD8 T Cell: CD3, CD5, CD8, CD27, CD28
<- c("CD3", "CD5", "CD8", "CD27", "CD28")
CD8_T_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
CD8_T_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27 |
|ENSG00000178562 |CD28 |
|ENSG00000110448 |CD5 |
$feature_plot
$violin_plot
- Naïve B Cell: P2RX5, PTPRCAP
<- c("P2RX5", "PTPRCAP")
Naive_B_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Naive_B_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000083454 |P2RX5 |
|ENSG00000213402 |PTPRCAP |
$feature_plot
$violin_plot
- Megakaryocyte Cell: IL6ST
<- c("IL6ST")
Megakaryocyte_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Megakaryocyte_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000134352 |IL6ST |
$feature_plot
$violin_plot
- Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
<- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10 |
|ENSG00000211877 |TRAJ12 |
|ENSG00000211869 |TRAJ20 |
|ENSG00000211856 |TRAJ33 |
|ENSG00000256553 |TRAV1-2 |
$feature_plot
$violin_plot
- Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1,CD40, CD80, CD86,
<- c(
Plasmacytoid_Dendritic_Cell_classic_gene_maker_names "BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Plasmacytoid_Dendritic_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C |
|ENSG00000099250 |NRP1 |
$feature_plot
$violin_plot
- Erythroid Cell: AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO
<- c("AHSP", "HBA-A1", "HBA-A2", "HIST1H2AO", "STFA1","ALAD", "PNPO")
Erythroid_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Erythroid_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000169877 |AHSP |
|ENSG00000148218 |ALAD |
|ENSG00000108439 |PNPO |
$feature_plot
$violin_plot
- Hematopoietic Stem Cell: CD34, MECOM, EGR1
<- c("CD34", "MECOM", "EGR1")
Hematopoietic_Stem_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Hematopoietic_Stem_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000174059 |CD34 |
|ENSG00000120738 |EGR1 |
|ENSG00000085276 |MECOM |
$feature_plot
$violin_plot
- Dendritic Cell: CD1C, CD86
<- c(
Dendritic_Cell_classic_gene_maker_names "CD1C", "CD86")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Dendritic_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C |
|ENSG00000114013 |CD86 |
$feature_plot
$violin_plot
- Non-classical Monocyte Cell: CD14, CD16, CX3CR1, LYPD2
<- c(
Non_classical_Monocyte_Cell_classic_gene_maker_names "CD14", "CD16", "LYPD2")
plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Non_classical_Monocyte_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000170458 |CD14 |
|ENSG00000197353 |LYPD2 |
$feature_plot
$violin_plot
- Plasma Cell: MZB1, IGHG1, JCHAIN, SPAG4, TGM5
<- c("MZB1", "IGHG1", "JCHAIN", "SPAG4", "TGM5")
Plasma_Cell_classic_gene_maker_names plot_feature_plots(BoneMarrow1_pbmc_UMAP,
Plasma_Cell_classic_gene_maker_names)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000277633 |IGHG1 |
|ENSG00000211896 |IGHG1 |
|ENSG00000132465 |JCHAIN |
|ENSG00000170476 |MZB1 |
|ENSG00000061656 |SPAG4 |
|ENSG00000104055 |TGM5 |
$feature_plot
$violin_plot
Annotate the cell types for the BoneMarrow1 data
# Annotate the cell types for the BoneMarrow1 data
# manually rename the seurat_clusters to the cell types
<- BoneMarrow1_pbmc_UMAP@meta.data |>
BoneMarrow1_pbmc_UMAP_new_meta # change the seurat_clusters to cell types
::mutate(cell_type = case_when(
dplyr== 0 ~ "Regulatory T-cell",
seurat_clusters == 1 ~ "Macrophage Cell",
seurat_clusters == 2 ~ "Natural Killer Cell",
seurat_clusters == 3 ~ "Neutrophil Precursors",
seurat_clusters == 4 ~ "CD8 T cell",
seurat_clusters == 5 ~ "Naïve B cell",
seurat_clusters == 6 ~ "Natural Killer Cell",
seurat_clusters == 7 ~ "Megakaryocyte Cell",
seurat_clusters == 8 ~ "Macrophage Cell",
seurat_clusters == 9 ~ "CD8 T cell",
seurat_clusters == 10 ~ "Mucosal-associated Invariant T Cell",
seurat_clusters == 11 ~ "Plasmacytoid Dendritic Cell",
seurat_clusters == 12 ~ "Erythroid Cells",
seurat_clusters == 13 ~ "Hematopoietic Stem Cells",
seurat_clusters == 14 ~ "CD8 T cell",
seurat_clusters == 15 ~ "Dendritic Cell",
seurat_clusters == 16 ~ "Non-classical Monocyte Cell",
seurat_clusters == 17 ~ "Plasma Cell")) |>
seurat_clusters ::mutate(cell_type = as.factor(cell_type))
dplyr
# add the cell types to the UMAP object
<- AddMetaData(
BoneMarrow1_pbmc_UMAP
BoneMarrow1_pbmc_UMAP,metadata = BoneMarrow1_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to the RDS file
saveRDS(BoneMarrow1_pbmc_UMAP, "BoneMarrow1_pbmc_UMAP.rds")
2.7.1.1 Summary Plots
Plot the UMAP with the cell types for the BoneMarrow1
data
DimPlot(BoneMarrow1_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 3,
repel = TRUE,
alpha = 0.5) +
ggtitle("BoneMarrow1 Cell Types") +
NoLegend()
Plot the heatmap
DoHeatmap(
BoneMarrow1_pbmc_UMAP,features = BoneMarrow1_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
size = 4) |>
suppressWarnings() +
NoLegend() +
ggtitle("BoneMarrow1 Heatmap") +
rremove("y.text")
2.7.2 BoneMarrow2 data
# I decided to use the UMAP for the BoneMarrow2 data
# to find the gene markers
if (file.exists("BoneMarrow2_pbmc_markers.rds")) {
<- readRDS("BoneMarrow2_pbmc_markers.rds")
BoneMarrow2_pbmc_markers else {
} <-
BoneMarrow2_pbmc_markers FindAllMarkers(
BoneMarrow2_pbmc_UMAP,min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(BoneMarrow2_pbmc_markers, "BoneMarrow2_pbmc_markers.rds")
}
<-
BoneMarrow2_pbmc_markers_top_5 |>
BoneMarrow2_pbmc_markers as_tibble() |>
filter(avg_log2FC > 4) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
~ BoneMarrow_data2_IDs$hgnc_symbol[
gene, match(.x, BoneMarrow_data2_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
dplyr
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow2_pbmc_markers_top_5.xlsx")) {
write_xlsx(BoneMarrow2_pbmc_markers_top_5,
"BoneMarrow2_pbmc_markers_top_5.xlsx")
}
|>
BoneMarrow2_pbmc_markers_top_5 ::kable() knitr
cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
---|---|---|---|---|---|---|---|
0 | S100A12 | 0 | 8.700578 | 0.897 | 0.011 | 0e+00 | ENSG00000163221 |
0 | S100A8 | 0 | 7.696677 | 0.993 | 0.093 | 0e+00 | ENSG00000143546 |
0 | S100A9 | 0 | 7.390087 | 0.998 | 0.138 | 0e+00 | ENSG00000163220 |
0 | CD14 | 0 | 7.300052 | 0.676 | 0.009 | 0e+00 | ENSG00000170458 |
0 | CLEC4E | 0 | 7.265346 | 0.657 | 0.007 | 0e+00 | ENSG00000166523 |
1 | CCR4 | 0 | 5.170545 | 0.306 | 0.008 | 0e+00 | ENSG00000183813 |
1 | TTC39C-AS1 | 0 | 4.442917 | 0.097 | 0.004 | 0e+00 | ENSG00000264745 |
1 | FOXP3 | 0 | 4.069459 | 0.062 | 0.004 | 0e+00 | ENSG00000049768 |
1 | PI16 | 0 | 5.528891 | 0.039 | 0.000 | 0e+00 | ENSG00000164530 |
1 | NEFL | 0 | 4.593021 | 0.043 | 0.001 | 0e+00 | ENSG00000277586 |
2 | KLRC2 | 0 | 4.488809 | 0.642 | 0.057 | 0e+00 | ENSG00000205809 |
2 | TKTL1 | 0 | 5.667839 | 0.362 | 0.017 | 0e+00 | ENSG00000007350 |
2 | KLRC3 | 0 | 4.342381 | 0.345 | 0.020 | 0e+00 | ENSG00000205810 |
2 | IL5RA | 0 | 4.732259 | 0.072 | 0.003 | 0e+00 | ENSG00000091181 |
3 | MYOM2 | 0 | 4.906103 | 0.886 | 0.055 | 0e+00 | ENSG00000036448 |
3 | FGFBP2 | 0 | 4.041734 | 0.905 | 0.100 | 0e+00 | ENSG00000137441 |
3 | SH2D1B | 0 | 4.343918 | 0.649 | 0.045 | 0e+00 | ENSG00000198574 |
3 | LINC00299 | 0 | 4.531001 | 0.431 | 0.025 | 0e+00 | ENSG00000236790 |
3 | PRSS23 | 0 | 4.230768 | 0.455 | 0.034 | 0e+00 | ENSG00000150687 |
4 | TNFRSF9 | 0 | 4.412879 | 0.325 | 0.018 | 0e+00 | ENSG00000049249 |
4 | ASTL | 0 | 4.889529 | 0.067 | 0.002 | 0e+00 | ENSG00000188886 |
4 | VCAM1 | 0 | 6.889529 | 0.029 | 0.000 | 0e+00 | ENSG00000162692 |
4 | ZNF80 | 0 | 5.889529 | 0.019 | 0.000 | 1e-07 | ENSG00000174255 |
5 | LRRN3 | 0 | 4.684415 | 0.187 | 0.006 | 0e+00 | ENSG00000173114 |
6 | SLC4A10 | 0 | 7.301805 | 0.395 | 0.003 | 0e+00 | ENSG00000144290 |
6 | TRAV1-2 | 0 | 4.486837 | 0.238 | 0.009 | 0e+00 | ENSG00000256553 |
6 | LINC01644 | 0 | 5.420450 | 0.195 | 0.004 | 0e+00 | ENSG00000218357 |
6 | TSPAN15 | 0 | 4.319363 | 0.157 | 0.008 | 0e+00 | ENSG00000099282 |
6 | IL23R | 0 | 5.216916 | 0.114 | 0.003 | 0e+00 | ENSG00000162594 |
7 | LILRA4 | 0 | 9.930713 | 0.975 | 0.005 | 0e+00 | ENSG00000239961 |
7 | LINC02812 | 0 | 9.230507 | 0.787 | 0.002 | 0e+00 | ENSG00000230138 |
7 | CLEC4C | 0 | 8.741929 | 0.861 | 0.003 | 0e+00 | ENSG00000198178 |
7 | LRRC26 | 0 | 8.675292 | 0.672 | 0.003 | 0e+00 | ENSG00000184709 |
7 | PROC | 0 | 8.369202 | 0.787 | 0.004 | 0e+00 | ENSG00000115718 |
8 | CD1C | 0 | 6.120903 | 0.286 | 0.010 | 0e+00 | ENSG00000158481 |
8 | CAVIN2 | 0 | 5.834204 | 0.321 | 0.019 | 0e+00 | ENSG00000168497 |
8 | CLEC10A | 0 | 5.894971 | 0.188 | 0.003 | 0e+00 | ENSG00000132514 |
8 | ENHO | 0 | 6.330357 | 0.152 | 0.002 | 0e+00 | ENSG00000168913 |
8 | C19orf33 | 0 | 7.107964 | 0.134 | 0.001 | 0e+00 | ENSG00000167644 |
9 | CD79A | 0 | 8.333834 | 0.958 | 0.021 | 0e+00 | ENSG00000105369 |
9 | MS4A1 | 0 | 7.611297 | 0.832 | 0.015 | 0e+00 | ENSG00000156738 |
9 | CD19 | 0 | 8.874331 | 0.568 | 0.002 | 0e+00 | ENSG00000177455 |
9 | FCER2 | 0 | 7.647739 | 0.653 | 0.006 | 0e+00 | ENSG00000104921 |
9 | FCRL1 | 0 | 7.857457 | 0.568 | 0.003 | 0e+00 | ENSG00000163534 |
10 | CDKN1C | 0 | 7.368670 | 0.603 | 0.004 | 0e+00 | ENSG00000129757 |
10 | CKB | 0 | 8.221113 | 0.562 | 0.003 | 0e+00 | ENSG00000166165 |
10 | LYPD2 | 0 | 13.087031 | 0.438 | 0.000 | 0e+00 | ENSG00000197353 |
10 | VMO1 | 0 | 9.435237 | 0.438 | 0.001 | 0e+00 | ENSG00000182853 |
10 | UICLM | 0 | 8.747181 | 0.438 | 0.001 | 0e+00 | ENSG00000233392 |
11 | PRTN3 | 0 | 9.439529 | 0.952 | 0.019 | 0e+00 | ENSG00000196415 |
11 | C1QTNF4 | 0 | 9.376598 | 0.629 | 0.003 | 0e+00 | ENSG00000172247 |
11 | SMIM24 | 0 | 7.842478 | 0.710 | 0.006 | 0e+00 | ENSG00000095932 |
11 | CTSG | 0 | 7.899165 | 0.790 | 0.010 | 0e+00 | ENSG00000100448 |
11 | MS4A3 | 0 | 8.151139 | 0.581 | 0.002 | 0e+00 | ENSG00000149516 |
12 | SPTSSB | 0 | 6.160634 | 0.500 | 0.010 | 0e+00 | ENSG00000196542 |
12 | XCL1 | 0 | 4.973271 | 0.871 | 0.051 | 0e+00 | ENSG00000143184 |
12 | KLRC1 | 0 | 4.814189 | 0.613 | 0.035 | 0e+00 | ENSG00000134545 |
12 | TNFRSF11A | 0 | 5.751829 | 0.194 | 0.004 | 0e+00 | ENSG00000141655 |
12 | ADGRG3 | 0 | 4.967806 | 0.226 | 0.008 | 0e+00 | ENSG00000182885 |
By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow2
data. I found the following cell types:
Cluster | Cell Type |
---|---|
0 | Neutrophil |
1 | Regulatory T-cell |
2 | Natural Killer Cell |
3 | CD16 Natural Killer Cell |
4 | CD8 T cell |
5 | Regulatory T-cell |
6 | Mucosal-associated Invariant |
7 | Plasmacytoid Dendritic Cell |
8 | Dendritic Cell |
9 | Naïve B cell |
10 | Non-classical Monocyte Cell |
11 | Neutrophil Granulocytes |
12 | Natural Killer Cell |
Since I already have the primary cell types for the BoneMarrow2
data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the BoneMarrow2
data:
Cell Type | Gene Markers |
---|---|
Neutrophil | S100A8, S100A9, S100A12, LCN2, LTF |
Regulatory T-cell | CD25, CD4, FOXP3, CCR4, CCR6 |
Natural Killer Cell | TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX |
CD16 Natural Killer Cell | MYOM2, FGFBP2, SH2D1B |
CD8 T cell | CD3, CD5, CD8, CD27, CD28 |
Mucosal-associated Invariant T Cell | TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10 |
Plasmacytoid Dendritic Cell | BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1 |
Dendritic Cell | CD1C, CD86 |
Naïve B cell | MS4A1, CD79A, P2RX5, PTPRCAP |
Non-classical Monocyte Cell | CD14, CD16, CX3CR1, LYPD2 |
Neutrophil Granulocytes | S100A8, S100A9, S100A12, LCN2, LTF |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow2
data.
- Neutrophil: S100A8, S100A9, S100A12, LCN2, LTF
<- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
Neutrophil_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Neutrophil_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2 |
|ENSG00000012223 |LTF |
|ENSG00000163221 |S100A12 |
|ENSG00000143546 |S100A8 |
|ENSG00000163220 |S100A9 |
$feature_plot
$violin_plot
- Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
<- c("CD25", "CD4", "FOXP3", "CCR4", "CCR6")
Regulatory_T_cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Regulatory_T_cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4 |
|ENSG00000112486 |CCR6 |
|ENSG00000010610 |CD4 |
|ENSG00000049768 |FOXP3 |
$feature_plot
$violin_plot
- Natural Killer Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
<- c("TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
Natural_Killer_Cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Natural_Killer_Cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY |
|ENSG00000145649 |GZMA |
|ENSG00000171476 |HOPX |
|ENSG00000150045 |KLRF1 |
|ENSG00000275521 |NCR1 |
|ENSG00000278025 |NCR1 |
|ENSG00000277442 |NCR1 |
|ENSG00000277334 |NCR1 |
|ENSG00000276450 |NCR1 |
|ENSG00000277629 |NCR1 |
|ENSG00000273506 |NCR1 |
|ENSG00000275637 |NCR1 |
|ENSG00000277824 |NCR1 |
|ENSG00000275822 |NCR1 |
|ENSG00000274053 |NCR1 |
|ENSG00000278362 |NCR1 |
|ENSG00000273916 |NCR1 |
|ENSG00000273535 |NCR1 |
|ENSG00000275156 |NCR1 |
|ENSG00000189430 |NCR1 |
|ENSG00000211829 |TRDC |
$feature_plot
$violin_plot
- CD16 Natural Killer Cell: MYOM2, FGFBP2, SH2D1B
<- c("MYOM2", "FGFBP2", "SH2D1B")
CD16_Natural_Killer_Cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
CD16_Natural_Killer_Cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000137441 |FGFBP2 |
|ENSG00000274137 |MYOM2 |
|ENSG00000036448 |MYOM2 |
|ENSG00000198574 |SH2D1B |
$feature_plot
$violin_plot
- CD8 T cell: CD3, CD5, CD8, CD27, CD28
<- c("CD3", "CD5", "CD8", "CD27", "CD28")
CD8_T_cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
CD8_T_cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27 |
|ENSG00000178562 |CD28 |
|ENSG00000110448 |CD5 |
$feature_plot
$violin_plot
- Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
<- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10 |
|ENSG00000211877 |TRAJ12 |
|ENSG00000211869 |TRAJ20 |
|ENSG00000211856 |TRAJ33 |
|ENSG00000256553 |TRAV1-2 |
$feature_plot
$violin_plot
- Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
<- c("BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
Plasmacytoid_Dendritic_Cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Plasmacytoid_Dendritic_Cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C |
|ENSG00000099250 |NRP1 |
$feature_plot
$violin_plot
- Dendritic Cell: CD1C, CD86
<- c("CD1C", "CD86")
Dendritic_Cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Dendritic_Cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C |
|ENSG00000114013 |CD86 |
$feature_plot
$violin_plot
- Naïve B cell: MS4A1, CD79A, P2RX5
<- c("MS4A1", "CD79A", "P2RX5")
Naive_B_cell_markers_bone_marrow2 plot_feature_plots(BoneMarrow2_pbmc_UMAP,
Naive_B_cell_markers_bone_marrow2)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000105369 |CD79A |
|ENSG00000156738 |MS4A1 |
|ENSG00000083454 |P2RX5 |
$feature_plot
$violin_plot
Annotate the cell types for the BoneMarrow2 data
# Annotate the cell types for the BoneMarrow2 data
# manually rename the seurat_clusters to the cell types
<- BoneMarrow2_pbmc_UMAP@meta.data |>
BoneMarrow2_pbmc_UMAP_new_meta # change the seurat_clusters to cell types
::mutate(cell_type = case_when(
dplyr== 0 ~ "Neutrophil",
seurat_clusters == 1 ~ "Regulatory T-cell",
seurat_clusters == 2 ~ "Natural Killer Cell",
seurat_clusters == 3 ~ "CD16 Natural Killer Cell",
seurat_clusters == 4 ~ "CD8 T Cell",
seurat_clusters == 5 ~ "Regulatory T-cell",
seurat_clusters == 6 ~ "Mucosal-associated Invariant T Cell",
seurat_clusters == 7 ~ "Plasmacytoid Dendritic Cell",
seurat_clusters == 8 ~ "Dendritic Cell",
seurat_clusters == 9 ~ "Naïve B Cell",
seurat_clusters == 10 ~ "Non-classical Monocyte Cell",
seurat_clusters == 11 ~ "Neutrophil Granulocytes",
seurat_clusters == 12 ~ "Natural Killer Cell")) |>
seurat_clusters ::mutate(cell_type = as.factor(cell_type))
dplyr
# add the cell types to the UMAP object
<- AddMetaData(
BoneMarrow2_pbmc_UMAP
BoneMarrow2_pbmc_UMAP,metadata = BoneMarrow2_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to a rds file
saveRDS(BoneMarrow2_pbmc_UMAP, "BoneMarrow2_pbmc_UMAP.rds")
2.7.2.1 Summary Plots
Plot the UMAP with the cell types for the BoneMarrow2
data
DimPlot(BoneMarrow2_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 3,
repel = TRUE,
alpha = 0.5) +
ggtitle("BoneMarrow2 Cell Types") +
NoLegend()
Plot the heatmap
DoHeatmap(
BoneMarrow2_pbmc_UMAP,features = BoneMarrow2_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
angle = 23,
size = 4) |>
suppressWarnings() +
NoLegend() +
ggtitle("BoneMarrow2 Heatmap") +
rremove("y.text")
3 Part 2: Pancreas
3.1 Read the data
# Load the Pancreas dataset
<- readRDS(
Pancreas paste("/Users/jacenai/Desktop/STAT_M254/",
"Final_Project/Datasets_final/",
"Pancreas.rds",
sep = "")) |>
as.data.frame()
3.2 Create Seurat and QC
# Create the Pancreas Seurat object
<- Seurat::CreateSeuratObject(
Pancreas counts = Pancreas,
project = "Pancreas",
assay = "DNA",
min.cells = 10,
min.features = 200) |>
suppressWarnings()
# Find mitochondrial genes for Pancreas
"percent.mt"]] <- PercentageFeatureSet(
Pancreas[[
Pancreas,pattern = "^MT-") |>
suppressWarnings()
VlnPlot(Pancreas,
features = c("nFeature_DNA",
"nCount_DNA",
"percent.mt"),
ncol = 3) |>
suppressWarnings()
3.3 Transformation & Feature selection & Scaling
I will try log1pCPM
, and SCTransform
to transform the data.
# Transform the pancreas data
# Seurat default log-normalization (log1pCP10k)
<- NormalizeData(
pancreas_log1pCP1k_1
Pancreas,normalization.method = "LogNormalize",
scale.factor = 1000)
# identify the highly variable genes
<- FindVariableFeatures(
pancreas_features_log1pCP1k_1
pancreas_log1pCP1k_1,selection.method = "vst",
nfeatures = 2000) # tune this parameter
# Scale the data
<-
pancreas_scaled_log1pCP1k_1 ScaleData(pancreas_features_log1pCP1k_1,
features =
VariableFeatures(pancreas_features_log1pCP1k_1))
# SCTransform the Pancreas data
<- SCTransform(
pancreas_SCTransform
Pancreas,verbose = FALSE,
variable.features.n = 3000, # tune this parameter
return.only.var.genes = TRUE,
assay = "DNA",
vars.to.regress = c("nCount_DNA", "percent.mt"))
Calculate the variance of transformed data and plot it against the mean.
# Calculate the variance of the transformed data
<-
pancreas_variance_log1pCP1k_1 "DNA"]]@layers$scale.data |>
pancreas_scaled_log1pCP1k_1[[apply(1, var)
<-
pancreas_mean_log1pCP1k_1 "DNA"]]@layers$scale.data |>
pancreas_scaled_log1pCP1k_1[[apply(1, mean)
<-
pancreas_variance_SCTransform "SCT"]]@scale.data |>
pancreas_SCTransform[[apply(1, var)
<-
pancreas_mean_SCTransform "SCT"]]@scale.data |>
pancreas_SCTransform[[apply(1, mean)
Plot the variance against the mean
# Plot the variance against the mean
# Create a data frame for plotting
<-
gene_var_mean_df data.frame(Gene_Variance =
c(pancreas_variance_log1pCP1k_1,
pancreas_variance_SCTransform),Gene_Mean =
c(pancreas_mean_log1pCP1k_1,
pancreas_mean_SCTransform),Transformation =
rep(c("log1pCP1k", "SCTransform"),
c(length(pancreas_variance_log1pCP1k_1),
length(pancreas_variance_SCTransform))))
# Plot the variance against the mean
ggplot(gene_var_mean_df,
aes(x = Gene_Mean,
y = Gene_Variance,
color = Transformation)) +
geom_point(alpha = 0.5) +
facet_wrap(~Transformation,
scales = "free") +
theme_gray() +
labs(title = "Variance vs Mean",
x = "Mean",
y = "Variance") +
theme(legend.position = "none") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5))
I decided to use SCTransform
for the downstream analysis.
3.4 Linear dimensional reduction
# I decided to use SCTransform for the downstream analysis
# Perform PCA on the Pancreas data
<- RunPCA(
Pancreas_pbmc_PCA
pancreas_SCTransform,features = VariableFeatures(pancreas_SCTransform),
npcs = 100,
verbose = FALSE)
ElbowPlot(Pancreas_pbmc_PCA, ndims = 100)
3.5 Get the gene names
Ensembl IDs to gene
if (file.exists("Pancreas_data_IDs.rds")) {
<- readRDS("Pancreas_data_IDs.rds")
Pancreas_data_IDs else {
} <- getBM(
Pancreas_data_IDs attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = 'ensembl_gene_id',
values = rownames(Pancreas),
mart = mrt)
saveRDS(Pancreas_data_IDs, "Pancreas_data_IDs.rds")
}
3.6 Clustering
# Perform neighborhood-based clustering
<- FindNeighbors(
Pancreas_pbmc_neighbor
Pancreas_pbmc_PCA,reduction = "pca",
dims = 1:100,
k.param = 30,
prune.SNN = 1/15,
verbose = FALSE)
<- FindClusters(
Pancreas_pbmc_cluster
Pancreas_pbmc_neighbor,resolution = 0.5 ,
verbose = FALSE)
Run UMAP and t-SNE
# Run UMAP
<- RunUMAP(
Pancreas_pbmc_UMAP
Pancreas_pbmc_cluster,dims = 1:100,
verbose = FALSE)
# try t_SNE
<- RunTSNE(
Pancreas_pbmc_tSNE
Pancreas_pbmc_cluster,dims = 1:90,
verbose = FALSE)
Plot the UMAP and t-SNE
DimPlot(Pancreas_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("Pancreas UMAP")
DimPlot(Pancreas_pbmc_tSNE,
reduction = "tsne",
label = TRUE,
group.by = "seurat_clusters",
alpha = 0.5) +
ggtitle("Pancreas t-SNE")
By comparing the UMAP and t-SNE plots, I decided to use the UMAP.
3.7 Annotation Results
# Find gene markers
if (file.exists("pancreas_gene_markers.rds")) {
<- readRDS("pancreas_gene_markers.rds")
pancreas_gene_markers else {
} <- FindAllMarkers(
pancreas_gene_markers
Pancreas_pbmc_UMAP,min.pct = 0.01,
logfc.threshold = 0.01,
test.use = "wilcox",
verbose = FALSE)
saveRDS(pancreas_gene_markers, "pancreas_gene_markers.rds")
}
# Filter the gene markers
<-
Pancreas_pbmc_markers_top_5 |>
pancreas_gene_markers as_tibble() |>
filter(avg_log2FC > 2) |>
filter(p_val_adj < 0.01) |>
group_by(cluster, p_val_adj) |>
# arrange the gene with the order of
# high in avg_log2FC and low in p_val_adj
arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
mutate(gene_names = map_chr(
~ Pancreas_data_IDs$hgnc_symbol[
gene, match(.x, Pancreas_data_IDs$ensembl_gene_id)])) |>
# select the top 5 genes in each cluster
group_by(cluster) |>
::slice(1:5) |>
dplyr::select(cluster, gene_names, everything())
dplyr
|>
Pancreas_pbmc_markers_top_5 as_tibble() |>
::kable() knitr
cluster | gene_names | p_val | avg_log2FC | pct.1 | pct.2 | p_val_adj | gene |
---|---|---|---|---|---|---|---|
0 | GATM | 0 | 2.314042 | 1.000 | 0.817 | 0 | ENSG00000171766 |
0 | SYCN | 0 | 2.056717 | 1.000 | 0.969 | 0 | ENSG00000179751 |
0 | AMY2A | 0 | 2.232347 | 0.998 | 0.994 | 0 | ENSG00000243480 |
0 | AQP8 | 0 | 2.280071 | 0.948 | 0.561 | 0 | ENSG00000103375 |
0 | AMY2B | 0 | 2.190193 | 0.993 | 0.883 | 0 | ENSG00000240038 |
1 | REG3G | 0 | 4.080445 | 1.000 | 0.844 | 0 | ENSG00000143954 |
1 | REG1B | 0 | 3.215856 | 1.000 | 1.000 | 0 | ENSG00000172023 |
1 | REG1A | 0 | 2.081271 | 1.000 | 1.000 | 0 | ENSG00000115386 |
1 | SERPINA3 | 0 | 3.717401 | 0.534 | 0.083 | 0 | ENSG00000196136 |
1 | C3 | 0 | 3.439010 | 0.685 | 0.193 | 0 | ENSG00000125730 |
2 | GYPC | 0 | 3.637994 | 0.741 | 0.105 | 0 | ENSG00000136732 |
2 | DAB2 | 0 | 4.486262 | 0.553 | 0.049 | 0 | ENSG00000153071 |
2 | S100A13 | 0 | 2.685883 | 0.962 | 0.452 | 0 | ENSG00000189171 |
2 | AQP1 | 0 | 4.018850 | 0.662 | 0.103 | 0 | ENSG00000240583 |
2 | CAVIN1 | 0 | 2.775235 | 0.700 | 0.100 | 0 | ENSG00000177469 |
3 | HSD11B1-AS1 | 0 | 3.592255 | 0.329 | 0.034 | 0 | ENSG00000227591 |
3 | LINC01610 | 0 | 2.458333 | 0.883 | 0.643 | 0 | ENSG00000237643 |
3 | TAC1 | 0 | 3.921941 | 0.428 | 0.109 | 0 | ENSG00000006128 |
3 | CPA4 | 0 | 2.247372 | 0.266 | 0.062 | 0 | ENSG00000128510 |
3 | ANXA10 | 0 | 2.430645 | 0.302 | 0.089 | 0 | ENSG00000109511 |
4 | RXRG | 0 | 6.361009 | 0.399 | 0.005 | 0 | ENSG00000143171 |
4 | ADCYAP1 | 0 | 5.879420 | 0.454 | 0.014 | 0 | ENSG00000141433 |
4 | PCSK1 | 0 | 4.752696 | 0.475 | 0.018 | 0 | ENSG00000175426 |
4 | TAGLN3 | 0 | 5.974577 | 0.464 | 0.017 | 0 | ENSG00000144834 |
4 | ASB9 | 0 | 6.459432 | 0.492 | 0.024 | 0 | ENSG00000102048 |
5 | MALL | 0 | 5.182514 | 0.982 | 0.118 | 0 | ENSG00000144063 |
5 | ALDH1A3 | 0 | 4.547304 | 0.945 | 0.129 | 0 | ENSG00000184254 |
5 | FLNA | 0 | 4.041860 | 0.982 | 0.158 | 0 | ENSG00000196924 |
5 | S100A14 | 0 | 3.988552 | 0.936 | 0.135 | 0 | ENSG00000189334 |
5 | TSPAN15 | 0 | 3.189382 | 0.955 | 0.157 | 0 | ENSG00000099282 |
6 | IRX2-DT | 0 | 6.099829 | 0.833 | 0.023 | 0 | ENSG00000186493 |
6 | F10 | 0 | 5.597897 | 0.833 | 0.031 | 0 | ENSG00000126218 |
6 | CRH | 0 | 6.225981 | 0.567 | 0.012 | 0 | ENSG00000147571 |
6 | SMIM24 | 0 | 5.272775 | 0.817 | 0.043 | 0 | ENSG00000095932 |
6 | PCSK2 | 0 | 5.318404 | 0.933 | 0.067 | 0 | ENSG00000125851 |
7 | MALAT1 | 0 | 4.024411 | 1.000 | 0.992 | 0 | ENSG00000251562 |
7 | NEAT1 | 0 | 4.437703 | 1.000 | 0.947 | 0 | ENSG00000245532 |
7 | LENG8 | 0 | 2.830771 | 1.000 | 0.752 | 0 | ENSG00000167615 |
7 | RBM25 | 0 | 2.400703 | 0.982 | 0.569 | 0 | ENSG00000119707 |
7 | MEG3 | 0 | 2.917470 | 0.912 | 0.474 | 0 | ENSG00000214548 |
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("Pancreas_pbmc_markers_top_5.xlsx")) {
write_xlsx(Pancreas_pbmc_markers_top_5,
"Pancreas_pbmc_markers_top_5.xlsx")
}
By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the Pancreas
data. I found the following cell types:
Cluster | Cell Type |
---|---|
0 | Acinar Cell |
1 | Acinar Cell |
2 | Ductal Cell |
3 | Acinar Cell |
4 | Beta Cell |
5 | Ductal Cell |
6 | Alpha Cell |
7 | Mixed Immune Cell |
Since I already have the primary cell types for the Pancreas
data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)
And I found the following gene markers for the cell types in the Pancreas
data:
Cell Type | Gene Markers |
---|---|
Acinar Cell | PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4 |
Ductal Cell | KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9 |
Beta Cell | INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA |
Alpha Cell | GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9 |
Mixed Immune Cell | CD3D, CD3E, CD3G, CD4, CD8A, CD8B, CD19, CD20, CD22, CD79A, CD79B, CD34, CD38 |
Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the Pancreas
data.
- Acinar Cell: PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4
<- c("PRSS1", "CELA3A", "CELA3B", "CPA1")
Acinar_cell_markers plot_feature_plots(Pancreas_pbmc_UMAP,
Acinar_cell_markers)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000142789 |CELA3A |
|ENSG00000219073 |CELA3B |
|ENSG00000091704 |CPA1 |
|ENSG00000274247 |PRSS1 |
|ENSG00000204983 |PRSS1 |
$feature_plot
$violin_plot
- Ductal Cell: KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9
<- c("KRT19", "KRT7", "KRT8", "KRT18", "MUC5AC")
Ductal_cell_markers plot_feature_plots(Pancreas_pbmc_UMAP,
Ductal_cell_markers)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000111057 |KRT18 |
|ENSG00000171345 |KRT19 |
|ENSG00000135480 |KRT7 |
|ENSG00000170421 |KRT8 |
|ENSG00000215182 |MUC5AC |
|ENSG00000291363 |MUC5AC |
$feature_plot
$violin_plot
- Beta Cell: INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA
<- c("INS", "GCG", "IAPP", "PCSK1", "PCSK2")
Beta_cell_markers plot_feature_plots(Pancreas_pbmc_UMAP,
Beta_cell_markers)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115263 |GCG |
|ENSG00000121351 |IAPP |
|ENSG00000254647 |INS |
|ENSG00000175426 |PCSK1 |
|ENSG00000125851 |PCSK2 |
$feature_plot
$violin_plot
- Alpha Cell: GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9
<- c("GCG", "ARX", "IRX2", "IRX3")
Alpha_cell_markers plot_feature_plots(Pancreas_pbmc_UMAP,
Alpha_cell_markers)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000004848 |ARX |
|ENSG00000115263 |GCG |
|ENSG00000170561 |IRX2 |
|ENSG00000177508 |IRX3 |
$feature_plot
$violin_plot
- Mixed Immune Cell: MALAT1, NEAT1
<- c("MALAT1", "NEAT1")
Mixed_immune_cell_markers plot_feature_plots(Pancreas_pbmc_UMAP,
Mixed_immune_cell_markers)
$gene_id_names
|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000251562 |MALAT1 |
|ENSG00000245532 |NEAT1 |
$feature_plot
$violin_plot
Annotate the cell types
Annotate the cell types for the Pancreas data based on the gene markers and the feature plots.
# Annotate the cell types for the Pancreas data
# manually rename the seurat_clusters to the cell types
<- Pancreas_pbmc_UMAP@meta.data |>
Pancreas_pbmc_UMAP_new_meta # change the seurat_clusters to cell types
::mutate(cell_type = case_when(
dplyr== 0 ~ "Acinar Cell",
seurat_clusters == 1 ~ "Acinar Cell",
seurat_clusters == 2 ~ "Ductal Cell",
seurat_clusters == 3 ~ "Acinar Cell",
seurat_clusters == 4 ~ "Beta Cell",
seurat_clusters == 5 ~ "Ductal Cell",
seurat_clusters == 6 ~ "Alpha Cell",
seurat_clusters == 7 ~ "Mixed Immune Cell")) |>
seurat_clusters ::mutate(cell_type = as.factor(cell_type))
dplyr
# add the cell types to the UMAP object
<- AddMetaData(
Pancreas_pbmc_UMAP
Pancreas_pbmc_UMAP,metadata = Pancreas_pbmc_UMAP_new_meta,
col.name = "cell_type")
# export the UMAP object to a rds file
saveRDS(Pancreas_pbmc_UMAP, "Pancreas_pbmc_UMAP.rds")
3.7.0.1 Summary Plots
Plot the UMAP with the cell types for the Pancreas
data.
DimPlot(Pancreas_pbmc_UMAP,
reduction = "umap",
label = TRUE,
group.by = "cell_type",
label.size = 4,
repel = TRUE,
alpha = 0.5) +
ggtitle("Pancreas Cell Types") +
NoLegend()
Plot the heatmap
# plot the heatmap
DoHeatmap(
Pancreas_pbmc_UMAP,features = Pancreas_pbmc_markers_top_5$gene,
label = TRUE,
group.by = "cell_type",
size = 4) |>
suppressWarnings() +
NoLegend() +
rremove("y.text")
New marker for alpha cells
FeaturePlot(
Pancreas_pbmc_UMAP,features = c("ENSG00000126218"),
reduction = "umap",
slot = "data")
4 Conclusion
In this final project, I have analyzed the BoneMarrow1
, BoneMarrow2
and Pancreas
data using the Seurat package in R. I have identified 14 cell types in the BoneMarrow1
data, 11 cell types in the BoneMarrow2
data, and 5 cell types in the Pancreas
data. I have also identified the gene markers for each cell type and annotated the cell types based on the gene markers and the feature plots. Finally, I have plotted the heatmap for the top 5 gene markers for each cell type in all three datasets. The results of this analysis can be used to better understand the cell types in the BoneMarrow1
, BoneMarrow2
and Pancreas
data and to further investigate the role of these cell types in health and disease.