STAT M254 Final Project

BoneMarrow and Pancreas Data Analysis

Author

Jiachen Ai, UID: 206182615

Published

June 16, 2024

# Load libraries
library(readr)
library(dplyr)
library(tidyverse)
library(Seurat) |> suppressWarnings()
library(patchwork)
library(ggplot2)
library(sctransform)
library(matrixStats)
library(purrr)
library(BayesTools)
library(VGAM) |> suppressWarnings()
library(knitr) |> suppressWarnings()
library(Matrix)
library(fastglmpca)
library(cowplot)
library(gridExtra)
library(ggpubr)
library(motifcluster)
library(cluster)
library(mclust)
library(tightClust)
library(devtools)
library(biomaRt) |> suppressWarnings()
library(presto)
library(writexl)
# library(SingleR)
set.seed(2024)

1 Introduction

In this project, I will analyze two datasets: BoneMarrow and Pancreas. The BoneMarrow dataset contains two datasets: BoneMarrow1 and BoneMarrow2. The Pancreas dataset contains one dataset: Pancreas. The goal of this project is to identify cell types in the BoneMarrow and Pancreas datasets. I will perform the following analysis:

  • Setup the Seurat Object
  • Standard pre-processing workfow
  • Normalizing the data
  • Identification of highly variable features (feature selection)
  • Scaling the data
  • Perform linear dimensional reduction
  • Determine the ‘dimensionality’ of the dataset
  • Cluster the cells
  • Run non-linear dimensional reduction (UMAP/tSNE)
  • Finding differentially expressed features (cluster biomarkers)
  • Assigning cell type identity to clusters

2 Part 1: BoneMarrow

2.1 Read the data

# Load the BoneMarrow dataset
BoneMarrow_data1 <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "BoneMarrow_dataset1.rds",
        sep = "")) |>
  as.data.frame()

BoneMarrow_data2 <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "BoneMarrow_dataset2.rds",
        sep = "")) |>
  as.data.frame()

2.2 Create Seurat and QC

# Create the BoneMarrow1 Seurat object
BoneMarrow1 <- Seurat::CreateSeuratObject(
  counts = BoneMarrow_data1,
  project = "BoneMarrow1",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for BoneMarrow1
BoneMarrow1[["percent.mt"]] <- PercentageFeatureSet(
  BoneMarrow1,
  pattern = "^MT-") |>
  suppressWarnings()

# Create the BoneMarrow2 Seurat object
BoneMarrow2 <- Seurat::CreateSeuratObject(
  counts = BoneMarrow_data2,
  project = "BoneMarrow2",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for BoneMarrow2
BoneMarrow2[["percent.mt"]] <- PercentageFeatureSet(
  BoneMarrow2,
  pattern = "^MT-") |>
  suppressWarnings()

VlnPlot(BoneMarrow1,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

VlnPlot(BoneMarrow2,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

2.3 Transformation & Feature selection & Scaling

I will try log1pCPM, and SCTransform to transform the data and compare the results.

# Transform the BoneMarrow1 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_1 <- NormalizeData(
  BoneMarrow1,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pbmc_features_log1pCP1k_1 <- FindVariableFeatures(
  pbmc_log1pCP1k_1,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pbmc_scaled_log1pCP1k_1 <- 
  ScaleData(pbmc_features_log1pCP1k_1, 
            features = 
              VariableFeatures(pbmc_features_log1pCP1k_1))


# Transform the BoneMarrow2 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_2 <- NormalizeData(
  BoneMarrow2,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pbmc_features_log1pCP1k_2 <- FindVariableFeatures(
  pbmc_log1pCP1k_2,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pbmc_scaled_log1pCP1k_2 <- 
  ScaleData(pbmc_features_log1pCP1k_2, 
            features = 
              VariableFeatures(pbmc_features_log1pCP1k_2))

Try SCTransform to transform the data.

# SCTransform
# SCTransform the BoneMarrow1 data
BoneMarrow1_SCTransform <- SCTransform(
  BoneMarrow1,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

# SCTransform the BoneMarrow2 data
BoneMarrow2_SCTransform <- SCTransform(
  BoneMarrow2,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

Calculate the variance of transformed data and plot it against the mean.

# Calculate the gene variance of transformed data 
# and plot it against the gene mean

# BoneMarrow1 Log1pCPM
gene_var_log1pCPM_1 <- 
  pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, var)

gene_mean_log1pCPM_1 <- 
  pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, mean)

# BoneMarrow2 Log1pCPM
gene_var_log1pCPM_2 <- 
  pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
  apply(1, var)

gene_mean_log1pCPM_2 <- 
  pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
  apply(1, mean)


# BoneMarrow1 SCTransform
gene_var_SCTransform_1 <- 
  BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

gene_mean_SCTransform_1 <- 
  BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

# BoneMarrow2 SCTransform
gene_var_SCTransform_2 <- 
  BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

gene_mean_SCTransform_2 <- 
  BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

# Plot the gene variance against the gene mean
  
# Create a data frame for plotting
gene_var_mean_df <-
  data.frame(Gene_Variance =
               c(gene_var_log1pCPM_1,
                 gene_var_log1pCPM_2,
                 gene_var_SCTransform_1,
                 gene_var_SCTransform_2),
             Gene_Mean =
               c(gene_mean_log1pCPM_1,
                 gene_mean_log1pCPM_2,
                 gene_mean_SCTransform_1,
                 gene_mean_SCTransform_2),
             Method =
               c(rep("Log1pCPM", 
                     length(gene_var_log1pCPM_1) + 
                       length(gene_var_log1pCPM_2)),
                 rep("SCTransform", 
                     length(gene_var_SCTransform_1) + 
                       length(gene_var_SCTransform_2))))    

# Plot the gene means vs. different variances
ggplot(gene_var_mean_df, aes(x = Gene_Mean, 
                             y = Gene_Variance, 
                             color = Method)) +
  geom_point(alpha = 0.3) +
  theme_grey() +
  facet_wrap(~ Method, scales = "free") +
  labs(title = "Gene Variance vs. Gene Mean",
       x = "Gene Mean",
       y = "Gene Variance") +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

# Calculate the Spearman correlation
correlation_log1pCPM_1 <- 
  cor(gene_var_log1pCPM_1, gene_mean_log1pCPM_1, 
      method = "spearman")

correlation_log1pCPM_2 <- 
  cor(gene_var_log1pCPM_2, gene_mean_log1pCPM_2,
      method = "spearman")

correlation_SCTransform_1 <- 
  cor(gene_var_SCTransform_1, gene_mean_SCTransform_1,
      method = "spearman")

correlation_SCTransform_2 <- 
  cor(gene_var_SCTransform_2, gene_mean_SCTransform_2,
      method = "spearman")

# print the Spearman correlation results
cbind(Method = c("Log1pCPM", "Log1pCPM", 
                 "SCTransform", "SCTransform"),
      Dataset = c("BoneMarrow1", "BoneMarrow2", 
                  "BoneMarrow1", "BoneMarrow2"),
      Correlation = c(correlation_log1pCPM_1, 
                      correlation_log1pCPM_2,
                      correlation_SCTransform_1, 
                      correlation_SCTransform_2)) |>
  knitr::kable()
Method Dataset Correlation
Log1pCPM BoneMarrow1 0.974603251887739
Log1pCPM BoneMarrow2 0.961849201395136
SCTransform BoneMarrow1 -0.01872351035011
SCTransform BoneMarrow2 0.00471352040124233

It can be seen from the resutls that the SCTransform method has a lower correlation between gene variance and gene mean compared to the Log1pCPM method. This indicates that the SCTransform method is better at normalizing the data. Therefore, I will use the SCTransform method for downstream analysis.

2.4 Linear dimensional reduction

# perform PCA
BoneMarrow1_pbmc_PCA <- RunPCA(
  BoneMarrow1_SCTransform,
  features = VariableFeatures(BoneMarrow1_SCTransform),
  npcs = 100, # tune this parameter
  verbose = FALSE)
ElbowPlot(BoneMarrow1_pbmc_PCA, ndims = 100)

BoneMarrow2_pbmc_PCA <- RunPCA(
  BoneMarrow2_SCTransform,
  features = VariableFeatures(BoneMarrow2_SCTransform),
  npcs = 100, # tune this parameter
  verbose = FALSE)
ElbowPlot(BoneMarrow2_pbmc_PCA, ndims = 100)

2.5 Get the gene names

Ensembl IDs to gene

mrt = useMart(biomart = "ensembl",
              dataset = "hsapiens_gene_ensembl")

if (file.exists("BoneMarrow_data1_IDs.rds")) {
  BoneMarrow_data1_IDs <- readRDS("BoneMarrow_data1_IDs.rds")
} else {
  BoneMarrow_data1_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(BoneMarrow_data1),
    mart = mrt)
  saveRDS(BoneMarrow_data1_IDs, "BoneMarrow_data1_IDs.rds")
}

if (file.exists("BoneMarrow_data2_IDs.rds")) {
  BoneMarrow_data2_IDs <- readRDS("BoneMarrow_data2_IDs.rds")
} else {
  BoneMarrow_data2_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(BoneMarrow_data2),
    mart = mrt)
  saveRDS(BoneMarrow_data2_IDs, "BoneMarrow_data2_IDs.rds")
}

2.6 Clustering

Perform neighborhood-based clustering (I tried different parameters and the ones below gave the best results).

# Perform neighborhood-based clustering
BoneMarrow1_pbmc_neighbor <- FindNeighbors(
  BoneMarrow1_pbmc_PCA,
  reduction = "pca",
  dims = 1:100, # tune this parameter
  k.param = 30, # tune this parameter
  prune.SNN = 1/10, # tune this parameter
  verbose = FALSE)

BoneMarrow1_pbmc_cluster <- FindClusters(
  BoneMarrow1_pbmc_neighbor,
  resolution = 1.8, # tune this parameter
  verbose = FALSE)

BoneMarrow2_pbmc_neighbor <- FindNeighbors(
  BoneMarrow2_pbmc_PCA,
  reduction = "pca",
  dims = 1:100, # tune this parameter
  k.param = 40, # tune this parameter
  prune.SNN = 1/10, # tune this parameter
  verbose = FALSE)

BoneMarrow2_pbmc_cluster <- FindClusters(
  BoneMarrow2_pbmc_neighbor,
  resolution = 0.8, # tune this parameter
  verbose = FALSE)

Run UMAP and t-SNE

# Run UMAP
BoneMarrow1_pbmc_UMAP <- RunUMAP(
  BoneMarrow1_pbmc_cluster,
  min.dist = 0.3,
  n.neighbors = 30,
  dims = 1:79,
  metric = "cosine",
  verbose = FALSE) |>
  suppressWarnings()

BoneMarrow2_pbmc_UMAP <- RunUMAP(
  BoneMarrow2_pbmc_cluster,
  min.dist = 0.3,
  n.neighbors = 30,
  dims = 1:87,
  metric = "cosine",
  verbose = FALSE)

# try t_SNE
BoneMarrow1_pbmc_tSNE <- RunTSNE(
  BoneMarrow1_pbmc_cluster,
  dims = 1:50,
  verbose = FALSE)

BoneMarrow2_pbmc_tSNE <- RunTSNE(
  BoneMarrow2_pbmc_cluster,
  dims = 1:98,
  verbose = FALSE)

Plot the UMAP

DimPlot(BoneMarrow1_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow1 UMAP")

DimPlot(BoneMarrow2_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow2 UMAP")

Plot the t-SNE

DimPlot(BoneMarrow1_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow1 t-SNE")

DimPlot(BoneMarrow2_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow2 t-SNE")

By comparing the UMAP and t-SNE plots, I decided to use the UMAP for both datasets.

2.7 Annotation Results

2.7.1 BoneMarrow1 data

# Find gene markers for the BoneMarrow1 data

# use the UMAP for the BoneMarrow1 data to find the gene markers

if (file.exists("BoneMarrow1_pbmc_markers.rds")) {
  BoneMarrow1_pbmc_markers <- readRDS("BoneMarrow1_pbmc_markers.rds")
} else {
  BoneMarrow1_pbmc_markers <- 
    FindAllMarkers(
      BoneMarrow1_pbmc_UMAP,
      min.pct = 0.01,
      logfc.threshold = 0.01,
      test.use = "wilcox",
      verbose = FALSE)
  saveRDS(BoneMarrow1_pbmc_markers, "BoneMarrow1_pbmc_markers.rds")
}

# select the top 5 highly expressed genes in each cluster
# to roughly identify the cell types
BoneMarrow1_pbmc_markers_top_5 <-
BoneMarrow1_pbmc_markers |>
  as_tibble() |>
  filter(avg_log2FC > 4) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ BoneMarrow_data1_IDs$hgnc_symbol[
      match(.x, BoneMarrow_data1_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow1_pbmc_markers_top_5.xlsx")) {
  write_xlsx(BoneMarrow1_pbmc_markers_top_5, 
             "BoneMarrow1_pbmc_markers_top_5.xlsx")
}

BoneMarrow1_pbmc_markers_top_5 |>
  knitr::kable()
cluster gene_names p_val avg_log2FC pct.1 pct.2 p_val_adj gene
0 CCR4 0 4.874914 0.382 0.014 0.0000000 ENSG00000183813
0 TTC39C-AS1 0 4.401808 0.091 0.004 0.0000000 ENSG00000264745
0 WNT7A 0 4.294893 0.087 0.004 0.0000000 ENSG00000154764
0 FOXP3 0 4.169362 0.044 0.002 0.0000000 ENSG00000049768
1 MAFB 0 4.238447 0.972 0.106 0.0000000 ENSG00000204103
1 THBS1 0 4.626654 0.816 0.069 0.0000000 ENSG00000137801
1 PID1 0 4.263354 0.624 0.039 0.0000000 ENSG00000153823
1 CYP1B1 0 5.438358 0.543 0.025 0.0000000 ENSG00000138061
1 CD300E 0 4.320640 0.571 0.030 0.0000000 ENSG00000186407
2 KLRC2 0 4.734576 0.829 0.053 0.0000000 ENSG00000205809
2 TKTL1 0 5.867000 0.566 0.017 0.0000000 ENSG00000007350
2 KIR3DL2 0 4.176698 0.483 0.028 0.0000000 ENSG00000240403
2 KLRC3 0 4.646220 0.415 0.021 0.0000000 ENSG00000205810
2 LDB2 0 4.309103 0.102 0.005 0.0000000 ENSG00000169744
3 AZU1 0 7.113508 0.741 0.034 0.0000000 ENSG00000172232
3 MPO 0 7.051123 0.615 0.030 0.0000000 ENSG00000005381
3 ELANE 0 7.931680 0.498 0.014 0.0000000 ENSG00000197561
3 RNASE2 0 4.601070 0.678 0.054 0.0000000 ENSG00000169385
3 CTSG 0 7.272826 0.390 0.007 0.0000000 ENSG00000100448
4 TNFRSF9 0 5.339711 0.261 0.012 0.0000000 ENSG00000049249
4 CCL3-AS1 0 4.315129 0.227 0.017 0.0000000 ENSG00000277089
4 CXCR6 0 4.409252 0.177 0.010 0.0000000 ENSG00000172215
4 DKK3 0 4.144354 0.148 0.010 0.0000000 ENSG00000050165
4 VCAM1 0 6.842211 0.025 0.000 0.0000000 ENSG00000162692
5 MS4A1 0 7.384164 0.866 0.015 0.0000000 ENSG00000156738
5 CD79A 0 7.144223 0.985 0.034 0.0000000 ENSG00000105369
5 LINC00926 0 7.117973 0.738 0.012 0.0000000 ENSG00000247982
5 BANK1 0 6.714238 0.713 0.019 0.0000000 ENSG00000153064
5 FCER2 0 7.537949 0.554 0.006 0.0000000 ENSG00000104921
6 MYOM2 0 5.380428 0.885 0.050 0.0000000 ENSG00000036448
6 FGFBP2 0 4.273741 0.935 0.093 0.0000000 ENSG00000137441
6 SH2D1B 0 4.317394 0.620 0.037 0.0000000 ENSG00000198574
6 SPON2 0 4.029231 0.975 0.208 0.0000000 ENSG00000159674
6 LINC00299 0 5.023763 0.370 0.016 0.0000000 ENSG00000236790
7 FSBP 0 5.568025 0.012 0.000 0.0002713 ENSG00000265817
7 CELA1 0 5.568025 0.012 0.000 0.0002713 ENSG00000139610
8 CCL3L3 0 4.485081 0.781 0.057 0.0000000 ENSG00000276085
8 KCNE1 0 4.955907 0.323 0.014 0.0000000 ENSG00000180509
8 TNFAIP6 0 4.494003 0.303 0.022 0.0000000 ENSG00000123610
8 0 4.503048 0.181 0.008 0.0000000 ENSG00000287553
8 CXCL1 0 4.981095 0.168 0.008 0.0000000 ENSG00000163739
9 TRAV14DV4 0 4.364786 0.263 0.014 0.0000000 ENSG00000211792
9 TRDV2 0 6.154833 0.180 0.005 0.0000000 ENSG00000211821
9 ZNF683 0 4.564768 0.218 0.010 0.0000000 ENSG00000176083
9 TRBV5-1 0 5.172667 0.241 0.014 0.0000000 ENSG00000211734
9 TRGV2 0 4.735881 0.158 0.009 0.0000000 ENSG00000233306
10 SLC4A10 0 7.069240 0.250 0.002 0.0000000 ENSG00000144290
10 TRAV1-2 0 4.894153 0.189 0.006 0.0000000 ENSG00000256553
10 LINC01644 0 5.024152 0.144 0.004 0.0000000 ENSG00000218357
10 TRBV6-4 0 5.680197 0.091 0.002 0.0000000 ENSG00000211713
10 KIF5C 0 4.225359 0.091 0.005 0.0000000 ENSG00000168280
11 LINC02812 0 9.882266 0.825 0.001 0.0000000 ENSG00000230138
11 LILRA4 0 8.612708 0.992 0.012 0.0000000 ENSG00000239961
11 PTPRS 0 8.572198 0.775 0.003 0.0000000 ENSG00000105426
11 CLEC4C 0 8.503754 0.850 0.004 0.0000000 ENSG00000198178
11 SCT 0 8.283629 0.592 0.002 0.0000000 ENSG00000070031
12 SPTA1 0 6.572461 0.030 0.000 0.0000000 ENSG00000163554
12 SELP 0 6.350069 0.030 0.000 0.0000000 ENSG00000174175
12 MFSD2B 0 5.765106 0.030 0.000 0.0000000 ENSG00000205639
12 POLQ 0 5.765106 0.030 0.000 0.0000000 ENSG00000051341
12 B3GALNT1 0 5.765106 0.030 0.000 0.0000000 ENSG00000169255
13 CRHBP 0 11.350012 0.762 0.000 0.0000000 ENSG00000145708
13 CD34 0 8.263918 0.845 0.004 0.0000000 ENSG00000174059
13 CYTL1 0 7.493245 0.690 0.007 0.0000000 ENSG00000170891
13 NPR3 0 8.010162 0.560 0.003 0.0000000 ENSG00000113389
13 EGFL7 0 5.907402 0.833 0.019 0.0000000 ENSG00000172889
14 LINC02446 0 5.434920 0.836 0.037 0.0000000 ENSG00000256039
14 CD8B 0 4.262326 0.970 0.076 0.0000000 ENSG00000172116
14 C20orf204 0 4.298116 0.209 0.011 0.0000000 ENSG00000196421
14 PCSK1N 0 4.345422 0.224 0.015 0.0000000 ENSG00000102109
14 CD248 0 7.667350 0.045 0.000 0.0000000 ENSG00000174807
15 CLEC10A 0 7.985711 0.578 0.004 0.0000000 ENSG00000132514
15 ENHO 0 7.668228 0.356 0.002 0.0000000 ENSG00000168913
15 CD1C 0 6.268298 0.578 0.013 0.0000000 ENSG00000158481
15 FCER1A 0 5.012487 0.844 0.040 0.0000000 ENSG00000179639
15 AXL 0 6.400748 0.289 0.004 0.0000000 ENSG00000167601
16 LYPD2 0 13.985578 0.543 0.000 0.0000000 ENSG00000197353
16 UICLM 0 10.106432 0.629 0.000 0.0000000 ENSG00000233392
16 HES4 0 7.714115 0.514 0.003 0.0000000 ENSG00000188290
16 CDKN1C 0 6.806037 0.571 0.007 0.0000000 ENSG00000129757
16 VMO1 0 9.205968 0.257 0.000 0.0000000 ENSG00000182853
17 SDC1 0 10.283611 0.548 0.000 0.0000000 ENSG00000115884
17 TNFRSF17 0 8.133368 0.645 0.003 0.0000000 ENSG00000048462
17 JSRP1 0 10.605539 0.452 0.000 0.0000000 ENSG00000167476
17 IGLV3-1 0 11.477532 0.581 0.005 0.0000000 ENSG00000211673
17 PARM1 0 7.476256 0.452 0.003 0.0000000 ENSG00000169116

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow1 data. I found the following cell types:

Cluster Cell Type
0 Regulatory T-cell
1 Macrophage Cell
2 Natural Killer Cell
3 Neutrophil Precursors
4 CD8 T Cell
5 Naïve B Cell
6 Natural Killer Cell
7 Megakaryocyte Cell
8 Macrophage Cell
9 CD8 T Cell
10 Mucosal-associated Invariant T Cell
11 Plasmacytoid Dendritic Cell
12 Erythroid Cells
13 Hematopoietic Stem Cells
14 CD8 T Cell
15 Dendritic Cell
16 Non-classical Monocyte Cell
17 Plasma Cell

Since I already have the primary cell types for the BoneMarrow1 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the BoneMarrow1 data:

Cell Type Gene Markers
Regulatory T-cell CD25, CD4, FOXP3, CCR4, CCR6
Macrophage Cell ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14
Natural Killer Cell TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Neutrophil Precursors S100A8, S100A9, S100A12, LCN2, LTF
CD8 T Cell CD3, CD5, CD8, CD27, CD28
Naïve B Cell P2RX5, PTPRCAP
Natural Killer Cell TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Megakaryocyte Cell IL6ST
Macrophage Cell ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14
CD8 T Cell CD3, CD5, CD8, CD27, CD28
Mucosal-associated Invariant T Cell TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Plasmacytoid Dendritic Cell BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Erythroid Cells AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO
Hematopoietic Stem Cells CD34, MECOM, EGR1
CD8 T Cell CD8A, CD8B, CD3D, CD3E, CD3G, CD4, CD5, CD6, CD7, CD8, CD8, CD8
Dendritic Cell CD1C, CD86
Non-classical Monocyte Cell CD14, CD16, CX3CR1, LYPD2
Plasma Cell MZB1, IGHG1, JCHAIN, SPAG4, TGM5

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow1 data.

  • Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
# define a function to plot feature plots for the classic gene markers
plot_feature_plots <- function(cell_type, classic_gene_markers) {
  # map the gene names back to the ENSEMBL IDs
  classic_gene_maker_names <- c(
    classic_gene_markers)

  Ensembl <- getBM(
    attributes=c("ensembl_gene_id","hgnc_symbol"),
    filters = 'hgnc_symbol', values = classic_gene_maker_names,
    mart = mrt)
  
  feature_plot <- FeaturePlot(
    cell_type,
    features = Ensembl$ensembl_gene_id,
    pt.size = 0.1) |>
    suppressWarnings()
  
  violin_plot <- VlnPlot(
    cell_type,
    features = Ensembl$ensembl_gene_id,
    pt.size = 0.1,
    ncol = 2) |>
    suppressWarnings()

  gene_id_names <- Ensembl |>
    # pivot_wider(names_from = hgnc_symbol, 
    #             values_from = ensembl_gene_id) |>
    kable() |>
    suppressWarnings()
  
return(list(
    gene_id_names = gene_id_names,
    feature_plot = feature_plot,
    violin_plot = violin_plot
  ))
}

# map the gene names back to the ENSEMBL IDs
Regulatory_T_classic_gene_maker_names <- c(
  "CD25", "CD4", "FOXP3", "CCR4", "CCR6")

plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Regulatory_T_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4        |
|ENSG00000112486 |CCR6        |
|ENSG00000010610 |CD4         |
|ENSG00000049768 |FOXP3       |

$feature_plot


$violin_plot

  • Macrophage Cell: ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206
Macrophage_Cell_classic_gene_maker_names <- c(
  "ARG", "CCL2", "CD163", "CD206", "FIZZ1", "IL-10", "MRC1", "CD163", "CD206", "CD14")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Macrophage_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000108691 |CCL2        |
|ENSG00000170458 |CD14        |
|ENSG00000177575 |CD163       |
|ENSG00000260314 |MRC1        |

$feature_plot


$violin_plot

  • NK Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
NK_Cell_classic_gene_maker_names <- c(
  "TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   NK_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY        |
|ENSG00000145649 |GZMA        |
|ENSG00000171476 |HOPX        |
|ENSG00000150045 |KLRF1       |
|ENSG00000275521 |NCR1        |
|ENSG00000278025 |NCR1        |
|ENSG00000277442 |NCR1        |
|ENSG00000277334 |NCR1        |
|ENSG00000276450 |NCR1        |
|ENSG00000277629 |NCR1        |
|ENSG00000273506 |NCR1        |
|ENSG00000275637 |NCR1        |
|ENSG00000277824 |NCR1        |
|ENSG00000275822 |NCR1        |
|ENSG00000274053 |NCR1        |
|ENSG00000278362 |NCR1        |
|ENSG00000273916 |NCR1        |
|ENSG00000273535 |NCR1        |
|ENSG00000275156 |NCR1        |
|ENSG00000189430 |NCR1        |
|ENSG00000211829 |TRDC        |

$feature_plot


$violin_plot

  • Neutrophil Precursors: S100A8, S100A9, S100A12, LCN2, LTF
Neutrophil_Precursors_classic_gene_maker_names <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Neutrophil_Precursors_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2        |
|ENSG00000012223 |LTF         |
|ENSG00000163221 |S100A12     |
|ENSG00000143546 |S100A8      |
|ENSG00000163220 |S100A9      |

$feature_plot


$violin_plot

  • CD8 T Cell: CD3, CD5, CD8, CD27, CD28
CD8_T_Cell_classic_gene_maker_names <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   CD8_T_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27        |
|ENSG00000178562 |CD28        |
|ENSG00000110448 |CD5         |

$feature_plot


$violin_plot

  • Naïve B Cell: P2RX5, PTPRCAP
Naive_B_Cell_classic_gene_maker_names <- c("P2RX5", "PTPRCAP")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Naive_B_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000083454 |P2RX5       |
|ENSG00000213402 |PTPRCAP     |

$feature_plot


$violin_plot

  • Megakaryocyte Cell: IL6ST
Megakaryocyte_Cell_classic_gene_maker_names <- c("IL6ST")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Megakaryocyte_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000134352 |IL6ST       |

$feature_plot


$violin_plot

  • Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10     |
|ENSG00000211877 |TRAJ12      |
|ENSG00000211869 |TRAJ20      |
|ENSG00000211856 |TRAJ33      |
|ENSG00000256553 |TRAV1-2     |

$feature_plot


$violin_plot

  • Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1,CD40, CD80, CD86,
Plasmacytoid_Dendritic_Cell_classic_gene_maker_names <- c(
  "BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Plasmacytoid_Dendritic_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C      |
|ENSG00000099250 |NRP1        |

$feature_plot


$violin_plot

  • Erythroid Cell: AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO
Erythroid_Cell_classic_gene_maker_names <- c("AHSP", "HBA-A1", "HBA-A2", "HIST1H2AO", "STFA1","ALAD", "PNPO")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Erythroid_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000169877 |AHSP        |
|ENSG00000148218 |ALAD        |
|ENSG00000108439 |PNPO        |

$feature_plot


$violin_plot

  • Hematopoietic Stem Cell: CD34, MECOM, EGR1
Hematopoietic_Stem_Cell_classic_gene_maker_names <- c("CD34", "MECOM", "EGR1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Hematopoietic_Stem_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000174059 |CD34        |
|ENSG00000120738 |EGR1        |
|ENSG00000085276 |MECOM       |

$feature_plot


$violin_plot

  • Dendritic Cell: CD1C, CD86
Dendritic_Cell_classic_gene_maker_names <- c(
  "CD1C", "CD86")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Dendritic_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C        |
|ENSG00000114013 |CD86        |

$feature_plot


$violin_plot

  • Non-classical Monocyte Cell: CD14, CD16, CX3CR1, LYPD2
Non_classical_Monocyte_Cell_classic_gene_maker_names <- c(
  "CD14", "CD16", "LYPD2")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Non_classical_Monocyte_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000170458 |CD14        |
|ENSG00000197353 |LYPD2       |

$feature_plot


$violin_plot

  • Plasma Cell: MZB1, IGHG1, JCHAIN, SPAG4, TGM5
Plasma_Cell_classic_gene_maker_names <- c("MZB1", "IGHG1", "JCHAIN", "SPAG4", "TGM5")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Plasma_Cell_classic_gene_maker_names)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000277633 |IGHG1       |
|ENSG00000211896 |IGHG1       |
|ENSG00000132465 |JCHAIN      |
|ENSG00000170476 |MZB1        |
|ENSG00000061656 |SPAG4       |
|ENSG00000104055 |TGM5        |

$feature_plot


$violin_plot

Annotate the cell types for the BoneMarrow1 data

# Annotate the cell types for the BoneMarrow1 data
# manually rename the seurat_clusters to the cell types
BoneMarrow1_pbmc_UMAP_new_meta <- BoneMarrow1_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Regulatory T-cell",
    seurat_clusters == 1 ~ "Macrophage Cell",
    seurat_clusters == 2 ~ "Natural Killer Cell",
    seurat_clusters == 3 ~ "Neutrophil Precursors",
    seurat_clusters == 4 ~ "CD8 T cell",
    seurat_clusters == 5 ~ "Naïve B cell",
    seurat_clusters == 6 ~ "Natural Killer Cell",
    seurat_clusters == 7 ~ "Megakaryocyte Cell",
    seurat_clusters == 8 ~ "Macrophage Cell",
    seurat_clusters == 9 ~ "CD8 T cell",
    seurat_clusters == 10 ~ "Mucosal-associated Invariant T Cell",
    seurat_clusters == 11 ~ "Plasmacytoid Dendritic Cell",
    seurat_clusters == 12 ~ "Erythroid Cells",
    seurat_clusters == 13 ~ "Hematopoietic Stem Cells",
    seurat_clusters == 14 ~ "CD8 T cell",
    seurat_clusters == 15 ~ "Dendritic Cell",
    seurat_clusters == 16 ~ "Non-classical Monocyte Cell",
    seurat_clusters == 17 ~ "Plasma Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))

# add the cell types to the UMAP object
BoneMarrow1_pbmc_UMAP <- AddMetaData(
  BoneMarrow1_pbmc_UMAP,
  metadata = BoneMarrow1_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to the RDS file
saveRDS(BoneMarrow1_pbmc_UMAP, "BoneMarrow1_pbmc_UMAP.rds")

2.7.1.1 Summary Plots

Plot the UMAP with the cell types for the BoneMarrow1 data

DimPlot(BoneMarrow1_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 3,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("BoneMarrow1 Cell Types") +
  NoLegend()

Plot the heatmap

DoHeatmap(
  BoneMarrow1_pbmc_UMAP,
  features = BoneMarrow1_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  size = 4) |>
  suppressWarnings() +
  NoLegend() +
  ggtitle("BoneMarrow1 Heatmap") +
  rremove("y.text") 

2.7.2 BoneMarrow2 data

# I decided to use the UMAP for the BoneMarrow2 data
# to find the gene markers

if (file.exists("BoneMarrow2_pbmc_markers.rds")) {
  BoneMarrow2_pbmc_markers <- readRDS("BoneMarrow2_pbmc_markers.rds")
} else {
  BoneMarrow2_pbmc_markers <- 
    FindAllMarkers(
      BoneMarrow2_pbmc_UMAP,
      min.pct = 0.01,
      logfc.threshold = 0.01,
      test.use = "wilcox",
      verbose = FALSE)
  saveRDS(BoneMarrow2_pbmc_markers, "BoneMarrow2_pbmc_markers.rds")
}

BoneMarrow2_pbmc_markers_top_5 <-
BoneMarrow2_pbmc_markers |>
  as_tibble() |>
  filter(avg_log2FC > 4) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ BoneMarrow_data2_IDs$hgnc_symbol[
      match(.x, BoneMarrow_data2_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow2_pbmc_markers_top_5.xlsx")) {
  write_xlsx(BoneMarrow2_pbmc_markers_top_5, 
             "BoneMarrow2_pbmc_markers_top_5.xlsx")
}

BoneMarrow2_pbmc_markers_top_5 |>
  knitr::kable()
cluster gene_names p_val avg_log2FC pct.1 pct.2 p_val_adj gene
0 S100A12 0 8.700578 0.897 0.011 0e+00 ENSG00000163221
0 S100A8 0 7.696677 0.993 0.093 0e+00 ENSG00000143546
0 S100A9 0 7.390087 0.998 0.138 0e+00 ENSG00000163220
0 CD14 0 7.300052 0.676 0.009 0e+00 ENSG00000170458
0 CLEC4E 0 7.265346 0.657 0.007 0e+00 ENSG00000166523
1 CCR4 0 5.170545 0.306 0.008 0e+00 ENSG00000183813
1 TTC39C-AS1 0 4.442917 0.097 0.004 0e+00 ENSG00000264745
1 FOXP3 0 4.069459 0.062 0.004 0e+00 ENSG00000049768
1 PI16 0 5.528891 0.039 0.000 0e+00 ENSG00000164530
1 NEFL 0 4.593021 0.043 0.001 0e+00 ENSG00000277586
2 KLRC2 0 4.488809 0.642 0.057 0e+00 ENSG00000205809
2 TKTL1 0 5.667839 0.362 0.017 0e+00 ENSG00000007350
2 KLRC3 0 4.342381 0.345 0.020 0e+00 ENSG00000205810
2 IL5RA 0 4.732259 0.072 0.003 0e+00 ENSG00000091181
3 MYOM2 0 4.906103 0.886 0.055 0e+00 ENSG00000036448
3 FGFBP2 0 4.041734 0.905 0.100 0e+00 ENSG00000137441
3 SH2D1B 0 4.343918 0.649 0.045 0e+00 ENSG00000198574
3 LINC00299 0 4.531001 0.431 0.025 0e+00 ENSG00000236790
3 PRSS23 0 4.230768 0.455 0.034 0e+00 ENSG00000150687
4 TNFRSF9 0 4.412879 0.325 0.018 0e+00 ENSG00000049249
4 ASTL 0 4.889529 0.067 0.002 0e+00 ENSG00000188886
4 VCAM1 0 6.889529 0.029 0.000 0e+00 ENSG00000162692
4 ZNF80 0 5.889529 0.019 0.000 1e-07 ENSG00000174255
5 LRRN3 0 4.684415 0.187 0.006 0e+00 ENSG00000173114
6 SLC4A10 0 7.301805 0.395 0.003 0e+00 ENSG00000144290
6 TRAV1-2 0 4.486837 0.238 0.009 0e+00 ENSG00000256553
6 LINC01644 0 5.420450 0.195 0.004 0e+00 ENSG00000218357
6 TSPAN15 0 4.319363 0.157 0.008 0e+00 ENSG00000099282
6 IL23R 0 5.216916 0.114 0.003 0e+00 ENSG00000162594
7 LILRA4 0 9.930713 0.975 0.005 0e+00 ENSG00000239961
7 LINC02812 0 9.230507 0.787 0.002 0e+00 ENSG00000230138
7 CLEC4C 0 8.741929 0.861 0.003 0e+00 ENSG00000198178
7 LRRC26 0 8.675292 0.672 0.003 0e+00 ENSG00000184709
7 PROC 0 8.369202 0.787 0.004 0e+00 ENSG00000115718
8 CD1C 0 6.120903 0.286 0.010 0e+00 ENSG00000158481
8 CAVIN2 0 5.834204 0.321 0.019 0e+00 ENSG00000168497
8 CLEC10A 0 5.894971 0.188 0.003 0e+00 ENSG00000132514
8 ENHO 0 6.330357 0.152 0.002 0e+00 ENSG00000168913
8 C19orf33 0 7.107964 0.134 0.001 0e+00 ENSG00000167644
9 CD79A 0 8.333834 0.958 0.021 0e+00 ENSG00000105369
9 MS4A1 0 7.611297 0.832 0.015 0e+00 ENSG00000156738
9 CD19 0 8.874331 0.568 0.002 0e+00 ENSG00000177455
9 FCER2 0 7.647739 0.653 0.006 0e+00 ENSG00000104921
9 FCRL1 0 7.857457 0.568 0.003 0e+00 ENSG00000163534
10 CDKN1C 0 7.368670 0.603 0.004 0e+00 ENSG00000129757
10 CKB 0 8.221113 0.562 0.003 0e+00 ENSG00000166165
10 LYPD2 0 13.087031 0.438 0.000 0e+00 ENSG00000197353
10 VMO1 0 9.435237 0.438 0.001 0e+00 ENSG00000182853
10 UICLM 0 8.747181 0.438 0.001 0e+00 ENSG00000233392
11 PRTN3 0 9.439529 0.952 0.019 0e+00 ENSG00000196415
11 C1QTNF4 0 9.376598 0.629 0.003 0e+00 ENSG00000172247
11 SMIM24 0 7.842478 0.710 0.006 0e+00 ENSG00000095932
11 CTSG 0 7.899165 0.790 0.010 0e+00 ENSG00000100448
11 MS4A3 0 8.151139 0.581 0.002 0e+00 ENSG00000149516
12 SPTSSB 0 6.160634 0.500 0.010 0e+00 ENSG00000196542
12 XCL1 0 4.973271 0.871 0.051 0e+00 ENSG00000143184
12 KLRC1 0 4.814189 0.613 0.035 0e+00 ENSG00000134545
12 TNFRSF11A 0 5.751829 0.194 0.004 0e+00 ENSG00000141655
12 ADGRG3 0 4.967806 0.226 0.008 0e+00 ENSG00000182885

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow2 data. I found the following cell types:

Cluster Cell Type
0 Neutrophil
1 Regulatory T-cell
2 Natural Killer Cell
3 CD16 Natural Killer Cell
4 CD8 T cell
5 Regulatory T-cell
6 Mucosal-associated Invariant
7 Plasmacytoid Dendritic Cell
8 Dendritic Cell
9 Naïve B cell
10 Non-classical Monocyte Cell
11 Neutrophil Granulocytes
12 Natural Killer Cell

Since I already have the primary cell types for the BoneMarrow2 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the BoneMarrow2 data:

Cell Type Gene Markers
Neutrophil S100A8, S100A9, S100A12, LCN2, LTF
Regulatory T-cell CD25, CD4, FOXP3, CCR4, CCR6
Natural Killer Cell TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
CD16 Natural Killer Cell MYOM2, FGFBP2, SH2D1B
CD8 T cell CD3, CD5, CD8, CD27, CD28
Mucosal-associated Invariant T Cell TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Plasmacytoid Dendritic Cell BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Dendritic Cell CD1C, CD86
Naïve B cell MS4A1, CD79A, P2RX5, PTPRCAP
Non-classical Monocyte Cell CD14, CD16, CX3CR1, LYPD2
Neutrophil Granulocytes S100A8, S100A9, S100A12, LCN2, LTF

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow2 data.

  • Neutrophil: S100A8, S100A9, S100A12, LCN2, LTF
Neutrophil_markers_bone_marrow2 <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Neutrophil_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2        |
|ENSG00000012223 |LTF         |
|ENSG00000163221 |S100A12     |
|ENSG00000143546 |S100A8      |
|ENSG00000163220 |S100A9      |

$feature_plot


$violin_plot

  • Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6
Regulatory_T_cell_markers_bone_marrow2 <- c("CD25", "CD4", "FOXP3", "CCR4", "CCR6")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Regulatory_T_cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4        |
|ENSG00000112486 |CCR6        |
|ENSG00000010610 |CD4         |
|ENSG00000049768 |FOXP3       |

$feature_plot


$violin_plot

  • Natural Killer Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Natural_Killer_Cell_markers_bone_marrow2 <- c("TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Natural_Killer_Cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY        |
|ENSG00000145649 |GZMA        |
|ENSG00000171476 |HOPX        |
|ENSG00000150045 |KLRF1       |
|ENSG00000275521 |NCR1        |
|ENSG00000278025 |NCR1        |
|ENSG00000277442 |NCR1        |
|ENSG00000277334 |NCR1        |
|ENSG00000276450 |NCR1        |
|ENSG00000277629 |NCR1        |
|ENSG00000273506 |NCR1        |
|ENSG00000275637 |NCR1        |
|ENSG00000277824 |NCR1        |
|ENSG00000275822 |NCR1        |
|ENSG00000274053 |NCR1        |
|ENSG00000278362 |NCR1        |
|ENSG00000273916 |NCR1        |
|ENSG00000273535 |NCR1        |
|ENSG00000275156 |NCR1        |
|ENSG00000189430 |NCR1        |
|ENSG00000211829 |TRDC        |

$feature_plot


$violin_plot

  • CD16 Natural Killer Cell: MYOM2, FGFBP2, SH2D1B
CD16_Natural_Killer_Cell_markers_bone_marrow2 <- c("MYOM2", "FGFBP2", "SH2D1B")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   CD16_Natural_Killer_Cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000137441 |FGFBP2      |
|ENSG00000274137 |MYOM2       |
|ENSG00000036448 |MYOM2       |
|ENSG00000198574 |SH2D1B      |

$feature_plot


$violin_plot

  • CD8 T cell: CD3, CD5, CD8, CD27, CD28
CD8_T_cell_markers_bone_marrow2 <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   CD8_T_cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27        |
|ENSG00000178562 |CD28        |
|ENSG00000110448 |CD5         |

$feature_plot


$violin_plot

  • Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2 <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10     |
|ENSG00000211877 |TRAJ12      |
|ENSG00000211869 |TRAJ20      |
|ENSG00000211856 |TRAJ33      |
|ENSG00000256553 |TRAV1-2     |

$feature_plot


$violin_plot

  • Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Plasmacytoid_Dendritic_Cell_markers_bone_marrow2 <- c("BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Plasmacytoid_Dendritic_Cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C      |
|ENSG00000099250 |NRP1        |

$feature_plot


$violin_plot

  • Dendritic Cell: CD1C, CD86
Dendritic_Cell_markers_bone_marrow2 <- c("CD1C", "CD86")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Dendritic_Cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C        |
|ENSG00000114013 |CD86        |

$feature_plot


$violin_plot

  • Naïve B cell: MS4A1, CD79A, P2RX5
Naive_B_cell_markers_bone_marrow2 <- c("MS4A1", "CD79A", "P2RX5")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Naive_B_cell_markers_bone_marrow2)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000105369 |CD79A       |
|ENSG00000156738 |MS4A1       |
|ENSG00000083454 |P2RX5       |

$feature_plot


$violin_plot

Annotate the cell types for the BoneMarrow2 data

# Annotate the cell types for the BoneMarrow2 data
# manually rename the seurat_clusters to the cell types
BoneMarrow2_pbmc_UMAP_new_meta <- BoneMarrow2_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Neutrophil",
    seurat_clusters == 1 ~ "Regulatory T-cell",
    seurat_clusters == 2 ~ "Natural Killer Cell",
    seurat_clusters == 3 ~ "CD16 Natural Killer Cell",
    seurat_clusters == 4 ~ "CD8 T Cell",
    seurat_clusters == 5 ~ "Regulatory T-cell",
    seurat_clusters == 6 ~ "Mucosal-associated Invariant T Cell",
    seurat_clusters == 7 ~ "Plasmacytoid Dendritic Cell",
    seurat_clusters == 8 ~ "Dendritic Cell",
    seurat_clusters == 9 ~ "Naïve B Cell",
    seurat_clusters == 10 ~ "Non-classical Monocyte Cell",
    seurat_clusters == 11 ~ "Neutrophil Granulocytes",
    seurat_clusters == 12 ~ "Natural Killer Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))
    
# add the cell types to the UMAP object    
BoneMarrow2_pbmc_UMAP <- AddMetaData(
  BoneMarrow2_pbmc_UMAP,
  metadata = BoneMarrow2_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to a rds file
saveRDS(BoneMarrow2_pbmc_UMAP, "BoneMarrow2_pbmc_UMAP.rds")

2.7.2.1 Summary Plots

Plot the UMAP with the cell types for the BoneMarrow2 data

DimPlot(BoneMarrow2_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 3,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("BoneMarrow2 Cell Types") +
  NoLegend()

Plot the heatmap

DoHeatmap(
  BoneMarrow2_pbmc_UMAP,
  features = BoneMarrow2_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  angle = 23,
  size = 4) |> 
  suppressWarnings() +
  NoLegend() +
  ggtitle("BoneMarrow2 Heatmap") +
  rremove("y.text") 

3 Part 2: Pancreas

3.1 Read the data

# Load the Pancreas dataset
Pancreas <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "Pancreas.rds",
        sep = "")) |>
  as.data.frame()

3.2 Create Seurat and QC

# Create the Pancreas Seurat object
Pancreas <- Seurat::CreateSeuratObject(
  counts = Pancreas,
  project = "Pancreas",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for Pancreas
Pancreas[["percent.mt"]] <- PercentageFeatureSet(
  Pancreas,
  pattern = "^MT-") |>
  suppressWarnings()

VlnPlot(Pancreas,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

3.3 Transformation & Feature selection & Scaling

I will try log1pCPM, and SCTransform to transform the data.

# Transform the pancreas data
# Seurat default log-normalization (log1pCP10k)
pancreas_log1pCP1k_1 <- NormalizeData(
  Pancreas,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pancreas_features_log1pCP1k_1 <- FindVariableFeatures(
  pancreas_log1pCP1k_1,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pancreas_scaled_log1pCP1k_1 <- 
  ScaleData(pancreas_features_log1pCP1k_1, 
            features = 
              VariableFeatures(pancreas_features_log1pCP1k_1))


# SCTransform the Pancreas data
pancreas_SCTransform <- SCTransform(
  Pancreas,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

Calculate the variance of transformed data and plot it against the mean.

# Calculate the variance of the transformed data
pancreas_variance_log1pCP1k_1 <- 
  pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, var)

pancreas_mean_log1pCP1k_1 <- 
  pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, mean)

pancreas_variance_SCTransform <- 
  pancreas_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

pancreas_mean_SCTransform <- 
  pancreas_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

Plot the variance against the mean

# Plot the variance against the mean

# Create a data frame for plotting
gene_var_mean_df <-
  data.frame(Gene_Variance =
               c(pancreas_variance_log1pCP1k_1,
                 pancreas_variance_SCTransform),
             Gene_Mean =
               c(pancreas_mean_log1pCP1k_1,
                 pancreas_mean_SCTransform),
             Transformation =
               rep(c("log1pCP1k", "SCTransform"),
                   c(length(pancreas_variance_log1pCP1k_1),
                     length(pancreas_variance_SCTransform))))

# Plot the variance against the mean
ggplot(gene_var_mean_df,
       aes(x = Gene_Mean,
           y = Gene_Variance,
           color = Transformation)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~Transformation,
             scales = "free") +
  theme_gray() +
  labs(title = "Variance vs Mean",
       x = "Mean",
       y = "Variance") +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

I decided to use SCTransform for the downstream analysis.

3.4 Linear dimensional reduction

# I decided to use SCTransform for the downstream analysis
# Perform PCA on the Pancreas data
Pancreas_pbmc_PCA <- RunPCA(
  pancreas_SCTransform,
  features = VariableFeatures(pancreas_SCTransform),
  npcs = 100,
  verbose = FALSE)
ElbowPlot(Pancreas_pbmc_PCA, ndims = 100)

3.5 Get the gene names

Ensembl IDs to gene

if (file.exists("Pancreas_data_IDs.rds")) {
  Pancreas_data_IDs <- readRDS("Pancreas_data_IDs.rds")
} else {
  Pancreas_data_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(Pancreas),
    mart = mrt)
  saveRDS(Pancreas_data_IDs, "Pancreas_data_IDs.rds")
}

3.6 Clustering

# Perform neighborhood-based clustering
Pancreas_pbmc_neighbor <- FindNeighbors(
  Pancreas_pbmc_PCA,
  reduction = "pca",
  dims = 1:100,
  k.param = 30,
  prune.SNN = 1/15,
  verbose = FALSE)

Pancreas_pbmc_cluster <- FindClusters(
  Pancreas_pbmc_neighbor,
  resolution = 0.5 ,
  verbose = FALSE)

Run UMAP and t-SNE

# Run UMAP
Pancreas_pbmc_UMAP <- RunUMAP(
  Pancreas_pbmc_cluster,
  dims = 1:100,
  verbose = FALSE)

# try t_SNE
Pancreas_pbmc_tSNE <- RunTSNE(
  Pancreas_pbmc_cluster,
  dims = 1:90,
  verbose = FALSE)

Plot the UMAP and t-SNE

DimPlot(Pancreas_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("Pancreas UMAP")

DimPlot(Pancreas_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("Pancreas t-SNE")

By comparing the UMAP and t-SNE plots, I decided to use the UMAP.

3.7 Annotation Results

# Find gene markers
if (file.exists("pancreas_gene_markers.rds")) {
  pancreas_gene_markers <- readRDS("pancreas_gene_markers.rds")
} else {
  pancreas_gene_markers <- FindAllMarkers(
    Pancreas_pbmc_UMAP,
    min.pct = 0.01,
    logfc.threshold = 0.01,
    test.use = "wilcox",
    verbose = FALSE)
  saveRDS(pancreas_gene_markers, "pancreas_gene_markers.rds")
}

# Filter the gene markers
Pancreas_pbmc_markers_top_5 <-
  pancreas_gene_markers |>
  as_tibble() |>
  filter(avg_log2FC > 2) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ Pancreas_data_IDs$hgnc_symbol[
      match(.x, Pancreas_data_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

Pancreas_pbmc_markers_top_5 |>
  as_tibble() |>
  knitr::kable()
cluster gene_names p_val avg_log2FC pct.1 pct.2 p_val_adj gene
0 GATM 0 2.314042 1.000 0.817 0 ENSG00000171766
0 SYCN 0 2.056717 1.000 0.969 0 ENSG00000179751
0 AMY2A 0 2.232347 0.998 0.994 0 ENSG00000243480
0 AQP8 0 2.280071 0.948 0.561 0 ENSG00000103375
0 AMY2B 0 2.190193 0.993 0.883 0 ENSG00000240038
1 REG3G 0 4.080445 1.000 0.844 0 ENSG00000143954
1 REG1B 0 3.215856 1.000 1.000 0 ENSG00000172023
1 REG1A 0 2.081271 1.000 1.000 0 ENSG00000115386
1 SERPINA3 0 3.717401 0.534 0.083 0 ENSG00000196136
1 C3 0 3.439010 0.685 0.193 0 ENSG00000125730
2 GYPC 0 3.637994 0.741 0.105 0 ENSG00000136732
2 DAB2 0 4.486262 0.553 0.049 0 ENSG00000153071
2 S100A13 0 2.685883 0.962 0.452 0 ENSG00000189171
2 AQP1 0 4.018850 0.662 0.103 0 ENSG00000240583
2 CAVIN1 0 2.775235 0.700 0.100 0 ENSG00000177469
3 HSD11B1-AS1 0 3.592255 0.329 0.034 0 ENSG00000227591
3 LINC01610 0 2.458333 0.883 0.643 0 ENSG00000237643
3 TAC1 0 3.921941 0.428 0.109 0 ENSG00000006128
3 CPA4 0 2.247372 0.266 0.062 0 ENSG00000128510
3 ANXA10 0 2.430645 0.302 0.089 0 ENSG00000109511
4 RXRG 0 6.361009 0.399 0.005 0 ENSG00000143171
4 ADCYAP1 0 5.879420 0.454 0.014 0 ENSG00000141433
4 PCSK1 0 4.752696 0.475 0.018 0 ENSG00000175426
4 TAGLN3 0 5.974577 0.464 0.017 0 ENSG00000144834
4 ASB9 0 6.459432 0.492 0.024 0 ENSG00000102048
5 MALL 0 5.182514 0.982 0.118 0 ENSG00000144063
5 ALDH1A3 0 4.547304 0.945 0.129 0 ENSG00000184254
5 FLNA 0 4.041860 0.982 0.158 0 ENSG00000196924
5 S100A14 0 3.988552 0.936 0.135 0 ENSG00000189334
5 TSPAN15 0 3.189382 0.955 0.157 0 ENSG00000099282
6 IRX2-DT 0 6.099829 0.833 0.023 0 ENSG00000186493
6 F10 0 5.597897 0.833 0.031 0 ENSG00000126218
6 CRH 0 6.225981 0.567 0.012 0 ENSG00000147571
6 SMIM24 0 5.272775 0.817 0.043 0 ENSG00000095932
6 PCSK2 0 5.318404 0.933 0.067 0 ENSG00000125851
7 MALAT1 0 4.024411 1.000 0.992 0 ENSG00000251562
7 NEAT1 0 4.437703 1.000 0.947 0 ENSG00000245532
7 LENG8 0 2.830771 1.000 0.752 0 ENSG00000167615
7 RBM25 0 2.400703 0.982 0.569 0 ENSG00000119707
7 MEG3 0 2.917470 0.912 0.474 0 ENSG00000214548
# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("Pancreas_pbmc_markers_top_5.xlsx")) {
  write_xlsx(Pancreas_pbmc_markers_top_5, 
             "Pancreas_pbmc_markers_top_5.xlsx")
}

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the Pancreas data. I found the following cell types:

Cluster Cell Type
0 Acinar Cell
1 Acinar Cell
2 Ductal Cell
3 Acinar Cell
4 Beta Cell
5 Ductal Cell
6 Alpha Cell
7 Mixed Immune Cell

Since I already have the primary cell types for the Pancreas data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the Pancreas data:

Cell Type Gene Markers
Acinar Cell PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4
Ductal Cell KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9
Beta Cell INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA
Alpha Cell GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9
Mixed Immune Cell CD3D, CD3E, CD3G, CD4, CD8A, CD8B, CD19, CD20, CD22, CD79A, CD79B, CD34, CD38

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the Pancreas data.

  • Acinar Cell: PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4
Acinar_cell_markers <- c("PRSS1", "CELA3A", "CELA3B", "CPA1")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Acinar_cell_markers)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000142789 |CELA3A      |
|ENSG00000219073 |CELA3B      |
|ENSG00000091704 |CPA1        |
|ENSG00000274247 |PRSS1       |
|ENSG00000204983 |PRSS1       |

$feature_plot


$violin_plot

  • Ductal Cell: KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9
Ductal_cell_markers <- c("KRT19", "KRT7", "KRT8", "KRT18", "MUC5AC")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Ductal_cell_markers)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000111057 |KRT18       |
|ENSG00000171345 |KRT19       |
|ENSG00000135480 |KRT7        |
|ENSG00000170421 |KRT8        |
|ENSG00000215182 |MUC5AC      |
|ENSG00000291363 |MUC5AC      |

$feature_plot


$violin_plot

  • Beta Cell: INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA
Beta_cell_markers <- c("INS", "GCG", "IAPP", "PCSK1", "PCSK2")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Beta_cell_markers)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115263 |GCG         |
|ENSG00000121351 |IAPP        |
|ENSG00000254647 |INS         |
|ENSG00000175426 |PCSK1       |
|ENSG00000125851 |PCSK2       |

$feature_plot


$violin_plot

  • Alpha Cell: GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9
Alpha_cell_markers <- c("GCG", "ARX", "IRX2", "IRX3")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Alpha_cell_markers)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000004848 |ARX         |
|ENSG00000115263 |GCG         |
|ENSG00000170561 |IRX2        |
|ENSG00000177508 |IRX3        |

$feature_plot


$violin_plot

  • Mixed Immune Cell: MALAT1, NEAT1
Mixed_immune_cell_markers <- c("MALAT1", "NEAT1")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Mixed_immune_cell_markers)
$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000251562 |MALAT1      |
|ENSG00000245532 |NEAT1       |

$feature_plot


$violin_plot

Annotate the cell types

Annotate the cell types for the Pancreas data based on the gene markers and the feature plots.

# Annotate the cell types for the Pancreas data
# manually rename the seurat_clusters to the cell types
Pancreas_pbmc_UMAP_new_meta <- Pancreas_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Acinar Cell",
    seurat_clusters == 1 ~ "Acinar Cell",
    seurat_clusters == 2 ~ "Ductal Cell",
    seurat_clusters == 3 ~ "Acinar Cell",
    seurat_clusters == 4 ~ "Beta Cell",
    seurat_clusters == 5 ~ "Ductal Cell",
    seurat_clusters == 6 ~ "Alpha Cell",
    seurat_clusters == 7 ~ "Mixed Immune Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))

# add the cell types to the UMAP object
Pancreas_pbmc_UMAP <- AddMetaData(
  Pancreas_pbmc_UMAP,
  metadata = Pancreas_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to a rds file
saveRDS(Pancreas_pbmc_UMAP, "Pancreas_pbmc_UMAP.rds")

3.7.0.1 Summary Plots

Plot the UMAP with the cell types for the Pancreas data.

DimPlot(Pancreas_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 4,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("Pancreas Cell Types") +
  NoLegend()

Plot the heatmap

# plot the heatmap
DoHeatmap(
  Pancreas_pbmc_UMAP,
  features = Pancreas_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  size = 4) |>
  suppressWarnings() +
  NoLegend() +
  rremove("y.text") 

New marker for alpha cells

FeaturePlot(
  Pancreas_pbmc_UMAP,
  features = c("ENSG00000126218"),
  reduction = "umap",
  slot = "data")

4 Conclusion

In this final project, I have analyzed the BoneMarrow1, BoneMarrow2 and Pancreas data using the Seurat package in R. I have identified 14 cell types in the BoneMarrow1 data, 11 cell types in the BoneMarrow2 data, and 5 cell types in the Pancreas data. I have also identified the gene markers for each cell type and annotated the cell types based on the gene markers and the feature plots. Finally, I have plotted the heatmap for the top 5 gene markers for each cell type in all three datasets. The results of this analysis can be used to better understand the cell types in the BoneMarrow1, BoneMarrow2 and Pancreas data and to further investigate the role of these cell types in health and disease.