STAT M254 Final Project

BoneMarrow and Pancreas Data Analysis

Author

Jiachen Ai, UID: 206182615

Published

June 16, 2024

# Load libraries
library(readr)
library(dplyr)
library(tidyverse)
library(Seurat) |> suppressWarnings()
library(patchwork)
library(ggplot2)
library(sctransform)
library(matrixStats)
library(purrr)
library(BayesTools)
library(VGAM) |> suppressWarnings()
library(knitr) |> suppressWarnings()
library(Matrix)
library(fastglmpca)
library(cowplot)
library(gridExtra)
library(ggpubr)
library(motifcluster)
library(cluster)
library(mclust)
library(tightClust)
library(devtools)
library(biomaRt) |> suppressWarnings()
library(presto)
library(writexl)
# library(SingleR)
set.seed(2024)

1 Introduction

In this project, I will analyze two datasets: BoneMarrow and Pancreas. The BoneMarrow dataset contains two datasets: BoneMarrow1 and BoneMarrow2. The Pancreas dataset contains one dataset: Pancreas. The goal of this project is to identify cell types in the BoneMarrow and Pancreas datasets. I will perform the following analysis:

Setup the Seurat Object
Standard pre-processing workfow
Normalizing the data
Identification of highly variable features (feature selection)
Scaling the data
Perform linear dimensional reduction
Determine the ‘dimensionality’ of the dataset
Cluster the cells
Run non-linear dimensional reduction (UMAP/tSNE)
Finding differentially expressed features (cluster biomarkers)
Assigning cell type identity to clusters

2 Part 1: BoneMarrow

2.1 Read the data

# Load the BoneMarrow dataset
BoneMarrow_data1 <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "BoneMarrow_dataset1.rds",
        sep = "")) |>
  as.data.frame()

BoneMarrow_data2 <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "BoneMarrow_dataset2.rds",
        sep = "")) |>
  as.data.frame()

2.2 Create Seurat and QC

# Create the BoneMarrow1 Seurat object
BoneMarrow1 <- Seurat::CreateSeuratObject(
  counts = BoneMarrow_data1,
  project = "BoneMarrow1",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for BoneMarrow1
BoneMarrow1[["percent.mt"]] <- PercentageFeatureSet(
  BoneMarrow1,
  pattern = "^MT-") |>
  suppressWarnings()

# Create the BoneMarrow2 Seurat object
BoneMarrow2 <- Seurat::CreateSeuratObject(
  counts = BoneMarrow_data2,
  project = "BoneMarrow2",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for BoneMarrow2
BoneMarrow2[["percent.mt"]] <- PercentageFeatureSet(
  BoneMarrow2,
  pattern = "^MT-") |>
  suppressWarnings()

VlnPlot(BoneMarrow1,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

VlnPlot(BoneMarrow2,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

2.3 Transformation & Feature selection & Scaling

I will try log1pCPM, and SCTransform to transform the data and compare the results.

# Transform the BoneMarrow1 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_1 <- NormalizeData(
  BoneMarrow1,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pbmc_features_log1pCP1k_1 <- FindVariableFeatures(
  pbmc_log1pCP1k_1,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pbmc_scaled_log1pCP1k_1 <- 
  ScaleData(pbmc_features_log1pCP1k_1, 
            features = 
              VariableFeatures(pbmc_features_log1pCP1k_1))


# Transform the BoneMarrow2 data
# Seurat default log-normalization (log1pCP10k)
pbmc_log1pCP1k_2 <- NormalizeData(
  BoneMarrow2,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pbmc_features_log1pCP1k_2 <- FindVariableFeatures(
  pbmc_log1pCP1k_2,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pbmc_scaled_log1pCP1k_2 <- 
  ScaleData(pbmc_features_log1pCP1k_2, 
            features = 
              VariableFeatures(pbmc_features_log1pCP1k_2))

Try SCTransform to transform the data.

# SCTransform
# SCTransform the BoneMarrow1 data
BoneMarrow1_SCTransform <- SCTransform(
  BoneMarrow1,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

# SCTransform the BoneMarrow2 data
BoneMarrow2_SCTransform <- SCTransform(
  BoneMarrow2,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

Calculate the variance of transformed data and plot it against the mean.

# Calculate the gene variance of transformed data 
# and plot it against the gene mean

# BoneMarrow1 Log1pCPM
gene_var_log1pCPM_1 <- 
  pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, var)

gene_mean_log1pCPM_1 <- 
  pbmc_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, mean)

# BoneMarrow2 Log1pCPM
gene_var_log1pCPM_2 <- 
  pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
  apply(1, var)

gene_mean_log1pCPM_2 <- 
  pbmc_scaled_log1pCP1k_2[["DNA"]]@layers$scale.data |>
  apply(1, mean)


# BoneMarrow1 SCTransform
gene_var_SCTransform_1 <- 
  BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

gene_mean_SCTransform_1 <- 
  BoneMarrow1_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

# BoneMarrow2 SCTransform
gene_var_SCTransform_2 <- 
  BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

gene_mean_SCTransform_2 <- 
  BoneMarrow2_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

# Plot the gene variance against the gene mean
  
# Create a data frame for plotting
gene_var_mean_df <-
  data.frame(Gene_Variance =
               c(gene_var_log1pCPM_1,
                 gene_var_log1pCPM_2,
                 gene_var_SCTransform_1,
                 gene_var_SCTransform_2),
             Gene_Mean =
               c(gene_mean_log1pCPM_1,
                 gene_mean_log1pCPM_2,
                 gene_mean_SCTransform_1,
                 gene_mean_SCTransform_2),
             Method =
               c(rep("Log1pCPM", 
                     length(gene_var_log1pCPM_1) + 
                       length(gene_var_log1pCPM_2)),
                 rep("SCTransform", 
                     length(gene_var_SCTransform_1) + 
                       length(gene_var_SCTransform_2))))    

# Plot the gene means vs. different variances
ggplot(gene_var_mean_df, aes(x = Gene_Mean, 
                             y = Gene_Variance, 
                             color = Method)) +
  geom_point(alpha = 0.3) +
  theme_grey() +
  facet_wrap(~ Method, scales = "free") +
  labs(title = "Gene Variance vs. Gene Mean",
       x = "Gene Mean",
       y = "Gene Variance") +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

# Calculate the Spearman correlation
correlation_log1pCPM_1 <- 
  cor(gene_var_log1pCPM_1, gene_mean_log1pCPM_1, 
      method = "spearman")

correlation_log1pCPM_2 <- 
  cor(gene_var_log1pCPM_2, gene_mean_log1pCPM_2,
      method = "spearman")

correlation_SCTransform_1 <- 
  cor(gene_var_SCTransform_1, gene_mean_SCTransform_1,
      method = "spearman")

correlation_SCTransform_2 <- 
  cor(gene_var_SCTransform_2, gene_mean_SCTransform_2,
      method = "spearman")

# print the Spearman correlation results
cbind(Method = c("Log1pCPM", "Log1pCPM", 
                 "SCTransform", "SCTransform"),
      Dataset = c("BoneMarrow1", "BoneMarrow2", 
                  "BoneMarrow1", "BoneMarrow2"),
      Correlation = c(correlation_log1pCPM_1, 
                      correlation_log1pCPM_2,
                      correlation_SCTransform_1, 
                      correlation_SCTransform_2)) |>
  knitr::kable()

Method	Dataset	Correlation
Log1pCPM	BoneMarrow1	0.974603251887739
Log1pCPM	BoneMarrow2	0.961849201395136
SCTransform	BoneMarrow1	-0.01872351035011
SCTransform	BoneMarrow2	0.00471352040124233

It can be seen from the resutls that the SCTransform method has a lower correlation between gene variance and gene mean compared to the Log1pCPM method. This indicates that the SCTransform method is better at normalizing the data. Therefore, I will use the SCTransform method for downstream analysis.

2.4 Linear dimensional reduction

# perform PCA
BoneMarrow1_pbmc_PCA <- RunPCA(
  BoneMarrow1_SCTransform,
  features = VariableFeatures(BoneMarrow1_SCTransform),
  npcs = 100, # tune this parameter
  verbose = FALSE)
ElbowPlot(BoneMarrow1_pbmc_PCA, ndims = 100)

BoneMarrow2_pbmc_PCA <- RunPCA(
  BoneMarrow2_SCTransform,
  features = VariableFeatures(BoneMarrow2_SCTransform),
  npcs = 100, # tune this parameter
  verbose = FALSE)
ElbowPlot(BoneMarrow2_pbmc_PCA, ndims = 100)

2.5 Get the gene names

Ensembl IDs to gene

mrt = useMart(biomart = "ensembl",
              dataset = "hsapiens_gene_ensembl")

if (file.exists("BoneMarrow_data1_IDs.rds")) {
  BoneMarrow_data1_IDs <- readRDS("BoneMarrow_data1_IDs.rds")
} else {
  BoneMarrow_data1_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(BoneMarrow_data1),
    mart = mrt)
  saveRDS(BoneMarrow_data1_IDs, "BoneMarrow_data1_IDs.rds")
}

if (file.exists("BoneMarrow_data2_IDs.rds")) {
  BoneMarrow_data2_IDs <- readRDS("BoneMarrow_data2_IDs.rds")
} else {
  BoneMarrow_data2_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(BoneMarrow_data2),
    mart = mrt)
  saveRDS(BoneMarrow_data2_IDs, "BoneMarrow_data2_IDs.rds")
}

2.6 Clustering

Perform neighborhood-based clustering (I tried different parameters and the ones below gave the best results).

# Perform neighborhood-based clustering
BoneMarrow1_pbmc_neighbor <- FindNeighbors(
  BoneMarrow1_pbmc_PCA,
  reduction = "pca",
  dims = 1:100, # tune this parameter
  k.param = 30, # tune this parameter
  prune.SNN = 1/10, # tune this parameter
  verbose = FALSE)

BoneMarrow1_pbmc_cluster <- FindClusters(
  BoneMarrow1_pbmc_neighbor,
  resolution = 1.8, # tune this parameter
  verbose = FALSE)

BoneMarrow2_pbmc_neighbor <- FindNeighbors(
  BoneMarrow2_pbmc_PCA,
  reduction = "pca",
  dims = 1:100, # tune this parameter
  k.param = 40, # tune this parameter
  prune.SNN = 1/10, # tune this parameter
  verbose = FALSE)

BoneMarrow2_pbmc_cluster <- FindClusters(
  BoneMarrow2_pbmc_neighbor,
  resolution = 0.8, # tune this parameter
  verbose = FALSE)

Run UMAP and t-SNE

# Run UMAP
BoneMarrow1_pbmc_UMAP <- RunUMAP(
  BoneMarrow1_pbmc_cluster,
  min.dist = 0.3,
  n.neighbors = 30,
  dims = 1:79,
  metric = "cosine",
  verbose = FALSE) |>
  suppressWarnings()

BoneMarrow2_pbmc_UMAP <- RunUMAP(
  BoneMarrow2_pbmc_cluster,
  min.dist = 0.3,
  n.neighbors = 30,
  dims = 1:87,
  metric = "cosine",
  verbose = FALSE)

# try t_SNE
BoneMarrow1_pbmc_tSNE <- RunTSNE(
  BoneMarrow1_pbmc_cluster,
  dims = 1:50,
  verbose = FALSE)

BoneMarrow2_pbmc_tSNE <- RunTSNE(
  BoneMarrow2_pbmc_cluster,
  dims = 1:98,
  verbose = FALSE)

Plot the UMAP

DimPlot(BoneMarrow1_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow1 UMAP")

DimPlot(BoneMarrow2_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow2 UMAP")

Plot the t-SNE

DimPlot(BoneMarrow1_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow1 t-SNE")

DimPlot(BoneMarrow2_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("BoneMarrow2 t-SNE")

By comparing the UMAP and t-SNE plots, I decided to use the UMAP for both datasets.

2.7 Annotation Results

2.7.1 BoneMarrow1 data

# Find gene markers for the BoneMarrow1 data

# use the UMAP for the BoneMarrow1 data to find the gene markers

if (file.exists("BoneMarrow1_pbmc_markers.rds")) {
  BoneMarrow1_pbmc_markers <- readRDS("BoneMarrow1_pbmc_markers.rds")
} else {
  BoneMarrow1_pbmc_markers <- 
    FindAllMarkers(
      BoneMarrow1_pbmc_UMAP,
      min.pct = 0.01,
      logfc.threshold = 0.01,
      test.use = "wilcox",
      verbose = FALSE)
  saveRDS(BoneMarrow1_pbmc_markers, "BoneMarrow1_pbmc_markers.rds")
}

# select the top 5 highly expressed genes in each cluster
# to roughly identify the cell types
BoneMarrow1_pbmc_markers_top_5 <-
BoneMarrow1_pbmc_markers |>
  as_tibble() |>
  filter(avg_log2FC > 4) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ BoneMarrow_data1_IDs$hgnc_symbol[
      match(.x, BoneMarrow_data1_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow1_pbmc_markers_top_5.xlsx")) {
  write_xlsx(BoneMarrow1_pbmc_markers_top_5, 
             "BoneMarrow1_pbmc_markers_top_5.xlsx")
}

BoneMarrow1_pbmc_markers_top_5 |>
  knitr::kable()

cluster	gene_names	avg_log2FC	pct.1	pct.2	p_val_adj	gene
0	CCR4	4.874914	0.382	0.014	0.0000000	ENSG00000183813
0	TTC39C-AS1	4.401808	0.091	0.004	0.0000000	ENSG00000264745
0	WNT7A	4.294893	0.087	0.004	0.0000000	ENSG00000154764
0	FOXP3	4.169362	0.044	0.002	0.0000000	ENSG00000049768
1	MAFB	4.238447	0.972	0.106	0.0000000	ENSG00000204103
1	THBS1	4.626654	0.816	0.069	0.0000000	ENSG00000137801
1	PID1	4.263354	0.624	0.039	0.0000000	ENSG00000153823
1	CYP1B1	5.438358	0.543	0.025	0.0000000	ENSG00000138061
1	CD300E	4.320640	0.571	0.030	0.0000000	ENSG00000186407
2	KLRC2	4.734576	0.829	0.053	0.0000000	ENSG00000205809
2	TKTL1	5.867000	0.566	0.017	0.0000000	ENSG00000007350
2	KIR3DL2	4.176698	0.483	0.028	0.0000000	ENSG00000240403
2	KLRC3	4.646220	0.415	0.021	0.0000000	ENSG00000205810
2	LDB2	4.309103	0.102	0.005	0.0000000	ENSG00000169744
3	AZU1	7.113508	0.741	0.034	0.0000000	ENSG00000172232
3	MPO	7.051123	0.615	0.030	0.0000000	ENSG00000005381
3	ELANE	7.931680	0.498	0.014	0.0000000	ENSG00000197561
3	RNASE2	4.601070	0.678	0.054	0.0000000	ENSG00000169385
3	CTSG	7.272826	0.390	0.007	0.0000000	ENSG00000100448
4	TNFRSF9	5.339711	0.261	0.012	0.0000000	ENSG00000049249
4	CCL3-AS1	4.315129	0.227	0.017	0.0000000	ENSG00000277089
4	CXCR6	4.409252	0.177	0.010	0.0000000	ENSG00000172215
4	DKK3	4.144354	0.148	0.010	0.0000000	ENSG00000050165
4	VCAM1	6.842211	0.025	0.000	0.0000000	ENSG00000162692
5	MS4A1	7.384164	0.866	0.015	0.0000000	ENSG00000156738
5	CD79A	7.144223	0.985	0.034	0.0000000	ENSG00000105369
5	LINC00926	7.117973	0.738	0.012	0.0000000	ENSG00000247982
5	BANK1	6.714238	0.713	0.019	0.0000000	ENSG00000153064
5	FCER2	7.537949	0.554	0.006	0.0000000	ENSG00000104921
6	MYOM2	5.380428	0.885	0.050	0.0000000	ENSG00000036448
6	FGFBP2	4.273741	0.935	0.093	0.0000000	ENSG00000137441
6	SH2D1B	4.317394	0.620	0.037	0.0000000	ENSG00000198574
6	SPON2	4.029231	0.975	0.208	0.0000000	ENSG00000159674
6	LINC00299	5.023763	0.370	0.016	0.0000000	ENSG00000236790
7	FSBP	5.568025	0.012	0.000	0.0002713	ENSG00000265817
7	CELA1	5.568025	0.012	0.000	0.0002713	ENSG00000139610
8	CCL3L3	4.485081	0.781	0.057	0.0000000	ENSG00000276085
8	KCNE1	4.955907	0.323	0.014	0.0000000	ENSG00000180509
8	TNFAIP6	4.494003	0.303	0.022	0.0000000	ENSG00000123610
8		4.503048	0.181	0.008	0.0000000	ENSG00000287553
8	CXCL1	4.981095	0.168	0.008	0.0000000	ENSG00000163739
9	TRAV14DV4	4.364786	0.263	0.014	0.0000000	ENSG00000211792
9	TRDV2	6.154833	0.180	0.005	0.0000000	ENSG00000211821
9	ZNF683	4.564768	0.218	0.010	0.0000000	ENSG00000176083
9	TRBV5-1	5.172667	0.241	0.014	0.0000000	ENSG00000211734
9	TRGV2	4.735881	0.158	0.009	0.0000000	ENSG00000233306
10	SLC4A10	7.069240	0.250	0.002	0.0000000	ENSG00000144290
10	TRAV1-2	4.894153	0.189	0.006	0.0000000	ENSG00000256553
10	LINC01644	5.024152	0.144	0.004	0.0000000	ENSG00000218357
10	TRBV6-4	5.680197	0.091	0.002	0.0000000	ENSG00000211713
10	KIF5C	4.225359	0.091	0.005	0.0000000	ENSG00000168280
11	LINC02812	9.882266	0.825	0.001	0.0000000	ENSG00000230138
11	LILRA4	8.612708	0.992	0.012	0.0000000	ENSG00000239961
11	PTPRS	8.572198	0.775	0.003	0.0000000	ENSG00000105426
11	CLEC4C	8.503754	0.850	0.004	0.0000000	ENSG00000198178
11	SCT	8.283629	0.592	0.002	0.0000000	ENSG00000070031
12	SPTA1	6.572461	0.030	0.000	0.0000000	ENSG00000163554
12	SELP	6.350069	0.030	0.000	0.0000000	ENSG00000174175
12	MFSD2B	5.765106	0.030	0.000	0.0000000	ENSG00000205639
12	POLQ	5.765106	0.030	0.000	0.0000000	ENSG00000051341
12	B3GALNT1	5.765106	0.030	0.000	0.0000000	ENSG00000169255
13	CRHBP	11.350012	0.762	0.000	0.0000000	ENSG00000145708
13	CD34	8.263918	0.845	0.004	0.0000000	ENSG00000174059
13	CYTL1	7.493245	0.690	0.007	0.0000000	ENSG00000170891
13	NPR3	8.010162	0.560	0.003	0.0000000	ENSG00000113389
13	EGFL7	5.907402	0.833	0.019	0.0000000	ENSG00000172889
14	LINC02446	5.434920	0.836	0.037	0.0000000	ENSG00000256039
14	CD8B	4.262326	0.970	0.076	0.0000000	ENSG00000172116
14	C20orf204	4.298116	0.209	0.011	0.0000000	ENSG00000196421
14	PCSK1N	4.345422	0.224	0.015	0.0000000	ENSG00000102109
14	CD248	7.667350	0.045	0.000	0.0000000	ENSG00000174807
15	CLEC10A	7.985711	0.578	0.004	0.0000000	ENSG00000132514
15	ENHO	7.668228	0.356	0.002	0.0000000	ENSG00000168913
15	CD1C	6.268298	0.578	0.013	0.0000000	ENSG00000158481
15	FCER1A	5.012487	0.844	0.040	0.0000000	ENSG00000179639
15	AXL	6.400748	0.289	0.004	0.0000000	ENSG00000167601
16	LYPD2	13.985578	0.543	0.000	0.0000000	ENSG00000197353
16	UICLM	10.106432	0.629	0.000	0.0000000	ENSG00000233392
16	HES4	7.714115	0.514	0.003	0.0000000	ENSG00000188290
16	CDKN1C	6.806037	0.571	0.007	0.0000000	ENSG00000129757
16	VMO1	9.205968	0.257	0.000	0.0000000	ENSG00000182853
17	SDC1	10.283611	0.548	0.000	0.0000000	ENSG00000115884
17	TNFRSF17	8.133368	0.645	0.003	0.0000000	ENSG00000048462
17	JSRP1	10.605539	0.452	0.000	0.0000000	ENSG00000167476
17	IGLV3-1	11.477532	0.581	0.005	0.0000000	ENSG00000211673
17	PARM1	7.476256	0.452	0.003	0.0000000	ENSG00000169116

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow1 data. I found the following cell types:

Cluster	Cell Type
0	Regulatory T-cell
1	Macrophage Cell
2	Natural Killer Cell
3	Neutrophil Precursors
4	CD8 T Cell
5	Naïve B Cell
6	Natural Killer Cell
7	Megakaryocyte Cell
8	Macrophage Cell
9	CD8 T Cell
10	Mucosal-associated Invariant T Cell
11	Plasmacytoid Dendritic Cell
12	Erythroid Cells
13	Hematopoietic Stem Cells
14	CD8 T Cell
15	Dendritic Cell
16	Non-classical Monocyte Cell
17	Plasma Cell

Since I already have the primary cell types for the BoneMarrow1 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the BoneMarrow1 data:

Cell Type	Gene Markers
Regulatory T-cell	CD25, CD4, FOXP3, CCR4, CCR6
Macrophage Cell	ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14
Natural Killer Cell	TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Neutrophil Precursors	S100A8, S100A9, S100A12, LCN2, LTF
CD8 T Cell	CD3, CD5, CD8, CD27, CD28
Naïve B Cell	P2RX5, PTPRCAP
Natural Killer Cell	TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
Megakaryocyte Cell	IL6ST
Macrophage Cell	ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206, CD14
CD8 T Cell	CD3, CD5, CD8, CD27, CD28
Mucosal-associated Invariant T Cell	TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Plasmacytoid Dendritic Cell	BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Erythroid Cells	AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO
Hematopoietic Stem Cells	CD34, MECOM, EGR1
CD8 T Cell	CD8A, CD8B, CD3D, CD3E, CD3G, CD4, CD5, CD6, CD7, CD8, CD8, CD8
Dendritic Cell	CD1C, CD86
Non-classical Monocyte Cell	CD14, CD16, CX3CR1, LYPD2
Plasma Cell	MZB1, IGHG1, JCHAIN, SPAG4, TGM5

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow1 data.

Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6

# define a function to plot feature plots for the classic gene markers
plot_feature_plots <- function(cell_type, classic_gene_markers) {
  # map the gene names back to the ENSEMBL IDs
  classic_gene_maker_names <- c(
    classic_gene_markers)

  Ensembl <- getBM(
    attributes=c("ensembl_gene_id","hgnc_symbol"),
    filters = 'hgnc_symbol', values = classic_gene_maker_names,
    mart = mrt)
  
  feature_plot <- FeaturePlot(
    cell_type,
    features = Ensembl$ensembl_gene_id,
    pt.size = 0.1) |>
    suppressWarnings()
  
  violin_plot <- VlnPlot(
    cell_type,
    features = Ensembl$ensembl_gene_id,
    pt.size = 0.1,
    ncol = 2) |>
    suppressWarnings()

  gene_id_names <- Ensembl |>
    # pivot_wider(names_from = hgnc_symbol, 
    #             values_from = ensembl_gene_id) |>
    kable() |>
    suppressWarnings()
  
return(list(
    gene_id_names = gene_id_names,
    feature_plot = feature_plot,
    violin_plot = violin_plot
  ))
}

# map the gene names back to the ENSEMBL IDs
Regulatory_T_classic_gene_maker_names <- c(
  "CD25", "CD4", "FOXP3", "CCR4", "CCR6")

plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Regulatory_T_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4        |
|ENSG00000112486 |CCR6        |
|ENSG00000010610 |CD4         |
|ENSG00000049768 |FOXP3       |

$feature_plot


$violin_plot

Macrophage Cell: ARG, CCL2, CD163, CD206, FIZZ1, IL-10, MRC1, CD163, CD206

Macrophage_Cell_classic_gene_maker_names <- c(
  "ARG", "CCL2", "CD163", "CD206", "FIZZ1", "IL-10", "MRC1", "CD163", "CD206", "CD14")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Macrophage_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000108691 |CCL2        |
|ENSG00000170458 |CD14        |
|ENSG00000177575 |CD163       |
|ENSG00000260314 |MRC1        |

$feature_plot


$violin_plot

NK Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX

NK_Cell_classic_gene_maker_names <- c(
  "TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   NK_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY        |
|ENSG00000145649 |GZMA        |
|ENSG00000171476 |HOPX        |
|ENSG00000150045 |KLRF1       |
|ENSG00000275521 |NCR1        |
|ENSG00000278025 |NCR1        |
|ENSG00000277442 |NCR1        |
|ENSG00000277334 |NCR1        |
|ENSG00000276450 |NCR1        |
|ENSG00000277629 |NCR1        |
|ENSG00000273506 |NCR1        |
|ENSG00000275637 |NCR1        |
|ENSG00000277824 |NCR1        |
|ENSG00000275822 |NCR1        |
|ENSG00000274053 |NCR1        |
|ENSG00000278362 |NCR1        |
|ENSG00000273916 |NCR1        |
|ENSG00000273535 |NCR1        |
|ENSG00000275156 |NCR1        |
|ENSG00000189430 |NCR1        |
|ENSG00000211829 |TRDC        |

$feature_plot


$violin_plot

Neutrophil Precursors: S100A8, S100A9, S100A12, LCN2, LTF

Neutrophil_Precursors_classic_gene_maker_names <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Neutrophil_Precursors_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2        |
|ENSG00000012223 |LTF         |
|ENSG00000163221 |S100A12     |
|ENSG00000143546 |S100A8      |
|ENSG00000163220 |S100A9      |

$feature_plot


$violin_plot

CD8 T Cell: CD3, CD5, CD8, CD27, CD28

CD8_T_Cell_classic_gene_maker_names <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   CD8_T_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27        |
|ENSG00000178562 |CD28        |
|ENSG00000110448 |CD5         |

$feature_plot


$violin_plot

Naïve B Cell: P2RX5, PTPRCAP

Naive_B_Cell_classic_gene_maker_names <- c("P2RX5", "PTPRCAP")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Naive_B_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000083454 |P2RX5       |
|ENSG00000213402 |PTPRCAP     |

$feature_plot


$violin_plot

Megakaryocyte Cell: IL6ST

Megakaryocyte_Cell_classic_gene_maker_names <- c("IL6ST")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Megakaryocyte_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000134352 |IL6ST       |

$feature_plot


$violin_plot

Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10

Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Mucosal_associated_Invariant_T_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10     |
|ENSG00000211877 |TRAJ12      |
|ENSG00000211869 |TRAJ20      |
|ENSG00000211856 |TRAJ33      |
|ENSG00000256553 |TRAV1-2     |

$feature_plot


$violin_plot

Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1，CD40, CD80, CD86,

Plasmacytoid_Dendritic_Cell_classic_gene_maker_names <- c(
  "BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Plasmacytoid_Dendritic_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C      |
|ENSG00000099250 |NRP1        |

$feature_plot


$violin_plot

Erythroid Cell: AHSP, HBA-A1, HBA-A2, HIST1H2AO, STFA1, ALAD, PNPO

Erythroid_Cell_classic_gene_maker_names <- c("AHSP", "HBA-A1", "HBA-A2", "HIST1H2AO", "STFA1","ALAD", "PNPO")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Erythroid_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000169877 |AHSP        |
|ENSG00000148218 |ALAD        |
|ENSG00000108439 |PNPO        |

$feature_plot


$violin_plot

Hematopoietic Stem Cell: CD34, MECOM, EGR1

Hematopoietic_Stem_Cell_classic_gene_maker_names <- c("CD34", "MECOM", "EGR1")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Hematopoietic_Stem_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000174059 |CD34        |
|ENSG00000120738 |EGR1        |
|ENSG00000085276 |MECOM       |

$feature_plot


$violin_plot

Dendritic Cell: CD1C, CD86

Dendritic_Cell_classic_gene_maker_names <- c(
  "CD1C", "CD86")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Dendritic_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C        |
|ENSG00000114013 |CD86        |

$feature_plot


$violin_plot

Non-classical Monocyte Cell: CD14, CD16, CX3CR1, LYPD2

Non_classical_Monocyte_Cell_classic_gene_maker_names <- c(
  "CD14", "CD16", "LYPD2")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Non_classical_Monocyte_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000170458 |CD14        |
|ENSG00000197353 |LYPD2       |

$feature_plot


$violin_plot

Plasma Cell: MZB1, IGHG1, JCHAIN, SPAG4, TGM5

Plasma_Cell_classic_gene_maker_names <- c("MZB1", "IGHG1", "JCHAIN", "SPAG4", "TGM5")
plot_feature_plots(BoneMarrow1_pbmc_UMAP, 
                   Plasma_Cell_classic_gene_maker_names)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000277633 |IGHG1       |
|ENSG00000211896 |IGHG1       |
|ENSG00000132465 |JCHAIN      |
|ENSG00000170476 |MZB1        |
|ENSG00000061656 |SPAG4       |
|ENSG00000104055 |TGM5        |

$feature_plot


$violin_plot

Annotate the cell types for the BoneMarrow1 data

# Annotate the cell types for the BoneMarrow1 data
# manually rename the seurat_clusters to the cell types
BoneMarrow1_pbmc_UMAP_new_meta <- BoneMarrow1_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Regulatory T-cell",
    seurat_clusters == 1 ~ "Macrophage Cell",
    seurat_clusters == 2 ~ "Natural Killer Cell",
    seurat_clusters == 3 ~ "Neutrophil Precursors",
    seurat_clusters == 4 ~ "CD8 T cell",
    seurat_clusters == 5 ~ "Naïve B cell",
    seurat_clusters == 6 ~ "Natural Killer Cell",
    seurat_clusters == 7 ~ "Megakaryocyte Cell",
    seurat_clusters == 8 ~ "Macrophage Cell",
    seurat_clusters == 9 ~ "CD8 T cell",
    seurat_clusters == 10 ~ "Mucosal-associated Invariant T Cell",
    seurat_clusters == 11 ~ "Plasmacytoid Dendritic Cell",
    seurat_clusters == 12 ~ "Erythroid Cells",
    seurat_clusters == 13 ~ "Hematopoietic Stem Cells",
    seurat_clusters == 14 ~ "CD8 T cell",
    seurat_clusters == 15 ~ "Dendritic Cell",
    seurat_clusters == 16 ~ "Non-classical Monocyte Cell",
    seurat_clusters == 17 ~ "Plasma Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))

# add the cell types to the UMAP object
BoneMarrow1_pbmc_UMAP <- AddMetaData(
  BoneMarrow1_pbmc_UMAP,
  metadata = BoneMarrow1_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to the RDS file
saveRDS(BoneMarrow1_pbmc_UMAP, "BoneMarrow1_pbmc_UMAP.rds")

2.7.1.1 Summary Plots

Plot the UMAP with the cell types for the BoneMarrow1 data

DimPlot(BoneMarrow1_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 3,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("BoneMarrow1 Cell Types") +
  NoLegend()

Plot the heatmap

DoHeatmap(
  BoneMarrow1_pbmc_UMAP,
  features = BoneMarrow1_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  size = 4) |>
  suppressWarnings() +
  NoLegend() +
  ggtitle("BoneMarrow1 Heatmap") +
  rremove("y.text")

2.7.2 BoneMarrow2 data

# I decided to use the UMAP for the BoneMarrow2 data
# to find the gene markers

if (file.exists("BoneMarrow2_pbmc_markers.rds")) {
  BoneMarrow2_pbmc_markers <- readRDS("BoneMarrow2_pbmc_markers.rds")
} else {
  BoneMarrow2_pbmc_markers <- 
    FindAllMarkers(
      BoneMarrow2_pbmc_UMAP,
      min.pct = 0.01,
      logfc.threshold = 0.01,
      test.use = "wilcox",
      verbose = FALSE)
  saveRDS(BoneMarrow2_pbmc_markers, "BoneMarrow2_pbmc_markers.rds")
}

BoneMarrow2_pbmc_markers_top_5 <-
BoneMarrow2_pbmc_markers |>
  as_tibble() |>
  filter(avg_log2FC > 4) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ BoneMarrow_data2_IDs$hgnc_symbol[
      match(.x, BoneMarrow_data2_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("BoneMarrow2_pbmc_markers_top_5.xlsx")) {
  write_xlsx(BoneMarrow2_pbmc_markers_top_5, 
             "BoneMarrow2_pbmc_markers_top_5.xlsx")
}

BoneMarrow2_pbmc_markers_top_5 |>
  knitr::kable()

cluster	gene_names	avg_log2FC	pct.1	pct.2	p_val_adj	gene
0	S100A12	8.700578	0.897	0.011	0e+00	ENSG00000163221
0	S100A8	7.696677	0.993	0.093	0e+00	ENSG00000143546
0	S100A9	7.390087	0.998	0.138	0e+00	ENSG00000163220
0	CD14	7.300052	0.676	0.009	0e+00	ENSG00000170458
0	CLEC4E	7.265346	0.657	0.007	0e+00	ENSG00000166523
1	CCR4	5.170545	0.306	0.008	0e+00	ENSG00000183813
1	TTC39C-AS1	4.442917	0.097	0.004	0e+00	ENSG00000264745
1	FOXP3	4.069459	0.062	0.004	0e+00	ENSG00000049768
1	PI16	5.528891	0.039	0.000	0e+00	ENSG00000164530
1	NEFL	4.593021	0.043	0.001	0e+00	ENSG00000277586
2	KLRC2	4.488809	0.642	0.057	0e+00	ENSG00000205809
2	TKTL1	5.667839	0.362	0.017	0e+00	ENSG00000007350
2	KLRC3	4.342381	0.345	0.020	0e+00	ENSG00000205810
2	IL5RA	4.732259	0.072	0.003	0e+00	ENSG00000091181
3	MYOM2	4.906103	0.886	0.055	0e+00	ENSG00000036448
3	FGFBP2	4.041734	0.905	0.100	0e+00	ENSG00000137441
3	SH2D1B	4.343918	0.649	0.045	0e+00	ENSG00000198574
3	LINC00299	4.531001	0.431	0.025	0e+00	ENSG00000236790
3	PRSS23	4.230768	0.455	0.034	0e+00	ENSG00000150687
4	TNFRSF9	4.412879	0.325	0.018	0e+00	ENSG00000049249
4	ASTL	4.889529	0.067	0.002	0e+00	ENSG00000188886
4	VCAM1	6.889529	0.029	0.000	0e+00	ENSG00000162692
4	ZNF80	5.889529	0.019	0.000	1e-07	ENSG00000174255
5	LRRN3	4.684415	0.187	0.006	0e+00	ENSG00000173114
6	SLC4A10	7.301805	0.395	0.003	0e+00	ENSG00000144290
6	TRAV1-2	4.486837	0.238	0.009	0e+00	ENSG00000256553
6	LINC01644	5.420450	0.195	0.004	0e+00	ENSG00000218357
6	TSPAN15	4.319363	0.157	0.008	0e+00	ENSG00000099282
6	IL23R	5.216916	0.114	0.003	0e+00	ENSG00000162594
7	LILRA4	9.930713	0.975	0.005	0e+00	ENSG00000239961
7	LINC02812	9.230507	0.787	0.002	0e+00	ENSG00000230138
7	CLEC4C	8.741929	0.861	0.003	0e+00	ENSG00000198178
7	LRRC26	8.675292	0.672	0.003	0e+00	ENSG00000184709
7	PROC	8.369202	0.787	0.004	0e+00	ENSG00000115718
8	CD1C	6.120903	0.286	0.010	0e+00	ENSG00000158481
8	CAVIN2	5.834204	0.321	0.019	0e+00	ENSG00000168497
8	CLEC10A	5.894971	0.188	0.003	0e+00	ENSG00000132514
8	ENHO	6.330357	0.152	0.002	0e+00	ENSG00000168913
8	C19orf33	7.107964	0.134	0.001	0e+00	ENSG00000167644
9	CD79A	8.333834	0.958	0.021	0e+00	ENSG00000105369
9	MS4A1	7.611297	0.832	0.015	0e+00	ENSG00000156738
9	CD19	8.874331	0.568	0.002	0e+00	ENSG00000177455
9	FCER2	7.647739	0.653	0.006	0e+00	ENSG00000104921
9	FCRL1	7.857457	0.568	0.003	0e+00	ENSG00000163534
10	CDKN1C	7.368670	0.603	0.004	0e+00	ENSG00000129757
10	CKB	8.221113	0.562	0.003	0e+00	ENSG00000166165
10	LYPD2	13.087031	0.438	0.000	0e+00	ENSG00000197353
10	VMO1	9.435237	0.438	0.001	0e+00	ENSG00000182853
10	UICLM	8.747181	0.438	0.001	0e+00	ENSG00000233392
11	PRTN3	9.439529	0.952	0.019	0e+00	ENSG00000196415
11	C1QTNF4	9.376598	0.629	0.003	0e+00	ENSG00000172247
11	SMIM24	7.842478	0.710	0.006	0e+00	ENSG00000095932
11	CTSG	7.899165	0.790	0.010	0e+00	ENSG00000100448
11	MS4A3	8.151139	0.581	0.002	0e+00	ENSG00000149516
12	SPTSSB	6.160634	0.500	0.010	0e+00	ENSG00000196542
12	XCL1	4.973271	0.871	0.051	0e+00	ENSG00000143184
12	KLRC1	4.814189	0.613	0.035	0e+00	ENSG00000134545
12	TNFRSF11A	5.751829	0.194	0.004	0e+00	ENSG00000141655
12	ADGRG3	4.967806	0.226	0.008	0e+00	ENSG00000182885

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the BoneMarrow2 data. I found the following cell types:

Cluster	Cell Type
0	Neutrophil
1	Regulatory T-cell
2	Natural Killer Cell
3	CD16 Natural Killer Cell
4	CD8 T cell
5	Regulatory T-cell
6	Mucosal-associated Invariant
7	Plasmacytoid Dendritic Cell
8	Dendritic Cell
9	Naïve B cell
10	Non-classical Monocyte Cell
11	Neutrophil Granulocytes
12	Natural Killer Cell

Since I already have the primary cell types for the BoneMarrow2 data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the BoneMarrow2 data:

Cell Type	Gene Markers
Neutrophil	S100A8, S100A9, S100A12, LCN2, LTF
Regulatory T-cell	CD25, CD4, FOXP3, CCR4, CCR6
Natural Killer Cell	TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX
CD16 Natural Killer Cell	MYOM2, FGFBP2, SH2D1B
CD8 T cell	CD3, CD5, CD8, CD27, CD28
Mucosal-associated Invariant T Cell	TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10
Plasmacytoid Dendritic Cell	BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1
Dendritic Cell	CD1C, CD86
Naïve B cell	MS4A1, CD79A, P2RX5, PTPRCAP
Non-classical Monocyte Cell	CD14, CD16, CX3CR1, LYPD2
Neutrophil Granulocytes	S100A8, S100A9, S100A12, LCN2, LTF

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the BoneMarrow2 data.

Neutrophil: S100A8, S100A9, S100A12, LCN2, LTF

Neutrophil_markers_bone_marrow2 <- c("S100A8", "S100A9", "S100A12", "LCN2", "LTF")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Neutrophil_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000148346 |LCN2        |
|ENSG00000012223 |LTF         |
|ENSG00000163221 |S100A12     |
|ENSG00000143546 |S100A8      |
|ENSG00000163220 |S100A9      |

$feature_plot


$violin_plot

Regulatory T-cell: CD25, CD4, FOXP3, CCR4, CCR6

Regulatory_T_cell_markers_bone_marrow2 <- c("CD25", "CD4", "FOXP3", "CCR4", "CCR6")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Regulatory_T_cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000183813 |CCR4        |
|ENSG00000112486 |CCR6        |
|ENSG00000010610 |CD4         |
|ENSG00000049768 |FOXP3       |

$feature_plot


$violin_plot

Natural Killer Cell: TRDC, KLRF1, GNLY, NCR1, GZMA, HOPX

Natural_Killer_Cell_markers_bone_marrow2 <- c("TRDC", "KLRF1", "GNLY", "NCR1", "GZMA", "HOPX")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Natural_Killer_Cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115523 |GNLY        |
|ENSG00000145649 |GZMA        |
|ENSG00000171476 |HOPX        |
|ENSG00000150045 |KLRF1       |
|ENSG00000275521 |NCR1        |
|ENSG00000278025 |NCR1        |
|ENSG00000277442 |NCR1        |
|ENSG00000277334 |NCR1        |
|ENSG00000276450 |NCR1        |
|ENSG00000277629 |NCR1        |
|ENSG00000273506 |NCR1        |
|ENSG00000275637 |NCR1        |
|ENSG00000277824 |NCR1        |
|ENSG00000275822 |NCR1        |
|ENSG00000274053 |NCR1        |
|ENSG00000278362 |NCR1        |
|ENSG00000273916 |NCR1        |
|ENSG00000273535 |NCR1        |
|ENSG00000275156 |NCR1        |
|ENSG00000189430 |NCR1        |
|ENSG00000211829 |TRDC        |

$feature_plot


$violin_plot

CD16 Natural Killer Cell: MYOM2, FGFBP2, SH2D1B

CD16_Natural_Killer_Cell_markers_bone_marrow2 <- c("MYOM2", "FGFBP2", "SH2D1B")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   CD16_Natural_Killer_Cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000137441 |FGFBP2      |
|ENSG00000274137 |MYOM2       |
|ENSG00000036448 |MYOM2       |
|ENSG00000198574 |SH2D1B      |

$feature_plot


$violin_plot

CD8 T cell: CD3, CD5, CD8, CD27, CD28

CD8_T_cell_markers_bone_marrow2 <- c("CD3", "CD5", "CD8", "CD27", "CD28")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   CD8_T_cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000139193 |CD27        |
|ENSG00000178562 |CD28        |
|ENSG00000110448 |CD5         |

$feature_plot


$violin_plot

Mucosal-associated Invariant T Cell: TRAV1-2, TRAJ33, TRAJ20, TRAJ12, SLC4A10

Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2 <- c("TRAV1-2", "TRAJ33", "TRAJ20", "TRAJ12", "SLC4A10")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Mucosal_associated_Invariant_T_Cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000144290 |SLC4A10     |
|ENSG00000211877 |TRAJ12      |
|ENSG00000211869 |TRAJ20      |
|ENSG00000211856 |TRAJ33      |
|ENSG00000256553 |TRAV1-2     |

$feature_plot


$violin_plot

Plasmacytoid Dendritic Cell: BDCA2, BDCA-4, CD123, CD303, CD304, CLEC4C, DR6, Fc-epsilon RI-alpha, ILT3, ILT7, NRP1

Plasmacytoid_Dendritic_Cell_markers_bone_marrow2 <- c("BDCA2", "BDCA-4", "CD123", "CD303", "CD304", "CLEC4C", "DR6", "Fc-epsilon RI-alpha", "ILT3", "ILT7", "NRP1")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Plasmacytoid_Dendritic_Cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000198178 |CLEC4C      |
|ENSG00000099250 |NRP1        |

$feature_plot


$violin_plot

Dendritic Cell: CD1C, CD86

Dendritic_Cell_markers_bone_marrow2 <- c("CD1C", "CD86")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Dendritic_Cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000158481 |CD1C        |
|ENSG00000114013 |CD86        |

$feature_plot


$violin_plot

Naïve B cell: MS4A1, CD79A, P2RX5

Naive_B_cell_markers_bone_marrow2 <- c("MS4A1", "CD79A", "P2RX5")
plot_feature_plots(BoneMarrow2_pbmc_UMAP, 
                   Naive_B_cell_markers_bone_marrow2)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000105369 |CD79A       |
|ENSG00000156738 |MS4A1       |
|ENSG00000083454 |P2RX5       |

$feature_plot


$violin_plot

Annotate the cell types for the BoneMarrow2 data

# Annotate the cell types for the BoneMarrow2 data
# manually rename the seurat_clusters to the cell types
BoneMarrow2_pbmc_UMAP_new_meta <- BoneMarrow2_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Neutrophil",
    seurat_clusters == 1 ~ "Regulatory T-cell",
    seurat_clusters == 2 ~ "Natural Killer Cell",
    seurat_clusters == 3 ~ "CD16 Natural Killer Cell",
    seurat_clusters == 4 ~ "CD8 T Cell",
    seurat_clusters == 5 ~ "Regulatory T-cell",
    seurat_clusters == 6 ~ "Mucosal-associated Invariant T Cell",
    seurat_clusters == 7 ~ "Plasmacytoid Dendritic Cell",
    seurat_clusters == 8 ~ "Dendritic Cell",
    seurat_clusters == 9 ~ "Naïve B Cell",
    seurat_clusters == 10 ~ "Non-classical Monocyte Cell",
    seurat_clusters == 11 ~ "Neutrophil Granulocytes",
    seurat_clusters == 12 ~ "Natural Killer Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))
    
# add the cell types to the UMAP object    
BoneMarrow2_pbmc_UMAP <- AddMetaData(
  BoneMarrow2_pbmc_UMAP,
  metadata = BoneMarrow2_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to a rds file
saveRDS(BoneMarrow2_pbmc_UMAP, "BoneMarrow2_pbmc_UMAP.rds")

2.7.2.1 Summary Plots

Plot the UMAP with the cell types for the BoneMarrow2 data

DimPlot(BoneMarrow2_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 3,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("BoneMarrow2 Cell Types") +
  NoLegend()

Plot the heatmap

DoHeatmap(
  BoneMarrow2_pbmc_UMAP,
  features = BoneMarrow2_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  angle = 23,
  size = 4) |> 
  suppressWarnings() +
  NoLegend() +
  ggtitle("BoneMarrow2 Heatmap") +
  rremove("y.text")

3 Part 2: Pancreas

3.1 Read the data

# Load the Pancreas dataset
Pancreas <- readRDS(
  paste("/Users/jacenai/Desktop/STAT_M254/",
        "Final_Project/Datasets_final/",
        "Pancreas.rds",
        sep = "")) |>
  as.data.frame()

3.2 Create Seurat and QC

# Create the Pancreas Seurat object
Pancreas <- Seurat::CreateSeuratObject(
  counts = Pancreas,
  project = "Pancreas",
  assay = "DNA",
  min.cells = 10,
  min.features = 200) |>
  suppressWarnings()

# Find mitochondrial genes for Pancreas
Pancreas[["percent.mt"]] <- PercentageFeatureSet(
  Pancreas,
  pattern = "^MT-") |>
  suppressWarnings()

VlnPlot(Pancreas,
        features = c("nFeature_DNA", 
                     "nCount_DNA",
                     "percent.mt"), 
        ncol = 3) |>
  suppressWarnings()

3.3 Transformation & Feature selection & Scaling

I will try log1pCPM, and SCTransform to transform the data.

# Transform the pancreas data
# Seurat default log-normalization (log1pCP10k)
pancreas_log1pCP1k_1 <- NormalizeData(
  Pancreas,
  normalization.method = "LogNormalize",
  scale.factor = 1000)

# identify the highly variable genes
pancreas_features_log1pCP1k_1 <- FindVariableFeatures(
  pancreas_log1pCP1k_1,
  selection.method = "vst",
  nfeatures = 2000) # tune this parameter

# Scale the data
pancreas_scaled_log1pCP1k_1 <- 
  ScaleData(pancreas_features_log1pCP1k_1, 
            features = 
              VariableFeatures(pancreas_features_log1pCP1k_1))


# SCTransform the Pancreas data
pancreas_SCTransform <- SCTransform(
  Pancreas,
  verbose = FALSE,
  variable.features.n = 3000, # tune this parameter
  return.only.var.genes = TRUE,
  assay = "DNA",
  vars.to.regress = c("nCount_DNA", "percent.mt"))

Calculate the variance of transformed data and plot it against the mean.

# Calculate the variance of the transformed data
pancreas_variance_log1pCP1k_1 <- 
  pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, var)

pancreas_mean_log1pCP1k_1 <- 
  pancreas_scaled_log1pCP1k_1[["DNA"]]@layers$scale.data |>
  apply(1, mean)

pancreas_variance_SCTransform <- 
  pancreas_SCTransform[["SCT"]]@scale.data |>
  apply(1, var)

pancreas_mean_SCTransform <- 
  pancreas_SCTransform[["SCT"]]@scale.data |>
  apply(1, mean)

Plot the variance against the mean

# Plot the variance against the mean

# Create a data frame for plotting
gene_var_mean_df <-
  data.frame(Gene_Variance =
               c(pancreas_variance_log1pCP1k_1,
                 pancreas_variance_SCTransform),
             Gene_Mean =
               c(pancreas_mean_log1pCP1k_1,
                 pancreas_mean_SCTransform),
             Transformation =
               rep(c("log1pCP1k", "SCTransform"),
                   c(length(pancreas_variance_log1pCP1k_1),
                     length(pancreas_variance_SCTransform))))

# Plot the variance against the mean
ggplot(gene_var_mean_df,
       aes(x = Gene_Mean,
           y = Gene_Variance,
           color = Transformation)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~Transformation,
             scales = "free") +
  theme_gray() +
  labs(title = "Variance vs Mean",
       x = "Mean",
       y = "Variance") +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 5))

I decided to use SCTransform for the downstream analysis.

3.4 Linear dimensional reduction

# I decided to use SCTransform for the downstream analysis
# Perform PCA on the Pancreas data
Pancreas_pbmc_PCA <- RunPCA(
  pancreas_SCTransform,
  features = VariableFeatures(pancreas_SCTransform),
  npcs = 100,
  verbose = FALSE)
ElbowPlot(Pancreas_pbmc_PCA, ndims = 100)

3.5 Get the gene names

Ensembl IDs to gene

if (file.exists("Pancreas_data_IDs.rds")) {
  Pancreas_data_IDs <- readRDS("Pancreas_data_IDs.rds")
} else {
  Pancreas_data_IDs <- getBM(
    attributes = c("ensembl_gene_id","hgnc_symbol"),
    filters = 'ensembl_gene_id',
    values = rownames(Pancreas),
    mart = mrt)
  saveRDS(Pancreas_data_IDs, "Pancreas_data_IDs.rds")
}

3.6 Clustering

# Perform neighborhood-based clustering
Pancreas_pbmc_neighbor <- FindNeighbors(
  Pancreas_pbmc_PCA,
  reduction = "pca",
  dims = 1:100,
  k.param = 30,
  prune.SNN = 1/15,
  verbose = FALSE)

Pancreas_pbmc_cluster <- FindClusters(
  Pancreas_pbmc_neighbor,
  resolution = 0.5 ,
  verbose = FALSE)

Run UMAP and t-SNE

# Run UMAP
Pancreas_pbmc_UMAP <- RunUMAP(
  Pancreas_pbmc_cluster,
  dims = 1:100,
  verbose = FALSE)

# try t_SNE
Pancreas_pbmc_tSNE <- RunTSNE(
  Pancreas_pbmc_cluster,
  dims = 1:90,
  verbose = FALSE)

Plot the UMAP and t-SNE

DimPlot(Pancreas_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("Pancreas UMAP")

DimPlot(Pancreas_pbmc_tSNE,
        reduction = "tsne",
        label = TRUE,
        group.by = "seurat_clusters",
        alpha = 0.5) +
  ggtitle("Pancreas t-SNE")

By comparing the UMAP and t-SNE plots, I decided to use the UMAP.

3.7 Annotation Results

# Find gene markers
if (file.exists("pancreas_gene_markers.rds")) {
  pancreas_gene_markers <- readRDS("pancreas_gene_markers.rds")
} else {
  pancreas_gene_markers <- FindAllMarkers(
    Pancreas_pbmc_UMAP,
    min.pct = 0.01,
    logfc.threshold = 0.01,
    test.use = "wilcox",
    verbose = FALSE)
  saveRDS(pancreas_gene_markers, "pancreas_gene_markers.rds")
}

# Filter the gene markers
Pancreas_pbmc_markers_top_5 <-
  pancreas_gene_markers |>
  as_tibble() |>
  filter(avg_log2FC > 2) |>
  filter(p_val_adj < 0.01) |>
  group_by(cluster, p_val_adj) |>
  
  # arrange the gene with the order of 
  # high in avg_log2FC and low in p_val_adj
  arrange(cluster, p_val_adj, desc(avg_log2FC)) |>
  mutate(gene_names = map_chr(
    gene, ~ Pancreas_data_IDs$hgnc_symbol[
      match(.x, Pancreas_data_IDs$ensembl_gene_id)])) |>
  
  # select the top 5 genes in each cluster
  group_by(cluster) |>
  dplyr::slice(1:5) |>
  dplyr::select(cluster, gene_names, everything()) 

Pancreas_pbmc_markers_top_5 |>
  as_tibble() |>
  knitr::kable()

cluster	gene_names	avg_log2FC	pct.1	pct.2	gene
0	GATM	2.314042	1.000	0.817	ENSG00000171766
0	SYCN	2.056717	1.000	0.969	ENSG00000179751
0	AMY2A	2.232347	0.998	0.994	ENSG00000243480
0	AQP8	2.280071	0.948	0.561	ENSG00000103375
0	AMY2B	2.190193	0.993	0.883	ENSG00000240038
1	REG3G	4.080445	1.000	0.844	ENSG00000143954
1	REG1B	3.215856	1.000	1.000	ENSG00000172023
1	REG1A	2.081271	1.000	1.000	ENSG00000115386
1	SERPINA3	3.717401	0.534	0.083	ENSG00000196136
1	C3	3.439010	0.685	0.193	ENSG00000125730
2	GYPC	3.637994	0.741	0.105	ENSG00000136732
2	DAB2	4.486262	0.553	0.049	ENSG00000153071
2	S100A13	2.685883	0.962	0.452	ENSG00000189171
2	AQP1	4.018850	0.662	0.103	ENSG00000240583
2	CAVIN1	2.775235	0.700	0.100	ENSG00000177469
3	HSD11B1-AS1	3.592255	0.329	0.034	ENSG00000227591
3	LINC01610	2.458333	0.883	0.643	ENSG00000237643
3	TAC1	3.921941	0.428	0.109	ENSG00000006128
3	CPA4	2.247372	0.266	0.062	ENSG00000128510
3	ANXA10	2.430645	0.302	0.089	ENSG00000109511
4	RXRG	6.361009	0.399	0.005	ENSG00000143171
4	ADCYAP1	5.879420	0.454	0.014	ENSG00000141433
4	PCSK1	4.752696	0.475	0.018	ENSG00000175426
4	TAGLN3	5.974577	0.464	0.017	ENSG00000144834
4	ASB9	6.459432	0.492	0.024	ENSG00000102048
5	MALL	5.182514	0.982	0.118	ENSG00000144063
5	ALDH1A3	4.547304	0.945	0.129	ENSG00000184254
5	FLNA	4.041860	0.982	0.158	ENSG00000196924
5	S100A14	3.988552	0.936	0.135	ENSG00000189334
5	TSPAN15	3.189382	0.955	0.157	ENSG00000099282
6	IRX2-DT	6.099829	0.833	0.023	ENSG00000186493
6	F10	5.597897	0.833	0.031	ENSG00000126218
6	CRH	6.225981	0.567	0.012	ENSG00000147571
6	SMIM24	5.272775	0.817	0.043	ENSG00000095932
6	PCSK2	5.318404	0.933	0.067	ENSG00000125851
7	MALAT1	4.024411	1.000	0.992	ENSG00000251562
7	NEAT1	4.437703	1.000	0.947	ENSG00000245532
7	LENG8	2.830771	1.000	0.752	ENSG00000167615
7	RBM25	2.400703	0.982	0.569	ENSG00000119707
7	MEG3	2.917470	0.912	0.474	ENSG00000214548

# export the top 5 highly expressed genes each cluster to a xlsx file
if (!file.exists("Pancreas_pbmc_markers_top_5.xlsx")) {
  write_xlsx(Pancreas_pbmc_markers_top_5, 
             "Pancreas_pbmc_markers_top_5.xlsx")
}

By looking up the highly expressed genes in each cluster online and in the literature, I can roughly identify the cell types in the Pancreas data. I found the following cell types:

Cluster	Cell Type
0	Acinar Cell
1	Acinar Cell
2	Ductal Cell
3	Acinar Cell
4	Beta Cell
5	Ductal Cell
6	Alpha Cell
7	Mixed Immune Cell

Since I already have the primary cell types for the Pancreas data from the top 5 highly expressed genes in each cluster, we can verfiy the cell types by gene markers from the literature or from online databases (e.g., CellMarker 2.0, PanglaoDB)

And I found the following gene markers for the cell types in the Pancreas data:

Cell Type	Gene Markers
Acinar Cell	PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4
Ductal Cell	KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9
Beta Cell	INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA
Alpha Cell	GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9
Mixed Immune Cell	CD3D, CD3E, CD3G, CD4, CD8A, CD8B, CD19, CD20, CD22, CD79A, CD79B, CD34, CD38

Then plot feature plots for the classic gene markers for each cell type to verify the cell types in the Pancreas data.

Acinar Cell: PRSS1, CELA3A, CELA3B, CPA1, CTRB1, CTRB2, REG1A, REG1B, REG3A, REG3G, REG4

Acinar_cell_markers <- c("PRSS1", "CELA3A", "CELA3B", "CPA1")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Acinar_cell_markers)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000142789 |CELA3A      |
|ENSG00000219073 |CELA3B      |
|ENSG00000091704 |CPA1        |
|ENSG00000274247 |PRSS1       |
|ENSG00000204983 |PRSS1       |

$feature_plot


$violin_plot

Ductal Cell: KRT19, KRT7, KRT8, KRT18, KRT20, MUC1, MUC6, MUC5AC, MUC5B, SOX9

Ductal_cell_markers <- c("KRT19", "KRT7", "KRT8", "KRT18", "MUC5AC")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Ductal_cell_markers)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000111057 |KRT18       |
|ENSG00000171345 |KRT19       |
|ENSG00000135480 |KRT7        |
|ENSG00000170421 |KRT8        |
|ENSG00000215182 |MUC5AC      |
|ENSG00000291363 |MUC5AC      |

$feature_plot


$violin_plot

Beta Cell: INS, GCG, IAPP, PCSK1, PCSK2, SLC2A2, SLC30A8, NKX6-1, PDX1, MAFA

Beta_cell_markers <- c("INS", "GCG", "IAPP", "PCSK1", "PCSK2")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Beta_cell_markers)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000115263 |GCG         |
|ENSG00000121351 |IAPP        |
|ENSG00000254647 |INS         |
|ENSG00000175426 |PCSK1       |
|ENSG00000125851 |PCSK2       |

$feature_plot


$violin_plot

Alpha Cell: GCG, ARX, IRX2, IRX3, IRX5, IRX6, IRX1, IRX4, IRX7, IRX8, IRX9

Alpha_cell_markers <- c("GCG", "ARX", "IRX2", "IRX3")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Alpha_cell_markers)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000004848 |ARX         |
|ENSG00000115263 |GCG         |
|ENSG00000170561 |IRX2        |
|ENSG00000177508 |IRX3        |

$feature_plot


$violin_plot

Mixed Immune Cell: MALAT1, NEAT1

Mixed_immune_cell_markers <- c("MALAT1", "NEAT1")
plot_feature_plots(Pancreas_pbmc_UMAP, 
                   Mixed_immune_cell_markers)

$gene_id_names


|ensembl_gene_id |hgnc_symbol |
|:---------------|:-----------|
|ENSG00000251562 |MALAT1      |
|ENSG00000245532 |NEAT1       |

$feature_plot


$violin_plot

Annotate the cell types

Annotate the cell types for the Pancreas data based on the gene markers and the feature plots.

# Annotate the cell types for the Pancreas data
# manually rename the seurat_clusters to the cell types
Pancreas_pbmc_UMAP_new_meta <- Pancreas_pbmc_UMAP@meta.data |>
  # change the seurat_clusters to cell types
  dplyr::mutate(cell_type = case_when(
    seurat_clusters == 0 ~ "Acinar Cell",
    seurat_clusters == 1 ~ "Acinar Cell",
    seurat_clusters == 2 ~ "Ductal Cell",
    seurat_clusters == 3 ~ "Acinar Cell",
    seurat_clusters == 4 ~ "Beta Cell",
    seurat_clusters == 5 ~ "Ductal Cell",
    seurat_clusters == 6 ~ "Alpha Cell",
    seurat_clusters == 7 ~ "Mixed Immune Cell")) |>
  dplyr::mutate(cell_type = as.factor(cell_type))

# add the cell types to the UMAP object
Pancreas_pbmc_UMAP <- AddMetaData(
  Pancreas_pbmc_UMAP,
  metadata = Pancreas_pbmc_UMAP_new_meta,
  col.name = "cell_type")

# export the UMAP object to a rds file
saveRDS(Pancreas_pbmc_UMAP, "Pancreas_pbmc_UMAP.rds")

3.7.0.1 Summary Plots

Plot the UMAP with the cell types for the Pancreas data.

DimPlot(Pancreas_pbmc_UMAP,
        reduction = "umap",
        label = TRUE,
        group.by = "cell_type",
        label.size = 4,
        repel = TRUE,
        alpha = 0.5) +
  ggtitle("Pancreas Cell Types") +
  NoLegend()

Plot the heatmap

# plot the heatmap
DoHeatmap(
  Pancreas_pbmc_UMAP,
  features = Pancreas_pbmc_markers_top_5$gene,
  label = TRUE,
  group.by = "cell_type",
  size = 4) |>
  suppressWarnings() +
  NoLegend() +
  rremove("y.text")

New marker for alpha cells

FeaturePlot(
  Pancreas_pbmc_UMAP,
  features = c("ENSG00000126218"),
  reduction = "umap",
  slot = "data")

4 Conclusion

In this final project, I have analyzed the BoneMarrow1, BoneMarrow2 and Pancreas data using the Seurat package in R. I have identified 14 cell types in the BoneMarrow1 data, 11 cell types in the BoneMarrow2 data, and 5 cell types in the Pancreas data. I have also identified the gene markers for each cell type and annotated the cell types based on the gene markers and the feature plots. Finally, I have plotted the heatmap for the top 5 gene markers for each cell type in all three datasets. The results of this analysis can be used to better understand the cell types in the BoneMarrow1, BoneMarrow2 and Pancreas data and to further investigate the role of these cell types in health and disease.