Section 5: snRNAseq/scRNAseq analysis
snRNAseq analysis
This section uses snRNA-seq BAP.d8 dataset (Khan et. al., GSE171768) and nTE/nCT dataset (Io et. al., GSE167924) to run sn and scRNA-seq analyses with Seurat. Briefly, the count data are imported into R as a Seurat object, samples are integrated, transformed, and clustering analyses is performed. Expression of marker genes in each cluster, composition of cell types and PlacentaCellEnrich are then plotted.
Prerequisites
R packages required for this section are loaded.
setwd("~/github/BAPvsTrophoblast_Amnion")
library(Seurat)
#> Warning in register(): Can't find generic `scale_type` in package ggplot2 to
#> register S3 method.
library(SeuratWrappers)
library(knitr)
library(kableExtra)
library(ggplot2)
library(cowplot)
library(patchwork)
library(metap)
library(multtest)
library(gridExtra)
library(dplyr)
library(stringr)
library(TissueEnrich)
library(gprofiler2)
library(tidyverse)
library(enhancedDimPlot)
library(calibrate)
library(ggrepel)
library(dittoSeq)
library(ComplexHeatmap)
library(RColorBrewer)
library(pheatmap)
library(scales)
library(plotly)
library(R.utils)
library(biomaRt)
Import datasets
The 10X data was already processed with CellRanger
(v.4.0.0) and the count table was ready for us to import for the data analyses. We used the inbuilt function to import this data and to create a Seurat
object as described below.
= "BAP"
experiment_name <- "~/TutejaLab/expression-data"
dataset_loc <-
ids c(
"5pcO2_r1",
"5pcO2_r2",
"20pcO2_r1",
"20pcO2_r2",
"nCT_D5",
"nCT_D10",
"nTE_D2",
"nTE_D3"
)<- sapply(ids, function(i) {
d10x.data <-
d10x Read10X(file.path(dataset_loc, i, "filtered_feature_bc_matrix"))
colnames(d10x) <-
paste(sapply(strsplit(colnames(d10x), split = "-"), '[[' , 1L),
i,sep ="-")
d10x
})<- do.call("cbind", d10x.data)
experiment.data <- CreateSeuratObject(
bapd8.combined
experiment.data,project = "BAPd8",
min.cells = 10,
min.genes = 200,
names.field = 2,
names.delim = "\\-"
)
Data quality inspection
After the data was imported, we checked the quality of the data. Mitochondrial expression is an important criteria (along with other quantitative features of each nuclei) to decide if the nucleus is high quality. We tested it as follows.
MT ratio in nucleus
<- bapd8.combined
bapd8.temp $log10GenesPerUMI <-
bapd8.combinedlog10(bapd8.combined$nFeature_RNA) / log10(bapd8.combined$nCount_RNA)
$mitoRatio <-
bapd8.combinedPercentageFeatureSet(object = bapd8.combined, pattern = "^MT-")
$mitoRatio <- bapd8.combined@meta.data$mitoRatio / 100
bapd8.combined<- bapd8.combined@meta.data
metadata $cells <- rownames(metadata)
metadata<- metadata %>%
metadata ::rename(
dplyrseq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA,
seq_folder = orig.ident
)
<-
mt ggplot(dat = metadata, aes(x = nUMI, y = nGene, color = mitoRatio)) +
geom_point(alpha = 0.5) +
scale_colour_gradient(low = "gray90", high = "black") +
labs(colour = "MT ratio") +
theme_bw(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
panel.border = element_rect(colour = "black")
+
) xlab("RNA counts") + ylab("Gene counts") +
stat_smooth(method = lm) +
facet_wrap(~ seq_folder, labeller = labeller(
seq_folder =
c(
"20pcO2_r1" = "20% Oxygen (rep1)",
"20pcO2_r2" = "20% Oxygen (rep2)",
"5pcO2_r1" = "5% Oxygen (rep1)",
"5pcO2_r2" = "5% Oxygen (rep2)",
"nCT_D5" = "nCT day 5",
"nCT_D10" = "nCT day 10",
"nTE_D2" = "nTE day 2",
"nTE_D3" = "nTE day 3"
)+
)) scale_y_continuous(label = comma) +
scale_x_continuous(label = comma)
mt
Number of nuclei per sample
<- ggplot(metadata, aes(x = seq_folder, fill = seq_folder)) +
ncells geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
+
)) theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Number of Cells")
ncells
Density of nuclei per sample
<-
dcells ggplot(metadata, aes(color = seq_folder, x = nUMI, fill = seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)
dcells
Number of cells vs. genes
<-
ngenes ggplot(metadata, aes(x = seq_folder, y = log10(nGene), fill = seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
+
)) theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("NCells vs NGenes")
ngenes
Number of cells vs. transcripts
<-
dtranscripts ggplot(metadata,
aes(x = log10GenesPerUMI, color = seq_folder, fill = seq_folder)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
dtranscripts
Mitochondrial density across samples
<-
mtratio ggplot(metadata, aes(color = seq_folder, x = mitoRatio, fill = seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
mtratio#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Removed 21 rows containing non-finite values (stat_density).
Data filtering
After inspection, we decided to remove all mitochondrial genes as well as ribosomal genes from our analyses.
set up the metadata file and organize
<- bapd8.temp
bapd8.combined <- bapd8.combined@meta.data
df $replicate <- NA
df$replicate[which(str_detect(df$orig.ident, "5pcO2"))] <- "5pcO2"
df$replicate[which(str_detect(df$orig.ident, "20pcO2"))] <- "20pcO2"
df$replicate[which(str_detect(df$orig.ident, "nCT_"))] <- "nCT"
df$replicate[which(str_detect(df$orig.ident, "nTE_"))] <- "nTE"
df@meta.data <- df
bapd8.combined"percent.mt"]] <-
bapd8.combined[[PercentageFeatureSet(bapd8.combined, pattern = "^MT-")
Comparing samples QC across replicates
<-
p VlnPlot(bapd8.combined, features = "nFeature_RNA", pt.size = 1) +
geom_hline(yintercept = 200,
color = "red",
size = 1) +
geom_hline(yintercept = 10000,
color = "red",
size = 1) +
theme(legend.position = "none")
<- VlnPlot(bapd8.combined, features = "nCount_RNA", pt.size = 1) +
q theme(legend.position = "none")
<- VlnPlot(bapd8.combined, features = "percent.mt", pt.size = 1) +
r geom_hline(yintercept = 15,
color = "red",
size = 1) +
theme(legend.position = "none")
<-
panel_plot plot_grid(p,
q,
r,labels = c("A", "B", "C"),
ncol = 3,
nrow = 1)
panel_plot
Filtering
<-
B1 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
<-
B2 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
<-
bapd8.combined subset(bapd8.combined,
subset = nFeature_RNA > 200 &
< 10000 & percent.mt < 25)
nFeature_RNA <-
I1 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
<-
I2 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
<-
bapd8.combined subset(bapd8.combined,
subset = nFeature_RNA > 200 &
< 10000 & percent.mt < 15)
nFeature_RNA <-
A1 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "percent.mt")
<-
A2 FeatureScatter(bapd8.combined,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
<- B1 | B2
B <- I1 | I2
I <- A1 | A2
A <-
panel_plot plot_grid(
B1,
B2,
I1,
I2,
A1,
A2,labels = c("A", "B", "C", "D", "E", "F"),
ncol = 2,
nrow = 3
) panel_plot
Removing ribosomal and MT transcripts
<- GetAssayData(object = bapd8.combined, slot = "counts")
counts <-
counts grep(pattern = "^MT",
counts[x = rownames(counts),
invert = TRUE), ]
<-
counts grep(pattern = "^MT",
counts[x = rownames(counts),
invert = TRUE), ] #why filtering for MT genes twice?
<-
counts grep(pattern = "^RPL",
counts[x = rownames(counts),
invert = TRUE), ]
<-
counts grep(pattern = "^RPS",
counts[x = rownames(counts),
invert = TRUE), ]
<-
counts grep(pattern = "^MRPS",
counts[x = rownames(counts),
invert = TRUE), ]
<-
counts grep(pattern = "^MRPL",
counts[x = rownames(counts),
invert = TRUE), ]
<- Matrix::rowSums(counts) >= 10
keep_genes <- counts[keep_genes,]
filtered_counts <-
bapd8.fcombined CreateSeuratObject(filtered_counts, meta.data = bapd8.combined@meta.data)
@meta.data <- bapd8.fcombined@meta.data[1:4]
bapd8.fcombined<- bapd8.fcombined bapd8.combined
Data integration and Clustering
Seurat
package was used for integrating samples and running the snRNA-seq analyses.
Data integration/clustering
(see optimization section below)
<- SplitObject(bapd8.combined, split.by = "orig.ident")
bapd8.list <- lapply(
bapd8.list X = bapd8.list,
FUN = function(x) {
<- NormalizeData(x)
x <-
x FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
}
)<-
bapd8.anchors FindIntegrationAnchors(object.list = bapd8.list, dims = 1:20)
<-
bapd8.integrated IntegrateData(anchorset = bapd8.anchors, dims = 1:20)
DefaultAssay(bapd8.integrated) <- "integrated"
<- ScaleData(bapd8.integrated, verbose = FALSE)
bapd8.integrated <-
bapd8.integrated RunPCA(bapd8.integrated, npcs = 30, verbose = FALSE)
<-
bapd8.integrated RunUMAP(bapd8.integrated, reduction = "pca", dims = 1:20)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
<-
bapd8.integrated FindNeighbors(bapd8.integrated, reduction = "pca", dims = 1:20)
<- FindClusters(bapd8.integrated, resolution = 0.5)
bapd8.integrated #> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 17208
#> Number of edges: 610547
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8716
#> Number of communities: 13
#> Elapsed time: 1 seconds
<- nlevels(bapd8.integrated$seurat_clusters)
num.clusters
num.clusters#> [1] 13
Optimization: Cells/Genes defining PCA (4 PCs)
VizDimLoadings(bapd8.integrated, dims = 1:4, reduction = "pca")
Optimization: Determine data dimensionality
<- JackStraw(bapd8.integrated, num.replicate = 100)
bapd8.integrated <- ScoreJackStraw(bapd8.integrated, dims = 1:20)
bapd8.integrated <- ElbowPlot(bapd8.integrated)
elbow <- JackStrawPlot(bapd8.integrated, dims = 1:20)
jack <- plot_grid(elbow, jack, labels = c('A', 'B'), ncol = 1)
panel_plot #> Warning: Removed 28000 rows containing missing values (geom_point).
panel_plot
Renumber the clusters
By default the clusters are numbered 0-12, we need them as 1-13.
<- bapd8.integrated@meta.data
df $new_clusters <- as.factor(as.numeric(df$seurat_clusters))
df@meta.data <- df
bapd8.integratedIdents(bapd8.integrated) <- "new_clusters"
Visualize dimensional reduction
= enhancedDimPlot(
A object = bapd8.integrated,
grouping_var = 'ident',
reduction = "umap",
label = TRUE,
pt.size = 1,
alpha = 0.5
+
) ggtitle("A") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"))
<- enhancedDimPlot(
B object = bapd8.integrated,
grouping_var = 'replicate',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.4
+
) ggtitle("B") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(
legend.justification = c(1, 1),
legend.position = c(1, 1),
plot.title = element_text(face = "bold")
+
) scale_colour_manual(
name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])),
expression(paste('5% ', 'O'[2])),
'nCT',
'nTE'),
values = c(
"20pcO2" = "#DA3C96",
"5pcO2" = "#A90065",
"nCT" = "#FFD74D",
"nTE" = "#9BC13C"
)+
) scale_fill_manual(
name = "Conditions",
labels = c(expression(paste('20% ', 'O'[2])),
expression(paste('5% ', 'O'[2])),
'nCT',
'nTE'),
values = c(
"20pcO2" = "#DA3C96",
"5pcO2" = "#A90065",
"nCT" = "#FFD74D",
"nTE" = "#9BC13C"
)+
) scale_linetype_manual(values = "blank")
<- enhancedDimPlot(
C object = bapd8.integrated,
grouping_var = 'orig.ident',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.4
+
) ggtitle("C") +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(
legend.justification = c(1, 1),
legend.position = c(1, 1),
plot.title = element_text(face = "bold")
+
) scale_colour_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),values = c(
"20pcO2_r1" = "#0571b0",
"20pcO2_r2" = "#92c5de",
"5pcO2_r1" = "#ca0020",
"5pcO2_r2" = "#f4a582",
"nCT_D5" = "#d133ff",
"nCT_D10" = "#ff33f6",
"nTE_D2" = "#33ffa2",
"nTE_D3" = "#5bff33"
)+
) scale_fill_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),values = c(
"20pcO2_r1" = "#0571b0",
"20pcO2_r2" = "#92c5de",
"5pcO2_r1" = "#ca0020",
"5pcO2_r2" = "#f4a582",
"nCT_D5" = "#d133ff",
"nCT_D10" = "#ff33f6",
"nTE_D2" = "#33ffa2",
"nTE_D3" = "#5bff33"
)+
) scale_linetype_manual(values = "blank")
<- plot_grid(A, B, ncol = 2, nrow = 1)
panel_plot panel_plot
DimPlot(object = bapd8.integrated,
split.by = 'orig.ident',
ncol = 4) +
xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none")
Interactive dimensional reduction plot
= enhancedDimPlot(
A object = bapd8.integrated,
grouping_var = 'ident',
reduction = "umap",
label = FALSE,
pt.size = 1,
alpha = 0.5
+
) xlab("UMAP_1") +
ylab("UMAP_2") +
theme_classic() +
theme(legend.position = "none")
ggplotly(A)
Finding cluster markers
Cluster markers are defined as fold change of >= 1.5 and p-value (adj) <= 0.05. We will find all markers for each cluster with a loop.
DefaultAssay(bapd8.integrated) <- "RNA"
for (i in 1:num.clusters) {
try({
<- FindMarkers(bapd8.integrated, ident.1 = i)
cluster.markers.all <-
cluster.markers.filtered %>%
cluster.markers.all filter(avg_log2FC >= 0.584962501) %>%
filter(p_val_adj <= 0.05) %>%
arrange(desc(avg_log2FC))
<- rownames(cluster.markers.filtered)
markers.filtered.names assign(paste("cluster.marker.names", i, sep = "."),
markers.filtered.names)
}) }
Cluster cell type composition
<- tibble(
fullCounts cluster = bapd8.integrated@meta.data$new_clusters,
cell_type = bapd8.integrated@meta.data$orig.ident
%>%
) ::group_by(cluster, cell_type) %>%
dplyr::count() %>%
dplyr::group_by(cluster) %>%
dplyr::ungroup() %>%
dplyr::mutate(cluster = paste0("Cluster_", cluster))
dplyr<- fullCounts %>%
fullCounts group_by(cell_type) %>%
mutate(cell_type_sum = sum(n)) %>%
mutate(percent = (n * 100) / cell_type_sum)
list2env(split(fullCounts, fullCounts$cluster), envir = .GlobalEnv)
#> <environment: R_GlobalEnv>
Bar plot function
<- function(pdata, i) {
mybarplot ggplot(data = pdata,
aes(
x = cell_type,
y = percent,
fill = cell_type,
alpha = 0.5
+
)) geom_bar(stat = "identity") +
theme_classic(base_size = 12) +
theme(legend.position = "none",
axis.text.x = element_text(
angle = 45,
vjust = 1,
hjust = 1
+
)) ggtitle(paste("Cluster", i, "cell composition")) +
ylab("% cells in cluster") +
xlab("") +
scale_colour_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),values = c(
"20pcO2_r1" = "#DA3C96",
"20pcO2_r2" = "#DA3C96",
"5pcO2_r1" = "#A90065",
"5pcO2_r2" = "#A90065",
"nCT_D5" = "#FFD74D",
"nCT_D10" = "#FFD74D",
"nTE_D2" = "#9BC13C",
"nTE_D3" = "#9BC13C"
)+
) scale_fill_manual(
name = "Replicates",
labels = c(
expression(paste('20% ', 'O'[2], ' rep1')),
expression(paste('20% ', 'O'[2], ' rep2')),
expression(paste('5% ', 'O'[2], ' rep1')),
expression(paste('5% ', 'O'[2], ' rep1')),
"nCT day 5",
"nCT day 10",
"nTE day 3",
"nTE day 5"
),values = c(
"20pcO2_r1" = "#DA3C96",
"20pcO2_r2" = "#DA3C96",
"5pcO2_r1" = "#A90065",
"5pcO2_r2" = "#A90065",
"nCT_D5" = "#FFD74D",
"nCT_D10" = "#FFD74D",
"nTE_D2" = "#9BC13C",
"nTE_D3" = "#9BC13C"
)+
) scale_linetype_manual(values = "blank")
}
Define Colors
Define colors for each cluster so that they are standardized.
<- function(n = 6, h = c(0, 360) + 15) {
ggplotColours if ((diff(h) %% 360) < 1)
2] <- h[2] - 360 / n
h[hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}<- ggplotColours(n = 13) color_list
Violin plot function
<-
grouped_violinPlots function(markersfile,
clusternumber,seuratobject = bapd8.integrated) {
dittoPlotVarsAcrossGroups(
seuratobject,
markersfile,group.by = "new_clusters",
main = paste("Cluster ", clusternumber, " markers expression"),
xlab = "",
ylab = "Mean z-score expression",
x.labels = c(
"Cluster 1",
"Cluster 2",
"Cluster 3",
"Cluster 4",
"Cluster 5",
"Cluster 6",
"Cluster 7",
"Cluster 8",
"Cluster 9",
"Cluster 10",
"Cluster 11",
"Cluster 12",
"Cluster 13"
),vlnplot.lineweight = 0.5,
legend.show = FALSE,
jitter.size = 0.5,
color.panel = color_list
) }
PCE plot function
Load the PCE data
# Vento-Tormo et al., dataset
<-
l load(file = "assets/combine-test-expression1.Rdata")
<- dataset$GRCH38$humanGeneMapping
humanGeneMapping <- dataset$PlacentaDeciduaBloodData
d <- d$expressionData
data <- d$cellDetails
cellDetails
# Xiang et al., dataset
<- readRDS("assets/te.dataset.xiang.rds")
te.dataset.xiang
# Castel et al., dataset
<- readRDS("assets/te.dataset.castel.rds")
te.dataset.castel
# full names for cell types
<-
xi.md read.csv(
"assets/md-xi.tsv",
sep = "\t",
header = TRUE,
row.names = 1
)<-
vt.md read.csv(
"assets/md-vt.tsv",
sep = "\t",
header = TRUE,
row.names = 1
)<-
zp.md read.csv(
"assets/md-zp.tsv",
sep = "\t",
header = TRUE,
row.names = 1
)
<- function(geneList1, geneList2, filename, barcolor) {
runpce <-
expressionData intersect(row.names(data), humanGeneMapping$Gene),]
data[<-
se SummarizedExperiment(
assays = SimpleList(as.matrix(expressionData)),
rowData = row.names(expressionData),
colData = colnames(expressionData)
)<-
cellSpecificGenesExp teGeneRetrieval(se, expressedGeneThreshold = 1)
print(length(geneList1))
<- GeneSet(geneIds = toupper(geneList1))
gs.vt <- teEnrichmentCustom(gs.vt, cellSpecificGenesExp)
output.vt <-
en.output.vt setNames(data.frame(assay(output.vt[[1]]), row.names = rowData(output.vt[[1]])[, 1]),
colData(output.vt[[1]])[, 1])
row.names(cellDetails) <- cellDetails$RName
$Tissue <-
en.output.vtrow.names(en.output.vt), "CellName"]
cellDetails[<- GeneSet(unique(geneList2))
gs <- teEnrichmentCustom(gs, te.dataset.xiang)
output.xi <- teEnrichmentCustom(gs, te.dataset.castel)
output.zp <-
en.output.xi setNames(data.frame(assay(output.xi[[1]]), row.names = rowData(output.xi[[1]])[, 1]),
colData(output.xi[[1]])[, 1])
$Tissue <- rownames(en.output.xi)
en.output.xi<-
en.output.zp setNames(data.frame(assay(output.zp[[1]]), row.names = rowData(output.zp[[1]])[, 1]),
colData(output.zp[[1]])[, 1])
$Tissue <- rownames(en.output.zp)
en.output.zp$source <- "ZP"
en.output.zp<- en.output.zp[order(-en.output.zp$Log10PValue), ]
en.output.zp <-
en.output.zp merge(en.output.zp, zp.md, by = "row.names", all.x = TRUE)
<- rownames_to_column(en.output.zp, var = "Name")
en.output.zp $source <- "VT"
en.output.vt<- en.output.vt[order(-en.output.vt$Log10PValue), ]
en.output.vt <-
en.output.vt merge(en.output.vt, vt.md, by = "row.names", all.x = TRUE)
<- rownames_to_column(en.output.vt, var = "Name")
en.output.vt $source <- "Xi"
en.output.xi<- en.output.xi[order(-en.output.xi$Log10PValue), ]
en.output.xi <-
en.output.xi merge(en.output.xi, xi.md, by = "row.names", all.x = TRUE)
<- rownames_to_column(en.output.xi, var = "Name")
en.output.xi <- rbind(en.output.vt, en.output.xi, en.output.zp)
en.conbined <- 0.05
p <- -log10(p)
logp <- en.conbined %>%
en.conbined mutate(Log10PValue = replace(Log10PValue, Log10PValue < logp, 0))
%>%
en.conbined group_by(source) %>%
arrange(source, desc(Log10PValue)) %>% dplyr::slice(1:7) %>%
%>%
ungroup mutate(
source = as.factor(source),
CellNames = tidytext::reorder_within(CellNames, Log10PValue, source, sep = ":")
%>%
) ggplot(aes(CellNames, Log10PValue)) + geom_bar(stat = 'identity', fill = barcolor) + theme_minimal() +
theme(
axis.text.x = element_text(
vjust = 1,
hjust = 1,
size = 12
),axis.text.y = element_text(size = 12),
plot.margin = margin(10, 10, 10, 100),
legend.position = "none",
plot.title = element_text(
color = "black",
size = 18,
face = "bold.italic"
),axis.title.y = element_blank(),
axis.line.x = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),axis.ticks.x = element_line(
colour = 'black',
size = 1,
linetype = 'solid'
),axis.title.x = element_text(
color = "black",
size = 14,
face = "bold"
)+
) scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks()) +
facet_wrap(~ source, scales = "free", ncol = 3) +
coord_flip()
}
Converting Gene IDs
For PCE analyses we need to covert gene symbols to ENS ids. We need the conversion table, a function to convert marker list and run this function on each cluster markers.
= useMart("ENSEMBL_MART_ENSEMBL")
ensembl listDatasets(ensembl) %>%
filter(str_detect(description, "Human"))
#> dataset description version
#> 1 hsapiens_gene_ensembl Human genes (GRCh38.p13) GRCh38.p13
= useDataset("hsapiens_gene_ensembl", mart = ensembl)
ensembl listFilters(ensembl) %>%
filter(str_detect(name, "ensembl"))
#> name
#> 1 ensembl_gene_id
#> 2 ensembl_gene_id_version
#> 3 ensembl_transcript_id
#> 4 ensembl_transcript_id_version
#> 5 ensembl_peptide_id
#> 6 ensembl_peptide_id_version
#> 7 ensembl_exon_id
#> description
#> 1 Gene stable ID(s) [e.g. ENSG00000000003]
#> 2 Gene stable ID(s) with version [e.g. ENSG00000000003.15]
#> 3 Transcript stable ID(s) [e.g. ENST00000000233]
#> 4 Transcript stable ID(s) with version [e.g. ENST00000000233.10]
#> 5 Protein stable ID(s) [e.g. ENSP00000000233]
#> 6 Protein stable ID(s) with version [e.g. ENSP00000000233.5]
#> 7 Exon ID(s) [e.g. ENSE00000000003]
<- "ensembl_gene_id_version"
filterType <- as.matrix(read.csv('assets/counts-subset-v5.txt', sep = "\t", row.names = "gene.ids"))
cts <- rownames(cts)
filterValues listAttributes(ensembl) %>%
head(20)
#> name description
#> 1 ensembl_gene_id Gene stable ID
#> 2 ensembl_gene_id_version Gene stable ID version
#> 3 ensembl_transcript_id Transcript stable ID
#> 4 ensembl_transcript_id_version Transcript stable ID version
#> 5 ensembl_peptide_id Protein stable ID
#> 6 ensembl_peptide_id_version Protein stable ID version
#> 7 ensembl_exon_id Exon stable ID
#> 8 description Gene description
#> 9 chromosome_name Chromosome/scaffold name
#> 10 start_position Gene start (bp)
#> 11 end_position Gene end (bp)
#> 12 strand Strand
#> 13 band Karyotype band
#> 14 transcript_start Transcript start (bp)
#> 15 transcript_end Transcript end (bp)
#> 16 transcription_start_site Transcription start site (TSS)
#> 17 transcript_length Transcript length (including UTRs and CDS)
#> 18 transcript_tsl Transcript support level (TSL)
#> 19 transcript_gencode_basic GENCODE basic annotation
#> 20 transcript_appris APPRIS annotation
#> page
#> 1 feature_page
#> 2 feature_page
#> 3 feature_page
#> 4 feature_page
#> 5 feature_page
#> 6 feature_page
#> 7 feature_page
#> 8 feature_page
#> 9 feature_page
#> 10 feature_page
#> 11 feature_page
#> 12 feature_page
#> 13 feature_page
#> 14 feature_page
#> 15 feature_page
#> 16 feature_page
#> 17 feature_page
#> 18 feature_page
#> 19 feature_page
#> 20 feature_page
<- c('ensembl_gene_id_version',
attributeNames 'ensembl_gene_id',
'external_gene_name')
<- getBM(
annot attributes = attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl
)<- duplicated(annot$ensembl_gene_id)
isDup <- annot$ensembl_gene_id[isDup]
dup <- annot[!annot$ensembl_gene_id %in% dup, ] #this object will be saved and used later annot
<- function(genesymbols) {
symbols.to.ensembl <- paste0(deparse(substitute(genesymbols)), ".ensids")
newlist <- annot[annot$external_gene_name %in% genesymbols,]
temp <- temp[c(2,3)]
temp assign(newlist, temp, envir = .GlobalEnv)
}
symbols.to.ensembl(cluster.marker.names.1)
symbols.to.ensembl(cluster.marker.names.2)
symbols.to.ensembl(cluster.marker.names.3)
symbols.to.ensembl(cluster.marker.names.4)
symbols.to.ensembl(cluster.marker.names.5)
symbols.to.ensembl(cluster.marker.names.6)
symbols.to.ensembl(cluster.marker.names.7)
symbols.to.ensembl(cluster.marker.names.8)
symbols.to.ensembl(cluster.marker.names.9)
symbols.to.ensembl(cluster.marker.names.10)
symbols.to.ensembl(cluster.marker.names.11)
symbols.to.ensembl(cluster.marker.names.12)
symbols.to.ensembl(cluster.marker.names.13)
Run analyses on each cluster
Cluster 1
<- runpce(cluster.marker.names.1.ensids$ensembl_gene_id, cluster.marker.names.1, "Cluster_1_allPCE", color_list[1])
pce #> [1] 54
<- mybarplot(Cluster_1, 1)
count <- grouped_violinPlots(cluster.marker.names.1, 1)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 2
<- runpce(cluster.marker.names.2.ensids$ensembl_gene_id, cluster.marker.names.2, "Cluster_2_allPCE", color_list[2])
pce #> [1] 25
<- mybarplot(Cluster_2, 2)
count <- grouped_violinPlots(cluster.marker.names.2, 2)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 3
<- runpce(cluster.marker.names.3.ensids$ensembl_gene_id, cluster.marker.names.3, "Cluster_3_allPCE", color_list[3])
pce #> [1] 169
<- mybarplot(Cluster_3, 3)
count <- grouped_violinPlots(cluster.marker.names.3, 3)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 4
<- runpce(cluster.marker.names.4.ensids$ensembl_gene_id, cluster.marker.names.4, "Cluster_4_allPCE", color_list[4])
pce #> [1] 163
<- mybarplot(Cluster_4, 4)
count <- grouped_violinPlots(cluster.marker.names.4, 4)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 5
<- runpce(cluster.marker.names.5.ensids$ensembl_gene_id, cluster.marker.names.5, "Cluster_5_allPCE", color_list[5])
pce #> [1] 28
<- mybarplot(Cluster_5, 5)
count <- grouped_violinPlots(cluster.marker.names.5, 5)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 6
<- runpce(cluster.marker.names.6.ensids$ensembl_gene_id, cluster.marker.names.6, "Cluster_6_allPCE", color_list[6])
pce #> [1] 260
<- mybarplot(Cluster_6, 6)
count <- grouped_violinPlots(cluster.marker.names.6, 6)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
) panel_plot
Cluster 7
<- runpce(cluster.marker.names.7.ensids$ensembl_gene_id, cluster.marker.names.7, "Cluster_7_allPCE", color_list[7])
pce #> [1] 35
<- mybarplot(Cluster_7, 7)
count <- grouped_violinPlots(cluster.marker.names.7, 7)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 8
<- runpce(cluster.marker.names.8.ensids$ensembl_gene_id, cluster.marker.names.8, "Cluster_8_allPCE", color_list[8])
pce #> [1] 227
<- mybarplot(Cluster_8, 8)
count <- grouped_violinPlots(cluster.marker.names.8, 8)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 9
<- runpce(cluster.marker.names.9.ensids$ensembl_gene_id, cluster.marker.names.9, "Cluster_9_allPCE", color_list[9])
pce #> [1] 218
<- mybarplot(Cluster_9, 9)
count <- grouped_violinPlots(cluster.marker.names.9, 9)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 10
<- runpce(cluster.marker.names.10.ensids$ensembl_gene_id, cluster.marker.names.10, "Cluster_10_allPCE", color_list[10])
pce #> [1] 212
<- mybarplot(Cluster_10, 10)
count <- grouped_violinPlots(cluster.marker.names.10, 10)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1.3)
) panel_plot
Cluster 11
<- runpce(cluster.marker.names.11.ensids$ensembl_gene_id, cluster.marker.names.11, "Cluster_11_allPCE", color_list[11])
pce #> [1] 18
<- mybarplot(Cluster_11, 11)
count <- grouped_violinPlots(cluster.marker.names.11, 11)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 12
<- runpce(cluster.marker.names.12.ensids$ensembl_gene_id, cluster.marker.names.12, "Cluster_12_allPCE", color_list[12])
pce #> [1] 98
<- mybarplot(Cluster_12, 12)
count <- grouped_violinPlots(cluster.marker.names.12, 12)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Cluster 13
<- runpce(cluster.marker.names.13.ensids$ensembl_gene_id, cluster.marker.names.13, "Cluster_13_allPCE", color_list[13])
pce #> [1] 408
<- mybarplot(Cluster_13, 13)
count <- grouped_violinPlots(cluster.marker.names.13, 13)
violin <-
toprow plot_grid(count, violin, labels = c('A', 'B'), align = 'h')
<-
panel_plot plot_grid(
toprow,
pce,axis = "r",
labels = c('', 'C'),
ncol = 1,
rel_heights = c(1, 1),
rel_widths = c(1, 1.3)
) panel_plot
Session Information
sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19044)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] grid stats4 parallel stats graphics grDevices utils
#> [8] datasets methods base
#>
#> other attached packages:
#> [1] biomaRt_2.46.3 R.utils_2.10.1
#> [3] R.oo_1.24.0 R.methodsS3_1.8.1
#> [5] plotly_4.9.3 scales_1.1.1
#> [7] pheatmap_1.0.12 RColorBrewer_1.1-2
#> [9] ComplexHeatmap_2.6.2 dittoSeq_1.2.6
#> [11] ggrepel_0.9.1 calibrate_1.7.7
#> [13] MASS_7.3-54 enhancedDimPlot_0.0.0.9100
#> [15] forcats_0.5.1 purrr_0.3.4
#> [17] readr_2.1.1 tidyr_1.1.4
#> [19] tibble_3.1.1 tidyverse_1.3.1
#> [21] gprofiler2_0.2.0 TissueEnrich_1.10.1
#> [23] GSEABase_1.52.1 graph_1.68.0
#> [25] annotate_1.68.0 XML_3.99-0.6
#> [27] AnnotationDbi_1.52.0 SummarizedExperiment_1.20.0
#> [29] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
#> [31] IRanges_2.24.1 S4Vectors_0.28.1
#> [33] MatrixGenerics_1.2.1 matrixStats_0.58.0
#> [35] ensurer_1.1 stringr_1.4.0
#> [37] dplyr_1.0.7 gridExtra_2.3
#> [39] multtest_2.46.0 Biobase_2.50.0
#> [41] BiocGenerics_0.36.1 metap_1.4
#> [43] patchwork_1.1.1 cowplot_1.1.1
#> [45] ggplot2_3.3.5 kableExtra_1.3.4
#> [47] knitr_1.37 SeuratWrappers_0.3.0
#> [49] SeuratObject_4.0.1 Seurat_4.0.2
#>
#> loaded via a namespace (and not attached):
#> [1] rappdirs_0.3.3 SnowballC_0.7.0
#> [3] scattermore_0.7 bit64_4.0.5
#> [5] irlba_2.3.3 multcomp_1.4-17
#> [7] DelayedArray_0.16.3 data.table_1.14.0
#> [9] rpart_4.1-15 RCurl_1.98-1.3
#> [11] generics_0.1.1 TH.data_1.0-10
#> [13] RSQLite_2.2.9 RANN_2.6.1
#> [15] future_1.21.0 tokenizers_0.2.1
#> [17] bit_4.0.4 tzdb_0.2.0
#> [19] mutoss_0.1-12 spatstat.data_2.1-0
#> [21] webshot_0.5.2 xml2_1.3.3
#> [23] lubridate_1.8.0 httpuv_1.6.5
#> [25] assertthat_0.2.1 xfun_0.29
#> [27] hms_1.1.1 jquerylib_0.1.4
#> [29] evaluate_0.14 promises_1.2.0.1
#> [31] progress_1.2.2 fansi_0.4.2
#> [33] dbplyr_2.1.1 readxl_1.3.1
#> [35] igraph_1.2.6 DBI_1.1.2
#> [37] tmvnsim_1.0-2 htmlwidgets_1.5.3
#> [39] spatstat.geom_2.3-1 paletteer_1.3.0
#> [41] ellipsis_0.3.2 crosstalk_1.1.1
#> [43] RSpectra_0.16-0 backports_1.2.1
#> [45] bookdown_0.24 deldir_1.0-6
#> [47] vctrs_0.3.8 SingleCellExperiment_1.12.0
#> [49] remotes_2.3.0 Cairo_1.5-12.2
#> [51] ROCR_1.0-11 abind_1.4-5
#> [53] cachem_1.0.5 withr_2.4.3
#> [55] sctransform_0.3.2 prettyunits_1.1.1
#> [57] goftest_1.2-2 mnormt_2.0.2
#> [59] svglite_2.0.0 cluster_2.1.2
#> [61] lazyeval_0.2.2 crayon_1.4.2
#> [63] labeling_0.4.2 edgeR_3.32.1
#> [65] pkgconfig_2.0.3 nlme_3.1-152
#> [67] rlang_0.4.11 globals_0.14.0
#> [69] lifecycle_1.0.1 miniUI_0.1.1.1
#> [71] sandwich_3.0-1 BiocFileCache_1.14.0
#> [73] mathjaxr_1.4-0 modelr_0.1.8
#> [75] rsvd_1.0.5
#> [ reached getOption("max.print") -- omitted 96 entries ]