Section 3: PCA plots
This section uses the count data (all datasets) generated in Section 1 for cluster analyses. Briefly, the count data are imported in R, batch corrected using ComBat_seq
, vst
transformation and clustering is performed using DESeq2
, and results are visualized as PCA plots.
Prerequisites
R packages required for this section are loaded.
setwd("~/github/BAPvsTrophoblast_Amnion")
library(sva)
library(tidyverse)
library(DESeq2)
library(vsn)
library(pheatmap)
library(ggrepel)
library(RColorBrewer)
library(plotly)
library(PCAtools)
library(scales)
library(htmlwidgets)
library(factoextra)
library(spatstat.core)
PCA plots for all datasets
Import datasets
The counts
data and its associated metadata (coldata
) are imported for analyses.
= 'assets/counts-pca-v2.txt'
counts = 'assets/batch-pca-v2.txt'
groupFile <-
coldata read.csv(
groupFile,row.names = 1,
sep = "\t",
stringsAsFactors = TRUE
)<- as.matrix(read.csv(counts, sep = "\t", row.names = "gene.ids")) cts
Inspect the coldata
.
::datatable(coldata) DT
Reorder columns of cts
according to coldata
rows. Check if samples in both files match.
colnames(cts)
#> [1] "BT_EVT_Okae.1" "BT_SCT_Okae.1" "BT_TSC_Okae.1" "BT_EVT_Okae.2"
#> [5] "BT_SCT_Okae.2" "BT_TSC_Okae.2" "CT_EVT_Okae.1" "CT_SCT_Okae.1"
#> [9] "CT_TSC_Okae.1" "CT_EVT_Okae.2" "CT_SCT_Okae.2" "CT_TSC_Okae.2"
#> [13] "CT_EVT_Okae.3" "CT_SCT_Okae.3" "CT_TSC_Okae.3" "n_H9.1"
#> [17] "n_H9.2" "nTE_D1.1" "nTE_D1.2" "nTE_D2.1"
#> [21] "nTE_D2.2" "nTE_D3.1" "nTE_D3.2" "nCT_P3.1"
#> [25] "nCT_P3.2" "nCT_P10.1" "nCT_P10.2" "nCT_P15.1"
#> [29] "nCT_P15.2" "cR_nCT_P15.1" "cR_nCT_P15.2" "nCT_P15_iPSC.1"
#> [33] "nCT_P15_iPSC.2" "CT_Okae.1" "CT_Okae.2" "nST.1"
#> [37] "nST.2" "nEVT.1" "nEVT.2" "pH9.1"
#> [41] "pH9.2" "pBAP_D1.1" "pBAP_D1.2" "pBAP_D2.1"
#> [45] "pBAP_D2.2" "pBAP_D3.1" "pBAP_D3.2" "CT_7wk.1"
#> [49] "CT_7wk.2" "CT_9wk.1" "CT_11wk.1" "amnion_18w.1"
#> [53] "amnion_9w.1" "amnion_18w.2" "amnion_16w.2" "amnion_22w.1"
#> [57] "amnion_9w.2" "amnion_22w.2" "H1_gt70_D8_BAP.1" "H1_gt70_D8_BAP.2"
#> [61] "H1_gt70_D8_BAP.3" "H1_lt40_D8_BAP.1" "H1_lt40_D8_BAP.2" "H1_lt40_D8_BAP.3"
#> [65] "H1_Yabe.1" "H1_Yabe.2" "H1_Yabe.3" "amnion_Term.1"
#> [69] "amnion_Term.2" "amnion_Term.3" "amnion_Term.4" "amnion_Term.5"
#> [73] "amnion_Term.6" "amnion_Term.7" "amnion_Term.8"
#> [ reached getOption("max.print") -- omitted 22 entries ]
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
<- cts[, rownames(coldata)] cts
Batch correction
Using Combat_seq
(SVA package) run batch correction - using bioproject IDs as variable (dataset origin).
<- as.factor(coldata$authors)
cov1 <- ComBat_seq(cts, batch = cov1, group = NULL)
adjusted_counts #> Found 7 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
<- cts[, rownames(coldata)] cts
Run DESeq2
The batch corrected read counts are then used for running DESeq2 analyses.
<- DESeqDataSetFromMatrix(countData = adjusted_counts,
dds colData = coldata,
design = ~ group)
Transformation:
<- assay(vst(dds))
vst <- vst(dds, blind = FALSE)
vsd <-
pcaData plotPCA(vsd,
intgroup = c("group", "authors"),
returnData = TRUE)
<- round(100 * attr(pcaData, "percentVar")) percentVar
3D PCA plot function:
# @ Thomas W. Battaglia
#' Plot DESeq2's PCA plotting with Plotly 3D scatterplot
#'
#' The function will generate a plot_ly 3D scatter plot image for
#' a 3D exploration of the PCA.
#'
#' @param object a DESeqTransform object, with data in assay(x), produced for example by either rlog or varianceStabilizingTransformation.
#' @param intgroup interesting groups: a character vector of names in colData(x) to use for grouping
#' @param ntop number of top genes to use for principal components, selected by highest row variance
#' @param returnData should the function only return the data.frame of PC1, PC2 and PC3 with intgroup covariates for custom plotting (default is FALSE)
#' @return An object created by plot_ly, which can be assigned and further customized.
#' @export
<-
plotPCA3D function (object,
intgroup = "condition",
ntop = 500,
returnData = FALSE) {
<- rowVars(assay(object))
rv <-
select order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
<- prcomp(t(assay(object)[select,]))
pca <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
percentVar if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}<-
intgroup.df as.data.frame(colData(object)[, intgroup, drop = FALSE])
<- if (length(intgroup) > 1) {
group factor(apply(intgroup.df, 1, paste, collapse = " : "))
}else {
colData(object)[[intgroup]]
}<- data.frame(
d PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
PC3 = pca$x[, 3],
group = group,
intgroup.df,name = colnames(object)
)if (returnData) {
attr(d, "percentVar") <- percentVar[1:3]
return(d)
}message("Generating plotly plot")
<- plotly::plot_ly(
p data = d,
x = ~ PC1,
y = ~ PC2,
z = ~ PC3,
color = group,
mode = "markers",
type = "scatter3d"
)return(p)
}
Process data for PCA (all samples)
PCA plot for the dataset that includes all libraries.
<- rowVars(assay(vsd))
rv <-
select order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
<- prcomp(t(assay(vsd)[select, ]))
pca <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
percentVar = c("group", "authors")
intgroup <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
intgroup.df <- if (length(intgroup) > 1) {
group factor(apply(intgroup.df, 1, paste, collapse = " : "))
}<- data.frame(
d PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
PC3 = pca$x[, 3],
PC4 = pca$x[, 4],
PC5 = pca$x[, 5],
group = group,
intgroup.df,name = colnames(vsd)
)
Components 1 and 2
<- ggplot(d, aes(PC1, PC2, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))
ggplotly(g)
Components 1 and 3
<- ggplot(d, aes(PC1, PC3, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)
Components 2 and 3
<- ggplot(d, aes(PC2, PC3, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC2", round(percentVar[2] * 100, 2), "% variance")) +
ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)
Scree plot
= data.frame(percentVar)
scree_plot 2] <- c(1:97)
scree_plot[, colnames(scree_plot) <- c("variance", "component_number")
<-
g ggplot(scree_plot, aes(component_number, variance * 100)) +
geom_bar(stat = 'identity', fill = "slateblue") +
theme_minimal() +
theme(
axis.text.x = element_text(
vjust = 1,
hjust = 1,
size = 12
),axis.text.y = element_text(size = 12),
plot.margin = margin(10, 10, 10, 100),
legend.position = "none",
plot.title = element_text(color = "black", size = 18, face = "bold.italic"),
axis.title.y = element_text(color = "black", size = 14, face = "bold"),
axis.line.x = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),axis.line.y = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),axis.ticks.x = element_line(
colour = 'black',
size = 1,
linetype = 'solid'
),axis.title.x = element_text(color = "black", size = 14, face = "bold")
+
) xlab("Components") +
ylab("Percent Variance") + ggtitle("All samples")
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks())
#> <ScaleContinuousPosition>
#> Range:
#> Limits: 0 -- 1
ggplotly(g)
Euclidean Distance
<-
pca prcomp(t(assay(vsd)[select,]),
center = TRUE,
scale. = FALSE,
rank. = 3)
<- pca$x
results <- get_dist(results, method = "euclidean")
distance write.table(
as.matrix(distance),
"assets/all_samples-PC1-PC2-PC3.tsv",
row.names = T,
qmethod = "double",
sep = "\t"
)fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
<- prcomp(t(assay(vsd)[select,]), center = TRUE, scale. = FALSE)
pca <- pca$x
results <- get_dist(results, method = "euclidean")
distance write.table(
as.matrix(distance),
"assets/all_samples-allPC.tsv",
row.names = T,
qmethod = "double",
sep = "\t"
)fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
PCA plot for differentiated cell libraries (no amnion datasets)
Import datasets
The counts
data and its associated metadata (coldata
) are imported for analyses.
setwd("~/github/BAPvsTrophoblast_Amnion")
= 'assets/counts-pca-v2.2.txt'
counts2 = 'assets/batch-pca-v2.2.txt'
groupFile <-
coldata2 read.csv(
groupFile,row.names = 1,
sep = "\t",
stringsAsFactors = TRUE
)<-
cts2 as.matrix(read.csv(counts2, sep = "\t", row.names = "gene.ids"))
Inspect the coldata
.
::datatable(coldata2) DT
Reorder columns of cts
according to coldata
rows. Check if samples in both files match.
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
<- cts2[, rownames(coldata2)] cts2
Batch correction
Using ComBat_seq
(SVA package) run batch correction - using bioproject IDs as variable (dataset origin).
<- as.factor(coldata2$authors)
cov1 <- ComBat_seq(cts2, batch = cov1, group = NULL)
adjusted_counts #> Found 5 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
<- cts2[, rownames(coldata2)] cts2
Run DESeq2
The batch corrected read counts are then used for running DESeq2 analyses.
<- DESeqDataSetFromMatrix(countData = adjusted_counts,
dds colData = coldata2,
design = ~ group)
Transformation
<- vst(dds, blind = FALSE)
vsd <-
pcaData plotPCA(vsd,
intgroup = c("group", "authors"),
returnData = TRUE)
<- round(100 * attr(pcaData, "percentVar")) percentVar
Process data for PCA (differenticated)
<- rowVars(assay(vsd))
rv <-
select order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
<- prcomp(t(assay(vsd)[select,]))
pca <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
percentVar = c("group", "authors")
intgroup <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
intgroup.df <- if (length(intgroup) > 1) {
group factor(apply(intgroup.df, 1, paste, collapse = " : "))
}<- data.frame(
d PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
PC3 = pca$x[, 3],
PC4 = pca$x[, 4],
PC5 = pca$x[, 5],
group = group,
intgroup.df,name = colnames(vsd)
)
Components 1 and 2
<- ggplot(d, aes(PC1, PC2, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))
ggplotly(g)
Components 1 and 3
<- ggplot(d, aes(PC1, PC3, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)
Components 2 and 3
<- ggplot(d, aes(PC2, PC3, color = group.1, shape = authors)) +
g scale_shape_manual(values = 1:8) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC2", round(percentVar[2] * 100, 2), "% variance")) +
ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)
Scree plot
= data.frame(percentVar)
scree_plot = length(scree_plot$percentVar)
end 2] <- c(1:end)
scree_plot[, colnames(scree_plot) <- c("variance", "component_number")
<- ggplot(scree_plot, aes(component_number, variance * 100)) +
g geom_bar(stat = 'identity', fill = "slateblue") +
theme_minimal() +
theme(
axis.text.x = element_text(
vjust = 1,
hjust = 1,
size = 12
),axis.text.y = element_text(size = 12),
plot.margin = margin(10, 10, 10, 100),
legend.position = "none",
plot.title = element_text(color = "black", size = 18, face = "bold.italic"),
axis.title.y = element_text(color = "black", size = 14, face = "bold"),
axis.line.x = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),axis.line.y = element_line(
colour = 'black',
size = 0.5,
linetype = 'solid'
),axis.ticks.x = element_line(
colour = 'black',
size = 1,
linetype = 'solid'
),axis.title.x = element_text(color = "black", size = 14, face = "bold")
+
) xlab("Components") +
ylab("Percent Variance") + ggtitle("differentiated samples")
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks())
#> <ScaleContinuousPosition>
#> Range:
#> Limits: 0 -- 1
ggplotly(g)
Euclidean Distance
<-
pca prcomp(t(assay(vsd)[select,]),
center = TRUE,
scale. = FALSE,
rank. = 3)
<- pca$x
results <- get_dist(results, method = "euclidean")
distance write.table(
as.matrix(distance),
"assets/diff_samples-PC1-PC2-PC3.tsv",
row.names = T,
qmethod = "double",
sep = "\t"
)fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
<- prcomp(t(assay(vsd)[select,]), center = TRUE, scale. = FALSE)
pca <- pca$x
results <- get_dist(results, method = "euclidean")
distance write.table(
as.matrix(distance),
"assets/diff_samples-allPC.tsv",
row.names = T,
qmethod = "double",
sep = "\t"
)fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
Interactive 3D PCA plot
<- plotPCA3D(vsd, intgroup = c("group", "authors"))
g saveWidget(g, file = "PCA_full.html")
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors
<- plotPCA3D(vsd, intgroup = c("group", "authors"))
g saveWidget(g, file = "PCA_dif.html")
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors
Session Information
sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19044)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] spatstat.core_2.3-2 rpart_4.1-15
#> [3] spatstat.geom_2.3-1 spatstat.data_2.1-0
#> [5] factoextra_1.0.7.999 htmlwidgets_1.5.3
#> [7] scales_1.1.1 PCAtools_2.5.15
#> [9] plotly_4.9.3 RColorBrewer_1.1-2
#> [11] ggrepel_0.9.1 pheatmap_1.0.12
#> [13] vsn_3.58.0 DESeq2_1.30.1
#> [15] SummarizedExperiment_1.20.0 Biobase_2.50.0
#> [17] MatrixGenerics_1.2.1 matrixStats_0.58.0
#> [19] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
#> [21] IRanges_2.24.1 S4Vectors_0.28.1
#> [23] BiocGenerics_0.36.1 forcats_0.5.1
#> [25] stringr_1.4.0 dplyr_1.0.7
#> [27] purrr_0.3.4 readr_2.1.1
#> [29] tidyr_1.1.4 tibble_3.1.1
#> [31] ggplot2_3.3.5 tidyverse_1.3.1
#> [33] sva_3.38.0 BiocParallel_1.24.1
#> [35] genefilter_1.72.1 mgcv_1.8-35
#> [37] nlme_3.1-152
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.3.1 backports_1.2.1 plyr_1.8.6
#> [4] lazyeval_0.2.2 splines_4.0.5 crosstalk_1.1.1
#> [7] digest_0.6.27 htmltools_0.5.2 fansi_0.4.2
#> [10] magrittr_2.0.1 memoise_2.0.1 tensor_1.5
#> [13] tzdb_0.2.0 limma_3.46.0 annotate_1.68.0
#> [16] modelr_0.1.8 spatstat.sparse_2.0-0 rmdformats_1.0.3
#> [19] colorspace_2.0-1 blob_1.2.2 rvest_1.0.0
#> [22] haven_2.4.1 xfun_0.29 crayon_1.4.2
#> [25] RCurl_1.98-1.3 jsonlite_1.7.3 survival_3.2-11
#> [28] glue_1.4.2 polyclip_1.10-0 gtable_0.3.0
#> [31] zlibbioc_1.36.0 XVector_0.30.0 DelayedArray_0.16.3
#> [34] BiocSingular_1.6.0 abind_1.4-5 DBI_1.1.2
#> [37] edgeR_3.32.1 Rcpp_1.0.8 viridisLite_0.4.0
#> [40] xtable_1.8-4 dqrng_0.3.0 bit_4.0.4
#> [43] rsvd_1.0.5 preprocessCore_1.52.1 DT_0.18
#> [46] httr_1.4.2 ellipsis_0.3.2 farver_2.1.0
#> [49] pkgconfig_2.0.3 XML_3.99-0.6 sass_0.4.0
#> [52] dbplyr_2.1.1 deldir_1.0-6 locfit_1.5-9.4
#> [55] utf8_1.2.1 labeling_0.4.2 tidyselect_1.1.1
#> [58] rlang_0.4.11 reshape2_1.4.4 AnnotationDbi_1.52.0
#> [61] munsell_0.5.0 cellranger_1.1.0 tools_4.0.5
#> [64] cachem_1.0.5 cli_3.1.1 generics_0.1.1
#> [67] RSQLite_2.2.9 broom_0.7.6 evaluate_0.14
#> [70] fastmap_1.1.0 yaml_2.2.1 goftest_1.2-2
#> [73] knitr_1.37 bit64_4.0.5 fs_1.5.0
#> [ reached getOption("max.print") -- omitted 34 entries ]