Section 3: PCA plots

This section uses the count data (all datasets) generated in Section 1 for cluster analyses. Briefly, the count data are imported in R, batch corrected using ComBat_seq, vst transformation and clustering is performed using DESeq2, and results are visualized as PCA plots.

Prerequisites

R packages required for this section are loaded.

setwd("~/github/BAPvsTrophoblast_Amnion")
library(sva)
library(tidyverse)
library(DESeq2)
library(vsn)
library(pheatmap)
library(ggrepel)
library(RColorBrewer)
library(plotly)
library(PCAtools)
library(scales)
library(htmlwidgets)
library(factoextra)
library(spatstat.core)

PCA plots for all datasets

Import datasets

The counts data and its associated metadata (coldata) are imported for analyses.

counts = 'assets/counts-pca-v2.txt'
groupFile = 'assets/batch-pca-v2.txt'
coldata <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts <- as.matrix(read.csv(counts, sep = "\t", row.names = "gene.ids"))

Inspect the coldata.

DT::datatable(coldata)

Reorder columns of cts according to coldata rows. Check if samples in both files match.

colnames(cts)
#>  [1] "BT_EVT_Okae.1"    "BT_SCT_Okae.1"    "BT_TSC_Okae.1"    "BT_EVT_Okae.2"   
#>  [5] "BT_SCT_Okae.2"    "BT_TSC_Okae.2"    "CT_EVT_Okae.1"    "CT_SCT_Okae.1"   
#>  [9] "CT_TSC_Okae.1"    "CT_EVT_Okae.2"    "CT_SCT_Okae.2"    "CT_TSC_Okae.2"   
#> [13] "CT_EVT_Okae.3"    "CT_SCT_Okae.3"    "CT_TSC_Okae.3"    "n_H9.1"          
#> [17] "n_H9.2"           "nTE_D1.1"         "nTE_D1.2"         "nTE_D2.1"        
#> [21] "nTE_D2.2"         "nTE_D3.1"         "nTE_D3.2"         "nCT_P3.1"        
#> [25] "nCT_P3.2"         "nCT_P10.1"        "nCT_P10.2"        "nCT_P15.1"       
#> [29] "nCT_P15.2"        "cR_nCT_P15.1"     "cR_nCT_P15.2"     "nCT_P15_iPSC.1"  
#> [33] "nCT_P15_iPSC.2"   "CT_Okae.1"        "CT_Okae.2"        "nST.1"           
#> [37] "nST.2"            "nEVT.1"           "nEVT.2"           "pH9.1"           
#> [41] "pH9.2"            "pBAP_D1.1"        "pBAP_D1.2"        "pBAP_D2.1"       
#> [45] "pBAP_D2.2"        "pBAP_D3.1"        "pBAP_D3.2"        "CT_7wk.1"        
#> [49] "CT_7wk.2"         "CT_9wk.1"         "CT_11wk.1"        "amnion_18w.1"    
#> [53] "amnion_9w.1"      "amnion_18w.2"     "amnion_16w.2"     "amnion_22w.1"    
#> [57] "amnion_9w.2"      "amnion_22w.2"     "H1_gt70_D8_BAP.1" "H1_gt70_D8_BAP.2"
#> [61] "H1_gt70_D8_BAP.3" "H1_lt40_D8_BAP.1" "H1_lt40_D8_BAP.2" "H1_lt40_D8_BAP.3"
#> [65] "H1_Yabe.1"        "H1_Yabe.2"        "H1_Yabe.3"        "amnion_Term.1"   
#> [69] "amnion_Term.2"    "amnion_Term.3"    "amnion_Term.4"    "amnion_Term.5"   
#> [73] "amnion_Term.6"    "amnion_Term.7"    "amnion_Term.8"   
#>  [ reached getOption("max.print") -- omitted 22 entries ]
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]

Batch correction

Using Combat_seq (SVA package) run batch correction - using bioproject IDs as variable (dataset origin).

cov1 <- as.factor(coldata$authors)
adjusted_counts <- ComBat_seq(cts, batch = cov1, group = NULL)
#> Found 7 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]

Run DESeq2

The batch corrected read counts are then used for running DESeq2 analyses.

dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata,
                              design = ~ group)

Transformation:

vst <- assay(vst(dds))
vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

3D PCA plot function:

# @ Thomas W. Battaglia

#' Plot DESeq2's PCA plotting with Plotly 3D scatterplot
#'
#' The function will generate a plot_ly 3D scatter plot image for
#' a 3D exploration of the PCA.
#'
#' @param object a DESeqTransform object, with data in assay(x), produced for example by either rlog or varianceStabilizingTransformation.
#' @param intgroup interesting groups: a character vector of names in colData(x) to use for grouping
#' @param ntop number of top genes to use for principal components, selected by highest row variance
#' @param returnData should the function only return the data.frame of PC1, PC2 and PC3 with intgroup covariates for custom plotting (default is FALSE)
#' @return An object created by plot_ly, which can be assigned and further customized.
#' @export
plotPCA3D <-
  function (object,
            intgroup = "condition",
            ntop = 500,
            returnData = FALSE) {
    rv <- rowVars(assay(object))
    select <-
      order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    pca <- prcomp(t(assay(object)[select,]))
    percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
    if (!all(intgroup %in% names(colData(object)))) {
      stop("the argument 'intgroup' should specify columns of colData(dds)")
    }
    intgroup.df <-
      as.data.frame(colData(object)[, intgroup, drop = FALSE])
    group <- if (length(intgroup) > 1) {
      factor(apply(intgroup.df, 1, paste, collapse = " : "))
    }
    else {
      colData(object)[[intgroup]]
    }
    d <- data.frame(
      PC1 = pca$x[, 1],
      PC2 = pca$x[, 2],
      PC3 = pca$x[, 3],
      group = group,
      intgroup.df,
      name = colnames(object)
    )
    if (returnData) {
      attr(d, "percentVar") <- percentVar[1:3]
      return(d)
    }
    message("Generating plotly plot")
    p <- plotly::plot_ly(
      data = d,
      x = ~ PC1,
      y = ~ PC2,
      z = ~ PC3,
      color = group,
      mode = "markers",
      type = "scatter3d"
    )
    return(p)
  }

Process data for PCA (all samples)

PCA plot for the dataset that includes all libraries.

rv <- rowVars(assay(vsd))
select <-
  order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
intgroup = c("group", "authors")
intgroup.df <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
group <- if (length(intgroup) > 1) {
  factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
d <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  PC3 = pca$x[, 3],
  PC4 = pca$x[, 4],
  PC5 = pca$x[, 5],
  group = group,
  intgroup.df,
  name = colnames(vsd)
)

Components 1 and 2

g <- ggplot(d, aes(PC1, PC2, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.1: PCA plots (all samples) - PC1 & PC2

Components 1 and 3

g <- ggplot(d, aes(PC1, PC3, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
  ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.2: PCA plots (all samples) - PC1 and PC3

Components 2 and 3

g <- ggplot(d, aes(PC2, PC3, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC2", round(percentVar[2] * 100, 2), "% variance")) +
  ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.3: PCA plots (all samples) - PC2 and PC3

Scree plot

scree_plot = data.frame(percentVar)
scree_plot[, 2] <- c(1:97)
colnames(scree_plot) <- c("variance", "component_number")
g <-
  ggplot(scree_plot, aes(component_number, variance * 100)) +
  geom_bar(stat = 'identity', fill = "slateblue") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(
      vjust = 1,
      hjust = 1,
      size = 12
    ),
    axis.text.y = element_text(size = 12),
    plot.margin = margin(10, 10, 10, 100),
    legend.position = "none",
    plot.title = element_text(color = "black", size = 18, face = "bold.italic"),
    axis.title.y = element_text(color = "black", size = 14, face = "bold"),
    axis.line.x = element_line(
      colour = 'black',
      size = 0.5,
      linetype = 'solid'
    ),
    axis.line.y = element_line(
      colour = 'black',
      size = 0.5,
      linetype = 'solid'
    ),
    axis.ticks.x = element_line(
      colour = 'black',
      size = 1,
      linetype = 'solid'
    ),
    axis.title.x = element_text(color = "black", size = 14, face = "bold")
  ) +
  xlab("Components") +
  ylab("Percent Variance") + ggtitle("All samples")
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks())
#> <ScaleContinuousPosition>
#>  Range:  
#>  Limits:    0 --    1
ggplotly(g)

Fig 3.4: Scree plot showing variance contributed by each component

Euclidean Distance

pca <-
  prcomp(t(assay(vsd)[select,]),
         center = TRUE,
         scale. = FALSE,
         rank. = 3)
results <- pca$x
distance <- get_dist(results, method = "euclidean")
write.table(
  as.matrix(distance),
  "assets/all_samples-PC1-PC2-PC3.tsv",
  row.names = T,
  qmethod = "double",
  sep = "\t"
)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Fig 3.5: Heatmap of Euclidean Distance using first 3 components (all samples)

pca <- prcomp(t(assay(vsd)[select,]), center = TRUE, scale. = FALSE)
results <- pca$x
distance <- get_dist(results, method = "euclidean")
write.table(
  as.matrix(distance),
  "assets/all_samples-allPC.tsv",
  row.names = T,
  qmethod = "double",
  sep = "\t"
)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Fig 3.6: Heatmap of Euclidean Distance using all components (all samples)

PCA plot for differentiated cell libraries (no amnion datasets)

Import datasets

The counts data and its associated metadata (coldata) are imported for analyses.

setwd("~/github/BAPvsTrophoblast_Amnion")
counts2 = 'assets/counts-pca-v2.2.txt'
groupFile = 'assets/batch-pca-v2.2.txt'
coldata2 <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts2 <-
  as.matrix(read.csv(counts2, sep = "\t", row.names = "gene.ids"))

Inspect the coldata.

DT::datatable(coldata2)

Reorder columns of cts according to coldata rows. Check if samples in both files match.

all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

Batch correction

Using ComBat_seq (SVA package) run batch correction - using bioproject IDs as variable (dataset origin).

cov1 <- as.factor(coldata2$authors)
adjusted_counts <- ComBat_seq(cts2, batch = cov1, group = NULL)
#> Found 5 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

Run DESeq2

The batch corrected read counts are then used for running DESeq2 analyses.

dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata2,
                              design = ~ group)

Transformation

vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

Process data for PCA (differenticated)

rv <- rowVars(assay(vsd))
select <-
  order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(vsd)[select,]))
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
intgroup = c("group", "authors")
intgroup.df <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
group <- if (length(intgroup) > 1) {
  factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
d <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  PC3 = pca$x[, 3],
  PC4 = pca$x[, 4],
  PC5 = pca$x[, 5],
  group = group,
  intgroup.df,
  name = colnames(vsd)
)

Components 1 and 2

g <- ggplot(d, aes(PC1, PC2, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.7: PCA plots (differentiated) - PC1 and PC2

Components 1 and 3

g <- ggplot(d, aes(PC1, PC3, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
  ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.8: PCA plots (differentiated) - PC1 and PC3

Components 2 and 3

g <- ggplot(d, aes(PC2, PC3, color = group.1, shape = authors)) +
  scale_shape_manual(values = 1:8) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  xlab(paste("PC2", round(percentVar[2] * 100, 2), "% variance")) +
  ylab(paste("PC3", round(percentVar[3] * 100, 2), "% variance"))
ggplotly(g)

Fig 3.9: PCA plots (differentiated) - PC2 and PC3

Scree plot

scree_plot = data.frame(percentVar)
end = length(scree_plot$percentVar)
scree_plot[, 2] <- c(1:end)
colnames(scree_plot) <- c("variance", "component_number")


g <- ggplot(scree_plot, aes(component_number, variance * 100)) +
  geom_bar(stat = 'identity', fill = "slateblue") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(
      vjust = 1,
      hjust = 1,
      size = 12
    ),
    axis.text.y = element_text(size = 12),
    plot.margin = margin(10, 10, 10, 100),
    legend.position = "none",
    plot.title = element_text(color = "black", size = 18, face = "bold.italic"),
    axis.title.y = element_text(color = "black", size = 14, face = "bold"),
    axis.line.x = element_line(
      colour = 'black',
      size = 0.5,
      linetype = 'solid'
    ),
    axis.line.y = element_line(
      colour = 'black',
      size = 0.5,
      linetype = 'solid'
    ),
    axis.ticks.x = element_line(
      colour = 'black',
      size = 1,
      linetype = 'solid'
    ),
    axis.title.x = element_text(color = "black", size = 14, face = "bold")
  ) +
  xlab("Components") +
  ylab("Percent Variance") + ggtitle("differentiated samples")
scale_y_continuous(expand = expansion(mult = c(0, .1)), breaks = pretty_breaks())
#> <ScaleContinuousPosition>
#>  Range:  
#>  Limits:    0 --    1
ggplotly(g)

Fig 3.10: Scree plot showing variance contributed by each component (differentiated)

Euclidean Distance

pca <-
  prcomp(t(assay(vsd)[select,]),
         center = TRUE,
         scale. = FALSE,
         rank. = 3)
results <- pca$x
distance <- get_dist(results, method = "euclidean")
write.table(
  as.matrix(distance),
  "assets/diff_samples-PC1-PC2-PC3.tsv",
  row.names = T,
  qmethod = "double",
  sep = "\t"
)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Fig 3.11: Heatmap of Euclidean Distance using first 3 components (differentiated)

pca <- prcomp(t(assay(vsd)[select,]), center = TRUE, scale. = FALSE)
results <- pca$x
distance <- get_dist(results, method = "euclidean")
write.table(
  as.matrix(distance),
  "assets/diff_samples-allPC.tsv",
  row.names = T,
  qmethod = "double",
  sep = "\t"
)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Fig 3.12: Heatmap of Euclidean Distance using all components (differentiated)

Interactive 3D PCA plot

g <- plotPCA3D(vsd, intgroup = c("group", "authors"))
saveWidget(g, file = "PCA_full.html")
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors

#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors

g <- plotPCA3D(vsd, intgroup = c("group", "authors"))
saveWidget(g, file = "PCA_dif.html")
#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors

#> Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
#> Returning the palette you asked for with that many colors

Session Information

sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19044)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] spatstat.core_2.3-2         rpart_4.1-15               
#>  [3] spatstat.geom_2.3-1         spatstat.data_2.1-0        
#>  [5] factoextra_1.0.7.999        htmlwidgets_1.5.3          
#>  [7] scales_1.1.1                PCAtools_2.5.15            
#>  [9] plotly_4.9.3                RColorBrewer_1.1-2         
#> [11] ggrepel_0.9.1               pheatmap_1.0.12            
#> [13] vsn_3.58.0                  DESeq2_1.30.1              
#> [15] SummarizedExperiment_1.20.0 Biobase_2.50.0             
#> [17] MatrixGenerics_1.2.1        matrixStats_0.58.0         
#> [19] GenomicRanges_1.42.0        GenomeInfoDb_1.26.7        
#> [21] IRanges_2.24.1              S4Vectors_0.28.1           
#> [23] BiocGenerics_0.36.1         forcats_0.5.1              
#> [25] stringr_1.4.0               dplyr_1.0.7                
#> [27] purrr_0.3.4                 readr_2.1.1                
#> [29] tidyr_1.1.4                 tibble_3.1.1               
#> [31] ggplot2_3.3.5               tidyverse_1.3.1            
#> [33] sva_3.38.0                  BiocParallel_1.24.1        
#> [35] genefilter_1.72.1           mgcv_1.8-35                
#> [37] nlme_3.1-152               
#> 
#> loaded via a namespace (and not attached):
#>  [1] readxl_1.3.1          backports_1.2.1       plyr_1.8.6           
#>  [4] lazyeval_0.2.2        splines_4.0.5         crosstalk_1.1.1      
#>  [7] digest_0.6.27         htmltools_0.5.2       fansi_0.4.2          
#> [10] magrittr_2.0.1        memoise_2.0.1         tensor_1.5           
#> [13] tzdb_0.2.0            limma_3.46.0          annotate_1.68.0      
#> [16] modelr_0.1.8          spatstat.sparse_2.0-0 rmdformats_1.0.3     
#> [19] colorspace_2.0-1      blob_1.2.2            rvest_1.0.0          
#> [22] haven_2.4.1           xfun_0.29             crayon_1.4.2         
#> [25] RCurl_1.98-1.3        jsonlite_1.7.3        survival_3.2-11      
#> [28] glue_1.4.2            polyclip_1.10-0       gtable_0.3.0         
#> [31] zlibbioc_1.36.0       XVector_0.30.0        DelayedArray_0.16.3  
#> [34] BiocSingular_1.6.0    abind_1.4-5           DBI_1.1.2            
#> [37] edgeR_3.32.1          Rcpp_1.0.8            viridisLite_0.4.0    
#> [40] xtable_1.8-4          dqrng_0.3.0           bit_4.0.4            
#> [43] rsvd_1.0.5            preprocessCore_1.52.1 DT_0.18              
#> [46] httr_1.4.2            ellipsis_0.3.2        farver_2.1.0         
#> [49] pkgconfig_2.0.3       XML_3.99-0.6          sass_0.4.0           
#> [52] dbplyr_2.1.1          deldir_1.0-6          locfit_1.5-9.4       
#> [55] utf8_1.2.1            labeling_0.4.2        tidyselect_1.1.1     
#> [58] rlang_0.4.11          reshape2_1.4.4        AnnotationDbi_1.52.0 
#> [61] munsell_0.5.0         cellranger_1.1.0      tools_4.0.5          
#> [64] cachem_1.0.5          cli_3.1.1             generics_0.1.1       
#> [67] RSQLite_2.2.9         broom_0.7.6           evaluate_0.14        
#> [70] fastmap_1.1.0         yaml_2.2.1            goftest_1.2-2        
#> [73] knitr_1.37            bit64_4.0.5           fs_1.5.0             
#>  [ reached getOption("max.print") -- omitted 34 entries ]