Proteomics: data analysis

Prerequisites

Load the packages needed for the analyses and read-in the relavant data

library(scales)
library(data.table)
library(vctrs)
library(tidyverse)
library(plotly)
library(TissueEnrich)
library(ggrepel)
library(RColorBrewer)
library(pheatmap)
setwd("/work/LAS/geetu-lab/arnstrm/mouse.trophoblast.smallRNAseq")
dat <-
  read.csv("assets/data_processed.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
orig_data <-
  read.csv("assets/orig_data.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
source("assets/theme_clean.R")

quick check:

dat_tidy <- dat %>% drop_na()
dat %>% summarise(Number_of_proteins = n())
#>   Number_of_proteins
#> 1                174

Volcano plot

df <- orig_data
exp <- log(1.5, 2)
df$log2fc <- log(df$Ratio, base = 2)
df$negLog10p = -log10(df$Qvalue)
df$diffexpressed[df$log2fc <= -exp &
                   df$Qvalue <= 0.05] <- "up in pTGC"
df$diffexpressed[df$log2fc >= exp &
                   df$Qvalue <= 0.05] <- "up in TSC"
df$diffexpressed[df$log2fc >= -exp &
                   df$log2fc <= exp] <- "other genes"
df$diffexpressed[df$Qvalue > 0.05] <- "other genes"
df$newGenes <-
  str_replace_all(string = df$Genes,
                  pattern = "\\;.*$",
                  replacement = "")
data.table::setDT(df)[diffexpressed == "other genes", newGenes := NA]

g <-
  ggplot(data = df,
         aes(
           x = log2fc,
           y = negLog10p,
           col = diffexpressed,
           label = newGenes
         )) +
  geom_point(alpha = 0.5) +
  theme_classic() +
  scale_color_manual(name = "Expression",
                     values = c("grey", "#c6007b", "#a0b600")) +
  ggtitle(paste("pTGC vs. TSC (proteomics)")) +
  #  geom_text_repel(show.legend = F) +
  xlab("Log2 fold change") +
  ylab("-log10 pvalue") +
  theme(legend.text.align = 0)
ggplotly(g)

Figure 1: Volcano plot (proteomics)

#ggsave("prot_volc.png", dpi=900, width = 8, height = 6)

PCA plot

This was run on another R version due to incompatibility with existing packages. The command is shown below, but it is not executed when knitting the document.

library("MetaboAnalystR")
library(ggplot2)
library(ggrepel)

setwd("~/metabo")
mSet <- InitDataObjects("conc", "stat", FALSE)
mSet <- Read.TextData(mSet, "data_processed.csv", "colu", "disc")

mSet <- SanityCheckData(mSet)
mSet <- RemoveMissingPercent(mSet, percent = 0.5)
mSet <- ImputeMissingVar(mSet, method = "min")
mSet <- PreparePrenormData(mSet)
mSet <-
  Normalization(mSet,
                "NULL",
                "LogNorm",
                "NULL",
                ratio = FALSE,
                ratioNum = 20)
mSet <-
  PlotSampleNormSummary(mSet, "snorm_0_", "png", 300, width = NA)
mSet <- PCA.Anal(mSet)
saveRDS(mSet, "mSet.rds")

Read in the saved RDS file from the previous step and plot the PCA for samples.

mSet <- readRDS("assets/mset.rds")
group <- sapply(strsplit(rownames(mSet$analSet$pca$x), "_"), "[", 1)
intgroup.df <- as.data.frame(group)
d <- data.frame(
  PC1 = mSet$analSet$pca$x[, 1],
  PC2 = mSet$analSet$pca$x[, 2],
  intgroup.df,
  name = rownames(mSet$analSet$pca$x)
)
# rename
d$sample <- "NA"
d$rep <- gsub("$", ")", gsub("^rep", "(rep", str_extract(d$name,"rep.")))
d$sample[which(str_detect(rownames(d), "Diff_rep"))] <- "pTGC"
d$sample[which(str_detect(rownames(d), "Undiff_rep"))] <- "TSC"
d$group <- d$sample
d$name <- paste0(d$sample, d$rep, sep = " ")
d <- d %>% dplyr::select(PC1:name)
g <- ggplot(d, aes(PC1, PC2, color = group)) +
  scale_shape_manual(values = 1:10) +
  scale_color_manual(values = c('pTGC'      = '#c6007b',
                                'TSC'  = '#a0b600')) +
  theme_classic() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  geom_text_repel(aes(label = name)) +
  xlab(paste("PC1", round(mSet$analSet$pca$variance[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(mSet$analSet$pca$variance[2] * 100, 2), "% variance"))
g
Figure 2: PCA plot (proteomics)

Figure 2: PCA plot (proteomics)

Tissue Enrichment

plotTE <- function(inputGenes = gene.list,
                   myColor = "color") {
  gs <-
    GeneSet(geneIds = inputGenes,
            organism = "Mus Musculus",
            geneIdType = SymbolIdentifier())
  output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
  en.output <-
    setNames(data.frame(assay(output[[1]]),
                        row.names = rowData(output[[1]])[, 1]),
             colData(output[[1]])[, 1])
  en.output$Tissue <- rownames(en.output)
  logp <- -log10(0.05)
  en.output <-
    mutate(en.output,
           significance = ifelse(Log10PValue > logp,
                                 "colored", "nocolor"))
  en.output$Sig <- "NA"
  ggplot(en.output, aes(reorder(Tissue, Log10PValue),
                        Log10PValue,
                        fill = significance)) +
    geom_bar(stat = 'identity') +
    theme_clean() + ylab("- log10 adj. p-value") + xlab("") +
    scale_fill_manual(values = c("colored" = myColor, "nocolor" = "gray")) +
    scale_y_continuous(expand = expansion(mult = c(0, .1)),
                       breaks = scales::pretty_breaks()) +
    coord_flip()
}
exp <- log(1.2, 2)
diff <- df$newGenes[df$log2fc <= -exp & df$Qvalue <= 0.05]
undiff <- df$newGenes[df$log2fc >= exp & df$Qvalue <= 0.05]
df.diff <- dat_tidy %>%
  rowwise() %>%
  mutate(Diff = mean(c(
    Diff_rep1,   Diff_rep2,  Diff_rep3,  Diff_rep4,  Diff_rep5
  ))) %>%
  dplyr::select(proteinID, Diff) %>%
  dplyr::filter(Diff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Diff, 4)) %>%
  mutate(decile = ntile(Diff, 10))
df.undiff <- dat_tidy %>%
  rowwise() %>%
  mutate(Undiff = mean(
    c(
      Undiff_rep1,
      Undiff_rep2,
      Undiff_rep3,
      Undiff_rep4,
      Undiff_rep5
    )
  )) %>%
  dplyr::select(proteinID, Undiff) %>%
  dplyr::filter(Undiff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Undiff, 4)) %>%
  mutate(decile = ntile(Undiff, 10))

filterCuts <- function(dataIn = df.undiff,
                       cutOff = 4,
                       type = decile) {
  type <- enquo(type)
  dataIn %>%
    dplyr::filter(!!type == cutOff) %>%
    dplyr::select(proteinID)
}
undiff.75pc <- filterCuts(df.undiff, 4, type = quart)
diff.75pc <- filterCuts(df.diff, 4, type = quart)
undiff.90pc <- filterCuts(df.undiff, 10, type = decile)
diff.90pc <- filterCuts(df.diff, 10, type = decile)
heat_colors <- brewer.pal(9, "YlOrRd")
plotTE.heatmap.full <-
  function(inputGenes = gene.list,
           inputTissue = "E14.5-Brain",
           GeneNames = FALSE) {
    gs <-
      GeneSet(geneIds = inputGenes,
              organism = "Mus Musculus",
              geneIdType = SymbolIdentifier())
    output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
    en.output <-
      setNames(data.frame(assay(output[[1]]),
                          row.names = rowData(output[[1]])[, 1]),
               colData(output[[1]])[, 1])
    en.output$Tissue <- rownames(en.output)
    seExp <- output[[2]][[inputTissue]]
    exp <-
      setNames(data.frame(assay(seExp), row.names = rowData(seExp)[, 1]),
               colData(seExp)[, 1])
    g <- pheatmap(
      exp,
      color = heat_colors,
      cluster_rows = F,
      cluster_cols  = T,
      show_rownames = GeneNames,
      border_color = NA,
      fontsize = 10,
      scale = "row",
      fontsize_row = 8
    )
    g
  }

Enrichment plots (TissueEnrich)

pTGC

plotTE(unique(diff), myColor = "#c6007b")
#> Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
Figure 3A: Differentially expressed proteins (up-regulated in pTGC) enrichment in Mouse ENCODE data

Figure 3A: Differentially expressed proteins (up-regulated in pTGC) enrichment in Mouse ENCODE data

TSC

plotTE(unique(undiff), myColor = "#a0b600")
Figure 3B: Differentially expressed proteins (up-regulated in TSC) enrichment in Mouse ENCODE data

Figure 3B: Differentially expressed proteins (up-regulated in TSC) enrichment in Mouse ENCODE data

pTGC upper quartile

plotTE(as.character(diff.75pc$proteinID), "#c6007b")
Figure 3C: Upper quartile proteins (pTGC) enrichment in Mouse ENCODE data

Figure 3C: Upper quartile proteins (pTGC) enrichment in Mouse ENCODE data

TSC upper quartile

plotTE(as.character(undiff.75pc$proteinID), "#a0b600")
Figure 3E: Upper quartile proteins (TSC) enrichment in Mouse ENCODE data

Figure 3E: Upper quartile proteins (TSC) enrichment in Mouse ENCODE data

Enrichment Heatmaps for E14.5-Placenta

pTGC

plotTE.heatmap.full(
  inputGenes = unique(diff),
  inputTissue = "E14.5-Placenta",
  GeneNames = TRUE
)
Figure 4A: Heatmaps for E14.5-Brain tissue from the enrichment test using up-regulated proteins of pTGC

Figure 4A: Heatmaps for E14.5-Brain tissue from the enrichment test using up-regulated proteins of pTGC

TSC

plotTE.heatmap.full(
  inputGenes = unique(undiff),
  inputTissue = "E14.5-Placenta",
  GeneNames = TRUE
)
Figure 4B: Heatmaps for E14.5-Brain tissue from the enrichment test using up-regulated proteins of TSC

Figure 4B: Heatmaps for E14.5-Brain tissue from the enrichment test using up-regulated proteins of TSC

Session Information

sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] pheatmap_1.0.12             RColorBrewer_1.1-3         
#>  [3] ggrepel_0.9.3               TissueEnrich_1.18.0        
#>  [5] GSEABase_1.60.0             graph_1.76.0               
#>  [7] annotate_1.76.0             XML_3.99-0.12              
#>  [9] AnnotationDbi_1.60.0        SummarizedExperiment_1.28.0
#> [11] Biobase_2.58.0              GenomicRanges_1.50.1       
#> [13] GenomeInfoDb_1.34.3         IRanges_2.32.0             
#> [15] S4Vectors_0.36.0            BiocGenerics_0.44.0        
#> [17] MatrixGenerics_1.10.0       matrixStats_0.63.0         
#> [19] ensurer_1.1                 plotly_4.10.1              
#> [21] forcats_0.5.2               stringr_1.5.0              
#> [23] dplyr_1.0.10                purrr_1.0.1                
#> [25] readr_2.1.3                 tidyr_1.3.0                
#> [27] tibble_3.1.8                ggplot2_3.4.0              
#> [29] tidyverse_1.3.2             vctrs_0.6.2                
#> [31] data.table_1.14.6           scales_1.2.1               
#> 
#> loaded via a namespace (and not attached):
#>  [1] googledrive_2.0.0      colorspace_2.0-3       ellipsis_0.3.2        
#>  [4] XVector_0.38.0         fs_1.5.2               rstudioapi_0.14       
#>  [7] farver_2.1.1           bit64_4.0.5            fansi_1.0.3           
#> [10] lubridate_1.9.0        xml2_1.3.3             cachem_1.0.6          
#> [13] knitr_1.41             jsonlite_1.8.3         broom_1.0.1           
#> [16] dbplyr_2.2.1           png_0.1-7              compiler_4.2.2        
#> [19] httr_1.4.4             backports_1.4.1        assertthat_0.2.1      
#> [22] Matrix_1.5-3           fastmap_1.1.0          lazyeval_0.2.2        
#> [25] gargle_1.2.1           cli_3.4.1              htmltools_0.5.3       
#> [28] tools_4.2.2            gtable_0.3.1           glue_1.6.2            
#> [31] GenomeInfoDbData_1.2.9 Rcpp_1.0.9             cellranger_1.1.0      
#> [34] jquerylib_0.1.4        Biostrings_2.66.0      crosstalk_1.2.0       
#> [37] xfun_0.35              rvest_1.0.3            timechange_0.1.1      
#> [40] lifecycle_1.0.3        googlesheets4_1.0.1    zlibbioc_1.44.0       
#> [43] hms_1.1.2              yaml_2.3.6             memoise_2.0.1         
#> [46] sass_0.4.3             stringi_1.7.8          RSQLite_2.2.18        
#> [49] highr_0.9              rlang_1.1.1            pkgconfig_2.0.3       
#> [52] bitops_1.0-7           evaluate_0.18          lattice_0.20-45       
#> [55] htmlwidgets_1.5.4      labeling_0.4.2         bit_4.0.5             
#> [58] tidyselect_1.2.0       magrittr_2.0.3         bookdown_0.33         
#> [61] R6_2.5.1               generics_0.1.3         DelayedArray_0.24.0   
#> [64] DBI_1.1.3              pillar_1.8.1           haven_2.5.1           
#> [67] withr_2.5.0            KEGGREST_1.38.0        RCurl_1.98-1.9        
#> [70] modelr_0.1.10          crayon_1.5.2           utf8_1.2.2            
#> [73] tzdb_0.3.0             rmarkdown_2.18         grid_4.2.2            
#> [76] readxl_1.4.1           blob_1.2.3             rmdformats_1.0.4      
#> [79] reprex_2.0.2           digest_0.6.30          xtable_1.8-4          
#> [82] munsell_0.5.0          viridisLite_0.4.1      bslib_0.4.1