Proteomics: data analysis
Prerequisites
Load the packages needed for the analyses and read-in the relavant data
library(scales)
library(data.table)
library(vctrs)
library(tidyverse)
library(plotly)
library(TissueEnrich)
library(ggrepel)
library(RColorBrewer)
library(pheatmap)
setwd("/work/LAS/geetu-lab/arnstrm/mouse.trophoblast.smallRNAseq")
<-
dat read.csv("assets/data_processed.csv",
row.names = NULL,
stringsAsFactors = TRUE)
<-
orig_data read.csv("assets/orig_data.csv",
row.names = NULL,
stringsAsFactors = TRUE)
source("assets/theme_clean.R")
quick check:
<- dat %>% drop_na()
dat_tidy %>% summarise(Number_of_proteins = n())
dat #> Number_of_proteins
#> 1 174
Volcano plot
<- orig_data
df <- log(1.5, 2)
exp $log2fc <- log(df$Ratio, base = 2)
df$negLog10p = -log10(df$Qvalue)
df$diffexpressed[df$log2fc <= -exp &
df$Qvalue <= 0.05] <- "up in pTGC"
df$diffexpressed[df$log2fc >= exp &
df$Qvalue <= 0.05] <- "up in TSC"
df$diffexpressed[df$log2fc >= -exp &
df$log2fc <= exp] <- "other genes"
df$diffexpressed[df$Qvalue > 0.05] <- "other genes"
df$newGenes <-
dfstr_replace_all(string = df$Genes,
pattern = "\\;.*$",
replacement = "")
::setDT(df)[diffexpressed == "other genes", newGenes := NA]
data.table
<-
g ggplot(data = df,
aes(
x = log2fc,
y = negLog10p,
col = diffexpressed,
label = newGenes
+
)) geom_point(alpha = 0.5) +
theme_classic() +
scale_color_manual(name = "Expression",
values = c("grey", "#c6007b", "#a0b600")) +
ggtitle(paste("pTGC vs. TSC (proteomics)")) +
# geom_text_repel(show.legend = F) +
xlab("Log2 fold change") +
ylab("-log10 pvalue") +
theme(legend.text.align = 0)
ggplotly(g)
#ggsave("prot_volc.png", dpi=900, width = 8, height = 6)
PCA plot
This was run on another R version due to incompatibility with existing packages. The command is shown below, but it is not executed when knitting the document.
library("MetaboAnalystR")
library(ggplot2)
library(ggrepel)
setwd("~/metabo")
<- InitDataObjects("conc", "stat", FALSE)
mSet <- Read.TextData(mSet, "data_processed.csv", "colu", "disc")
mSet
<- SanityCheckData(mSet)
mSet <- RemoveMissingPercent(mSet, percent = 0.5)
mSet <- ImputeMissingVar(mSet, method = "min")
mSet <- PreparePrenormData(mSet)
mSet <-
mSet Normalization(mSet,
"NULL",
"LogNorm",
"NULL",
ratio = FALSE,
ratioNum = 20)
<-
mSet PlotSampleNormSummary(mSet, "snorm_0_", "png", 300, width = NA)
<- PCA.Anal(mSet)
mSet saveRDS(mSet, "mSet.rds")
Read in the saved RDS file from the previous step and plot the PCA for samples.
<- readRDS("assets/mset.rds")
mSet <- sapply(strsplit(rownames(mSet$analSet$pca$x), "_"), "[", 1)
group <- as.data.frame(group)
intgroup.df <- data.frame(
d PC1 = mSet$analSet$pca$x[, 1],
PC2 = mSet$analSet$pca$x[, 2],
intgroup.df,name = rownames(mSet$analSet$pca$x)
)# rename
$sample <- "NA"
d$rep <- gsub("$", ")", gsub("^rep", "(rep", str_extract(d$name,"rep.")))
d$sample[which(str_detect(rownames(d), "Diff_rep"))] <- "pTGC"
d$sample[which(str_detect(rownames(d), "Undiff_rep"))] <- "TSC"
d$group <- d$sample
d$name <- paste0(d$sample, d$rep, sep = " ")
d<- d %>% dplyr::select(PC1:name)
d <- ggplot(d, aes(PC1, PC2, color = group)) +
g scale_shape_manual(values = 1:10) +
scale_color_manual(values = c('pTGC' = '#c6007b',
'TSC' = '#a0b600')) +
theme_classic() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
geom_text_repel(aes(label = name)) +
xlab(paste("PC1", round(mSet$analSet$pca$variance[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(mSet$analSet$pca$variance[2] * 100, 2), "% variance"))
g
Tissue Enrichment
<- function(inputGenes = gene.list,
plotTE myColor = "color") {
<-
gs GeneSet(geneIds = inputGenes,
organism = "Mus Musculus",
geneIdType = SymbolIdentifier())
<- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
output <-
en.output setNames(data.frame(assay(output[[1]]),
row.names = rowData(output[[1]])[, 1]),
colData(output[[1]])[, 1])
$Tissue <- rownames(en.output)
en.output<- -log10(0.05)
logp <-
en.output mutate(en.output,
significance = ifelse(Log10PValue > logp,
"colored", "nocolor"))
$Sig <- "NA"
en.outputggplot(en.output, aes(reorder(Tissue, Log10PValue),
Log10PValue,fill = significance)) +
geom_bar(stat = 'identity') +
theme_clean() + ylab("- log10 adj. p-value") + xlab("") +
scale_fill_manual(values = c("colored" = myColor, "nocolor" = "gray")) +
scale_y_continuous(expand = expansion(mult = c(0, .1)),
breaks = scales::pretty_breaks()) +
coord_flip()
}<- log(1.2, 2)
exp <- df$newGenes[df$log2fc <= -exp & df$Qvalue <= 0.05]
diff <- df$newGenes[df$log2fc >= exp & df$Qvalue <= 0.05]
undiff <- dat_tidy %>%
df.diff rowwise() %>%
mutate(Diff = mean(c(
Diff_rep1, Diff_rep2, Diff_rep3, Diff_rep4, Diff_rep5%>%
))) ::select(proteinID, Diff) %>%
dplyr::filter(Diff > 0) %>%
dplyrungroup() %>%
mutate(quart = ntile(Diff, 4)) %>%
mutate(decile = ntile(Diff, 10))
<- dat_tidy %>%
df.undiff rowwise() %>%
mutate(Undiff = mean(
c(
Undiff_rep1,
Undiff_rep2,
Undiff_rep3,
Undiff_rep4,
Undiff_rep5
)%>%
)) ::select(proteinID, Undiff) %>%
dplyr::filter(Undiff > 0) %>%
dplyrungroup() %>%
mutate(quart = ntile(Undiff, 4)) %>%
mutate(decile = ntile(Undiff, 10))
<- function(dataIn = df.undiff,
filterCuts cutOff = 4,
type = decile) {
<- enquo(type)
type %>%
dataIn ::filter(!!type == cutOff) %>%
dplyr::select(proteinID)
dplyr
}.75pc <- filterCuts(df.undiff, 4, type = quart)
undiff.75pc <- filterCuts(df.diff, 4, type = quart)
diff.90pc <- filterCuts(df.undiff, 10, type = decile)
undiff.90pc <- filterCuts(df.diff, 10, type = decile)
diff<- brewer.pal(9, "YlOrRd")
heat_colors <-
plotTE.heatmap.full function(inputGenes = gene.list,
inputTissue = "E14.5-Brain",
GeneNames = FALSE) {
<-
gs GeneSet(geneIds = inputGenes,
organism = "Mus Musculus",
geneIdType = SymbolIdentifier())
<- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
output <-
en.output setNames(data.frame(assay(output[[1]]),
row.names = rowData(output[[1]])[, 1]),
colData(output[[1]])[, 1])
$Tissue <- rownames(en.output)
en.output<- output[[2]][[inputTissue]]
seExp <-
exp setNames(data.frame(assay(seExp), row.names = rowData(seExp)[, 1]),
colData(seExp)[, 1])
<- pheatmap(
g
exp,color = heat_colors,
cluster_rows = F,
cluster_cols = T,
show_rownames = GeneNames,
border_color = NA,
fontsize = 10,
scale = "row",
fontsize_row = 8
)
g }
Enrichment plots (TissueEnrich
)
pTGC
plotTE(unique(diff), myColor = "#c6007b")
#> Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
TSC
plotTE(unique(undiff), myColor = "#a0b600")
pTGC upper quartile
plotTE(as.character(diff.75pc$proteinID), "#c6007b")
TSC upper quartile
plotTE(as.character(undiff.75pc$proteinID), "#a0b600")
Enrichment Heatmaps for E14.5-Placenta
pTGC
plotTE.heatmap.full(
inputGenes = unique(diff),
inputTissue = "E14.5-Placenta",
GeneNames = TRUE
)
TSC
plotTE.heatmap.full(
inputGenes = unique(undiff),
inputTissue = "E14.5-Placenta",
GeneNames = TRUE
)
Session Information
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] pheatmap_1.0.12 RColorBrewer_1.1-3
#> [3] ggrepel_0.9.3 TissueEnrich_1.18.0
#> [5] GSEABase_1.60.0 graph_1.76.0
#> [7] annotate_1.76.0 XML_3.99-0.12
#> [9] AnnotationDbi_1.60.0 SummarizedExperiment_1.28.0
#> [11] Biobase_2.58.0 GenomicRanges_1.50.1
#> [13] GenomeInfoDb_1.34.3 IRanges_2.32.0
#> [15] S4Vectors_0.36.0 BiocGenerics_0.44.0
#> [17] MatrixGenerics_1.10.0 matrixStats_0.63.0
#> [19] ensurer_1.1 plotly_4.10.1
#> [21] forcats_0.5.2 stringr_1.5.0
#> [23] dplyr_1.0.10 purrr_1.0.1
#> [25] readr_2.1.3 tidyr_1.3.0
#> [27] tibble_3.1.8 ggplot2_3.4.0
#> [29] tidyverse_1.3.2 vctrs_0.6.2
#> [31] data.table_1.14.6 scales_1.2.1
#>
#> loaded via a namespace (and not attached):
#> [1] googledrive_2.0.0 colorspace_2.0-3 ellipsis_0.3.2
#> [4] XVector_0.38.0 fs_1.5.2 rstudioapi_0.14
#> [7] farver_2.1.1 bit64_4.0.5 fansi_1.0.3
#> [10] lubridate_1.9.0 xml2_1.3.3 cachem_1.0.6
#> [13] knitr_1.41 jsonlite_1.8.3 broom_1.0.1
#> [16] dbplyr_2.2.1 png_0.1-7 compiler_4.2.2
#> [19] httr_1.4.4 backports_1.4.1 assertthat_0.2.1
#> [22] Matrix_1.5-3 fastmap_1.1.0 lazyeval_0.2.2
#> [25] gargle_1.2.1 cli_3.4.1 htmltools_0.5.3
#> [28] tools_4.2.2 gtable_0.3.1 glue_1.6.2
#> [31] GenomeInfoDbData_1.2.9 Rcpp_1.0.9 cellranger_1.1.0
#> [34] jquerylib_0.1.4 Biostrings_2.66.0 crosstalk_1.2.0
#> [37] xfun_0.35 rvest_1.0.3 timechange_0.1.1
#> [40] lifecycle_1.0.3 googlesheets4_1.0.1 zlibbioc_1.44.0
#> [43] hms_1.1.2 yaml_2.3.6 memoise_2.0.1
#> [46] sass_0.4.3 stringi_1.7.8 RSQLite_2.2.18
#> [49] highr_0.9 rlang_1.1.1 pkgconfig_2.0.3
#> [52] bitops_1.0-7 evaluate_0.18 lattice_0.20-45
#> [55] htmlwidgets_1.5.4 labeling_0.4.2 bit_4.0.5
#> [58] tidyselect_1.2.0 magrittr_2.0.3 bookdown_0.33
#> [61] R6_2.5.1 generics_0.1.3 DelayedArray_0.24.0
#> [64] DBI_1.1.3 pillar_1.8.1 haven_2.5.1
#> [67] withr_2.5.0 KEGGREST_1.38.0 RCurl_1.98-1.9
#> [70] modelr_0.1.10 crayon_1.5.2 utf8_1.2.2
#> [73] tzdb_0.3.0 rmarkdown_2.18 grid_4.2.2
#> [76] readxl_1.4.1 blob_1.2.3 rmdformats_1.0.4
#> [79] reprex_2.0.2 digest_0.6.30 xtable_1.8-4
#> [82] munsell_0.5.0 viridisLite_0.4.1 bslib_0.4.1