Section 6: Custom PCE dataset generation
Generating custom PCE datasets
Following datasets were used for PlacentaCellEnrich
(PCE
)
- Vento-Tormo R et al.: scRNA-Seq data across 32 placental and decidual cells (built-in default dataset for [
PCE
]. - Castel et al., datasets, using Petropoulos et al., and Zhou et al.,.
- Xiang et. al. dataset
Prerequisites
R packages required for this section are loaded.
setwd("~/github/BAPvsTrophoblast_Amnion")
library(tidyverse)
library(TissueEnrich)
library(R.utils)
Download counts file
<-
zplink "https://iastate.box.com/shared/static/pb42m8gz4sgmlvzlxxcdk22ec1ke6z8a.gz"
download.file(zplink, "ZhouPetro_rawCounts_tpm.txt.gz")
gunzip("ZhouPetro_rawCounts_tpm.txt.gz")
<-
xilink "https://iastate.box.com/shared/static/5jhfv20ljybj4isvwidzl4rd06ncvnfl.gz"
download.file(xilink, "combined_555_rawCounts_tpm.txt.gz")
gunzip("combined_555_rawCounts_tpm.txt.gz")
Castel (Zhou + Petropoulos)
Castel et al., used the data from Petropoulos et al., and Zhou et al., in their analyses. We used the same data downloaded directly from their BitBucket for generating custom dataset for performing enrichment analyses.
<-
cts.castel as.matrix(read.csv(
"ZhouPetro_rawCounts_tpm.txt",
sep = "\t",
row.names = "Geneid"
))<-
md.castel read.csv(
"assets/metadata_castel.tsv",
sep = "\t",
header = FALSE,
row.names = 1
)<- as.data.frame(t(cts.castel))
mycts.t <- md.castel
metadata colnames(metadata) <- c("days", "celltypes")
<- metadata[2]
metadata <-
merged.table merge(metadata, mycts.t, by = "row.names", all.x = TRUE)
<-
merged.table %>% remove_rownames %>% column_to_rownames(var = "Row.names")
merged.table <- aggregate(. ~ celltypes, merged.table, mean)
mycts.mean <-
ordered as.data.frame(t(
%>% remove_rownames %>% column_to_rownames(var = "celltypes")
mycts.mean
))<-
se SummarizedExperiment(
assays = SimpleList(as.matrix(ordered)),
rowData = row.names(ordered),
colData = colnames(ordered)
)<- teGeneRetrieval(se)
te.dataset.castel saveRDS(te.dataset.castel, file = "te.dataset.castel.rds")
<- as.data.frame(assay(te.dataset.castel)) results.summary
# DISTRIBUTIONS ----
# Setting a custom ggplot2 function
# This function makes a pretty ggplot theme
# This function takes no arguments
# meaning that you always have just niwot_theme() and not niwot_theme(something else here)
# source: https://ourcodingclub.github.io/tutorials/dataviz-beautification/
<- function(){
theme_niwot theme_bw() +
theme(axis.text.y = element_text(size = 12),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12),
axis.title = element_text(size = 14),
axis.line.x = element_line(color="black"),
axis.line.y = element_line(color="black"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
plot.margin = unit(c(1, 1, 1, 1), units = , "cm"),
plot.title = element_text(size = 18, vjust = 1, hjust = 0),
legend.text = element_text(size = 12),
legend.title = element_blank(),
legend.position = c(0.95, 0.15),
legend.key = element_blank(),
legend.background = element_rect(color = "black",
fill = "transparent",
size = 2, linetype = "blank"))
}
ggplot(results.summary, aes(x = Tissue)) +
stat_count(alpha = 0.6, fill = "palegreen4") +
theme_niwot() +
scale_y_log10() +
xlab("Cell Types") +
ylab("Number of Genes enriched")
ggplot(results.summary, aes(x = Group)) +
stat_count(alpha = 0.6, fill = "slateblue") +
theme_niwot() +
xlab("Enrichment Type") +
ylab("Number of Genes enriched")
Xiang
Xiang et. al. dataset was generated, after downloading the SRA datasets from NCBI BioProject# PRJNA562548.
Download SRA datasets
while read a b; do
python3 enaBrowserTools-1.6/python3/enaDataGet.py \
-f fastq \
-as ~/.aspera_setting.ini \
$a;
done < assets/PRJNA562548.txt
Genome/annotation
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.primary_assembly.annotation.gtf.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v37.primary_assembly.annotation.gtf.gz
gunzip GRCh38.primary_assembly.genome.fa.gz
Mapping
Script to index (star-index.sh
):
#!/bin/bash
module load star
GTF=gencode.v37.primary_assembly.annotation.gtf
FASTA=GRCh38.primary_assembly.genome.fa
STAR --runThreadN 36 \
--runMode genomeGenerate \
--genomeDir GRCh38 \
--sjdbOverhang 100 \
--sjdbGTFfile ${GTF} \
--genomeFastaFiles ${FASTA}
Script to Map/Count (star-map.sh
):
#!/bin/bash
module load star
index=/work/LAS/geetu-lab/arnstrm/amnion-rnaseq/lo-etal/GRCh38
read1=$1
cpus=36
out=$(basename ${read1} | sed 's/.fastq.gz//g')
STAR \
\
--runThreadN 36 $index \
--genomeDir \
--outSAMtype BAM SortedByCoordinate \
--quantMode GeneCounts \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 ${out}_ \
--outFileNamePrefix \
--readFilesCommand zcat ${read1} --readFilesIn
Prepare jobs:
find $(pwd) -name ".fastq.gz" > input.fofn
Slurm job script (star.slurm
):
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --time=8:00:00
#SBATCH --job-name=star
#SBATCH --output=nova-%x.%j.out
#SBATCH --error=nova-%x.%j.err
#SBATCH --mail-user=arnstrm@gmail.com
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
IFS=$'\n' read -d '' -r -a LINES < input.fofn
LINE=${LINES[$SLURM_ARRAY_TASK_ID]}
echo -e "$(date +"%D %r")\tthis command is ${LINE}"
./star-map.sh ${LINE}
Submit jobs:
sbatch --array=0-554 star.slurm
Counts
for f in *_ReadsPerGene.out.tab; do
g=$(echo $f |sed 's/_ReadsPerGene.out.tab//g');
echo -e "AA_geneids\t$g" > ${g}_counts.txt;
tail -n +5 $f | cut -f1,4 >> ${g}_counts.txt;
done
join_files.sh *_counts.txt | grep -v "_counts.txt" | sort -k1,1 > counts-555.txt
# get geneids (protein coding only) from GTF file
gtf=gencode.v37.basic.annotation.gtf
awk '$3=="gene"' $gtf |\
-f 9 |\
cut '$4 ~ /protein_coding/ {print $2}'|\
awk -e 's/";//g' -e sed 's/"//g' > gene.ids
sed #filter coutns to retain only counts for geneids
grep -Fw -f gene.ids counts-555.txt > combined_555_rawCounts.txt
Generate TPM
From the raw counts, TPM table was generated using the raw counts. First normalize.R
script was saved as follows:
#! /usr/bin/env Rscript
# load the module
# r-devtools
# r-tidyr
# r-dplyr
# module load r-devtools r-tidyr r-dplyr
## functions for rpkm and tpm
## from https://gist.github.com/slowkow/c6ab0348747f86e2748b#file-counts_to_tpm-r-L44
## from https://www.biostars.org/p/171766/
rpkm <- function(counts, lengths) {
rate <- counts / lengths
rate / sum(counts) * 1e9
}
tpm <- function(counts, lengths) {
rate <- counts / lengths
rate / sum(rate) * 1e6
}
## read table from featureCounts output
args <- commandArgs(T)
tag <- tools::file_path_sans_ext(args[1])
ftr.cnt <- read.table(args[1], sep="\t", stringsAsFactors=FALSE,
header=TRUE)
library(dplyr)
library(tidyr)
ftr.rpkm <- ftr.cnt %>%
gather(sample, cnt, 7:ncol(ftr.cnt)) %>%
group_by(sample) %>%
mutate(rpkm=rpkm(cnt, Length)) %>%
select(-cnt) %>%
spread(sample,rpkm)
write.table(ftr.rpkm, file=paste0(tag, "_rpkm.txt"), sep="\t", row.names=FALSE, quote=FALSE)
ftr.tpm <- ftr.cnt %>%
gather(sample, cnt, 7:ncol(ftr.cnt)) %>%
group_by(sample) %>%
mutate(tpm=tpm(cnt, Length)) %>%
select(-cnt) %>%
spread(sample,tpm)
write.table(ftr.tpm, file=paste0(tag, "_tpm.txt"), sep="\t", row.names=FALSE, quote=FALSE)
second, the code was excuted as follows:
./normalize.R combined_555_rawCounts.txt
This generates two files: combined_555_rawCounts_tpm.txt
and combined_555_rawCounts_rpkm.txt
.
Processing TPM
<-
cts.xiang as.matrix(read.csv(
"combined_555_rawCounts_tpm.txt",
sep = "\t",
row.names = "Geneid"
))<-
md.xiang read.csv(
"assets/metadata_555.txt",
sep = "\t",
header = FALSE,
row.names = 1
)<- as.data.frame(t(cts.xiang))
mycts.t <- md.xiang
metadata colnames(metadata) <- c("days", "celltypes")
<- metadata[2]
metadata <- merge(metadata, mycts.t, by = "row.names", all.x = TRUE)
merged.table <-
merged.table %>% remove_rownames %>% column_to_rownames(var = "Row.names")
merged.table <- aggregate(. ~ celltypes, merged.table, mean)
mycts.mean <-
ordered as.data.frame(t(
%>% remove_rownames %>% column_to_rownames(var = "celltypes")
mycts.mean
))<-
se SummarizedExperiment(
assays = SimpleList(as.matrix(ordered)),
rowData = row.names(ordered),
colData = colnames(ordered)
)<-
te.dataset.xiang teGeneRetrieval(
se,foldChangeThreshold = 5,
maxNumberOfTissues = 6,
expressedGeneThreshold = 1
)saveRDS(te.dataset.xiang, file = "te.dataset.xiang.rds")
<- as.data.frame(assay(te.dataset.xiang)) results.summary
ggplot(results.summary, aes(x = Tissue)) +
stat_count(alpha = 0.6, fill = "palegreen4") +
theme_niwot() +
scale_y_log10() +
xlab("Cell Types") +
ylab("Number of Genes enriched")
ggplot(results.summary, aes(x = Group)) +
stat_count(alpha = 0.6, fill = "slateblue") +
theme_niwot() +
xlab("Enrichment Type") +
ylab("Number of Genes enriched")
Save Image file
save.image(file = "customPCE_data.RData")
Session Information
sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19044)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] R.utils_2.10.1 R.oo_1.24.0
#> [3] R.methodsS3_1.8.1 TissueEnrich_1.10.1
#> [5] GSEABase_1.52.1 graph_1.68.0
#> [7] annotate_1.68.0 XML_3.99-0.6
#> [9] AnnotationDbi_1.52.0 SummarizedExperiment_1.20.0
#> [11] Biobase_2.50.0 GenomicRanges_1.42.0
#> [13] GenomeInfoDb_1.26.7 IRanges_2.24.1
#> [15] S4Vectors_0.28.1 BiocGenerics_0.36.1
#> [17] MatrixGenerics_1.2.1 matrixStats_0.58.0
#> [19] ensurer_1.1 forcats_0.5.1
#> [21] stringr_1.4.0 dplyr_1.0.7
#> [23] purrr_0.3.4 readr_2.1.1
#> [25] tidyr_1.1.4 tibble_3.1.1
#> [27] ggplot2_3.3.5 tidyverse_1.3.1
#>
#> loaded via a namespace (and not attached):
#> [1] bitops_1.0-7 fs_1.5.0 lubridate_1.8.0
#> [4] bit64_4.0.5 httr_1.4.2 tools_4.0.5
#> [7] backports_1.2.1 bslib_0.3.1 utf8_1.2.1
#> [10] R6_2.5.1 DBI_1.1.2 colorspace_2.0-1
#> [13] withr_2.4.3 tidyselect_1.1.1 bit_4.0.4
#> [16] compiler_4.0.5 cli_3.1.1 rvest_1.0.0
#> [19] xml2_1.3.3 DelayedArray_0.16.3 labeling_0.4.2
#> [22] bookdown_0.24 sass_0.4.0 scales_1.1.1
#> [25] digest_0.6.27 rmarkdown_2.11 XVector_0.30.0
#> [28] pkgconfig_2.0.3 htmltools_0.5.2 highr_0.9
#> [31] dbplyr_2.1.1 fastmap_1.1.0 rlang_0.4.11
#> [34] readxl_1.3.1 rstudioapi_0.13 RSQLite_2.2.9
#> [37] farver_2.1.0 jquerylib_0.1.4 generics_0.1.1
#> [40] jsonlite_1.7.3 RCurl_1.98-1.3 magrittr_2.0.1
#> [43] GenomeInfoDbData_1.2.4 Matrix_1.3-3 Rcpp_1.0.8
#> [46] munsell_0.5.0 fansi_0.4.2 lifecycle_1.0.1
#> [49] stringi_1.7.6 yaml_2.2.1 zlibbioc_1.36.0
#> [52] grid_4.0.5 blob_1.2.2 crayon_1.4.2
#> [55] lattice_0.20-44 haven_2.4.1 hms_1.1.1
#> [58] knitr_1.37 pillar_1.6.5 reprex_2.0.0
#> [61] glue_1.4.2 evaluate_0.14 modelr_0.1.8
#> [64] vctrs_0.3.8 rmdformats_1.0.3 tzdb_0.2.0
#> [67] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
#> [70] cachem_1.0.5 xfun_0.29 xtable_1.8-4
#> [73] broom_0.7.6 memoise_2.0.1 ellipsis_0.3.2