Section 6: Custom PCE dataset generation

Generating custom PCE datasets

Following datasets were used for PlacentaCellEnrich (PCE)

Vento-Tormo R et al.: scRNA-Seq data across 32 placental and decidual cells (built-in default dataset for [PCE].
Castel et al., datasets, using Petropoulos et al., and Zhou et al.,.
Xiang et. al. dataset

Prerequisites

R packages required for this section are loaded.

setwd("~/github/BAPvsTrophoblast_Amnion")
library(tidyverse)
library(TissueEnrich)
library(R.utils)

Download counts file

zplink <-
  "https://iastate.box.com/shared/static/pb42m8gz4sgmlvzlxxcdk22ec1ke6z8a.gz"
download.file(zplink, "ZhouPetro_rawCounts_tpm.txt.gz")
gunzip("ZhouPetro_rawCounts_tpm.txt.gz")
xilink <-
  "https://iastate.box.com/shared/static/5jhfv20ljybj4isvwidzl4rd06ncvnfl.gz"
download.file(xilink, "combined_555_rawCounts_tpm.txt.gz")
gunzip("combined_555_rawCounts_tpm.txt.gz")

Castel (Zhou + Petropoulos)

Castel et al., used the data from Petropoulos et al., and Zhou et al., in their analyses. We used the same data downloaded directly from their BitBucket for generating custom dataset for performing enrichment analyses.

cts.castel <-
  as.matrix(read.csv(
    "ZhouPetro_rawCounts_tpm.txt",
    sep = "\t",
    row.names = "Geneid"
  ))
md.castel <-
  read.csv(
    "assets/metadata_castel.tsv",
    sep = "\t",
    header = FALSE,
    row.names = 1
  )
mycts.t <- as.data.frame(t(cts.castel))
metadata <- md.castel
colnames(metadata) <- c("days", "celltypes")
metadata <- metadata[2]
merged.table <-
  merge(metadata, mycts.t, by = "row.names", all.x = TRUE)
merged.table <-
  merged.table %>% remove_rownames %>% column_to_rownames(var = "Row.names")
mycts.mean <- aggregate(. ~ celltypes, merged.table, mean)
ordered <-
  as.data.frame(t(
    mycts.mean %>% remove_rownames %>% column_to_rownames(var = "celltypes")
  ))
se <-
  SummarizedExperiment(
    assays = SimpleList(as.matrix(ordered)),
    rowData = row.names(ordered),
    colData = colnames(ordered)
  )
te.dataset.castel <- teGeneRetrieval(se)
saveRDS(te.dataset.castel, file = "te.dataset.castel.rds")
results.summary <- as.data.frame(assay(te.dataset.castel))

# DISTRIBUTIONS ----
# Setting a custom ggplot2 function
# This function makes a pretty ggplot theme
# This function takes no arguments 
# meaning that you always have just niwot_theme() and not niwot_theme(something else here)
# source: https://ourcodingclub.github.io/tutorials/dataviz-beautification/

theme_niwot <- function(){
  theme_bw() +
    theme(axis.text.y = element_text(size = 12),
          axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12), 
          axis.title = element_text(size = 14),
          axis.line.x = element_line(color="black"), 
          axis.line.y = element_line(color="black"),
          panel.border = element_blank(),
          panel.grid.major.x = element_blank(),                                          
          panel.grid.minor.x = element_blank(),
          panel.grid.minor.y = element_blank(),
          panel.grid.major.y = element_blank(),  
          plot.margin = unit(c(1, 1, 1, 1), units = , "cm"),
          plot.title = element_text(size = 18, vjust = 1, hjust = 0),
          legend.text = element_text(size = 12),          
          legend.title = element_blank(),                              
          legend.position = c(0.95, 0.15), 
          legend.key = element_blank(),
          legend.background = element_rect(color = "black", 
                                           fill = "transparent", 
                                           size = 2, linetype = "blank"))
}

ggplot(results.summary, aes(x = Tissue)) +
    stat_count(alpha = 0.6, fill = "palegreen4") +
  theme_niwot() + 
  scale_y_log10() +
  xlab("Cell Types") +
  ylab("Number of Genes enriched")

Fig 6.1: Castel Dataset tissue enrichement distribution

ggplot(results.summary, aes(x = Group)) +
    stat_count(alpha = 0.6, fill = "slateblue") +
  theme_niwot() + 
  xlab("Enrichment Type") +
  ylab("Number of Genes enriched")

Fig 6.2: Castel Dataset tissue enrichement results

Xiang

Xiang et. al. dataset was generated, after downloading the SRA datasets from NCBI BioProject# PRJNA562548.

Download SRA datasets

while read a b; do
python3 enaBrowserTools-1.6/python3/enaDataGet.py \
  -f fastq \
  -as ~/.aspera_setting.ini \
  $a;
done < assets/PRJNA562548.txt

Genome/annotation

wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.primary_assembly.annotation.gtf.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v37.primary_assembly.annotation.gtf.gz
gunzip GRCh38.primary_assembly.genome.fa.gz

Mapping

Script to index (star-index.sh):

#!/bin/bash
module load star
GTF=gencode.v37.primary_assembly.annotation.gtf
FASTA=GRCh38.primary_assembly.genome.fa
STAR --runThreadN 36 \
     --runMode genomeGenerate \
     --genomeDir GRCh38 \
     --sjdbOverhang 100 \
     --sjdbGTFfile ${GTF} \
     --genomeFastaFiles ${FASTA}

Script to Map/Count (star-map.sh):

 #!/bin/bash
module load star
index=/work/LAS/geetu-lab/arnstrm/amnion-rnaseq/lo-etal/GRCh38
read1=$1
cpus=36
out=$(basename ${read1} | sed 's/.fastq.gz//g')
STAR \
--runThreadN 36 \
--genomeDir $index \
--outSAMtype BAM SortedByCoordinate \
--quantMode GeneCounts \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 \
--outFileNamePrefix ${out}_ \
--readFilesCommand zcat \
--readFilesIn ${read1}

Prepare jobs:

find $(pwd) -name ".fastq.gz" > input.fofn

Slurm job script (star.slurm):

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --time=8:00:00
#SBATCH --job-name=star
#SBATCH --output=nova-%x.%j.out
#SBATCH --error=nova-%x.%j.err
#SBATCH --mail-user=arnstrm@gmail.com
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
IFS=$'\n' read -d '' -r -a LINES < input.fofn
LINE=${LINES[$SLURM_ARRAY_TASK_ID]}
echo -e "$(date +"%D  %r")\tthis command is ${LINE}"
./star-map.sh ${LINE}

Submit jobs:

sbatch --array=0-554 star.slurm

Counts

for f in *_ReadsPerGene.out.tab; do
g=$(echo $f |sed 's/_ReadsPerGene.out.tab//g');
echo -e "AA_geneids\t$g" > ${g}_counts.txt;
tail -n +5 $f | cut -f1,4 >> ${g}_counts.txt;
done
join_files.sh *_counts.txt | grep -v "_counts.txt" | sort -k1,1 > counts-555.txt
# get geneids (protein coding only) from GTF file
gtf=gencode.v37.basic.annotation.gtf
awk '$3=="gene"' $gtf |\
  cut -f 9 |\
  awk '$4 ~ /protein_coding/ {print $2}'|\
  sed -e 's/";//g' -e sed 's/"//g' > gene.ids
#filter coutns to retain only counts for geneids
grep -Fw -f gene.ids counts-555.txt > combined_555_rawCounts.txt

Generate TPM

From the raw counts, TPM table was generated using the raw counts. First normalize.R script was saved as follows:

#! /usr/bin/env Rscript
# load the module
# r-devtools
# r-tidyr
# r-dplyr
# module load r-devtools r-tidyr r-dplyr
## functions for rpkm and tpm
## from https://gist.github.com/slowkow/c6ab0348747f86e2748b#file-counts_to_tpm-r-L44
## from https://www.biostars.org/p/171766/
rpkm <- function(counts, lengths) {
  rate <- counts / lengths
  rate / sum(counts) * 1e9
}
tpm <- function(counts, lengths) {
  rate <- counts / lengths
  rate / sum(rate) * 1e6
}
## read table from featureCounts output
args <- commandArgs(T)
tag <- tools::file_path_sans_ext(args[1])
ftr.cnt <- read.table(args[1], sep="\t", stringsAsFactors=FALSE,
  header=TRUE)
library(dplyr)
library(tidyr)
ftr.rpkm <- ftr.cnt %>%
  gather(sample, cnt, 7:ncol(ftr.cnt)) %>%
  group_by(sample) %>%
  mutate(rpkm=rpkm(cnt, Length)) %>%
  select(-cnt) %>%
  spread(sample,rpkm)
write.table(ftr.rpkm, file=paste0(tag, "_rpkm.txt"), sep="\t", row.names=FALSE, quote=FALSE)
ftr.tpm <- ftr.cnt %>%
  gather(sample, cnt, 7:ncol(ftr.cnt)) %>%
  group_by(sample) %>%
  mutate(tpm=tpm(cnt, Length)) %>%
  select(-cnt) %>%
  spread(sample,tpm)
write.table(ftr.tpm, file=paste0(tag, "_tpm.txt"), sep="\t", row.names=FALSE, quote=FALSE)

second, the code was excuted as follows:

./normalize.R combined_555_rawCounts.txt

This generates two files: combined_555_rawCounts_tpm.txt and combined_555_rawCounts_rpkm.txt.

Processing TPM

cts.xiang <-
  as.matrix(read.csv(
    "combined_555_rawCounts_tpm.txt",
    sep = "\t",
    row.names = "Geneid"
  ))
md.xiang <-
  read.csv(
    "assets/metadata_555.txt",
    sep = "\t",
    header = FALSE,
    row.names = 1
  )
mycts.t <- as.data.frame(t(cts.xiang))
metadata <- md.xiang
colnames(metadata) <- c("days", "celltypes")
metadata <- metadata[2]
merged.table <- merge(metadata, mycts.t, by = "row.names", all.x = TRUE)
merged.table <-
  merged.table %>% remove_rownames %>% column_to_rownames(var = "Row.names")
mycts.mean <- aggregate(. ~ celltypes, merged.table, mean)
ordered <-
  as.data.frame(t(
    mycts.mean %>% remove_rownames %>% column_to_rownames(var = "celltypes")
  ))
se <-
  SummarizedExperiment(
    assays = SimpleList(as.matrix(ordered)),
    rowData = row.names(ordered),
    colData = colnames(ordered)
  )
te.dataset.xiang <-
  teGeneRetrieval(
    se,
    foldChangeThreshold = 5,
    maxNumberOfTissues = 6,
    expressedGeneThreshold = 1
  )
saveRDS(te.dataset.xiang, file = "te.dataset.xiang.rds")
results.summary <- as.data.frame(assay(te.dataset.xiang))

ggplot(results.summary, aes(x = Tissue)) +
    stat_count(alpha = 0.6, fill = "palegreen4") +
  theme_niwot() + 
  scale_y_log10() +
  xlab("Cell Types") +
  ylab("Number of Genes enriched")

Fig X.1: Xiang Dataset tissue enrichement distribution

ggplot(results.summary, aes(x = Group)) +
    stat_count(alpha = 0.6, fill = "slateblue") +
  theme_niwot() + 
  xlab("Enrichment Type") +
  ylab("Number of Genes enriched")

Fig X.2: Xiang Dataset tissue enrichement results

Save Image file

save.image(file = "customPCE_data.RData")

Session Information

sessionInfo()
#> R version 4.0.5 (2021-03-31)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19044)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] R.utils_2.10.1              R.oo_1.24.0                
#>  [3] R.methodsS3_1.8.1           TissueEnrich_1.10.1        
#>  [5] GSEABase_1.52.1             graph_1.68.0               
#>  [7] annotate_1.68.0             XML_3.99-0.6               
#>  [9] AnnotationDbi_1.52.0        SummarizedExperiment_1.20.0
#> [11] Biobase_2.50.0              GenomicRanges_1.42.0       
#> [13] GenomeInfoDb_1.26.7         IRanges_2.24.1             
#> [15] S4Vectors_0.28.1            BiocGenerics_0.36.1        
#> [17] MatrixGenerics_1.2.1        matrixStats_0.58.0         
#> [19] ensurer_1.1                 forcats_0.5.1              
#> [21] stringr_1.4.0               dplyr_1.0.7                
#> [23] purrr_0.3.4                 readr_2.1.1                
#> [25] tidyr_1.1.4                 tibble_3.1.1               
#> [27] ggplot2_3.3.5               tidyverse_1.3.1            
#> 
#> loaded via a namespace (and not attached):
#>  [1] bitops_1.0-7           fs_1.5.0               lubridate_1.8.0       
#>  [4] bit64_4.0.5            httr_1.4.2             tools_4.0.5           
#>  [7] backports_1.2.1        bslib_0.3.1            utf8_1.2.1            
#> [10] R6_2.5.1               DBI_1.1.2              colorspace_2.0-1      
#> [13] withr_2.4.3            tidyselect_1.1.1       bit_4.0.4             
#> [16] compiler_4.0.5         cli_3.1.1              rvest_1.0.0           
#> [19] xml2_1.3.3             DelayedArray_0.16.3    labeling_0.4.2        
#> [22] bookdown_0.24          sass_0.4.0             scales_1.1.1          
#> [25] digest_0.6.27          rmarkdown_2.11         XVector_0.30.0        
#> [28] pkgconfig_2.0.3        htmltools_0.5.2        highr_0.9             
#> [31] dbplyr_2.1.1           fastmap_1.1.0          rlang_0.4.11          
#> [34] readxl_1.3.1           rstudioapi_0.13        RSQLite_2.2.9         
#> [37] farver_2.1.0           jquerylib_0.1.4        generics_0.1.1        
#> [40] jsonlite_1.7.3         RCurl_1.98-1.3         magrittr_2.0.1        
#> [43] GenomeInfoDbData_1.2.4 Matrix_1.3-3           Rcpp_1.0.8            
#> [46] munsell_0.5.0          fansi_0.4.2            lifecycle_1.0.1       
#> [49] stringi_1.7.6          yaml_2.2.1             zlibbioc_1.36.0       
#> [52] grid_4.0.5             blob_1.2.2             crayon_1.4.2          
#> [55] lattice_0.20-44        haven_2.4.1            hms_1.1.1             
#> [58] knitr_1.37             pillar_1.6.5           reprex_2.0.0          
#> [61] glue_1.4.2             evaluate_0.14          modelr_0.1.8          
#> [64] vctrs_0.3.8            rmdformats_1.0.3       tzdb_0.2.0            
#> [67] cellranger_1.1.0       gtable_0.3.0           assertthat_0.2.1      
#> [70] cachem_1.0.5           xfun_0.29              xtable_1.8-4          
#> [73] broom_0.7.6            memoise_2.0.1          ellipsis_0.3.2