Section 1: Pre-processing
Downloading datasets
SRA datasets
while read a b; do
python3 enaBrowserTools-1.6/python3/enaDataGet.py \
-f fastq \
-as ~/.aspera_setting.ini \
$a;
done < assets/needed-sra.txt
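Here needed-sra.txt is expected to carry two whitespace-separated columns per line: the run accession (consumed as $a by enaDataGet above) and the study accession (consumed as $b when organizing folders later). A hypothetical layout with placeholder run ids:
SRR0000001 PRJNA276463
ERR0000001 E-MTAB-10429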
Genome/annotation
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.primary_assembly.annotation.gtf.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v37.primary_assembly.annotation.gtf.gz
gunzip GRCh38.primary_assembly.genome.fa.gz
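An optional sanity check that the downloads decompressed cleanly, e.g. count the sequences in the FASTA and peek at the first GTF lines:
grep -c ">" GRCh38.primary_assembly.genome.fa
head -n 5 gencode.v37.primary_assembly.annotation.gtf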
Mapping
Script to index (star-index.sh):
#!/bin/bash
module load star
GTF=gencode.v37.primary_assembly.annotation.gtf
FASTA=GRCh38.primary_assembly.genome.fa
STAR --runThreadN 36 \
--runMode genomeGenerate \
--genomeDir GRCh38 \
--sjdbOverhang 100 \
--sjdbGTFfile ${GTF} \
--genomeFastaFiles ${FASTA}
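Indexing is a one-time step; a minimal invocation sketch (the mkdir is only needed if your STAR version does not create the genomeDir itself):
mkdir -p GRCh38
chmod +x star-index.sh
./star-index.sh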
Script to Map/Count (star-map.sh):
#!/bin/bash
module load star
index=/work/LAS/geetu-lab/arnstrm/amnion-rnaseq/lo-etal/GRCh38
read1=$1
cpus=36
out=$(basename ${read1} | sed 's/\.fastq\.gz//g')
STAR \
--runThreadN ${cpus} \
--genomeDir ${index} \
--outSAMtype BAM SortedByCoordinate \
--quantMode GeneCounts \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 \
--outFileNamePrefix ${out}_ \
--readFilesCommand zcat \
--readFilesIn ${read1}
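The script takes one FASTQ per invocation; with STAR's standard output naming, a hypothetical run on sample.fastq.gz yields sample_Aligned.sortedByCoord.out.bam and sample_ReadsPerGene.out.tab:
./star-map.sh /path/to/sample.fastq.gz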
Prepare jobs:
find $(pwd) -name "*.fastq.gz" > input.fofn
Slurm job script (star.slurm):
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --time=8:00:00
#SBATCH --job-name=star
#SBATCH --output=nova-%x.%j.out
#SBATCH --error=nova-%x.%j.err
#SBATCH --mail-user=arnstrm@gmail.com
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
IFS=$'\n' read -d '' -r -a LINES < input.fofn
LINE=${LINES[$SLURM_ARRAY_TASK_ID]}
echo -e "$(date +"%D %r")\tthis command is ${LINE}"
./star-map.sh ${LINE}
Submit jobs:
sbatch --array=0-72 star.slurm
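The upper array bound must match the number of lines in input.fofn (73 files here, zero-indexed). To derive it rather than hard-code it:
sbatch --array=0-$(( $(wc -l < input.fofn) - 1 )) star.slurm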
Counts
for f in *_ReadsPerGene.out.tab; do
g=$(echo $f |sed 's/_ReadsPerGene.out.tab//g');
echo -e "AA_geneids\t$g" > ${g}_counts.txt;
tail -n +5 $f | cut -f1,4 >> ${g}_counts.txt;
done
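Column 4 of ReadsPerGene.out.tab holds reverse-stranded counts (column 2 is unstranded, column 3 forward-stranded), as produced by typical dUTP-based library preps. If strandedness is in doubt, comparing column totals is a quick check:
for f in *_ReadsPerGene.out.tab; do
tail -n +5 $f | awk -v f=$f '{u+=$2; fwd+=$3; rev+=$4} END {print f, u, fwd, rev}';
done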
join_files.sh *_counts.txt | grep -v "_counts.txt" | sort -k1,1 > counts-full.txt
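join_files.sh is a helper script not shown here; a minimal stand-in, assuming every *_counts.txt lists genes in the identical order (true for STAR counts produced against one annotation):
paste *_counts.txt | awk 'BEGIN {OFS="\t"} {out=$1; for (i=2; i<=NF; i+=2) out=out OFS $i; print out}' | sort -k1,1 > counts-full.txt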
# get gene ids (protein coding only) from the GTF file
gtf=gencode.v37.primary_assembly.annotation.gtf
awk '$3=="gene"' $gtf |\
cut -f 9 |\
awk '$4 ~ /protein_coding/ {print $2}' |\
sed -e 's/";//g' -e 's/"//g' > gene.ids
# filter counts to retain only counts for gene ids
grep -Fw -f gene.ids counts-full.txt > coding-counts.txt
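As a sanity check, coding-counts.txt should have roughly as many rows as gene.ids (about 20,000 protein-coding genes in GENCODE v37):
wc -l gene.ids coding-counts.txt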
# organize folders
while read a b; do
mkdir -p ${b};
mv ${a}_* ${b}/;
done < needed-sra.txt
# rename folders:
mv E-MTAB-10429 E-MTAB-10429_Sheridan
mv PRJNA276463 PRJNA276463_Roost
mv PRJNA294733 PRJNA294733_Yabe
mv PRJNA316992 PRJNA316992_Suzuki
mv PRJNA352339 PRJNA352339_Shao
mv PRJNA605646 PRJNA605646_Io
mv PRJNA414247 PRJNA414247_Krendl
# run multiqc
for dir in E-MTAB-10429_Sheridan PRJ*; do
cd $dir;
multiqc .;
cp multiqc_report.html ../${dir}_multiqc.html;
cd ..
done
Subset counts
Using the srr-to-labels.txt file, the SRR numbers in the master counts file were converted to informative names. From this master counts table, subset datasets were derived using the cut command (a sketch follows the renaming loop below).
cp coding-counts.txt coding-counts-renamed.txt
while read a b; do
sed -i "s/$a/$b/g" coding-counts-renamed.txt
done < srr-to-labels.txt
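For example, a subset can then be pulled by column position; the column numbers below are illustrative, so list the header first to pick the right ones:
head -1 coding-counts-renamed.txt | tr '\t' '\n' | cat -n
cut -f1,2,5,8 coding-counts-renamed.txt > subset-counts.txt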