Human dataset RNAseq analyses
Files copy
#copy commandsPackages
library(data.table)
library(tidyverse)
library(sva)
library(DESeq2)
library(pheatmap)
library(viridis)
library(biomaRt)
library(ggvenn)
library(ggpubr)Input
setwd("/work/LAS/geetu-lab/arnstrm/HumanDatasets")
bewo <- fread("assets/BeWo_PLAC1_KD_counts_genes.tsv")
CT29 <- fread("assets/CT29_ST3D_PLAC1_KD_counts_genes.tsv")
colnames(CT29) <-
c(
"Geneid",
"CT29_SCR.1",
"CT29_SCR.2",
"CT29_SCR.3",
"CT29_shPLAC1.1",
"CT29_shPLAC1.2",
"CT29_shPLAC1.3"
)
CT27 <- fread("assets/hTS_CT27_ST3D_PLAC1_KD_counts_genes.tsv")
colnames(CT27) <-
c(
"Geneid",
"CT27_PLAC1.4_1",
"CT27_PLAC1.4_2",
"CT27_PLAC1.4_3",
"CT27_SCR_1",
"CT27_SCR_2",
"CT27_SCR_3"
)
bewo_CT27 <- left_join(bewo, CT27, by = "Geneid")
bewo_CT27_CT29 <- left_join(bewo_CT27, CT29, by = "Geneid")
counts <- bewo_CT27_CT29 %>%
as_tibble() %>%
column_to_rownames(var = "Geneid") %>%
as.matrix() info <- fread("assets/info.tsv")
info <- info[c(-1),]
names <- info %>%
column_to_rownames("name") %>%
mutate(across(where(is.character),
as.factor))Read previous experiment DE list
# human datasets
makeFilePath <- function(fileName = "", projDir = "") {
paste("/work/LAS/geetu-lab/arnstrm",
projDir,
"assets",
fileName,
sep = "/")
}
BeWo.de <- makeFilePath(fileName = "DESeq2results-KDvsCT_fc.tsv",
projDir = "BeWo_PLAC1_KD")
CT29.de <-
makeFilePath(fileName = "DESeq2results-PLAC1vsSCR_fc.tsv",
projDir = "CT29_ST3D_PLAC1_KD")
CT27.de <-
makeFilePath(fileName = "DESeq2results-PLAC1vsSCR_fc.tsv",
projDir = "hTS_CT27_ST3D_PLAC1_KD")
BeWo.obj <- makeFilePath(fileName = "BeWo_PLAC1_KD_genelists.RData",
projDir = "BeWo_PLAC1_KD")
CT29.obj <-
makeFilePath(fileName = "CT29_ST3D_PLAC1_KD_KD_genelists.RData",
projDir = "CT29_ST3D_PLAC1_KD")
CT27.obj <-
makeFilePath(fileName = "hTS_CT27_ST3D_PLAC1_KD_genelists.RData",
projDir = "hTS_CT27_ST3D_PLAC1_KD")
de.BeWo <- fread(BeWo.de)
colnames(de.BeWo)[3] <- "BeWo.log2FC"
de.CT29 <- fread(CT29.de)
colnames(de.CT29)[3] <- "CT29.log2FC"
de.CT27 <- fread(CT27.de)
colnames(de.CT27)[3] <- "CT27.log2FC"
load(BeWo.obj)
BeWo_PLAC1_KD.KDvsCT.dw.pce1 <- KDvsCT.dw.pce1
BeWo_PLAC1_KD.KDvsCT.up.pce1 <- KDvsCT.up.pce1
load(CT29.obj)
CT29_ST3D_PLAC1_KD.PLAC1vsSCR.dw.pce1 <- PLAC1vsSCR.dw.pce1
CT29_ST3D_PLAC1_KD.PLAC1vsSCR.up.pce1 <- PLAC1vsSCR.up.pce1
load(CT27.obj)
hTS_CT27_ST3D_PLAC1_KD.PLAC1vsSCR.up.pce1 <- PLAC1vsSCR.up.pce1
hTS_CT27_ST3D_PLAC1_KD.PLAC1vsSCR.dw.pce1 <- PLAC1vsSCR.dw.pce1Batch correction
all(rownames(names) %in% colnames(counts))
#> [1] TRUE
counts <- counts[, rownames(names)]
cov1 <- as.factor(names$expt)
adjusted_counts <- ComBat_seq(counts, batch = cov1, group = NULL)
#> Found 3 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the dataDESeq2
dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
colData = names,
design = contrasts ~ sample)
#> converting counts to integer mode
vst <- assay(vst(dds))
vsd <- vst(dds, blind = FALSE)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds <- DESeq(dds)
#> estimating size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testingPCA
pcaData <- plotPCA(vsd,
intgroup = c("expt", "sample"),
returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
rv <- rowVars(assay(vsd))
select <- order(rv,
decreasing = TRUE)[seq_len(min(500,
length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
intgroup = c("expt", "sample")
intgroup.df <- as.data.frame(colData(vsd)[, intgroup, drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
d <- data.frame(
PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
PC3 = pca$x[, 3],
PC4 = pca$x[, 4],
PC5 = pca$x[, 5],
group = group,
intgroup.df,
name = colnames(vsd)
)PC1 vs PC2
ggplot(d, aes(PC1, PC2, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC1 vs PC3
ggplot(d, aes(PC1, PC3, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC1 vs PC4
ggplot(d, aes(PC1, PC4, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC1 vs PC5
ggplot(d, aes(PC1, PC5, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC2 vs PC3
ggplot(d, aes(PC2, PC3, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC2 vs PC4
ggplot(d, aes(PC2, PC4, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC2 vs PC5
ggplot(d, aes(PC2, PC5, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC3 vs PC4
ggplot(d, aes(PC3, PC4, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC3 vs PC5
ggplot(d, aes(PC3, PC5, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))PC4 vs PC5
ggplot(d, aes(PC4, PC5, color = sample, shape = expt)) +
scale_shape_manual(values = 1:3) +
theme_bw() +
theme(legend.title = element_blank()) +
geom_point(size = 2, stroke = 2) +
xlab(paste("PC1", round(percentVar[1] * 100, 2), "% variance")) +
ylab(paste("PC2", round(percentVar[2] * 100, 2), "% variance"))Sample distance
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- colnames(vsd)
colnames(sampleDistMatrix) <- NULL
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = magma(9))Gene Information
ensembl = useMart("ENSEMBL_MART_ENSEMBL")
listDatasets(ensembl) %>%
filter(str_detect(description, "Human"))
ensembl = useDataset("hsapiens_gene_ensembl", mart = ensembl)
listFilters(ensembl) %>%
filter(str_detect(name, "ensembl"))
filterType <- "ensembl_gene_id_version"
head(rownames(counts))
counts <- read.delim("assets/counts_genes.tsv", row.names = 1, header = TRUE)
head(rownames(counts))
filterValues <- rownames(counts)
listAttributes(ensembl) %>%
head(20)
attributeNames <- c('ensembl_gene_id_version',
'ensembl_gene_id',
'external_gene_name')
annot <- getBM(
attributes = attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl
)
attributeNames <- c('ensembl_gene_id_version',
'gene_biotype',
'external_gene_name',
'description')
mart <- getBM(
attributes = attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl
)
write_delim(
annot,
file = "assets/annot.tsv",
delim = "\t"
)
write_delim(
mart,
file = "assets/mart.tsv",
delim = "\t"
) Files were saved, so we don’t query BioMart everytime we run the markdown. The files will be loaded, instead
mart <-
read.csv("assets/mart.tsv",
sep = "\t",
header = TRUE)
annot <-
read.csv("assets/annot.tsv",
sep = "\t",
header = TRUE)Comparing DE genes from CT27, CT29 and BeWo experiments
Downregulated genes (Ct and SCRs)
myCol = viridis(3)
down = list(BeWo = BeWo_PLAC1_KD.KDvsCT.dw.pce1,
CT29 = CT29_ST3D_PLAC1_KD.PLAC1vsSCR.dw.pce1,
CT27 = hTS_CT27_ST3D_PLAC1_KD.PLAC1vsSCR.dw.pce1)
ggvenn(
down,
fill_color = myCol,
stroke_size = 0.5,
set_name_size = 4
)Upregulated genes (KD & PLAC1’s)
mycol = viridis(3)
up = list(BeWo = BeWo_PLAC1_KD.KDvsCT.up.pce1,
CT29 = CT29_ST3D_PLAC1_KD.PLAC1vsSCR.up.pce1,
CT27 = hTS_CT27_ST3D_PLAC1_KD.PLAC1vsSCR.up.pce1)
ggvenn(
up,
fill_color = myCol,
stroke_size = 0.5,
set_name_size = 4
)Top 500 variable genes
myCol = viridis(3)
topVar = list(BeWo = BeWo.var.genes,
CT29 = CT29.var.genes,
CT27 = CT27.var.genes)
ggvenn(
topVar,
fill_color = myCol,
stroke_size = 0.5,
set_name_size = 4
)Create intersecting genelists
getFCtable <-
function(de1 = de.BeWo,
de2 = de.CT27,
int1 = down$BeWo,
int2 = down$CT27) {
intList = intersect(int1, int2)
cond1 <- de1 %>%
mutate(newGene = gsub("\\..*", "", Gene)) %>%
filter(newGene %in% intList) %>%
dplyr::select(external_gene_name, 3) %>%
column_to_rownames("external_gene_name")
cond2 <- de2 %>%
mutate(newGene = gsub("\\..*", "", Gene)) %>%
filter(newGene %in% intList) %>%
dplyr::select(external_gene_name, 3) %>%
column_to_rownames("external_gene_name")
log2FCtable <- merge(cond1,
cond2,
by = "row.names",
all = TRUE)
return(log2FCtable)
}Genes with FC for down
log2FC.dw_BeWo.CT27 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT27,
int1 = down$BeWo,
int2 = down$CT27
)
log2FC.dw_BeWo.CT29 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT29,
int1 = down$BeWo,
int2 = down$CT29
)
log2FC.dw_CT29.CT27 <-
getFCtable(
de1 = de.CT29,
de2 = de.CT27,
int1 = down$CT29,
int2 = down$CT27
)Genes with FC for up
log2FC.up_BeWo.CT27 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT27,
int1 = up$BeWo,
int2 = up$CT27
)
log2FC.up_BeWo.CT29 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT29,
int1 = up$BeWo,
int2 = up$CT29
)
log2FC.up_CT29.CT27 <-
getFCtable(
de1 = de.CT29,
de2 = de.CT27,
int1 = up$CT29,
int2 = up$CT27
)top 500 variable genes
log2FC.vr_BeWo.CT27 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT27,
int1 = gsub("\\..*", "", topVar$BeWo),
int2 = gsub("\\..*", "", topVar$CT27)
)
log2FC.vr_BeWo.CT29 <-
getFCtable(
de1 = de.BeWo,
de2 = de.CT29,
int1 = gsub("\\..*", "", topVar$BeWo),
int2 = gsub("\\..*", "", topVar$CT29)
)
log2FC.vr_CT29.CT27 <-
getFCtable(
de1 = de.CT29,
de2 = de.CT27,
int1 = gsub("\\..*", "", topVar$CT29),
int2 = gsub("\\..*", "", topVar$CT27)
)Correlation plots (down regulated)
BeWo and CT27
ggscatter(
log2FC.dw_BeWo.CT27,
x = "BeWo.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")BeWo and CT29
ggscatter(
log2FC.dw_BeWo.CT29,
x = "BeWo.log2FC",
y = "CT29.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")CT29 and CT27
ggscatter(
log2FC.dw_CT29.CT27,
x = "CT29.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")Correlation plots (up regulated)
BeWo and CT27
ggscatter(
log2FC.up_BeWo.CT27,
x = "BeWo.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")BeWo and CT29
ggscatter(
log2FC.up_BeWo.CT29,
x = "BeWo.log2FC",
y = "CT29.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")CT29 and CT27
ggscatter(
log2FC.up_CT29.CT27,
x = "CT29.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")Correlation plots (most variable genes)
BeWo and CT27
ggscatter(
log2FC.vr_BeWo.CT27,
x = "BeWo.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")BeWo and CT29
ggscatter(
log2FC.vr_BeWo.CT29,
x = "BeWo.log2FC",
y = "CT29.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")CT29 and CT27
ggscatter(
log2FC.vr_CT29.CT27,
x = "CT29.log2FC",
y = "CT27.log2FC",
add = "reg.line",
conf.int = TRUE,
add.params = list(color = "blue",
fill = "lightgray")) +
stat_cor(method = "pearson")