-
Notifications
You must be signed in to change notification settings - Fork 0
/
differential_gene_expression.Rmd
172 lines (126 loc) · 5.55 KB
/
differential_gene_expression.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
---
title: "Differential gene expression analysis"
author: "Mari-Lee Odendaal"
date: "`r Sys.time()`"
---
## Load R-packages
```{r message=F}
library(tximport);library(DESeq2);library(csaw);library(pheatmap);library(tidyverse);library(here);library(glue);library(DEFormats);library(colorspace);library(edgeR)
```
```{r setup, include=F, message=F}
subdir_name <- "differential_gene_expression"
# set paths
knitr::opts_knit$set(root.dir=".", aliases=c(h = "fig.height", w = "fig.width", ow = "out.width"))
knitr::opts_chunk$set(dev=c('png', 'pdf'),
fig.path=here("results", "figures", glue("{subdir_name}/")),
dpi=300)
```
```{r knit, echo=F, eval=FALSE}
rmarkdown::render(input = here("scripts", str_c(subdir_name, ".Rmd")), output_dir = here("results"))
```
## Input files
```{r message=F}
dir = here::here("input", "adult")
samples <- read.table(file.path(dir, "samplesheet.txt"), header = TRUE)
files <- file.path(dir, "Salmon", samples$File)
names(files) <- paste0("D", 7:12, ".tabular")
tx2gene <- read.table(file.path(dir, "gene_trans_map")) %>% .[,c(2,1)]
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
sample_tb <- data.frame(condition = factor(samples$Condition))
rownames(sample_tb) <- colnames(txi$counts)
dds <- DESeqDataSetFromTximport(txi, sample_tb, ~condition)
```
Salmon files are imported to summarize the abundances, counts, and transcript lengths to gene-level.
## Filter lowly expressed genes
```{r}
keep <- rowSums(cpm(dds)>=0.5) >= 2
dds <- dds[keep,]
```
Genes with an expression lower than 0.5 counts per million in at least 2 of the samples are removed from the data.
## Differential gene expression analysis with DESeq
```{r message=F}
dds <- DESeq(dds)
```
```{r}
res_pvsp <- results(dds, contrast=c("condition","Present","Past"), pAdjustMethod = "BH", alpha = 0.05, lfcThreshold = 0.5)
```
The differential gene expression analysis is performed for the comparison present vs past using a log2 fold change threshold of 0.5.
```{r}
res_fvsp <- results(dds, contrast=c("condition","Future","Present"), pAdjustMethod = "BH", alpha = 0.05, lfcThreshold = 0.5)
```
The differential gene expression analysis is performed for the comparison future vs present using a log2 fold change threshold of 0.5.
## Pre-processing for visualization of the DE genes
```{r}
res_pvsp$padj[is.na(res_pvsp$padj)] <- 1
keep <- res_pvsp$padj < 0.05
res_pvsp_sig <- res_pvsp[keep,] %>% as.data.frame()
res_pvsp_sig$gene <- rownames(res_pvsp_sig)
res_fvsp$padj[is.na(res_fvsp$padj)] <- 1
keep <- res_fvsp$padj < 0.05
res_fvsp_sig <- res_fvsp[keep,] %>% as.data.frame()
res_fvsp_sig$gene <- rownames(res_fvsp_sig)
res_all <- merge(res_pvsp_sig, res_fvsp_sig, by = "gene", all = T)
```
## Filter genes with extreme within group variation
```{r message=F}
df <- rownames_to_column(as.data.frame(counts(dds)), var = "gene") %>% as_tibble() %>% pivot_longer(c(2:7), names_to = "condition") %>% mutate(condition = ifelse(condition == samples$File, samples$Condition, NA)) %>% group_by(gene, condition) %>% summarise(diff = max(value)-min(value), fold = max(value)/min(value))
genes_rm <- unique(df[(df$diff > 300) & df$fold > 20,]$gene)
dds <- dds[!rownames(as.data.frame(cpm(dds))) %in% genes_rm,]
```
Genes with a high variation within groups are removed from the data.
```{r}
saveRDS(cpm(dds), file = "cpm_adults.RDS")
```
The filtered dataset is normalized by implementing counts per million, this is used for the correlation analysis
## Visualization of the DE genes
```{r}
y = as.DGEList(dds)
se <- SummarizedExperiment(assays = list(counts = y$counts, offset = y$offset))
se$totals <- y$samples$lib.size
cpms <- calculateCPM(se, use.offsets = TRUE, log = T)
```
The filtered data is normalized by implementing counts per million taking into account the offset and library size.
```{r}
cpms_sig <- subset(cpms, rownames(cpms) %in% res_all$gene)
colnames(cpms_sig) <- c("Future S1","Future S2","Past S1","Past S2","Present S1","Present S2")
cpms_sig2 <- t(scale(t(cpms_sig)))
cpms_sig2 <- cpms_sig2[,c(3,4,5,6,1,2)]
```
Rows are scaled for visualization
```{r}
col <- rev(diverging_hcl(15, "Red-Green"))
```
```{r heatmap, ow = '90%', h = 6, w = 5}
hm_all <- pheatmap(cpms_sig2, col=col, show_rownames = T, cluster_cols = F,cellwidth = 18, cellheight = 9)
```
## Pre-processing for GO MWU analysis
```{r}
res_pvsp2 = subset(res_pvsp, select = c(log2FoldChange,pvalue)) %>% as.data.frame()
res_pvsp2 <- res_pvsp2[!rownames(res_pvsp2) %in% genes_rm,]
res_pvsp2 <- res_pvsp2 %>% mutate(
direction = sqrt(log2FoldChange*log2FoldChange)/log2FoldChange*-1,
direction = ifelse(is.na(direction), 0, direction),
gene = rownames(.),
logp = log(pvalue,10)*direction) %>%
subset(., select = c(gene, logp))
```
```{r}
res_fvsp2 = subset(res_fvsp, select = c(log2FoldChange,pvalue)) %>% as.data.frame()
res_fvsp2 <- res_fvsp2[!rownames(res_fvsp2) %in% genes_rm,]
res_fvsp2 <- res_fvsp2 %>% mutate(
direction = sqrt(log2FoldChange*log2FoldChange)/log2FoldChange*-1,
direction = ifelse(is.na(direction), 0, direction),
gene = rownames(.),
logp = log(pvalue,10)*direction) %>%
subset(., select = c(gene, logp))
```
```{r}
write.table(res_pvsp2, file = "pvsp_adult.csv", append = FALSE, sep = ",", dec = ".", row.names = F, col.names = T)
write.table(res_fvsp2, file = "fvsp_adult.csv", append = FALSE, sep = ",", dec = ".", row.names = F, col.names = T)
```
All the genes with their log transformed P-value are used in the GO-MWU analysis (https://github.com/z0on/GO_MWU).
*The same script was implemented for the juveniles
## Session information
```{r}
sessionInfo()
```