Compare the effects (betas) from differential expression results for NGT vs T2D and insulin from genes in GO:0022904 (respiratory electron transport chain)
library(knitr)
library(ggplot2)
## Loading required package: methods
library(qvalue)
##
## Attaching package: 'qvalue'
##
## The following object is masked from 'package:ggplot2':
##
## qplot
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/cat2gene.RData")
genes <- cat2gene[["GO:0022904"]]
gene_names <- read.table("/net/snowwhite/home/beckandy/geneConversion.tab", sep="\t", header=T)
insu <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/traits/peer_k03_S_Insu_all_genes.txt", header=T, as.is=T)
t2d <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/ngt_vs_t2d/peer_k03_all_genes.txt", header=T, as.is=T)
insu <- insu[order(insu$p.value),]
t2d <- t2d[order(t2d$p.value),]
insu$q.value <- qvalue(insu$p.value)$qvalues
t2d$q.value <- qvalue(t2d$p.value)$qvalues
insu$rank <- seq(1,length(insu$p.value))
t2d$rank <- seq(1,length(t2d$p.value))
insu$DE <- insu$rank <= 1000
t2d$DE <- t2d$rank <= 1000
insu$gene <- as.character(sapply(insu$gene,function(x)strsplit(x,"[.]")[[1]][1]))
t2d$gene <- as.character(sapply(t2d$gene,function(x)strsplit(x,"[.]")[[1]][1]))
rownames(insu) <- insu$gene
rownames(t2d) <- t2d$gene
keep <- intersect(insu$gene, t2d$gene)
genes <- intersect(genes, keep)
insu <- insu[genes,]
t2d <- t2d[genes,]
#insu <- left_join(insu, gene_names[,-1], by=c("gene" = "gene_id_noversion"))
#t2d <- left_join(t2d, gene_names[,-1], by=c("gene" = "gene_id_noversion"))
Sanity check: do rows (genes) match between the two data frames?
print(sum(rownames(t2d) != rownames(insu)) == 0)
## [1] TRUE
Merge results into one data frame and save
#insu$DE <- as.numeric(insu$DE)
#t2d$DE <- as.numeric(t2d$DE)
betas <- data.frame(cbind.data.frame(insu$effect, t2d$effect, insu$DE, t2d$DE, insu$p.value, t2d$p.value, insu$q.value, t2d$q.value))
rownames(betas) <- genes
names(betas) <- c("insu_beta", "t2d_beta", "insu_de", "t2d_de", "insu_p","t2d_p", "insu_q", "t2d_q")
#0 if no DE, 1 if DE in insulin, 2 if DE in T2D, 3 if DE in both
betas$DE <- betas$insu_de + (2*betas$t2d_de)
#write.csv(betas, file="/net/csgsites/csg/beckandy/reports/resp_elec_effects.csv")
Some simple graphs and stats
kable(summary(betas), format="html")
| insu_beta | t2d_beta | insu_de | t2d_de | insu_p | t2d_p | insu_q | t2d_q | DE | |
|---|---|---|---|---|---|---|---|---|---|
| Min. :-0.27979 | Min. :-0.7171 | Mode :logical | Mode :logical | Min. :0.000000 | Min. :0.0000188 | Min. :0.00000 | Min. :0.03859 | Min. :0.0000 | |
| 1st Qu.:-0.00907 | 1st Qu.:-0.5132 | FALSE:90 | FALSE:73 | 1st Qu.:0.001188 | 1st Qu.:0.0076977 | 1st Qu.:0.00489 | 1st Qu.:0.15273 | 1st Qu.:0.0000 | |
| Median : 0.06890 | Median :-0.3917 | TRUE :15 | TRUE :32 | Median :0.043490 | Median :0.0453922 | Median :0.06737 | Median :0.29344 | Median :0.0000 | |
| Mean : 0.09011 | Mean :-0.3127 | NA’s :0 | NA’s :0 | Mean :0.232388 | Mean :0.1655041 | Mean :0.15350 | Mean :0.32738 | Mean :0.7524 | |
| 3rd Qu.: 0.18361 | 3rd Qu.:-0.1701 | NA | NA | 3rd Qu.:0.359478 | 3rd Qu.:0.1939692 | 3rd Qu.:0.27270 | 3rd Qu.:0.46518 | 3rd Qu.:2.0000 | |
| Max. : 0.42375 | Max. : 0.4342 | NA | NA | Max. :0.979925 | Max. :0.9471544 | Max. :0.48598 | Max. :0.74329 | Max. :3.0000 |
kable(cor(betas[,1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | 1.0000000 | 0.0287681 |
| t2d_beta | 0.0287681 | 1.0000000 |
kable(t(as.matrix(table(betas$DE+1))), format="html")
| 1 | 2 | 3 | 4 |
|---|---|---|---|
| 59 | 14 | 31 | 1 |
p1 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(DE+1)))+geom_point(size=3)+geom_hline(aes(yintercept=0))+geom_vline(aes(xintercept=0))+
geom_abline(slope=1,intercept=0)
print(p1)
Looks like there are two groups in the plot? Lets try kmeans clustering
First, determine how many clusters we should use
wss <- (nrow(betas)-1)*sum(apply(betas,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(betas,centers=i, nstart=1000)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")
Let's try 4
clus <- kmeans(betas, 4, nstart=2500)
betas <- cbind(betas, clus$cluster)
names(betas)[6] <- "cluster"
p2 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(cluster)))+geom_point(size=3)+stat_smooth(method="lm")
print(p2)
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
How many genes in each cluster?
kable(t(as.matrix(table(betas$clus))), format="html")
kable(cor(betas[which(betas$clus==1),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | NA | NA |
| t2d_beta | NA | NA |
kable(cor(betas[which(betas$clus==2),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | NA | NA |
| t2d_beta | NA | NA |
kable(cor(betas[which(betas$clus==3),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | NA | NA |
| t2d_beta | NA | NA |
kable(cor(betas[which(betas$clus==4),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | NA | NA |
| t2d_beta | NA | NA |
p3 <- p1+geom_abline(slope=1,intercept=-0.25)
print(p3)
betas$cut <- apply(betas,1,function(x){as.numeric(x[2] < (-0.25+x[1]))})
p4 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(cut)))+geom_point(size=3)+geom_abline(slope=1, intercept=-0.25)+stat_smooth(method="lm")
print(p4)
kable(cor(betas[which(betas$cut==0),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | 1.0000000 | 0.5807195 |
| t2d_beta | 0.5807195 | 1.0000000 |
kable(cor(betas[which(betas$cut==1),1:2]), format="html")
| insu_beta | t2d_beta | |
|---|---|---|
| insu_beta | 1.0000000 | 0.4195754 |
| t2d_beta | 0.4195754 | 1.0000000 |
betas$gene_id_noversion <- rownames(betas)
betas <- left_join(betas,gene_names[-1,])
## Joining by: "gene_id_noversion"
## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector
betas$gene_id_noversion <- NULL
write.csv(betas, file="/net/csgsites/csg/beckandy/reports/resp_elec_effects.csv")