Oxidation-Reduction Genes

Compare the effects (betas) from differential expression results for NGT vs T2D and insulin from genes in GO:0055114 (oxidation-reduction process)

library(knitr)
library(ggplot2)
## Loading required package: methods
library(qvalue)
## 
## Attaching package: 'qvalue'
## 
## The following object is masked from 'package:ggplot2':
## 
##     qplot
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/cat2gene.RData")
genes <- cat2gene[["GO:0055114"]]

gene_names <- read.table("/net/snowwhite/home/beckandy/geneConversion.tab", sep="\t", header=T)

insu <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/traits/peer_k03_S_Insu_all_genes.txt", header=T, as.is=T)
t2d <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/ngt_vs_t2d/peer_k03_all_genes.txt", header=T, as.is=T)

insu <- insu[order(insu$p.value),]
t2d <- t2d[order(t2d$p.value),]

insu$q.value <- qvalue(insu$p.value)$qvalues
t2d$q.value <- qvalue(t2d$p.value)$qvalues

insu$rank <- seq(1,length(insu$p.value))
t2d$rank <- seq(1,length(t2d$p.value))
insu$DE <- insu$rank <= 1000
t2d$DE <- t2d$rank <= 1000

insu$gene <- as.character(sapply(insu$gene, function(x)strsplit(x,"[.]")[[1]][1]))
t2d$gene <- as.character(sapply(t2d$gene, function(x)strsplit(x,"[.]")[[1]][1]))

rownames(insu) <- insu$gene
rownames(t2d) <- t2d$gene

keep <- intersect(rownames(insu),rownames(t2d))
keep <- intersect(keep, genes)

insu <- insu[keep,]
t2d <- t2d[keep,]

genes <- keep

#insu <- left_join(insu, gene_names[,-1], by=c("gene" = "gene_id_noversion"))
#t2d <- left_join(t2d, gene_names[,-1], by=c("gene" = "gene_id_noversion"))

Sanity check: do rows (genes) match between the two data frames?

print(sum(rownames(t2d) != rownames(insu)) == 0)
## [1] TRUE

Merge results into one data frame and save

#insu$DE <- as.numeric(insu$DE)
#t2d$DE <- as.numeric(t2d$DE)

betas <- data.frame(cbind.data.frame(insu$effect, t2d$effect, insu$DE, t2d$DE, insu$p.value, t2d$p.value, insu$q.value, t2d$q.value))
rownames(betas) <- genes
names(betas) <- c("insu_beta", "t2d_beta", "insu_de", "t2d_de", "insu_p","t2d_p", "insu_q", "t2d_q")

#0 if no DE, 1 if DE in insulin, 2 if DE in T2D, 3 if DE in both
betas$DE <- betas$insu_de + (2*betas$t2d_de)

#write.csv(betas, file="/net/csgsites/csg/beckandy/reports/oxReduc_effects.csv")

Some simple graphs and stats

kable(summary(betas), format="html")
insu_beta t2d_beta insu_de t2d_de insu_p t2d_p insu_q t2d_q DE
Min. :-0.48438 Min. :-0.74469 Mode :logical Mode :logical Min. :0.000000 Min. :0.0000188 Min. :0.00000 Min. :0.03859 Min. :0.0000
1st Qu.:-0.08124 1st Qu.:-0.27183 FALSE:817 FALSE:805 1st Qu.:0.004879 1st Qu.:0.0669990 1st Qu.:0.01401 1st Qu.:0.33349 1st Qu.:0.0000
Median : 0.01250 Median :-0.07240 TRUE :78 TRUE :90 Median :0.089256 Median :0.2813077 Median :0.10962 Median :0.52651 Median :0.0000
Mean : 0.01708 Mean :-0.06881 NA’s :0 NA’s :0 Mean :0.253188 Mean :0.3542650 Mean :0.17026 Mean :0.48731 Mean :0.2883
3rd Qu.: 0.11592 3rd Qu.: 0.14350 NA NA 3rd Qu.:0.445455 3rd Qu.:0.5892785 3rd Qu.:0.31103 3rd Qu.:0.65718 3rd Qu.:0.0000
Max. : 0.42375 Max. : 0.71693 NA NA Max. :0.999302 Max. :0.9978006 Max. :0.49107 Max. :0.75316 Max. :3.0000
kable(cor(betas[,1:2]), format="html")
insu_beta t2d_beta
insu_beta 1.0000000 0.2354465
t2d_beta 0.2354465 1.0000000

Key

  1. Not DE in either
  2. DE in insulin only
  3. DE in T2D only
  4. DE in both

Number of DE genes

kable(t(as.matrix(table(betas$DE+1))), format="html")
1 2 3 4
739 66 78 12
p1 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(DE+1)))+geom_point(size=3)+geom_hline(aes(yintercept=0))+geom_vline(aes(xintercept=0))+
    geom_abline(slope=1,intercept=0)
print(p1)

plot of chunk unnamed-chunk-7

Looks like there are two groups in the plot? Lets try kmeans clustering

First, determine how many clusters we should use

wss <- (nrow(betas)-1)*sum(apply(betas,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(betas,centers=i, nstart=1000)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")

plot of chunk unnamed-chunk-8

Let's try 4

clus <- kmeans(betas, 4, nstart=2500)
betas <- cbind(betas, clus$cluster)
names(betas)[6] <- "cluster"
p2 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(cluster)))+geom_point(size=3)+stat_smooth(method="lm")
print(p2)
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?

plot of chunk unnamed-chunk-9

How many genes in each cluster?

kable(t(as.matrix(table(betas$clus))), format="html")
kable(cor(betas[which(betas$clus==1),1:2]), format="html")
insu_beta t2d_beta
insu_beta NA NA
t2d_beta NA NA
kable(cor(betas[which(betas$clus==2),1:2]), format="html")
insu_beta t2d_beta
insu_beta NA NA
t2d_beta NA NA
kable(cor(betas[which(betas$clus==3),1:2]), format="html")
insu_beta t2d_beta
insu_beta NA NA
t2d_beta NA NA
kable(cor(betas[which(betas$clus==4),1:2]), format="html")
insu_beta t2d_beta
insu_beta NA NA
t2d_beta NA NA
betas$gene_id_noversion <- rownames(betas)
betas <- left_join(betas,gene_names[-1,])
## Joining by: "gene_id_noversion"
## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector
betas$gene_id_noversion <- NULL
betas$clus <- NULL
write.csv(betas, file="/net/csgsites/csg/beckandy/reports/oxReduc_effects.csv")