Oxidation-Reduction Genes

Compare the effects (betas) from differential expression results for NGT vs T2D and insulin from genes in GO:0055114 (oxidation-reduction process)

library(knitr)
library(ggplot2)

## Loading required package: methods

library(qvalue)

## 
## Attaching package: 'qvalue'
## 
## The following object is masked from 'package:ggplot2':
## 
##     qplot

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

load("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/cat2gene.RData")
genes <- cat2gene[["GO:0055114"]]

gene_names <- read.table("/net/snowwhite/home/beckandy/geneConversion.tab", sep="\t", header=T)

insu <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/traits/peer_k03_S_Insu_all_genes.txt", header=T, as.is=T)
t2d <- read.table("/net/snowwhite/home/beckandy/tissue/datafreeze4/goseq/ngt_vs_t2d/peer_k03_all_genes.txt", header=T, as.is=T)

insu <- insu[order(insu$p.value),]
t2d <- t2d[order(t2d$p.value),]

insu$q.value <- qvalue(insu$p.value)$qvalues
t2d$q.value <- qvalue(t2d$p.value)$qvalues

insu$rank <- seq(1,length(insu$p.value))
t2d$rank <- seq(1,length(t2d$p.value))
insu$DE <- insu$rank <= 1000
t2d$DE <- t2d$rank <= 1000

insu$gene <- as.character(sapply(insu$gene, function(x)strsplit(x,"[.]")[[1]][1]))
t2d$gene <- as.character(sapply(t2d$gene, function(x)strsplit(x,"[.]")[[1]][1]))

rownames(insu) <- insu$gene
rownames(t2d) <- t2d$gene

keep <- intersect(rownames(insu),rownames(t2d))
keep <- intersect(keep, genes)

insu <- insu[keep,]
t2d <- t2d[keep,]

genes <- keep

#insu <- left_join(insu, gene_names[,-1], by=c("gene" = "gene_id_noversion"))
#t2d <- left_join(t2d, gene_names[,-1], by=c("gene" = "gene_id_noversion"))

Sanity check: do rows (genes) match between the two data frames?

print(sum(rownames(t2d) != rownames(insu)) == 0)

## [1] TRUE

Merge results into one data frame and save

#insu$DE <- as.numeric(insu$DE)
#t2d$DE <- as.numeric(t2d$DE)

betas <- data.frame(cbind.data.frame(insu$effect, t2d$effect, insu$DE, t2d$DE, insu$p.value, t2d$p.value, insu$q.value, t2d$q.value))
rownames(betas) <- genes
names(betas) <- c("insu_beta", "t2d_beta", "insu_de", "t2d_de", "insu_p","t2d_p", "insu_q", "t2d_q")

#0 if no DE, 1 if DE in insulin, 2 if DE in T2D, 3 if DE in both
betas$DE <- betas$insu_de + (2*betas$t2d_de)

#write.csv(betas, file="/net/csgsites/csg/beckandy/reports/oxReduc_effects.csv")

Some simple graphs and stats

kable(summary(betas), format="html")

insu_beta	t2d_beta	insu_de	t2d_de	insu_p	t2d_p	insu_q	t2d_q	DE
Min. :-0.48438	Min. :-0.74469	Mode :logical	Mode :logical	Min. :0.000000	Min. :0.0000188	Min. :0.00000	Min. :0.03859	Min. :0.0000
1st Qu.:-0.08124	1st Qu.:-0.27183	FALSE:817	FALSE:805	1st Qu.:0.004879	1st Qu.:0.0669990	1st Qu.:0.01401	1st Qu.:0.33349	1st Qu.:0.0000
Median : 0.01250	Median :-0.07240	TRUE :78	TRUE :90	Median :0.089256	Median :0.2813077	Median :0.10962	Median :0.52651	Median :0.0000
Mean : 0.01708	Mean :-0.06881	NA’s :0	NA’s :0	Mean :0.253188	Mean :0.3542650	Mean :0.17026	Mean :0.48731	Mean :0.2883
3rd Qu.: 0.11592	3rd Qu.: 0.14350	NA	NA	3rd Qu.:0.445455	3rd Qu.:0.5892785	3rd Qu.:0.31103	3rd Qu.:0.65718	3rd Qu.:0.0000
Max. : 0.42375	Max. : 0.71693	NA	NA	Max. :0.999302	Max. :0.9978006	Max. :0.49107	Max. :0.75316	Max. :3.0000

kable(cor(betas[,1:2]), format="html")

	insu_beta	t2d_beta
insu_beta	1.0000000	0.2354465
t2d_beta	0.2354465	1.0000000

Key

Not DE in either
DE in insulin only
DE in T2D only
DE in both

Number of DE genes

kable(t(as.matrix(table(betas$DE+1))), format="html")

1	2	3	4
739	66	78	12

p1 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(DE+1)))+geom_point(size=3)+geom_hline(aes(yintercept=0))+geom_vline(aes(xintercept=0))+
    geom_abline(slope=1,intercept=0)
print(p1)

plot of chunk unnamed-chunk-7

Looks like there are two groups in the plot? Lets try kmeans clustering

First, determine how many clusters we should use

wss <- (nrow(betas)-1)*sum(apply(betas,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(betas,centers=i, nstart=1000)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")

plot of chunk unnamed-chunk-8

Let's try 4

clus <- kmeans(betas, 4, nstart=2500)
betas <- cbind(betas, clus$cluster)
names(betas)[6] <- "cluster"
p2 <- ggplot(betas, aes(x=insu_beta, y=t2d_beta, colour=factor(cluster)))+geom_point(size=3)+stat_smooth(method="lm")
print(p2)

## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?

plot of chunk unnamed-chunk-9

How many genes in each cluster?

kable(t(as.matrix(table(betas$clus))), format="html")

kable(cor(betas[which(betas$clus==1),1:2]), format="html")

	insu_beta	t2d_beta
insu_beta	NA	NA
t2d_beta	NA	NA

kable(cor(betas[which(betas$clus==2),1:2]), format="html")

	insu_beta	t2d_beta
insu_beta	NA	NA
t2d_beta	NA	NA

kable(cor(betas[which(betas$clus==3),1:2]), format="html")

	insu_beta	t2d_beta
insu_beta	NA	NA
t2d_beta	NA	NA

kable(cor(betas[which(betas$clus==4),1:2]), format="html")

	insu_beta	t2d_beta
insu_beta	NA	NA
t2d_beta	NA	NA

betas$gene_id_noversion <- rownames(betas)
betas <- left_join(betas,gene_names[-1,])

## Joining by: "gene_id_noversion"

## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector

betas$gene_id_noversion <- NULL
betas$clus <- NULL
write.csv(betas, file="/net/csgsites/csg/beckandy/reports/oxReduc_effects.csv")