Homework 3, Tikhonova Polina

In [3]:
library(genefilter)
library(ALL); data(ALL)
colors=''
sampleinfo = pData(ALL)
head(sampleinfo, n=5)
coddiagnosissexageBTremissionCRdate.crt(4;11)t(9;22)...citogmol.biolfusion proteinmdrkinetccrrelapsetransplantf.udate last seen
010051005 5/21/1997 M 53 B2 CR CR 8/6/1997 FALSE TRUE ... t(9;22) BCR/ABL p210 NEG dyploid FALSE FALSE TRUE BMT / DEATH IN CRNA
010101010 3/29/2000 M 19 B2 CR CR 6/27/2000 FALSE FALSE ... simple alt. NEG NA POS dyploid FALSE TRUE FALSE REL 8/28/2000
030023002 6/24/1998 F 52 B4 CR CR 8/17/1998 NA NA ... NA BCR/ABL p190 NEG dyploid FALSE TRUE FALSE REL 10/15/1999
040064006 7/17/1997 M 38 B1 CR CR 9/8/1997 TRUE FALSE ... t(4;11) ALL1/AF4 NA NEG dyploid FALSE TRUE FALSE REL 1/23/1998
040074007 7/22/1997 M 57 B2 CR CR 9/17/1997 FALSE FALSE ... del(6q) NEG NA NEG dyploid FALSE TRUE FALSE REL 11/4/1997
In [4]:
colors <- as.character(factor(sampleinfo[,13],
                 levels=c('ALL1/AF4', 'BCR/ABL', 'E2A/PBX1', 'NEG',   'NUP-98', 'p15/p16'),
                 labels=c('#432C51',  '#54A4AF',  'white',  '#ECBC55', 'white', 'white')))

options(repr.plot.width=8, repr.plot.height=6)
boxplot(exprs(ALL), col=colors, ylim=c(2,15))
legend('topleft',c("NEG","BCR/ABL","ALL1/AF4", 'Others'), 
       fill=c('#ECBC55', '#54A4AF', '#432C51', 'white'), horiz=TRUE,cex=0.8)
Warning message in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
"duplicated levels in factors are deprecated"
In [5]:
b_levels = levels(sampleinfo[,5])[1:5]
mutations = c("NEG","BCR/ABL","ALL1/AF4")
chosen_genes = (sapply(sampleinfo[,5], function(x){any(rep(x, 5) == b_levels)})&
                  sapply(sampleinfo[,13], function(x){any(rep(x, 3) == mutations)}))
In [12]:
ALL_expsessions <- nsFilter(ALL)
In [7]:
abnormalities <- factor(sampleinfo[chosen_genes,13], 
       levels=c('ALL1/AF4', 'BCR/ABL', 'NEG'),
       labels=c(1,1,0))
levels(abnormalities) <- c(1, 1,0)
diff_genes = rowttests(exprs(ALL_expsessions[[1]])[,chosen_genes],abnormalities)
print(paste('Number of statistically important genes: ', dim(diff_genes[diff_genes[['p.value']]<0.05,])[1], collapse='')) 
print(paste0('TOP 10: ',paste(rownames(diff_genes[order('p.value')])[1:10], collapse=', '))) # TOP 10
Warning message in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
"duplicated levels in factors are deprecated"
[1] "Number of statistically important genes:  601"
[1] "TOP 10: 907_at, 35430_at, 36841_at, 38924_s_at, 34332_at, 39742_at, 36023_at, 35955_at, 34938_i_at, 266_s_at"
In [8]:
par(mfrow=c(2, 1))
options(repr.plot.width=5, repr.plot.height=6)
hist(diff_genes[['p.value']], col='#54A4AF', 
     main='Histogram of all p-values', xlab = 'p-values')
hist(diff_genes[diff_genes[['p.value']]<0.05,3], breaks = 20, col='#ECBC55', 
     main='Histogram of p-values < 0.05', xlab = 'p-values')
In [19]:
# Узнаем имена и ID зондов
library("hgu95av2.db")
## Bimap interface:
x <- hgu95av2GENENAME
mapped_probes <- mappedkeys(x)
gene_names <- as.environment(as.list(x[mapped_probes]))

x <- hgu95av2SYMBOL
mapped_probes <- mappedkeys(x)
gene_SYMBOL <- as.environment(as.list(x[mapped_probes]))

x <- hgu95av2ENTREZID
mapped_probes <- mappedkeys(x)
gene_ENTREZID <- as.environment(as.list(x[mapped_probes]))

zonds = rownames(diff_genes)
ALL.gene_names <- sapply(zonds, function(x){mget(x,gene_names)})
ALL.SYMBOL <- sapply(zonds, function(x){mget(x,gene_SYMBOL)})
ALL.ENTREZID <- sapply(zonds, function(x){mget(x,gene_ENTREZID)})
genes_info = cbind(zonds = zonds, p_value = diff_genes[['p.value']], name = ALL.gene_names, 
                   symbol=ALL.SYMBOL,id = ALL.ENTREZID)
rownames(genes_info)=c(1:4298)
head(genes_info)  # first few rows of dataframe
zondsp_valuenamesymbolid
1907_at 0.1711132 adenosine deaminaseADA 100
235430_at 0.3105678 mediator complex subunit 6MED6 10001
336841_at 0.2473114 acyl-CoA thioesterase 8ACOT8 10005
438924_s_at 0.5884306 abl interactor 1ABI1 10006
534332_at 0.392792 glucosamine-6-phosphate deaminase 1GNPDA1 10007
639742_at 0.03287374 TRAF family member associated NFKB activatorTANK 10010
In [40]:
FDR_importants = function(data1) {
  data = as.numeric(data1)
  p_value.adjusted = p.adjust(data, method = "BH")
  FDR.real = max(data[p_value.adjusted < 0.05]) 
  number_imp = sum(data<=FDR.real)
  print(paste('FDR:', FDR.real,collapse=" "))
  print(paste('Number of statistically important genes:',number_imp,collapse=" "))
  return(c(FDR.real, number_imp))
}
fdrs = FDR_importants(diff_genes[['p.value']]) # 108 important genes
[1] "FDR: 0.00124512378195435"
[1] "Number of statistically important genes: 108"
In [43]:
genes_df = data.frame(genes_info)
important_genes =  genes_df[genes_df$p_value <= fdrs[1],] # отобрали дифференциально экспрессированные гены с учетом FDR
In [45]:
# Получим базу данных путей, и найдем дифференциально экспрессированные гены, которые не участвуют ни в одном Pathway.

library("EnrichmentBrowser")
gs <- get.kegg.genesets("hsa")
  
functions = c()
functions_help = c()
genes_into_acount = c()
for (j in 1:dim(important_genes)[1]){
  for(i in 1:length(gs)) {
     if (any(sapply(gs[i], '==',important_genes[[j,5]]))) {
        functions_help = c(functions_help,names(gs[i]))
        genes_into_acount = c(genes_into_acount, important_genes[[j,5]])
        break
     }
  }
  if (length(functions_help) ==0) {
    functions_help = c('NONE')
  }
  functions = c(functions,functions_help)
  functions_help = c()
}
important_genes$kegg = functions
In [46]:
# ORA
p.values = c()
for(i in 1:length(gs)) {
  listed_our_genes = 0
  unlisted_our_genes = 0
  for (x in genes_into_acount){
    if (any(rep(x, length(gs[[i]])) == gs[[i]])){
      listed_our_genes = listed_our_genes+1
    }
    else {
      unlisted_our_genes = unlisted_our_genes+1
    }
  }
  
  listed_path_genes = 0
  unlisted_path_genes = 0
  for (x in gs[[i]]){
    if (any(rep(x, length(genes_into_acount)) == genes_into_acount)){
      listed_path_genes = listed_path_genes +1
    }
    else {
      unlisted_path_genes = unlisted_path_genes+1
    }
  }
  p.values = c(p.values,
               fisher.test(matrix(c(listed_our_genes, unlisted_our_genes,
                                    listed_path_genes, unlisted_path_genes), ncol=2))$p.value)
}
In [47]:
ora = data.frame(path=names(gs), p_value = p.values)
ora_imp = ora[ora$p_value < 0.05,]
ora_imp
pathp_value
87hsa01100_Metabolic_pathways 0.001171887
142hsa04144_Endocytosis 0.038446100
280hsa05165_Human_papillomavirus_infection0.007570625
285hsa05200_Pathways_in_cancer 0.009102544
290hsa05206_MicroRNAs_in_cancer 0.021441931
Данные WebGestaldt cлабо совпадают ((
Drawing