library(genefilter)
library(ALL); data(ALL)
colors=''
sampleinfo = pData(ALL)
head(sampleinfo, n=5)
colors <- as.character(factor(sampleinfo[,13],
levels=c('ALL1/AF4', 'BCR/ABL', 'E2A/PBX1', 'NEG', 'NUP-98', 'p15/p16'),
labels=c('#432C51', '#54A4AF', 'white', '#ECBC55', 'white', 'white')))
options(repr.plot.width=8, repr.plot.height=6)
boxplot(exprs(ALL), col=colors, ylim=c(2,15))
legend('topleft',c("NEG","BCR/ABL","ALL1/AF4", 'Others'),
fill=c('#ECBC55', '#54A4AF', '#432C51', 'white'), horiz=TRUE,cex=0.8)
b_levels = levels(sampleinfo[,5])[1:5]
mutations = c("NEG","BCR/ABL","ALL1/AF4")
chosen_genes = (sapply(sampleinfo[,5], function(x){any(rep(x, 5) == b_levels)})&
sapply(sampleinfo[,13], function(x){any(rep(x, 3) == mutations)}))
ALL_expsessions <- nsFilter(ALL)
abnormalities <- factor(sampleinfo[chosen_genes,13],
levels=c('ALL1/AF4', 'BCR/ABL', 'NEG'),
labels=c(1,1,0))
levels(abnormalities) <- c(1, 1,0)
diff_genes = rowttests(exprs(ALL_expsessions[[1]])[,chosen_genes],abnormalities)
print(paste('Number of statistically important genes: ', dim(diff_genes[diff_genes[['p.value']]<0.05,])[1], collapse=''))
print(paste0('TOP 10: ',paste(rownames(diff_genes[order('p.value')])[1:10], collapse=', '))) # TOP 10
par(mfrow=c(2, 1))
options(repr.plot.width=5, repr.plot.height=6)
hist(diff_genes[['p.value']], col='#54A4AF',
main='Histogram of all p-values', xlab = 'p-values')
hist(diff_genes[diff_genes[['p.value']]<0.05,3], breaks = 20, col='#ECBC55',
main='Histogram of p-values < 0.05', xlab = 'p-values')
# Узнаем имена и ID зондов
library("hgu95av2.db")
## Bimap interface:
x <- hgu95av2GENENAME
mapped_probes <- mappedkeys(x)
gene_names <- as.environment(as.list(x[mapped_probes]))
x <- hgu95av2SYMBOL
mapped_probes <- mappedkeys(x)
gene_SYMBOL <- as.environment(as.list(x[mapped_probes]))
x <- hgu95av2ENTREZID
mapped_probes <- mappedkeys(x)
gene_ENTREZID <- as.environment(as.list(x[mapped_probes]))
zonds = rownames(diff_genes)
ALL.gene_names <- sapply(zonds, function(x){mget(x,gene_names)})
ALL.SYMBOL <- sapply(zonds, function(x){mget(x,gene_SYMBOL)})
ALL.ENTREZID <- sapply(zonds, function(x){mget(x,gene_ENTREZID)})
genes_info = cbind(zonds = zonds, p_value = diff_genes[['p.value']], name = ALL.gene_names,
symbol=ALL.SYMBOL,id = ALL.ENTREZID)
rownames(genes_info)=c(1:4298)
head(genes_info) # first few rows of dataframe
FDR_importants = function(data1) {
data = as.numeric(data1)
p_value.adjusted = p.adjust(data, method = "BH")
FDR.real = max(data[p_value.adjusted < 0.05])
number_imp = sum(data<=FDR.real)
print(paste('FDR:', FDR.real,collapse=" "))
print(paste('Number of statistically important genes:',number_imp,collapse=" "))
return(c(FDR.real, number_imp))
}
fdrs = FDR_importants(diff_genes[['p.value']]) # 108 important genes
genes_df = data.frame(genes_info)
important_genes = genes_df[genes_df$p_value <= fdrs[1],] # отобрали дифференциально экспрессированные гены с учетом FDR
# Получим базу данных путей, и найдем дифференциально экспрессированные гены, которые не участвуют ни в одном Pathway.
library("EnrichmentBrowser")
gs <- get.kegg.genesets("hsa")
functions = c()
functions_help = c()
genes_into_acount = c()
for (j in 1:dim(important_genes)[1]){
for(i in 1:length(gs)) {
if (any(sapply(gs[i], '==',important_genes[[j,5]]))) {
functions_help = c(functions_help,names(gs[i]))
genes_into_acount = c(genes_into_acount, important_genes[[j,5]])
break
}
}
if (length(functions_help) ==0) {
functions_help = c('NONE')
}
functions = c(functions,functions_help)
functions_help = c()
}
important_genes$kegg = functions
# ORA
p.values = c()
for(i in 1:length(gs)) {
listed_our_genes = 0
unlisted_our_genes = 0
for (x in genes_into_acount){
if (any(rep(x, length(gs[[i]])) == gs[[i]])){
listed_our_genes = listed_our_genes+1
}
else {
unlisted_our_genes = unlisted_our_genes+1
}
}
listed_path_genes = 0
unlisted_path_genes = 0
for (x in gs[[i]]){
if (any(rep(x, length(genes_into_acount)) == genes_into_acount)){
listed_path_genes = listed_path_genes +1
}
else {
unlisted_path_genes = unlisted_path_genes+1
}
}
p.values = c(p.values,
fisher.test(matrix(c(listed_our_genes, unlisted_our_genes,
listed_path_genes, unlisted_path_genes), ncol=2))$p.value)
}
ora = data.frame(path=names(gs), p_value = p.values)
ora_imp = ora[ora$p_value < 0.05,]
ora_imp