EAF填补
在gwas数据获取当中,有许多数据并未含有eaf值,这对后续的分析极为不利。
- EAF值:
如何进行EAF值的填补
参考文章:https://cloud.tencent.com/developer/article/2327202
我采取了以下两种方法
-
snp_add_eaf函数
snp_add_eaf <- function(dat, build = "37", pop = "EUR") { stopifnot(build %in% c("37","38")) stopifnot("SNP" %in% names(dat)) # Create and get a url server <- ifelse(build == "37","http://grch37.rest.ensembl.org","http://rest.ensembl.org") pop <- paste0("1000GENOMES:phase_3:",pop) snp_reverse_base <- function(x) { x <- stringr::str_to_upper(x) stopifnot(x %in% c("A","T","C","G")) switch(x,"A"="T","T"="A","C"="G","G"="C") } res_tab <- lapply(1:nrow(dat), function(i) { print(paste0("seaching for No.", i, " SNP")) dat_i <- dat[i,] ext <- paste0("/variation/Homo_sapiens/",dat_i$SNP, "?content-type=application/json;pops=1") url <- paste(server, ext, sep = "") res <- httr::GET(url) # Converts http errors to R errors or warnings httr::stop_for_status(res) # Convert R objects from JSON res <- httr::content(res) res_pop <- jsonlite::fromJSON(jsonlite::toJSON(res))$populations # Filter query results based on population set res_pop <- try(res_pop[res_pop$population == pop,]) if("try-error" %in% class(res_pop)) { print(paste0("There is not information for population ",pop)) queried_effect_allele <- "NR" queried_other_allele <- "NR" queried_eaf <- -1 } else { if(nrow(res_pop)==0) { print(paste0("There is not information for population ",pop)) queried_effect_allele <- "NR" queried_other_allele <- "NR" queried_eaf <- -1 } else { queried_effect_allele <- res_pop[1,"allele"][[1]] queried_other_allele <- res_pop[2,"allele"][[1]] queried_eaf <- res_pop[1,"frequency"][[1]] } } effect_allele <- ifelse("effect_allele.exposure" %in% names(dat), dat_i$effect_allele.exposure, dat_i$effect_allele) other_allele <- ifelse("effect_allele.exposure" %in% names(dat), dat_i$other_allele.exposure, dat_i$other_allele) if("effect_allele.exposure" %in% names(dat)) { name_output <- unique(c(names(dat), "eaf.exposure","reliability.exposure")) } else { name_output <- unique(c(names(dat), "eaf","reliability.exposure")) } len_effect_allele <- nchar(effect_allele) len_other_allele <- nchar(other_allele) if(len_effect_allele==1&len_other_allele==1) { if((queried_effect_allele==effect_allele & queried_other_allele==other_allele)| (queried_effect_allele==other_allele & queried_other_allele==effect_allele)) { dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele, queried_eaf, 1-queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- "high" } else { r_queried_effect_allele <- snp_reverse_base(queried_effect_allele) r_queried_other_allele <- snp_reverse_base(queried_other_allele) if((r_queried_effect_allele==effect_allele & r_queried_other_allele==other_allele)| (r_queried_effect_allele==other_allele & r_queried_other_allele==effect_allele)) { dat_i$eaf.exposure <- ifelse(effect_allele == r_queried_effect_allele, queried_eaf, 1-queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- "high" } else { dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele, queried_eaf, 1-queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- "low" } } } else { # To identify the potential DEL/ INS short_allele <- ifelse(len_effect_allele==1, effect_allele, other_allele) short_allele_eaf <- ifelse(short_allele == queried_effect_allele, queried_eaf, 1-queried_eaf) dat_i$eaf.exposure <- ifelse(effect_allele == short_allele, short_allele_eaf, 1-short_allele_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- "low" } dat_i[name_output] }) return(do.call(rbind, res_tab)) }直接运行函数,在R中就会出现snp_add_eaf()函数了
注意事项:
1. 此方法适合于少量的数据
2.因为使用了API,大量请求的时候,会出现无法响应的问题
原始代码出处:
https://github.com/linjf15/MR_tricks/tree/main/GWAS_preprocessing
2. 使用get_eaf_from_1000G()get_eaf_from_1000G<-function(dat,path,type="exposure"){ corrected_eaf_expo<-function(data_MAF){ effect=data_MAF$effect_allele.exposure other=data_MAF$other_allele.exposure A1=data_MAF$A1 A2=data_MAF$A2 MAF_num=data_MAF$MAF EAF_num=1-MAF_num harna<-is.na(data_MAF$A1) harna<-data_MAF$SNP[which(harna==T)] cor1<-which(data_MAF$effect_allele.exposure !=data_MAF$A1) data_MAF$eaf.exposure=data_MAF$MAF data_MAF$type="raw" data_MAF$eaf.exposure[cor1]=EAF_num[cor1] data_MAF$type[cor1]="corrected" cor2<-which(data_MAF$other_allele.exposure ==data_MAF$A1) cor21<-setdiff(cor2,cor1) cor12<-setdiff(cor1,cor2) error<-c(cor12,cor21) data_MAF$eaf.exposure[error]=NA data_MAF$type[error]="error" data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error) return(data_MAF) } corrected_eaf_out<-function(data_MAF){ effect=data_MAF$effect_allele.outcome other=data_MAF$other_allele.outcome A1=data_MAF$A1 A2=data_MAF$A2 MAF_num=data_MAF$MAF EAF_num=1-MAF_num harna<-is.na(data_MAF$A1) harna<-data_MAF$SNP[which(harna==T)] cor1<-which(data_MAF$effect_allele.outcome !=data_MAF$A1) data_MAF$eaf.outcome=data_MAF$MAF data_MAF$type="raw" data_MAF$eaf.outcome[cor1]=EAF_num[cor1] data_MAF$type[cor1]="corrected" cor2<-which(data_MAF$other_allele.outcome ==data_MAF$A1) cor21<-setdiff(cor2,cor1) cor12<-setdiff(cor1,cor2) error<-c(cor12,cor21) data_MAF$eaf.outcome[error]=NA data_MAF$type[error]="error" data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error) return(data_MAF) } if(type=="exposure" && (is.na(dat$eaf.exposure[1])==T || is.null(dat$eaf.exposure)==T)){ r<-nrow(dat) setwd(path) MAF<-fread("fileFrequency.frq",header = T) dat<-merge(dat,MAF,by.x = "SNP",by.y = "SNP",all.x = T) dat<-corrected_eaf_expo(dat) cor1<-dat$cor1 harna<-dat$harna error<-dat$error dat<-dat$data_MAF print(paste0("一共有",(r-length(harna)-length(error)),"个SNP成功匹配EAF,占比",(r-length(harna)-length(error))/r*100,"%")) print(paste0("一共有",length(cor1),"个SNP是major allele,EAF被计算为1-MAF,在成功匹配数目中占比",length(cor1)/(r-length(harna)-length(error))*100,"%")) print(paste0("一共有",length(harna),"个SNP在1000G中找不到,占比",length(harna)/r*100,"%")) print(paste0("一共有",length(error),"个SNP在输入数据与1000G中效应列与参照列,将剔除eaf,占比",length(error)/r*100,"%")) print("输出数据中的type列说明:") print("raw:EAF直接等于1000G里的MAF数值,因为效应列是minor allele") print('corrected:EAF等于1000G中1-MAF,因为效应列是major allele') print("error:输入数据与1000G里面提供的数据完全不一致,比如这个SNP输入的效应列是C,参照列是G,但是1000G提供的是A-T,这种情况下,EAF会被清空(NA),当成匹配失败") return(dat) } if(type=="outcome" && (is.na(dat$eaf.outcome[1])==T || is.null(dat$eaf.outcome)==T)){ r<-nrow(dat) setwd(path) MAF<-fread("fileFrequency.frq",header = T) dat<-merge(dat,MAF,by.x = "SNP",by.y = "SNP",all.x = T) dat<-corrected_eaf_out(dat) cor1<-dat$cor1 harna<-dat$harna error<-dat$error dat<-dat$data_MAF print(paste0("一共有",(r-length(harna)-length(error)),"个SNP成功匹配EAF,占比",(r-length(harna)-length(error))/r*100,"%")) print(paste0("一共有",length(cor1),"个SNP是major allele,EAF被计算为1-MAF,在成功匹配数目中占比",length(cor1)/(r-length(harna)-length(error))*100,"%")) print(paste0("一共有",length(harna),"个SNP在1000G找不到,占比",length(harna)/r*100,"%")) print(paste0("一共有",length(error),"个SNP在输入数据与1000G中效应列与参照列,将剔除eaf,占比",length(error)/r*100,"%")) print("输出数据中的type列说明:") print("raw:EAF直接等于1000G里的MAF数值,因为效应列是minor allele") print('corrected:EAF等于1000G中1-MAF,因为效应列是major allele') print("error:输入数据与1000G里面提供的数据完全不一致,比如这个SNP输入的效应列是C,参照列是G,但是1000G提供的是A-T,这种情况下,EAF会被清空(NA),当成匹配失败") return(dat) } else{return(dat)} }
直接运行函数,在R中就会出现get_eaf_from_1000G()函数了
**注意事项:**
**1. 此方法速度快,效果稳定,个人比较推荐**
**2. 需要获取1000G参考文件**
原始代码出处:
[https://github.com/HaobinZhou/Get_MR/blob/main/Get_MR1.0 help.md]
(代码作者:广州医科大学第一临床学院周浩彬 ,第二临床学院谢治鑫)
**另:此为方法2中参考文件的百度云链接**
链接: https://pan.baidu.com/s/1ob6WjavJfk-6lgXa2wcp2Q?pwd=euht 提取码: euht
如有相关问题需要讨论,可通过邮箱:learner__lin2003@yeah.net联系我。
2024-07-18 16:54:19 星期四