##现在直接获取GODb对象##现在直接获取OrgDb对象##现在直接获取TxDb对象
作者:马丁·摩根(mtmorgan@fredhutch.org)
日期:2015年9月7日
返回研讨会大纲
这份文件中的材料要求R版本3.2和Bioconductor版本3.1
stopifnot(getRversion() >= '3.2' && getRversion() < '3.3', BiocInstaller::biocVersion() >= "3.1")
TxDb
包TxDb.Hsapiens.UCSC.hg19.knownGene
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-08-20 18:12:27 -0700 (Thu, 20 Aug 2015)
## # GenomicFeatures version at creation time: 1.21.16
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] $ $<- ExpressionSet annotatedDataFrameFrom ## [5] as。list asBED asGFF assayData ## [9] assayData ## [9] coerce columns combine contents ## [17] dbInfo dbconn dbfile dbmeta ## [21] dbschema disjointExons distance exons ## [25] exonsBy exonsByOverlaps extractUpstreamSeqs featureNames ## [29] featureNames<- fiveutrsbytranscripts genes initialize ## [33] intronsByTranscript isActiveSeq isActiveSeq<- isNA ## [37] keys keytypes locatevvariables mapIds ## [41] mapToTranscripts mappedkeys元数据microRNAs ## [45] nhit organismpredictCoding promoters ## [49] revmap sample sampleNames sampleNames<- ## [53] saveDb select seqinfo seqinfo<- ## [57] seqlevels0 show species storageMode ## [61] storageMode<- summarizeVariants tRNAs taxonomyId ## [65] threeutrsbytranscripts transcriptsBy transcriptsByOverlaps ## [69] updateObject ## see '?方法来访问帮助和源代码
TxDb
对象
dbfile(txdb)
GenomicFeatures::makeTxDbFrom*()
访问基因模型
exons()、transcripts()、genes()、cds()(编码序列)
promoters() 及相关函数
exonsBy() 及相关函数 —— 按基因、转录本等分组的外显子
keytypes()、columns()、keys()、select()、mapIds()
exons(txdb)
##            seqnames               ranges strand |   exon_id
##        [1]     chr1       [11874, 12227]      + |         1
##        [2]     chr1       [12595, 12721]      + |         2
##        [3]     chr1       [12613, 12721]      + |         3
##        [4]     chr1       [12646, 12697]      + |         4
##        [5]     chr1       [13221, 14409]      + |         5
##        ...      ...                  ...    ... .       ...
##   [289965]     chrY [27607404, 27607432]      - |    277746
##   [289966]     chrY [27635919, 27635954]      - |    277747
##   [289967]     chrY [59358329, 59359508]      - |    277748
##   [289968]     chrY [59360007, 59360115]      - |    277749
##   [289969]     chrY [59360501, 59360854]      - |    277750
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12613, 12721]      + |       3      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
OrgDb
library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Aug11
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150808
## | GOEGSOURCEDATE: 2015-Aug11
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC GENOME Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Thu Aug 20 15:34:08 2015
##
## Please see: help('select') for usage information
OrgDb
对象
TxDb
keytypes()、columns()、keys()、select()、mapIds()
select ()
键型规格
select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
'select()'返回键和列之间的1:1映射
##   SYMBOL ENTREZID                       GENENAME
## 1  BRCA1      672   breast cancer 1, early onset
## 2   PTEN     5728 phosphatase and tensin homolog
keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"
## [25] "UNIGENE"      "UNIPROT"
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"
## [25] "UNIGENE"      "UNIPROT"
相关功能
mapIds():从一个标识符映射到另一个标识符的特殊情况
OrganismDb 对象:整合了 org.*、TxDb.* 以及其他注释资源,方便访问
library(Homo.sapiens)
select(Homo.sapiens, c("BRCA1", "PTEN"),
       c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), "SYMBOL")
'select()'返回1:多个键和列之间的映射
##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
## 9   BRCA1 uc010who.3   chr17 41196312 41277500
## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
## 21   PTEN uc001kfb.3   chr10 89623195 89728532
## 22   PTEN uc021pvw.1   chr10 89623195 89728532
biomaRt
,AnnotationHub
http://biomart.org;Bioconductor包biomaRt
## 需要联网!!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## 完全指定的 mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## 返回值
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id", "chromosome_name")

## 组装并查询 mart
res <- getBM(attributes = myAttributes, filters = myFilter,
             values = myValues, mart = ensembl)
其他互联网资源
示例:将 Ensembl 'GTF' 文件转换为 R / Bioconductor 的 GRanges 和 TxDb
library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # 通过可视化界面选择
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)
例如:非模式生物OrgDb
包
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")
示例:将路线图表观基因组标记映射到hg38
将 Roadmap 的 BED 文件读取为 GRanges
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]
UCSC 'liftOver'文件映射坐标
query(hub, c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]
liftOver(坐标转换)可能是一对多的映射,因此 GRanges 会变成 GRangesList
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
示例:从VCF文件中读取变量,并根据已知的基因模型进行注释
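## 输入变异
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## 已知基因模型
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)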
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087
##   2        218562       79087
##   3        218562       79087
##   4        218562       79087
##   5        218562       79087
##   6        218563       79087
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths