##现在直接获取GODb对象##现在直接获取OrgDb对象##现在直接获取TxDb对象

作者:马丁·摩根(mtmorgan@fredhutch.org)
日期:2015年9月7日
研讨会大纲

这份文件中的材料要求R版本3.2和Bioconductor版本3.1

stopifnot(getRversion() >= '3.2' && getRversion() < '3.3', BiocInstaller::biocVersion() >= "3.1")

注释

基因模型注释资源-TxDb

TxDb.Hsapiens.UCSC.hg19.knownGene

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb对象: ## # Db类型: TxDb ## # 支持包: GenomicFeatures ## # 数据来源: UCSC ## # 基因组: hg19 ## # 生物: 智人 ## # 分类ID: 9606 ## # UCSC表: knownGene ## # 资源URL: http://genome.ucsc.edu/ ## # 基因ID类型: Entrez基因ID ## # 完整数据集: 是 ## # miRBase构建ID: GRCh37 ## # transcript_nrow: 82960 ## # exon_nrow: 289969 ## # cds_nrow: 237533 ## # Db创建者: 来自Bioconductor的GenomicFeatures包 ## # 创建时间: 2015-08-20 18:12:27 -0700 (Thu, 20 Aug 2015) ## # 创建时的GenomicFeatures版本: 1.21.16 ## # 创建时的RSQLite版本: 1.0.0 ## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] $ $<- ExpressionSet annotatedDataFrameFrom ## [5] as.list asBED asGFF assayData ## [9] assayData<- cds cdsBy cdsByOverlaps ## [13] coerce columns combine contents ## [17] dbInfo dbconn dbfile dbmeta ## [21] dbschema disjointExons distance exons ## [25] exonsBy exonsByOverlaps extractUpstreamSeqs featureNames ## [29] featureNames<- fiveUTRsByTranscript genes initialize ## [33] intronsByTranscript isActiveSeq isActiveSeq<- isNA ## [37] keys keytypes locateVariants mapIds ## [41] mapToTranscripts mappedkeys metadata microRNAs ## [45] nhit organism predictCoding promoters ## [49] revmap sample sampleNames sampleNames<- ## [53] saveDb select seqinfo seqinfo<- ## [57] seqlevels0 show species storageMode ## [61] storageMode<- summarizeVariants tRNAs taxonomyId ## [65] threeUTRsByTranscript transcripts transcriptsBy transcriptsByOverlaps ## [69] updateObject ## 参见 '?methods' 以获取帮助和源代码

TxDb对象

访问基因模型

exons(txdb)
## seqnames ranges strand | exon_id ## <Rle> <IRanges> <Rle> | <integer> ## [1] chr1 [11874, 12227] + | 1 ## [2] chr1 [12595, 12721] + | 2 ## [3] chr1 [12613, 12721] + | 3 ## [4] chr1 [12646, 12697] + | 4 ## [5] chr1 [13221, 14409] + | 5 ## ... ... ... ... ... ... ## [289965] chrY [27607404, 27607432] - | 277746 ## [289966] chrY [27635919, 27635954] - | 277747 ## [289967] chrY [59358329, 59359508] - | 277748 ## [289968] chrY [59360007, 59360115] - | 277749 ## [289969] chrY [59360501, 59360854] - | 277750 ## ------- ## seqinfo: 来自hg19基因组的93个序列(1个环状)
exonsBy(txdb, "tx")
## GRangesList对象,长度82960: ## $1 ## GRanges对象,含3个范围和3个元数据列: ## seqnames ranges strand | exon_id exon_name exon_rank ## <Rle> <IRanges> <Rle> | <integer> <character> <integer> ## [1] chr1 [11874, 12227] + | 1 <NA> 1 ## [2] chr1 [12613, 12721] + | 3 <NA> 2 ## [3] chr1 [13221, 14409] + | 5 <NA> 3 ## ## $2 ## GRanges对象,含3个范围和3个元数据列: ## seqnames ranges strand | exon_id exon_name exon_rank ## [1] chr1 [11874, 12227] + | 1 <NA> 1 ## [2] chr1 [12595, 12721] + | 2 <NA> 2 ## [3] chr1 [13403, 14409] + | 6 <NA> 3 ## ## $3 ## GRanges对象,含3个范围和3个元数据列: ## seqnames ranges strand | exon_id exon_name exon_rank ## [1] chr1 [11874, 12227] + | 1 <NA> 1 ## [2] chr1 [12646, 12697] + | 4 <NA> 2 ## [3] chr1 [13221, 14409] + | 5 <NA> 3 ## ## ... ## <82957 more elements> ## ------- ## seqinfo: 来自hg19基因组的93个序列(1个环状)

标识符映射—OrgDb

library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb对象: ## | DBSCHEMAVERSION: 2.1 ## | Db类型: OrgDb ## | 支持包: AnnotationDbi ## | DBSCHEMA: HUMAN_DB ## | ORGANISM: Homo sapiens ## | SPECIES: Human ## | EGSOURCEDATE: 2015-Aug11 ## | EGSOURCENAME: Entrez Gene ## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA ## | CENTRALID: EG ## | TAXID: 9606 ## | GOSOURCENAME: Gene Ontology ## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/ ## | GOSOURCEDATE: 20150808 ## | GOEGSOURCEDATE: 2015-Aug11 ## | GOEGSOURCENAME: Entrez Gene ## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA ## | KEGGSOURCENAME: KEGG GENOME ## | KEGGSOURCEDATE: 2011-Mar15 ## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens) ## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19 ## | GPSOURCEDATE: 2010-Mar22 ## | ENSOURCEDATE: 2015-Jul16 ## | ENSOURCENAME: Ensembl ## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta ## | UPSOURCENAME: Uniprot ## | UPSOURCEURL: http://www.UniProt.org/ ## | UPSOURCEDATE: Thu Aug 20 15:34:08 2015
## ##请参见:help('select')了解使用信息

OrgDb对象

select ()

相关功能

其他注释资源 - biomaRt、AnnotationHub

biomaRt和朋友

http://biomart.org;Bioconductor 软件包 biomaRt

## 需要联网!!
library(biomaRt)
head(listMarts(), 3)                      ## 列出可用的 mart
head(listDatasets(useMart("ensembl")), 3) ## mart 中的数据集
ensembl <-                                ## 完全指定的 mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3)             ## 过滤器
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## 返回值
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## 属性
myAttributes <- c("ensembl_gene_id", "chromosome_name")
## 组装并查询 mart
res <- getBM(attributes = myAttributes, filters = myFilter,
             values = myValues, mart = ensembl)

其他互联网资源

AnnotationHub

示例:将 Ensembl 的 'GTF' 文件转换为 R / Bioconductor 的 GRanges 和 TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                  # 可视化选择
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

例如:非模式生物OrgDb

library(AnnotationHub) hub <- AnnotationHub() query(hub, "OrgDb")

示例:将路线图表观基因组标记映射到hg38

注释变体

示例:从VCF文件中读取变异(variants),并根据已知的基因模型进行注释

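## 输入变异
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## 已知基因模型
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)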
## GRanges对象,含6个范围和9个元数据列: ## seqnames ranges strand | LOCATION LOCSTART LOCEND QUERYID TXID ## <Rle> <IRanges> <Rle> | <factor> <integer> <integer> <integer> <character> ## 1 chr22 [50301422, 50301422] - | coding 939 939 24 75253 ## 2 chr22 [50301476, 50301476] - | coding 885 885 25 75253 ## 3 chr22 [50301488, 50301488] - | coding 873 873 26 75253 ## 4 chr22 [50301494, 50301494] - | coding 867 867 27 75253 ## 5 chr22 [50301584, 50301584] - | coding 777 777 28 75253 ## 6 chr22 [50302962, 50302962] - | coding 698 698 57 75253 ## CDSID GENEID PRECEDEID FOLLOWID ## 1 218562 79087 ## 2 218562 79087 ## 3 218562 79087 ## 4 218562 79087 ## 5 218562 79087 ## 6 218563 79087 ## ------- ## seqinfo: 来自未指定基因组的1个序列;无seqlengths