作者:Martin Morgan (mtmorgan@fhcrc.org)、Sonali Arora (sarora@fredhutch.org)
日期:2015年6月16日

使用 org.* 包映射标识符

library(org.Hs.eg.db)
org <- org.Hs.eg.db    # 方便的别名
keytypes(org)          # 从键(keys)映射……
##  [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"
##  [5] "ACCNUM"       "ALIAS"        "ENZYME"       "MAP"
##  [9] "PATH"         "PMID"         "REFSEQ"       "SYMBOL"
## [13] "UNIGENE"      "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
## [17] "GENENAME"     "UNIPROT"      "GO"           "EVIDENCE"
## [21] "ONTOLOGY"     "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"
## [25] "OMIM"         "UCSCKG"
columns(org)           # ……到列(columns)
##  [1] "ENTREZID"     "PFAM"         "IPI"          "PROSITE"
##  [5] "ACCNUM"       "ALIAS"        "CHR"          "CHRLOC"
##  [9] "CHRLOCEND"    "ENZYME"       "MAP"          "PATH"
## [13] "PMID"         "REFSEQ"       "SYMBOL"       "UNIGENE"
## [17] "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "GENENAME"
## [21] "UNIPROT"      "GO"           "EVIDENCE"     "ONTOLOGY"
## [25] "GOALL"        "EVIDENCEALL"  "ONTOLOGYALL"  "OMIM"
## [29] "UCSCKG"

mapIds() 默认强制 1:1 映射

mapIds(org, c("BRCA1", "PTEN"), "GENENAME", "SYMBOL")
##                             BRCA1                              PTEN
##    "breast cancer 1, early onset" "phosphatase and tensin homolog"
sym <- keys(org, "SYMBOL")
map <- mapIds(org, sym, "GENENAME", "SYMBOL")
length(map)
## [1] 56332
head(map)
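##                                                   A2MP1
##                    "alpha-2-macroglobulin pseudogene 1"
##                                                    NAT1
## "N-acetyltransferase 1 (arylamine N-acetyltransferase)"
##                                                    NAT2
## "N-acetyltransferase 2 (arylamine N-acetyltransferase)"
##                                                    NATP
##                        "N-acetyltransferase pseudogene"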

mapIds() 支持 1:多(1:many)映射

head(select(org, keys(org), "ALIAS"))
##   ENTREZID    ALIAS
## 1        1      A1B
## 2        1      ABG
## 3        1      GAB
## 4        1 HYST2477
## 5        1     A1BG
## 6        2     A2MD
head(mapIds(org, keys(org), "ALIAS", "ENTREZID"))
##      1      2      3      9     10     11
##  "A1B" "A2MD" "A2MP" "AAC1" "AAC2" "AACP"
head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="CharacterList"))
## CharacterList of length 6
## [["1"]] A1B ABG GAB HYST2477 A1BG
## [["2"]] A2MD CPAMD5 FWP007 S863-7 A2M
## [["3"]] A2MP A2MP1
## [["9"]] AAC1 MNAT NAT-1 NATI NAT1
## [["10"]] AAC2 NAT-2 PNAT NAT2
## [["11"]] AACP NATP1 NATP
str(head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="list")))
## List of 6
##  $ 1 : chr [1:5] "A1B" "ABG" "GAB" "HYST2477" ...
##  $ 2 : chr [1:5] "A2MD" "CPAMD5" "FWP007" "S863-7" ...
##  $ 3 : chr [1:2] "A2MP" "A2MP1"
##  $ 9 : chr [1:5] "AAC1" "MNAT" "NAT-1" "NATI" ...
##  $ 10: chr [1:4] "AAC2" "NAT-2" "PNAT" "NAT2"
##  $ 11: chr [1:3] "AACP" "NATP1" "NATP"

select() 可用于一次选取多个不同的列

OrgDb 和 OrganismDb 包

OrgDb(例如,org.Hs.eg.db)

OrganismDb(例如,Homo.sapiens)包。

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
sym <- "BRCA1"
eid <- mapIds(org.Hs.eg.db, sym, "ENTREZID", "SYMBOL")
txid <- mapIds(txdb, eid, "TXNAME", "GENEID", multiVals="list")[[eid]]
txid
##  [1] "uc010whl.2" "uc002icp.4" "uc010whm.2" "uc002icu.3" "uc010cyx.3"
##  [6] "uc002icq.3" "uc002ict.3" "uc010whn.2" "uc010who.3" "uc010whp.2"
## [11] "uc010whq.1" "uc002idc.1" "uc010whr.1" "uc002idd.3" "uc002ide.1"
## [16] "uc010cyy.1" "uc010whs.1" "uc010cyz.2" "uc010cza.2" "uc010wht.1"
cds <- cdsBy(txdb, by="tx", use.names=TRUE)
cds[names(cds) %in% txid]
长度为20的GRangesList对象:2 # 22 #农庄对象范围和3元数据列:# # seqnames范围链| cds_id cds_name # # < Rle > < IRanges > < Rle > | <整数> <人物> # # [1]chr17(41276034、41276034)- | 186246 < NA > # # [2] chr17(41267743、41267743)- | 186245 < NA > # # [3] chr17(41258473、41258473)- | 186243 < NA > ## ... ... ... ... ... ... ...## [21] chr17 [41199660, 41199720] - | 186214  ## [22] chr17 [41197695, 41197819] - | 186212  ## exon_rank ##  ## [1] 1 ## [2] 2 ## [3] 3 ## ... ...##[21] 21 ##[22] 22 ## ##…## <19个元素> ## ------- ## seqinfo: 93个序列(1个循环)来自hg19基因组
library(Homo.sapiens)
txid <- mapIds(Homo.sapiens, sym, "TXNAME", "SYMBOL", multiVals="list")[[sym]]
cds <- cdsBy(Homo.sapiens, use.names=TRUE, by="tx")

可以创建自定义版本,例如使用 OrganismDbi::makeOrganismDbFromBiomart()

使用 biomaRt 进行数据发现和检索

网络资源:http://biomart.org
包:biomaRt

library(biomaRt)
head(listMarts())                       # available marts, 52!
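##               biomart                             version
## 1             ensembl        ENSEMBL GENES 80 (SANGER UK)
## 2                 snp    ENSEMBL VARIATION 80 (SANGER UK)
## 3          regulation   ENSEMBL REGULATION 80 (SANGER UK)
## 4                vega                VEGA 60  (SANGER UK)
## 5       fungi_mart_26           ENSEMBL FUNGI 26 (EBI UK)
## 6 fungi_variations_26 ENSEMBL FUNGI VARIATION 26 (EBI UK)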
head(listDatasets(useMart("ensembl")))  # datasets in mart, 69!
##                          dataset                                 description version
## 1         oanatinus_gene_ensembl      Ornithorhynchus anatinus genes (OANA5)   OANA5
## 2        cporcellus_gene_ensembl             Cavia porcellus genes (cavPor3) cavPor3
## 3        gaculeatus_gene_ensembl      Gasterosteus aculeatus genes (BROADS1) BROADS1
## 4         lafricana_gene_ensembl          Loxodonta africana genes (loxAfr3) loxAfr3
## 5 itridecemlineatus_gene_ensembl Ictidomys tridecemlineatus genes (spetri2) spetri2
## 6        choffmanni_gene_ensembl         Choloepus hoffmanni genes (choHof1) choHof1
ensembl <-                              # 完全指定的 mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)           # filters, 296!
##              name     description
## 1 chromosome_name Chromosome name
## 2           start Gene start (bp)
## 3             end   Gene end (bp)
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50)  # 返回值
## [1] "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2"
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)        # attributes
## name description ## 1 ensembl_gene_id Ensembl Gene ID ## 2 ensembl_transcript_id Ensembl Transcript ID ## 3 ensembl_peptide_id Ensembl Protein ID
myAttributes <- c("ensembl_gene_id","chromosome_name")

## 组装并查询 mart
res <- getBM(myAttributes, myFilter, myValues, ensembl)

使用 AnnotationHub 进行数据发现和检索

包:AnnotationHub

library(AnnotationHub)
hub = AnnotationHub()
hub
## AnnotationHub with 16754 records ## snapshotDate(): 2015-05-26 ## # $dataprovider: UCSC, ensemble bl, NCBI, Haemcode, in偏执8,Pazar, dbS…种类:智人,Mus musculus, Bos taurus, Pan troglodytes, Da…## # $rdataclass: FaFile, GRanges, OrgDb, ChainFile, BigWigFile, in偏疑…## #附加mcols(): taxonomyid, genome, description, tags, ## # sourceurl, sourcetype ## #检索记录,例如,'object[["AH2"]]' ## ## title ## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel。fa# # AH3 | ailuropoda_melanoleuca . ailmel1.69 dna_rm.toplevel。fa# # AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel。## ... ...xiphophorus_maculatus . xipmac4 . .2.ncrna。xiphophorus_maculatus . xipmac4.4.2 .pe .all.fa ## AH47936 |
query(hub, c("Ensembl", "80", "gtf"))
## # snapshothub with 102 records ## # snapshotDate(): 2015-05-26 ## # $dataprovider: Ensembl ## ## $species: Gadus morhua, Oryzias latipes, Xiphophorus maculatus, Ailur…## # $rdataclass: GRanges ## # additional mcols(): taxonomyid, genome, description, tags, ## # sourceurl, sourcetype ## #检索记录,例如,'object[["AH7535"]]' ## ## title ## AH7535 | Xiphophorus_maculatus.Xipmac4.4.2.69。gtf ## AH7554 |gtf## AH7575 | Oryzias_latipes.MEDAKA1.70.gtf ## ... ... ## AH47107 | Xenopus_tropicalis.JGI_4.2.80.gtf ## AH47108 | Xiphophorus_maculatus.Xipmac4.4.2.80.gtf
## ensgtf = display(hub)                # visual choice
hub["AH47107"]
# # # # # AnnotationHub 1记录snapshotDate(): 2015-05-26 # # #名称():AH47107 # # # $ dataprovider:运用# # # $物种:非洲爪蟾蜍tropicalis # # # $ rdataclass:农庄# # # $标题:Xenopus_tropicalis.JGI_4.2.80。gtf ## # $description: Gene Annotation for Xenopus tropicalis ## # $taxonomyid: 8364 ## # $genome: JGI_4.2 ## # $sourcetype: gtf ## ## $sourceurl: ftp://ftp.ensembl.org/pub/release-80/gtf/xenopus_tropical…## # $sourcelastmodifieddate: 2015-05-01 ## # $sourcesize: 8492889 ## # $tags: GTF, ensembl, Gene, Transcript, Annotation ## ##检索record with 'object[["AH47107"]]'
gtf <- hub[["AH47107"]]
gtf
与581787年# #农庄对象范围和19元数据列:# # seqnames范围链|源类型# # < Rle > < IRanges > < Rle > | <因素> <因素> # # [1]GL172637.1[148] - |运用基因# # [2]GL172637.1[148] - |运用记录# # [3]GL172637.1[148] - |运用外显子  ## ... ... ... ... ... ... ...## [581787] GL180121.1[1817, 1835] + |外显子## [581787]1835] + |运用cd # #分阶段gene_id gene_version gene_name # # <数字> <整数> <人物> <数字> <人物> # # [1]< NA > < NA > ENSXETG00000030486 1 U5 # # [2] < NA > < NA > ENSXETG00000030486 1 U5 # # [3] < NA > < NA > ENSXETG00000030486 1 U5  ## ... ... ... ... ... ...## [581786]   ENSXETG00000033193 1  ## [581787]  1 ENSXETG00000033193 1  ## gene_source gene_biotype transcript_id ##    ## [1] ensemble snRNA  ## [2] ensemble snRNA ENSXETT00000065882 ## [3] ensemble snRNA ENSXETT00000065882 ## ... ... ... ...## [581786] ensemble protein_coding ENSXETT00000053735 ## [581787] ensemble protein_coding ENSXETT00000053735 ## transcript_version transcript_name transcript_source ##    ## [1]    ## [2] 1 U5-201 ensemble # [3] 1 U5-201 ensemble # ... ... ... ...## [581786] 2  ensemble bl ## [581787] 2  ensemble bl ## transcript_biotype exon_number exon_id exon_version ##     ## [1]     ## [2] snRNA    ## [3] snRNA 1 enxete00000393193 1 ## ... ... ... ... ...## [581786] protein_coding 3   ## protein_id protein_version ##   ## [1]   ## [2]   ## [3]   ## ... ... ...## [581786]   ## [581787] ENSXETP00000053735 2 ## ------- ## seqinfo: 2375序列来自一个未指定基因组;没有seqlengths
## org.* 数据库可通过 AnnotationHub 获取
query(hub, "OrgDb")
## ## $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ ## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr…## # $rdataclass: OrgDb ## # additional mcols(): taxonomyid, genome, description, tags, ## # sourceurl, sourcetype ## #检索记录,例如,'object[["AH12818"]]' ## ## title ## AH12818 | org. pseudomonas_mendocina_n1 . 01.eg。sqlite ## AH12819 | org.Streptomyces_coelicolor_A3(2).eg。sqlite ## AH12820 |Sqlite ## ... ...## AH47001 | org.Pf.plasmo.db.sqlite
mcols(query(hub, "OrgDb"))[, "species", drop=FALSE]
## AH12818 Pseudomonas mendocina_NK-01 ## AH12819 Streptomyces coelicolor_A3(2) ## AH12820 Cricetulus griseus ## ... ...## AH47001恶性疟原虫

在 Bioc-devel 中,可用 metadata(gtf) 查看 gtf 所来自的 AnnotationHub 记录的相关信息。

AnnotationHub:用于非模式生物(non-model organism)的 org.* 数据库

query(hub, "OrgDb")
## ## $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ ## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr…## # $rdataclass: OrgDb ## # additional mcols(): taxonomyid, genome, description, tags, ## # sourceurl, sourcetype ## #检索记录,例如,'object[["AH12818"]]' ## ## title ## AH12818 | org. pseudomonas_mendocina_n1 . 01.eg。sqlite ## AH12819 | org.Streptomyces_coelicolor_A3(2).eg。sqlite ## AH12820 |Sqlite ## ... ...## AH47001 | org.Pf.plasmo.db.sqlite
hub[["AH12818"]]
## OrgDb object: ## | DBSCHEMAVERSION: 2.1 ## | DBSCHEMA: NOSCHEMA_DB ## |生物体:Pseudomonas mendocina_NK-01 ## | SPECIES: Pseudomonas mendocina_NK-01 ## | CENTRALID: GID ## | TAXID: 1001585 ## | Db type: OrgDb ## | support package: AnnotationDbi
##
## Please see: help('select') for usage information

AnnotationHub:用于 Ensembl GTF、FASTA 和 TxDb 资源

发现并检索河豚(Takifugu)可用的 GTF 和 FASTA 资源

query(hub, c("Ensembl", "release-80", "Takifugu"))
## AnnotationHub with 7 records ## snapshotDate(): 2015-05-26 ## # $dataprovider: Ensembl ## # $species: Takifugu rubripes ## # $rdataclass: FaFile, GRanges ## #附加mcols():检索记录,例如,'object[["AH47101"]]' ## ## title ## AH47101 | Takifugu_rubripes.FUGU4.80。Takifugu_rubripes.FUGU4.cdna.all. gtf ## AH47475 |fa# # AH47476 | Takifugu_rubripes.FUGU4.dna_rm.toplevel。## ... ...## AH47479 | Takifugu_rubripes.FUGU4.ncrna。Takifugu_rubripes.FUGU4.pep.all.fa ## AH47480 |
gtf <- hub[["AH47101"]]
dna <- hub[["AH47477"]]
gtf
与1388717 # #农庄对象范围和19元数据列:# # seqnames范围链|源类型# # < Rle > < IRanges > < Rle > | <因素> <因素> # # [1]scaffold_1(10422、11354)- - - - - - |运用基因# # [2]scaffold_1(10422、11354)- - - - - - |运用记录# # [3]scaffold_1(10422、11354)- - - - - - |运用外显子  ## ... ... ... ... ... ... ...# # [1388716] scaffold_16598(1807、1936)+ |运用外显子# # [1388717]scaffold_16598(1807、1936)+ |运用cd # #分阶段gene_id gene_version # # <数字> <整数> <人物> <数字> # # [1]< NA > < NA > ENSTRUG00000003702 1 # # [2] < NA > < NA > ENSTRUG00000003702 1 # # [3] < NA > < NA > ENSTRUG00000003702 1  ## ... ... ... ... ...# # (1388716) < NA > < NA > ENSTRUG00000004123 1 # # (1388717) < NA > 1 ENSTRUG00000004123 1 # # gene_source gene_biotype transcript_id # # <人物> <人物> <人物> # #[1]运用protein_coding < NA > # #[2]运用protein_coding ENSTRUT00000008740 # #[3]运用protein_coding ENSTRUT00000008740  ## ... ... ... ...# # # #(1388716)运用protein_coding ENSTRUT00000009819[1388717]运用protein_coding ENSTRUT00000009819 # # transcript_version transcript_source transcript_biotype # # <数字> <人物> <人物> # # [1]< NA > < NA > < NA > # #[2] 1运用protein_coding # #[3] 1运用protein_coding  ## ... ... ... ...1 ensemble protein_coding ## [1388717] 1 ensemble protein_coding ## exon_number exon_id exon_version protein_id ##     ## [1]     ## [2]     ## [3] 1 ENSTRUE00000055472  ## ... ... ... ... ...## [1388717] 4   ENSTRUP00000009764 ## protein_version gene_name transcript_name ##    ## [1]    ## [2]    ## [3]   ## ... ... ... ...## [1388716]  IWS1 (1 of 2) IWS1 (1 of 2)-201 ## [1388717] 1 IWS1 (1 of 2) IWS1 (1 of 2)-201 ## ------- ## seqinfo: 2056个序列来自一个未指定的基因组;没有seqlengths
dna
## class: FaFile
## path: /home/mtmorgan/.AnnotationHub/53323
## index: /home/mtmorgan/.AnnotationHub/53324
## isOpen: FALSE
## yieldSize: NA
head(seqlevels(dna))
## [1] "scaffold_1" "scaffold_2" "scaffold_3" "scaffold_4" "scaffold_5" ## [6] "scaffold_6"

创建一个TxDb实例

library(GenomicFeatures)
txdb <- makeTxDbFromGRanges(gtf)

## saveDb(txdb, "TxDb.Takifugu.Ensembl.80.sqlite")
## loadDb("TxDb.Takifugu.Ensembl.80.sqlite")

使用txdb

library(Rsamtools)     # 提供 getSeq,FaFile-method
## Loading required package: XVector
exons <- exons(txdb)
getSeq(dna, exons)
## A DNAStringSet instance of length 322622 ## width seq names ## [1] 68 GCTAGCGTAGCTTAACCA…Tgaaaagtcccgcaggca mt ## [2] 947 caaaagcttggtcctgac…Aggtgcacttggaaaaac mt ## [3] 74 cggagcatagcttaacag…... ... ...## [322621] 93 ccgtcagagcagcaggtg…GACGGCGACTACCATCAG scaffold d_9956 ## [322622] 198 AGGATTGAGCTGCGCTCC…GAGGTGGACGGGGTCAAG scaffold_9956