内容

本课程的材料要求R版本3.2和Bioconductor版本3.2

stopifnot(getRversion() >= '3.2' && getRversion() < '3.3', BiocInstaller::biocVersion() == "3.2")

1注释

1.1生物模型

1.1.1基因模型注释资源TxDb

例如:`r Biocpkg("TxDb.Hsapiens.UCSC.hg19.knownGene")`

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] annotatedDataFrameFrom asBED asGFF as。表# # [5]assayData assayData < - cdsByOverlaps cdsBy # # [9] cd胁迫列结合# #[13]内容dbconn dbfile dbInfo # #[17]该dbschema disjointExons距离# # [21]$ < - $ exonsByOverlaps exonsBy # #[25]外显子ExpressionSet extractUpstreamSeqs featureNames < - # # [29] featureNames fiveUTRsByTranscript基因初始化# # [33]## [49] revmap sample sampleNames<- sampleNames ## [53] saveDb select seqinfo seqlevels0 ## [57] seqlevels<- show species storageMode<- ## [61]storageMode summarizeVariants taxonomyId threeUTRsByTranscript ## [65] transcriptsByOverlaps transcriptsBy transcripts tRNAs ## [69] updateObject ## see '?方法,用于访问帮助和源代码

TxDb对象

访问基因模型

  • exons()、transcripts()、genes()、cds()(编码序列)
  • promoters() 及相关函数
  • exonsBy() 及相关函数——按基因、转录本等分组的外显子
  • “select”接口:keytypes()、columns()、keys()、select()、mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand |   exon_id
##                     <Rle>      <IRanges>  <Rle> | <integer>
##        [1]           chr1 [11874, 12227]      + |         1
##        [2]           chr1 [12595, 12721]      + |         2
##        [3]           chr1 [12613, 12721]      + |         3
##        [4]           chr1 [12646, 12697]      + |         4
##        [5]           chr1 [13221, 14409]      + |         5
##        ...            ...            ...    ... .       ...
##   [289965] chrUn_gl000241 [35706, 35859]      - |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      - |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      + |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      + |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      - |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

1.1.2全基因组序列,BSgenome

例如:`r Biocpkg("BSgenome.Hsapiens.UCSC.hg19")`

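library(BSgenome.Hsapiens.UCSC.hg19)
BSgenome.Hsapiens.UCSC.hg19
## NOTE: the subset is written as [1:100] to agree with the 100-element
## DNAStringSet printed below; verify against the original course material.
getSeq(BSgenome.Hsapiens.UCSC.hg19, exons(txdb)[1:100])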
## DNAStringSet实例的长度100 ##宽度seq ## [1] 354 cttgccgtcagccttttctttttgacctcttctttttctgttcatgt…##[2] 127…GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG # # [3] 109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT…GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG # # [4] 52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT # # [5] 1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT…Acagctagagatcctttattaaaagcacactgttggtttctg ## ... ... ...## [96] 251 cccgtccccggcgcgcgcgcccccccccccccccccccccccccccccctcgc…## [97] 262 gttcgggtctggcttgtacttgaagggcaaagacctggccc…TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG 48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG # # # # [98] [99] 216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC…GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG # # 225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG…CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG [100]

1.1.3标识符映射—OrgDb

library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Sep27
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150919
## | GOEGSOURCEDATE: 2015-Sep27
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL:
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Fri Oct  9 19:54:03 2015
##
## Please see: help('select') for usage information

OrgDb对象

  • 经人工整理(curated)的资源,底层为 SQLite 数据库,与 TxDb 类似
  • 自行构建:AnnotationForge(但请参阅下文的 AnnotationHub!)
  • “select”接口:keytypes()、columns()、keys()、select()、mapIds()

select ()

  • 键(keys)向量,以及所需的列(columns)
  • 指定键的类型(keytype)

    select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
    ## 'select()' returned 1:1 mapping between keys and columns
    ##   SYMBOL ENTREZID                       GENENAME
    ## 1  BRCA1      672   breast cancer 1, early onset
    ## 2   PTEN     5728 phosphatase and tensin homolog
    keytypes(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"
    ## [25] "UNIGENE"      "UNIPROT"
    columns(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"
    ## [25] "UNIGENE"      "UNIPROT"

相关功能

  • mapIds()——用于“从一种标识符映射到另一种标识符”这一特殊情形
  • OrganismDb 对象:整合 org.*、TxDb.* 及其他注释资源,便于统一访问

    library(Homo.sapiens)
    ## Now getting the GODb Object directly
    ## Now getting the OrgDb Object directly
    ## Now getting the TxDb Object directly
    select(Homo.sapiens, c("BRCA1", "PTEN"),
           c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), "SYMBOL")
    ## 'select()' returned 1:many mapping between keys and columns
    ##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
    ## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
    ## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
    ## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
    ## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
    ## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
    ## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
    ## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
    ## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
    ## 9   BRCA1 uc010who.3   chr17 41196312 41277500
    ## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
    ## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
    ## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
    ## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
    ## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
    ## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
    ## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
    ## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
    ## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
    ## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
    ## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
    ## 21   PTEN uc001kfb.3   chr10 89623195 89728532
    ## 22   PTEN uc021pvw.1   chr10 89623195 89728532

1.2其他注释资源-biomaRt,AnnotationHub

1.2.1 biomaRt 及相关资源

http://biomart.org;Bioconductor 软件包 biomaRt

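## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list the marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
## NOTE: the text was truncated here during extraction. The remaining
## arguments of substr(filterOptions(myFilter, ensembl), 1, ...) and the
## definitions of myValues and myAttributes are missing; restore them from
## the original course material before running the query below.

## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
             values = myValues, mart = ensembl)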

其他网络资源

1.2.2AnnotationHub

  • Bioconductor 软件包 AnnotationHub
  • 旨在便于使用各“联盟”(consortium)及其他基因组规模的资源
  • 简化发现、检索、本地管理和导入到标准Bioconductor表示

示例:将 Ensembl 的“GTF”文件转换为 R / Bioconductor 的 GRanges 和 TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

示例:非模式生物的 OrgDb 软件包

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

示例:将 Roadmap 表观基因组标记映射到 hg38

  • Roadmap 的 BED 文件,以 GRanges 表示

    query(hub, c("EpigenomeRoadMap", "E126", "H3K4ME2"))
    E126 <- hub[["AH29817"]]
  • UCSC “liftOver” 文件,用于坐标映射

    query(hub, c("hg19", "hg38", "chainfile"))
    chain <- hub[["AH14150"]]
  • 坐标转换(lift over)可能产生一对多映射,因此由 GRanges 得到 GRangesList

    library(rtracklayer)
    E126hg38 <- liftOver(E126, chain)
    E126hg38

2注释变体

示例:从VCF文件中读取变体,并对已知的基因模型进行注解

## input variants
library(VariantAnnotation)
## NOTE: the text was truncated after "fl <- system." during extraction.
## The next three lines are presumably what was lost (the usual
## VariantAnnotation example file); confirm against the original material.
fl <- system.file("extdata", "chr22.vcf.gz", package = "VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087
##   2        218562       79087
##   3        218562       79087
##   4        218562       79087
##   5        218562       79087
##   6        218563       79087
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

3.资源

致谢

3.1 sessionInfo()

sessionInfo()
## R version 3.2.2 (2015-08-14) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Debian GNU/Linux stretch/sid ## ## locale: ## [1] LC_CTYPE=en_US。utf - 8 LC_NUMERIC = C而= en_US。UTF-8 ## [4] LC_COLLATE=en_US。utf - 8 LC_MONETARY = en_US。utf - 8 LC_MESSAGES = en_US。UTF-8 ## [7] LC_PAPER=en_US。UTF-8 LC_NAME=C LC_ADDRESS= c# ## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US。## [1] stats4 parallel stats graphics grDevices utils datasets methods base ## ##其他附加包:# # # # [1] Homo.sapiens_1.3.1 GO.db_3.2.2 [3] OrganismDbi_1.11.43 biomaRt_2.25.3 # # [5] AnnotationHub_2.1.45 VariantAnnotation_1.15.34 # # [7] RNAseqData.HNRNPC.bam.chr14_0.7.0 GenomicAlignments_1.5.18 # # [9] Rsamtools_1.21.21 ALL_1.11.0 # # [11] org.Hs.eg.db_3.2.3 RSQLite_1.0.0 # # [13] DBI_0.3.1 ggplot2_1.0.1 # # [15] airway_0.103.1 limma_3.25.18 # #[17] DESeq2_1.9.51 RcppArmadillo_0.6.100.0.0 # # [19] Rcpp_0.12.1 BSgenome.Hsapiens.UCSC.hg19_1.4.0 # # [21] BSgenome_1.37.6 rtracklayer_1.29.28 # # [23] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.21.33 # # [25] AnnotationDbi_1.31.19 SummarizedExperiment_0.3.11 # # [27] Biobase_2.29.1 GenomicRanges_1.21.32 # # [29] GenomeInfoDb_1.5.16microbenchmark_1.4-2 ## [31] Biostrings_2.37.8 XVector_0.9.4 ## [33] IRanges_2.3.26 S4Vectors_0.7.23 ## [35] BiocGenerics_0.15.11 BiocStyle_1.7.9 ## ##通过命名空间加载(和没有附加):## [1] bitops_1.0-6 RColorBrewer_1.1-2 httr_1.0.0 ## [7] Hmisc_3.17-0 colorspace_1.2-6 nnet_7.3-11 ## [10] gridExtra_2.0.0 graph_1.47.2 formatR_1.2.1 ## [13] sandwich_2.3-4 labeling_0.3 scales_0.3.0 ## [16] mvtnorm_1.0-3 genefilter_1. 
1.51.1 RBGL_1.45.1 ## [19] stringr_1.0.0 digest_0.6.8 foreign_0.8-66 ## [22]rmarkdown_0.8.1 htmltools_0.2.6 BiocInstaller_1.19.14 # # [25] shiny_0.12.2 zoo_1.7-12 BiocParallel_1.3.54 # # [28] acepack_1.3 4.7 - 3.3 rcurl_1.95 magrittr_1.5 # # [31] Formula_1.2-1 futile.logger_1.4.1 munsell_0.4.2 # # [34] proto_0.3-10 stringi_0.5-5 multcomp_1.4-1 # # [37] yaml_2.1.13 MASS_7.3-44 zlibbioc_1.15.0 # # [40] plyr_1.8.3 grid_3.2.2lattice_0.20-33 # # [43] splines_3.2.2 annotate_1.47.4 locfit_1.5 - 9.1 # # [46] knitr_1.11 geneplotter_1.47.0 reshape2_1.4.1 # # [49] codetools_0.2-14 futile.options_1.0.0 xml_3.98 - 1.3 # # [52] evaluate_0.8 latticeExtra_0.6-26 lambda.r_1.1.7 # # [55] httpuv_1.3.3 gtable_0.1.2 mime_0.4 # # [58] xtable_1.7-4 survival_2.38-3 cluster_2.0.3 # # [61] TH.data_1.0-6 interactiveDisplayBase_1.7.3