5 - 9 October, 2015"
output:
  BiocStyle::html_document:
    toc: true
    toc_depth: 2
vignette: >
  % \VignetteIndexEntry{5.}
  % \VignetteEngine{knitr::rmarkdown}
---

<!-- NOTE(review): this section was reconstructed from a garbled copy. Please
     confirm against the upstream vignette: (1) the VignetteIndexEntry title
     after "5.", (2) the wording of the `what` bullet under "Restriction", and
     (3) the heading level of "Restriction". -->

```{r style, echo = FALSE, results = 'asis'}
BiocStyle::markdown()
options(width=100, max.print=1000)
## Evaluation and caching can be switched off from the environment,
## e.g., KNITR_EVAL=FALSE; both default to TRUE.
knitr::opts_chunk$set(
    eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")),
    cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")))
```

```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
suppressPackageStartupMessages({
    library(GenomicFiles)
    library(BiocParallel)
})
```

The material in this course requires R version 3.2 and Bioconductor
version 3.2

```{r configuration-test}
stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() == "3.2"
)
```

# Restriction

- Input only the data necessary, e.g., `ScanBamParam()`
    - `which`: genomic ranges of interest
    - `what`: the fields of each BAM record to input, e.g., `seq`

## Iteration

- Read entire file, but in chunks
- Chunk size small enough to fit easily in memory
- Chunk size large enough to benefit from _R_'s vectorized operations
  -- 10k to 1M records at a time
- e.g., `BamFile(..., yieldSize=100000)`

Iterative programming model

- _yield_ a chunk of data
- _map_ input data to a convenient representation, often summarizing
  input to a simplified form
    - E.g., aligned read coordinates to counts overlapping regions of
      interest
    - E.g., aligned read sequences to GC content
- _reduce_ across mapped chunks
- Use `GenomicFiles::reduceByYield()`

```{r iteration}
library(GenomicFiles)

yield <- function(bfl) {
    ## input a chunk of alignments; the read sequence ('seq') must be
    ## requested explicitly because map() uses it
    library(GenomicAlignments)
    readGAlignments(bfl, param=ScanBamParam(what="seq"))
}

map <- function(aln) {
    ## Count G or C nucleotides per read
    library(Biostrings)
    gc <- letterFrequency(mcols(aln)$seq, "GC")
    ## Summarize number of reads with 0, 1, ... G or C nucleotides;
    ## a fixed number of bins lets chunks be combined with `+`
    tabulate(1 + gc, 73)                # max. read length: 72
}

reduce <- `+`
```

- Example

```{r iteration-doit}
library(RNAseqData.HNRNPC.bam.chr14)
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES
bf <- BamFile(fls[1], yieldSize=100000)
gc <- reduceByYield(bf, yield, map, reduce)
plot(gc, type="h",
     xlab="GC Content per Aligned Read", ylab="Number of Reads")
```

## Parallel evaluation

- Cores, computers, clusters, clouds
- Generally, requires memory management techniques like restriction
  or iteration -- parallel processes competing for shared memory
- Many problems are _embarrassingly parallel_ -- `lapply()`-like --
  especially in bioinformatics where parallel evaluation is across
  files

- Example: GC content in several BAM files

```{r parallel-doit}
library(BiocParallel)
gc <- bplapply(BamFileList(fls), reduceByYield, yield, map, reduce)

library(ggplot2)
## one column of cumulative counts per BAM file, stacked to long form
df <- stack(as.data.frame(lapply(gc, cumsum)))
df$GC <- 0:72
ggplot(df, aes(x=GC, y=values)) + geom_line(aes(colour=ind)) +
    xlab("Number of GC Nucleotides per Read") +
    ylab("Number of Reads")
```