% \VignetteEngine{knitr::knitr}
% \VignetteIndexEntry{04. Bioconductor for Sequence Analysis}

\documentclass{article}

\newcommand{\Dmel}{\emph{D.\ melanogaster}}
\newcommand{\Hsap}{\emph{H.\ sapiens}}

\usepackage{Exercise}

<<style, eval=TRUE, echo=FALSE, results="asis">>=
BiocStyle::latex()
library(knitr)
opts_chunk$set(cache=TRUE, tidy=FALSE)
@

<<>>=
suppressPackageStartupMessages({
    library(ShortRead)
    library(VariantAnnotation)
    library(BiocParallel)
    library(ggplot2)
    library(RNAseqData.HNRNPC.bam.chr14)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(BSgenome.Hsapiens.UCSC.hg19)
    library(AnnotationHub)
    library(rtracklayer)
    library(BiocIntro)
})
@

\title{\Bioconductor{} for Sequence Analysis}
\author{\url{mtmorgan@fhcrc.org}}
\date{27--28 February 2014}

\begin{document}

\maketitle
\tableofcontents

\paragraph{Common file formats}
The `big data' component of high-throughput sequence analyses seems to
be a tangle of transformations between file types; common files are
summarized in Table~\ref{tab:seq:fileformats}.
FASTQ and BAM (sometimes CRAM) files are the primary formats for
representing raw sequences and their alignments.
VCF files are used to summarize called variants in DNA-seq; BED and
sometimes WIG files are used to represent ChIP and other regulatory
peaks and `coverage'.
GTF / GFF files are important for providing feature annotations, e.g.,
of exon organization into transcripts and genes.

\begin{table}
  \centering
  \caption{Common
    \href{http://genome.ucsc.edu/FAQ/FAQformat.html}{file types} and
    \Bioconductor{} packages used for input.}
  \label{tab:seq:fileformats}
  \begin{tabular}{lp{.6\textwidth}l}
    File & Description & Package \\
    \hline\noalign{\smallskip}
    FASTQ & Unaligned sequences: identifier, sequence, and encoded
      quality score tuples & \Biocpkg{ShortRead} \\
    BAM & Aligned sequences: identifier, sequence, reference sequence
      name, strand position, cigar and additional tags
      & \Biocpkg{Rsamtools} \\
    VCF & Called single nucleotide, indel, copy number, and structural
      variants, often compressed and indexed (with \Biocpkg{Rsamtools}
      \Rfunction{bgzip}, \Rfunction{indexTabix})
      & \Biocpkg{VariantAnnotation} \\
    GFF, GTF & Gene annotations: reference sequence name, data source,
      feature type, start and end positions, strand, etc.
      & \Biocpkg{rtracklayer} \\
    BED & Range-based annotation: reference sequence name, start, end
      coordinates. & \Biocpkg{rtracklayer} \\
    WIG, bigWig & `Continuous' single-nucleotide annotation.
      & \Biocpkg{rtracklayer} \\
    2bit & Compressed FASTA files with `masks' \\
    \hline
  \end{tabular}
\end{table}

\section{Short reads: FASTQ files}

\subsection{FASTQ files}

The Illumina GAII and HiSeq technologies generate sequences by
measuring incorporation of fluorescent nucleotides over successive PCR
cycles.
These sequencers produce output in a variety of formats, but
\emph{FASTQ} is ubiquitous.
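Each read is represented by a record of four components:
<<>>=
bigdata <- "~/bigdata"
fl <- "~/bigdata/fastq/ERR127302_2.fastq.gz"
cat(noquote(tail(readLines(fl, 800), 4)), sep="\n")
fq <- FastqStreamer(fl, 100000)
enc <- yield(fq)
close(fq)
@
\noindent The first and third lines (beginning with \verb|@| and
\verb|+| respectively) are unique identifiers.
The identifier produced by the sequencer typically includes a machine
id followed by colon-separated information on the lane, tile, x, and y
coordinate of the read.
The example illustrated here also includes the SRA accession number,
added when the data was submitted to the archive.
The machine identifier could potentially be used to extract
information about batch effects.
The spatial coordinates (lane, tile, x, y) are often used to identify
optical duplicates; spatial coordinates can also be used during
quality assessment to identify artifacts of sequencing, e.g., uneven
amplification across the flow cell, though these spatial effects are
rarely pursued.

The second and fourth lines of the FASTQ record are the nucleotides
and qualities of each cycle in the read.
This information is given in 5' to 3' orientation as seen by the
sequencer.
A letter \texttt{N} in the sequence is used to signify bases that the
sequencer was not able to call.
The fourth line of the FASTQ record encodes the quality (confidence)
of the corresponding base call.
The quality score is encoded following one of several conventions,
with the general notion being that letters later in the visible ASCII
alphabet
<<>>=
encoding(quality(enc))
@
\noindent are of higher quality.
Letters map to numbers, and numbers correspond (most commonly) to
$-10\log_{10}{p}$.
In the encoding above, \texttt{I} corresponds to a phred score of 40,
hence $p=0.0001$.
Both the sequence and quality scores may span multiple lines.

\subsection{Basic manipulation of FASTQ files}

\begin{Exercise}
  Here we take a first look at FASTQ files from the ArrayExpress
  repository
  E-MTAB-1147\footnote{\url{http://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-1147/}}
  \cite{zarnack2013direct}.
  \begin{enumerate}
  \item Load the \Biocpkg{ShortRead} and \Biocpkg{BiocParallel}
    packages.
  \item Create a character vector \Rcode{dirPath} to the
    \file{bigdata/fastq} directory containing the files
    \file{ERR127302\_1.fastq.gz}, \file{ERR127302\_2.fastq.gz}.
  \item Read in a representative sample from
    \file{ERR127302\_1.fastq.gz}
  \item Simple manipulations of FASTQ files -- ids, reads and quality
  \item Summarize use of nucleotides at each cycle
  \item Analyze nucleotides per cycle, GC content per cycle and
    quality score per cycle
  \item Construct a histogram of the GC content of individual reads
  \end{enumerate}
\end{Exercise}

\begin{Solution}
Load the \Biocpkg{ShortRead} and \Biocpkg{BiocParallel} packages
<<>>=
library(ShortRead)
library(BiocParallel)
@

FASTQ files are getting larger.
A very common reason for looking at data at this early stage in the
processing pipeline is to explore sequence quality.
In these circumstances it is often not necessary to parse the entire
FASTQ file.
Instead create a representative sample
<<>>=
dirPath <- "~/bigdata/fastq"
sampler <- FastqSampler(file.path(dirPath, "ERR127302_1.fastq.gz"), 1000000)
reads <- yield(sampler)
@

Look at the id, reads and quality
<<>>=
# outputs read ids as a list as BStringSet
head(id(reads))
# outputs read sequences as a list as DNAStringSet
head(sread(reads))
# outputs list of quality scores as BStringSet
head(quality(reads))
@

The \Rfunction{alphabetByCycle} function summarizes use of nucleotides
at each cycle in a (equal width) \Rclass{ShortReadQ} or
\Rclass{DNAStringSet} instance.
<<>>=
abc <- alphabetByCycle(sread(reads))
abc[1:4, 1:8]
matplot(t(abc[c("A","G","T","C"),]), type="l")
@

Histogram of the GC content of individual reads:
<<>>=
alf0 <- alphabetFrequency(sread(reads), as.prob=TRUE)
hist(alf0[,c("G", "C")], main = "Histogram of gc Content", xlab="individual reads")
@
\end{Solution}

\subsection{Quality assessment}

\begin{Exercise}
  Here we create a quality assessment report of FASTQ files from the
  ArrayExpress repository
  E-MTAB-1147\footnote{\url{http://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-1147/}}
  \cite{zarnack2013direct}.
  \begin{enumerate}
  \item Create a quality report for these two files using the
    \Rfunction{ShortRead::qa} function, e.g.,
    \Rcode{qa <- qa(dirPath, "ERR*", type="fastq")}.
    View the report in a web browser with
    \Rcode{browseURL(report(qa))}
  \item View the QA report for all fastq files in the full
    experiment.
    Do this by loading the prepared data object
    \file{E-MTAB-1147-qa\_report.Rda}.
    Discuss the meaning of each plot with your neighbor.
    What technical artifacts are evident?
  \end{enumerate}
\end{Exercise}

\begin{Solution}
Create the QA report from two sample files
<<>>=
qa <- qa(dirPath, "ERR*", type="fastq")
@
View the report
<<>>=
browseURL(report(qa))
@
Load the report for all lanes
<<>>=
(load(file.path(dirPath, "E-MTAB-1147-qa_report.Rda")))
@
View the report
<<>>=
browseURL(report(qa))
@
\end{Solution}

\subsection{Trimming}

\begin{Exercise}
  This exercise explores trimming, then applies a trimming filter to
  several FASTQ files.

  Start by loading the \Rpackage{BiocIntro} package
<<>>=
library(BiocIntro)
@
  \begin{enumerate}
  \item Create a character vector pointing to a FASTQ file,
    \Rcode{fl <- file.path(dirPath, "ERR127302\_1.fastq.gz")}.
  \item Load a random sample of 100,000 reads from the FASTQ file,
    using \Rcode{fq <- FastqSampler()} and \Rcode{srq <- yield(fq)}
  \item Plot a histogram of qualities as a function of cycle,
    \Rcode{plotByCycle(srq)}.
    Look at how qualities are encoded using
    \Rcode{encoding(quality(srq))}.
  \item Trim reads after the first 3 nucleotides with average quality
    less than 20 (encoding \texttt{"5"}) using
    \Rcode{trimTails(srq, 3, "5")}
  \end{enumerate}
\end{Exercise}

\begin{Solution}
\noindent Load a sample of 100,000 reads and visualize their quality
<<>>=
fl <- file.path(dirPath, "ERR127302_1.fastq.gz")
fq <- FastqSampler(fl, 100000)
srq <- yield(fq)
srq
plotByCycle(quality(srq))
@
\noindent Trim reads and visualize qualities
<<>>=
trim <- trimTails(srq, 3, "5")
plotByCycle(quality(trim))
@
\end{Solution}

\begin{Exercise}
  (Optional) This exercise trims all reads using our trimming
  function.
  \begin{enumerate}
  \item List the full path to all fastq files using
    \Rcode{fls <- dir(dirPath, pattern="fastq.gz", full=TRUE)}.
  \item Create destination files for each source file, using
    \Rcode{destinations <- sub("fastq.gz", "trimmed.fastq", fls)}.
  \item Trim reads as before, but use the file paths and destinations
    as arguments,
    \Rcode{trimTails(fls, 3, "5", FALSE, destinations=destinations)}.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
Identify relevant files
<<>>=
(fls <- dir(dirPath, pattern="fastq.gz", full=TRUE))
@
\noindent Map the file names to destinations
<<>>=
(destinations <- sub("fastq.gz", "trimmed.fastq", fls))
@
\noindent Perform the trimming
<<>>=
trimTails(fls, 2, "5", destinations=destinations)
@
\end{Solution}

\section{Aligned reads: BAM files}

\subsection{BAM files}

Most down-stream analysis of short read sequences is based on reads
aligned to reference genomes.
There are many aligners available, including
\href{http://bio-bwa.sourceforge.net/}{BWA}
\cite{pmid20080505,pmid19451168},
\href{http://bowtie-bio.sourceforge.net/}{Bowtie} /
\href{http://bowtie-bio.sourceforge.net/bowtie2/}{Bowtie2}
\cite{pmid19261174}, and
\href{http://research-pub.gene.com/gmap/}{GSNAP}; merits of these are
discussed in the literature.
There are also alignment algorithms implemented in \Bioconductor{}
(e.g., \Rfunction{matchPDict} in the \Biocpkg{Biostrings} package, and
the \Biocpkg{Rsubread} package); \Rfunction{matchPDict} is
particularly useful for flexible alignment of moderately sized subsets
of data.

Most main-stream aligners produce output in SAM (text-based) or BAM
format.
A SAM file is a text file, with one line per aligned read, and fields
separated by tabs.
Here is an example of a single SAM line, split into fields.
<<>>=
fl <- system.file("extdata", "ex1.sam", package="Rsamtools")
strsplit(readLines(fl, 1), "\t")[[1]]
@
Fields in a SAM file are summarized in Table~\ref{tbl:sam}.
\begin{table}
  \centering
  \caption{Fields in a SAM record. From
    \url{http://samtools.sourceforge.net/samtools.shtml}}
  \medskip
  \label{tbl:sam}
  \begin{tabular}{lll}
    Field & Name & Value \\
    \hline\noalign{\smallskip}
    1 & QNAME & Query (read) name \\
    2 & FLAG & Bitwise flag, e.g., strand of alignment \\
    3 & RNAME & Reference sequence name \\
    4 & POS & 1-based leftmost position of sequence \\
    5 & MAPQ & Mapping quality (Phred-scaled) \\
    6 & CIGAR & Extended CIGAR string \\
    7 & MRNM & Mate reference sequence name \\
    8 & MPOS & 1-based mate position \\
    9 & ISIZE & Inferred insert size \\
    10 & SEQ & Query sequence on the reference strand \\
    11 & QUAL & Query quality \\
    12$+$ & OPT & Optional fields, format TAG:VTYPE:VALUE \\
    \hline
  \end{tabular}
\end{table}
We recognize from the FASTQ file the identifier string, read sequence
and quality.
The alignment is to a chromosome `seq1' starting at position 1.
The strand of alignment is encoded in the `flag' field.
The alignment record also includes a measure of mapping quality, and a
CIGAR string describing the nature of the alignment.
In this case, the CIGAR is 36M, indicating that the alignment
consisted of 36 \texttt{M}atches or mismatches, with no indels or
gaps; indels are represented by \texttt{I} and \texttt{D}; gaps (e.g.,
from alignments spanning introns) by \texttt{N}.

BAM files encode the same information as SAM files, but in a format
that is more efficiently parsed by software; BAM files are the primary
way in which aligned reads are imported in to \R{}.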
\subsection{Gapped alignments in \R{}}

The \Rfunction{readGAlignments} function from the
\Biocpkg{GenomicAlignments} package reads essential information from a
BAM file into \R{}.
The result is an instance of the \Rclass{GAlignments} class.
The \Rclass{GAlignments} class has been designed to allow useful
manipulation of many reads (e.g., 20 million) under moderate memory
requirements (e.g., 4 GB).

\begin{Exercise}
  This exercise explores the \Rclass{GAlignments} class.
  \begin{enumerate}
  \item Load the \Biocexptpkg{RNAseqData.HNRNPC.bam.chr14} and
    retrieve the names of the BAM files it contains.
    These BAM files are subsets of a larger experiment.
  \item Read one BAM file into \R{} using
    \Rfunction{readGAlignments}.
    How many reads are there?
    What do the first few records look like?
  \item Use the \Rfunction{strand} accessor and the standard \R{}
    function \Rfunction{table} to tabulate the number of reads on the
    plus and minus strand.
    Use the \Rfunction{width} and \Rfunction{cigar} accessors to
    summarize the aligned width and to explore the alignment cigars.
  \item The \Rfunction{readGAlignments} function takes an additional
    argument, \Rcode{param}, allowing the user to specify regions of
    the BAM file (e.g., known gene coordinates) from which to extract
    alignments, and other data to be extracted from the BAM file.
    Create a \Rclass{ScanBamParam} object with argument
    \Rcode{what="seq"}, and use this to input the read sequences as
    well as basic alignment information.
  \item With larger BAM files we often want to iterate through the
    file in chunks.
    Do this by creating a \Rclass{BamFile} from a file path,
    specifying a \Rcode{yieldSize}.
    Then write a short loop that uses \Rfunction{readGAlignments} to
    input successive chunks until there are no more records left.
  \end{enumerate}
\end{Exercise}

\begin{Solution}
Load the experiment data library and read in one file, discovering the
number of reads present
<<>>=
library(GenomicAlignments)
library(RNAseqData.HNRNPC.bam.chr14)
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES
basename(fls)
aln <- readGAlignments(fls[1])
length(aln)
head(aln, 3)
@
\noindent A \Rclass{GAlignments} instance is like a data frame, but
with accessors as suggested by the column names.
It is easy to query, e.g., the distribution of reads aligning to each
strand, the width of reads, or the cigar strings
<<>>=
table(strand(aln))
range(width(aln))
head(sort(table(cigar(aln)), decreasing=TRUE))
@
\noindent Here we construct a \Rclass{ScanBamParam} object and
indicate that we would also like to input the read sequence.
<<>>=
param <- ScanBamParam(what="seq")
aln <- readGAlignments(fls[1], param=param)
@
\noindent To iterate through a BAM file, create a \Rclass{BamFile}
instance with appropriate \Rcode{yieldSize}.
We use \Rcode{yieldSize=200000} in the work below, but in practice
this could be one or two orders of magnitude larger.
<<>>=
bf <- open(BamFile(fls[1], yieldSize=200000))
repeat {
    aln <- readGAlignments(bf)
    if (length(aln) == 0)
        break                           # no more records
    ## do work
    message(length(aln))
}
close(bf)
@
\end{Solution}

\subsection{Summarizing overlaps}

\begin{Exercise}
  A basic operation in RNA-seq and other work flows is to count the
  number of times aligned reads overlap features of interest.
  \begin{enumerate}
  \item Load the `transcript db' package that contains the coordinates
    of each exon of the UCSC `known genes' track of hg19.
    Extract the exon coordinates grouped by gene; the result is a
    \Rclass{GRangesList} object that we will talk more about later.
    Use the \Rfunction{summarizeOverlaps} function with the exon
    coordinates and BAM files to generate a count of the number of
    reads overlapping each gene.
    Visit the help page \Rcode{?summarizeOverlaps} to read about the
    counting strategy used.
    The counts can be extracted from the return value of
    \Rfunction{summarizeOverlaps} using the function
    \Rfunction{assay}.
    This is a standard \R{} matrix.
    How many reads overlapped regions of interest in each sample?
    How many genes had non-zero counts?
  \end{enumerate}
\end{Exercise}

\begin{Solution}
<<>>=
## library(BiocParallel)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")
counts <- summarizeOverlaps(ex, fls)
colSums(assay(counts))
sum(rowSums(assay(counts)) != 0)
@
\end{Solution}

\section{Variants: VCF files}

A major product of DNASeq experiments are catalogs of called variants
(e.g., SNPs, indels).
We will use the \Biocpkg{VariantAnnotation} package to explore this
type of data.
Sample data included in the package are a subset of chromosome 22 from
the
\href{ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/}{1000 Genomes}
project.
Variant Call Format (VCF;
\href{http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41}{full description})
text files contain meta-information lines, a header line with column
names, data lines with information about a position in the genome, and
optional genotype information on samples for each position.

\subsection{Coding consequences}

\paragraph{Locating variants in and around genes}
Variant location with respect to genes can be identified with the
\Rfunction{locateVariants} function.
Regions are specified in the \Rcode{region} argument and can be one of
the following constructors: \Rcode{CodingVariants()},
\Rcode{IntronVariants()}, \Rcode{FiveUTRVariants()},
\Rcode{ThreeUTRVariants()}, \Rcode{IntergenicVariants()},
\Rcode{SpliceSiteVariants()}, or \Rcode{AllVariants()}.
Location definitions are shown in Table~\ref{table:location}.

\begin{table}
  \centering
  \caption{Variant locations}
  \label{table:location}
  \begin{tabular}{lp{.7\textwidth}}
    Location & Details \\
    \hline\noalign{\smallskip}
    \Rcode{coding} & Within a coding region \\
    \Rcode{fiveUTR} & Within a 5' untranslated region \\
    \Rcode{threeUTR} & Within a 3' untranslated region \\
    \Rcode{intron} & Within an intron region \\
    \Rcode{intergenic} & Not within a transcript associated with a gene \\
    \Rcode{spliceSite} & Overlaps any of the first or last 2 nucleotides of an intron \\
    \hline
  \end{tabular}
\end{table}

\begin{Exercise}
  Load the \Biocannopkg{TxDb.Hsapiens.UCSC.hg19.knownGene} annotation
  package, and read in the \texttt{chr22.vcf.gz} example file from the
  \Biocpkg{VariantAnnotation} package.

  Remembering to re-name sequence levels, use the
  \Rfunction{locateVariants} function to identify coding variants.

  Summarize aspects of your data, e.g., did any coding variants match
  more than one gene?
  How many coding variants are there per gene ID?
\end{Exercise}

\begin{Solution}
Here we open the known genes data base, and read in the VCF file.
%%
<<>>=
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
vcf <- renameSeqlevels(vcf, c("22"="chr22"))
@
%%
The next lines locate coding variants.
<<>>=
rd <- rowData(vcf)
loc <- locateVariants(rd, txdb, CodingVariants())
head(loc, 3)
@
%%
To answer gene-centric questions data can be summarized by gene
regardless of transcript.
%%
<<>>=
## Did any coding variants match more than one gene?
splt <- split(loc$GENEID, loc$QUERYID)
table(sapply(splt, function(x) length(unique(x)) > 1))

## Summarize the number of coding variants by gene ID
splt <- split(loc$QUERYID, loc$GENEID)
head(sapply(splt, function(x) length(unique(x))), 3)
@
\end{Solution}

\paragraph{Amino acid coding changes}
\Rfunction{predictCoding} computes amino acid coding changes for
non-synonymous variants.
Only ranges in \Rcode{query} that overlap with a coding region in
\Rcode{subject} are considered.
Reference sequences are retrieved from either a \Robject{BSgenome} or
fasta file specified in \Rcode{seqSource}.
Variant sequences are constructed by substituting, inserting or
deleting values in the \Robject{varAllele} column into the reference
sequence.
Amino acid codes are computed for the variant codon sequence when the
length is a multiple of 3.

The \Rcode{query} argument to \Rfunction{predictCoding} can be a
\Robject{GRanges} or \Robject{VCF}.
When a \Robject{GRanges} is supplied the \Rcode{varAllele} argument
must be specified.
In the case of a \Robject{VCF} object, the alternate alleles are taken
from \Rcode{alt(<VCF>)} and the \Rcode{varAllele} argument is not
specified.

The result is a modified \Rcode{query} containing only variants that
fall within coding regions.
Each row represents a variant-transcript match so more than one row
per original variant is possible.
<<>>=
library(BSgenome.Hsapiens.UCSC.hg19)
coding <- predictCoding(vcf, txdb, seqSource=Hsapiens)
coding[5:9]
@
%%
Using variant rs114264124 as an example, we see \Rcode{varAllele}
\Rcode{A} has been substituted into the \Rcode{refCodon} \Rcode{CGG}
to produce \Rcode{varCodon} \Rcode{CAG}.
The \Rcode{refCodon} is the sequence of codons necessary to make the
variant allele substitution and therefore often includes more
nucleotides than indicated in the range (i.e., the range is 50302962,
50302962, width of 1).
Notice it is the second position in the \Rcode{refCodon} that has been
substituted.
This position in the codon, the position of substitution, corresponds
to genomic position 50302962.
This genomic position maps to position 698 in coding region-based
coordinates and to triplet 233 in the protein.
This is a non-synonymous coding variant where the amino acid has
changed from \Rcode{R} (Arg) to \Rcode{Q} (Gln).

When the resulting \Rcode{varCodon} is not a multiple of 3 it cannot
be translated.
The consequence is considered a \Rcode{frameshift} and
\Robject{varAA} will be missing.
<<>>=
coding[coding$CONSEQUENCE == "frameshift"]
@

\appendix
\nocite{10.1371/journal.pcbi.1003118}
\bibliography{EMBOBGI}

\end{document}