% \ VignetteIndexEntry{05。基因组范围}%\VignetteEngine{knitr::knitr} \documentclass{article} < >= options(max.print=1000) BiocStyle::latex() library(knitr) opts_chunk$set(cache=TRUE, tidy=FALSE) @ < >= suppressPackageStartupMessages({库(ShortRead)库(VariantAnnotation)库(ggplot2)库(RNAseqData.HNRNPC.bam.chr14)库(org. hs . egm .db)库(TxDb.Hsapiens.UCSC.hg19.knownGene)库(BSgenome.Hsapiens.UCSC.hg19)库(AnnotationHub)库(rtracklayer)}) @ \title{实用:范围}\作者{Martin Morgan (mtmorgan@fhcrc.org)} \日期{2014年2月27-28日}\newcommand{\Hsap}{\emph{H。~智人}}\ newcommand {\ Dmel} {\ emph {D。~melanogaster}} \ usepack{Exercise} \begin{document} \maketitle \tableofcontents \section{使用范围}开始加载\Biocpkg{GenomicRanges}包并定义\Rfunction{plotRanges}帮助函数< >= library(GenomicRanges) plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), col = "black", sep = 0.5,…){height <- 1 if (is(xlim, "Ranges")) xlim <- c(min(start(xlim)), max(end(xlim))) bins <- disjointins (IRanges(start(x), end(x) + 1)) plot.new() par(mai=c(0.6, 0.2, 0.6, 0.2)) plot。window(xlim, c(0, max(bins)*(height + sep))) ybottom <- bins *(sep + height) - height rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col,…)title(main, cex.main=2.8, font.main=1) axis(1)} @范围描述了感兴趣的特征(例如,基因,外显子,启动子)和与基因组对齐的reads。\Bioconductor{}有非常强大的功能来处理范围,其中一些在表~\ref{tab:ranges-pkgs}中总结。这些在\Biocpkg{GenomicRanges}包中实现;看到\ {10.1371 / journal.pcbi引用。1003118}用于更全面的概念定位。\begin{table} \居中\标题{选定的\Bioconductor{}包,用于表示和操作范围、字符串和其他数据结构。\label{tab:ranges-pkgs} \begin{tabular}{lp{。7\textwidth}}包和描述\\ \hline \Biocpkg{IRanges} &定义重要的类(如\Rclass{IRanges}, \Rclass{Rle})和方法(如\Rfunction{finoverlaps}, \Rfunction{countOverlaps})来表示和操作连续值的范围。还介绍了\Rclass{DataFrame}, \Rclass{SimpleList}和其他为表示非常大的数据而定制的类。\\ \Biocpkg{GenomicRanges} &为序列表示量身定制的基于范围的类(例如\Rclass{GRanges}, \Rclass{GRangesList}),带有关于链和序列名称的信息。\\ \Biocpkg{GenomicFeatures} &用于操作基因组范围数据库的基础,例如,表示已知基因的外显子和转录本的坐标和组织。\\ \hline \end{tabular} \end{table} \段落{\Rclass{GRanges} class} \Rclass{GRanges}的实例用于指定基因组坐标。 Suppose we wish to represent two \Dmel{} genes. The first is located on the positive strand of chromosome 3R, from position 19967117 to 19973212. The second is on the minus strand of the X chromosome, with `left-most' base at 18962306, and right-most base at 18962925. The coordinates are \emph{1-based} (i.e., the first nucleotide on a chromosome is numbered 1, rather than 0), \emph{left-most} (i.e., reads on the minus strand are defined to `start' at the left-most coordinate, rather than the 5' coordinate), and \emph{closed} (the start and end coordinates are included in the range; a range with identical start and end coordinates has width 1, a 0-width range is represented by the special construct where the end coordinate is one less than the start coordinate). A complete definition of these genes as \Rclass{GRanges} is: < >= genes <- GRanges(seqnames=c("chr3R", "chrX"), ranges=IRanges(start=c(19967117,18962306), end =c(19973212,18962925)), strand=c("+", "-"), seqlength =c(chr3R=27905053, chrX=22422827)) @ \noindent \Rclass{GRanges}对象的组件被定义为向量,例如\Rcode{seqnames},就像定义\Rclass{data.frame}一样。开始和结束坐标被分组到一个\Rclass{IRanges}实例中。可选的\Rcode{seqlength}参数指定每个序列的最大大小,在本例中是\Dmel{}基因组的' dm2'构建中的染色体3R和X的长度。该数据显示为< \Rclass{GRanges}类上定义了许多有用的方法。参考帮助页\begin{knitrout} \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor} \begin{kframe} \begin{alltt} \hlkwd{?GRanges} \end{alltt} \end{kframe} \end{knitrout} \noindent和pack欧洲杯2021体育彩票age vignettes < >= vignette(package="GenomicRanges") @ \noindent进行全面介绍。一个\Rclass{GRanges}实例可以是一个子集,带有用于获取和更新信息的访问器。< >= genes[2] strand(genes) width(genes) length(genes) names(genes) <- c("FBgn0039155", "FBgn0085359") genes # now with names @ \noindent \Rfunction{strand}以一个称为\emph{run-length encoding}的紧凑表示形式返回链信息。' names'可以在实例构造时指定;一旦命名,\Rclass{GRanges}实例就可以像常规向量一样按名称作为子集。正如\Rfunction{GRanges}函数所示,\Rclass{GRanges}类通过添加有关\Rcode{seqnames}、\Rcode{strand}和其他与表示基因组上的范围特别相关的信息来扩展\Rclass{IRanges}类。\Rclass{IRanges}类和相关的数据结构(例如\Rclass{RangedData})意味着在任意空间中定义的范围的更一般的描述。在\Rclass{GRanges}类上实现的许多方法都“意识到”基因组定位的后果,例如将负链上的范围与正链上的范围区别对待(反映DNA施加的5'方向)。\Rclass{GRanges}类有很多有用的方法。我们使用\Rclass{IRanges}来说明这些操作,以避免与链和序列名相关的复杂性,但这些操作在\Rclass{GRanges}上是类似的。我们从一组简单的范围开始:< > =红外< - IRanges(开始= c(7、9、12 14日22:24),结束= c(15、11、12、18日26日27日28))@ % % < >= png("ranges-ir-plot.png", width=800, height=160) plotRanges(ir, xlim=c(5,35), main="Original") dev.off() png("ranges-shift-ir-plot.png", width=800, height=160) plotRanges(shift(ir, 5), xlim=c(5,35), main=" shift ") dev.off() png("ranges-reduce-ir-plot.png", width=800, height=160) plotRanges(reduce(ir), xlim=c(5,35), main=" reduce ") dev.off() @ \noindent这些和一些常见的操作在图~\ref{图:ranges}的上面板中说明,并在表~\ref{tab:range-ops}中进行了摘要。%% \begin{figure} \ centric \includegraphics[width=.5\textwidth,height=!] {ranges-ir-plot} \ \ \ includegraphics[宽度= 5 \ textwidth高度= !] {ranges-shift-ir-plot} \ \ \ includegraphics[宽度= 5 \ textwidth高度= !{range -reduce-ir-plot} \标题{Ranges} \label{fig: Ranges} \end{figure} %% range上的方法可以按以下方式分组:\begin{description} \item[range内]方法独立作用于每个range。其中包括\Rfunction{side}、\Rfunction{narrow}、\Rfunction{reflect}、\Rfunction{resize}、\Rfunction{restrict}和\Rfunction{shift}等等。一个例子是\Rfunction{shift},它根据\Rcode{shift}参数指定的量来转换每个范围。正值右移,负值左移;\Rcode{shift}可以是一个向量,向量的每个元素都移动\Rclass{IRanges}实例的相应元素。在这里,我们将所有范围向右移动5,结果如图~\ref{fig:ranges}的中间面板所示。< >= shift(ir, 5) @ < >= png("range -shift-ir-plot.png", width=800, height=160) plotRanges(shift(ir, 5), xlim=c(5,35), main=" shift ") dev.off() @ \item[Inter-range]方法将范围集合作为一个整体。其中包括\Rfunction{disjoin}、\Rfunction{reduce}、\Rfunction{gaps}和\Rfunction{range}。一个示例是\Rfunction{reduce},它将重叠的范围减少为单个范围,如图~\ref{fig:ranges}的下面板所示。< >= reduce(ir) @ < >= png("range -reduce-ir-plot.png", width=800, height=160) plotRanges(reduce(ir), xlim=c(5,35), main=" reduce ") dev.off() @ \Rfunction{coverage}是一个范围间操作,计算有多少个范围重叠单个位置。\Rfunction{coverage}不是返回范围,而是返回一个压缩的表示(运行长度编码)< >= cvg <- coverage(ir) cvg ## plot(as.integer(cvg), type="s", xlab="Coordinate", ylab="覆盖深度")@运行长度编码可以解释为'由0个范围覆盖的长度为6个核苷酸的运行,然后由1个范围覆盖的长度为2个核苷酸的运行\ldots'。\item[Between]方法作用于两个(有时更多)\Rclass{IRanges}实例。其中包括\Rfunction{intersect}、\Rfunction{setdiff}、\Rfunction{union}、\Rfunction{pintersect}、\Rfunction{psetdiff}和\Rfunction{punion}。\Rfunction{countOverlaps}和\Rfunction{findOverlaps}函数作用于两组范围。\Rfunction{countOverlaps}接受它的第一个参数(\Rcode{query}),并确定第二个参数(\Rcode{subject})中有多少个范围重叠。结果是一个整数向量,每个\Rcode{query}成员对应一个元素。\Rfunction{findOverlaps}执行类似的操作,但返回一个更通用的类似矩阵的结构,用于标识每对查询/主题重叠。这两种说法都允许对“重叠”的定义有一定的灵活性。\end{description} \begin{table} \居中\标题{对\Rclass{IRanges}、\Rclass{GRanges}和\Rclass{GRangesList}的常用操作。{} \标签选项卡:range-ops} \{表格}{微光}开始类别和功能和描述\ \ \线访问器& \ Rfunction{}开始,\ Rfunction{}结束,\ Rfunction{宽度}&获取或设置开始,结束,宽度\ \ & \ Rfunction{名称}&获取或设置名称\ \ & \ Rfunction {mcols}, \ Rfunction{元数据}&获取或设置元数据元素或对象上\ \ & \ Rfunction{长度}&向量的范围\ \ & \ Rfunction{范围}&范围形成从最小的开始和最大结束\ \订购& \ Rfunction {< \ Rfunction}, {< =}, \Rfunction{>}, \Rfunction{>=}, \Rfunction{==}, \Rfunction{!=} & Compare ranges, ordering by start then width \\ & \Rfunction{sort}, \Rfunction{order}, \Rfunction{rank} & Sort by the ordering \\ & \Rfunction{duplicated} & Find ranges with multiple instances \\ & \Rfunction{unique} & Find unique instances, removing duplicates \\ Arithmetic & \Rcode{r + x}, \Rcode{r - x}, \Rcode{r * x} & Shrink or expand ranges \Robject{r} by number \Robject{x} \\ & \Rfunction{shift} & Move the ranges by specified amount \\ & \Rfunction{resize} & Change width, anchoring on start, end or mid \\ & \Rfunction{distance} & Separation between ranges (closest endpoints) \\ & \Rfunction{restrict} & Clamp ranges to within some start and end \\ & \Rfunction{flank} & Generate adjacent regions on start or end \\ Set operations & \Rfunction{reduce} & Merge overlapping and adjacent ranges \\ & \Rfunction{intersect}, \Rfunction{union}, \Rfunction{setdiff} & Set operations on reduced ranges \\ & \Rfunction{pintersect}, \Rfunction{punion}, \Rfunction{psetdiff} & Parallel set operations, on each \Rcode{x[i]}, \Rcode{y[i]} \\ & \Rfunction{gaps}, \Rfunction{pgap} & Find regions not covered by reduced ranges \\ & \Rfunction{disjoin} & Ranges formed from union of endpoints \\ Overlaps & \Rfunction{findOverlaps} & Find all overlaps for each \Robject{x} in \Robject{y} \\ & \Rfunction{countOverlaps} & Count overlaps of each \Robject{x} range in \Robject{y} \\ & \Rfunction{nearest} & Find nearest neighbors (closest endpoints) \\ & \Rfunction{precede}, \Rfunction{follow} & Find nearest \Robject{y} that \Robject{x} precedes or follows \\ & \Rcode{x \%in\% y} & Find ranges in \Robject{x} that overlap range in \Robject{y} \\ Coverage & \Rfunction{coverage} & Count ranges covering each position \\ Extraction & \Rcode{r[i]} & Get or set by logical or numeric index \\ & \Rcode{r[[i]]} & Get integer sequence from \Rcode{start[i]} to \Rcode{end[i]} \\ & \Rfunction{subsetByOverlaps} & Subset \Robject{x} for those that overlap in \Robject{y} \\ & \Rfunction{head}, \Rfunction{tail}, \Rfunction{rev}, \Rfunction{rep} & Conventional R semantics \\ Split, combine & \Rfunction{split} & Split ranges by a factor into a \Rclass{RangesList} \\ & \Rfunction{c} & Concatenate two or more range objects \\ \hline \end{tabular} \end{table} \paragraph{Adding \Rfunction{mcols} and \Rfunction{metadata}} The \Rclass{GRanges} class (actually, most of the data structures defined or extending those in the \Biocpkg{IRanges} package) has two additional very useful data components. The \Rfunction{mcols} function allows information on each range to be stored and manipulated (e.g., subset) along with the \Rclass{GRanges} instance. The element metadata is represented as a \Rclass{DataFrame}, defined in \Biocpkg{IRanges} and acting like a standard \R{} \Rclass{data.frame} but with the ability to hold more complicated data structures as columns (and with element metadata of its own, providing an enhanced alternative to the \Biocpkg{Biobase} class \Rclass{AnnotatedDataFrame}). < >= mcols(基因)<- DataFrame(EntrezId=c("42865", "2768869"), Symbol=c("kal-1", "CG34330")) @ \noindent \Rfunction{metadata}允许向整个对象添加信息。信息以列表的形式;任何数据都可以提供。< >=元数据(基因)<- list(CreatedBy="A。\Rfunction{GRanges}类对于表示简单的范围非常有用。一些下一代序列数据和基因组特征更具有层次结构。一个基因可以由它内部的几个外显子来表示。一个对齐的读可以用一个引用的不连续的对齐范围来表示。\Rclass{GRangesList}类表示这种类型的信息。它是一个类似列表的数据结构,列表的每个元素本身都是一个\Rclass{GRanges}实例。先前鉴定的ENSEMBL基因可以表示为\ r类{GRangesList}。< >= library(txdb . hsapiens . ucsc .hg19. knowngene) txdb <- txdb . hsapiens . ucsc .hg19。knownGene # alias ## have ENSEMBL ids ensids <- c("ENSG00000130720", "ENSG00000103257", "ENSG00000156414", "ENSG00000144644", "ENSG00000159307", "ENSG00000144485") ##需要ENTREZID的egids <- select(org. hs . exe .db, ensids, "ENTREZID", "ENSEMBL")$ENTREZID exByGn <- exonsBy(txdb, "gene")[egids] @ < \Rclass{GRangesList}对象具有列表的方法(例如,\Rfunction{length}, sub-setting)。许多介绍的用于\Rclass{IRanges}的方法也是可用的,方法是按元素应用的。\小节{选择基因序列}\begin{练习}这个练习使用注释包从基因标识符到编码序列。\begin{enumerate} \item使用\Biocannopkg{org.Hs.eg.db}包从一个非正式的基因符号映射到ENTREZID基因标识符,使用\Rfunction{select}函数,使用\Biocannopkg{TxDb.Hsapiens.UCSC.hg19。knownGene}包和第二个映射从ENTREZID到TXNAME。\item使用\Biocannopkg{TxDb.Hsapiens.UCSC.hg19提取由转录本分组的编码序列。knownGene}包和\Rfunction{cdsBy}函数;只选择我们感兴趣的那些文本。\item检索\Biocannopkg{bsgenome . hspiens . ucsc中的核苷酸序列。hg19}包使用\Rfunction{extractTranscriptsFromGenome}函数。\item验证编码序列均为3的倍数,并将核苷酸序列转换为氨基酸序列。\end{enumerate} \end{Exercise} \begin{Solution}从基因符号映射到ENTREZID,从ENTREZID映射到TXNAME,提取相应的编码序列,按转录本分组< >= library(org.Hs.eg.db) egid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL")$ENTREZID library(txdb . hsapiens . ucsc .hg19. knowngene) txdb <- txdb . hsapiens . ucsc .hg19. hg19。knownGene egToTx <- select(txdb, egid, "TXNAME", "GENEID") cdsByTx <- cdsBy(txdb, "tx", use.names=TRUE)[egToTx$TXNAME] @ \noindent检索序列< >= library(BSgenome.Hsapiens.UCSC.hg19) txx <- extractTranscriptsFromGenome(Hsapiens, cdsByTx) @ \noindent翻译为氨基酸序列< >= all(width(txx) %% 3 == 0) #完整性检查翻译(txx) #氨基酸序列@ \end{解决方案}\分段{汇总重叠}\begin{练习}RNA-seq和其他工作流中的一个基本操作是计算对齐的读取感兴趣的特征重叠的次数。\begin{enumerate} \item加载\Biocexptpkg{RNAseqData.HNRNPC.bam。chr14}实验数据包,并获取其中包含的BAM文件的路径。加载包含hg19 UCSC“已知基因”轨道的每个外显子坐标的“transcript db”包。提取按基因分组的外显子坐标;结果是一个\Rclass{GRangesList}对象,我们将在后面详细讨论。使用带有外显子坐标和BAM文件的\Rfunction{summarizeOverlaps}函数来生成每个基因重叠的读数的计数。访问帮助页面\Rcode{?summarizeOverlaps}来阅读使用的计数策略。计数可以使用函数\Rfunction{assay}从\Rfunction{summarizeOverlaps}的返回值中提取。 This is standard \R{} matrix. How many reads overlapped regions of interest in each sample? How many genes had non-zero counts? \end{enumerate} \end{Exercise} \begin{Solution} Point to BAM files < >= library(RNAseqData.HNRNPC.bam.chr14) fls <- RNAseqData.HNRNPC.bam. >= library(rnaseqdata . hnrnpc .chr14)获取基因模型;这也可以来自,例如,一个GFF或GTF文件。< >= library(BiocParallel) library(TxDb.Hsapiens.UCSC.hg19. knowngene) ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19. exonsBy)knownGene, "gene") @ \noindent总结每个感兴趣区域重叠的读数< >= counts <- summarizeOverlaps(ex, fls) colsum (assay(counts)) sum(rowsum (assay(counts)) != 0) @ \end{Solution} \appendix \nocite{10.1371/journal.pcbi. pcbi. com1003118} \bibliography{EMBOBGI} \end{document}