本工作流程的目的是检测从宾夕法尼亚州大豆田采集的若干样本中已知的 *Pythium*(腐霉属)物种。*Pythium* 是一类真核微生物,对植物有益还是有害取决于具体物种。许多腐霉会引起某些植物的根腐病;然而,也有一些腐霉物种被用作生物防治剂,以防止病原菌在作物上生长。本工作流程与 Coffua *等人*发表于《Plant Disease》期刊(2016)的、针对同一数据集的分析类似。

这里,细胞色素 C 氧化酶亚基 1(COI)基因被用作鉴定物种的系统发育标记。COI 基因是所有真核生物线粒体基因组的一部分。在本工作流程的第 1 部分中,目标是利用 COI 基因设计能够区分所有 *Pythium* 物种的最佳引物。第一步是从互联网下载已知 *Pythium* 物种的 COI 基因序列,并将它们导入序列数据库,如下所示。

```{r, results="hide"}
# Part 1: import reference COI sequences, align them, and design primers.
# NOTE(review): this chunk was restored from machine-translated text. Confirm
# the package name, the GenBank file name and the `pattern` literal against
# the upstream workflow before relying on them.

# All paths are relative to the installed datasets
data_dir <- system.file("extdata", package = "BigBioSeqData")

library(DECIPHER)

# Create a connection to an on-disk SQLite database
dbConn <- dbConnect(SQLite(), "./COIgenes.sqlite") # path to new database file

# Import sequences from a GenBank formatted file
Seqs2DB(
  file.path(data_dir, "Pythium_spp_COI.gb"),
  type = "GenBank",
  dbFile = dbConn,
  identifier = "Pythium"
)

# View the database table that was constructed
BrowseDB(dbConn)

# Retrieve the imported (unaligned) sequences
dna <- SearchDB(dbConn)
dna

# Align the sequences based on their translations;
# `DNA` holds the alignment, `dna` stays unaligned
DNA <- AlignTranslation(dna)
DNA

# Display the sequences in a web browser
BrowseSeqs(DNA)
# Show differences with the first sequence
BrowseSeqs(DNA, highlight = 1)
# Show differences with the consensus sequence
BrowseSeqs(DNA, highlight = 0)
# Change the degree of consensus
BrowseSeqs(DNA, highlight = 0, threshold = 0.2)

# Note the pattern common to most sequences
# (reused later to recognise Pythium reads)
pattern <- DNAStringSet("TAGATTTAGCWATTTTTAGTTTA")
BrowseSeqs(DNA, patterns = pattern)

# The protein sequences are very similar
AA <- AlignTranslation(dna, asAAStringSet = TRUE)
BrowseSeqs(AA, highlight = 1)

# Choose a reference for frameshift correction
REF <- translate(dna[11]) # sequence #11

# Correct the frameshift in sequence #12
correct <- CorrectFrameshifts(
  myXStringSet = dna[12],
  myAAStringSet = REF,
  type = "both"
)
correct
dna[12] <- correct$sequence

# Sequence #11 is now identical to #12
DNA <- AlignTranslation(dna)
BrowseSeqs(DNA, highlight = 11)

# Identify clusters for primer design
d <- DistanceMatrix(DNA)
dim(d) # a symmetric matrix
known_clusters <- IdClusters(
  d,
  method = "UPGMA",
  cutoff = 0.05,
  showPlot = TRUE
)
head(known_clusters) # cluster numbers

# Identify sequences by cluster name in the database
Add2DB(
  data.frame(identifier = paste0("cluster", known_clusters$cluster)),
  dbConn
)
BrowseDB(dbConn)

# Design primers for next-generation sequencing
primers <- DesignSignatures(
  dbConn,
  type = "sequence",
  resolution = 5,
  levels = 5,
  minProductSize = 400,
  maxProductSize = 800,
  annealingTemp = 55,
  maxPermutations = 8
)
primers[1, ] # the top scoring primer set

# Highlight the primers' target sites
BrowseSeqs(
  DNA,
  patterns = c(
    DNAStringSet(primers[1, 1]),
    reverseComplement(DNAStringSet(primers[1, 2]))
  )
)
```

Part #2 of the workflow uses sequences of the COI gene that were obtained from several locations in Pennsylvania. These DNA sequences are stored in FASTQ format along with their corresponding quality scores. After importing, the first step is to trim the sequences so that only the high quality center region remains.
The subset of sequences that might belong to *Pythium* species will be identified by the presence of a conserved region common to all *Pythium*. This analysis will be performed in batches so that all of the sequences do not need to fit in memory simultaneously.

```{r, results="hide"}
# Part 2: import the FASTQ reads, trim them by quality in batches, flag the
# reads containing the conserved Pythium pattern, and cluster them per sample.
# Relies on `data_dir`, `dbConn` and `pattern` created earlier in the document.

# Import from the compressed FASTQ sequence files
path <- paste0(data_dir, "/FASTQ/")
files <- list.files(path)
# Sample name = file name without its last 6 characters
samples <- substring(files, first = 1, last = nchar(files) - 6)
for (i in seq_along(files)) {
  cat(samples[i], ":\n", sep = "")
  Seqs2DB(
    paste0(path, files[i]),
    type = "FASTQ",
    dbFile = dbConn,
    identifier = samples[i],
    tblName = "Reads"
  )
}

# Function for determining boundaries
# of the high-quality central region.
#   probs:  per-base error probabilities of one read
#   thresh: maximum moving-average error probability that is accepted
#   width:  width of the moving-average window
# Returns c(first, last) positions whose averaged error is below `thresh`,
# or c(NA, NA) when no position qualifies.
bounds <- function(probs, thresh = 0.001, width = 21) {
  # Calculate a moving average; pad both ends with `thresh` so that the
  # window is defined at the edges of the read
  padding <- floor(width / 2)
  probs <- c(rep(thresh, padding), probs, rep(thresh, padding))
  # stats:: prefix so an attached package cannot mask filter()
  probs <- stats::filter(probs, rep(1 / width, width))
  # Find the region whose averaged error is below the threshold
  w <- which(probs < thresh) - padding
  if (length(w) == 0) {
    return(c(NA_real_, NA_real_))
  }
  c(w[1], w[length(w)])
}

# Trim the sequences by quality and identify
# the subset belonging to the Pythium genus
nSeqs <- SearchDB(dbConn, tblName = "Reads", countOnly = TRUE, verbose = FALSE)
# Integer arithmetic keeps the SQL LIMIT text free of scientific notation
# (paste(1e5) would give "1e+05")
batch_size <- 10000L
offset <- 0L
ends <- starts <- counts <- integer(nSeqs)
# NOTE(review): the two primers below are not used in this chunk
fprimer <- DNAString("TCAWCWMGATGGCTTTTTTCAAC")
rprimer <- DNAString("")
pBar <- txtProgressBar(max = nSeqs, style = 3)
while (offset < nSeqs) {
  # Select a batch of sequences
  dna <- SearchDB(
    dbConn,
    tblName = "Reads",
    type = "QualityScaledXStringSet",
    limit = paste(offset, batch_size, sep = ","),
    verbose = FALSE
  )

  # Convert quality scores to error probabilities
  probs <- as(quality(dna), "NumericList")
  # 2 x n matrix: row 1 = first, row 2 = last high-quality position
  endpoints <- vapply(as.list(probs), bounds, numeric(2))

  # Store the results for later use
  index <- offset + seq_along(dna)
  # Never start before position 38: first base after the forward primer
  starts[index] <- pmax(endpoints[1, ], 38L)
  # end = start - 1 encodes "no high quality bases"
  ends[index] <- pmax(endpoints[2, ], starts[index] - 1L)

  # Find the pattern expected in Pythium sequences
  counts[index] <- vcountPattern(
    pattern[[1]],
    subject = dna,
    max.mismatch = 4,
    with.indels = TRUE,
    fixed = "subject" # allow ambiguities
  )

  offset <- offset + batch_size
  setTxtProgressBar(pBar, min(offset, nSeqs))
}
close(pBar)

# Add the results to new columns in the database
results <- data.frame(start = starts, end = ends, count = counts)
Add2DB(results, dbFile = dbConn, tblName = "Reads", verbose = FALSE)
BrowseDB(dbConn, tblName = "Reads", limit = 1000)

# Cluster the reads in each sample by percent identity
for (i in seq_along(samples)) {
  cat(samples[i])

  # Select moderately long sequences
  dna <- SearchDB(
    dbConn,
    tblName = "Reads",
    identifier = samples[i],
    clause = "count > 0 and (end - start + 1) >= 100",
    verbose = FALSE
  )
  cat(":", length(dna), "sequences")

  # Nothing to cluster for this sample
  if (length(dna) == 0L) {
    cat("\n")
    next
  }

  # Trim the sequences to the high-quality region
  index <- as.numeric(names(dna))
  dna <- subseq(dna, start = starts[index], end = ends[index])

  # Cluster the sequences without a distance matrix
  clusters <- IdClusters(
    myXStringSet = dna,
    method = "inexact",
    cutoff = 0.03, # > 97% identity
    verbose = FALSE
  )

  # Add the cluster numbers to the database
  Add2DB(clusters, dbFile = dbConn, tblName = "Reads", verbose = FALSE)
  cat(",", length(unique(clusters[, 1])), "clusters\n")
}

# Now the database contains a column of clusters
BrowseDB(
  dbConn,
  tblName = "Reads",
  limit = 1000,
  clause = "cluster is not NULL"
)
```

In part #3 of the workflow, representatives from each sequence cluster are compared to known *Pythium* species. The goal of this analysis is to identify which organisms present in each sample are similar to known species. The known species are separated into two groups: those that are used as biocontrol agents (good strains) and those that are known to be plant pathogens (bad strains).
```{r, results="hide"}
# Part 3: compare one representative read per cluster with the known Pythium
# species and draw a tree coloured by pathogenicity.
# Relies on `dbConn`, `starts` and `ends` created earlier in the document.

ids <- IdentifyByRank(dbConn, add2tbl = TRUE)
lens <- IdLengths(dbConn, add2tbl = TRUE)
BrowseDB(dbConn)

# Separate Pythium strains into good and bad groups
biocontrol <- c(
  "Pythium oligandrum",
  "Pythium nunn",
  "Pythium periplocum"
)
pathogen <- c(
  "Pythium acanthicum", # strawberries:
  "Pythium rostratum",
  "Pythium middletonii",
  "Pythium aristosporum", # grasses/cereals:
  "Pythium graminicola",
  "Pythium okanoganense",
  "Pythium paddicum",
  "Pythium volutum",
  "Pythium arrhenomanes",
  "Pythium buismaniae", # flowers:
  "Pythium spinosum",
  "Pythium mastophorum",
  "Pythium splendens",
  "Pythium violae", # carrots:
  "Pythium paroecandrum",
  "Pythium sulcatum",
  "Pythium dissotocum", # potatoes:
  "Pythium scleroteichum",
  "Pythium myriotylum",
  "Pythium heterothallicum", # lettuce:
  "Pythium tracheiphilum",
  "Pythium ultimum", # multiple plants:
  "Pythium irregulare",
  "Pythium aphanidermatum",
  "Pythium debaryanum",
  "Pythium sylvaticum"
)

# Select the longest sequence from each species
# (the species names are quoted into an SQL "in (...)" list)
quoted_species <- paste0("'", c(biocontrol, pathogen), "'", collapse = ", ")
species <- SearchDB(
  dbConn,
  nameBy = "identifier",
  clause = paste0(
    "identifier in (",
    quoted_species,
    ") group by identifier having max(bases)"
  )
)

# Select the longest sequence in each cluster
dna <- SearchDB(
  dbConn,
  identifier = "DauphinFarm", # choose a sample
  tblName = "Reads",
  clause = "cluster is not null group by cluster having max(end - start)"
)

# Trim to the high quality central region
index <- as.numeric(names(dna))
dna <- subseq(dna, start = starts[index], end = ends[index])

# Create a tree with known and unknown species
combined <- AlignSeqs(c(dna, species))
dists <- DistanceMatrix(
  combined,
  verbose = FALSE,
  correction = "Jukes-Cantor"
)
tree <- IdClusters(
  dists,
  method = "NJ", # Neighbor joining
  asDendrogram = TRUE,
  verbose = FALSE
)
plot(tree, nodePar = list(lab.cex = 0.5, pch = NA))

# Color known species based on their pathogenicity:
# red edge = listed in `pathogen`, green edge = listed in `biocontrol`;
# every leaf label is then blanked
tree_colored <- dendrapply(tree, function(x) {
  if (is.leaf(x)) {
    leaf_label <- attr(x, "label")
    if (leaf_label %in% pathogen) {
      attr(x, "edgePar") <- list(col = "red")
    } else if (leaf_label %in% biocontrol) {
      attr(x, "edgePar") <- list(col = "green")
    }
    # Remove the label
    attr(x, "label") <- ""
  }
  x
})
plot(tree_colored)

# Disconnect from the sequence database
dbDisconnect(dbConn)
# Permanently delete the database
unlink("./COIgenes.sqlite") # optional!
```