1\documentclass{bioinfo2} 2 3\copyrightyear{2015} 4\pubyear{2015} 5 6\usepackage{amsmath} 7\usepackage{natbib} 8 9\newcommand{\specialcell}[2][c]{% 10 \begin{tabular}[#1]{@{}l@{}}#2\end{tabular}} 11 12\usepackage{caption} 13\renewcommand{\thetable}{S\arabic{table}} 14 15\bibliographystyle{apalike} 16 17\begin{document} 18\firstpage{1} 19 20\title[Error Correction for Illumina Data]{BFC: correcting Illumina sequencing errors (supplementary notes)} 21\author[Li]{Heng Li} 22\address{Broad Institute, 75 Ames Street, Cambridge, MA 02142, USA} 23\maketitle 24 25\section{Supplementary Notes} 26 27\subsection{Other error correction tools} 28 29In addition to the error correctors evaluated in the manuscript, we have also 30tried AllPaths-LG~\citep{Gnerre:2011ys}, Fiona~\citep{Schulz:2014aa} and 31Trowel~\citep{Lim:2014aa}, but they require more than 128GB RAM our machine 32has, so were not evaluated. According to \citet{Molnar:2014aa}, 33RACER~\citep{Ilie:2013aa} uses 1.38--2.36 bytes per input base for 34high-coverage human data. It would also use too much memory. We considered 35Blue~\citep{Greenfield:2014aa}, but its website is down at the time of the 36evaluation. We have also tried QuorUM~\citep{Zimin:2013aa}. However, it 37always trims reads, making it hard to be compared to others which keep 38full-length reads. 39 40\subsection{Mapping-based metrics} 41 42In Illumina sequencing, a small fraction of reads may harbor many errors. They 43pose a great challenge to error correctors. They may significantly increase 44the number of unmapped reads and inflate number of uncorrected base errors. 45However, in practice, it is usually safe to ignore these reads. There are at 46times enough high-quality reads in the same region. To reduce the effect 47of these low-quality reads, we measured the accuracy based on read counts 48rather than base counts. 49 50For the human data, a concern with mapping-based metrics is that sample NA12878 51is different from the human reference genome, which might lead to reference 52bias. We think this concern is minor because reads were corrected with no 53information about the reference. Although given an individual read we cannot say 54the corrected read with fewer mismatches to the reference is always better, given 55millions of reads, a better corrector should yield fewer mismatches overall. 56Mapping-based metrics are valid and appealing due to the simple ascertainment 57procedure. 58 59\subsection{Error correction for {\it C. elegans} reads} 60 61We have tried 23-mer and 27-mer for error correction. The N50 of assembled 62scaffolds and contigs are mostly within a few hundred bp with 23-mer 63slightly better. The rest of assembly based metrics were all calculated 64for the correction with 23-mers. Table~S1 evaluates the accuracy with 65mapping-based metrics. The relative performance of tools is similar to that in 66Table~1. 67 68\begin{table}[t] 69\processtable{Error correction for 67.6M {\it C. elegans} reads} 70{\footnotesize 71\begin{tabular}{p{2cm}p{1.35cm}p{1.35cm}p{1.35cm}r} 72\toprule 73Prog. & Perfect&Chim.& Better & Worse \\ 74\midrule 75raw data & 47.6M & 398k & -- & -- \\ 76BFC-bf & 57.9M & 402k & 11.3M & 117k \\ 77BFC-ht &{\bf 58.3M} & 410k & 12.1M & {\bf 92k} \\ 78BLESS & 57.3M & {\bf 393k} & 11.0M & 137k \\ 79Bloocoo & 55.3M & 417k & 10.2M & 255k \\ 80Fermi2 & 58.1M & 431k & {\bf 13.2M} & 287k \\ 81Lighter & 58.0M & 400k & 11.3M & 171k \\ 82Musket & 57.4M & 448k & 11.8M & 246k \\ 83SGA & 57.3M & 418k & 10.7M & 201k \\ 84\botrule 85\end{tabular}}{} 86\end{table} 87 88\begin{table*}[t] 89\processtable{Command lines used for assembly evaluation} 90{\footnotesize 91\begin{tabular}{lp{13.2cm}} 92\toprule 93Functionality & Command line \\ 94\midrule 95BFC-bf-r157 correction & \specialcell[t]{\tt kmc -k23 -t8 SRR065390.fq SRR065390.kmc kmc-tmp\\\tt bfc-kmc -t8 SRR065390.kmc SRR065390.raw.fq} \\ 96BFC-ht-r175 correction & {\tt bfc -t8 -s 100m -k 23 SRR065390.fq} \\ 97BLESS-v0p23 correction & {\tt bless -read SRR065390.fq -kmerlength 23 -prefix out -smpthread 8 -notrim} \\ 98Bloocoo-1.0.4 correction & {\tt Bloocoo -nb-cores 8 -file SRR065390.fq -kmer-size 23 -out out} \\ 99Fermi2-r175 correction & \specialcell[t]{\tt ropebwt2 -drq20 -x31 SRR065390.raw.fq \char62\,SRR065390.fmd\\\tt fermi2 correct -t8 -k23 SRR065390.fmd SRR065390.fq} \\ 100Lighter-1.0.4 correction & {\tt lighter -K 23 100000000 -r SRR065390.fq -t 8} \\ 101Musket-1.1 correction & {\tt musket -k 23 200000000 -p 8 -inorder -o out.fq SRR065390.fq} \\ 102SGA-0.10.3 correction & \specialcell[t]{\tt sga preprocess -p 2 SRR065390.fq > reads.fq\\\tt sga index -a ropebwt -t 8 --no-reverse reads.fq\\\tt sga correct -t 8 -k 23 --learn reads.fq} \\ 103\midrule 104Velvet-1.2.10 assembly & \specialcell[t]{\tt velveth k61 61 -shortPaired -fmtAuto SRR065390.fq\\\tt velvetg k61 -exp\_cov auto -scaffolding yes -cov\_cutoff auto -ins\_length 250} \\ 105ABySS-1.5.2 assembly & {\tt abyss-pe name=k67 k=67 in=SRR065390.fq q=0 s=500 n=5} \\ 106Fermi2-kit-0.9 assembly & \specialcell[t]{\tt fermi2.pl unitig -Es100m -t16 SRR065390.fq \char62\,SRR065390.mak\\\tt make -f SRR065390.mak} \\ 107Bwa-0.7.12 contig mapping & {\tt bwa mem -xintractg -t8 ce235.fa contigs.mag.gz \char62\,contigs.sam} \\ 108Aligned N50 and break points & {\tt htsbox abreak -l200 contigs.sam} \\ 109FreeBayes-0.9.20 variant calling & {\tt freebayes --experimental-gls -f celegans.fa reads.bam} \\ 110Variant calling from contigs & {\tt htsbox pileup -cuf celegans.fa contigs.bam \char62\,contigs.vcf} \\ 111Variant filtering & \specialcell[t]{\tt k8 hapdip.js deovlp contigs.vcf \char124\,k8 hapdip.js anno \char62\,contigs.anno\\\tt k8 hapdip.js filter -DF36 -q3 contigs.anno \char62\,contigs.flt} \\ 112\botrule 113\end{tabular}}{Command lines for correcting human data are similar except for 114the $k$-mer length and extra options to enable gzip'd input.} 115\end{table*} 116 117\subsection{Assembly-related data processing} 118 119\subsubsection{Velvet assembly} 120The {\it C. elegans} reads have been assembled by~\citet{Simpson:2012aa} and 121independently by~\citet{Song:2014aa}, both using Velvet. The former assembled 122the short reads into scaffolds and then broke them into contigs at assembly 123gaps and derived a contig N50 of 13.6kb. The latter directly assembled reads into 124contigs and reached an N50 of 17.3kb. We have tried both settings and could 125approximately reproduce the difference. We found Velvet produced significantly 126more misassemblies (477 vs 352) when it is not asked to generate scaffolds for 127this data set. We used setting by~\citet{Simpson:2012aa} in Table~2. 128 129\subsubsection{ABySS assembly} 130We initially used the default setting of ABySS, only specifying the $k$-mer length. 131We found the N50 is not as good as the one produced by~\citet{Simpson:2012aa}, 132so we adopted their setting. ABySS generate a file with suffix 133`\mbox{-contigs.fa}', but that file still contains long gaps. We finally 134decided to break `-scaffolds.fa' at gaps no short than 5bp. Changing this 135threshold leads to slightly different results, but the relatively ranking of 136correctors is similar. 137 138\subsubsection{FreeBayes SNP calling} 139The latest freebayes-0.9.21 has an issue that causes missing variants in some 140regions. A major change in 0.9.21 is that it uses `--experimental-gls' by 141default. We are using version 0.9.20 with this option applied. 142 143\subsubsection{SNP calling from assembled contigs} 144Fermi2 encodes per-base read depth in the quality string. HTSbox 145(http://bit.ly/HTSBox) is able to call variants using the read depth. 146HTSbox also works with contigs in the FASTA format without per-base read depth. 147It simply lists difference between contigs and the reference given a 148mapping quality threshold. 149 150\subsection{Divergence between SRR065390 and the reference} 151On BWA-MEM short-read mapping, freebayes called 7,047 SNPs before filtering. If 152we exclude SNPs with depth higher than 98 ($=65+4\sqrt{65}$, where 65 is the 153average depth), we are left with 4,251 SNPs. If we apply a similar set of 154filters used in our previous works~\citep{Li:2014aa}, we get 1,658 SNPs, about 155half of which are heterozygotes. We manually checked tens of heterozygous SNPs 156and believed most of them are false calls potentially due to false mappings 157around larger variants. Similarly, we called 3,793 unfiltered and 1,600 filtered 158SNPs from fermikit unitigs and found most post-filtered heterozygotes are 159questionable. Thus we conclude that per-base substitution rate between the 160sequenced sample and the {\it C. elegans} reference genome should be below one 161per 100kb. 162 163We called 8,819 unfiltered SNPs from Velvet contigs and 13,384 from ABySS contigs. 164Without per-base read depth, it is nontrivial to filter them further. In addition 165to SNPs, we also called 655 $>$10bp INDELs from Velvet contigs and 1,393 from 166ABySS contigs. In contrast, we called only 81 $>$10bp INDELs from fermi2 unitigs. 167Upon manual inspection, we suspect many of these Velvet/ABySS SNPs and long 168INDELs are caused by long-range misassemblies or short-range aggressive bubble 169popping. Velvet and ABySS produce long contigs, but they are not good enough 170for small variant calling. Therefore, we have not evaluated Velvet and ABySS 171SNP calls in Table~2 of the main article. 172 173\subsection{Effect of systematic sequencing errors} 174BFC-ht and BFC-bf use very similar algorithms except that when collecting 175trusted $k$-mers, BFC-ht marks a $k$-mer if it contains low-quality bases. BFC-ht 176uses this information in error correction. This strategy dramatically improves 177SNP accuracy (Table~2) because systematic errors tend to have low base quality. 178Fermi2 is the other corrector that is quality-aware at the $k$-mer counting 179phase and also produces relatively fewer false positive SNPs. 180 181We think the fewer number of false positive SNPs called from raw data are also 182related to systematic errors. In Illumina sequencing, when there are systematic 183errors, flanking regions nearby tend to be enriched with sequencing errors as 184well, which makes the assembler harder to assemble through such regions. 185Because there are no unitigs covering these regions, we don't call systematic 186errors as false positives. However, not using error correction in this case 187clearly leads to inferior N50 and probably also lead to reduced sensitivity. 188 189It should be noted that the profile of systematic errors is affected by 190chemistries. This {\it C. elegans} run appears to have high systematic error 191rate at the GGC motif~\citep{Nakamura:2011aa}, but recent Illumina data seem 192to have more errors associated with long ploy-A. 193 194\subsection{Biases in evaluation} 195We have been adjusting the BFC and fermi2 correctors based on older human 196assemblies produced by the fermi2 assembler. Other tools have not received this 197treatment and many of them have not been run on whole-genome human data before. 198Our evaluation is biased towards BFC and fermi2. 199 200%\subsection{Concluding remarks} 201%BFC is a free, fast and easy-to-use error corrector designed for Illumina short 202%reads. It uses a more exhausive algorithm but still maintains a speed 203%comparable to implementations based on greedy methods. In practice, BFC appears 204%to correct more errors with fewer overcorrections in comparison to others. It 205%particularly does well in suppressing systematic sequencing errors, which helps 206%to improve the base accuracy of {\it de novo} assemblies. 207 208\bibliography{bfc} 209\end{document} 210