\documentclass{bioinfo}
\copyrightyear{2015}
\pubyear{2015}

\usepackage{amsmath}
\usepackage[ruled,vlined]{algorithm2e}
\newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}}
\SetCommentSty{mycommfont}
\SetKwComment{Comment}{$\triangleright$\ }{}
\usepackage{natbib}

\bibliographystyle{apalike}


\begin{document}
\firstpage{1}

\title[Error Correction for Illumina Data]{BFC: correcting Illumina sequencing errors}

\author[Li]{Heng Li}

\address{Broad Institute, 75 Ames Street, Cambridge, MA 02142, USA}

\history{Received on XXXXX; revised on XXXXX; accepted on XXXXX}
\editor{Associate Editor: XXXXXXX}
\maketitle
\begin{abstract}
\section{Summary:} BFC is a free, fast and easy-to-use sequencing error
corrector designed for Illumina short reads. It uses a non-greedy algorithm
but still maintains a speed comparable to implementations based on greedy
methods. In evaluations on real data, BFC appears to correct more errors with
fewer overcorrections than existing tools. It does particularly well in
suppressing systematic sequencing errors, which helps to improve the base
accuracy of {\it de novo} assemblies.

\section{Availability and implementation:} https://github.com/lh3/bfc

\section{Contact:} hengli@broadinstitute.org
\end{abstract}

\vspace{-1em}

\section{Introduction}

Error correction is the process of fixing sequencing errors in a read by
using other overlapping reads that do not contain the same errors. Many
\emph{de novo} assemblers, in particular short-read assemblers for large
genomes, use error correction to reduce the complexity of the assembly graph
such that the graph fits in limited RAM. Error correction was first
formulated as the \emph{spectral alignment problem}~\citep{Pevzner:2001vn},
whereby we take a set of trusted $k$-mers and attempt to find a sequence with
a minimal number of corrections such that every $k$-mer on the corrected
sequence is trusted. The majority of error correctors are based on this idea
and solve the problem approximately with a greedy approach: they make each
correction based on the local sequence context and never revert a decision.
We worried that this greedy strategy might hurt accuracy on reads from a
repeat-rich diploid genome, so we derived a new non-greedy algorithm that
explores a larger search space in an attempt to achieve higher accuracy.

\vspace{-1em}

\begin{methods}
\section{Methods}
Algorithm~1 is the key component of BFC. It defines a \emph{state} of
correction as a 4-tuple $(i,W,\mathcal{C},p)$, which consists of the position
$i$ of the preceding base, the last \mbox{($k-1$)-mer} $W$ ending at $i$, the
set $\mathcal{C}$ of previously corrected positions and bases (called a
\emph{solution}) up to $i$, and the penalty $p$ of solution $\mathcal{C}$.
BFC keeps all possible states in a priority queue $\mathcal{Q}$. At each
iteration, it retrieves the state $(i,W,\mathcal{C},p)$ with the lowest
penalty $p$ (line~1) and adds a new state $(i+1,W[1,k-2]\circ
a,\mathcal{C}',p')$ if $a$ is the read base or if $W\circ a$ is a trusted
$k$-mer. If the first $k$-mer in $S$ is error free and we disallow untrusted
$k$-mers by removing line~3, this algorithm finds the optimal solution to the
spectral alignment problem. \citet{Chaisson:2004kx} described a more general
non-greedy algorithm; its strict form is impractical to implement, and its
heuristic adaptation is loosely similar to ours without line~3.

\begin{algorithm}[ht]
\DontPrintSemicolon
\footnotesize
\KwIn{$k$-mer size $k$, set $\mathcal{H}$ of trusted $k$-mers, and one string $S$}
\KwOut{Set of corrected positions and bases changed to}
\BlankLine
\textbf{Function} {\sc CorrectErrors}$(k, \mathcal{H}, S)$
\Begin {
	$\mathcal{Q}\gets${\sc HeapInit}$()$\Comment*[r]{$\mathcal{Q}$ is a priority queue}
	{\sc HeapPush}$(\mathcal{Q}, (k-2, S[0,k-2], \emptyset, 0))$\Comment*[r]{0-based strings}
	\While{$\mathcal{Q}$ is not empty} {
		\nl$(i, W, \mathcal{C}, p)\gets${\sc HeapPopBest}$(\mathcal{Q})$\Comment*[r]{current best state}
		$i\gets i+1$\;
		\lIf{$i=|S|$} { {\bf return} $\mathcal{C}$ \Comment*[f]{reaching the end of $S$}}
		\nl$\mathcal{N}\gets\{(i,{\rm A}),(i,{\rm C}),(i,{\rm G}),(i,{\rm T})\}$\Comment*[r]{set of next bases}
		\ForEach (\Comment*[f]{try all possible next bases}) {$(j,a)\in\mathcal{N}$} {
			$W'\gets W\circ a$\Comment*[r]{``$\circ$'' concatenates strings}
			\uIf (\Comment*[f]{no correction}) {$i=j$ {\bf and} $a=S[j]$} {
				\eIf (\Comment*[f]{good read base; no penalty}) {$W'\in\mathcal{H}$} {
					{\sc HeapPush}$(\mathcal{Q}, (j,W'[1,k-1],\mathcal{C}, p))$\;
				} (\Comment*[f]{bad read base; penalize}) {
					\nl{\sc HeapPush}$(\mathcal{Q}, (j,W'[1,k-1],\mathcal{C}, p+1))$\;
				}
			} \ElseIf (\Comment*[f]{make a correction with penalty}) {$W'\in \mathcal{H}$} {
				\nl{\sc HeapPush}$(\mathcal{Q}, (j,W'[1,k-1],\mathcal{C}\cup\{(j,a)\},p+1))$\;
			}
		}
	}
}
\caption{Error correction for one string in one direction}
\end{algorithm}
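
For illustration, the following Python sketch mirrors Algorithm~1, assuming
trusted $k$-mers are held in a plain set rather than the probabilistic data
structures used by the actual C implementation:

{\footnotesize\begin{verbatim}
import heapq

def correct_errors(k, trusted, S):
    # sketch of Algorithm 1 (BFC itself is in C)
    # state: (penalty p, position i of the
    # preceding base, last (k-1)-mer W,
    # corrections C as ((pos, base), ...))
    heap = [(0, k - 2, S[:k - 1], ())]
    while heap:
        p, i, W, C = heapq.heappop(heap)
        i += 1
        if i == len(S):    # end of the read
            return C
        for a in 'ACGT':   # all next bases
            Wp = W + a     # candidate k-mer
            if a == S[i]:  # no correction
                q = p if Wp in trusted else p + 1
                heapq.heappush(
                    heap, (q, i, Wp[1:], C))
            elif Wp in trusted:
                # correct S[i] to a; penalty 1
                heapq.heappush(
                    heap,
                    (p + 1, i, Wp[1:],
                     C + ((i, a),)))
    return None  # exhausted: uncorrectable
\end{verbatim}}

Because the lowest-penalty state is always popped first, the first complete
solution returned is optimal, at the cost of a potentially large queue.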

It is possible to modify Algorithm~1 to correct insertion and deletion errors
(INDELs) by augmenting the set of ``next bases'' at line~2 to
$$\mathcal{N}=\{(j,a)\,|\,j\in\{i-1,i\},\,a\in\{{\rm A},{\rm C},{\rm G},{\rm T}\}\}\cup\{(i,\epsilon)\}$$
In this set, $(i,a)$ substitutes a base at position $i$, $(i,\epsilon)$
deletes the base, and $(i-1,a)$ inserts a base $a$ before $i$. We have not
implemented this INDEL-aware algorithm because such errors are rare in
Illumina data.
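
Were it implemented, only the candidate set in the sketch above would need to
change. A hypothetical helper illustrates the augmented set:

{\footnotesize\begin{verbatim}
def indel_candidates(i):
    # hypothetical: BFC does not implement this
    # (i, a): substitute the base at i
    # (i, None): delete it (epsilon)
    # (i - 1, a): insert a before position i
    subs = [(i, a) for a in 'ACGT']
    ins = [(i - 1, a) for a in 'ACGT']
    return subs + ins + [(i, None)]
\end{verbatim}}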

The worst-case time complexity of Algorithm~1 is exponential in the read
length. In the implementation, we use a heuristic to reduce the search space:
we skip line~4 if the base quality is 20 or higher (Q20) and the $k$-mer
ending at the base is trusted, or if five bases, or two Q20 bases, have
already been corrected in the last 10bp window. If function
{\sc CorrectErrors} still takes too many iterations before returning, it
stops the search and marks the read uncorrectable.
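
In Python, this pruning test could be sketched as follows, with the
thresholds taken from the description above and \texttt{recent} holding the
(position, quality) pairs of corrections in the trailing 10bp window:

{\footnotesize\begin{verbatim}
def skip_correction(base_q, kmer_trusted,
                    recent):
    # skip line 4 for a confident read base
    # supported by a trusted k-mer
    if base_q >= 20 and kmer_trusted:
        return True
    # or when the last 10bp window is
    # already heavily edited
    n_q20 = sum(1 for _, q in recent
                if q >= 20)
    return len(recent) >= 5 or n_q20 >= 2
\end{verbatim}}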

Given a read, BFC finds the longest substring on which every $k$-mer is
trusted, and then extends this substring toward both ends of the read with
Algorithm~1. If a read contains no trusted $k$-mer at all, BFC exhaustively
enumerates all $k$-mers one mismatch away from the first $k$-mer on the read
in search of a trusted $k$-mer; it marks the read uncorrectable if this
search finds no trusted $k$-mer or more than one.
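
This seeding step can be sketched in a few lines, again with trusted
$k$-mers in a plain set:

{\footnotesize\begin{verbatim}
def seed_one_mismatch(kmer, trusted):
    # enumerate all k-mers one mismatch away
    # from the first k-mer of the read
    hits = []
    for i, b in enumerate(kmer):
        for a in 'ACGT':
            if a == b:
                continue
            c = kmer[:i] + a + kmer[i + 1:]
            if c in trusted:
                hits.append(c)
    # none or multiple hits: uncorrectable
    return hits[0] if len(hits) == 1 else None
\end{verbatim}}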

We provide two related implementations of Algorithm~1, BFC-bf and BFC-ht.
BFC-bf uses KMC2~\citep{kmc2} to obtain exact $k$-mer counts and then keeps
$k$-mers occurring three times or more in a blocked Bloom
filter~\citep{DBLP:conf/wea/PutzeSS07}. BFC-ht uses a combination of a Bloom
filter and an in-memory hash table to derive approximate $k$-mer
counts~\citep{Melsted:2011bh}, together with counts of $k$-mers consisting
solely of Q20 bases. We modified Algorithm~1 such that missing a trusted
high-quality $k$-mer incurs an extra penalty. This is intended to help
correct systematic sequencing errors, which are recurrent yet tend to have
lower base quality.
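
Functionally, the $k$-mer stage reduces to the sketch below, with an exact
in-memory counter standing in for KMC2 and the Bloom filters, and with
reverse-complement canonicalization omitted for brevity:

{\footnotesize\begin{verbatim}
from collections import Counter

def trusted_kmers(reads, k, min_occ=3):
    # toy stand-in for KMC2 + Bloom filter
    counts = Counter()
    for r in reads:
        for i in range(len(r) - k + 1):
            counts[r[i:i + k]] += 1
    # keep k-mers seen three or more times
    return {km for km, n in counts.items()
            if n >= min_occ}
\end{verbatim}}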

\end{methods}

\section{Results and Discussion}

We evaluated BFC along with BLESS-v0p23~\citep{Heo:2014aa},
Bloocoo-1.0.4~\citep{Drezen:2014aa}, fermi2-r175~\citep{Li:2012fk},
Lighter-20150123~\citep{Song:2014aa}, Musket-1.1~\citep{Liu:2013ac} and
SGA-0.9.13~\citep{Simpson:2012aa}. We ran the tools on a Linux server with 20
cores of Intel E5-2660 CPUs and 128GB RAM. Precompiled binaries are available
at http://bit.ly/biobin and the command lines are shown in Table~S2. Notably,
BLESS only works with uncompressed files; the other tools were provided with
gzip'd FASTQ.

\begin{table}[t]
\processtable{Performance of error correction}
{\footnotesize
\begin{tabular}{lcrrccrr}
\toprule
Prog.     & $k$ & Time  & RAM   & Perfect & Chim. & Better & Worse \\
\midrule
raw data  & --  & --    & --    & 2.40M  & 12.4k  & --     & -- \\
BFC-bf    & 31  & 7h32m & 23.3G & 3.01M  & 13.1k  & 783k   & 9.2k \\
BFC-bf    & 55  &{\bf 4h41m}& 23.3G &{\bf 3.05M}& 11.8k & 819k & 11.4k \\
BFC-ht    & 31  & 7h15m & 83.5G & 3.03M  & 13.6k  & 816k   & 10.8k \\
BFC-ht    & 55  & 5h51m & 67.9G &{\bf 3.05M}& 11.7k & 830k &{\bf 9.0k}\\
BLESS     & 31  & 6h31m & 22.3G & 2.91M  & 13.1k  & 674k   & 20.8k \\
BLESS     & 55  & 5h09m & 22.3G & 3.01M  &{\bf 11.5k}& 775k & 10.3k \\
Bloocoo   & 31  & 5h52m &{\bf 4.0G}& 2.88M & 14.1k & 764k  & 31.5k \\
Fermi2    & 29  &17h14m & 64.7G & 3.00M  & 17.7k  &{\bf 849k}& 42.8k \\
Lighter   & 31  & 5h12m & 13.4G & 2.98M  & 13.0k  & 756k   & 30.1k \\
Musket    & 27  &21h33m & 77.5G & 2.94M  & 22.5k  & 790k   & 36.3k \\
SGA       & 55  &48h40m & 35.6G & 3.01M  & 12.1k  & 755k   & 12.8k \\
\botrule
\end{tabular}}{445 million pairs of $\sim$150bp reads were downloaded from
BaseSpace, under sample ``NA12878-L7'' of project ``HiSeq X Ten: TruSeq
Nano (4 replicates of NA12878)'', and were corrected together. On a subset of
two million randomly sampled read pairs, the original and the corrected
sequences were mapped to hs37d5 (http://bit.ly/GRCh37d5) with
BWA-MEM~\citep{Li:2013aa}. A read is said to become \emph{better} (or
\emph{worse}) if the best alignment of the corrected sequence has more (or
fewer) bases identical to the reference genome than the best alignment of the
original sequence. The table gives the $k$-mer size (the maximum size for
Bloocoo, fermi2, Lighter and Musket), the wall-clock \emph{time} with 16
threads where supported, the peak \emph{RAM} measured by GNU time, the number
of corrected reads mapped \emph{perfectly}, the number of \emph{chimeric}
reads (i.e.\ reads with parts mapped to different places), and the numbers of
corrected reads becoming \emph{better} and \emph{worse} than the original
reads. For each metric, the best tool is highlighted in bold.}
\end{table}
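
The better/worse counts in Table~1 can be approximated from the BWA-MEM
alignments; the sketch below is a hypothetical reimplementation of the
metric, not the exact evaluation script, and assumes pysam with the M-only
CIGAR operations that BWA-MEM emits:

{\footnotesize\begin{verbatim}
import pysam

def identical_bases(read):
    # hypothetical reimplementation of the
    # metric: bases identical to the reference
    # = M bases minus mismatches, where
    # NM = mismatches + indel bases
    if read.is_unmapped:
        return 0
    cig = read.cigartuples
    m = sum(l for op, l in cig if op == 0)
    indel = sum(l for op, l in cig
                if op in (1, 2))
    return m - (read.get_tag('NM') - indel)
\end{verbatim}}

A read then counts as \emph{better} (\emph{worse}) if this number is larger
(smaller) for the corrected sequence than for the original.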

\begin{table}[b]
\processtable{Effect on {\it de novo} assembly}
{\footnotesize
\begin{tabular}{p{1.0cm}p{1.3cm}p{1.9cm}p{1.5cm}p{1.2cm}}
\toprule
Program   & Scaffold NG50 (kb) & Contig aligned-N50 (kb) & Alignment break points & Potential FP SNPs\\
\midrule
raw data  & 31.6 / 29.9 & 14.8 / 20.2 / 4.9 & 352 / 157 / 29 & 1568 / 334\\
BFC-bf    & 33.7 / 33.7 & 17.3 / 22.8 / 7.6 & 341 / 173 / 33 & 3334 / 668\\
BFC-ht    & 34.8 / 34.2 & 17.3 / 22.9 / 9.1 & 314 / 176 / 31 & 1397 / 374\\
BLESS     & 33.6 / 31.7 & 15.4 / 20.7 / 8.0 & 351 / 165 / 28 & 1744 / 414\\
Bloocoo   & 34.5 / 33.7 & 17.1 / 22.6 / 6.2 & 340 / 177 / 32 & 3128 / 480\\
Fermi2    & 33.8 / 33.7 & 16.8 / 22.6 / 8.9 & 333 / 174 / 32 & 1444 / 396\\
Lighter   & 34.6 / 33.6 & 16.6 / 22.4 / 7.9 & 329 / 180 / 34 & 3011 / 651\\
Musket    & 33.4 / 32.2 & 16.0 / 21.2 / 8.2 & 338 / 181 / 30 & 1940 / 617\\
SGA       & 34.4 / 33.4 & 16.7 / 22.4 / 6.6 & 360 / 163 / 32 & 3107 / 495\\
\botrule
\end{tabular}}{33.8 million pairs of 100bp {\it C. elegans} reads were
downloaded from SRA under accession SRR065390, corrected with $k$-mer length
23 and then assembled with Velvet-1.2.10~(\citealp{Zerbino:2008uq}; velvetg
option `\mbox{-exp\_cov} auto \mbox{-cov\_cutoff} auto \mbox{-ins\_length} 250'
with $k$-mer length 61), ABySS-1.5.2~[\citealp{Simpson:2009ys}; abyss-pe
option `k=67 q=0 s=500 n=5' taken from~\citet{Simpson:2012aa}] and fermikit-0.9
(fermi2.pl option \mbox{`-Es100m'}). For Velvet and ABySS, contigs were derived
by splitting scaffolds at runs of `N' bases longer than 5bp. Each row gives
the NG50 of Velvet/ABySS scaffolds longer than 200bp, the aligned N50 of
Velvet/ABySS/fermikit contigs longer than 200bp, the number of contig
alignment break points with $>$200bp flanking sequences, and the number of
false positive unfiltered/filtered fermikit SNPs as compared to
freebayes-0.9.20 calls~(\citealp{Garrison:2012aa}; option
`\mbox{--experimental-gls}'). BFC and fermi2 were tuned to work with fermikit
for human variant calling before this evaluation, whereas the other tools
were not.}
\end{table}

On human data (Table~1), BFC performs similarly to error correctors based on
greedy algorithms. It tends to correct more errors without introducing new
ones, potentially owing to its non-greedy and quality-aware algorithm.

BFC also works well with assemblers (Table~2). It should be noted that the
short reads were sequenced from the Bristol N2 strain, the same strain as the
{\it C. elegans} reference genome, so we expect to see few alignment break
points and base-level differences. Extended discussion of the results can be
found in the Supplementary Notes.
\section*{Acknowledgement}
\paragraph{Funding\textcolon} NHGRI U54HG003037; NIH GM100233

\bibliography{bfc}
\end{document}