\documentclass{bioinfo2}

\copyrightyear{2015}
\pubyear{2015}

\usepackage{amsmath}
\usepackage{natbib}

\newcommand{\specialcell}[2][c]{%
  \begin{tabular}[#1]{@{}l@{}}#2\end{tabular}}

\usepackage{caption}
\renewcommand{\thetable}{S\arabic{table}}

\bibliographystyle{apalike}

\begin{document}
\firstpage{1}

\title[Error Correction for Illumina Data]{BFC: correcting Illumina sequencing errors (supplementary notes)}
\author[Li]{Heng Li}
\address{Broad Institute, 75 Ames Street, Cambridge, MA 02142, USA}
\maketitle

\section{Supplementary Notes}

\subsection{Other error correction tools}

In addition to the error correctors evaluated in the manuscript, we also
tried AllPaths-LG~\citep{Gnerre:2011ys}, Fiona~\citep{Schulz:2014aa} and
Trowel~\citep{Lim:2014aa}, but they require more than the 128GB of RAM our
machine has and were therefore not evaluated. According to
\citet{Molnar:2014aa}, RACER~\citep{Ilie:2013aa} uses 1.38--2.36 bytes per
input base for high-coverage human data, so it would also use too much memory.
We considered Blue~\citep{Greenfield:2014aa}, but its website was down at the
time of the evaluation. We also tried QuorUM~\citep{Zimin:2013aa}. However, it
always trims reads, which makes it hard to compare with the other tools that
keep full-length reads.

\subsection{Mapping-based metrics}

In Illumina sequencing, a small fraction of reads may harbor many errors.
These reads pose a great challenge to error correctors: they may
significantly increase the number of unmapped reads and inflate the number of
uncorrected base errors. In practice, however, it is usually safe to ignore
such reads, as there are often enough high-quality reads in the same region.
To reduce the effect of these low-quality reads, we measured accuracy based
on read counts rather than base counts.
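As a minimal sketch of such a read-count-based comparison (not necessarily the
exact procedure used in our evaluation), the following Python script pairs the
primary alignments of raw and corrected reads by read name and classifies each
read as `better' or `worse' by comparing the NM (edit distance to the
reference) tags; the file names and the single-end assumption are illustrative.
\begin{verbatim}
#!/usr/bin/env python3
# Hypothetical sketch: compare the NM (edit distance)
# tag of each read's primary alignment before and
# after correction. File names are placeholders and
# single-end reads are assumed for simplicity.

def load_nm(sam_path):
    """Return {read_name: NM} for primary alignments."""
    nm = {}
    with open(sam_path) as f:
        for line in f:
            if line.startswith('@'):   # header line
                continue
            fields = line.rstrip('\n').split('\t')
            name, flag = fields[0], int(fields[1])
            # skip secondary/supplementary/unmapped
            if flag & 0x904:
                continue
            for tag in fields[11:]:
                if tag.startswith('NM:i:'):
                    nm[name] = int(tag[5:])
                    break
    return nm

raw = load_nm('raw.sam')        # uncorrected reads
cor = load_nm('corrected.sam')  # corrected reads
better = worse = 0
for name, nm_raw in raw.items():
    if name not in cor:
        continue
    if cor[name] < nm_raw:
        better += 1
    elif cor[name] > nm_raw:
        worse += 1
print('better:', better, 'worse:', worse)
\end{verbatim}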

For the human data, a concern with mapping-based metrics is that sample
NA12878 differs from the human reference genome, which might lead to reference
bias. We think this concern is minor because the reads were corrected without
any information about the reference. Although for an individual read we cannot
say that the corrected version with fewer mismatches to the reference is
always better, across millions of reads a better corrector should yield fewer
mismatches overall. Mapping-based metrics are therefore valid and appealing
due to their simple ascertainment procedure.

\subsection{Error correction for {\it C. elegans} reads}

We tried both 23-mers and 27-mers for error correction. The N50s of the
assembled scaffolds and contigs differ mostly by a few hundred bp, with
23-mers being slightly better. The rest of the assembly-based metrics were all
calculated from the corrections with 23-mers. Table~S1 evaluates the accuracy
with mapping-based metrics. The relative performance of the tools is similar
to that in Table~1.

\begin{table}[t]
\processtable{Error correction for 67.6M {\it C. elegans} reads}
{\footnotesize
\begin{tabular}{p{2cm}p{1.35cm}p{1.35cm}p{1.35cm}r}
\toprule
Prog.     & Perfect & Chim. & Better & Worse \\
\midrule
raw data  & 47.6M  & 398k  & --     & --   \\
BFC-bf    & 57.9M  & 402k  & 11.3M  & 117k \\
BFC-ht    & {\bf 58.3M}  & 410k  & 12.1M  & {\bf 92k}  \\
BLESS     & 57.3M  & {\bf 393k}  & 11.0M  & 137k \\
Bloocoo   & 55.3M  & 417k  & 10.2M  & 255k \\
Fermi2    & 58.1M  & 431k  & {\bf 13.2M}  & 287k \\
Lighter   & 58.0M  & 400k  & 11.3M  & 171k \\
Musket    & 57.4M  & 448k  & 11.8M  & 246k \\
SGA       & 57.3M  & 418k  & 10.7M  & 201k \\
\botrule
\end{tabular}}{}
\end{table}

\begin{table*}[t]
\processtable{Command lines used for assembly evaluation}
{\footnotesize
\begin{tabular}{lp{13.2cm}}
\toprule
Functionality   & Command line \\
\midrule
BFC-bf-r157 correction & \specialcell[t]{\tt kmc -k23 -t8 SRR065390.fq SRR065390.kmc kmc-tmp\\\tt bfc-kmc -t8 SRR065390.kmc SRR065390.raw.fq} \\
BFC-ht-r175 correction & {\tt bfc -t8 -s 100m -k 23 SRR065390.fq} \\
BLESS-v0p23 correction & {\tt bless -read SRR065390.fq -kmerlength 23 -prefix out -smpthread 8 -notrim} \\
Bloocoo-1.0.4 correction & {\tt Bloocoo -nb-cores 8 -file SRR065390.fq -kmer-size 23 -out out} \\
Fermi2-r175 correction & \specialcell[t]{\tt ropebwt2 -drq20 -x31 SRR065390.raw.fq \char62\,SRR065390.fmd\\\tt fermi2 correct -t8 -k23 SRR065390.fmd SRR065390.fq} \\
Lighter-1.0.4 correction & {\tt lighter -K 23 100000000 -r SRR065390.fq -t 8} \\
Musket-1.1 correction & {\tt musket -k 23 200000000 -p 8 -inorder -o out.fq SRR065390.fq} \\
SGA-0.10.3 correction & \specialcell[t]{\tt sga preprocess -p 2 SRR065390.fq \char62\,reads.fq\\\tt sga index -a ropebwt -t 8 --no-reverse reads.fq\\\tt sga correct -t 8 -k 23 --learn reads.fq} \\
\midrule
Velvet-1.2.10 assembly & \specialcell[t]{\tt velveth k61 61 -shortPaired -fmtAuto SRR065390.fq\\\tt velvetg k61 -exp\_cov auto -scaffolding yes -cov\_cutoff auto -ins\_length 250} \\
ABySS-1.5.2 assembly & {\tt abyss-pe name=k67 k=67 in=SRR065390.fq q=0 s=500 n=5} \\
Fermi2-kit-0.9 assembly & \specialcell[t]{\tt fermi2.pl unitig -Es100m -t16 SRR065390.fq \char62\,SRR065390.mak\\\tt make -f SRR065390.mak} \\
Bwa-0.7.12 contig mapping & {\tt bwa mem -xintractg -t8 ce235.fa contigs.mag.gz \char62\,contigs.sam} \\
Aligned N50 and break points & {\tt htsbox abreak -l200 contigs.sam} \\
FreeBayes-0.9.20 variant calling & {\tt freebayes --experimental-gls -f celegans.fa reads.bam} \\
Variant calling from contigs & {\tt htsbox pileup -cuf celegans.fa contigs.bam \char62\,contigs.vcf} \\
Variant filtering & \specialcell[t]{\tt k8 hapdip.js deovlp contigs.vcf \char124\,k8 hapdip.js anno \char62\,contigs.anno\\\tt k8 hapdip.js filter -DF36 -q3 contigs.anno \char62\,contigs.flt} \\
\botrule
\end{tabular}}{Command lines for correcting human data are similar except for
the $k$-mer length and extra options to enable gzipped input.}
\end{table*}

\subsection{Assembly-related data processing}

\subsubsection{Velvet assembly}
The {\it C. elegans} reads have been assembled by~\citet{Simpson:2012aa} and
independently by~\citet{Song:2014aa}, both using Velvet. The former assembled
the short reads into scaffolds, broke the scaffolds into contigs at assembly
gaps, and derived a contig N50 of 13.6kb. The latter directly assembled the
reads into contigs and reached an N50 of 17.3kb. We tried both settings and
could approximately reproduce the difference. We found that Velvet produced
significantly more misassemblies (477 vs 352) when it was not asked to
generate scaffolds for this data set. We used the setting
of~\citet{Simpson:2012aa} in Table~2.

\subsubsection{ABySS assembly}
We initially used the default setting of ABySS, only specifying the $k$-mer
length. We found the resulting N50 was not as good as the one produced
by~\citet{Simpson:2012aa}, so we adopted their setting. ABySS generates a file
with suffix `\mbox{-contigs.fa}', but that file still contains long gaps. We
finally decided to break `\mbox{-scaffolds.fa}' at gaps no shorter than 5bp.
Changing this threshold leads to slightly different results, but the relative
ranking of the correctors is similar.
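As a concrete illustration of this procedure (a minimal sketch rather than the
exact script we used; the input file name is a placeholder for an ABySS
`\mbox{-scaffolds.fa}' file), the following Python code breaks scaffolds at
runs of at least five `N' bases and reports the contig N50:
\begin{verbatim}
#!/usr/bin/env python3
# Sketch: break scaffolds at gaps (runs of >=5 N's)
# and report the contig N50. The input file name is
# a placeholder for an ABySS '-scaffolds.fa' file.
import re

def read_fasta(path):
    """Yield sequences from a FASTA file."""
    seq = []
    with open(path) as f:
        for line in f:
            if line.startswith('>'):
                if seq:
                    yield ''.join(seq)
                    seq = []
            else:
                seq.append(line.strip())
    if seq:
        yield ''.join(seq)

def n50(lengths):
    """Return the N50 of a list of contig lengths."""
    lengths = sorted(lengths, reverse=True)
    half, acc = sum(lengths) / 2.0, 0
    for l in lengths:
        acc += l
        if acc >= half:
            return l
    return 0

contig_len = []
for scaffold in read_fasta('k67-scaffolds.fa'):
    # split at runs of >=5 ambiguous bases
    for contig in re.split('[Nn]{5,}', scaffold):
        if contig:
            contig_len.append(len(contig))
print('contigs:', len(contig_len),
      'N50:', n50(contig_len))
\end{verbatim}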

\subsubsection{FreeBayes SNP calling}
The latest freebayes-0.9.21 has an issue that causes missing variants in some
regions. A major change in 0.9.21 is that it uses `{\tt --experimental-gls}'
by default. We therefore used version 0.9.20 with this option enabled.

\subsubsection{SNP calling from assembled contigs}
Fermi2 encodes the per-base read depth in the quality string. HTSbox
(http://bit.ly/HTSBox) is able to call variants using this read depth.
HTSbox also works with contigs in the FASTA format without per-base read
depth; in that case it simply lists the differences between the contigs and
the reference, given a mapping quality threshold.

\subsection{Divergence between SRR065390 and the reference}
From the BWA-MEM short-read mapping, FreeBayes called 7,047 SNPs before
filtering. If we exclude SNPs with depth higher than 98 ($=65+4\sqrt{65}$,
where 65 is the average depth), we are left with 4,251 SNPs. If we apply a set
of filters similar to those used in our previous work~\citep{Li:2014aa}, we
get 1,658 SNPs, about half of which are heterozygotes. We manually checked
tens of heterozygous SNPs and believe that most of them are false calls,
potentially caused by false mappings around larger variants. Similarly, we
called 3,793 unfiltered and 1,600 filtered SNPs from fermikit unitigs and
found that most post-filter heterozygotes are questionable. We thus conclude
that the per-base substitution rate between the sequenced sample and the
{\it C. elegans} reference genome should be below one per 100kb.
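As a rough back-of-the-envelope check (taking only the roughly 800 homozygous
post-filter calls as real differences and assuming a genome size of about
100Mb, both of which are our own approximations):
\begin{align*}
\textrm{depth cutoff:}\quad & 65 + 4\sqrt{65} \approx 97.2 \;\rightarrow\; 98\\
\textrm{substitution rate:}\quad & \frac{\sim\!800\ \textrm{substitutions}}{\sim\!100\,\textrm{Mb}} \approx 0.8\ \textrm{per 100kb}
\end{align*}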

We called 8,819 unfiltered SNPs from the Velvet contigs and 13,384 from the
ABySS contigs. Without per-base read depth, it is nontrivial to filter them
further. In addition to SNPs, we also called 655 $>$10bp INDELs from the
Velvet contigs and 1,393 from the ABySS contigs. In contrast, we called only
81 $>$10bp INDELs from the fermi2 unitigs. Upon manual inspection, we suspect
that many of these Velvet/ABySS SNPs and long INDELs are caused by long-range
misassemblies or aggressive short-range bubble popping. Velvet and ABySS
produce long contigs, but the contigs are not accurate enough for small
variant calling. Therefore, we have not evaluated Velvet and ABySS SNP calls
in Table~2 of the main article.

\subsection{Effect of systematic sequencing errors}
BFC-ht and BFC-bf use very similar algorithms, except that when collecting
trusted $k$-mers, BFC-ht marks a $k$-mer if it contains low-quality bases and
then uses this information during error correction. This strategy dramatically
improves SNP accuracy (Table~2) because systematic errors tend to have low
base quality. Fermi2 is the only other corrector that is quality-aware at the
$k$-mer counting phase, and it also produces relatively few false positive
SNPs.
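The Python fragment below sketches what quality-aware $k$-mer collection could
look like; it is not BFC's actual implementation, and the Q20 threshold, the
hash-based counting and the final trusted-$k$-mer rule are illustrative
assumptions of ours. Each $k$-mer is counted, and we additionally record
whether it has ever been observed without low-quality bases:
\begin{verbatim}
#!/usr/bin/env python3
# Illustrative sketch of quality-aware k-mer
# collection; NOT BFC's actual code. A k-mer
# occurrence is high-quality if none of its bases
# is below Q20 (the threshold is our assumption).
K, MIN_Q = 23, 20

def kmer_flags(seq, qual):
    """Yield (kmer, high_quality) for one read."""
    q = [ord(c) - 33 for c in qual]  # Phred+33
    for i in range(len(seq) - K + 1):
        kmer = seq[i:i + K]
        if 'N' in kmer:
            continue
        yield kmer, min(q[i:i + K]) >= MIN_Q

counts, seen_hq = {}, {}
# reads would normally be streamed from FASTQ;
# two toy (sequence, quality) pairs are used here
reads = [("ACGT" * 10, "I" * 40),
         ("ACGT" * 10, "#" * 40)]
for seq, qual in reads:
    for kmer, hq in kmer_flags(seq, qual):
        counts[kmer] = counts.get(kmer, 0) + 1
        seen_hq[kmer] = seen_hq.get(kmer, False) or hq

# One possible use of the flag: treat a k-mer as
# trusted only if it is frequent AND has been seen
# at least once without low-quality bases (our own
# simplification of the idea).
trusted = {k for k, c in counts.items()
           if c >= 4 and seen_hq[k]}
print(len(trusted), "trusted k-mers")
\end{verbatim}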

We think the smaller number of false positive SNPs called from the raw data is
also related to systematic errors. In Illumina sequencing, when there are
systematic errors, the flanking regions tend to be enriched with sequencing
errors as well, which makes it harder for the assembler to assemble through
such regions. Because no unitigs cover these regions, the systematic errors
there are not called as false positives. However, not using error correction
in this case clearly leads to an inferior N50 and probably also to reduced
sensitivity.

It should be noted that the profile of systematic errors is affected by the
sequencing chemistry. This {\it C. elegans} run appears to have a high
systematic error rate at the GGC motif~\citep{Nakamura:2011aa}, but recent
Illumina data seem to have more errors associated with long poly-A runs.

\subsection{Biases in evaluation}
We have been tuning the BFC and fermi2 correctors based on older human
assemblies produced by the fermi2 assembler. Other tools have not received
this treatment, and many of them have not been run on whole-genome human data
before. Our evaluation is therefore biased towards BFC and fermi2.

%\subsection{Concluding remarks}
%BFC is a free, fast and easy-to-use error corrector designed for Illumina short
%reads. It uses a more exhaustive algorithm but still maintains a speed
%comparable to implementations based on greedy methods. In practice, BFC appears
%to correct more errors with fewer overcorrections in comparison to others. It
%particularly does well in suppressing systematic sequencing errors, which helps
%to improve the base accuracy of {\it de novo} assemblies.

\bibliography{bfc}
\end{document}