% bfc-69ab176e/tex/supp.tex

\documentclass{bioinfo2}

\copyrightyear{2015}
\pubyear{2015}

\usepackage{amsmath}
\usepackage{natbib}

\newcommand{\specialcell}[2][c]{%
  \begin{tabular}[#1]{@{}l@{}}#2\end{tabular}}

\usepackage{caption}
\renewcommand{\thetable}{S\arabic{table}}

\bibliographystyle{apalike}

\begin{document}
\firstpage{1}

\title[Error Correction for Illumina Data]{BFC: correcting Illumina sequencing errors (supplementary notes)}
\author[Li]{Heng Li}
\address{Broad Institute, 75 Ames Street, Cambridge, MA 02142, USA}
\maketitle

\section{Supplementary Notes}

\subsection{Other error correction tools}

In addition to the error correctors evaluated in the manuscript, we have also
tried AllPaths-LG~\citep{Gnerre:2011ys}, Fiona~\citep{Schulz:2014aa} and
Trowel~\citep{Lim:2014aa}, but they require more than the 128GB of RAM our
machine has, so they were not evaluated. According to \citet{Molnar:2014aa},
RACER~\citep{Ilie:2013aa} uses 1.38--2.36 bytes per input base for
high-coverage human data. It would also use too much memory. We considered
Blue~\citep{Greenfield:2014aa}, but its website was down at the time of the
evaluation. We have also tried QuorUM~\citep{Zimin:2013aa}.  However, it
always trims reads, making it hard to compare with the other tools, which keep
full-length reads.

\subsection{Mapping-based metrics}

In Illumina sequencing, a small fraction of reads may harbor many errors. They
pose a great challenge to error correctors. They may significantly increase
the number of unmapped reads and inflate the number of uncorrected base errors.
However, in practice, it is usually safe to ignore these reads because there are
usually enough high-quality reads in the same region. To reduce the effect
of these low-quality reads, we measured the accuracy based on read counts
rather than base counts.

For the human data, a concern with mapping-based metrics is that sample NA12878
is different from the human reference genome, which might lead to reference
bias. We think this concern is minor because reads were corrected with no
information about the reference. Although given an individual read we cannot say
the corrected read with fewer mismatches to the reference is always better, given
millions of reads, a better corrector should yield fewer mismatches overall.
Mapping-based metrics are valid and appealing due to the simple ascertainment
procedure.

\subsection{Error correction for {\it C. elegans} reads}

We have tried 23-mers and 27-mers for error correction. The N50s of the assembled
scaffolds and contigs are mostly within a few hundred bp of each other, with
23-mers slightly better. The rest of the assembly-based metrics were all calculated
for the correction with 23-mers. Table~S1 evaluates the accuracy with
mapping-based metrics. The relative performance of tools is similar to that in
Table~1.

\begin{table}[t]
\processtable{Error correction for 67.6M {\it C. elegans} reads}
{\footnotesize
\begin{tabular}{p{2cm}p{1.35cm}p{1.35cm}p{1.35cm}r}
\toprule
Prog.     & Perfect&Chim.& Better & Worse \\
\midrule
raw data  & 47.6M  & 398k  & --     & --   \\
BFC-bf    & 57.9M  & 402k  & 11.3M  & 117k \\
BFC-ht    &{\bf 58.3M}  & 410k  & 12.1M  & {\bf 92k}  \\
BLESS     & 57.3M  & {\bf 393k}  & 11.0M  & 137k \\
Bloocoo   & 55.3M  & 417k  & 10.2M  & 255k \\
Fermi2    & 58.1M  & 431k  & {\bf 13.2M}  & 287k \\
Lighter   & 58.0M  & 400k  & 11.3M  & 171k \\
Musket    & 57.4M  & 448k  & 11.8M  & 246k \\
SGA       & 57.3M  & 418k  & 10.7M  & 201k \\
\botrule
\end{tabular}}{}
\end{table}

\begin{table*}[t]
\processtable{Command lines used for assembly evaluation}
{\footnotesize
\begin{tabular}{lp{13.2cm}}
\toprule
Functionality   & Command line \\
\midrule
BFC-bf-r157 correction & \specialcell[t]{\tt kmc -k23 -t8 SRR065390.fq SRR065390.kmc kmc-tmp\\\tt bfc-kmc -t8 SRR065390.kmc SRR065390.raw.fq} \\
BFC-ht-r175 correction & {\tt bfc -t8 -s 100m -k 23 SRR065390.fq} \\
BLESS-v0p23 correction & {\tt bless -read SRR065390.fq -kmerlength 23 -prefix out -smpthread 8 -notrim} \\
Bloocoo-1.0.4 correction & {\tt Bloocoo -nb-cores 8 -file SRR065390.fq -kmer-size 23 -out out} \\
Fermi2-r175 correction & \specialcell[t]{\tt ropebwt2 -drq20 -x31 SRR065390.raw.fq \char62\,SRR065390.fmd\\\tt fermi2 correct -t8 -k23 SRR065390.fmd SRR065390.fq} \\
Lighter-1.0.4 correction & {\tt lighter -K 23 100000000 -r SRR065390.fq -t 8} \\
Musket-1.1 correction & {\tt musket -k 23 200000000 -p 8 -inorder -o out.fq SRR065390.fq} \\
SGA-0.10.3 correction & \specialcell[t]{\tt sga preprocess -p 2 SRR065390.fq > reads.fq\\\tt sga index -a ropebwt -t 8 --no-reverse reads.fq\\\tt sga correct -t 8 -k 23 --learn reads.fq} \\
\midrule
Velvet-1.2.10 assembly & \specialcell[t]{\tt velveth k61 61 -shortPaired -fmtAuto SRR065390.fq\\\tt velvetg k61 -exp\_cov auto -scaffolding yes -cov\_cutoff auto -ins\_length 250} \\
ABySS-1.5.2 assembly & {\tt abyss-pe name=k67 k=67 in=SRR065390.fq q=0 s=500 n=5} \\
Fermi2-kit-0.9 assembly & \specialcell[t]{\tt fermi2.pl unitig -Es100m -t16 SRR065390.fq \char62\,SRR065390.mak\\\tt make -f SRR065390.mak} \\
Bwa-0.7.12 contig mapping & {\tt bwa mem -xintractg -t8 ce235.fa contigs.mag.gz \char62\,contigs.sam} \\
Aligned N50 and break points & {\tt htsbox abreak -l200 contigs.sam} \\
FreeBayes-0.9.20 variant calling & {\tt freebayes --experimental-gls -f celegans.fa reads.bam} \\
Variant calling from contigs & {\tt htsbox pileup -cuf celegans.fa contigs.bam \char62\,contigs.vcf} \\
Variant filtering & \specialcell[t]{\tt k8 hapdip.js deovlp contigs.vcf \char124\,k8 hapdip.js anno \char62\,contigs.anno\\\tt k8 hapdip.js filter -DF36 -q3 contigs.anno \char62\,contigs.flt} \\
\botrule
\end{tabular}}{Command lines for correcting human data are similar except for
the $k$-mer length and extra options to enable gzip'd input.}
\end{table*}

\subsection{Assembly-related data processing}

\subsubsection{Velvet assembly}
The {\it C. elegans} reads have been assembled by~\citet{Simpson:2012aa} and
independently by~\citet{Song:2014aa}, both using Velvet. The former assembled
the short reads into scaffolds and then broke them into contigs at assembly
gaps and derived a contig N50 of 13.6kb. The latter directly assembled reads into
contigs and reached an N50 of 17.3kb. We have tried both settings and could
approximately reproduce the difference.  We found Velvet produced significantly
more misassemblies (477 vs 352) when it was not asked to generate scaffolds for
this data set. We therefore used the setting of~\citet{Simpson:2012aa} in Table~2.

\subsubsection{ABySS assembly}
We initially used the default setting of ABySS, only specifying the $k$-mer length.
We found the N50 is not as good as the one produced by~\citet{Simpson:2012aa},
so we adopted their setting. ABySS generates a file with suffix
`\mbox{-contigs.fa}', but that file still contains long gaps. We finally
decided to break `-scaffolds.fa' at gaps no shorter than 5bp. Changing this
threshold leads to slightly different results, but the relative ranking of
correctors is similar.

\subsubsection{FreeBayes SNP calling}
The latest freebayes-0.9.21 has an issue that causes missing variants in some
regions. A major change in 0.9.21 is that it uses `{\tt -{}-experimental-gls}' by
default. We are using version 0.9.20 with this option applied.

\subsubsection{SNP calling from assembled contigs}
Fermi2 encodes per-base read depth in the quality string. HTSbox
(http://bit.ly/HTSBox) is able to call variants using the read depth.
HTSbox also works with contigs in the FASTA format without per-base read depth.
It simply lists differences between contigs and the reference given a
mapping quality threshold.

\subsection{Divergence between SRR065390 and the reference}
On BWA-MEM short-read mapping, freebayes called 7,047 SNPs before filtering. If
we exclude SNPs with depth higher than 97 ($\approx 65+4\sqrt{65}$, where 65 is the
average depth), we are left with 4,251 SNPs. If we apply a set of filters
similar to those used in our previous work~\citep{Li:2014aa}, we get 1,658 SNPs, about
half of which are heterozygotes. We manually checked tens of heterozygous SNPs
and believe most of them are false calls potentially due to false mappings
around larger variants. Similarly, we called 3,793 unfiltered and 1,600 filtered
SNPs from fermikit unitigs and found most post-filtered heterozygotes are
questionable. Thus we conclude that the per-base substitution rate between the
sequenced sample and the {\it C. elegans} reference genome should be below one
per 100kb.

We called 8,819 unfiltered SNPs from Velvet contigs and 13,384 from ABySS contigs.
Without per-base read depth, it is nontrivial to filter them further. In addition
to SNPs, we also called 655 $>$10bp INDELs from Velvet contigs and 1,393 from
ABySS contigs. In contrast, we called only 81 $>$10bp INDELs from fermi2 unitigs.
Upon manual inspection, we suspect many of these Velvet/ABySS SNPs and long
INDELs are caused by long-range misassemblies or short-range aggressive bubble
popping.  Velvet and ABySS produce long contigs, but they are not good enough
for small variant calling. Therefore, we have not evaluated Velvet and ABySS
SNP calls in Table~2 of the main article.

\subsection{Effect of systematic sequencing errors}
BFC-ht and BFC-bf use very similar algorithms except that when collecting
trusted $k$-mers, BFC-ht marks a $k$-mer if it contains low-quality bases. BFC-ht
uses this information in error correction. This strategy dramatically improves
SNP accuracy (Table~2) because systematic errors tend to have low base quality.
Fermi2 is the other corrector that is quality-aware at the $k$-mer counting
phase and also produces relatively fewer false positive SNPs.

We think the smaller number of false positive SNPs called from raw data is also
related to systematic errors. In Illumina sequencing, when there are systematic
errors, flanking regions nearby tend to be enriched with sequencing errors as
well, which makes it harder for the assembler to assemble through such regions.
Because there are no unitigs covering these regions, we don't call systematic
errors as false positives. However, not using error correction in this case
clearly leads to inferior N50 and probably also leads to reduced sensitivity.

It should be noted that the profile of systematic errors is affected by
chemistries. This {\it C. elegans} run appears to have high systematic error
rate at the GGC motif~\citep{Nakamura:2011aa}, but recent Illumina data seem
to have more errors associated with long poly-A.

\subsection{Biases in evaluation}
We have been adjusting the BFC and fermi2 correctors based on older human
assemblies produced by the fermi2 assembler. Other tools have not received this
treatment and many of them have not been run on whole-genome human data before.
Our evaluation is biased towards BFC and fermi2.

%\subsection{Concluding remarks}
%BFC is a free, fast and easy-to-use error corrector designed for Illumina short
%reads. It uses a more exhaustive algorithm but still maintains a speed
%comparable to implementations based on greedy methods. In practice, BFC appears
%to correct more errors with fewer overcorrections in comparison to others. It
%particularly does well in suppressing systematic sequencing errors, which helps
%to improve the base accuracy of {\it de novo} assemblies.

\bibliography{bfc}
\end{document}