1\documentclass{article}
2
3\usepackage{program}
4\usepackage{latexsym}
5
6\title{Symmetric Version of DUST Algorithm}
7\author{Aleksandr Morgulis}
8
9\newcommand{\al}[1]{\alpha_{#1}}
10\newcommand{\sq}[2]{\al{#1}\ldots\al{#2}}
11\newcommand{\cala}{{\cal A}}
12\newcommand{\cals}{{\cal S}}
13
14\newcommand{\OF}{\ \keyword{of}\ }
15\newcommand{\RET}{\keyword{return}\ }
16\newcommand{\VARP}{\keyword{var}\ }
17\newcommand{\LIST}{\keyword{list}\ }
18\newcommand{\TUPLE}{\keyword{tuple}}
19\newcommand{\NIL}{\ \keyword{nil}\ }
20\newcommand{\TYPE}{\keyword{type}\ }
21\newcommand{\DODO}{\keyword{do}\tab}
22\newcommand{\DOWHILE}{\untab\keyword{while}\ }
23
24\newtheorem{lemma}{Lemma}
25
26\setlength{\parskip}{1ex}
27\setlength{\parindent}{0mm}
28
29\begin{document}
30
31\maketitle
32\tableofcontents
33
34\section*{NOTE}
35
36The code has been restructured recently. Everything in this text is still
37valid, but the pseudocode does not closely correspond to the C++ code
38anymore. This situation will be fixed in the future.
39
40\section{Definitions and Notation}
41
42Let ${\cal S} = \sq{0}{N-1}$  be a sequence of $N$ letters from $4$-letter
43alphabet $\cala = \{ \mathbf A, \mathbf C, \mathbf G, \mathbf T \}$.
44
45A {\em triplet} is a sequence of length $3$. Any sequence $\cals = \sq{0}{N-1}$
46of length $N > 2$ contains as subsequences exactly $N-2$ triplets, that are
47denoted $t_{0},\ldots,t_{N-3}$, where $t_i = \al{i}\al{i+1}\al{i+2}$.
48
49We define a mapping $d$ from $\cala$ to the subset of integers
50$\{0, 1, 2, 3\}$ in the following way: $d(\mathbf A) = 0$; $d(\mathbf C) = 1$;
51$d(\mathbf G) = 2$; $d(\mathbf T) = 3$.
52
53To any triplet $t = \al{i}\al{i+1}\al{i+2}$ we assign an integer
54{\em triplet value} $v(t) = 16d(\al{i}) + 4d(\al{i+1}) + d(\al{i+2})$. This
55defines a one-to-one mapping from $64$ possible 3-letter sequences to the subset
56$\{0..63\}$ of integers.  In this text the triplets and their values are used
57interchangeably.
58
59We assign a {\em score} $S(\cals)$ to any sequence $\cals$ of length $N > 3$ in
60the following way. Let $k_t :\ 0 \le t \le 63$ be the number of times a triplet
61with value $t$ appears in $\cals$. We define
62$$
63S(\cals) = \frac{\sum_{t=0}^{63}{k_t(k_t - 1)}}{2(l - 1)} \rm{\ ,}
64$$
65where $l = N - 2$ is the number of triplets in $\cals$. In other words, each
66triplet value $t$ contributes $1 + 2 + \cdots + (k_t - 1)$ to the score.
67
68Given an input nucleotide sequence $S$, window length $W$, and score threshold
69$T$, let $|LC|(S)$ denote a set of subsequences of $S$ of length at most $W$ with
70score greater than $T$. Both the original and the new DUST algorithms mask a subset
71of $|LC|(S)$. The bases masked by the new algorithm form a superset of those masked
72by the original one.
73
74The algorithms below are presented in Pascal-like pseudocode. All undeclared
75functions and types are described in section \ref{secpseudo}. Keyword \VARP
76is used to declare local variables in procedure/function. When used in
77the procedure/function parameter list it means that the corresponding parameter
78is passed by reference, rather than by value.
79
80\section{Original Algorithm}
81
82The original DUST algorithm looks at all subsequences, called {\em windows},
83of fixed length $W$ (subsequences of length less than $W$ are considered at
84the beginning and at the end of the input sequence) of the input nucleotide
85sequence.  For each such subsequence $\cals$ it finds the high scoring prefix
86$\cals'$ with the largest score (ties are resolved in favor of the leftmost
87such prefix). If the score of the selected prefix is greater than a given
88threshold value $T$ then the algorithm finds a subsequence $\cals''$ of $\cals'$
89with the maximum score (again the ties are resolved in favor of the leftmost
90subsequence). The algorithm then masks the union of all such $\cals''$ over
91all the windows in the input sequence.
92
93Below is the original DUST algorithm in pseudocode.
94
95\NumberProgramstrue
96\begin{program}
97\BEGIN
98\TYPE |prefix_info| := \TUPLE ( |end| : |int|,\ |score| : |real| );
99\COMMENT{ The first argument to |best\_prefix|() is used for both input }
100\COMMENT{ and output. The caller passes the input sequence, which }
101\COMMENT{ is modified by the function to contain its highest scoring }
102\COMMENT{ prefix. The actual score of the highest scoring prefix is }
103\COMMENT{ the return value of the function. }
104\FUNCT |best_prefix|( |Seq|:|sequence|) : |prefix_info| \BODY
105    \BEGIN
106    \VARP i, t, |i_max|, |sum| : |int|;
107    \VARP |result| : |prefix_info|;
108    \VARP |max| : |real|;
109    \VARP |counts| : \ARRAY [0..63] \OF |int|;
110    \FOR i:=0 \TO 63 \DO |counts|[i] := 0 \OD;
111    |i_max| := 0;
112    |max|   := 0;
113    |sum|   := 0;
114    \FOR i:=0 \TO |length|(|Seq|) - 3 \DO
115        t           := |triplet|(|Seq|, i);
116        |sum|       := |sum| + |counts|[t];
117        |counts|[t] := |counts|[t] + 1;
118        \IF i>0 \AND |sum|/i > |max|
119        \THEN |max|   := |sum|/i;
120              |i_max| := i;
121        \FI
122    \OD;
123    |Seq| := |subsequence|(|Seq|, 0, |i_max|);
124    |result|.|end|   := |i_max|;
125    |result|.|score| := |max|;
126    \RET |result|;
127    \END
128\ENDFUNCT
129\PROC |dust|( |Seq| : |sequence|,\ |Tres| : |real|,\ |Wsize| : |int|,
130             \VARP |Res| : |list_of_integer_pairs| )\BODY
131    \COMMENT{ Results are returned in |Res|. }
132    \BEGIN
133    \VARP i, j, a, b, |start|, |end|, |lim| : |int|;
134    \VARP |score|, |suffix_score| : |real|;
135    \VARP |window|, |suffix| : |sequence|;
136    \VARP |winprefix|, |prefix| : |prefix_info|;
137    \FOR i:=3 \TO |length|(|Seq|) + |Wsize| - 4 \DO
138        \COMMENT{In the first and last $|Wsize| - 4$ iterations}
139        \COMMENT{windows that are shorter than |Wsize| bases}
140        \COMMENT{long are considered. Those are the windows}
141        \COMMENT{at the beginning and the end of the sequence.}
142        a           := |max|(i-|Wsize| + 1,0);
143        b           := |min|(i,|length|(|Seq|)-1);
144        |window|    := |subsequence|(|Seq|,a,b);
145        |winprefix| := |best_prefix|(|window|);
146        \IF |winprefix|.|score|>|Tres|
147        \THEN |start| := 0;
148              |end|   := |winprefix|.|end|;
149              |lim|   := |end|;
150              \FOR j := 0 \TO |lim|-3 \DO
151                |suffix| := |subsequence|(|window|,j,|lim|);
152                |prefix| := |best_prefix|(|suffix|);
153                \IF |prefix|.|score| > |score|
154                \THEN |score| := |prefix|.|score|;
155                      |start| := j;
156                      |end|   := j + |prefix|.|end|;
157                \FI
158              \OD;
159              |append|(|Res|,(a + |start|,a + |end|))
160        \FI
161    \OD
162    \END
163\ENDPROC
164\END
165\end{program}
166
167\section{New Algorithm}
168
169The original DUST algorithm is not symmetric with respect to taking reverse
170complements. The new algorithm is symmetric and in all our tests, it masks
171no more than 1.7\% more bases than the original DUST.
172
173Any interval selected by the original algorithm for masking has
174the following properies:
175
176\begin{enumerate}
177\item its length is less than or equal to the window length $W$;
178\item its score is greater than the score threshold $T$;
179\item scores of all its subintervals are at most as high as its own score.
180\end{enumerate}
181
182From now on, intervals of the input sequence satisfying properties 1--3 above
183are called {\em perfect}.
184
185The new symmetric DUST algorithm finds and masks {\em all} perfect intervals in the input
186sequence. First a simple algorithm to find all perfect intervals is described.
187The next section presents some optimizations that result in significant
188performance improvement.
189
190The new algorithm maintains the following data structures:
191
192\begin{itemize}
193\item a sliding window (subsequence) of length $W$ (or less if at the
194      beginning of the sequence);
195\item a list $P$ of all perfect intervals within the sliding window and
196      their scores, sorted by their left ends.
197\end{itemize}
198
199When the sliding window shifts by one character the following steps are
200performed (note that any new perfect interval must be a suffix of the
201sliding window):
202
203\begin{enumerate}
204\item remove from $P$ all items that are not in the sliding window (all of them
205      are in the beginning of $P$);
206\item for each suffix of the sliding window with the score $S$ higher than $T$,
207      if $S$ is larger than the maximum of the scores of elements in $P$ covered
208      by that suffix, then add the suffix to $P$, since it is necessarily a
209      new perfect interval.
210\end{enumerate}
211
212Below is the new algorithm in pseudocode.
213
214\begin{program}
215\BEGIN
216\TYPE |perfect| := \TUPLE ( |start| : |int|,\ |end| : |int|,\ |score| : |real| );
217\PROC |process_window|(
218    \tab |window|     : |sequence|,
219         |Tres|       : |real|,
220         |wstart|     : |int|,
221         \VARP |Perf| : \LIST \OF |perfect| ) \untab\BODY
222    \BEGIN
223    \VARP |counts|           : \ARRAY [0..63] \OF |int|;
224    \VARP i, t, |len|, |sum| : |int|;
225    \VARP |max_score|        : |real|;
226    \VARP |curr_perfect|     : |list_iter|;
227    \VARP |elem|             : |perfect|;
228    \FOR i:=0 \TO 63 \DO |counts|[i] := 0 \OD;
229    |curr_perfect| := |last_iter|(|Perf|);
230    |max_score|    := 0;
231    |len|          := 0;
232    |sum|          := 0;
233    \FOR i := |length|(|window|) - 3 \TO 0 \STEP -1 \DO
234        t           := |triplet|(|window|, i);
235        |sum|       := |sum| + |counts|[t];
236        |counts|[t] := |counts|[t] + 1;
237        \IF |len| > 0 \AND |sum|/|len| > |Tres|
238        \THEN
239            \WHILE |curr_perfect| \ne \NIL
240                   \AND |list_elem|(|curr_perfect|).|start| \ge i + |wstart| \DO
241                |elem| := |list_elem|(|curr_perfect|);
242                \IF |max_score| < |elem|.|score|
243                \THEN |max_score|       := |elem|.|score|;
244                \FI
245                |current_prefect| := |prev|(|curr_perfect|)
246            \OD
247            \COMMENT{In the following comparison it is important}
248            \COMMENT{to consider the intervals with the score}
249            \COMMENT{equal to |max\_score|. The definition of a}
250            \COMMENT{perfect interval allows for proper}
251            \COMMENT{subintervals of equal score.}
252            \IF |sum|/|len| \ge |max_score|
253            \THEN |max_score|    := |sum|/|len|;
254                  |curr_perfect| := |insert|(
255                    \qquad|curr_perfect|,
256                    \qquad(
257                    \qquad\ \quad i + |wstart|,
258                    \qquad\ \quad|length|(|window|) - 1 + |wstart|,
259                    \qquad\ \quad|max_score|
260                    \qquad))
261            \FI
262        \FI
263        |len| := |len| + 1
264    \OD
265    \END
266\ENDPROC
267\PROC |sdust|(
268    \tab |Seq|       : |sequence|,
269         |Tres|      : |real|,
270         |Wsize|     : |int|,
271         \VARP |Res| : |list_of_integer_pairs| )\untab\BODY
272    \COMMENT{ Results are returned in |Res|. }
273    \COMMENT{ |Perf| is initially empty. }
274    \BEGIN
275    \VARP |Perf|     : \LIST \OF |perfect|;
276    \VARP i, |start| : |int|;
277    \VARP |window|   : |sequence|;
278    \FOR i:=3 \TO |length|(|Seq|) - 1 \DO
279        |start|  := |max|(i-|Wsize|+1,0);
280        |window| := |subsequence|(|Seq|, |start|, i);
281        \WHILE \NOT |empty|(|Perf|) \AND |head|(|Perf|).|start| < |start| \DO
282            |append|(|Res|,|head|(|Perf|).|start|,|head|(|Perf|).|end|);
283            |pop_front|(|Perf|)
284        \OD
285        |process_window|( |window|, |Tres|, |start|, |Perf| )
286    \OD
287    \END
288\ENDPROC
289\END
290\end{program}
291
292\section{Optimization}
293
294For the long-used current threshold of 20, window size of 64, and genomic test
295sets we have considered it is possible to partially or completely eliminate
296score computation for most windows.  To do this we estimate the length of the
297longest suffix of the window that is guaranteed to be free of perfect intervals.
298Then using the length of such suffix we can find a sufficient condition for the
299whole window to be free of perfect intervals.
300
301Let |w| be a window of length $l+2$. For each $i \ge 0$, let $n_i$ be the number
302of triplet values that occur $i$ times in |w|. Let $|maxocc(w)|$ be the largest positive
303integer, such that there is a triplet value that occurs $|maxocc(w)|$ times in |w|.  Then
304the following equalities hold.
305\begin{equation}\label{eql}
306l = \sum_{i=0}^{|maxocc(w)|}{n_ii}
307\end{equation}
308\begin{equation}\label{eqS}
309(l-1)S(|w|) = \sum_{i=0}^{|maxocc(w)|}{n_i\frac{i(i-1)}{2}}
310\end{equation}
311
312We are looking for the largest integer |threshocc| such that if $|maxocc(w)| \le |threshocc|$
313then we can guarantee that
314\begin{equation}\label{eqcond}
315S(|w|) \le T{\rm.}
316\end{equation}
317
318Using (\ref{eql}) and (\ref{eqS}), (\ref{eqcond}) can be rewritten in the following
319form:
320\begin{equation}\label{eqcondone}
321T \le \sum_{i=0}^k{n_i\left(\frac{(2T+1)i - i^2}{2}\right)}{\rm .}
322\end{equation}
323
324\begin{lemma}\label{lemmaone}
325Let |w| be a window and |maxocc(w)| defined as above. If
326
327\begin{equation}\label{eqsuff}
328|maxocc(w)| \le 2T
329\end{equation}
330for some $T \ge 1$ then $S(|w|) \le T$.
331\end{lemma}
332
333\proof Let $M = |maxocc(w)|$. We can assume that $M > 1$, because if every triplet appears at most once
334in |w|, then $S(|w|) = 0$. If $M \le 2T$ then
335$(2T + 1)i - i^2 = i( 2T + 1 - i ) \ge i \ge 0$ for every $i \le M$ . So every
336element of the sum in the right side of (\ref{eqcondone}) is non negative.
337Also $n_M \ge 1$ by definition of M. Therefore the right side of
338(\ref{eqcondone}) is greater or equal to
339${1\over2}n_M\left((2T + 1)M - M^2\right)$, and, to prove the lemma, it is
340sufficient to show that ${1\over2}\left((2T + 1)M - M^2\right) \ge T$ or,
341equivalently, $(2T + 1)M - M^2 - 2T \ge 0$. The left side of the last inequality
342can be written as $(M - 1)(2T - M)$ and, since $M > 1$ and $M \le 2T$, it is
343indeed non negative. $\Box$
344
345The bound in Lemma \ref{lemmaone} is tight. If |w| consists of $2T + 1$
346identical triplets, then $|maxocc(w)| = |threshocc| + 1$ and $S(|w|) = T + 1 > T$.
347This means that $|threshocc| = 2T$.
348
349Previous discussion suggests one optimization of the symmetric DUST algorithm.
350If, for each window |w|, we could maintain its longest suffix $|w|_1$ such that
351(\ref{eqsuff}) holds for $|w|_1$, then we could skip partial score computations
352for that suffix.
353
354For a window |w| containing $l$ triplets, let $L(|w|)$ be the number of triplets
355in the longest suffix of |w| that satisfies~(\ref{eqsuff}). We want to find a
356condition (in terms of $L(|w|)$) such that processing of |w| can be omitted in the
357symmetric DUST procedure. In order to do this it is sufficient to make sure that
358inequality $S(|w|_j) \le T$ holds for every suffix $|w|_j$ of |w| containing $j$
359triplets ($L(|w|) < j \le l$) or, equivalently
360\begin{equation}\label{eqsuffone}
361\sum_{t=0}^{63}{\frac{m_{t,j}(m_{t,j} - 1)}{2}} \le T(j - 1),%
362\quad \forall j : L(|w|) < j \le l,
363\end{equation}
364where $m_{t,j}$ is the number of times the triplet value $t$ appears in
365$|w|_j$. We call the left parts of inequalities (\ref{eqsuffone}) $s_j$.
366
367\begin{lemma}\label{lemmatwo}
368If $s_l \le L(|w|)T$ then $S(|w|) \le T$.
369\end{lemma}
370
371\proof It is enough to show that if $s_l \le L(|w|)T$ then inequalities
372(\ref{eqsuffone}) hold true.
373
374From the definition of $m_{t,j}$ it is clear that
375$m_{t,j} \le m_{t,l}$ for all triplet values $t$ and all $j \le l$. Therefore
376$s_j \le s_l$ for $j \le l$. On the other hand $T(j-1) \ge TL(|w|)$ for
377$L(|w|) < j \le l$. So if $s_l \le L(|w|)T$, then for all $j$ such that
378$L(|w|) < j \le l$ the following is true
379$$
380\sum_{t=0}^{63}{\frac{m_{t,j}(m_{t,j} - 1)}{2}} = s_j \le s_l \le L(|w|)T \le T(j-1)
381$$
382which means that inequalities (\ref{eqsuffone}) hold. $\Box$
383
384\subsection{Maintaining Window Suffix Information}
385
386The following information is maintained by the symmetric DUST algorithm
387implementation in order to implement the optimizations described above:
388
389\begin{itemize}
390\item the window suffix |suff|(|w|) of the current window |w| defined as
391follows: let $k = \lfloor 2T \rfloor$ as before, then |suff|(|w|) is the
392longest suffix such that every triplet appears no more than
393$k$ times in |suff|(|w|).
394\item $s_l$ and $s_L$ for each window |w|, where $l$ is the number of triplets
395in |w|, $L$ is the number of triplets in |suff|(|w|) defined above; these values
396are called {\em outer\_sum} and {\em inner\_sum} in the code;
397\item the counts of each triplet value in |w| and |suff|(|w|); in the code these
398values are called, correspondingly, {\em outer\_counts} and {\em inner\_counts}.
399\end{itemize}
400
401Every time a window slides one letter to the right |push_triplet|() and
402|pop_triplet|() functions are called. These functions keep the the data
403structures consistent when adding or removing a triplet from the window. In the
404following code $|thresocc| = \lfloor 2T \rfloor$, and |sf| is the position of the start
405of the window suffix.
406
407\begin{program}
408\BEGIN
409\TYPE |counts_type| := \ARRAY[0..63] \OF |int|;
410\PROC |push_triplet|(
411    \tab |wstart|, |threshocc|                : |int|,
412         \VARP |sf|, |outer_sum|, |inner_sum| : |int|,
413         \VARP |triplets|                     : |dequeue| \OF |int|,
414         \VARP |outer_counts|, |inner_counts| : |counts_type| ) \untab\BODY
415    \BEGIN
416    \VARP t : |int|;
417    t                 := |last|(|triplets|);
418    |outer_sum|       := |outer_sum| + |outer_counts|[t];
419    |outer_counts|[t] := |outer_counts|[t] + 1;
420    |add_thres_info|(|triplets|,t,|thresocc|,|wstart|,|sf|,|inner_sum|,|inner_counts|)
421    \END
422\ENDPROC
423\PROC |pop_triplet|(
424    \tab\VARP |triplets|                     : |dequeue| \OF |int|,
425        \VARP |outer_sum|, |inner_sum|       : |int|,
426        \VARP |outer_counts|, |inner_counts| : |counts_type| ) \untab\BODY
427    \BEGIN
428    \VARP t : |int|;
429    t := |triplets|[0];
430    \IF |inner_counts|[t] = |outer_counts|[t]
431    \THEN |rem_thres_info|(t, |inner_sum|, |inner_counts| )
432    \FI
433    |outer_counts|[t] := |outer_counts|[t] - 1;
434    |outer_sum|       := |outer_sum| - |outer_counts|[t]
435    \END
436\ENDPROC
437\END
438\end{program}
439
440Procedures |add_thres_info|() and |rem_thres_info|() are responsible for maintaining
441the |inner_sum| and |inner_counts|.
442
443\begin{program}
444\BEGIN
445\TYPE |counts_type| := \ARRAY[0..63] \OF |int|;
446\PROC |add_thres_info|(
447    \tab|triplets|              : |dequeue| \OF |int|,
448        t, |thresocc|, |wstart|          : |int|,
449        \VARP |sf|, |inner_sum| : |int|,
450        \VARP |inner_counts|    : |counts_type|) \untab\BODY
451    \BEGIN
452    \VARP |offset| : |int|;
453    |inner_sum|       := |inner_sum| + |inner_counts|[t];
454    |inner_counts|[t] := |inner_counts|[t] + 1;
455    |offset|          := |sf| - |wstart|;
456    \IF |inner_counts|[t] > |thresocc|
457    \THEN
458        \COMMENT{Updating the current suffix. The following}
459        \COMMENT{loop removes all triplets from the front of the}
460        \COMMENT{suffix up to and including the first triplet with}
461        \COMMENT{value |t|. This makes the internal count of |t| less}
462        \COMMENT{than |threshocc|.}
463        \DODO
464            |rem_thres_info|(t, |inner_sum|, |inner_counts|);
465            |sf|     := |sf| + 1;
466            |offset| := |offset| + 1
467        \DOWHILE |triplets|[|offset| - 1] \not= t
468    \FI
469    \END
470\ENDPROC
471\PROC |rem_thres_info|(
472    \tab t                    : |int|,
473         \VARP |inner_sum|    : |int|,
474         \VARP |inner_counts| : |counts_type| ) \untab\BODY
475    \BEGIN
476    |inner_counts|[t] := |inner_counts|[t] - 1;
477    |inner_sum|       := |inner_sum| - |inner_counts|[t]
478    \END
479\ENDPROC
480\END
481\end{program}
482
483\subsection{Optimized Algorithm}
484
485Below is the pseudocode of the optimized algorithm.
486
487\begin{program}
488\BEGIN
489\TYPE |perfect|     := \TUPLE ( |start| : |int|,\ |end| : |int|,\ |score| : |real| );
490\TYPE |counts_type| := \ARRAY[0..63] \OF |int|;
491\PROC |process_window|(
492\tab \VARP |triplets|                     : |dequeue| \OF |int|,
493     |Tres|                               : |real|,
494     |Wsize|, |wstart|                    : |int|,
495     \VARP |sf|, |outer_sum|, |inner_sum| : |int|,
496     \VARP |outer_counts|, |inner_counts| : |counts_type|,
497     \VARP |Perf|                         : \LIST \OF |perfect| )\untab\BODY
498    \BEGIN
499    \VARP |counts|                         : |counts_type|;
500    \VARP i, |threshocc|, |len|, |sum|, |suffix_len| : |int|;
501    \VARP |curr_perfect|                   : |list_iter|;
502    \VARP |elem|                           : |perfect|;
503    |threshocc| := |floor|(2*|Tres|);
504    \IF |length|(|triplets|) > |Wsize| - 1
505    \THEN |pop_triplet|(
506        \qquad |triplets|,
507        \qquad |outer_sum|, |inner_sum|,
508        \qquad |outer_counts|, |inner_counts| )
509    \FI
510    |push_triplet|(
511        \qquad |wstart|, |threshocc|, |sf|,
512        \qquad |outer_sum|, |inner_sum|,
513        \qquad |triplets|,
514        \qquad |outer_counts|, |inner_counts| );
515    |suffix_len| := |length|(|triplets|) - (|sf|-|wstart|);
516    \IF |outer_sum| > |suffix_len|*|Tres|
517    \THEN
518        \FOR i:=0 \TO 63 \DO |counts|[i] := |inner_counts|[i] \OD;
519        |curr_perfect| := |last_iter|(|Perf|);
520        |max_score|    := 0
521        |len|          := |suffix_len|;
522        |sum|          := |inner_sum|;
523        \FOR i := |length|(|triplets|) - 1 - |suffix_len| \TO 0 \STEP -1 \DO
524            t           := |triplets|[i];
525            |sum|       := |sum| + |counts|[t];
526            |counts|[t] := |counts|[t] + 1;
527            \IF |sum|/|len| > |Tres|
528            \THEN
529                \WHILE |curr_perfect| \ne \NIL
530                       \AND |list_elem|(|curr_perfect|).|start| \ge i + |wstart| \DO
531                    |elem| := |list_elem|(|curr_perfect|);
532                    \IF |max_score| < |elem|.|score|
533                    \THEN |max_score|       := |elem|.|score|;
534                    \FI
535                    |current_prefect| := |prev|(|curr_perfect|)
536                \OD
537                \COMMENT{In the following comparison it is important}
538                \COMMENT{to consider the intervals with the score}
539                \COMMENT{equal to |max\_score|. The definition of a}
540                \COMMENT{perfect interval allows for proper}
541                \COMMENT{subintervals of equal score.}
542                \IF |sum|/|len| \ge |max_score|
543                \THEN |max_score| := |sum|/|len|;
544                      |curr_perfect| := |insert|(
545                          \qquad|curr_perfect|,
546                          \qquad(
547                          \qquad\ \quad i + |wstart|,
548                          \qquad\ \quad|length|(|triplets|) + 1 + |wstart|,
549                          \qquad\ \quad|max_score|
550                          \qquad))
551                \FI
552            \FI
553            |len| := |len| + 1
554        \OD
555    \FI
556    \END
557\ENDPROC
558\PROC |sdust|(
559    \tab |Seq|       : |sequence|,
560         |Tres|      : |real|,
561         |Wsize|     : |int|,
562         \VARP |Res| : |list_of_integer_pairs| )\untab\BODY
563    \COMMENT{ Results are returned in |Res|. }
564    \COMMENT{ |Res| is initially empty. }
565    \BEGIN
566    \VARP i, |sf|, |start|, |outer_sum|, |inner_sum| : |int|;
567    \VARP |outer_counts|, |inner_counts|             : |counts_type|;
568    \VARP |Perf|                                     : \LIST \OF |perfect|;
569    \VARP |triplets|                                 : |dequeue| \OF |int|;
570    |sf|        := 0;
571    |inner_sum| := 0;
572    |outer_sum| := 0;
573    \FOR i := 0 \TO 63 \DO
574        |inner_counts|[i] := 0;
575        |outer_counts|[i] := 0
576    \OD
577    \FOR i:=2 \TO |length|(|Seq|) - 1 \DO
578        |start|        := |max|(i-|Wsize|+1,0);
579        |push_back|( |triplets|, |triplet|(|Seq|, i-2) );
580        \WHILE \NOT |empty|(|Perf|) \AND |head|(|Perf|).|start| < |start| \DO
581            |append|(|Res|,(|head|(|Perf|).|start|,|head|(|Perf|).|end|));
582            |pop_front|(|Perf|)
583        \OD
584        |process_window|( |triplets|, |Tres|, |Wsize|, |start|, |sf|,
585        \qquad |outer_sum|, |inner_sum|, |outer_counts|, |inner_counts|, |Perf| )
586    \OD
587    \END
588\ENDPROC
589\END
590\end{program}
591
592\section{Performance}
593
594All tests were performed on a dual Pentium 4 Xeon 3.2 Ghz
595Linux workstation with 4 Gb of RAM running Linux kernel 2.4.23.
596Applications were compiled with GNU C/C++ compilers
597version 3.4.0 with full optimization (-O3 -fomit-frame-pointer -ffast-math).
598
599The following table shows running times in seconds of the original DUST
600and optimized symmetric DUST when masking genomes of {\it Drosophila melanogaster}
601and {\it Homo sapiens}.
602For each test 3 runs were performed and the average time is reported
603in the table.
604
605\vskip 1cm
606\begin{tabular}{|l|r|r|r|}
607\hline
608 & original & symmetric & ratio of symmetric DUST time\\
609 & DUST\hfill\hfill     & DUST\hfill\hfill      & to original DUST time \hfill\hfill \\
610\hline
611{\it D. melanogaster} & 113.27   & 32.47 & .2870 \\
612\hline
613{\it H. sapiens} chr1  & 231.86 & 65.73 & .2835 \\
614{\it H. sapiens} chr2  & 245.16 & 68.36 & .2788 \\
615{\it H. sapiens} chr3  & 195.81 & 54.58 & .2787 \\
616{\it H. sapiens} chr4  & 210.38 & 53.65 & .2550 \\
617{\it H. sapiens} chr5  & 197.54 & 50.30 & .2546 \\
618{\it H. sapiens} chr6  & 184.34 & 47.90 & .2598 \\
619{\it H. sapiens} chr7  & 162.33 & 45.05 & .2775 \\
620{\it H. sapiens} chr8  & 156.36 & 40.98 & .2621 \\
621{\it H. sapiens} chr9  & 130.81 & 33.84 & .2587 \\
622{\it H. sapiens} chr10 & 144.12 & 38.02 & .2638 \\
623{\it H. sapiens} chr11 & 133.29 & 36.61 & .2747 \\
624{\it H. sapiens} chr12 & 135.87 & 37.60 & .2767 \\
625{\it H. sapiens} chr13 & 101.65 & 27.24 & .2680 \\
626{\it H. sapiens} chr14 & 92.81  & 24.90 & .2683 \\
627{\it H. sapiens} chr15 & 85.92  & 24.01 & .2794 \\
628{\it H. sapiens} chr16 & 104.46 & 24.99 & .2392 \\
629{\it H. sapiens} chr17 & 102.5  & 25.55 & .2493 \\
630{\it H. sapiens} chr18 & 76.56  & 22.70 & .2965 \\
631{\it H. sapiens} chr19 & 68.59  & 19.02 & .2773 \\
632{\it H. sapiens} chr20 & 62.44  & 17.13 & .2743 \\
633{\it H. sapiens} chr21 & 37.12  & 10.09 & .2718 \\
634{\it H. sapiens} chr22 & 38.21  & 10.18 & .2664 \\
635{\it H. sapiens} chrX  & 167.67 & 62.98 & .3756 \\
636{\it H. sapiens} chrY  & 32.91  & 8.33  & .2531 \\
637%d.{\it melanogaster} & 113.27   & 90.17 & 32.47 & 71.3\% \\
638%run 1 & 1m52.839s & 1m29.546s & 0m32.335s & \\
639%run 2 & 1m53.221s & 1m30.962s & 0m32.686s & \\
640%run 3 & 1m53.750s & 1m30.006s & 0m32.380s & \\
641\hline
642\end{tabular}
643
644\vskip 1cm
645The next table shows the number of bases masked by each algorithm
646in genomes of d.{\it melanogaster} and h.{\it sapiens}.
647
648\begin{tabular}{|l|r|r|r|}
649\hline
650 & original DUST & symmetric DUST & percentage of increase \\
651\hline
652{\it D. melanogaster} & 5278484 & 5368015 & 1.70 \% \\
653\hline
654{\it H. sapiens} chr1  & 9827477  & 9962990  & 1.39 \% \\
655{\it H. sapiens} chr2  & 10396020 & 10534797 & 1.33 \% \\
656{\it H. sapiens} chr3  & 8174058  & 8291266  & 1.43 \% \\
657{\it H. sapiens} chr4  & 8501155  & 8612129  & 1.31 \% \\
658{\it H. sapiens} chr5  & 7606117  & 7711431  & 1.38 \% \\
659{\it H. sapiens} chr6  & 7327405  & 7429406  & 1.39 \% \\
660{\it H. sapiens} chr7  & 7134998  & 7231658  & 1.35 \% \\
661{\it H. sapiens} chr8  & 6209036  & 6292534  & 1.34 \% \\
662{\it H. sapiens} chr9  & 5181032  & 5251123  & 1.35 \% \\
663{\it H. sapiens} chr10 & 6008077  & 6086515  & 1.31 \% \\
664{\it H. sapiens} chr11 & 5467439  & 5539418  & 1.32 \% \\
665{\it H. sapiens} chr12 & 5815186  & 5895050  & 1.37 \% \\
666{\it H. sapiens} chr13 & 4319532  & 4377601  & 1.34 \% \\
667{\it H. sapiens} chr14 & 3775868  & 3828803  & 1.40 \% \\
668{\it H. sapiens} chr15 & 3539237  & 3587821  & 1.37 \% \\
669{\it H. sapiens} chr16 & 3899069  & 3946925  & 1.23 \% \\
670{\it H. sapiens} chr17 & 3766104  & 3817441  & 1.36 \% \\
671{\it H. sapiens} chr18 & 3259307  & 3303535  & 1.36 \% \\
672{\it H. sapiens} chr19 & 3326872  & 3368114  & 1.24 \% \\
673{\it H. sapiens} chr20 & 2619954  & 2654471  & 1.32 \% \\
674{\it H. sapiens} chr21 & 1631269  & 1651742  & 1.26 \% \\
675{\it H. sapiens} chr22 & 1630297  & 1651307  & 1.29 \% \\
676{\it H. sapiens} chrX  & 6954440  & 7044967  & 1.30 \% \\
677{\it H. sapiens} chrY  & 1534226  & 1547697  & 0.88 \% \\
678\hline
679\end{tabular}
680
681\section{Type and Function definitions}\label{secpseudo}
682
683\subsection{Types}
684
685|int| is the type used to represent integers
686
687|real| is the type used to represent real numbers
688
689|vector| is a sequence of elements of the same type. It is different from
690\ARRAY in that |vector| can have variable length. |vector| supports random
691access of its elements by index. Indices start at $0$.
692
693|dequeue| is a variable length sequence of elements of the same type supporting
694efficient random access of its elements as well as efficient append/remove to/from
695both ends of the sequence.
696
697|tuple| is a sequence of elements of possibly different types that has a fixed
698length. Elements of a tuple are named. If |tup| is a tuple with an element named
699|start|, then that element can be accessed as |tup|.|start|. In the pseudocode
700an instance of a tuple having, e.g. 3 elements |a|, |b|, and |c| is written
701as (|a|, |b|, |c|).
702
703|list| is a doubly linked list of elements of the same type. The number of
704elements in |list| is not fixed. |list| does not support random access of its
705elements, but supports efficient forward and backwards traversal.
706
707|list_iter| iterator type used to traverse a list.
708
709\TYPE |sequence| : |vector| \OF $\{ \mathbf{A}, \mathbf{C}, \mathbf{G}, \mathbf{T} \}$
710
711\TYPE |integer_pair| : |tuple|( |first| : |int|, |second| : |int| )
712
713\TYPE |list_of_integer_pairs| : |list| \OF |integer_pair|
714
715\subsection{Functions and Procedures}
716
717\PROC |append|( |L| : |list| \OF |elem_type|, |elem| : |elem_type| ) \ENDPROC \\
718Appends a new element to the back of the list.
719
720\FUNCT |floor|( |r| : |real| ) : |int| \ENDFUNCT \\
721Returns $\lfloor r \rfloor$.
722
723\FUNCT |head|( |L| : |list| \OF |elem_type| ) : |elem_type| \ENDFUNCT \\
724Returns the first element of the list.
725
726\PROC |insert|( |iter| : |list_iter|, |elem| : |elem_type| ) \ENDPROC \\
727Inserts a new element into the list in front of the one pointed to by the iterator.
728
729\FUNCT |last_iter|( |L| : |list| ) : |list_iter| \ENDFUNCT \\
730Returns an iterator pointing to the last element of the list.
731
732\FUNCT |length|( |C| : |container_type| ) : |int| \ENDFUNCT\\
733Generic function that returns the number of elements in a container
734(e.g. vector, sequence, etc.).
735
736\FUNCT |list_elem|( |iter| : |list_iter| ) : |elem_type| \ENDFUNCT \\
737Returns the element of the list pointed to by the iterator.
738
739\FUNCT |max|( |a| : |int|, |b| : |int| ) : |int| \ENDFUNCT \\
740Returns tha maximum of two integer values.
741
742\PROC |pop_front|( |C| : |container_type| ) \ENDPROC \\
743Removes the first element from sequential container.
744
745\FUNCT |prev|( |iter| : |list_iter| ) : |list_iter| \ENDFUNCT \\
746Gets the list iterator preceding the given iterator.
747
748\PROC |push_back|( |C| : |container_type|, |elem| : |elem_type| ) \ENDPROC \\
749Appends an element to a sequential container.
750
751\FUNCT |subsequence|( |S| : |sequence|, |start| : |int|, |end| : |int| ) : |sequence| \ENDFUNCT\\
752Returns a subsequence of |S|. |start| is the index of the first element of
753the subsequence. |end| is the index of the last element of the subsequence.
754
755\FUNCT |triplet|( |S| : |sequence|, |i| : |int| ) : |int|\ENDFUNCT\\
756Returns a triplet value of the triplet that starts at index |i| in |S|.
757
758\end{document}
759
760