1 #include <stdio.h>
2 #include <string.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <pthread.h>
6 #include <assert.h>
7 #include "args.h"
8 #include "emp.h"
9 #include "reader.h"
10 #include "async.h"
11 #include "pear.h"
12 
13 /** @file pear-pt.c
14     @brief Main file containing scoring and assembly related functions (pthreads version)
15 */
16 #define         THREAD_MIN_PACKET_SIZE           500
17 #define         THREAD_PACKET_SIZE_DELTA         20
18 
19 #define         PEAR_MATCH_SCORE                 1
20 #define         PEAR_MISMATCH_SCORE              1
21 
22 #define         NUM_OF_OUTFILES                  4
23 
24 #define         PEAR_READ_UNASSEMBLED            3
25 #define         PEAR_READ_DISCARDED              2
26 #define         PEAR_READ_ASSEMBLED              1
27 
28 #define         PEAR_DECODE_OUT_TYPE(x)          (*((x->data) - 1))
29 #define         PEAR_RESET_OUT_TYPE(x)           *((x->data) - 1) = 0
30 #define         PEAR_DECODE_ASM_TYPE(x)          (*((x->qscore) - 1))
31 #define         PEAR_RESET_ASM_TYPE(x)           *((x->qscore) - 1) = 0
32 #define         PEAR_SET_ASM_TYPE(x,y)           *((x->qscore) - 1) = y
33 
34 #define         PEAR_READ_OUT_SINGLE             0
35 #define         PEAR_READ_OUT_BOTH               1
36 #define         PEAR_SET_OUT_TYPE(x,y)           *((x->data) - 1) = y
37 
38 #define         PEAR_MERGE_NO_TRIM              1
39 #define         PEAR_MERGE_TRIM_FORWARD         2
40 #define         PEAR_MERGE_TRIM_BOTH            3
41 #define         PEAR_MERGE_TRIM_REVERSE         4
42 
43 #define         DEGENERATE(x) (((x) ^ 1) && ((x) ^ 2) && ((x) ^ 4) && ((x) ^ 8))
44 
45 char map_nt[256] =
46  {
47    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
48    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
49    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, -1, -1,
50    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15,
51    -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
52    -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
53    -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
54    -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
55    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
56    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
57    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
58    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
59    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
60    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
61    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
62    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
63  };
64 
65 /* reverse mapping of number to nucleotide */
66 char revmap_nt[16] =
67  {
68    '\0', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
69    'T',  'W', 'Y', 'H', 'K', 'D', 'B', 'N'
70  };
71 
72 /* complements of the nucleotides */
73 char cpl_nt[16]   =   {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
74 
75 /* translate from a 4-bit code to a table of 6 entries (AC, AG, AT, CG, CT, GT) */
76 int translate[16] =  {-1, 0, 1, 4, 2, 5, 7, -1, 3, 6,  8, -1,  9, -1, -1, -1};
77 
78 extern void print_number (size_t x);
79 
80 static int trim_cpl (fastqRead * read, struct user_args * sw, double * uncalled);
81 static int trim (fastqRead * read, struct user_args * sw, double * uncalled);
82 static void init_scores (struct emp_freq * ef);
83 static char * makefilename (const char * prefix, const char * suffix);
84 static void mstrcpl (char * s);
85 static void mstrrev (char * s);
86 static INLINE int assembly_FORWARD_LONGER (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int nForward, int nReverse);
87 static void * entry_point_ef (void * data);
88 static void * entry_point (void * data);
89 static void DisplayInstance (struct user_args * sw);
90 static void free_global_thread_memory (void);
91 static void close_output_files (void);
92 static void destroy_thr_global (void);
93 static void fatal(const char * msg);
94 #if 0
95 static int validate_input (int nleft, int nright);
96 #endif
97 static void * emp_entry_point (void * data);
98 static void init_thr_global (void);
99 static INLINE int assembly_ef (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw);
100 static void flip_list (void);
101 static INLINE int assign_reads (memBlockInfo * block, struct thread_local_t * thr_local);
102 static INLINE int assembly_REVERSE_LONGER (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int nForward, int nReverse);
103 static INLINE int assembly_READS_EQUAL (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int n);
104 static void write_data (fastqRead ** fwd, fastqRead ** rev, unsigned int elms, FILE ** fd, struct user_args * sw);
105 static INLINE int assembly (fastqRead * left, fastqRead * right, struct user_args  * sw);
106 static INLINE void scoring_nm (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes);
107 static INLINE void scoring_ef_nm (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes, struct emp_freq * ef);
108 
109 static char * outfile_extensions[NUM_OF_OUTFILES] = { ".assembled.fastq",
110                                                       ".unassembled.forward.fastq",
111                                                       ".unassembled.reverse.fastq",
112                                                       ".discarded.fastq" };
113 
114 static char * sanityCheckMessage[2] = {
115  "\n[!!] Forward reads file contains more lines. Merging is done line-by-line"
116  "on both files and remaining reads in forward file are ignored. You are"
117  "strongly advised to check that corresponding paired-end reads are located at"
118  "the same line numbers in your files.\n\n",
119  "\n[!!] Reverse reads file contains more lines. Merging is done line-by-line"
120  "on both files and remaining reads in reverse file are ignored. You are"
121  "strongly advised to check that corresponding paired-end reads are located at"
122  "the same line numbers in your files.\n\n"
123 };
124 
125 static pthread_mutex_t cs_mutex_wnd  = PTHREAD_MUTEX_INITIALIZER;
126 static pthread_mutex_t cs_mutex_io   = PTHREAD_MUTEX_INITIALIZER;
127 static pthread_cond_t  cs_mutex_cond = PTHREAD_COND_INITIALIZER;
128 
129 /* used for locking the screen when debugging */
130 #ifdef __DEBUG__
131 static pthread_mutex_t cs_mutex_out  = PTHREAD_MUTEX_INITIALIZER;
132 #endif
133 
134 struct thread_global_t thr_global;
135 
136 int inputFileSanity = 0;        /* Check on whether forward/reverse files have the same number of reads */
137 
138 
139 static unsigned long g_count_assembled   = 0;
140 static unsigned long g_count_discarded   = 0;
141 static unsigned long g_count_unassembled = 0;
142 static unsigned long g_count_total       = 0;
143 
144 
145 unsigned long total_progress = 0;
146 unsigned long current_progress = 0;
147 
148 
149 //int stat_test (double, double, int, double);
150 int stat_test2 (double, double, int, double);
151 
152 static double assemble_overlap (fastqRead * left,
153                          fastqRead * right,
154                          int base_left,
155                          int base_right,
156                          int ol_size,
157                          fastqRead * ai,
158                          struct user_args * sw);
159 
160 double      new_eq[4096];
161 double     new_neq[4096];
162 
163 /*
164 double     new_eqA[4096];
165 double     new_eqC[4096];
166 double     new_eqG[4096];
167 double     new_eqT[4096];
168 double  new_diff[6][4096];
169 */
170 
171 double  penaltyEF[10][4096];
172 
173 
174 
fatal(const char * msg)175 static void fatal(const char * msg)
176 {
177   fprintf(stderr, "Error: %s\n", msg);
178   exit(1);
179 }
180 
convert(char * s)181 static void convert(char * s)
182 {
183   char c;
184   char m;
185   char * p = s;
186   int i = 0;
187 
188   while ((c = *p++))
189    {
190      if ((m = map_nt[(int)c]) >= 0)
191         *(s + i++) = m;
192      else
193         fatal("Illegal character in sequence.");
194    }
195 }
196 
revconvert(char * s)197 static void revconvert(char * s)
198 {
199   char c;
200   char * p = s;
201   int i = 0;
202 
203   while ((c = *p++))
204      *(s + i++) = revmap_nt[(int)c];
205 }
206 
remove_base_qscores(char * s,int base)207 static void remove_base_qscores (char * s, int base)
208 {
209   while (*s)
210    {
211      *s = *s - base + 1;   /* qscores must start from 1 to distinguish end-of-string */
212      ++s;
213    }
214 }
215 
add_base_qscores(char * s,int base,int cap)216 static void add_base_qscores (char * s, int base, int cap)
217 {
218   if (cap)
219    {
220      cap = cap + 1;
221      while (*s)
222       {
223         if (*s > cap) *s = cap;
224         *s = *s + base - 1;
225         ++s;
226       }
227    }
228   else
229     while (*s)
230      {
231        *s = *s + base - 1;
232        ++s;
233      }
234 }
235 
236 
237 /** @brief Trimming of forward part of unassembled sequence
238   *
239   * Finds two adjacent quality scores that are smaller than the minimum quality score threshold \a min_quality.
240   * It then \e trims the part starting from the rightmost quality score position.
241   *
242   * @param read          Forward sequence
243   * @param min_quality   Minimum quality score threshold
244   * @param uncalled      Variable to store the ratio of uncalled bases
245   * @return              Returns the length of the trimmed sequence
246 */
trim(fastqRead * read,struct user_args * sw,double * uncalled)247 static int trim (fastqRead * read, struct user_args * sw, double * uncalled)
248 {
249   int
250    i;
251   char
252     * qscore,
253     * data;
254 
255   qscore = read->qscore + 1;
256   data   = read->data   + 1;
257   *uncalled = 0;
258 
259   i = 1;
260   if (!*data)
261    {
262      if (DEGENERATE (*(data - 1))) ++ *uncalled;
263      return (i);
264    }
265 
266   while (*data)
267    {
268      if (DEGENERATE(*(data - 1))) ++ (*uncalled);
269      if (*(data - 1) <= sw->qual_thres  && *data <= sw->qual_thres)
270       {
271         *data   = 0;
272         *qscore = 0;
273         *uncalled = (*uncalled) / (i);
274         return (i);
275       }
276      ++ i;
277      ++ data;
278    }
279   if (DEGENERATE(*(data - 1))) ++ (*uncalled);
280   *uncalled = (*uncalled) / i;
281   return (i);
282 }
283 
284 /** @brief Trimming of reverse part (reversed + complemented) of unassembled sequence
285   *
286   * Finds two adjacent quality scores that are smaller than the minimum quality score threshold \a min_quality.
287   * It then \e trims the part starting from the rightmost quality score position.
288   *
289   * @param read         Forward sequence
290   * @param min_quality  Minimum quality score threshold
291   * @param uncalled     Variable to store the ratio of uncalled bases
292   * @return             Returns the length of the trimmed sequence
293   */
trim_cpl(fastqRead * read,struct user_args * sw,double * uncalled)294 static int trim_cpl (fastqRead * read, struct user_args * sw, double * uncalled)
295 {
296   int
297    i,
298    len;
299   char
300     * qscore,
301     * data;
302 
303   qscore = read->qscore;
304   data   = read->data;
305   *uncalled = 0;
306   len = 0;
307 
308   while (*data)
309    {
310      ++len;
311      ++data;
312    }
313 
314   data = read->data;
315   for (i = len - 1; i > 0; -- i)
316    {
317      if (DEGENERATE (data[i])) ++ *uncalled;
318      if (qscore[i] <= sw->qual_thres && qscore[i - 1] <= sw->qual_thres)
319       {
320         qscore[i - 1] = 0;
321         data[i - 1]   = 0;
322         memmove (data,   data + i,   len - i + 1);
323         memmove (qscore, qscore + i, len - i + 1);
324         *uncalled = (*uncalled) / (len - i);
325         return (len - i);
326       }
327    }
328   if (DEGENERATE (*data)) ++ *uncalled;
329   *uncalled = (*uncalled) / len;
330   return (len);
331 }
332 
333 /** @brief Initialize table of precomputed scores
334 
335     This function computes a table of scores for every possible combination of
336     basepair and quality scores. The element \a sc_eqX[i][j] is the score for a
337     basepair \a X with quality scores \a i and \j in the two sequences.  The
338     element \a sc_neqXY[i][j] is the score of basepairs \a X and \a Y with
339     quality scores \a i and \j, respectively.
340 
341     @param match     The match weight to be used in scoring
342     @param mismatch  The mismatch weight to be used in scoring
343     @param ef        Structure containing empirical frequencies
344 */
init_scores(struct emp_freq * ef)345 static void init_scores (struct emp_freq * ef)
346 {
347   int i, j, k;
348   double
349     ex, ey,
350     pa2,   pc2,   pg2,   pt2,   pagt2, pcgt2, pact2, pacg2, pacg, pact, pagt, pcgt,
351     pac2,  pag2,  pat2,  pcg2,  pct2,  pgt2,
352     p2acg, p2act, p2agt, p2cgt;
353 
354   pa2 = ef->pa * ef->pa;
355   pc2 = ef->pc * ef->pc;
356   pg2 = ef->pg * ef->pg;
357   pt2 = ef->pt * ef->pt;
358 
359   pacg2 = pa2 + pc2 + pg2;  pacg = ef->pa + ef->pc + ef->pg; p2acg = pacg * pacg;
360   pact2 = pa2 + pc2 + pt2;  pact = ef->pa + ef->pc + ef->pt; p2act = pact * pact;
361   pagt2 = pa2 + pg2 + pt2;  pagt = ef->pa + ef->pg + ef->pt; p2agt = pagt * pagt;
362   pcgt2 = pc2 + pg2 + pt2;  pcgt = ef->pc + ef->pg + ef->pt; p2cgt = pcgt * pcgt;
363 
364   pac2 = (ef->pa + ef->pc) * (ef->pa + ef->pc);
365   pag2 = (ef->pa + ef->pg) * (ef->pa + ef->pg);
366   pat2 = (ef->pa + ef->pt) * (ef->pa + ef->pt);
367   pcg2 = (ef->pc + ef->pg) * (ef->pc + ef->pg);
368   pct2 = (ef->pc + ef->pt) * (ef->pc + ef->pt);
369   pgt2 = (ef->pg + ef->pt) * (ef->pg + ef->pt);
370 
371 
372   for (i = 1; i < 64; ++ i)
373    {
374      for (j = 1; j < 64; ++ j)
375       {
376         ex = pow (10.0, - (i - 1) / 10.0);
377         ey = pow (10.0, - (j - 1) / 10.0);
378 
379         k = (i << 6) | j;
380 
381         new_eq[k]  =    PEAR_MATCH_SCORE * ((1 - ex) * (1 - ey) + (ex * ey) / 3.0);
382         new_neq[k] = PEAR_MISMATCH_SCORE * (1 - (1.0 / 3.0) * (1 - ex) * ey - (1.0 / 3.0) * (1 - ey) * ex - (2.0 / 9.0) * ex * ey);
383         //qs_mul[i][j] = ex * ey;
384 
385 /*
386         new_eqA[k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pcgt2 / p2cgt;
387         new_eqC[k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pagt2 / p2agt;
388         new_eqG[k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pact2 / p2act;
389         new_eqT[k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pacg2 / p2acg;
390 */
391         /* compute the penalties for empirical frequencies
392 
393            0 = A
394            1 = C
395            2 = G
396            3 = T
397            4 = AC
398            5 = AG
399            6 = AT
400            7 = CG
401            8 = CT
402            9 = GT
403         */
404         penaltyEF[0][k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pcgt2 / p2cgt;
405         penaltyEF[1][k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pagt2 / p2agt;
406         penaltyEF[2][k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pact2 / p2act;
407         penaltyEF[3][k] = PEAR_MATCH_SCORE * (1 - ex) * (1 - ey) + (ex * ey) * pacg2 / p2acg;
408         //new_neqAC[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pc / pcgt) - (1 - ex) * ey * (ef->pa / pagt) - ex * ey * (pg2 + pt2) / pgt2);
409         penaltyEF[4][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pc / pcgt) - (1 - ex) * ey * (ef->pa / pagt) - ex * ey * (pg2 + pt2) / pgt2); // AC
410         //sc_neqCA[i][j] PEAR_MISMATCH_SCOREch * (1 - (1 - ey) * ex * (ef->pa / pagt) - (1 - ex) * ey * (ef->pc / pcgt) - ex * ey * (pg2 + pt2) / pgt2);
411 
412         //new_neqAG[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pg / pcgt) - (1 - ex) * ey * (ef->pa / pact) - ex * ey * (pc2 + pt2) / pct2);
413         penaltyEF[5][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pg / pcgt) - (1 - ex) * ey * (ef->pa / pact) - ex * ey * (pc2 + pt2) / pct2); // AG
414         //sc_neqGA[i][j] PEAR_MISMATCH_SCOREch * (1 - (1 - ey) * ex * (ef->pa / pact) - (1 - ex) * ey * (ef->pg / pcgt) - ex * ey * (pc2 + pt2) / pct2);
415 
416         //new_neqAT[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pcgt) - (1 - ex) * ey * (ef->pa / pacg) - ex * ey * (pc2 + pg2) / pcg2);
417         penaltyEF[6][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pcgt) - (1 - ex) * ey * (ef->pa / pacg) - ex * ey * (pc2 + pg2) / pcg2); // AT
418         //sc_neqTA[i][j] PEAR_MISMATCH_SCOREch * (1 - (1 - ey) * ex * (ef->pa / pacg) - (1 - ex) * ey * (ef->pt / pcgt) - ex * ey * (pc2 + pg2) / pcg2);
419 
420         //new_neqCG[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pg / pagt) - (1 - ex) * ey * (ef->pc / pact) - ex * ey * (pa2 + pt2) / pat2);
421         penaltyEF[7][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pg / pagt) - (1 - ex) * ey * (ef->pc / pact) - ex * ey * (pa2 + pt2) / pat2); // CG
422         //sc_neqGC[i][j] PEAR_MISMATCH_SCOREch * (1 - (1 - ey) * ex * (ef->pc / pact) - (1 - ex) * ey * (ef->pg / pagt) - ex * ey * (pa2 + pt2) / pat2);
423 
424         //new_neqCT[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pagt) - (1 - ex) * ey * (ef->pc / pacg) - ex * ey * (pa2 + pg2) / pag2);
425         penaltyEF[8][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pagt) - (1 - ex) * ey * (ef->pc / pacg) - ex * ey * (pa2 + pg2) / pag2); // CT
426         //sc_neqTC[i][j] PEAR_MISMATCH_SCOREch * (1 - (1 - ey) * ex * (ef->pc / pacg) - (1 - ex) * ey * (ef->pt / pagt) - ex * ey * (pa2 + pg2) / pag2);
427 
428         //new_neqGT[k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pact) - (1 - ex) * ey * (ef->pg / pacg) - ex * ey * (pa2 + pc2) / pac2);
429         penaltyEF[9][k] = PEAR_MISMATCH_SCORE * (1 - (1 - ey) * ex * (ef->pt / pact) - (1 - ex) * ey * (ef->pg / pacg) - ex * ey * (pa2 + pc2) / pac2); // GT
430         //sc_neqTG[i][j] = mismatch * (1 - (1 - ey) * ex * (ef->pg / pacg) - (1 - ex) * ey * (ef->pt / pact) - ex * ey * (pa2 + pc2) / pac2);
431 
432 
433      }
434    }
435 }
436 
437 #ifdef EXPERIMENTAL
438 INLINE void
scoring_ef(char dleft,char dright,char qleft,char qright,int score_method,double * score,double * oes,int match,int mismatch,struct emp_freq * ef)439 scoring_ef (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes, int match, int mismatch, struct emp_freq * ef)
440 {
441   double tmp;
442 
443   if (DEGENERATE(dleft) || DEGENERATE(dright))       /* one of them is N */
444    {
445      switch (score_method)
446       {
447         case 1:
448           *score += (ef->q * match - (1 - ef->q) * mismatch);
449           *oes    = *score;
450           break;
451         case 2:
452           tmp     = (1 - ef->q) * mismatch;
453           *oes   += (ef->q * match - tmp);
454           *score -= tmp;
455           break;
456         case 3:
457           *score -= mismatch;
458           *oes   += (ef->q * match - (1 - ef->q) * mismatch);
459           break;
460       }
461    }
462   else if (dleft == dright)     /* equal */
463    {
464      switch (score_method)
465       {
466         case 1:
467           switch (dleft)
468            {
469              case 'A':
470                *score += (sc_eqA[(int)qright][(int)qleft] - (1 - sc_eqA[(int)qright][(int)qleft] / match) * mismatch);
471                break;
472              case 'C':
473                *score += (sc_eqC[(int)qright][(int)qleft] - (1 - sc_eqC[(int)qright][(int)qleft] / match) * mismatch);
474                break;
475              case 'G':
476                *score += (sc_eqG[(int)qright][(int)qleft] - (1 - sc_eqG[(int)qright][(int)qleft] / match) * mismatch);
477                break;
478              case 'T':
479                *score += (sc_eqT[(int)qright][(int)qleft] - (1 - sc_eqT[(int)qright][(int)qleft] / match) * mismatch);
480                break;
481            }
482           *oes    = *score;
483           break;
484         case 2:
485           switch (dleft)
486            {
487              case 'A':
488                *score += sc_eqA[(int)qright][(int)qleft];
489                *oes += (sc_eqA[(int)qright][(int)qleft] - (1 - sc_eqA[(int)qright][(int)qleft] / match) * mismatch);
490                break;
491              case 'C':
492                *score += sc_eqC[(int)qright][(int)qleft];
493                *oes   += (sc_eqC[(int)qright][(int)qleft] - (1 - sc_eqC[(int)qright][(int)qleft] / match) * mismatch);
494                break;
495              case 'G':
496                *score += sc_eqG[(int)qright][(int)qleft];
497                *oes   += (sc_eqG[(int)qright][(int)qleft] - (1 - sc_eqG[(int)qright][(int)qleft] / match) * mismatch);
498                break;
499              case 'T':
500                *score += sc_eqT[(int)qright][(int)qleft];
501                *oes   += (sc_eqT[(int)qright][(int)qleft] - (1 - sc_eqT[(int)qright][(int)qleft] / match) * mismatch);
502                break;
503            }
504           break;
505         case 3:
506           switch (dleft)
507            {
508              case 'A':
509                *oes += (sc_eqA[(int)qright][(int)qleft] - (1 - sc_eqA[(int)qright][(int)qleft] / match) * mismatch);
510                break;
511              case 'C':
512                *oes += (sc_eqC[(int)qright][(int)qleft] - (1 - sc_eqC[(int)qright][(int)qleft] / match) * mismatch);
513                break;
514              case 'G':
515                *oes += (sc_eqG[(int)qright][(int)qleft] - (1 - sc_eqG[(int)qright][(int)qleft] / match) * mismatch);
516                break;
517              case 'T':
518                *oes += (sc_eqT[(int)qright][(int)qleft] - (1 - sc_eqT[(int)qright][(int)qleft] / match) * mismatch);
519                break;
520            }
521           *score += match;
522           break;
523       }
524    }
525   else          /* not equal */
526    {
527      switch (score_method)
528       {
529         case 1:
530            switch  (dleft)
531            {
532              case 'A':
533                switch (dright)
534                 {
535                   case 'C':
536                     *score = *score - (sc_neqAC[(int)qleft][(int)qright] - (1 - sc_neqAC[(int)qleft][(int)qright] / mismatch) * match);
537                     break;
538                   case 'G':
539                     *score = *score - (sc_neqAG[(int)qleft][(int)qright] - (1 - sc_neqAG[(int)qleft][(int)qright] / mismatch) * match);
540                     break;
541                   case 'T':
542                     *score = *score - (sc_neqAT[(int)qleft][(int)qright] - (1 - sc_neqAT[(int)qleft][(int)qright] / mismatch) * match);
543                     break;
544                 }
545                break;
546              case 'C':
547                switch (dright)
548                 {
549                   case 'A':
550                     *score = *score - (sc_neqCA[(int)qleft][(int)qright] - (1 - sc_neqCA[(int)qleft][(int)qright] / mismatch) * match);
551                     break;
552                   case 'G':
553                     *score = *score - (sc_neqCG[(int)qleft][(int)qright] - (1 - sc_neqCG[(int)qleft][(int)qright] / mismatch) * match);
554                     break;
555                   case 'T':
556                     *score = *score - (sc_neqCT[(int)qleft][(int)qright] - (1 - sc_neqCT[(int)qleft][(int)qright] / mismatch) * match);
557                     break;
558                 }
559                break;
560              case 'G':
561                switch (dright)
562                 {
563                   case 'A':
564                     *score = *score - (sc_neqGA[(int)qleft][(int)qright] - (1 - sc_neqGA[(int)qleft][(int)qright] / mismatch) * match);
565                     break;
566                   case 'C':
567                     *score = *score - (sc_neqGC[(int)qleft][(int)qright] - (1 - sc_neqGC[(int)qleft][(int)qright] / mismatch) * match);
568                     break;
569                   case 'T':
570                     *score = *score - (sc_neqGT[(int)qleft][(int)qright] - (1 - sc_neqGT[(int)qleft][(int)qright] / mismatch) * match);
571                     break;
572                 }
573                break;
574              case 'T':
575                switch (dright)
576                 {
577                   case 'A':
578                     *score = *score - (sc_neqTA[(int)qleft][(int)qright] - (1 - sc_neqTA[(int)qleft][(int)qright] / mismatch) * match);
579                     break;
580                   case 'C':
581                     *score = *score - (sc_neqTC[(int)qleft][(int)qright] - (1 - sc_neqTC[(int)qleft][(int)qright] / mismatch) * match);
582                     break;
583                   case 'G':
584                     *score = *score - (sc_neqTG[(int)qleft][(int)qright] - (1 - sc_neqTG[(int)qleft][(int)qright] / mismatch) * match);
585                     break;
586                 }
587                break;
588            }
589           *oes = *score;
590           break;
591         case 2:
592           switch  (dleft)
593            {
594              case 'A':
595                switch (dright)
596                 {
597                   case 'C':
598                     *score -= sc_neqAC[(int)qleft][(int)qright];
599                     *oes    = *oes - (sc_neqAC[(int)qleft][(int)qright] - (1 - sc_neqAC[(int)qleft][(int)qright] / mismatch) * match);
600                     break;
601                   case 'G':
602                     *score -= sc_neqAG[(int)qleft][(int)qright];
603                     *oes    = *oes - (sc_neqAG[(int)qleft][(int)qright] - (1 - sc_neqAG[(int)qleft][(int)qright] / mismatch) * match);
604                     break;
605                   case 'T':
606                     *score -= sc_neqAT[(int)qleft][(int)qright];
607                     *oes    = *oes - (sc_neqAT[(int)qleft][(int)qright] - (1 - sc_neqAT[(int)qleft][(int)qright] / mismatch) * match);
608                     break;
609                 }
610                break;
611              case 'C':
612                switch (dright)
613                 {
614                   case 'A':
615                     *score -= sc_neqCA[(int)qleft][(int)qright];
616                     *oes    = *oes - (sc_neqCA[(int)qleft][(int)qright] - (1 - sc_neqCA[(int)qleft][(int)qright] / mismatch) * match);
617                     break;
618                   case 'G':
619                     *score -= sc_neqCG[(int)qleft][(int)qright];
620                     *oes    = *oes - (sc_neqCG[(int)qleft][(int)qright] - (1 - sc_neqCG[(int)qleft][(int)qright] / mismatch) * match);
621                     break;
622                   case 'T':
623                     *score -= sc_neqCT[(int)qleft][(int)qright];
624                     *oes    = *oes - (sc_neqCT[(int)qleft][(int)qright] - (1 - sc_neqCT[(int)qleft][(int)qright] / mismatch) * match);
625                     break;
626                 }
627                break;
628              case 'G':
629                switch (dright)
630                 {
631                   case 'A':
632                     *score -= sc_neqGA[(int)qleft][(int)qright];
633                     *oes    = *oes - (sc_neqGA[(int)qleft][(int)qright] - (1 - sc_neqGA[(int)qleft][(int)qright] / mismatch) * match);
634                     break;
635                   case 'C':
636                     *score -= sc_neqGC[(int)qleft][(int)qright];
637                     *oes    = *oes - (sc_neqGC[(int)qleft][(int)qright] - (1 - sc_neqGC[(int)qleft][(int)qright] / mismatch) * match);
638                     break;
639                   case 'T':
640                     *score -= sc_neqGT[(int)qleft][(int)qright];
641                     *oes    = *oes - (sc_neqGT[(int)qleft][(int)qright] - (1 - sc_neqGT[(int)qleft][(int)qright] / mismatch) * match);
642                     break;
643                 }
644                break;
645              case 'T':
646                switch (dright)
647                 {
648                   case 'A':
649                     *score -= sc_neqTA[(int)qleft][(int)qright];
650                     *oes    = *oes - (sc_neqTA[(int)qleft][(int)qright] - (1 - sc_neqTA[(int)qleft][(int)qright] / mismatch) * match);
651                     break;
652                   case 'C':
653                     *score -= sc_neqTC[(int)qleft][(int)qright];
654                     *oes    = *oes - (sc_neqTC[(int)qleft][(int)qright] - (1 - sc_neqTC[(int)qleft][(int)qright] / mismatch) * match);
655                     break;
656                   case 'G':
657                     *score -= sc_neqTG[(int)qleft][(int)qright];
658                     *oes    = *oes - (sc_neqTG[(int)qleft][(int)qright] - (1 - sc_neqTG[(int)qleft][(int)qright] / mismatch) * match);
659                     break;
660                 }
661                break;
662            }
663           break;
664         case 3:
665           *score -= mismatch;
666            switch  (dleft)
667            {
668              case 'A':
669                switch (dright)
670                 {
671                   case 'C':
672                     *oes = *oes - (sc_neqAC[(int)qleft][(int)qright] - (1 - sc_neqAC[(int)qleft][(int)qright] / mismatch) * match);
673                     break;
674                   case 'G':
675                     *oes = *oes - (sc_neqAG[(int)qleft][(int)qright] - (1 - sc_neqAG[(int)qleft][(int)qright] / mismatch) * match);
676                     break;
677                   case 'T':
678                     *oes = *oes - (sc_neqAT[(int)qleft][(int)qright] - (1 - sc_neqAT[(int)qleft][(int)qright] / mismatch) * match);
679                     break;
680                 }
681                break;
682              case 'C':
683                switch (dright)
684                 {
685                   case 'A':
686                     *oes = *oes - (sc_neqCA[(int)qleft][(int)qright] - (1 - sc_neqCA[(int)qleft][(int)qright] / mismatch) * match);
687                     break;
688                   case 'G':
689                     *oes = *oes - (sc_neqCG[(int)qleft][(int)qright] - (1 - sc_neqCG[(int)qleft][(int)qright] / mismatch) * match);
690                     break;
691                   case 'T':
692                     *oes = *oes - (sc_neqCT[(int)qleft][(int)qright] - (1 - sc_neqCT[(int)qleft][(int)qright] / mismatch) * match);
693                     break;
694                 }
695                break;
696              case 'G':
697                switch (dright)
698                 {
699                   case 'A':
700                     *oes = *oes - (sc_neqGA[(int)qleft][(int)qright] - (1 - sc_neqGA[(int)qleft][(int)qright] / mismatch) * match);
701                     break;
702                   case 'C':
703                     *oes = *oes - (sc_neqGC[(int)qleft][(int)qright] - (1 - sc_neqGC[(int)qleft][(int)qright] / mismatch) * match);
704                     break;
705                   case 'T':
706                     *oes = *oes - (sc_neqGT[(int)qleft][(int)qright] - (1 - sc_neqGT[(int)qleft][(int)qright] / mismatch) * match);
707                     break;
708                 }
709                break;
710              case 'T':
711                switch (dright)
712                 {
713                   case 'A':
714                     *oes = *oes - (sc_neqTA[(int)qleft][(int)qright] - (1 - sc_neqTA[(int)qleft][(int)qright] / mismatch) * match);
715                     break;
716                   case 'C':
717                     *oes = *oes - (sc_neqTC[(int)qleft][(int)qright] - (1 - sc_neqTC[(int)qleft][(int)qright] / mismatch) * match);
718                     break;
719                   case 'G':
720                     *oes = *oes - (sc_neqTG[(int)qleft][(int)qright] - (1 - sc_neqTG[(int)qleft][(int)qright] / mismatch) * match);
721                     break;
722                 }
723                break;
724            }
725           break;
726       }
727    }
728 }
729 #endif
730 
scoring_ef_nm(char dleft,char dright,char qleft,char qright,int score_method,double * score,double * oes,struct emp_freq * ef)731 static INLINE void scoring_ef_nm (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes, struct emp_freq * ef)
732 {
733   double tmp;
734   int i, k;
735 
736   if ( (DEGENERATE (dleft)) || DEGENERATE (dright))       /* one of them is N */
737    {
738      switch (score_method)
739       {
740         case 1:
741           *score += (ef->q - (1 - ef->q));
742           *oes    = *score;
743           break;
744         case 2:
745           tmp     = (1 - ef->q);
746           *oes   += (ef->q - tmp);
747           *score -= tmp;
748           break;
749         case 3:
750           *score -= 1;
751           *oes   += (ef->q - (1 - ef->q));
752           break;
753         default:
754           assert (0);
755       }
756    }
757   else if (dleft == dright)     /* equal */
758    {
759      i = translate[dleft | dright];
760      assert (i != -1);
761      k = ((int)qright << 6) | (int)qleft;
762      switch (score_method)
763       {
764         case 1:
765           *score += (penaltyEF[i][k] - (1 - penaltyEF[i][k]));
766           *oes    = *score;
767           break;
768 
769         case 2:
770           *score += penaltyEF[i][k];
771           *oes   += (penaltyEF[i][k] - (1 - penaltyEF[i][k]));
772           break;
773         case 3:
774           *oes += (penaltyEF[i][k] - (1 - penaltyEF[i][k]));
775           *score += 1;
776           break;
777         default:
778           assert (0);
779       }
780    }
781   else          /* not equal */
782    {
783      if (dleft > dright)
784       {
785        /* swap the values */
786        dleft  = dleft ^ dright; dright = dleft ^ dright; dleft  = dleft ^ dright;
787 
788        /* swap quality scores */
789        qleft  = qleft ^ qright; qright = qleft ^ qright; qleft  = qleft ^ qright;
790 
791       }
792      i = translate[dleft | dright];
793      assert (i != -1);
794      k = ((int)qleft << 6) | (int)qright;
795      switch (score_method)
796       {
797         case 1:
798           *score = *score - (2 * penaltyEF[i][k])  + 1;
799           *oes = *score;
800           break;
801 
802         case 2:
803           *score -= penaltyEF[i][k];
804           *oes = *oes - (2 * penaltyEF[i][k]) + 1;
805           break;
806 
807         case 3:
808           *score -= 1;
809           *oes = *oes - (2 * penaltyEF[i][k]) + 1;
810           break;
811         default:
812           assert (0);
813       }
814    }
815 }
816 
817 #if 0
818 /* TODO: Remember to speed up this function by doing something with the multiplication and division of match/mismatch */
819 INLINE void  scoring (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes, int match, int mismatch)
820 {
821   double tmp;
822 
823   if (dleft > UNCALLED || dright > UNCALLED)       /* one of them is N */
824    {
825      switch (score_method)
826       {
827         case 1:
828           *score += (0.25 * match - (1 - 0.25) * mismatch);
829           *oes    = *score;
830           break;
831         case 2:
832           tmp     = (1 - 0.25) * mismatch;
833           *oes   += (0.25 * match - tmp);
834           *score -= tmp;
835           break;
836         case 3:
837           *score -= mismatch;
838           *oes += (0.25 * match - (1 - 0.25) * mismatch);
839           break;
840       }
841    }
842   else if (dleft == dright)     /* equal */
843    {
844      switch (score_method)
845       {
846         case 1:
847           *score += (sc_eq[(int)qright][(int)qleft] - (1 - sc_eq[(int)qright][(int)qleft] / match) * mismatch);
848           *oes    = *score;
849           break;
850         case 2:
851           tmp     = sc_eq[(int)qright][(int)qleft];
852           *oes   += (tmp - (1 - sc_eq[(int)qright][(int)qleft] / match) * mismatch);
853           *score += tmp;
854           break;
855         case 3:
856           *score += match;
857           *oes   += (sc_eq[(int)qright][(int)qleft] - (1 - sc_eq[(int)qright][(int)qleft] / match) * mismatch);
858           break;
859       }
860    }
861   else          /* not equal */
862    {
863      switch (score_method)
864       {
865         case 1:
866           *score = *score - (sc_neq[(int)qright][(int)qleft] - (1 - sc_neq[(int)qright][(int)qleft] / mismatch) * match);
867           *oes    = *score;
868           break;
869         case 2:
870           tmp     = sc_neq[(int)qright][(int)qleft];
871           *oes    = *oes - (tmp - (1 - sc_neq[(int)qright][(int)qleft] / mismatch) * match);
872           *score -= tmp;
873           break;
874         case 3:
875           *score -= mismatch;
876           *oes    = *score - (sc_neq[(int)qright][(int)qleft] - (1 - sc_neq[(int)qright][(int)qleft] / mismatch) * match);
877           break;
878       }
879    }
880 }
881 #endif
882 
883 /* TODO: Remember to speed up this function by doing something with the multiplication and division of match/mismatch */
scoring_nm(char dleft,char dright,char qleft,char qright,int score_method,double * score,double * oes)884 static INLINE void scoring_nm (char dleft, char dright, char qleft, char qright, int score_method, double * score, double * oes)
885 {
886   double tmp;
887   int k;
888 
889   k = ((int)qright << 6) | (int)qleft;
890   if (DEGENERATE (dleft) || DEGENERATE(dright))       /* one of them is N */
891    {
892      switch (score_method)
893       {
894         case 1:
895           *score += ( - 0.5);
896           *oes    = *score;
897           break;
898         case 2:
899           tmp     = 0.75;
900           *oes   += ( - 0.5);
901           *score -= 0.75;
902           break;
903         case 3:
904           *score -= 1;
905           *oes += ( - 0.5 );
906           break;
907         default:
908           assert (0);
909       }
910    }
911   else if (dleft == dright)     /* equal */
912    {
913      switch (score_method)
914       {
915         case 1:
916           //*score += (sc_eq[(int)qright][(int)qleft] - (1 - sc_eq[(int)qright][(int)qleft]) );
917           *score += (new_eq[k] - (1 - new_eq[k]) );
918           *oes    = *score;
919           break;
920         case 2:
921           //tmp     = sc_eq[(int)qright][(int)qleft];
922           tmp     = new_eq[k];
923           //*oes   += (tmp - (1 - sc_eq[(int)qright][(int)qleft] ));
924           *oes   += (tmp - (1 - new_eq[k] ));
925           *score += tmp;
926           break;
927         case 3:
928           *score += 1;
929           //*oes   += (sc_eq[(int)qright][(int)qleft] - (1 - sc_eq[(int)qright][(int)qleft] ) );
930           *oes   += (new_eq[k] - (1 - new_eq[k] ) );
931           break;
932         default:
933           assert (0);
934       }
935    }
936   else          /* not equal */
937    {
938      switch (score_method)
939       {
940         case 1:
941           //*score = *score - (sc_neq[(int)qright][(int)qleft] - (1 - sc_neq[(int)qright][(int)qleft] ) );
942           *score = *score - (new_neq[k] - (1 - new_neq[k] ) );
943           *oes    = *score;
944           break;
945         case 2:
946           //tmp     = sc_neq[(int)qright][(int)qleft];
947           tmp     = new_neq[k];
948           //*oes    = *oes - (tmp - (1 - sc_neq[(int)qright][(int)qleft] ) );
949           *oes    = *oes - (tmp - (1 - new_neq[k] ) );
950           *score -= tmp;
951           break;
952         case 3:
953           *score -= 1;
954           //*oes    = *score - (sc_neq[(int)qright][(int)qleft] - (1 - sc_neq[(int)qright][(int)qleft] ) );
955           *oes    = *score - (new_neq[k] - (1 - new_neq[k] ) );
956           break;
957         default:
958           assert (0);
959       }
960    }
961 }
962 
assembly_FORWARD_LONGER(fastqRead * forward,fastqRead * reverse,struct emp_freq * ef,struct user_args * sw,int nForward,int nReverse)963 static INLINE int assembly_FORWARD_LONGER (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int nForward, int nReverse)
964 {
965   int
966     i,
967     j;
968   double
969     score = 0,
970     oes = 0,
971     best_score = 0,
972     best_oes = 0,
973     uncalled = 0;
974   int
975     best_overlap = 0,       /* overlap of the best alignment */
976     nMatch,
977     asm_len = 0,
978     st_pass,
979     bestScoreCase = 0;
980 
981      /*   -------------->
982      *                  <------
983      *    ....
984      *    ------------->
985      *           <------
986      */
987   for (i = 1; i <= nReverse; ++ i)
988    {
989      nMatch = score = oes = 0;
990      for (j = 0; j < i; ++ j)
991       {
992         scoring_ef_nm (forward->data[nForward - i + j],
993                        reverse->data[j],
994                        forward->qscore[nForward - i + j],
995                        reverse->qscore[j],
996                        sw->score_method,
997                        &score,
998                        &oes,
999                        ef);
1000 
1001         if (forward->data[nForward - i + j] == reverse->data[j]) ++nMatch;
1002       }
1003 
1004      if (score > best_score)
1005       {
1006         bestScoreCase = PEAR_MERGE_NO_TRIM;
1007         best_overlap  = i;
1008         best_score    = score;
1009         best_oes      = oes;
1010       }
1011    }
1012   /* compute score for runthrough case */
1013   /*   -------------->
1014   *           <------
1015   *    ....
1016   *    ------------->
1017   *    <------
1018   */
1019   for (i = nForward - nReverse  - 1; i >= 0; -- i)
1020    {
1021      score = oes = nMatch = 0;
1022      for (j = 0; j < nReverse; ++ j)
1023       {
1024         scoring_ef_nm (forward->data[i + j],
1025                        reverse->data[j],
1026                        forward->qscore[i + j],
1027                        reverse->qscore[j],
1028                        sw->score_method,
1029                        &score,
1030                        &oes,
1031                        ef);
1032         if (forward->data[i + j] == reverse->data[j]) ++nMatch;
1033       }
1034 
1035      if (score > best_score)
1036       {
1037         bestScoreCase = PEAR_MERGE_TRIM_FORWARD;
1038         best_overlap  = i;
1039         best_score    = score;
1040         best_oes      = oes;
1041       }
1042    }
1043   /* compute score for runthrough case */
1044   /*          -------------->
1045   *          <------
1046   *    ....
1047   *          ------------->
1048   *    <------
1049   */
1050   for (i = nReverse - 1; i > 0; -- i)
1051    {
1052      score = oes = nMatch = 0;
1053      for (j = 0; j < i; ++ j)
1054       {
1055         scoring_ef_nm (forward->data[j],
1056                        reverse->data[nReverse - i  + j],
1057                        forward->qscore[j],
1058                        reverse->qscore[nReverse - i + j],
1059                        sw->score_method,
1060                        &score,
1061                        &oes,
1062                        ef);
1063         if (forward->data[j] == reverse->data[nReverse - i + j]) ++nMatch;
1064       }
1065      if (score > best_score)
1066       {
1067         bestScoreCase = PEAR_MERGE_TRIM_BOTH;
1068         best_overlap  = i;
1069         best_score    = score;
1070         best_oes      = oes;
1071       }
1072    }
1073 
1074   if (!bestScoreCase) return (0);
1075 
1076   if (sw->test == 1)
1077     st_pass = stat_test2 (sw->p_value, best_oes, sw->min_overlap, ef->q);
1078   else
1079     st_pass = stat_test2 (sw->p_value, best_oes, best_overlap, ef->q);
1080 
1081   if (!st_pass) return (0);
1082 
1083   /* TODO PART */
1084 
1085   switch (bestScoreCase)
1086    {
1087      case PEAR_MERGE_NO_TRIM:
1088        if (best_overlap == 0)
1089         {
1090           assert(0);
1091           asm_len = nForward + nReverse;
1092 
1093           for (j = 0; j < nForward; ++ j)
1094             if ( DEGENERATE(forward->data[j]))  ++uncalled;
1095           for (j = 0; j < nReverse; ++ j)
1096             if (DEGENERATE(reverse->data[j]))  ++uncalled;
1097           uncalled /= (nForward + nReverse);
1098 
1099           if (0 >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1100            {
1101              PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);
1102            }
1103           else
1104            {
1105              return (0);
1106            }
1107         }
1108        else
1109         {
1110           asm_len = nForward + nReverse - best_overlap;
1111 
1112           /* count uncalled bases in the non-overlapping high-quality part of the forward read */
1113           for (j = 0; j < nForward - best_overlap; ++ j)
1114             if (DEGENERATE(forward->data[j]))  ++uncalled;
1115 
1116           /* count uncalled bases in the non-overlapping high-quality part of the reverse read */
1117           for (j = best_overlap; j < nReverse; ++ j)
1118             if (DEGENERATE(reverse->data[j]))  ++uncalled;
1119 
1120           /* count the uncalled bases in the overlapping part of the two reads */
1121           for (j = nForward - best_overlap; j < nForward; ++ j)
1122             if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[j - nForward + best_overlap])))  ++uncalled;
1123           uncalled /= asm_len;
1124 
1125           if (nForward + nReverse - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1126            {
1127              if (asm_len > nForward)
1128                PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);   /* The merged read will require both memory buffers (forward + reverse) */
1129              else
1130                PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE); /* The merged read will fit inside the forward read mem buffer */
1131 
1132 
1133              assemble_overlap (forward, reverse, nForward - best_overlap, 0, best_overlap, forward, sw);
1134              memmove (reverse->data,   reverse->data   + best_overlap,  nReverse - best_overlap);
1135              memmove (reverse->qscore, reverse->qscore + best_overlap,  nReverse - best_overlap);
1136 
1137              reverse->data[nReverse   - best_overlap] = 0;
1138              reverse->qscore[nReverse - best_overlap] = 0;
1139            }
1140           else
1141            {
1142              return (0);
1143            }
1144         }
1145        break;
1146      case PEAR_MERGE_TRIM_FORWARD:
1147        /* note that here best_overlap refers to the position of the first byte of the overlapping region
1148           in the forward read */
1149        asm_len = best_overlap + nReverse;
1150 
1151        /* count uncalled bases in the non-overlapping high-quality part of the forward read */
1152        for (j = 0; j < best_overlap; ++ j)
1153          if (DEGENERATE(forward->data[j]))  ++uncalled;
1154 
1155        /* count the uncalled bases in the overlapping part of the two reads */
1156        for (j = best_overlap; j < best_overlap + nReverse; ++ j)
1157          if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[j - nReverse - best_overlap])))  ++uncalled;
1158        uncalled /= asm_len;
1159 
1160        if (nReverse >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1161         {
1162           PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE); /* The merged read will fit inside the forward read mem buffer */
1163 
1164           assemble_overlap (forward, reverse, best_overlap, 0, nReverse, forward, sw);
1165 
1166           forward->data[best_overlap + nReverse]   = 0;
1167           forward->qscore[best_overlap + nReverse] = 0;
1168         }
1169        else
1170         {
1171           return (0);
1172         }
1173        break;
1174      case PEAR_MERGE_TRIM_BOTH:
1175        asm_len = best_overlap;
1176 
1177        /* compute uncalled */
1178        for (j = 0; j < asm_len; ++ j)
1179          if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[nReverse - best_overlap + j]))) ++uncalled;
1180        uncalled /= asm_len;
1181 
1182        if (asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1183         {
1184           assemble_overlap (forward, reverse, 0, nReverse - best_overlap, best_overlap, forward, sw);
1185 
1186           forward->data[best_overlap]   = 0;
1187           forward->qscore[best_overlap] = 0;
1188           PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE);   /* flag that it's one piece */
1189         }
1190        else
1191         {
1192           return (0);
1193         }
1194        break;
1195      default:
1196        assert (0);
1197 
1198    }
1199 
1200   return (1);
1201 
1202 }
1203 
assembly_READS_EQUAL(fastqRead * forward,fastqRead * reverse,struct emp_freq * ef,struct user_args * sw,int n)1204 static INLINE int assembly_READS_EQUAL (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int n)
1205 {
1206   int
1207     i,
1208     j;
1209   double
1210     score = 0,
1211     oes = 0,
1212     best_score = 0,
1213     best_oes = 0,
1214     uncalled = 0;
1215   int
1216     best_overlap = 0,       /* overlap of the best alignment */
1217     nMatch,
1218     asm_len = 0,
1219     st_pass,
1220     run_through = 0;
1221 
1222   /* compute score for every overlap */
1223   for (i = 0; i <= n; ++ i)    /* the size of the overlap */
1224    {
1225      nMatch = score = oes = 0;
1226      for (j = 0; j < i; ++ j)
1227       {
1228         scoring_ef_nm (forward->data[n - i + j],
1229                        reverse->data[j],
1230                        forward->qscore[n - i + j],
1231                        reverse->qscore[j],
1232                        sw->score_method,
1233                        &score,
1234                        &oes,
1235                        ef);
1236         if (forward->data[n - i + j] == reverse->data[j]) ++nMatch;
1237       }
1238      if (score > best_score)
1239       {
1240         best_overlap = i;
1241         best_score   = score;
1242         best_oes     = oes;
1243       }
1244    }
1245 
1246   /* compute for score for runthrough case */
1247   for (i = n - 1; i > 0; --i)
1248    {
1249      score = oes = nMatch = 0;
1250      for (j = 0; j < i; ++j)
1251       {
1252         scoring_ef_nm (forward->data[j],
1253                        reverse->data[n - i + j],
1254                        forward->qscore[j],
1255                        reverse->qscore[n - i + j],
1256                        sw->score_method,
1257                        &score,
1258                        &oes,
1259                        ef);
1260         if (forward->data[n - i + j] == reverse->data[j]) ++nMatch;
1261       }
1262 
1263      if (score > best_score)
1264       {
1265         run_through  = 1;
1266         best_overlap = i;
1267         best_score   = score;
1268         best_oes     = oes;
1269       }
1270    }
1271 
1272   /** NOW LET'S MERGE */
1273   if (sw->test == 1)
1274    {
1275      st_pass = stat_test2 (sw->p_value, best_oes, sw->min_overlap, ef->q);
1276    }
1277   else
1278    {
1279      st_pass = stat_test2 (sw->p_value, best_oes, best_overlap, ef->q);
1280    }
1281 
1282   if (!st_pass) return (0);
1283 
1284 
1285   /* do the assembly!!!! */
1286   if (!run_through)
1287    {
1288      if (best_overlap == 0)
1289       {
1290         asm_len = 2 * n;
1291 
1292         for (j = 0; j < n; ++ j)
1293           if (DEGENERATE(forward->data[j]))  ++uncalled;
1294         for (j = 0; j < n; ++ j)
1295           if (DEGENERATE(reverse->data[j]))  ++uncalled;
1296         uncalled /= asm_len;
1297 
1298         if (2 * n - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1299          {
1300            PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);
1301          }
1302         else
1303          {
1304            return (0);
1305          }
1306       }
1307      else if (best_overlap == n )
1308       {
1309         asm_len         = n;
1310 
1311         for (j = 0; j < asm_len; ++ j)
1312           if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[j]))) ++uncalled;
1313         uncalled /= asm_len;
1314 
1315         if (2 * n - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1316          {
1317            PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE);
1318            assemble_overlap (forward, reverse, 0, 0, n, forward, sw);
1319            forward->data[n]   = 0;
1320            forward->qscore[n] = 0;
1321          }
1322         else
1323          {
1324            return (0);
1325          }
1326       }
1327      else
1328       {
1329         asm_len = 2 * n - best_overlap;
1330         for (j = 0; j < n - best_overlap; ++ j)
1331           if (DEGENERATE(forward->data[j]))  ++uncalled;
1332         for (j = n - best_overlap; j < n; ++ j)
1333           if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[j - n + best_overlap])))  ++uncalled;
1334         for (j = best_overlap; j < n; ++ j)
1335           if (DEGENERATE(reverse->data[j]))  ++uncalled;
1336         uncalled /= asm_len;
1337 
1338         if (2 * n - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1339          {
1340            PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);
1341 
1342            assemble_overlap (forward, reverse, n - best_overlap, 0, best_overlap, forward, sw);
1343            memmove (reverse->data,   reverse->data   + best_overlap,  n - best_overlap);
1344            memmove (reverse->qscore, reverse->qscore + best_overlap,  n - best_overlap);
1345            /* THIS IS WRONG */
1346            //memcpy (reverse->data,   reverse->data   + best_overlap,  n - best_overlap);
1347            //memcpy (reverse->qscore, reverse->qscore + best_overlap,  n - best_overlap);
1348 
1349            reverse->data[n   - best_overlap] = 0;
1350            reverse->qscore[n - best_overlap] = 0;
1351          }
1352         else
1353          {
1354            return (0);
1355          }
1356       }
1357    }     /* run-through case */
1358   else
1359    {
1360      asm_len = best_overlap;
1361 
1362      /* compute uncalled */
1363      for (j = 0; j < asm_len; ++ j)
1364        if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[n - best_overlap + j]))) ++uncalled;
1365      uncalled /= asm_len;
1366 
1367      if (asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1368       {
1369         assemble_overlap (forward, reverse, 0, n - best_overlap, best_overlap, forward, sw);
1370 
1371         forward->data[best_overlap]   = 0;
1372         forward->qscore[best_overlap] = 0;
1373         PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE);   /* flag that it's one piece */
1374       }
1375      else
1376       {
1377         return (0);
1378       }
1379    }
1380 
1381   /* TODO: Remember to reset *(let->data - 1) */
1382 
1383   /* TODO: Optimize this in assemble_overlap? */
1384 
1385   return (1);
1386 
1387 }
1388 
assembly_REVERSE_LONGER(fastqRead * forward,fastqRead * reverse,struct emp_freq * ef,struct user_args * sw,int nForward,int nReverse)1389 static INLINE int assembly_REVERSE_LONGER (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw, int nForward, int nReverse)
1390 {
1391   int
1392     i,
1393     j;
1394   double
1395     score = 0,
1396     oes = 0,
1397     best_score = 0,
1398     best_oes = 0,
1399     uncalled = 0;
1400   int
1401     best_overlap = 0,       /* overlap of the best alignment */
1402     nMatch,
1403     asm_len = 0,
1404     st_pass,
1405     bestScoreCase = 0;
1406 
1407   /*   -------->
1408   *            <------------
1409   *    ....
1410   *    ------->
1411   *    <------------
1412   */
1413   for (i = 0; i <= nForward; ++ i)
1414    {
1415      nMatch = score = oes = 0;
1416      for (j = 0; j < i; ++ j)
1417       {
1418         scoring_ef_nm (forward->data[nForward - i + j],
1419                        reverse->data[j],
1420                        forward->qscore[nForward - i + j],
1421                        reverse->qscore[j],
1422                        sw->score_method,
1423                        &score,
1424                        &oes,
1425                        ef);
1426         if (forward->data[nForward - i + j] == reverse->data[j]) ++nMatch;
1427       }
1428      if (score > best_score)
1429       {
1430         bestScoreCase = PEAR_MERGE_NO_TRIM;
1431         best_overlap  = i;
1432         best_score    = score;
1433         best_oes      = oes;
1434       }
1435    }
1436   /*           -------->
1437   *           <------------
1438   *    ....
1439   *         ------->
1440   *    <------------
1441   */
1442   for (i = nReverse - nForward  - 1; i >= 0; -- i)
1443    {
1444      score = oes = nMatch = 0;
1445      for (j = 0; j < nForward; ++ j)
1446       {
1447         scoring_ef_nm (forward->data[j],
1448                        reverse->data[nReverse - nForward - i + j],
1449                        forward->qscore[j],
1450                        reverse->qscore[nReverse - nForward - i + j],
1451                        sw->score_method,
1452                        &score,
1453                        &oes,
1454                        ef);
1455         if (forward->data[j] == reverse->data[nReverse - nForward - i + j]) ++nMatch;
1456       }
1457      if (score > best_score)
1458       {
1459         bestScoreCase = PEAR_MERGE_TRIM_REVERSE;
1460         best_overlap  = i;  /* now this is the length of the remaining part of the reverse read on the right of the overlapping region  */
1461         best_score    = score;
1462         best_oes      = oes;
1463       }
1464    }
1465   /*               -------->
1466   *           <------------
1467   *    ....
1468   *                ------->
1469   *    <------------
1470   */
1471   for (i = nForward - 1; i > 0; -- i)
1472    {
1473      score = oes = nMatch = 0;
1474      for (j = 0; j < i; ++ j)
1475       {
1476         scoring_ef_nm (forward->data[j],
1477                        reverse->data[nReverse - i  + j],
1478                        forward->qscore[j],
1479                        reverse->qscore[nReverse - i + j],
1480                        sw->score_method,
1481                        &score,
1482                        &oes,
1483                        ef);
1484         if (forward->data[j] == reverse->data[nReverse - i + j]) ++nMatch;
1485       }
1486      if (score > best_score)
1487       {
1488         bestScoreCase = PEAR_MERGE_TRIM_BOTH;
1489         best_overlap  = i;
1490         best_score    = score;
1491         best_oes      = oes;
1492       }
1493    }
1494 
1495   if (!bestScoreCase) return (0);
1496 
1497   /* do a statistical test */
1498   if (sw->test == 1)
1499     st_pass = stat_test2 (sw->p_value, best_oes, sw->min_overlap, ef->q);
1500   else
1501     st_pass = stat_test2 (sw->p_value, best_oes, best_overlap, ef->q);
1502 
1503   if (!st_pass) return (0);
1504 
1505   switch (bestScoreCase)
1506    {
1507      case PEAR_MERGE_NO_TRIM:
1508        if (best_overlap == 0)
1509         {
1510           assert(0);
1511           asm_len = nForward + nReverse;
1512 
1513           for (j = 0; j < nForward; ++ j)
1514             if (DEGENERATE(forward->data[j]))  ++uncalled;
1515           for (j = 0; j < nReverse; ++ j)
1516             if (DEGENERATE(reverse->data[j]))  ++uncalled;
1517           uncalled /= (nForward + nReverse);
1518 
1519           if (0 >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1520            {
1521              PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);
1522            }
1523           else
1524            {
1525              return (0);
1526            }
1527         }
1528        else
1529         {
1530           asm_len = nForward + nReverse - best_overlap;
1531 
1532           /* count uncalled bases in the non-overlapping high-quality part of the forward read */
1533           for (j = 0; j < nForward - best_overlap; ++ j)
1534             if (DEGENERATE(forward->data[j]))  ++uncalled;
1535 
1536           /* count uncalled bases in the non-overlapping high-quality part of the reverse read */
1537           for (j = best_overlap; j < nReverse; ++ j)
1538             if (DEGENERATE(reverse->data[j]))  ++uncalled;
1539 
1540           /* count the uncalled bases in the overlapping part of the two reads */
1541           for (j = nForward - best_overlap; j < nForward; ++ j)
1542             if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[j - nForward + best_overlap])))  ++uncalled;
1543           uncalled /= asm_len;
1544 
1545           if (nForward + nReverse - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1546            {
1547              if (asm_len > nForward)
1548                PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);   /* The merged read will require both memory buffers (forward + reverse) */
1549              else
1550                PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE); /* The merged read will fit inside the forward read mem buffer */
1551 
1552 
1553              assemble_overlap (forward, reverse, nForward - best_overlap, 0, best_overlap, forward, sw);
1554              memmove (reverse->data,   reverse->data   + best_overlap,  nReverse - best_overlap);
1555              memmove (reverse->qscore, reverse->qscore + best_overlap,  nReverse - best_overlap);
1556 
1557              reverse->data[nReverse   - best_overlap] = 0;
1558              reverse->qscore[nReverse - best_overlap] = 0;
1559            }
1560           else
1561            {
1562              return (0);
1563            }
1564         }
1565        break;
1566      case PEAR_MERGE_TRIM_REVERSE:
1567        /* note that here best_overlap refers to the length of the remaining part right of the overlapping regionn
1568           in the reverse read */
1569        asm_len = best_overlap + nForward;
1570 
1571        /* count uncalled bases in the non-overlapping high-quality part of the reverse read */
1572        for (j = nReverse - best_overlap; j < nReverse; ++ j)
1573          if (DEGENERATE(reverse->data[j]))  ++uncalled;
1574 
1575        /* count the uncalled bases in the overlapping part of the two reads */
1576        for (j = 0; j < nForward; ++ j)
1577          if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[nReverse - best_overlap - nForward + j])))  ++uncalled;
1578        uncalled /= asm_len;
1579 
1580        if (nForward >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1581         {
1582           PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_BOTH);
1583 
1584           assemble_overlap (forward, reverse, 0, nReverse - nForward - best_overlap, nForward, forward, sw);
1585 
1586           memmove (reverse->data,   reverse->data + nReverse - best_overlap,   best_overlap);
1587           memmove (reverse->qscore, reverse->qscore + nReverse - best_overlap, best_overlap);
1588 
1589           reverse->data[best_overlap]   = 0;
1590           reverse->qscore[best_overlap] = 0;
1591         }
1592        else
1593         {
1594           return (0);
1595         }
1596        break;
1597      case PEAR_MERGE_TRIM_BOTH:
1598        asm_len = best_overlap;
1599 
1600        /* compute uncalled */
1601        for (j = 0; j < asm_len; ++ j)
1602          if ((DEGENERATE(forward->data[j])) && (DEGENERATE(reverse->data[nReverse - best_overlap + j]))) ++uncalled;
1603        uncalled /= asm_len;
1604 
1605        if (asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1606         {
1607           assemble_overlap (forward, reverse, 0, nReverse - best_overlap, best_overlap, forward, sw);
1608 
1609           forward->data[best_overlap]   = 0;
1610           forward->qscore[best_overlap] = 0;
1611           PEAR_SET_OUT_TYPE(forward,PEAR_READ_OUT_SINGLE);   /* flag that it's one piece */
1612         }
1613        else
1614         {
1615           return (0);
1616         }
1617        break;
1618      default:
1619        assert (0);
1620    }
1621 
1622   return (1);
1623 
1624 }
1625 
1626 
assembly_ef(fastqRead * forward,fastqRead * reverse,struct emp_freq * ef,struct user_args * sw)1627 static INLINE int assembly_ef (fastqRead * forward, fastqRead * reverse, struct emp_freq * ef, struct user_args  * sw)
1628 {
1629   int
1630     nForward,
1631     nReverse,
1632     rc;
1633 
1634   nForward = strlen (forward->data);
1635   nReverse = strlen (reverse->data);
1636 
1637   /************************* |FORWARD| > |REVERSE| ********************************/
1638   if (nForward > nReverse)
1639    {
1640      rc = assembly_FORWARD_LONGER (forward, reverse, ef, sw, nForward, nReverse);
1641    }
1642 
1643   /************************* |FORWARD| == |REVERSE| ********************************/
1644   else if (nForward == nReverse)
1645    {
1646      rc = assembly_READS_EQUAL (forward, reverse, ef, sw, nForward);
1647    }
1648   /************************* |FORWARD| < |REVERSE| ********************************/
1649   else
1650    {
1651      rc = assembly_REVERSE_LONGER (forward, reverse, ef, sw, nForward, nReverse);
1652    }
1653 
1654   return (rc);
1655 }
1656 
assembly(fastqRead * left,fastqRead * right,struct user_args * sw)1657 static INLINE int assembly (fastqRead * left, fastqRead * right, struct user_args  * sw)
1658 {
1659   int                   i,j;
1660   int                   n;
1661   double                score;
1662   double                oes;
1663   double                best_score = 0;
1664   double                best_oes   = 0;
1665   int                   best_overlap = 0;       /* overlap of the best alignment */
1666   int                   run_through = 0;
1667   int                   nMatch;
1668   int                   asm_len = 0;
1669   int                   st_pass;
1670   double                uncalled = 0;
1671 
1672   n = strlen (left->data);
1673 
1674   /* compute score for every overlap */
1675   score = 0;
1676   oes   = 0;
1677   for (i = 0; i <= n; ++ i)    /* the size of the overlap */
1678    {
1679      nMatch = 0;
1680      score = 0;
1681      oes   = 0;
1682      for (j = 0; j < i; ++ j)
1683       {
1684         //scoring (left->data[n - i + j], right->data[j], left->qscore[n - i + j], right->qscore[j], sw->score_method, &score, &oes, match_score, mismatch_score);
1685         scoring_nm (left->data[n - i + j], right->data[j], left->qscore[n - i + j], right->qscore[j], sw->score_method, &score, &oes);
1686         if (left->data[n - i + j] == right->data[j]) ++nMatch;
1687       }
1688      if (score > best_score)
1689       {
1690         best_overlap = i;
1691         best_score   = score;
1692         best_oes     = oes;
1693       }
1694    }
1695 
1696   /* compute for score for runthrough case */
1697   for (i = n - 1; i > 0; --i)
1698    {
1699      score  = 0;
1700      oes    = 0;
1701      nMatch = 0;
1702      for (j = 0; j < i; ++j)
1703       {
1704         //scoring (left->data[j], right->data[n - i + j], left->qscore[j], right->qscore[n - i + j], sw->score_method, &score, &oes, match_score, mismatch_score);
1705         scoring_nm (left->data[j], right->data[n - i + j], left->qscore[j], right->qscore[n - i + j], sw->score_method, &score, &oes);
1706         if (left->data[n - i + j] == right->data[j]) ++nMatch;
1707       }
1708 
1709      if (score > best_score)
1710       {
1711         run_through  = 1;
1712         best_overlap = i;
1713         best_score   = score;
1714         best_oes     = oes;
1715       }
1716    }
1717 
1718 
1719   if (sw->test == 1)
1720    {
1721      //st_pass = stat_test2 (sw->p_value, best_score, sw->min_overlap, 0.25);
1722      st_pass = stat_test2 (sw->p_value, best_oes, sw->min_overlap, 0.25);
1723    }
1724   else
1725    {
1726      //st_pass = stat_test2 (sw->p_value, best_score, best_overlap, 0.25);
1727      st_pass = stat_test2 (sw->p_value, best_oes, best_overlap, 0.25);
1728    }
1729 
1730   if (!st_pass) return (0);
1731 
1732 
1733   /* do the assembly!!!! */
1734   if (!run_through)
1735    {
1736      if (best_overlap == 0)
1737       {
1738         asm_len = (n << 1);
1739 
1740         for (j = 0; j < n; ++ j)
1741           if (DEGENERATE(left->data[j]))  ++uncalled;
1742         for (j = 0; j < n; ++ j)
1743           if (DEGENERATE(right->data[j]))  ++uncalled;
1744         uncalled /= asm_len;
1745 
1746         if ((n << 1) - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1747          {
1748            PEAR_SET_OUT_TYPE(left,PEAR_READ_OUT_BOTH);
1749          }
1750         else
1751          {
1752            return (0);
1753          }
1754       }
1755      else if (best_overlap == n )
1756       {
1757         asm_len         = n;
1758 
1759         for (j = 0; j < asm_len; ++ j)
1760           if ((DEGENERATE(left->data[j])) && (DEGENERATE(right->data[j]))) ++uncalled;
1761         uncalled /= asm_len;
1762 
1763         if ((n << 1) - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1764          {
1765            PEAR_SET_OUT_TYPE(left,PEAR_READ_OUT_SINGLE);
1766            assemble_overlap (left, right, 0, 0, n, left, sw);
1767            left->data[n]   = 0;
1768            left->qscore[n] = 0;
1769          }
1770         else
1771          {
1772            return (0);
1773          }
1774       }
1775      else
1776       {
1777         asm_len = (n << 1) - best_overlap;
1778         for (j = 0; j < n - best_overlap; ++ j)
1779           if (DEGENERATE(left->data[j]))  ++uncalled;
1780         for (j = n - best_overlap; j < n; ++ j)
1781           if ((DEGENERATE(left->data[j])) && (DEGENERATE(right->data[j - n + best_overlap])))  ++uncalled;
1782         for (j = best_overlap; j < n; ++ j)
1783           if (DEGENERATE(right->data[j]))  ++uncalled;
1784         uncalled /= asm_len;
1785 
1786         if ((n << 1) - asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1787          {
1788            PEAR_SET_OUT_TYPE(left,PEAR_READ_OUT_BOTH);
1789            assemble_overlap (left, right, n - best_overlap, 0, best_overlap, left, sw);
1790            memmove (right->data,   right->data   + best_overlap,  n - best_overlap);
1791            memmove (right->qscore, right->qscore + best_overlap,  n - best_overlap);
1792            /* THIS IS WRONG */
1793            //memcpy (right->data,   right->data   + best_overlap,  n - best_overlap);
1794            //memcpy (right->qscore, right->qscore + best_overlap,  n - best_overlap);
1795 
1796            right->data[n   - best_overlap] = 0;
1797            right->qscore[n - best_overlap] = 0;
1798          }
1799         else
1800          {
1801            return (0);
1802          }
1803       }
1804    }     /* run-through case */
1805   else
1806    {
1807      asm_len = best_overlap;
1808 
1809      /* compute uncalled */
1810      for (j = 0; j < asm_len; ++ j)
1811        if ((DEGENERATE(left->data[j])) && (DEGENERATE(right->data[n - best_overlap + j]))) ++uncalled;
1812      uncalled /= asm_len;
1813 
1814      if (asm_len >= sw->min_overlap && asm_len >= sw->min_asm_len && asm_len <= sw->max_asm_len && uncalled <= sw->max_uncalled)
1815       {
1816         assemble_overlap (left, right, 0, n - best_overlap, best_overlap, left, sw);
1817 
1818         left->data[best_overlap]   = 0;
1819         left->qscore[best_overlap] = 0;
1820         PEAR_SET_OUT_TYPE(left,PEAR_READ_OUT_SINGLE);   /* flag that it's one piece */
1821       }
1822      else
1823       {
1824         return (0);
1825       }
1826    }
1827 
1828   /* TODO: Remember to reset *(let->data - 1) */
1829 
1830   /* TODO: Optimize this in assemble_overlap? */
1831 
1832   return (1);
1833 }
1834 
1835 /** @brief Assemble the overlap
1836 
1837     @param left
1838     @param right
1839     @param base_left  Position in the forward read where the overlap starts
1840     @param base_right Position in the reverse read where the overlap starts
1841     @param ol_size    Overlap size
1842     @param ai         Buffer where to store the merge read
1843 */
1844 
assemble_overlap(fastqRead * left,fastqRead * right,int base_left,int base_right,int ol_size,fastqRead * ai,struct user_args * sw)1845 static double assemble_overlap (fastqRead * left, fastqRead * right, int base_left, int base_right, int ol_size, fastqRead * ai, struct user_args * sw)
1846 {
1847   int           i;
1848   char          x, y;
1849   char          qx, qy;
1850   //double        exp_match  = 0;
1851 
1852   for (i = 0; i < ol_size; ++i)
1853    {
1854      x  = left->data[base_left + i];
1855      y  = right->data[base_right + i];
1856      qx = left->qscore[base_left + i];
1857      qy = right->qscore[base_right + i];
1858      if ((DEGENERATE(x)) && (DEGENERATE(y)))
1859       {
1860         //exp_match += 0.25; sm_len
1861         ai->data[base_left + i]   = x | y;
1862         ai->qscore[base_left + i] = ( qx < qy ) ? qx : qy;
1863       }
1864      else if (DEGENERATE(x))
1865       {
1866         //exp_match += 0.25;
1867         ai->data[base_left + i]   = y;
1868         ai->qscore[base_left + i] = qy;
1869       }
1870      else if (DEGENERATE(y))
1871       {
1872         //exp_match += 0.25;
1873         ai->data[base_left + i]   = x;
1874         ai->qscore[base_left + i] = qx;
1875       }
1876      else
1877       {
1878         if (x == y)
1879          {
1880            //exp_match += (sc_eq[(int)qx][(int)qy] / match_score);
1881 
1882            ai->data[base_left + i] = x;
1883            //ai->qscore[base_left + i] = (right->qscore[base_right + i] - sw->phred_base) + (left->qscore[base_left + i] - sw->phred_base) + sw->phred_base; //qs_mul[qx][qy];
1884            ai->qscore[base_left + i] = (right->qscore[base_right + i]) + (left->qscore[base_left + i]) - 1; //qs_mul[qx][qy];
1885          }
1886         else
1887          {
1888            //exp_match += (1 - sc_neq[(int)qx][(int)qy] / mismatch_score);
1889 
1890            if (qx > qy)
1891             {
1892               if (sw->nbase)    /* nbase option */
1893               {
1894                 ai->data[base_left + i] = map_nt['N'];
1895                 ai->qscore[base_left + i] = qx;
1896               }
1897               else
1898               {
1899                 ai->data[base_left + i]   =  x;
1900                 ai->qscore[base_left + i] = qx;
1901               }
1902             }
1903            else
1904             {
1905               if (sw->nbase)    /* nbase option */
1906               {
1907                 ai->data[base_left + i] = map_nt['N'];
1908                 ai->qscore[base_left + i] = qy;
1909               }
1910               else
1911               {
1912                 ai->data[base_left + i]   =  y;
1913                 ai->qscore[base_left + i] = qy;
1914               }
1915             }
1916          }
1917       }
1918    }
1919   return (0);
1920 }
1921 
mstrrev(char * s)1922 static void mstrrev (char * s)
1923 {
1924   char * q = s;
1925 
1926   while (q && *q) ++q;
1927 
1928   for (--q; s < q; ++s, --q)
1929    {
1930      *s = *s ^ *q;
1931      *q = *s ^ *q;
1932      *s = *s ^ *q;
1933    }
1934 }
1935 
mstrcpl(char * s)1936 static void mstrcpl (char * s)
1937 {
1938   if (!s) return;
1939 
1940   while (*s)
1941    {
1942       /* if the codes were 0,1,2,3,4 it would be
1943          that simple:
1944       *s = (*s >> 2) + (~*s & 3);
1945       */
1946 
1947       /* with codes 1,2,3,4,5 it gets nasty: */
1948       *s = cpl_nt[(int)*s];
1949       /*
1950       *s = (((*s & 1) & (! (*s & 2))) << 2) |
1951            (((!((*s & 4) >> 2) ) & ((*s & 2) >> 1)) << 1 ) |
1952            (((!((*s & 4) >> 2)) & ((*s & 2) >> 1) & !(*s & 1)) |
1953                 (((*s & 4) >> 2) & !((*s & 2) >> 1)));*/
1954       ++s;
1955    }
1956 }
1957 
1958 #if 0
1959 static int validate_input (int nleft, int nright)
1960 {
1961   if (!nleft || !nright)
1962    {
1963      fprintf (stderr, "ERROR: At least one of the input files contains no records.\n");
1964      return (0);
1965    }
1966 
1967   if (nleft != nright)
1968    {
1969      fprintf (stderr, "ERROR: Number of records in the two input files does not match.\n");
1970      return (0);
1971    }
1972 
1973   return (1);
1974 }
1975 #endif
1976 
1977 static char *
makefilename(const char * prefix,const char * suffix)1978 makefilename (const char * prefix, const char * suffix)
1979 {
1980   char * filename;
1981 
1982   filename = (char *) malloc ((strlen(prefix) + strlen(suffix) + 1) * sizeof (char));
1983 
1984   strcpy (filename, prefix);
1985   strcat (filename, suffix);
1986 
1987   return (filename);
1988 }
1989 
1990 /** @brief Write data to file
1991  *
1992  *  Write the result of overlapping of a specific read to the corresponding output file
1993  *
1994  *  @param fwd  Structure containing the forward reads. Note that this structure is
1995  *              mutable and has been modified to contain the new read which is the
1996  *              result of the assembly.
1997  *  @param rev  Structure containing the reverse reads. Note that this structure is
1998  *              mutable and has been modified to contain the new read which is the
1999  *              result of the assembly.
2000  *  @param elms Number of reads in the structure
2001  *  @param fd   Array of file descriptors of output files
2002  */
write_data(fastqRead ** fwd,fastqRead ** rev,unsigned int elms,FILE ** fd,struct user_args * sw)2003 static void write_data (fastqRead ** fwd, fastqRead ** rev, unsigned int elms, FILE ** fd, struct user_args * sw)
2004 {
2005   unsigned int i;
2006   char bothOut;   /* this is set if both fwd[i] and rev[i] contain the resulting assembled read and qscore */
2007 
2008   for ( i = 0; i < elms; ++ i)
2009    {
2010      bothOut = PEAR_DECODE_OUT_TYPE(fwd[i]);
2011      PEAR_RESET_OUT_TYPE(fwd[i]);
2012 
2013      if (PEAR_DECODE_ASM_TYPE(fwd[i]) == PEAR_READ_ASSEMBLED)   /* assembled */
2014       {
2015         PEAR_RESET_ASM_TYPE(fwd[i]);
2016         revconvert(fwd[i]->data);
2017         fprintf (fd[0], "%s\n", fwd[i]->header);
2018         if (!bothOut)
2019          {
2020            fprintf (fd[0], "%s\n", fwd[i]->data);
2021          }
2022         else
2023          {
2024            revconvert(rev[i]->data);
2025            fprintf (fd[0], "%s",   fwd[i]->data);
2026            fprintf (fd[0], "%s\n", rev[i]->data);
2027          }
2028         fprintf (fd[0], "+\n");
2029 
2030 
2031         add_base_qscores(fwd[i]->qscore, sw->phred_base, sw->cap);
2032         if (!bothOut)
2033          {
2034            fprintf (fd[0], "%s\n", fwd[i]->qscore);
2035          }
2036         else
2037          {
2038            add_base_qscores(rev[i]->qscore, sw->phred_base, sw->cap);
2039            fprintf (fd[0], "%s",   fwd[i]->qscore);
2040            fprintf (fd[0], "%s\n", rev[i]->qscore);
2041          }
2042 
2043         ++ g_count_assembled;
2044       }
2045      else
2046       {
2047         add_base_qscores(fwd[i]->qscore, sw->phred_base, sw->cap);
2048         add_base_qscores(rev[i]->qscore, sw->phred_base, sw->cap);
2049         revconvert(fwd[i]->data);
2050         revconvert(rev[i]->data);
2051         if (PEAR_DECODE_ASM_TYPE(fwd[i]) == PEAR_READ_DISCARDED)                                            /* discarded */
2052          {
2053            PEAR_RESET_ASM_TYPE(fwd[i]);
2054            /* discarded reads*/
2055            /* Maybe consider printing the untrimmed sequences */
2056            fprintf (fd[3], "%s\n", fwd[i]->header);
2057            fprintf (fd[3], "%s\n+\n%s\n", fwd[i]->data,  fwd[i]->qscore);
2058            fprintf (fd[3], "%s\n", rev[i]->header);
2059            fprintf (fd[3], "%s\n+\n%s\n", rev[i]->data, rev[i]->qscore); /* printing the reverse compliment of the original sequence */
2060            ++ g_count_discarded;
2061          }
2062         else   /* unassembled reads*/
2063          {
2064            PEAR_RESET_ASM_TYPE(fwd[i]);
2065            fprintf (fd[1], "%s\n", fwd[i]->header);
2066            fprintf (fd[2], "%s\n", rev[i]->header);
2067            fprintf (fd[1], "%s\n+\n%s\n", fwd[i]->data,  fwd[i]->qscore);
2068            fprintf (fd[2], "%s\n+\n%s\n", rev[i]->data, rev[i]->qscore); /* printing the reverse compliment of the original sequence */
2069            ++ g_count_unassembled;
2070          }
2071       }
2072    }
2073 }
2074 
flip_list(void)2075 static void flip_list (void)
2076 {
2077   memBlockInfo * elm1;
2078   memBlockInfo * elm2;
2079 
2080   elm1 = thr_global.xblock;
2081   elm2 = thr_global.yblock;
2082 
2083   thr_global.xblock = elm2;
2084   thr_global.yblock = elm1;
2085 }
2086 
assign_reads(memBlockInfo * block,struct thread_local_t * thr_local)2087 static INLINE int assign_reads (memBlockInfo * block, struct thread_local_t * thr_local)
2088 {
2089   unsigned int r;
2090 
2091   if (block->reads == block->processed) return (0);
2092 
2093   r = THREAD_MIN_PACKET_SIZE + (rand() % THREAD_MIN_PACKET_SIZE);
2094   thr_local->block   = block;
2095   thr_local->start   = block->processed;
2096 
2097   if (block->reads - block->processed <= r + THREAD_PACKET_SIZE_DELTA)
2098    {
2099      thr_local->end    = block->reads;
2100      block->processed  = block->reads;
2101    }
2102   else
2103    {
2104      thr_local->end    = thr_local->start + r;
2105      block->processed  = thr_local->start + r;
2106    }
2107   ++ block->threads;
2108   return (1);
2109 }
2110 
entry_point_ef(void * data)2111 static void * entry_point_ef (void * data)
2112 {
2113   struct thread_local_t * thr_local;
2114   int ass, sleep, elms;
2115   unsigned int i;
2116   double                uncalled_forward, uncalled_reverse;
2117 
2118   /* initialization of values */
2119   thr_local = (struct thread_local_t *)data;
2120 
2121 
2122   while (1)
2123    {
2124      // TODO: The first condition in the next line is useless?
2125      if (thr_local->block == thr_global.xblock && thr_global.io_thread == thr_local->id)
2126       {
2127         pthread_mutex_lock (&cs_mutex_io);
2128         //fprintf (stdout, "."); fflush (stdout);
2129         fprintf (stdout,"\rAssemblying reads: %d\%%", (int) ((current_progress / (double)total_progress) * 100));
2130         fflush(stdout);
2131         write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd, thr_local->sw);
2132         // TODO: read_data ();
2133         elms = db_get_next_reads (thr_global.xblock->fwd,
2134                                   thr_global.xblock->rev,
2135                                   thr_global.yblock->fwd,
2136                                   thr_global.yblock->rev,
2137                                   &inputFileSanity);
2138           //read_size = strlen (fwd_block.reads[0]->data);
2139         pthread_mutex_unlock (&cs_mutex_io);
2140 
2141         pthread_mutex_lock (&cs_mutex_wnd);
2142         thr_global.xblock->reads      =  elms;
2143         thr_global.xblock->processed  =  0;
2144         thr_global.io_thread          = -1;
2145         thr_global.xblock->threads    =  0;
2146         if (!elms) thr_global.finish  =  1;
2147         flip_list ();
2148 
2149 #ifdef __DEBUG__
2150            pthread_mutex_lock (&cs_mutex_out);
2151            printf ("READ %d elms\n", elms);
2152            printf ("WAKE_UP_ALL!    (reads: %d processed: %d)\n", thr_global.xblock->reads, thr_global.xblock->processed);
2153            pthread_mutex_unlock (&cs_mutex_out);
2154 #endif
2155         // wakeup threads
2156         pthread_cond_broadcast (&cs_mutex_cond);
2157         pthread_mutex_unlock (&cs_mutex_wnd);
2158       }
2159 
2160      sleep = 1;
2161 
2162      pthread_mutex_lock (&cs_mutex_wnd);
2163      if (thr_global.xblock->reads == thr_global.xblock->processed)
2164       {
2165         if (thr_global.finish)
2166          {
2167            if (!thr_global.xblock->threads)
2168             {
2169               thr_global.xblock->threads = 1;
2170               pthread_mutex_unlock (&cs_mutex_wnd);
2171 //              printf ("!!!!!!!!!! reads: %d processed: %d\n", thr_global.xblock->reads, thr_global.xblock->processed);
2172 //              printf ("!!!!!!!!!! Writing another %d reads\n", thr_global.xblock->reads);
2173               write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd, thr_local->sw);
2174 //              printf ("Finsihed\n");
2175               break;
2176             }
2177            pthread_mutex_unlock (&cs_mutex_wnd);
2178            break;
2179          }
2180         /* is this the last thread using the current buffer? */
2181         if (thr_global.xblock->threads == 0 && thr_global.io_thread == -1 && thr_global.finish == 0)
2182          {
2183 #ifdef __DEBUG__
2184            pthread_mutex_lock (&cs_mutex_out);
2185            printf ("ASSIGNED IO THREAD %d\n", thr_local->id);
2186            pthread_mutex_unlock (&cs_mutex_out);
2187 #endif
2188            thr_global.io_thread = thr_local->id;
2189            thr_local->block = thr_global.xblock;
2190            sleep = 0;
2191          }
2192         else
2193          {
2194            if (assign_reads (thr_global.yblock, thr_local)) sleep = 0;
2195 #ifdef __DEBUG__
2196            if (!sleep) {
2197            pthread_mutex_lock (&cs_mutex_out);
2198            printf ("ASSIGNED y READS to %d   (%d - %d)  Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.yblock->threads);
2199            pthread_mutex_unlock (&cs_mutex_out);}
2200 #endif
2201          }
2202       }
2203      else
2204       {
2205         if (assign_reads (thr_global.xblock,thr_local)) sleep = 0;
2206 #ifdef __DEBUG__
2207            if (!sleep) {
2208            pthread_mutex_lock (&cs_mutex_out);
2209            printf ("ASSIGNED x READS to %d  (%d - %d)   Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.xblock->threads);
2210            pthread_mutex_unlock (&cs_mutex_out);}
2211 #endif
2212       }
2213      if (sleep)
2214       {
2215 #ifdef __DEBUG__
2216            pthread_mutex_lock (&cs_mutex_out);
2217            printf ("Sleeping %d\n", thr_local->id);
2218            pthread_mutex_unlock (&cs_mutex_out);
2219 #endif
2220         pthread_cond_wait (&cs_mutex_cond, &cs_mutex_wnd);
2221 #ifdef __DEBUG__
2222            pthread_mutex_lock (&cs_mutex_out);
2223            printf ("WAKING %d\n", thr_local->id);
2224            pthread_mutex_unlock (&cs_mutex_out);
2225 #endif
2226       }
2227      pthread_mutex_unlock (&cs_mutex_wnd);
2228 
2229      if (!sleep && thr_global.io_thread != thr_local->id)
2230       {
2231         // TODO: Process reads and make this an inline function
2232         for (i = thr_local->start; i < thr_local->end; ++ i)
2233          {
2234            convert (thr_local->block->fwd->reads[i]->data);
2235            convert (thr_local->block->rev->reads[i]->data);
2236            remove_base_qscores (thr_local->block->fwd->reads[i]->qscore, thr_local->sw->phred_base);
2237            remove_base_qscores (thr_local->block->rev->reads[i]->qscore, thr_local->sw->phred_base);
2238 
2239            mstrrev (thr_local->block->rev->reads[i]->data);    /* reverse the sequence */
2240            mstrcpl (thr_local->block->rev->reads[i]->data);    /* complement the sequence */
2241            mstrrev (thr_local->block->rev->reads[i]->qscore);  /* reverse the quality scores */
2242 
2243            //TODO Switch for empirical frequencies
2244            ass = assembly_ef (thr_local->block->fwd->reads[i], thr_local->block->rev->reads[i], thr_local->ef, thr_local->sw);
2245            PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],ass);
2246            if (!ass)
2247             {
2248               if (trim     (thr_local->block->fwd->reads[i], thr_local->sw, &uncalled_forward) < thr_local->sw->min_trim_len ||
2249                   trim_cpl (thr_local->block->rev->reads[i], thr_local->sw, &uncalled_reverse) < thr_local->sw->min_trim_len ||
2250                   uncalled_forward >= thr_local->sw->max_uncalled || uncalled_reverse >= thr_local->sw->max_uncalled)
2251                {
2252                  PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],PEAR_READ_DISCARDED);
2253                }
2254               else
2255                {
2256                  PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],PEAR_READ_UNASSEMBLED);
2257                }
2258             }
2259          }
2260 
2261         pthread_mutex_lock (&cs_mutex_wnd);
2262           -- thr_local->block->threads;
2263 #ifdef __DEBUG__
2264            pthread_mutex_lock (&cs_mutex_out);
2265            printf ("Finished %d  (Remaining threads: %d)\n", thr_local->id, thr_local->block->threads);
2266            pthread_mutex_unlock (&cs_mutex_out);
2267 #endif
2268         pthread_mutex_unlock (&cs_mutex_wnd);
2269       }
2270   //   pthread_mutex_lock (&cs_mutex_out);
2271   //     printf ("Thread: %d\n", thr_local->id);
2272   //   pthread_mutex_unlock (&cs_mutex_out);
2273    }
2274   return (NULL);
2275 }
entry_point(void * data)2276 static void * entry_point (void * data)
2277 {  struct thread_local_t * thr_local;
2278   int ass, sleep, elms;
2279   unsigned int i;
2280   double                uncalled_forward, uncalled_reverse;
2281 
2282   /* initialization of values */
2283   thr_local = (struct thread_local_t *)data;
2284 
2285 
2286   while (1)
2287    {
2288      // TODO: The first condition in the next line is useless?
2289      if (thr_local->block == thr_global.xblock && thr_global.io_thread == thr_local->id)
2290       {
2291         pthread_mutex_lock (&cs_mutex_io);
2292         fprintf (stdout, "."); fflush (stdout);
2293         write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd, thr_local->sw);
2294         // TODO: read_data ();
2295         elms = db_get_next_reads (thr_global.xblock->fwd,
2296                                   thr_global.xblock->rev,
2297                                   thr_global.yblock->fwd,
2298                                   thr_global.yblock->rev,
2299                                   &inputFileSanity);
2300           //read_size = strlen (fwd_block.reads[0]->data);
2301         pthread_mutex_unlock (&cs_mutex_io);
2302 
2303         pthread_mutex_lock (&cs_mutex_wnd);
2304         thr_global.xblock->reads      =  elms;
2305         thr_global.xblock->processed  =  0;
2306         thr_global.io_thread          = -1;
2307         thr_global.xblock->threads    =  0;
2308         if (!elms) thr_global.finish  =  1;
2309         flip_list ();
2310 
2311 #ifdef __DEBUG__
2312            pthread_mutex_lock (&cs_mutex_out);
2313            printf ("READ %d elms\n", elms);
2314            printf ("WAKE_UP_ALL!    (reads: %d processed: %d)\n", thr_global.xblock->reads, thr_global.xblock->processed);
2315            pthread_mutex_unlock (&cs_mutex_out);
2316 #endif
2317         // wakeup threads
2318         pthread_cond_broadcast (&cs_mutex_cond);
2319         pthread_mutex_unlock (&cs_mutex_wnd);
2320       }
2321 
2322      sleep = 1;
2323 
2324      pthread_mutex_lock (&cs_mutex_wnd);
2325      if (thr_global.xblock->reads == thr_global.xblock->processed)
2326       {
2327         if (thr_global.finish)
2328          {
2329            if (!thr_global.xblock->threads)
2330             {
2331               thr_global.xblock->threads = 1;
2332               pthread_mutex_unlock (&cs_mutex_wnd);
2333               write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd, thr_local->sw);
2334             }
2335            pthread_mutex_unlock (&cs_mutex_wnd);
2336            break;
2337          }
2338         /* is this the last thread using the current buffer? */
2339         if (thr_global.xblock->threads == 0 && thr_global.io_thread == -1)
2340          {
2341 #ifdef __DEBUG__
2342            pthread_mutex_lock (&cs_mutex_out);
2343            printf ("ASSIGNED IO THREAD %d\n", thr_local->id);
2344            pthread_mutex_unlock (&cs_mutex_out);
2345 #endif
2346            thr_global.io_thread = thr_local->id;
2347            thr_local->block = thr_global.xblock;
2348            sleep = 0;
2349          }
2350         else
2351          {
2352            if (assign_reads (thr_global.yblock, thr_local)) sleep = 0;
2353 #ifdef __DEBUG__
2354            if (!sleep) {
2355            pthread_mutex_lock (&cs_mutex_out);
2356            printf ("ASSIGNED y READS to %d   (%d - %d)  Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.yblock->threads);
2357            pthread_mutex_unlock (&cs_mutex_out);}
2358 #endif
2359          }
2360       }
2361      else
2362       {
2363         if (assign_reads (thr_global.xblock,thr_local)) sleep = 0;
2364 #ifdef __DEBUG__
2365            if (!sleep) {
2366            pthread_mutex_lock (&cs_mutex_out);
2367            printf ("ASSIGNED x READS to %d  (%d - %d)   Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.xblock->threads);
2368            pthread_mutex_unlock (&cs_mutex_out);}
2369 #endif
2370       }
2371      if (sleep)
2372       {
2373 #ifdef __DEBUG__
2374            pthread_mutex_lock (&cs_mutex_out);
2375            printf ("Sleeping %d\n", thr_local->id);
2376            pthread_mutex_unlock (&cs_mutex_out);
2377 #endif
2378         pthread_cond_wait (&cs_mutex_cond, &cs_mutex_wnd);
2379 #ifdef __DEBUG__
2380            pthread_mutex_lock (&cs_mutex_out);
2381            printf ("WAKING %d\n", thr_local->id);
2382            pthread_mutex_unlock (&cs_mutex_out);
2383 #endif
2384       }
2385      pthread_mutex_unlock (&cs_mutex_wnd);
2386 
2387      if (!sleep && thr_global.io_thread != thr_local->id)
2388       {
2389         // TODO: Process reads and make this an inline function
2390         for (i = thr_local->start; i < thr_local->end; ++ i)
2391          {
2392            convert (thr_local->block->fwd->reads[i]->data);
2393            convert (thr_local->block->rev->reads[i]->data);
2394            remove_base_qscores (thr_local->block->fwd->reads[i]->qscore, thr_local->sw->phred_base);
2395            remove_base_qscores (thr_local->block->rev->reads[i]->qscore, thr_local->sw->phred_base);
2396 
2397            mstrrev (thr_local->block->rev->reads[i]->data);    /* reverse the sequence */
2398            mstrcpl (thr_local->block->rev->reads[i]->data);    /* complement the sequence */
2399            mstrrev (thr_local->block->rev->reads[i]->qscore);  /* reverse the quality scores */
2400 
2401            //TODO Switch for empirical frequencies
2402            ass = assembly (thr_local->block->fwd->reads[i], thr_local->block->rev->reads[i], thr_local->sw);
2403            PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],ass);
2404            if (!ass)
2405             {
2406               if (trim     (thr_local->block->fwd->reads[i], thr_local->sw, &uncalled_forward) < thr_local->sw->min_trim_len ||
2407                   trim_cpl (thr_local->block->rev->reads[i], thr_local->sw, &uncalled_reverse) < thr_local->sw->min_trim_len ||
2408                   uncalled_forward >= thr_local->sw->max_uncalled || uncalled_reverse >= thr_local->sw->max_uncalled)
2409                {
2410                  PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],PEAR_READ_DISCARDED);
2411                }
2412               else
2413                {
2414                  PEAR_SET_ASM_TYPE(thr_local->block->fwd->reads[i],PEAR_READ_UNASSEMBLED);
2415                }
2416             }
2417          }
2418 
2419         pthread_mutex_lock (&cs_mutex_wnd);
2420           -- thr_local->block->threads;
2421 #ifdef __DEBUG__
2422            pthread_mutex_lock (&cs_mutex_out);
2423            printf ("Finished %d  (Remaining threads: %d)\n", thr_local->id, thr_local->block->threads);
2424            pthread_mutex_unlock (&cs_mutex_out);
2425 #endif
2426         pthread_mutex_unlock (&cs_mutex_wnd);
2427       }
2428   //   pthread_mutex_lock (&cs_mutex_out);
2429   //     printf ("Thread: %d\n", thr_local->id);
2430   //   pthread_mutex_unlock (&cs_mutex_out);
2431    }
2432   return (NULL);
2433 }
2434 
init_thr_global(void)2435 static void init_thr_global (void)
2436 {
2437   thr_global.xblock = (memBlockInfo *) calloc (1,sizeof(memBlockInfo));
2438   thr_global.yblock = (memBlockInfo *) calloc (1,sizeof(memBlockInfo));
2439   thr_global.xblock->fwd = (memBlock *) calloc (1,sizeof(memBlock));
2440   thr_global.xblock->rev = (memBlock *) calloc (1,sizeof(memBlock));
2441   thr_global.yblock->fwd = (memBlock *) calloc (1,sizeof(memBlock));
2442   thr_global.yblock->rev = (memBlock *) calloc (1,sizeof(memBlock));
2443 
2444   thr_global.xblock->reads     = 0;
2445   thr_global.xblock->processed = 0;
2446   thr_global.xblock->threads   = 0;
2447   thr_global.yblock->reads     = 0;
2448   thr_global.yblock->processed = 0;
2449   thr_global.yblock->threads   = 0;
2450 
2451   thr_global.io_thread = -1;
2452   thr_global.finish = 0;
2453 }
2454 
close_output_files(void)2455 static void close_output_files (void)
2456 {
2457   fclose (thr_global.fd[0]);
2458   fclose (thr_global.fd[1]);
2459   fclose (thr_global.fd[2]);
2460   fclose (thr_global.fd[3]);
2461 }
2462 
free_global_thread_memory(void)2463 static void free_global_thread_memory (void)
2464 {
2465   free (thr_global.xblock->fwd);
2466   free (thr_global.xblock->rev);
2467   free (thr_global.yblock->fwd);
2468   free (thr_global.yblock->rev);
2469   free (thr_global.xblock);
2470   free (thr_global.yblock);
2471 }
2472 
destroy_thr_global(void)2473 static void destroy_thr_global (void)
2474 {
2475   close_output_files();
2476 
2477   /* free memory */
2478   free_global_thread_memory ();
2479 }
2480 
2481 /** @brief Initialize output file names
2482 
2483     Create output file names based on input parameters, open them for writing
2484     and store the file pointers to the global thread structures
2485 
2486     @param sw
2487       Parsed command-line parameters given by the user
2488 
2489     @return
2490       Returns 1 in case of success, 0 if error
2491 */
init_files(struct user_args * sw)2492 static int init_files (struct user_args * sw)
2493 {
2494   int i, j;
2495   char * out[4];
2496 
2497   /* construct output file names */
2498   for (i = 0; i < NUM_OF_OUTFILES; ++ i)
2499    {
2500      out[i] = makefilename (sw->outfile, outfile_extensions[i]);
2501      thr_global.fd[i] = fopen (out[i], "w");
2502      if (!thr_global.fd[i])
2503       {
2504         fprintf (stderr, "[ERROR]: Cannot create file %s.\n         "
2505         "Check whether a) directory exists and b) there is enough space\n", out[i]);
2506         free (out[i]);
2507         for (j = 0; j < i; ++ j) fclose (thr_global.fd[j]);
2508         return (0);
2509       }
2510      free (out[i]);
2511    }
2512 
2513   return (1);
2514 }
2515 
emp_entry_point(void * data)2516 static void * emp_entry_point (void * data)
2517 {
2518   struct thread_local_t * thr_local;
2519   int j, sleep, elms;
2520   unsigned int i;
2521   struct emp_freq * ef;
2522   fastqRead * reads[2];
2523   char * seq;
2524 
2525   /* initialization of values */
2526   thr_local = (struct thread_local_t *)data;
2527 
2528   ef = (struct emp_freq *) calloc (1, sizeof (struct emp_freq));
2529 
2530   while (1)
2531    {
2532      // TODO: The first condition in the next line is useless?
2533      if (thr_local->block == thr_global.xblock && thr_global.io_thread == thr_local->id)
2534       {
2535         pthread_mutex_lock (&cs_mutex_io);
2536         //write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd);
2537         // TODO: read_data ();
2538         elms = db_get_next_reads (thr_global.xblock->fwd,
2539                                   thr_global.xblock->rev,
2540                                   thr_global.yblock->fwd,
2541                                   thr_global.yblock->rev,
2542                                   &inputFileSanity);
2543           //read_size = strlen (fwd_block.reads[0]->data);
2544         pthread_mutex_unlock (&cs_mutex_io);
2545 
2546         pthread_mutex_lock (&cs_mutex_wnd);
2547         thr_global.xblock->reads      =  elms;
2548         thr_global.xblock->processed  =  0;
2549         thr_global.io_thread          = -1;
2550         thr_global.xblock->threads    =  0;
2551         if (!elms) thr_global.finish  =  1;
2552         flip_list ();
2553 
2554 #ifdef __DEBUG__
2555            pthread_mutex_lock (&cs_mutex_out);
2556            printf ("READ %d elms\n", elms);
2557            printf ("WAKE_UP_ALL!    (reads: %d processed: %d)\n", thr_global.xblock->reads, thr_global.xblock->processed);
2558            pthread_mutex_unlock (&cs_mutex_out);
2559 #endif
2560         // wakeup threads
2561         pthread_cond_broadcast (&cs_mutex_cond);
2562         pthread_mutex_unlock (&cs_mutex_wnd);
2563       }
2564 
2565      sleep = 1;
2566 
2567      pthread_mutex_lock (&cs_mutex_wnd);
2568      if (thr_global.xblock->reads == thr_global.xblock->processed)
2569       {
2570         if (thr_global.finish)
2571          {
2572            pthread_mutex_unlock (&cs_mutex_wnd);
2573            break;
2574          }
2575         /*
2576         if (thr_global.finish)
2577          {
2578            pthread_mutex_unlock (&cs_mutex_wnd);
2579            if (!thr_global.xblock->threads)
2580             {
2581               write_data (thr_global.xblock->fwd->reads, thr_global.xblock->rev->reads, thr_global.xblock->reads, thr_global.fd);
2582             }
2583            break;
2584          }
2585         */
2586         /* is this the last thread using the current buffer? */
2587         if (thr_global.xblock->threads == 0 && thr_global.io_thread == -1)
2588          {
2589 #ifdef __DEBUG__
2590            pthread_mutex_lock (&cs_mutex_out);
2591            printf ("ASSIGNED IO THREAD %d\n", thr_local->id);
2592            pthread_mutex_unlock (&cs_mutex_out);
2593 #endif
2594            thr_global.io_thread = thr_local->id;
2595            thr_local->block = thr_global.xblock;
2596            sleep = 0;
2597          }
2598         else
2599          {
2600            if (assign_reads (thr_global.yblock, thr_local)) sleep = 0;
2601 #ifdef __DEBUG__
2602            if (!sleep) {
2603            pthread_mutex_lock (&cs_mutex_out);
2604            printf ("ASSIGNED y READS to %d   (%d - %d)  Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.yblock->threads);
2605            pthread_mutex_unlock (&cs_mutex_out);}
2606 #endif
2607          }
2608       }
2609      else
2610       {
2611         if (assign_reads (thr_global.xblock,thr_local)) sleep = 0;
2612 #ifdef __DEBUG__
2613            if (!sleep) {
2614            pthread_mutex_lock (&cs_mutex_out);
2615            printf ("ASSIGNED x READS to %d  (%d - %d)   Threads: %d\n", thr_local->id, thr_local->start, thr_local->end, thr_global.xblock->threads);
2616            pthread_mutex_unlock (&cs_mutex_out);}
2617 #endif
2618       }
2619      if (sleep)
2620       {
2621 #ifdef __DEBUG__
2622            pthread_mutex_lock (&cs_mutex_out);
2623            printf ("Sleeping %d\n", thr_local->id);
2624            pthread_mutex_unlock (&cs_mutex_out);
2625 #endif
2626         pthread_cond_wait (&cs_mutex_cond, &cs_mutex_wnd);
2627 #ifdef __DEBUG__
2628            pthread_mutex_lock (&cs_mutex_out);
2629            printf ("WAKING %d\n", thr_local->id);
2630            pthread_mutex_unlock (&cs_mutex_out);
2631 #endif
2632       }
2633      pthread_mutex_unlock (&cs_mutex_wnd);
2634 
2635      if (!sleep && thr_global.io_thread != thr_local->id)
2636       {
2637         // TODO: Process reads and make this an inline function
2638         for (i = thr_local->start; i < thr_local->end; ++ i)
2639          {
2640            reads[0] = thr_local->block->fwd->reads[i];
2641            reads[1] = thr_local->block->rev->reads[i];
2642            for (j = 0; j < 2; ++ j)
2643             {
2644               seq = reads[j]->data;
2645               while (*seq)
2646                {
2647                  switch (*seq)
2648                   {
2649                     case 'A':
2650                     case 'a':
2651                       ++ ef->freqa;
2652                       break;
2653 
2654                     case 'C':
2655                     case 'c':
2656                       ++ ef->freqc;
2657                       break;
2658 
2659                     case 'G':
2660                     case 'g':
2661                       ++ ef->freqg;
2662                       break;
2663 
2664                     case 'T':
2665                     case 't':
2666                       ++ ef->freqt;
2667                       break;
2668                     default:    /* uncalled bases */
2669                       ++ ef->freqn;
2670                       break;
2671 
2672                   }
2673                  ++ seq;
2674                }
2675             }
2676          }
2677 
2678         pthread_mutex_lock (&cs_mutex_wnd);
2679           -- thr_local->block->threads;
2680 #ifdef __DEBUG__
2681            pthread_mutex_lock (&cs_mutex_out);
2682            printf ("Finished %d  (Remaining threads: %d)\n", thr_local->id, thr_local->block->threads);
2683            pthread_mutex_unlock (&cs_mutex_out);
2684 #endif
2685         pthread_mutex_unlock (&cs_mutex_wnd);
2686       }
2687   //   pthread_mutex_lock (&cs_mutex_out);
2688   //     printf ("Thread: %d\n", thr_local->id);
2689   //   pthread_mutex_unlock (&cs_mutex_out);
2690    }
2691 //           pthread_mutex_lock (&cs_mutex_out);
2692 
2693 //  printf ("thread: %2d   A: %d C: %d G: %d T: %d\n", thr_local->id, ef->freqa, ef->freqc, ef->freqg, ef->freqt);
2694   //         pthread_mutex_unlock (&cs_mutex_out);
2695   return (ef);
2696 }
2697 
DisplayInstance(struct user_args * sw)2698 static void DisplayInstance (struct user_args * sw)
2699 {
2700   fprintf (stdout, " ____  _____    _    ____ \n");
2701   fprintf (stdout, "|  _ \\| ____|  / \\  |  _ \\\n");
2702   fprintf (stdout, "| |_) |  _|   / _ \\ | |_) |\n");
2703   fprintf (stdout, "|  __/| |___ / ___ \\|  _ <\n");
2704   fprintf (stdout, "|_|   |_____/_/   \\_\\_| \\_\\\n\n");
2705   fprintf (stdout, "%s v%s [%s]\n\n", PROGRAM_NAME, PROGRAM_VERSION, VERSION_DATE);
2706   fprintf (stdout, "Citation - PEAR: a fast and accurate Illumina Paired-End reAd mergeR\n");
2707   fprintf (stdout, "Zhang et al (2014) Bioinformatics 30(5): 614-620 | doi:10.1093/bioinformatics/btt593\n\n");
2708   fprintf (stdout, "Forward reads file.................: %s\n", sw->fastq_left);
2709   fprintf (stdout, "Reverse reads file.................: %s\n", sw->fastq_right);
2710   fprintf (stdout, "PHRED..............................: %d\n", sw->phred_base);
2711   fprintf (stdout, "Using empirical frequencies........: %s\n", sw->emp_freqs ? "YES" : "NO");
2712   fprintf (stdout, "Statistical method.................: %s\n", (sw->test - 1) ? "Acceptance probability" : "OES");
2713   fprintf (stdout, "Maximum assembly length............: %d\n", sw->max_asm_len);
2714   fprintf (stdout, "Minimum assembly length............: %d\n", sw->min_asm_len);
2715   fprintf (stdout, "p-value............................: %f\n", sw->p_value);
2716   fprintf (stdout, "Quality score threshold (trimming).: %d\n", sw->qual_thres);
2717   fprintf (stdout, "Minimum read size after trimming...: %d\n", sw->min_trim_len);
2718   fprintf (stdout, "Maximal ratio of uncalled bases....: %f\n", sw->max_uncalled);
2719   fprintf (stdout, "Minimum overlap....................: %d\n", sw->min_overlap);
2720   fprintf (stdout, "Scoring method.....................: ");
2721   if  (sw->score_method == 1)
2722    {
2723      fprintf (stdout, "OES with match = +1 and mismatch = -1\n");
2724    }
2725   else if (sw->score_method == 2)
2726    {
2727      fprintf (stdout, "Scaled score\n");
2728    }
2729   else
2730    {
2731      fprintf (stdout, "Ignore quality scores, match = +1 and mismatch = -1\n");
2732    }
2733   fprintf (stdout, "Threads............................: %d\n\n", sw->threads);
2734 
2735 
2736 }
2737 
main(int argc,char * argv[])2738 int main (int argc, char * argv[])
2739 {
2740   int                   i;
2741   struct user_args      sw;
2742   struct emp_freq * ef;
2743   struct thread_local_t * thr_data;
2744   double
2745     a = 0,
2746     c = 0,
2747     g = 0,
2748     t = 0,
2749     n = 0;
2750   pthread_t * tid;
2751   unsigned int
2752     blockElements;
2753 
2754   /* parse command-line arguments */
2755   if (!decode_switches (argc, argv, &sw))
2756    {
2757      /* TODO: Validate reads names */
2758      usage ();
2759      return (EXIT_FAILURE);
2760    }
2761 
2762   /* Display PEAR instance settings */
2763   DisplayInstance (&sw);
2764 
2765   /* check consistency of input files */
2766   if (!pear_check_files (sw.fastq_left, sw.fastq_right))
2767    {
2768      return (EXIT_FAILURE);
2769    }
2770 
2771   init_thr_global ();
2772   thr_data = (struct thread_local_t *) calloc (sw.threads, sizeof (struct thread_local_t));
2773   tid      = (pthread_t *) malloc (sw.threads * sizeof (pthread_t));
2774   if (!init_files (&sw))
2775    {
2776      free_global_thread_memory ();
2777      free (tid);
2778      free (thr_data);
2779      return (0);
2780    }
2781 
2782   /* Initialize read buffers */
2783   init_fastq_reader_double_buffer (sw.fastq_left,
2784                                    sw.fastq_right,
2785                                    sw.memory,
2786                                    thr_global.xblock->fwd,
2787                                    thr_global.xblock->rev,
2788                                    thr_global.yblock->fwd,
2789                                    thr_global.yblock->rev);
2790 
2791   if (sw.emp_freqs)
2792    {
2793      printf ("Computing empirical frequencies....: ");
2794      fflush (stdout);
2795      blockElements = db_get_next_reads (thr_global.yblock->fwd,
2796                                thr_global.yblock->rev,
2797                                thr_global.xblock->fwd,
2798                                thr_global.xblock->rev,
2799                                &inputFileSanity);
2800 
2801      thr_global.yblock->reads     = blockElements;
2802      thr_global.yblock->processed = 0;
2803 
2804      for (i = 0; i < sw.threads; ++ i)
2805       {
2806         thr_data[i].block          = thr_global.xblock;
2807         thr_data[i].id             = i;
2808         thr_data[i].sw             = &sw;
2809         thr_data[i].start          = 0;
2810         thr_data[i].end            = 0;
2811         pthread_create (&tid[i], NULL, emp_entry_point, (void *)&thr_data[i]);
2812       }
2813      for (i = 0; i < sw.threads; ++ i)
2814       {
2815          pthread_join (tid[i], (void **)&ef);
2816          a += ef->freqa; c += ef->freqc; g += ef->freqg; t += ef->freqt; n += ef->freqn;
2817          free (ef);
2818 //         printf ("tid: %d ef->a: %d ef->c: %d ef->g: %d ef->t: %d\n", (int)tid[i], ef->freqa, ef->freqc, ef->freqg, ef->freqt);
2819       }
2820      ef = (struct emp_freq *)malloc (sizeof(struct emp_freq));
2821      ef->freqa = a; ef->freqc = c; ef->freqg = g; ef->freqt = t; ef->freqn = n;
2822      ef->total = a + c + g + t;
2823      ef->pa = ef->freqa / ef->total; ef->pc = ef->freqc / ef->total; ef->pg = ef->freqg / ef->total; ef->pt = ef->freqt / ef->total;
2824      ef->q  = ef->pa * ef->pa + ef->pc * ef->pc + ef->pg * ef->pg + ef->pt * ef->pt;
2825      printf ("DONE\n");
2826      printf ("  A: %f\n  C: %f\n  G: %f\n  T: %f\n", ef->pa, ef->pc, ef->pg, ef->pt);
2827      printf ("  %ld uncalled bases\n", ef->freqn);      /* TODO: Note that only N's are reported */
2828      rewind_files (sw.fastq_left, sw.fastq_right);
2829      thr_global.xblock->fwd->unread = thr_global.xblock->rev->unread = NULL;
2830      thr_global.yblock->fwd->unread = thr_global.yblock->rev->unread = NULL;
2831      thr_global.yblock->reads = thr_global.xblock->reads = 0;
2832      thr_global.yblock->processed = thr_global.xblock->processed = 0;
2833      thr_global.finish = 0;
2834      thr_global.xblock->reads     = 0;
2835      thr_global.xblock->processed = 0;
2836      thr_global.xblock->threads   = 0;
2837      thr_global.yblock->reads     = 0;
2838      thr_global.yblock->processed = 0;
2839      thr_global.yblock->threads   = 0;
2840 
2841 
2842      thr_global.io_thread = -1;
2843      thr_global.finish = 0;
2844 
2845      if (inputFileSanity)
2846       {
2847         fprintf (stderr, "%s", sanityCheckMessage[inputFileSanity - 1]);
2848       }
2849      inputFileSanity = 0;
2850 
2851    }
2852   else
2853    {
2854      ef = (struct emp_freq *)malloc (sizeof(struct emp_freq));
2855      ef->freqa = ef->freqc = ef->freqg = ef->freqt = ef->total = ef->pa = ef->pc = ef->pg = ef->pt = ef->q = 0.25;
2856      sw.emp_freqs = 1;
2857    }
2858 
2859   init_scores(ef);
2860   current_progress = 0;
2861 
2862   //fprintf (stdout, "Assemblying reads..................: [");
2863   fprintf (stdout, "Assemblying reads: 0\%%");
2864   fflush (stdout);
2865 
2866   blockElements = db_get_next_reads (thr_global.yblock->fwd,
2867                             thr_global.yblock->rev,
2868                             thr_global.xblock->fwd,
2869                             thr_global.xblock->rev,
2870                             &inputFileSanity);
2871 
2872   thr_global.yblock->reads     = blockElements;
2873   thr_global.yblock->processed = 0;
2874 
2875   /* pthreads entry point */
2876   for (i = 0; i < sw.threads; ++ i)
2877    {
2878      thr_data[i].block  = thr_global.xblock;
2879      thr_data[i].id     = i;
2880      thr_data[i].sw     = &sw;
2881 //     thr_data[i].match_score = match_score;
2882 //     thr_data[i].mismatch_score = mismatch_score;
2883      thr_data[i].start  = 0;
2884      thr_data[i].end    = 0;
2885      thr_data[i].ef     = ef;
2886      if (sw.emp_freqs)
2887        pthread_create (&tid[i], NULL, entry_point_ef, (void *)&thr_data[i]);
2888      else
2889        pthread_create (&tid[i], NULL, entry_point, (void *)&thr_data[i]);
2890    }
2891 
2892   for (i = 0; i < sw.threads; ++ i)
2893    {
2894      pthread_join (tid[i], NULL);
2895    }
2896   printf ("\n\n");
2897 
2898   g_count_total = g_count_assembled + g_count_discarded + g_count_unassembled;
2899   printf ("Assembled reads ...................: ");
2900   print_number (g_count_assembled);
2901   printf (" / ");
2902   print_number (g_count_total);
2903   printf (" (%.3f%%)\n", ((double)g_count_assembled / g_count_total) * 100);
2904 
2905   printf ("Discarded reads ...................: ");
2906   print_number (g_count_discarded);
2907   printf (" / ");
2908   print_number (g_count_total);
2909   printf (" (%.3f%%)\n", ((double)g_count_discarded / g_count_total) * 100);
2910 
2911   printf ("Not assembled reads ...............: ");
2912   print_number (g_count_unassembled);
2913   printf (" / ");
2914   print_number (g_count_total);
2915   printf (" (%.3f%%)\n", ((double)g_count_unassembled / g_count_total) * 100);
2916 
2917 
2918 
2919   printf ("Assembled reads file...............: %s%s\n", sw.outfile, ".assembled.fastq");
2920   printf ("Discarded reads file...............: %s%s\n", sw.outfile, ".discarded.fastq");
2921   printf ("Unassembled forward reads file.....: %s%s\n", sw.outfile, ".unassembled.forward.fastq");
2922   printf ("Unassembled reverse reads file.....: %s%s\n", sw.outfile, ".unassembled.reverse.fastq" );
2923 
2924      if (inputFileSanity)
2925       {
2926         fprintf (stderr, "%s", sanityCheckMessage[inputFileSanity - 1]);
2927       }
2928   free (ef);
2929   free (tid);
2930 
2931   destroy_reader ();
2932   destroy_thr_global ();
2933   free (thr_data);
2934 
2935   return (EXIT_SUCCESS);
2936 }
2937