1 /****************************************************************\
2 *                                                                *
3 *  Library for HSP sets (high-scoring segment pairs)             *
4 *                                                                *
5 *  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
6 *  Copyright (C) 2000-2009.  All Rights Reserved.                *
7 *                                                                *
8 *  This source code is distributed under the terms of the        *
9 *  GNU General Public License, version 3. See the file COPYING   *
10 *  or http://www.gnu.org/licenses/gpl.txt for details            *
11 *                                                                *
12 *  If you use this code, please keep this notice intact.         *
13 *                                                                *
14 \****************************************************************/
15 
16 #include <string.h> /* For strlen()  */
17 #include <ctype.h>  /* For tolower() */
18 #include <stdlib.h> /* For qsort()   */
19 
20 #include "hspset.h"
21 
HSPset_ArgumentSet_create(Argument * arg)22 HSPset_ArgumentSet *HSPset_ArgumentSet_create(Argument *arg){
23     register ArgumentSet *as;
24     static HSPset_ArgumentSet has = {0};
25     if(arg){
26         as = ArgumentSet_create("HSP creation options");
27         ArgumentSet_add_option(as, '\0', "hspfilter", NULL,
28                 "Aggressive HSP filtering level", "0",
29                 Argument_parse_int, &has.filter_threshold);
30         ArgumentSet_add_option(as, '\0', "useworddropoff", NULL,
31                 "Use word neighbourhood dropoff", "TRUE",
32                 Argument_parse_boolean, &has.use_wordhood_dropoff);
33         ArgumentSet_add_option(as, '\0', "seedrepeat", NULL,
34                 "Seeds per diagonal required for HSP seeding", "1",
35                 Argument_parse_int, &has.seed_repeat);
36         /**/
37         ArgumentSet_add_option(as, 0, "dnawordlen", "bp",
38             "Wordlength for DNA words", "12",
39             Argument_parse_int, &has.dna_wordlen);
40         ArgumentSet_add_option(as, 0, "proteinwordlen", "aa",
41             "Wordlength for protein words", "6",
42             Argument_parse_int, &has.protein_wordlen);
43         ArgumentSet_add_option(as, 0, "codonwordlen", "bp",
44             "Wordlength for codon words", "12",
45             Argument_parse_int, &has.codon_wordlen);
46         /**/
47         ArgumentSet_add_option(as, 0, "dnahspdropoff", "score",
48             "DNA HSP dropoff score", "30",
49             Argument_parse_int, &has.dna_hsp_dropoff);
50         ArgumentSet_add_option(as, 0, "proteinhspdropoff", "score",
51             "Protein HSP dropoff score", "20",
52             Argument_parse_int, &has.protein_hsp_dropoff);
53         ArgumentSet_add_option(as, 0, "codonhspdropoff", "score",
54             "Codon HSP dropoff score", "40",
55             Argument_parse_int, &has.codon_hsp_dropoff);
56         /**/
57         ArgumentSet_add_option(as, 0, "dnahspthreshold", "score",
58             "DNA HSP threshold score", "75",
59             Argument_parse_int, &has.dna_hsp_threshold);
60         ArgumentSet_add_option(as, 0, "proteinhspthreshold", "score",
61             "Protein HSP threshold score", "30",
62             Argument_parse_int, &has.protein_hsp_threshold);
63         ArgumentSet_add_option(as, 0, "codonhspthreshold", "score",
64             "Codon HSP threshold score", "50",
65             Argument_parse_int, &has.codon_hsp_threshold);
66         /**/
67         ArgumentSet_add_option(as, 0, "dnawordlimit", "score",
68             "Score limit for dna word neighbourhood", "0",
69             Argument_parse_int, &has.dna_word_limit);
70         ArgumentSet_add_option(as, 0, "proteinwordlimit", "score",
71             "Score limit for protein word neighbourhood", "4",
72             Argument_parse_int, &has.protein_word_limit);
73         ArgumentSet_add_option(as, 0, "codonwordlimit", "score",
74             "Score limit for codon word neighbourhood", "4",
75             Argument_parse_int, &has.codon_word_limit);
76         /**/
77         ArgumentSet_add_option(as, '\0', "geneseed", "threshold",
78              "Geneseed Threshold", "0",
79              Argument_parse_int, &has.geneseed_threshold);
80         ArgumentSet_add_option(as, '\0', "geneseedrepeat", "number",
81              "Seeds per diagonal required for geneseed HSP seeding", "3",
82              Argument_parse_int, &has.geneseed_repeat);
83         /**/
84         Argument_absorb_ArgumentSet(arg, as);
85         }
86     return &has;
87     }
88 
89 /**/
90 
HSP_check_positions(HSPset * hsp_set,gint query_pos,gint target_pos)91 static gboolean HSP_check_positions(HSPset *hsp_set,
92                                     gint query_pos, gint target_pos){
93     g_assert(query_pos >= 0);
94     g_assert(target_pos >= 0);
95     g_assert(query_pos < hsp_set->query->len);
96     g_assert(target_pos < hsp_set->target->len);
97     return TRUE;
98     }
99 
HSP_check(HSP * hsp)100 static gboolean HSP_check(HSP *hsp){
101     g_assert(hsp);
102     g_assert(hsp->hsp_set);
103     g_assert(hsp->length);
104     g_assert(HSP_query_end(hsp) <= hsp->hsp_set->query->len);
105     g_assert(HSP_target_end(hsp) <= hsp->hsp_set->target->len);
106     return TRUE;
107     }
108 
HSP_Param_set_wordlen(HSP_Param * hsp_param,gint wordlen)109 void HSP_Param_set_wordlen(HSP_Param *hsp_param, gint wordlen){
110     if(wordlen <= 0)
111         g_error("Wordlength must be greater than zero");
112     hsp_param->wordlen = wordlen;
113     hsp_param->seedlen = hsp_param->wordlen
114                        / hsp_param->match->query->advance;
115     return;
116     }
117 
118 /**/
119 
HSP_Param_set_dna_hsp_threshold(HSP_Param * hsp_param,gint dna_hsp_threshold)120 void HSP_Param_set_dna_hsp_threshold(HSP_Param *hsp_param,
121                                      gint dna_hsp_threshold){
122     if(hsp_param->match->type == Match_Type_DNA2DNA)
123         hsp_param->threshold = dna_hsp_threshold;
124     return;
125     }
126 
HSP_Param_set_protein_hsp_threshold(HSP_Param * hsp_param,gint protein_hsp_threshold)127 void HSP_Param_set_protein_hsp_threshold(HSP_Param *hsp_param,
128                                          gint protein_hsp_threshold){
129     if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
130     || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
131     || (hsp_param->match->type == Match_Type_DNA2PROTEIN))
132         hsp_param->threshold = protein_hsp_threshold;
133     return;
134     }
135 
HSP_Param_set_codon_hsp_threshold(HSP_Param * hsp_param,gint codon_hsp_threshold)136 void HSP_Param_set_codon_hsp_threshold(HSP_Param *hsp_param,
137                                        gint codon_hsp_threshold){
138     if(hsp_param->match->type == Match_Type_CODON2CODON)
139         hsp_param->threshold = codon_hsp_threshold;
140     return;
141     }
142 
143 /**/
144 
HSP_Param_refresh_wordhood(HSP_Param * hsp_param)145 static void HSP_Param_refresh_wordhood(HSP_Param *hsp_param){
146     register WordHood_Alphabet *wha = NULL;
147     register Submat *submat;
148     if(hsp_param->wordhood)
149         WordHood_destroy(hsp_param->wordhood);
150     if(hsp_param->has->use_wordhood_dropoff && (!hsp_param->wordlimit)){
151         hsp_param->wordhood = NULL;
152     } else {
153         submat = (hsp_param->match->type == Match_Type_DNA2DNA)
154                ? hsp_param->match->mas->dna_submat
155                : hsp_param->match->mas->protein_submat;
156         wha = WordHood_Alphabet_create_from_Submat(
157                 (gchar*)hsp_param->match->comparison_alphabet->member,
158                 (gchar*)hsp_param->match->comparison_alphabet->member,
159                 submat, FALSE);
160         g_assert(wha);
161         hsp_param->wordhood = WordHood_create(wha, hsp_param->wordlimit,
162                                        hsp_param->has->use_wordhood_dropoff);
163         WordHood_Alphabet_destroy(wha);
164         }
165     return;
166     }
167 /* FIXME: optimisation: do not free/recreate wordhood when unnecessary */
168 
HSP_Param_set_dna_word_limit(HSP_Param * hsp_param,gint dna_word_limit)169 void HSP_Param_set_dna_word_limit(HSP_Param *hsp_param,
170                                   gint dna_word_limit){
171     if(hsp_param->match->type == Match_Type_DNA2DNA){
172         hsp_param->wordlimit = dna_word_limit;
173         HSP_Param_refresh_wordhood(hsp_param);
174         }
175     return;
176     }
177 
HSP_Param_set_protein_word_limit(HSP_Param * hsp_param,gint protein_word_limit)178 void HSP_Param_set_protein_word_limit(HSP_Param *hsp_param,
179                                       gint protein_word_limit){
180     if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
181     || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
182     || (hsp_param->match->type == Match_Type_DNA2PROTEIN)){
183         hsp_param->wordlimit = protein_word_limit;
184         HSP_Param_refresh_wordhood(hsp_param);
185         }
186     return;
187     }
188 
HSP_Param_set_codon_word_limit(HSP_Param * hsp_param,gint codon_word_limit)189 void HSP_Param_set_codon_word_limit(HSP_Param *hsp_param,
190                                     gint codon_word_limit){
191     if(hsp_param->match->type == Match_Type_CODON2CODON){
192         hsp_param->threshold = codon_word_limit;
193         HSP_Param_refresh_wordhood(hsp_param);
194         }
195     return;
196     }
197 
198 /**/
199 
HSP_Param_set_dna_hsp_dropoff(HSP_Param * hsp_param,gint dna_hsp_dropoff)200 void HSP_Param_set_dna_hsp_dropoff(HSP_Param *hsp_param,
201                                    gint dna_hsp_dropoff){
202     if(hsp_param->match->type == Match_Type_DNA2DNA)
203         hsp_param->dropoff = dna_hsp_dropoff;
204     return;
205     }
206 
HSP_Param_set_protein_hsp_dropoff(HSP_Param * hsp_param,gint protein_hsp_dropoff)207 void HSP_Param_set_protein_hsp_dropoff(HSP_Param *hsp_param,
208                                        gint protein_hsp_dropoff){
209     if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
210     || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
211     || (hsp_param->match->type == Match_Type_DNA2PROTEIN))
212         hsp_param->dropoff  = protein_hsp_dropoff;
213     return;
214     }
215 
HSP_Param_set_codon_hsp_dropoff(HSP_Param * hsp_param,gint codon_hsp_dropoff)216 void HSP_Param_set_codon_hsp_dropoff(HSP_Param *hsp_param,
217                                      gint codon_hsp_dropoff){
218     if(hsp_param->match->type == Match_Type_CODON2CODON)
219         hsp_param->dropoff = codon_hsp_dropoff;
220     return;
221     }
222 
223 /**/
224 
HSP_Param_set_hsp_threshold(HSP_Param * hsp_param,gint hsp_threshold)225 void HSP_Param_set_hsp_threshold(HSP_Param *hsp_param,
226                                  gint hsp_threshold){
227     hsp_param->threshold = hsp_threshold;
228     return;
229     }
230 
HSP_Param_set_seed_repeat(HSP_Param * hsp_param,gint seed_repeat)231 void HSP_Param_set_seed_repeat(HSP_Param *hsp_param,
232                                gint seed_repeat){
233     hsp_param->seed_repeat = seed_repeat;
234     return;
235     }
236 
HSP_Param_create(Match * match,gboolean use_horizon)237 HSP_Param *HSP_Param_create(Match *match, gboolean use_horizon){
238     register HSP_Param *hsp_param = g_new(HSP_Param, 1);
239     hsp_param->thread_ref = ThreadRef_create();
240     hsp_param->has = HSPset_ArgumentSet_create(NULL);
241     hsp_param->match = match;
242     hsp_param->seed_repeat = hsp_param->has->seed_repeat;
243     switch(match->type){
244         case Match_Type_DNA2DNA:
245             hsp_param->dropoff = hsp_param->has->dna_hsp_dropoff;
246             hsp_param->threshold = hsp_param->has->dna_hsp_threshold;
247             HSP_Param_set_wordlen(hsp_param, hsp_param->has->dna_wordlen);
248             hsp_param->wordlimit = hsp_param->has->dna_word_limit;
249             break;
250         case Match_Type_PROTEIN2PROTEIN: /*fallthrough*/
251         case Match_Type_PROTEIN2DNA:     /*fallthrough*/
252         case Match_Type_DNA2PROTEIN:
253             hsp_param->dropoff = hsp_param->has->protein_hsp_dropoff;
254             hsp_param->threshold
255                 = hsp_param->has->protein_hsp_threshold;
256             HSP_Param_set_wordlen(hsp_param, hsp_param->has->protein_wordlen);
257             hsp_param->wordlimit = hsp_param->has->protein_word_limit;
258             break;
259         case Match_Type_CODON2CODON:
260             hsp_param->dropoff = hsp_param->has->codon_hsp_dropoff;
261             hsp_param->threshold = hsp_param->has->codon_hsp_threshold;
262             HSP_Param_set_wordlen(hsp_param, hsp_param->has->codon_wordlen);
263             hsp_param->wordlimit = hsp_param->has->codon_word_limit;
264             g_assert(!(hsp_param->wordlen % 3));
265             break;
266         default:
267             g_error("Bad Match_Type [%d]", match->type);
268             break;
269         }
270     hsp_param->use_horizon = use_horizon;
271 #ifdef USE_PTHREADS
272     pthread_mutex_init(&hsp_param->hsp_recycle_lock, NULL);
273 #endif /* USE_PTHREADS */
274     hsp_param->hsp_recycle = RecycleBin_create("HSP", sizeof(HSP), 64);
275     hsp_param->wordhood = NULL;
276     HSP_Param_refresh_wordhood(hsp_param);
277     return hsp_param;
278     }
279 
HSP_Param_destroy(HSP_Param * hsp_param)280 void HSP_Param_destroy(HSP_Param *hsp_param){
281     g_assert(hsp_param);
282     if(ThreadRef_destroy(hsp_param->thread_ref))
283         return;
284     if(hsp_param->wordhood)
285         WordHood_destroy(hsp_param->wordhood);
286 #ifdef USE_PTHREADS
287     pthread_mutex_lock(&hsp_param->hsp_recycle_lock);
288 #endif /* USE_PTHREADS */
289     RecycleBin_destroy(hsp_param->hsp_recycle);
290 #ifdef USE_PTHREADS
291     pthread_mutex_unlock(&hsp_param->hsp_recycle_lock);
292     pthread_mutex_destroy(&hsp_param->hsp_recycle_lock);
293 #endif /* USE_PTHREADS */
294     g_free(hsp_param);
295     return;
296     }
297 
HSP_Param_share(HSP_Param * hsp_param)298 HSP_Param *HSP_Param_share(HSP_Param *hsp_param){
299     g_assert(hsp_param);
300     ThreadRef_share(hsp_param->thread_ref);
301     return hsp_param;
302     }
303 
HSP_Param_swap(HSP_Param * hsp_param)304 HSP_Param *HSP_Param_swap(HSP_Param *hsp_param){
305     g_assert(hsp_param);
306     return HSP_Param_create(Match_swap(hsp_param->match),
307                             hsp_param->use_horizon);
308     }
309 
HSPset_create(Sequence * query,Sequence * target,HSP_Param * hsp_param)310 HSPset *HSPset_create(Sequence *query, Sequence *target,
311                       HSP_Param *hsp_param){
312     register HSPset *hsp_set = g_new(HSPset, 1);
313     g_assert(query);
314     g_assert(target);
315     hsp_set->ref_count = 1;
316     hsp_set->query = Sequence_share(query);
317     hsp_set->target = Sequence_share(target);
318     hsp_set->param = HSP_Param_share(hsp_param);
319     /**/
320     if(hsp_param->use_horizon){
321         hsp_set->horizon = (gint****)Matrix4d_create(
322                                    1 + ((hsp_param->seed_repeat > 1)?2:0),
323                                    query->len,
324                                    hsp_param->match->query->advance,
325                                    hsp_param->match->target->advance,
326                                    sizeof(gint));
327     } else {
328         hsp_set->horizon = NULL;
329         }
330     hsp_set->hsp_list = g_ptr_array_new();
331     hsp_set->is_finalised = FALSE;
332     hsp_set->param->has = HSPset_ArgumentSet_create(NULL);
333     if(hsp_set->param->has->filter_threshold){
334         hsp_set->filter = g_new0(PQueue*, query->len);
335         hsp_set->pqueue_set = PQueueSet_create();
336     } else {
337         hsp_set->filter = NULL;
338         hsp_set->pqueue_set = NULL;
339         }
340     hsp_set->is_empty = TRUE;
341     return hsp_set;
342     }
343 /* horizon[score(,repeat_count,diag)]
344  *        [query_len]
345  *        [query_advance]
346  *        [target_advance]
347  */
348 
349 /**/
350 
HSPset_share(HSPset * hsp_set)351 HSPset *HSPset_share(HSPset *hsp_set){
352     g_assert(hsp_set);
353     hsp_set->ref_count++;
354     return hsp_set;
355     }
356 
HSPset_destroy(HSPset * hsp_set)357 void HSPset_destroy(HSPset *hsp_set){
358     register gint i;
359     register HSP *hsp;
360     g_assert(hsp_set);
361     if(--hsp_set->ref_count)
362         return;
363     HSP_Param_destroy(hsp_set->param);
364     if(hsp_set->filter)
365         g_free(hsp_set->filter);
366     if(hsp_set->pqueue_set)
367         PQueueSet_destroy(hsp_set->pqueue_set);
368     Sequence_destroy(hsp_set->query);
369     Sequence_destroy(hsp_set->target);
370     for(i = 0; i < hsp_set->hsp_list->len; i++){
371         hsp = hsp_set->hsp_list->pdata[i];
372         HSP_destroy(hsp);
373         }
374     g_ptr_array_free(hsp_set->hsp_list, TRUE);
375     if(hsp_set->horizon)
376         g_free(hsp_set->horizon);
377     g_free(hsp_set);
378     return;
379     }
380 
HSPset_swap(HSPset * hsp_set,HSP_Param * hsp_param)381 void HSPset_swap(HSPset *hsp_set, HSP_Param *hsp_param){
382     register Sequence *query;
383     register gint i, query_start;
384     register HSP *hsp;
385     g_assert(hsp_set->ref_count == 1);
386     /* Swap query and target */
387     query = hsp_set->query;
388     hsp_set->query = hsp_set->target;
389     hsp_set->target = query;
390     /* Switch parameters */
391     HSP_Param_destroy(hsp_set->param);
392     hsp_set->param = HSP_Param_share(hsp_param);
393     /* Swap HSPs coordinates */
394     for(i = 0; i < hsp_set->hsp_list->len; i++){
395         hsp = hsp_set->hsp_list->pdata[i];
396         query_start = hsp->query_start;
397         hsp->query_start = hsp->target_start;
398         hsp->target_start = query_start;
399         }
400     return;
401     }
402 
HSPset_revcomp(HSPset * hsp_set)403 void HSPset_revcomp(HSPset *hsp_set){
404     register Sequence *rc_query = Sequence_revcomp(hsp_set->query),
405                       *rc_target = Sequence_revcomp(hsp_set->target);
406     register gint i;
407     register HSP *hsp;
408     g_assert(hsp_set);
409     g_assert(hsp_set->is_finalised);
410     g_assert(hsp_set->ref_count == 1);
411     /**/
412     Sequence_destroy(hsp_set->query);
413     Sequence_destroy(hsp_set->target);
414     hsp_set->query = rc_query;
415     hsp_set->target = rc_target;
416     for(i = 0; i < hsp_set->hsp_list->len; i++){
417         hsp = hsp_set->hsp_list->pdata[i];
418         hsp->query_start = hsp_set->query->len - HSP_query_end(hsp);
419         hsp->target_start = hsp_set->target->len - HSP_target_end(hsp);
420         }
421     return;
422     }
423 
424 
HSP_find_cobs(HSP * hsp)425 static gint HSP_find_cobs(HSP *hsp){
426     register gint i, query_pos = hsp->query_start,
427                      target_pos = hsp->target_start;
428     register Match_Score score = 0;
429     /* Find the HSP centre offset by score */
430     for(i = 0; i < hsp->length; i++){
431         g_assert(HSP_check_positions(hsp->hsp_set,
432                                      query_pos, target_pos));
433         score += HSP_get_score(hsp, query_pos, target_pos);
434         if(score >= (hsp->score>>1))
435             break;
436         query_pos += HSP_query_advance(hsp);
437         target_pos += HSP_target_advance(hsp);
438         }
439     return i;
440     }
441 
442 /**/
HSP_print_info(HSP * hsp)443 static void HSP_print_info(HSP *hsp){
444     g_print("HSP info (%p)\n"
445             "    query_start  = [%d]\n"
446             "    target_start = [%d]\n"
447             "    length       = [%d]\n"
448             "    score        = [%d]\n"
449             "    cobs         = [%d]\n",
450             (gpointer)hsp,
451             hsp->query_start,
452             hsp->target_start,
453             hsp->length,
454             hsp->score,
455             hsp->cobs);
456     return;
457     }
458 
459 typedef struct {
460         HSP *hsp;         /* not freed */
461        gint  padding;
462        gint  max_advance;
463        gint  query_display_pad;
464        gint  target_display_pad;
465     GString *top;
466     GString *mid;
467     GString *low;
468 } HSP_Display;
469 
HSP_Display_create(HSP * hsp,gint padding)470 static HSP_Display *HSP_Display_create(HSP *hsp, gint padding){
471     register HSP_Display *hd = g_new(HSP_Display, 1);
472     register gint approx_length;
473     hd->hsp = hsp;
474     hd->padding = padding;
475     hd->max_advance = MAX(HSP_query_advance(hsp),
476                           HSP_target_advance(hsp));
477     hd->query_display_pad = (hd->max_advance
478                              -HSP_query_advance(hsp))>>1;
479     hd->target_display_pad = (hd->max_advance
480                               -HSP_target_advance(hsp))>>1;
481     approx_length = hd->max_advance * (hsp->length + 2);
482     hd->top = g_string_sized_new(approx_length);
483     hd->mid = g_string_sized_new(approx_length);
484     hd->low = g_string_sized_new(approx_length);
485     return hd;
486     }
487 
HSP_Display_destroy(HSP_Display * hd)488 static void HSP_Display_destroy(HSP_Display *hd){
489     g_assert(hd);
490     g_string_free(hd->top, TRUE);
491     g_string_free(hd->mid, TRUE);
492     g_string_free(hd->low, TRUE);
493     g_free(hd);
494     return;
495     }
496 
HSP_Display_add(HSP_Display * hd,gchar * top,gchar * mid,gchar * low)497 static void HSP_Display_add(HSP_Display *hd,
498                             gchar *top, gchar *mid, gchar *low){
499     g_assert(hd);
500     g_assert(top);
501     g_assert(mid);
502     g_assert(low);
503     g_string_append(hd->top, top);
504     g_string_append(hd->mid, mid);
505     g_string_append(hd->low, low);
506     g_assert(hd->top->len == hd->mid->len);
507     g_assert(hd->mid->len == hd->low->len);
508     return;
509     }
510 
HSP_Display_get_ruler_char(HSP_Display * hd,gint pos,gint advance)511 static gchar HSP_Display_get_ruler_char(HSP_Display *hd, gint pos,
512                                        gint advance){
513     register gint stop;
514     register gint pad_length = 3;
515     stop = hd->padding * hd->max_advance;
516     if(pos >= stop){
517         if(pos < (stop+pad_length)){
518             return '#';
519             }
520         stop = ((hd->padding+hd->hsp->length) * hd->max_advance)
521              + pad_length;
522         if(pos >= stop){
523             if(pos < (stop+pad_length)){
524                 return '#';
525                 }
526             pos -= pad_length;
527             }
528         pos -= pad_length;
529         }
530     if((pos/advance) & 1)
531         return '=';
532     return '-';
533     }
534 
HSP_Display_print_ruler(HSP_Display * hd,gint width,gint pos,gboolean is_query)535 static void HSP_Display_print_ruler(HSP_Display *hd, gint width,
536                                     gint pos, gboolean is_query){
537     register gint i, adv;
538     if(is_query){
539         if(HSP_target_advance(hd->hsp) == 1)
540             return; /* opposite padding */
541         adv = HSP_target_advance(hd->hsp);
542     } else { /* Is target */
543         if(HSP_query_advance(hd->hsp) == 1)
544             return; /* opposite padding */
545         adv = HSP_query_advance(hd->hsp);
546         }
547     g_print(" ruler:[");
548     for(i = 0; i < width; i++)
549         g_print("%c", HSP_Display_get_ruler_char(hd, pos+i, adv));
550     g_print("]\n");
551     return;
552     }
553 
HSP_Display_print(HSP_Display * hd)554 static void HSP_Display_print(HSP_Display *hd){
555     register gint pos, pause, width = 50;
556     g_assert(hd);
557     g_assert(hd->top->len == hd->mid->len);
558     g_assert(hd->mid->len == hd->low->len);
559     for(pos = 0, pause = hd->top->len-width; pos < pause; pos += width){
560         HSP_Display_print_ruler(hd, width, pos, TRUE);
561         g_print(" query:[%.*s]\n       [%.*s]\ntarget:[%.*s]\n",
562                 width, hd->top->str+pos,
563                 width, hd->mid->str+pos,
564                 width, hd->low->str+pos);
565         HSP_Display_print_ruler(hd, width, pos, FALSE);
566         g_print("\n");
567         }
568     HSP_Display_print_ruler(hd, hd->top->len-pos, pos, TRUE);
569     g_print(" query:[%.*s]\n       [%.*s]\ntarget:[%.*s]\n",
570             (gint)(hd->top->len-pos), hd->top->str+pos,
571             (gint)(hd->mid->len-pos), hd->mid->str+pos,
572             (gint)(hd->low->len-pos), hd->low->str+pos);
573     HSP_Display_print_ruler(hd, hd->top->len-pos, pos, FALSE);
574     g_print("\n");
575     return;
576     }
577 
HSP_Display_insert(HSP_Display * hd,gint position)578 static void HSP_Display_insert(HSP_Display *hd, gint position){
579     register gint query_pos, target_pos;
580     register gboolean is_padding,
581                       query_valid = TRUE, target_valid = TRUE;
582     register Match *match = hd->hsp->hsp_set->param->match;
583     gchar query_str[4] = {0},
584           target_str[4] = {0},
585           equiv_str[4] = {0};
586     g_assert(hd);
587     query_pos  = hd->hsp->query_start
588                + (HSP_query_advance(hd->hsp) * position);
589     target_pos = hd->hsp->target_start
590                + (HSP_target_advance(hd->hsp) * position);
591     /* If outside HSP, then is_padding */
592     is_padding = ((position < 0) || (position >= hd->hsp->length));
593     /* If outside seqs, then invalid */
594     query_valid = ( (query_pos >= 0)
595                   &&((query_pos+HSP_query_advance(hd->hsp))
596                       <= hd->hsp->hsp_set->query->len));
597     target_valid = ((target_pos >= 0)
598           &&((target_pos+HSP_target_advance(hd->hsp))
599               <= hd->hsp->hsp_set->target->len));
600     /* Get equiv string */
601     if(query_valid && target_valid){
602         g_assert(HSP_check_positions(hd->hsp->hsp_set,
603                                      query_pos, target_pos));
604         HSP_get_display(hd->hsp, query_pos, target_pos, equiv_str);
605     } else {
606         strncpy(equiv_str, "###", hd->max_advance);
607         equiv_str[hd->max_advance] = '\0';
608         }
609     /* Get query string */
610     if(query_valid){
611         Match_Strand_get_raw(match->query, hd->hsp->hsp_set->query,
612                              query_pos, query_str);
613         if((match->query->advance == 1) && (hd->max_advance == 3)){
614             query_str[1] = query_str[0];
615             query_str[0] = query_str[2] = ' ';
616             query_str[3] = '\0';
617             }
618     } else {
619         strncpy(query_str, "###", hd->max_advance);
620         query_str[hd->max_advance] = '\0';
621         }
622     /* Get target string */
623     if(target_valid){
624         Match_Strand_get_raw(match->target, hd->hsp->hsp_set->target,
625                              target_pos, target_str);
626         if((match->target->advance == 1) && (hd->max_advance == 3)){
627             target_str[1] = target_str[0];
628             target_str[0] = target_str[2] = ' ';
629             target_str[3] = '\0';
630             }
631     } else {
632         strncpy(target_str, "###", hd->max_advance);
633         target_str[hd->max_advance] = '\0';
634         }
635     /* Make lower case for padding */
636     if(is_padding){
637         g_strdown(query_str);
638         g_strdown(target_str);
639     } else {
640         g_strup(query_str);
641         g_strup(target_str);
642         }
643     HSP_Display_add(hd, query_str, equiv_str, target_str);
644     return;
645     }
646 
HSP_print_alignment(HSP * hsp)647 static void HSP_print_alignment(HSP *hsp){
648     register HSP_Display *hd = HSP_Display_create(hsp, 10);
649     register gint i;
650     for(i = 0; i < hd->padding; i++) /* Pre-padding */
651         HSP_Display_insert(hd, i-hd->padding);
652     /* Use pad_length == 3 */
653     HSP_Display_add(hd, " < ", " < ", " < "); /* Start divider */
654     for(i = 0; i < hsp->length; i++) /* The HSP itself */
655         HSP_Display_insert(hd, i);
656     HSP_Display_add(hd, " > ", " > ", " > "); /* End divider */
657     for(i = 0; i < hd->padding; i++) /* Post-padding */
658         HSP_Display_insert(hd, hsp->length+i);
659     HSP_Display_print(hd);
660     HSP_Display_destroy(hd);
661     return;
662     }
663 /*
664  * HSP display style:
665  *
666  *  =-=-=-   =-=-=-=-=-=-=-=-=-   =-=-=-
667  *  nnnnnn < ACGACGCCCACGATCGAT > nnn###
668  *     ||| < |||:::|||   |||||| > |||
669  *  ### x  <  A  R  N  D  C  Q  >  x ###
670  *  ===---   ===---===---===---   ===---
671  */
672 
HSP_print_sugar(HSP * hsp)673 static void HSP_print_sugar(HSP *hsp){
674     g_print("sugar: %s %d %d %c %s %d %d %c %d\n",
675             hsp->hsp_set->query->id,
676             hsp->query_start,
677             hsp->length*HSP_query_advance(hsp),
678             Sequence_get_strand_as_char(hsp->hsp_set->query),
679             hsp->hsp_set->target->id,
680             hsp->target_start,
681             hsp->length*HSP_target_advance(hsp),
682             Sequence_get_strand_as_char(hsp->hsp_set->target),
683             hsp->score);
684     return;
685     }
686 /* Sugar output format:
687  * sugar: <query_id> <query_start> <query_length> <query_strand>
688  *        <target_id> <target_start> <target_start> <target_strand>
689  *        <score>
690  */
691 
HSP_print(HSP * hsp,gchar * name)692 void HSP_print(HSP *hsp, gchar *name){
693     g_print("draw_hsp(%d, %d, %d, %d, %d, %d, \"%s\")\n",
694             hsp->query_start,
695             hsp->target_start,
696             hsp->length,
697             hsp->cobs,
698             HSP_query_advance(hsp),
699             HSP_target_advance(hsp),
700             name);
701     HSP_print_info(hsp);
702     HSP_print_alignment(hsp);
703     HSP_print_sugar(hsp);
704     return;
705     }
706 
HSPset_print(HSPset * hsp_set)707 void HSPset_print(HSPset *hsp_set){
708     register gint i;
709     register gchar *name;
710     g_print("HSPset [%p] contains [%d] hsps\n",
711             (gpointer)hsp_set, hsp_set->hsp_list->len);
712     g_print("Comparison of [%s] and [%s]\n",
713             hsp_set->query->id, hsp_set->target->id);
714     for(i = 0; i < hsp_set->hsp_list->len; i++){
715         name = g_strdup_printf("hsp [%d]", i+1);
716         HSP_print(hsp_set->hsp_list->pdata[i], name);
717         g_free(name);
718         }
719     return;
720     }
721 
722 /**/
723 
HSP_init(HSP * nh)724 static void HSP_init(HSP *nh){
725     register gint i;
726     register gint query_pos, target_pos;
727     g_assert(HSP_check(nh));
728     /* Initial hsp score */
729     query_pos = nh->query_start;
730     target_pos = nh->target_start;
731     nh->score = 0;
732     for(i = 0; i < nh->length; i++){
733         g_assert(HSP_check_positions(nh->hsp_set,
734                                      query_pos, target_pos));
735         nh->score += HSP_get_score(nh, query_pos, target_pos);
736         query_pos += HSP_query_advance(nh);
737         target_pos += HSP_target_advance(nh);
738         }
739     if(nh->score < 0){
740         HSP_print(nh, "Bad HSP seed");
741         g_error("Initial HSP score [%d] less than zero", nh->score);
742         }
743     g_assert(HSP_check(nh));
744     return;
745     }
746 
HSP_extend(HSP * nh,gboolean forbid_masked)747 static void HSP_extend(HSP *nh, gboolean forbid_masked){
748     register Match_Score score, maxscore;
749     register gint query_pos, target_pos;
750     register gint extend, maxext;
751     g_assert(HSP_check(nh));
752     /* extend left */
753     maxscore = score = nh->score;
754     query_pos = nh->query_start-HSP_query_advance(nh);
755     target_pos = nh->target_start-HSP_target_advance(nh);
756     for(extend = 1, maxext = 0;
757        ((query_pos >= 0) && (target_pos >= 0));
758        extend++){
759         g_assert(HSP_check_positions(nh->hsp_set,
760                                      query_pos, target_pos));
761         if((forbid_masked)
762         && (HSP_query_masked(nh, query_pos)
763          || HSP_target_masked(nh, target_pos)))
764             break;
765         score += HSP_get_score(nh, query_pos, target_pos);
766         if(maxscore <= score){
767             maxscore = score;
768             maxext   = extend;
769         } else {
770             if(score < 0) /* See note below */
771                 break;
772             if((maxscore-score) >= nh->hsp_set->param->dropoff)
773                 break;
774             }
775         query_pos -= HSP_query_advance(nh);
776         target_pos -= HSP_target_advance(nh);
777         }
778     query_pos = HSP_query_end(nh);
779     target_pos = HSP_target_end(nh);
780     nh->query_start -= (maxext * HSP_query_advance(nh));
781     nh->target_start -= (maxext * HSP_target_advance(nh));
782     nh->length += maxext;
783     score = maxscore;
784     /* extend right */
785     for(extend = 1, maxext = 0;
786        (  ((query_pos+HSP_query_advance(nh))
787            <= nh->hsp_set->query->len)
788        && ((target_pos+HSP_target_advance(nh))
789            <= nh->hsp_set->target->len) );
790        extend++){
791         g_assert(HSP_check_positions(nh->hsp_set,
792                                      query_pos, target_pos));
793         if((forbid_masked)
794         && (HSP_query_masked(nh, query_pos)
795          || HSP_target_masked(nh, target_pos)))
796             break;
797         score += HSP_get_score(nh, query_pos, target_pos);
798         if(maxscore <= score){
799             maxscore = score;
800             maxext = extend;
801         } else {
802             if(score < 0) /* See note below */
803                 break;
804             if((maxscore-score) >= nh->hsp_set->param->dropoff)
805                 break;
806             }
807         query_pos += HSP_query_advance(nh);
808         target_pos += HSP_target_advance(nh);
809         }
810     nh->score = maxscore;
811     nh->length += maxext;
812     g_assert(HSP_check(nh));
813     return;
814     }
815 /* The score cannot be allowed to drop below zero in the HSP,
816  * as this can result in overlapping HSPs in some circumstances.
817  */
818 
HSP_create(HSP * nh)819 static HSP *HSP_create(HSP *nh){
820     register HSP *hsp;
821 #ifdef USE_PTHREADS
822     pthread_mutex_lock(&nh->hsp_set->param->hsp_recycle_lock);
823 #endif /* USE_PTHREADS */
824     hsp = RecycleBin_alloc(nh->hsp_set->param->hsp_recycle);
825 #ifdef USE_PTHREADS
826     pthread_mutex_unlock(&nh->hsp_set->param->hsp_recycle_lock);
827 #endif /* USE_PTHREADS */
828     hsp->hsp_set = nh->hsp_set;
829     hsp->query_start = nh->query_start;
830     hsp->target_start = nh->target_start;
831     hsp->length = nh->length;
832     hsp->score = nh->score;
833     hsp->cobs = nh->cobs; /* Value can be set by HSPset_finalise(); */
834     return hsp;
835     }
836 
HSP_destroy(HSP * hsp)837 void HSP_destroy(HSP *hsp){
838     register HSPset *hsp_set = hsp->hsp_set;
839 #ifdef USE_PTHREADS
840     pthread_mutex_lock(&hsp_set->param->hsp_recycle_lock);
841 #endif /* USE_PTHREADS */
842     RecycleBin_recycle(hsp_set->param->hsp_recycle, hsp);
843 #ifdef USE_PTHREADS
844     pthread_mutex_unlock(&hsp_set->param->hsp_recycle_lock);
845 #endif /* USE_PTHREADS */
846     return;
847     }
848 
HSP_trim_ends(HSP * hsp)849 static void HSP_trim_ends(HSP *hsp){
850     register gint i;
851     register gint query_pos, target_pos;
852     /* Trim left to first good match */
853     g_assert(HSP_check(hsp));
854     for(i = 0; i < hsp->length; i++){
855         if(HSP_get_score(hsp, hsp->query_start, hsp->target_start) > 0)
856             break;
857         hsp->query_start += HSP_query_advance(hsp);
858         hsp->target_start += HSP_target_advance(hsp);
859         }
860     hsp->length -= i;
861     /**/
862     g_assert(HSP_check(hsp));
863     query_pos = HSP_query_end(hsp) - HSP_query_advance(hsp);
864     target_pos = HSP_target_end(hsp) - HSP_target_advance(hsp);
865     /* Trim right to last good match */
866     while(hsp->length > 0){
867         g_assert(HSP_check_positions(hsp->hsp_set,
868                                      query_pos, target_pos));
869         if(HSP_get_score(hsp, query_pos, target_pos) > 0)
870             break;
871         hsp->length--;
872         query_pos -= HSP_query_advance(hsp);
873         target_pos -= HSP_target_advance(hsp);
874         }
875     g_assert(HSP_check(hsp));
876     return;
877     }
878 /* This is to remove any unmatching ends from the HSP seed.
879  */
880 
HSP_PQueue_comp_func(gpointer low,gpointer high,gpointer user_data)881 static gboolean HSP_PQueue_comp_func(gpointer low, gpointer high,
882                                      gpointer user_data){
883     register HSP *hsp_low = (HSP*)low, *hsp_high = (HSP*)high;
884     return hsp_low->score - hsp_high->score;
885     }
886 
HSP_store(HSP * nascent_hsp)887 static void HSP_store(HSP *nascent_hsp){
888     register HSPset *hsp_set = nascent_hsp->hsp_set;
889     register PQueue *pq;
890     register HSP *hsp = NULL;
891     g_assert(nascent_hsp);
892     if(nascent_hsp->score < hsp_set->param->threshold)
893         return;
894     if(hsp_set->param->has->filter_threshold){ /* If have filter */
895         /* Get cobs value */
896         nascent_hsp->cobs = HSP_find_cobs(nascent_hsp);
897         pq = hsp_set->filter[HSP_query_cobs(nascent_hsp)];
898         if(pq){ /* Put in PQueue if better than worst */
899             if(PQueue_total(pq)
900                < hsp_set->param->has->filter_threshold){
901                 hsp = HSP_create(nascent_hsp);
902                 PQueue_push(pq, hsp);
903             } else {
904                 g_assert(PQueue_total(pq));
905                 hsp = PQueue_top(pq);
906                 if(hsp->score < nascent_hsp->score){
907                     hsp = PQueue_pop(pq);
908                     HSP_destroy(hsp);
909                     hsp = HSP_create(nascent_hsp);
910                     PQueue_push(pq, hsp);
911                     }
912                 }
913         } else {
914             pq = PQueue_create(hsp_set->pqueue_set,
915                                HSP_PQueue_comp_func, NULL);
916             hsp_set->filter[HSP_query_cobs(nascent_hsp)] = pq;
917             hsp = HSP_create(nascent_hsp);
918             PQueue_push(pq, hsp);
919             hsp_set->is_empty = FALSE;
920             }
921     } else {
922         hsp = HSP_create(nascent_hsp);
923         g_ptr_array_add(hsp_set->hsp_list, hsp);
924         hsp_set->is_empty = FALSE;
925         }
926     return;
927     }
928 /* FIXME: optimisation: could store HSPs as a list up until
929  *                      filter_threshold, then convert to a PQueue
930  */
931 
HSPset_seed_hsp(HSPset * hsp_set,guint query_start,guint target_start)932 void HSPset_seed_hsp(HSPset *hsp_set,
933                      guint query_start, guint target_start){
934     register gint diag_pos
935         = ((target_start * hsp_set->param->match->query->advance)
936           -(query_start * hsp_set->param->match->target->advance));
937     register gint query_frame = query_start
938                               % hsp_set->param->match->query->advance,
939                   target_frame = target_start
940                                % hsp_set->param->match->target->advance;
941     register gint section_pos = (diag_pos + hsp_set->query->len)
942                               % hsp_set->query->len;
943     HSP nascent_hsp;
944     g_assert(!hsp_set->is_finalised);
945     /**/
946     g_assert(section_pos >= 0);
947     g_assert(section_pos < hsp_set->query->len);
948     /* Clear if diag has changed */
949     if(hsp_set->param->seed_repeat > 1){
950         if(hsp_set->horizon[2][section_pos][query_frame][target_frame]
951                != (diag_pos + hsp_set->query->len)){
952             hsp_set->horizon[0][section_pos][query_frame][target_frame] = 0;
953             hsp_set->horizon[1][section_pos][query_frame][target_frame] = 0;
954             hsp_set->horizon[2][section_pos][query_frame][target_frame]
955                 = diag_pos + hsp_set->query->len;
956             }
957         }
958     /* Check whether we have seen this HSP already */
959     if(target_start < hsp_set->horizon[0]
960                                       [section_pos]
961                                       [query_frame]
962                                       [target_frame])
963         return;
964     if(hsp_set->param->seed_repeat > 1){
965         if(++hsp_set->horizon[1][section_pos][query_frame][target_frame]
966                 < hsp_set->param->seed_repeat)
967             return;
968         hsp_set->horizon[1][section_pos][query_frame][target_frame] = 0;
969         }
970     /* Nascent HSP building: */
971     nascent_hsp.hsp_set      = hsp_set;
972     nascent_hsp.query_start  = query_start;
973     nascent_hsp.target_start = target_start;
974     nascent_hsp.length       = hsp_set->param->seedlen;
975     nascent_hsp.cobs         = 0;
976     g_assert(HSP_check(&nascent_hsp));
977     HSP_trim_ends(&nascent_hsp);
978     /* Score is irrelevant before HSP_init() */
979     HSP_init(&nascent_hsp);
980     /* Try to make above threshold HSP using masking */
981     if(hsp_set->param->match->query->mask_func
982     || hsp_set->param->match->target->mask_func){
983         HSP_extend(&nascent_hsp, TRUE);
984         if(nascent_hsp.score < hsp_set->param->threshold){
985             hsp_set->horizon[0][section_pos][query_frame][target_frame]
986                 = HSP_target_end(&nascent_hsp);
987             return;
988             }
989         }
990     /* Extend the HSP again ignoring masking */
991     HSP_extend(&nascent_hsp, FALSE);
992     HSP_store(&nascent_hsp);
993     hsp_set->horizon[0][section_pos][query_frame][target_frame]
994             = HSP_target_end(&nascent_hsp);
995     return;
996     }
997 
HSPset_add_known_hsp(HSPset * hsp_set,guint query_start,guint target_start,guint length)998 void HSPset_add_known_hsp(HSPset *hsp_set,
999                           guint query_start, guint target_start,
1000                           guint length){
1001     HSP nascent_hsp;
1002     nascent_hsp.hsp_set      = hsp_set;
1003     nascent_hsp.query_start  = query_start;
1004     nascent_hsp.target_start = target_start;
1005     nascent_hsp.length       = length;
1006     nascent_hsp.cobs         = 0;
1007     /* Score is irrelevant before HSP_init() */
1008     HSP_init(&nascent_hsp);
1009     HSP_store(&nascent_hsp);
1010     return;
1011     }
1012 
HSPset_seed_hsp_sorted(HSPset * hsp_set,guint query_start,guint target_start,gint *** horizon)1013 static void HSPset_seed_hsp_sorted(HSPset *hsp_set,
1014                      guint query_start, guint target_start,
1015                      gint ***horizon){
1016     HSP nascent_hsp;
1017     register gint diag_pos
1018         = ((target_start * hsp_set->param->match->query->advance)
1019           -(query_start * hsp_set->param->match->target->advance));
1020     register gint query_frame = query_start
1021                               % hsp_set->param->match->query->advance,
1022                   target_frame = target_start
1023                                % hsp_set->param->match->target->advance;
1024     register gint section_pos = (diag_pos + hsp_set->query->len)
1025                               % hsp_set->query->len;
1026     g_assert(!hsp_set->is_finalised);
1027     g_assert(!hsp_set->horizon);
1028     g_assert(section_pos >= 0);
1029     g_assert(section_pos < hsp_set->query->len);
1030     /**/
1031     if(horizon[1][query_frame][target_frame] != section_pos){
1032         horizon[1][query_frame][target_frame] = section_pos;
1033         horizon[0][query_frame][target_frame] = 0;
1034         horizon[2][query_frame][target_frame] = 0;
1035         }
1036 /* FIXME: seedrepeat overflow here */
1037     if(++horizon[2][query_frame][target_frame] < hsp_set->param->seed_repeat)
1038         return;
1039     horizon[2][query_frame][target_frame] = 0;
1040     /* Check whether we have seen this HSP already */
1041     if(target_start < horizon[0][query_frame][target_frame])
1042         return;
1043     /**/
1044     /* Nascent HSP building: */
1045     nascent_hsp.hsp_set      = hsp_set;
1046     nascent_hsp.query_start  = query_start;
1047     nascent_hsp.target_start = target_start;
1048     nascent_hsp.length       = hsp_set->param->seedlen;
1049     nascent_hsp.cobs         = 0;
1050     g_assert(HSP_check(&nascent_hsp));
1051     HSP_trim_ends(&nascent_hsp);
1052     /* Score is irrelevant before HSP_init() */
1053     HSP_init(&nascent_hsp);
1054     /* Try to make above threshold HSP using masking */
1055     if(hsp_set->param->match->query->mask_func
1056     || hsp_set->param->match->target->mask_func){
1057         HSP_extend(&nascent_hsp, TRUE);
1058         if(nascent_hsp.score < hsp_set->param->threshold){
1059             horizon[0][query_frame][target_frame]
1060                 = HSP_target_end(&nascent_hsp);
1061             return;
1062             }
1063         }
1064     /* Extend the HSP again ignoring masking */
1065     HSP_extend(&nascent_hsp, FALSE);
1066     HSP_store(&nascent_hsp);
1067     /**/
1068     horizon[0][query_frame][target_frame] = HSP_target_end(&nascent_hsp);
1069     return;
1070     }
1071 /* horizon[0] = diag
1072  * horizon[1] = target_end
1073  * horizon[2] = repeat_count
1074  */
1075 
1076 /* Need to use the global to pass q,t advance to qsort compare func */
1077 static HSPset *HSPset_seed_compare_hsp_set = NULL;
1078 
HSPset_seed_compare(const void * a,const void * b)1079 static int HSPset_seed_compare(const void *a, const void *b){
1080     register guint *seed_a = (guint*)a,
1081                    *seed_b = (guint*)b;
1082     register gint diag_a, diag_b;
1083     register HSPset *hsp_set = HSPset_seed_compare_hsp_set;
1084     g_assert(hsp_set);
1085     diag_a = ((seed_a[1] * hsp_set->param->match->query->advance)
1086            -  (seed_a[0] * hsp_set->param->match->target->advance)),
1087     diag_b = ((seed_b[1] * hsp_set->param->match->query->advance)
1088            -  (seed_b[0] * hsp_set->param->match->target->advance));
1089     if(diag_a == diag_b)
1090         return seed_a[0] - seed_b[0];
1091     return diag_a - diag_b;
1092     }
1093 
HSPset_seed_all_hsps(HSPset * hsp_set,guint * seed_list,guint seed_list_len)1094 void HSPset_seed_all_hsps(HSPset *hsp_set,
1095                           guint *seed_list, guint seed_list_len){
1096     register gint i;
1097     register gint ***horizon;
1098     register gint qpos, tpos;
1099     if(seed_list_len > 1){
1100         HSPset_seed_compare_hsp_set = hsp_set;
1101         qsort(seed_list, seed_list_len, sizeof(guint) << 1,
1102               HSPset_seed_compare);
1103         HSPset_seed_compare_hsp_set = NULL;
1104         }
1105     if(seed_list_len){
1106         horizon = (gint***)Matrix3d_create(3,
1107                      hsp_set->param->match->query->advance,
1108                      hsp_set->param->match->target->advance,
1109                      sizeof(gint));
1110         for(i = 0; i < seed_list_len; i++){
1111             HSPset_seed_hsp_sorted(hsp_set,
1112                                    seed_list[(i << 1)],
1113                                    seed_list[(i << 1) + 1],
1114                                    horizon);
1115             qpos = seed_list[(i << 1)];
1116             tpos = seed_list[(i << 1) + 1];
1117             }
1118         g_free(horizon);
1119         }
1120     HSPset_finalise(hsp_set);
1121     return;
1122     }
1123 
1124 /**/
1125 
HSPset_finalise(HSPset * hsp_set)1126 HSPset *HSPset_finalise(HSPset *hsp_set){
1127     register gint i;
1128     register HSP *hsp;
1129     register PQueue *pq;
1130     g_assert(!hsp_set->is_finalised);
1131     hsp_set->is_finalised = TRUE;
1132     if(hsp_set->param->has->filter_threshold && (!hsp_set->is_empty)){
1133         /* Get HSPs from each PQueue */
1134         for(i = 0; i < hsp_set->query->len; i++){
1135             pq = hsp_set->filter[i];
1136             if(pq){
1137                 while(PQueue_total(pq)){
1138                     hsp = PQueue_pop(pq);
1139                     g_ptr_array_add(hsp_set->hsp_list, hsp);
1140                     }
1141                 }
1142             }
1143         }
1144     /* Set cobs for each HSP */
1145     if(!hsp_set->param->has->filter_threshold){
1146         for(i = 0; i < hsp_set->hsp_list->len; i++){
1147             hsp = hsp_set->hsp_list->pdata[i];
1148             hsp->cobs = HSP_find_cobs(hsp);
1149             }
1150         }
1151     hsp_set->is_finalised = TRUE;
1152     return hsp_set;
1153     }
1154 
1155 /**/
1156 
HSPset_sort_by_diag_then_query_start(const void * a,const void * b)1157 static int HSPset_sort_by_diag_then_query_start(const void *a,
1158                                                 const void *b){
1159     register HSP **hsp_a = (HSP**)a, **hsp_b = (HSP**)b;
1160     register gint diag_a = HSP_diagonal(*hsp_a),
1161                   diag_b = HSP_diagonal(*hsp_b);
1162     if(diag_a == diag_b)
1163         return (*hsp_a)->query_start - (*hsp_b)->query_start;
1164     return diag_a - diag_b;
1165     }
1166 
HSP_score_overlap(HSP * left,HSP * right)1167 static Match_Score HSP_score_overlap(HSP *left, HSP *right){
1168     register Match_Score score = 0;
1169     register gint query_pos, target_pos;
1170     g_assert(left->hsp_set == right->hsp_set);
1171     g_assert(HSP_diagonal(left) == HSP_diagonal(right));
1172     query_pos = HSP_query_end(left) - HSP_query_advance(left);
1173     target_pos = HSP_target_end(left) - HSP_target_advance(left);
1174     while(query_pos >= right->query_start){
1175         score += HSP_get_score(left, query_pos, target_pos);
1176         query_pos -= HSP_query_advance(left);
1177         target_pos -= HSP_target_advance(left);
1178         }
1179     query_pos = right->query_start;
1180     target_pos = right->target_start;
1181     while(query_pos < (HSP_query_end(left)- HSP_query_advance(right))){
1182         score += HSP_get_score(right, query_pos, target_pos);
1183         query_pos += HSP_query_advance(right);
1184         target_pos += HSP_target_advance(right);
1185         }
1186     return score;
1187     }
1188 /* Returns score for overlapping region of HSPs on same diagonal */
1189 
HSPset_filter_ungapped(HSPset * hsp_set)1190 void HSPset_filter_ungapped(HSPset *hsp_set){
1191     register GPtrArray *new_hsp_list;
1192     register HSP *curr_hsp, *prev_hsp;
1193     register gboolean del_prev, del_curr;
1194     register gint i;
1195     register Match_Score score;
1196     /* Filter strongly overlapping HSPs on same diagonal
1197      * but different frames (happens with 3:3 HSPs only)
1198      */
1199     if((hsp_set->hsp_list->len > 1)
1200     && (hsp_set->param->match->query->advance == 3)
1201     && (hsp_set->param->match->target->advance == 3)){
1202         /* FIXME: should not sort when using all-at-once HSPset */
1203         qsort(hsp_set->hsp_list->pdata, hsp_set->hsp_list->len,
1204               sizeof(gpointer), HSPset_sort_by_diag_then_query_start);
1205         prev_hsp = hsp_set->hsp_list->pdata[0];
1206         del_prev = FALSE;
1207         del_curr = FALSE;
1208         new_hsp_list = g_ptr_array_new();
1209         for(i = 1; i < hsp_set->hsp_list->len; i++){
1210             curr_hsp = hsp_set->hsp_list->pdata[i];
1211             del_curr = FALSE;
1212             if((HSP_diagonal(prev_hsp) == HSP_diagonal(curr_hsp))
1213             && (HSP_query_end(prev_hsp) > curr_hsp->query_start)){
1214                 score = HSP_score_overlap(prev_hsp, curr_hsp);
1215                 if((score << 1) > (curr_hsp->score + prev_hsp->score)){
1216                     /* FIXME: use codon_usage scores here instead */
1217                     if(prev_hsp->score < curr_hsp->score){
1218                         del_prev = TRUE;
1219                     } else {
1220                         del_curr = TRUE;
1221                         }
1222                     }
1223                 }
1224             if(del_prev)
1225                 HSP_destroy(prev_hsp);
1226             else
1227                 g_ptr_array_add(new_hsp_list, prev_hsp);
1228             prev_hsp = curr_hsp;
1229             del_prev = del_curr;
1230             }
1231         if(del_prev)
1232             HSP_destroy(prev_hsp);
1233         else
1234             g_ptr_array_add(new_hsp_list, prev_hsp);
1235         g_ptr_array_free(hsp_set->hsp_list, TRUE);
1236         hsp_set->hsp_list = new_hsp_list;
1237         }
1238     return;
1239     }
1240 
1241 /**/
1242 
1243 #define HSPset_SList_PAGE_BIT_WIDTH 10
1244 #define HSPset_SList_PAGE_SIZE (1 << HSPset_SList_PAGE_BIT_WIDTH)
1245 
HSPset_SList_RecycleBin_create(void)1246 RecycleBin *HSPset_SList_RecycleBin_create(void){
1247     return RecycleBin_create("HSPset_Slist", sizeof(HSPset_SList_Node),
1248                              4096);
1249     }
1250 
HSPset_SList_append(RecycleBin * recycle_bin,HSPset_SList_Node * next,gint query_pos,gint target_pos)1251 HSPset_SList_Node *HSPset_SList_append(RecycleBin *recycle_bin,
1252                                        HSPset_SList_Node *next,
1253                                        gint query_pos, gint target_pos){
1254     register HSPset_SList_Node *node = RecycleBin_alloc(recycle_bin);
1255     node->next = next;
1256     node->query_pos = query_pos;
1257     node->target_pos = target_pos;
1258     return node;
1259     }
1260 
1261 #if 0
1262 typedef struct {
1263                  gint      page_alloc;
1264                  gint      page_total;
1265     HSPset_SList_Node    **diag_page_list;
1266                  gint  ****horizon;
1267                  gint     *page_used;
1268                  gint      page_used_total;
1269 } HSPset_SeedData;
1270 
1271 static HSPset_SeedData *HSPset_SeedData_create(HSP_Param *hsp_param,
1272                                                gint target_len){
1273     register HSPset_SeedData *seed_data = g_new(HSPset_SeedData, 1);
1274     seed_data->page_total = (target_len
1275                           >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1276     seed_data->page_alloc = seed_data->page_total;
1277     seed_data->diag_page_list = g_new0(HSPset_SList_Node*, seed_data->page_total);
1278     seed_data->page_used = g_new(gint, seed_data->page_total);
1279     seed_data->horizon = (gint****)Matrix4d_create(
1280                            2 + ((hsp_param->seed_repeat > 1)?1:0),
1281                            HSPset_SList_PAGE_SIZE,
1282                            hsp_param->match->query->advance,
1283                            hsp_param->match->target->advance,
1284                            sizeof(gint));
1285     seed_data->page_used_total = 0;
1286     return seed_data;
1287     }
1288 
1289 static void HSPset_SeedData_destroy(HSPset_SeedData *seed_data){
1290     g_free(seed_data->diag_page_list);
1291     g_free(seed_data->page_used);
1292     g_free(seed_data->horizon);
1293     g_free(seed_data);
1294     return;
1295     }
1296 
1297 static void HSPset_SeedData_set_target_len(HSPset_SeedData *seed_data,
1298                                            HSPset *hsp_set){
1299     register gint new_page_total = (hsp_set->target->len
1300                                     >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1301     register gint i, a, b, c, d;
1302     seed_data->page_total = new_page_total;
1303     if(seed_data->page_alloc < new_page_total){
1304         seed_data->page_alloc = seed_data->page_total;
1305         g_free(seed_data->diag_page_list);
1306         seed_data->diag_page_list = g_new(HSPset_SList_Node*,
1307                                           seed_data->page_total);
1308         g_free(seed_data->page_used);
1309         seed_data->page_used = g_new(gint, seed_data->page_total);
1310         }
1311     /* Clear diag_page_list */
1312     for(i = 0; i < seed_data->page_total; i++)
1313         seed_data->diag_page_list[i] = 0;
1314     /* Clear horizon */
1315     for(a = 2 + ((hsp_set->param->seed_repeat > 1)?1:0); a >= 0; a--)
1316         for(b = HSPset_SList_PAGE_SIZE; b >= 0; b--)
1317             for(c = hsp_set->param->match->query->advance; c >= 0; c--)
1318                 for(d = hsp_set->param->match->target->advance; d >= 0; d--)
1319                     seed_data->horizon[a][b][c][d] = 0;
1320     seed_data->page_used_total = 0;
1321     return;
1322     }
1323 #endif /* 0 */
1324 
HSPset_seed_all_qy_sorted(HSPset * hsp_set,HSPset_SList_Node * seed_list)1325 void HSPset_seed_all_qy_sorted(HSPset *hsp_set, HSPset_SList_Node *seed_list){
1326     register gint page_total = (hsp_set->target->len
1327                                 >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1328     register HSPset_SList_Node **diag_page_list
1329         = g_new0(HSPset_SList_Node*, page_total);
1330     register gint *page_used = g_new(gint, page_total);
1331     register gint ****horizon = (gint****)Matrix4d_create(
1332                            2 + ((hsp_set->param->seed_repeat > 1)?1:0),
1333                            HSPset_SList_PAGE_SIZE,
1334                            hsp_set->param->match->query->advance,
1335                            hsp_set->param->match->target->advance,
1336                            sizeof(gint));
1337     /*
1338     register HSPset_SeedData *seed_data = HSPset_SeedData_create(hsp_set->param,
1339                                                            hsp_set->target->len);
1340     */
1341     register gint i, page, diag_pos, query_frame, target_frame,
1342                    section_pos, page_pos;
1343     register HSPset_SList_Node *seed;
1344     register gint page_used_total = 0;
1345     HSP nascent_hsp;
1346     /* g_message("[%s] with [%d]", __FUNCTION__, hsp_set->target->len); */
1347     /* HSPset_SeedData_set_target_len(seed_data, hsp_set); */
1348     /* Bin on diagonal into pages */
1349     while(seed_list){
1350         seed = seed_list;
1351         seed_list = seed_list->next;
1352         /**/
1353         diag_pos = (seed->target_pos
1354                     * hsp_set->param->match->query->advance)
1355                  - (seed->query_pos
1356                     * hsp_set->param->match->target->advance);
1357         section_pos = ((diag_pos + hsp_set->target->len)
1358                     % hsp_set->target->len);
1359         page = (section_pos >> HSPset_SList_PAGE_BIT_WIDTH);
1360         g_assert(section_pos >= 0);
1361         g_assert(section_pos < hsp_set->target->len);
1362         g_assert(page >= 0);
1363         g_assert(page < page_total);
1364         /**/
1365         if(!diag_page_list[page])
1366             page_used[page_used_total++] = page;
1367         seed->next = diag_page_list[page];
1368         diag_page_list[page] = seed;
1369         }
1370     /* Seed each page using page horizon */
1371     for(i = 0; i < page_used_total; i++){
1372         page = page_used[i];
1373         for(seed = diag_page_list[page]; seed; seed = seed->next){
1374             g_assert((!seed->next)
1375                    || (seed->query_pos <= seed->next->query_pos));
1376             diag_pos = (seed->target_pos
1377                         * hsp_set->param->match->query->advance)
1378                      - (seed->query_pos
1379                         * hsp_set->param->match->target->advance);
1380             query_frame  = seed->query_pos
1381                          % hsp_set->param->match->query->advance;
1382             target_frame  = seed->target_pos
1383                           % hsp_set->param->match->target->advance;
1384             section_pos = ((diag_pos + hsp_set->target->len)
1385                         % hsp_set->target->len);
1386             page_pos = section_pos
1387                      - (page << HSPset_SList_PAGE_BIT_WIDTH);
1388             g_assert(page_pos >= 0);
1389             g_assert(page_pos < HSPset_SList_PAGE_SIZE);
1390             /* Clear if page has changed */
1391             if(horizon[1][page_pos][query_frame][target_frame] != page){
1392                horizon[0][page_pos][query_frame][target_frame] = 0;
1393                horizon[1][page_pos][query_frame][target_frame] = page;
1394                if(hsp_set->param->seed_repeat > 1)
1395                    horizon[2][page_pos][query_frame][target_frame] = 0;
1396                }
1397             if(seed->query_pos < horizon[0][page_pos][query_frame][target_frame])
1398                 continue;
1399             if(hsp_set->param->seed_repeat > 1){
1400                 if(++horizon[2][page_pos][query_frame][target_frame]
1401                         < hsp_set->param->seed_repeat){
1402                     continue;
1403                     }
1404                 horizon[2][page_pos][query_frame][target_frame] = 0;
1405                 }
1406             /* Nascent HSP building: */
1407             nascent_hsp.hsp_set      = hsp_set;
1408             nascent_hsp.query_start  = seed->query_pos;
1409             nascent_hsp.target_start = seed->target_pos;
1410             nascent_hsp.length       = hsp_set->param->seedlen;
1411             nascent_hsp.cobs         = 0;
1412             g_assert(HSP_check(&nascent_hsp));
1413             HSP_trim_ends(&nascent_hsp);
1414             /* Score is irrelevant before HSP_init() */
1415             HSP_init(&nascent_hsp);
1416             /* Try to make above threshold HSP using masking */
1417             if(hsp_set->param->match->query->mask_func
1418             || hsp_set->param->match->target->mask_func){
1419                 HSP_extend(&nascent_hsp, TRUE);
1420                 if(nascent_hsp.score < hsp_set->param->threshold){
1421                     horizon[0][page_pos][query_frame][target_frame]
1422                         = HSP_query_end(&nascent_hsp);
1423                     continue;
1424                     }
1425                 }
1426             /* Extend the HSP again ignoring masking */
1427             HSP_extend(&nascent_hsp, FALSE);
1428             HSP_store(&nascent_hsp);
1429             /**/
1430             horizon[0][page_pos][query_frame][target_frame]
1431                     = HSP_query_end(&nascent_hsp);
1432             }
1433         }
1434     g_free(diag_page_list);
1435     g_free(page_used);
1436     g_free(horizon);
1437     HSPset_finalise(hsp_set);
1438     /* HSPset_SeedData_destroy(seed_data); */
1439     return;
1440     }
1441 /* horizon[horizon][mailbox][seed_repeat]
1442  *        [page_size][qadv][tadv]
1443  */
1444 
1445 /**/
1446 
1447