1 /****************************************************************\
2 * *
3 * Library for HSP sets (high-scoring segment pairs) *
4 * *
5 * Guy St.C. Slater.. mailto:guy@ebi.ac.uk *
6 * Copyright (C) 2000-2009. All Rights Reserved. *
7 * *
8 * This source code is distributed under the terms of the *
9 * GNU General Public License, version 3. See the file COPYING *
10 * or http://www.gnu.org/licenses/gpl.txt for details *
11 * *
12 * If you use this code, please keep this notice intact. *
13 * *
14 \****************************************************************/
15
16 #include <string.h> /* For strlen() */
17 #include <ctype.h> /* For tolower() */
18 #include <stdlib.h> /* For qsort() */
19
20 #include "hspset.h"
21
HSPset_ArgumentSet_create(Argument * arg)22 HSPset_ArgumentSet *HSPset_ArgumentSet_create(Argument *arg){
23 register ArgumentSet *as;
24 static HSPset_ArgumentSet has = {0};
25 if(arg){
26 as = ArgumentSet_create("HSP creation options");
27 ArgumentSet_add_option(as, '\0', "hspfilter", NULL,
28 "Aggressive HSP filtering level", "0",
29 Argument_parse_int, &has.filter_threshold);
30 ArgumentSet_add_option(as, '\0', "useworddropoff", NULL,
31 "Use word neighbourhood dropoff", "TRUE",
32 Argument_parse_boolean, &has.use_wordhood_dropoff);
33 ArgumentSet_add_option(as, '\0', "seedrepeat", NULL,
34 "Seeds per diagonal required for HSP seeding", "1",
35 Argument_parse_int, &has.seed_repeat);
36 /**/
37 ArgumentSet_add_option(as, 0, "dnawordlen", "bp",
38 "Wordlength for DNA words", "12",
39 Argument_parse_int, &has.dna_wordlen);
40 ArgumentSet_add_option(as, 0, "proteinwordlen", "aa",
41 "Wordlength for protein words", "6",
42 Argument_parse_int, &has.protein_wordlen);
43 ArgumentSet_add_option(as, 0, "codonwordlen", "bp",
44 "Wordlength for codon words", "12",
45 Argument_parse_int, &has.codon_wordlen);
46 /**/
47 ArgumentSet_add_option(as, 0, "dnahspdropoff", "score",
48 "DNA HSP dropoff score", "30",
49 Argument_parse_int, &has.dna_hsp_dropoff);
50 ArgumentSet_add_option(as, 0, "proteinhspdropoff", "score",
51 "Protein HSP dropoff score", "20",
52 Argument_parse_int, &has.protein_hsp_dropoff);
53 ArgumentSet_add_option(as, 0, "codonhspdropoff", "score",
54 "Codon HSP dropoff score", "40",
55 Argument_parse_int, &has.codon_hsp_dropoff);
56 /**/
57 ArgumentSet_add_option(as, 0, "dnahspthreshold", "score",
58 "DNA HSP threshold score", "75",
59 Argument_parse_int, &has.dna_hsp_threshold);
60 ArgumentSet_add_option(as, 0, "proteinhspthreshold", "score",
61 "Protein HSP threshold score", "30",
62 Argument_parse_int, &has.protein_hsp_threshold);
63 ArgumentSet_add_option(as, 0, "codonhspthreshold", "score",
64 "Codon HSP threshold score", "50",
65 Argument_parse_int, &has.codon_hsp_threshold);
66 /**/
67 ArgumentSet_add_option(as, 0, "dnawordlimit", "score",
68 "Score limit for dna word neighbourhood", "0",
69 Argument_parse_int, &has.dna_word_limit);
70 ArgumentSet_add_option(as, 0, "proteinwordlimit", "score",
71 "Score limit for protein word neighbourhood", "4",
72 Argument_parse_int, &has.protein_word_limit);
73 ArgumentSet_add_option(as, 0, "codonwordlimit", "score",
74 "Score limit for codon word neighbourhood", "4",
75 Argument_parse_int, &has.codon_word_limit);
76 /**/
77 ArgumentSet_add_option(as, '\0', "geneseed", "threshold",
78 "Geneseed Threshold", "0",
79 Argument_parse_int, &has.geneseed_threshold);
80 ArgumentSet_add_option(as, '\0', "geneseedrepeat", "number",
81 "Seeds per diagonal required for geneseed HSP seeding", "3",
82 Argument_parse_int, &has.geneseed_repeat);
83 /**/
84 Argument_absorb_ArgumentSet(arg, as);
85 }
86 return &has;
87 }
88
89 /**/
90
HSP_check_positions(HSPset * hsp_set,gint query_pos,gint target_pos)91 static gboolean HSP_check_positions(HSPset *hsp_set,
92 gint query_pos, gint target_pos){
93 g_assert(query_pos >= 0);
94 g_assert(target_pos >= 0);
95 g_assert(query_pos < hsp_set->query->len);
96 g_assert(target_pos < hsp_set->target->len);
97 return TRUE;
98 }
99
HSP_check(HSP * hsp)100 static gboolean HSP_check(HSP *hsp){
101 g_assert(hsp);
102 g_assert(hsp->hsp_set);
103 g_assert(hsp->length);
104 g_assert(HSP_query_end(hsp) <= hsp->hsp_set->query->len);
105 g_assert(HSP_target_end(hsp) <= hsp->hsp_set->target->len);
106 return TRUE;
107 }
108
HSP_Param_set_wordlen(HSP_Param * hsp_param,gint wordlen)109 void HSP_Param_set_wordlen(HSP_Param *hsp_param, gint wordlen){
110 if(wordlen <= 0)
111 g_error("Wordlength must be greater than zero");
112 hsp_param->wordlen = wordlen;
113 hsp_param->seedlen = hsp_param->wordlen
114 / hsp_param->match->query->advance;
115 return;
116 }
117
118 /**/
119
HSP_Param_set_dna_hsp_threshold(HSP_Param * hsp_param,gint dna_hsp_threshold)120 void HSP_Param_set_dna_hsp_threshold(HSP_Param *hsp_param,
121 gint dna_hsp_threshold){
122 if(hsp_param->match->type == Match_Type_DNA2DNA)
123 hsp_param->threshold = dna_hsp_threshold;
124 return;
125 }
126
HSP_Param_set_protein_hsp_threshold(HSP_Param * hsp_param,gint protein_hsp_threshold)127 void HSP_Param_set_protein_hsp_threshold(HSP_Param *hsp_param,
128 gint protein_hsp_threshold){
129 if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
130 || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
131 || (hsp_param->match->type == Match_Type_DNA2PROTEIN))
132 hsp_param->threshold = protein_hsp_threshold;
133 return;
134 }
135
HSP_Param_set_codon_hsp_threshold(HSP_Param * hsp_param,gint codon_hsp_threshold)136 void HSP_Param_set_codon_hsp_threshold(HSP_Param *hsp_param,
137 gint codon_hsp_threshold){
138 if(hsp_param->match->type == Match_Type_CODON2CODON)
139 hsp_param->threshold = codon_hsp_threshold;
140 return;
141 }
142
143 /**/
144
HSP_Param_refresh_wordhood(HSP_Param * hsp_param)145 static void HSP_Param_refresh_wordhood(HSP_Param *hsp_param){
146 register WordHood_Alphabet *wha = NULL;
147 register Submat *submat;
148 if(hsp_param->wordhood)
149 WordHood_destroy(hsp_param->wordhood);
150 if(hsp_param->has->use_wordhood_dropoff && (!hsp_param->wordlimit)){
151 hsp_param->wordhood = NULL;
152 } else {
153 submat = (hsp_param->match->type == Match_Type_DNA2DNA)
154 ? hsp_param->match->mas->dna_submat
155 : hsp_param->match->mas->protein_submat;
156 wha = WordHood_Alphabet_create_from_Submat(
157 (gchar*)hsp_param->match->comparison_alphabet->member,
158 (gchar*)hsp_param->match->comparison_alphabet->member,
159 submat, FALSE);
160 g_assert(wha);
161 hsp_param->wordhood = WordHood_create(wha, hsp_param->wordlimit,
162 hsp_param->has->use_wordhood_dropoff);
163 WordHood_Alphabet_destroy(wha);
164 }
165 return;
166 }
167 /* FIXME: optimisation: do not free/recreate wordhood when unnecessary */
168
HSP_Param_set_dna_word_limit(HSP_Param * hsp_param,gint dna_word_limit)169 void HSP_Param_set_dna_word_limit(HSP_Param *hsp_param,
170 gint dna_word_limit){
171 if(hsp_param->match->type == Match_Type_DNA2DNA){
172 hsp_param->wordlimit = dna_word_limit;
173 HSP_Param_refresh_wordhood(hsp_param);
174 }
175 return;
176 }
177
HSP_Param_set_protein_word_limit(HSP_Param * hsp_param,gint protein_word_limit)178 void HSP_Param_set_protein_word_limit(HSP_Param *hsp_param,
179 gint protein_word_limit){
180 if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
181 || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
182 || (hsp_param->match->type == Match_Type_DNA2PROTEIN)){
183 hsp_param->wordlimit = protein_word_limit;
184 HSP_Param_refresh_wordhood(hsp_param);
185 }
186 return;
187 }
188
HSP_Param_set_codon_word_limit(HSP_Param * hsp_param,gint codon_word_limit)189 void HSP_Param_set_codon_word_limit(HSP_Param *hsp_param,
190 gint codon_word_limit){
191 if(hsp_param->match->type == Match_Type_CODON2CODON){
192 hsp_param->threshold = codon_word_limit;
193 HSP_Param_refresh_wordhood(hsp_param);
194 }
195 return;
196 }
197
198 /**/
199
HSP_Param_set_dna_hsp_dropoff(HSP_Param * hsp_param,gint dna_hsp_dropoff)200 void HSP_Param_set_dna_hsp_dropoff(HSP_Param *hsp_param,
201 gint dna_hsp_dropoff){
202 if(hsp_param->match->type == Match_Type_DNA2DNA)
203 hsp_param->dropoff = dna_hsp_dropoff;
204 return;
205 }
206
HSP_Param_set_protein_hsp_dropoff(HSP_Param * hsp_param,gint protein_hsp_dropoff)207 void HSP_Param_set_protein_hsp_dropoff(HSP_Param *hsp_param,
208 gint protein_hsp_dropoff){
209 if((hsp_param->match->type == Match_Type_PROTEIN2PROTEIN)
210 || (hsp_param->match->type == Match_Type_PROTEIN2DNA)
211 || (hsp_param->match->type == Match_Type_DNA2PROTEIN))
212 hsp_param->dropoff = protein_hsp_dropoff;
213 return;
214 }
215
HSP_Param_set_codon_hsp_dropoff(HSP_Param * hsp_param,gint codon_hsp_dropoff)216 void HSP_Param_set_codon_hsp_dropoff(HSP_Param *hsp_param,
217 gint codon_hsp_dropoff){
218 if(hsp_param->match->type == Match_Type_CODON2CODON)
219 hsp_param->dropoff = codon_hsp_dropoff;
220 return;
221 }
222
223 /**/
224
HSP_Param_set_hsp_threshold(HSP_Param * hsp_param,gint hsp_threshold)225 void HSP_Param_set_hsp_threshold(HSP_Param *hsp_param,
226 gint hsp_threshold){
227 hsp_param->threshold = hsp_threshold;
228 return;
229 }
230
HSP_Param_set_seed_repeat(HSP_Param * hsp_param,gint seed_repeat)231 void HSP_Param_set_seed_repeat(HSP_Param *hsp_param,
232 gint seed_repeat){
233 hsp_param->seed_repeat = seed_repeat;
234 return;
235 }
236
HSP_Param_create(Match * match,gboolean use_horizon)237 HSP_Param *HSP_Param_create(Match *match, gboolean use_horizon){
238 register HSP_Param *hsp_param = g_new(HSP_Param, 1);
239 hsp_param->thread_ref = ThreadRef_create();
240 hsp_param->has = HSPset_ArgumentSet_create(NULL);
241 hsp_param->match = match;
242 hsp_param->seed_repeat = hsp_param->has->seed_repeat;
243 switch(match->type){
244 case Match_Type_DNA2DNA:
245 hsp_param->dropoff = hsp_param->has->dna_hsp_dropoff;
246 hsp_param->threshold = hsp_param->has->dna_hsp_threshold;
247 HSP_Param_set_wordlen(hsp_param, hsp_param->has->dna_wordlen);
248 hsp_param->wordlimit = hsp_param->has->dna_word_limit;
249 break;
250 case Match_Type_PROTEIN2PROTEIN: /*fallthrough*/
251 case Match_Type_PROTEIN2DNA: /*fallthrough*/
252 case Match_Type_DNA2PROTEIN:
253 hsp_param->dropoff = hsp_param->has->protein_hsp_dropoff;
254 hsp_param->threshold
255 = hsp_param->has->protein_hsp_threshold;
256 HSP_Param_set_wordlen(hsp_param, hsp_param->has->protein_wordlen);
257 hsp_param->wordlimit = hsp_param->has->protein_word_limit;
258 break;
259 case Match_Type_CODON2CODON:
260 hsp_param->dropoff = hsp_param->has->codon_hsp_dropoff;
261 hsp_param->threshold = hsp_param->has->codon_hsp_threshold;
262 HSP_Param_set_wordlen(hsp_param, hsp_param->has->codon_wordlen);
263 hsp_param->wordlimit = hsp_param->has->codon_word_limit;
264 g_assert(!(hsp_param->wordlen % 3));
265 break;
266 default:
267 g_error("Bad Match_Type [%d]", match->type);
268 break;
269 }
270 hsp_param->use_horizon = use_horizon;
271 #ifdef USE_PTHREADS
272 pthread_mutex_init(&hsp_param->hsp_recycle_lock, NULL);
273 #endif /* USE_PTHREADS */
274 hsp_param->hsp_recycle = RecycleBin_create("HSP", sizeof(HSP), 64);
275 hsp_param->wordhood = NULL;
276 HSP_Param_refresh_wordhood(hsp_param);
277 return hsp_param;
278 }
279
HSP_Param_destroy(HSP_Param * hsp_param)280 void HSP_Param_destroy(HSP_Param *hsp_param){
281 g_assert(hsp_param);
282 if(ThreadRef_destroy(hsp_param->thread_ref))
283 return;
284 if(hsp_param->wordhood)
285 WordHood_destroy(hsp_param->wordhood);
286 #ifdef USE_PTHREADS
287 pthread_mutex_lock(&hsp_param->hsp_recycle_lock);
288 #endif /* USE_PTHREADS */
289 RecycleBin_destroy(hsp_param->hsp_recycle);
290 #ifdef USE_PTHREADS
291 pthread_mutex_unlock(&hsp_param->hsp_recycle_lock);
292 pthread_mutex_destroy(&hsp_param->hsp_recycle_lock);
293 #endif /* USE_PTHREADS */
294 g_free(hsp_param);
295 return;
296 }
297
HSP_Param_share(HSP_Param * hsp_param)298 HSP_Param *HSP_Param_share(HSP_Param *hsp_param){
299 g_assert(hsp_param);
300 ThreadRef_share(hsp_param->thread_ref);
301 return hsp_param;
302 }
303
HSP_Param_swap(HSP_Param * hsp_param)304 HSP_Param *HSP_Param_swap(HSP_Param *hsp_param){
305 g_assert(hsp_param);
306 return HSP_Param_create(Match_swap(hsp_param->match),
307 hsp_param->use_horizon);
308 }
309
HSPset_create(Sequence * query,Sequence * target,HSP_Param * hsp_param)310 HSPset *HSPset_create(Sequence *query, Sequence *target,
311 HSP_Param *hsp_param){
312 register HSPset *hsp_set = g_new(HSPset, 1);
313 g_assert(query);
314 g_assert(target);
315 hsp_set->ref_count = 1;
316 hsp_set->query = Sequence_share(query);
317 hsp_set->target = Sequence_share(target);
318 hsp_set->param = HSP_Param_share(hsp_param);
319 /**/
320 if(hsp_param->use_horizon){
321 hsp_set->horizon = (gint****)Matrix4d_create(
322 1 + ((hsp_param->seed_repeat > 1)?2:0),
323 query->len,
324 hsp_param->match->query->advance,
325 hsp_param->match->target->advance,
326 sizeof(gint));
327 } else {
328 hsp_set->horizon = NULL;
329 }
330 hsp_set->hsp_list = g_ptr_array_new();
331 hsp_set->is_finalised = FALSE;
332 hsp_set->param->has = HSPset_ArgumentSet_create(NULL);
333 if(hsp_set->param->has->filter_threshold){
334 hsp_set->filter = g_new0(PQueue*, query->len);
335 hsp_set->pqueue_set = PQueueSet_create();
336 } else {
337 hsp_set->filter = NULL;
338 hsp_set->pqueue_set = NULL;
339 }
340 hsp_set->is_empty = TRUE;
341 return hsp_set;
342 }
343 /* horizon[score(,repeat_count,diag)]
344 * [query_len]
345 * [query_advance]
346 * [target_advance]
347 */
348
349 /**/
350
HSPset_share(HSPset * hsp_set)351 HSPset *HSPset_share(HSPset *hsp_set){
352 g_assert(hsp_set);
353 hsp_set->ref_count++;
354 return hsp_set;
355 }
356
HSPset_destroy(HSPset * hsp_set)357 void HSPset_destroy(HSPset *hsp_set){
358 register gint i;
359 register HSP *hsp;
360 g_assert(hsp_set);
361 if(--hsp_set->ref_count)
362 return;
363 HSP_Param_destroy(hsp_set->param);
364 if(hsp_set->filter)
365 g_free(hsp_set->filter);
366 if(hsp_set->pqueue_set)
367 PQueueSet_destroy(hsp_set->pqueue_set);
368 Sequence_destroy(hsp_set->query);
369 Sequence_destroy(hsp_set->target);
370 for(i = 0; i < hsp_set->hsp_list->len; i++){
371 hsp = hsp_set->hsp_list->pdata[i];
372 HSP_destroy(hsp);
373 }
374 g_ptr_array_free(hsp_set->hsp_list, TRUE);
375 if(hsp_set->horizon)
376 g_free(hsp_set->horizon);
377 g_free(hsp_set);
378 return;
379 }
380
HSPset_swap(HSPset * hsp_set,HSP_Param * hsp_param)381 void HSPset_swap(HSPset *hsp_set, HSP_Param *hsp_param){
382 register Sequence *query;
383 register gint i, query_start;
384 register HSP *hsp;
385 g_assert(hsp_set->ref_count == 1);
386 /* Swap query and target */
387 query = hsp_set->query;
388 hsp_set->query = hsp_set->target;
389 hsp_set->target = query;
390 /* Switch parameters */
391 HSP_Param_destroy(hsp_set->param);
392 hsp_set->param = HSP_Param_share(hsp_param);
393 /* Swap HSPs coordinates */
394 for(i = 0; i < hsp_set->hsp_list->len; i++){
395 hsp = hsp_set->hsp_list->pdata[i];
396 query_start = hsp->query_start;
397 hsp->query_start = hsp->target_start;
398 hsp->target_start = query_start;
399 }
400 return;
401 }
402
HSPset_revcomp(HSPset * hsp_set)403 void HSPset_revcomp(HSPset *hsp_set){
404 register Sequence *rc_query = Sequence_revcomp(hsp_set->query),
405 *rc_target = Sequence_revcomp(hsp_set->target);
406 register gint i;
407 register HSP *hsp;
408 g_assert(hsp_set);
409 g_assert(hsp_set->is_finalised);
410 g_assert(hsp_set->ref_count == 1);
411 /**/
412 Sequence_destroy(hsp_set->query);
413 Sequence_destroy(hsp_set->target);
414 hsp_set->query = rc_query;
415 hsp_set->target = rc_target;
416 for(i = 0; i < hsp_set->hsp_list->len; i++){
417 hsp = hsp_set->hsp_list->pdata[i];
418 hsp->query_start = hsp_set->query->len - HSP_query_end(hsp);
419 hsp->target_start = hsp_set->target->len - HSP_target_end(hsp);
420 }
421 return;
422 }
423
424
HSP_find_cobs(HSP * hsp)425 static gint HSP_find_cobs(HSP *hsp){
426 register gint i, query_pos = hsp->query_start,
427 target_pos = hsp->target_start;
428 register Match_Score score = 0;
429 /* Find the HSP centre offset by score */
430 for(i = 0; i < hsp->length; i++){
431 g_assert(HSP_check_positions(hsp->hsp_set,
432 query_pos, target_pos));
433 score += HSP_get_score(hsp, query_pos, target_pos);
434 if(score >= (hsp->score>>1))
435 break;
436 query_pos += HSP_query_advance(hsp);
437 target_pos += HSP_target_advance(hsp);
438 }
439 return i;
440 }
441
442 /**/
HSP_print_info(HSP * hsp)443 static void HSP_print_info(HSP *hsp){
444 g_print("HSP info (%p)\n"
445 " query_start = [%d]\n"
446 " target_start = [%d]\n"
447 " length = [%d]\n"
448 " score = [%d]\n"
449 " cobs = [%d]\n",
450 (gpointer)hsp,
451 hsp->query_start,
452 hsp->target_start,
453 hsp->length,
454 hsp->score,
455 hsp->cobs);
456 return;
457 }
458
459 typedef struct {
460 HSP *hsp; /* not freed */
461 gint padding;
462 gint max_advance;
463 gint query_display_pad;
464 gint target_display_pad;
465 GString *top;
466 GString *mid;
467 GString *low;
468 } HSP_Display;
469
HSP_Display_create(HSP * hsp,gint padding)470 static HSP_Display *HSP_Display_create(HSP *hsp, gint padding){
471 register HSP_Display *hd = g_new(HSP_Display, 1);
472 register gint approx_length;
473 hd->hsp = hsp;
474 hd->padding = padding;
475 hd->max_advance = MAX(HSP_query_advance(hsp),
476 HSP_target_advance(hsp));
477 hd->query_display_pad = (hd->max_advance
478 -HSP_query_advance(hsp))>>1;
479 hd->target_display_pad = (hd->max_advance
480 -HSP_target_advance(hsp))>>1;
481 approx_length = hd->max_advance * (hsp->length + 2);
482 hd->top = g_string_sized_new(approx_length);
483 hd->mid = g_string_sized_new(approx_length);
484 hd->low = g_string_sized_new(approx_length);
485 return hd;
486 }
487
HSP_Display_destroy(HSP_Display * hd)488 static void HSP_Display_destroy(HSP_Display *hd){
489 g_assert(hd);
490 g_string_free(hd->top, TRUE);
491 g_string_free(hd->mid, TRUE);
492 g_string_free(hd->low, TRUE);
493 g_free(hd);
494 return;
495 }
496
HSP_Display_add(HSP_Display * hd,gchar * top,gchar * mid,gchar * low)497 static void HSP_Display_add(HSP_Display *hd,
498 gchar *top, gchar *mid, gchar *low){
499 g_assert(hd);
500 g_assert(top);
501 g_assert(mid);
502 g_assert(low);
503 g_string_append(hd->top, top);
504 g_string_append(hd->mid, mid);
505 g_string_append(hd->low, low);
506 g_assert(hd->top->len == hd->mid->len);
507 g_assert(hd->mid->len == hd->low->len);
508 return;
509 }
510
HSP_Display_get_ruler_char(HSP_Display * hd,gint pos,gint advance)511 static gchar HSP_Display_get_ruler_char(HSP_Display *hd, gint pos,
512 gint advance){
513 register gint stop;
514 register gint pad_length = 3;
515 stop = hd->padding * hd->max_advance;
516 if(pos >= stop){
517 if(pos < (stop+pad_length)){
518 return '#';
519 }
520 stop = ((hd->padding+hd->hsp->length) * hd->max_advance)
521 + pad_length;
522 if(pos >= stop){
523 if(pos < (stop+pad_length)){
524 return '#';
525 }
526 pos -= pad_length;
527 }
528 pos -= pad_length;
529 }
530 if((pos/advance) & 1)
531 return '=';
532 return '-';
533 }
534
HSP_Display_print_ruler(HSP_Display * hd,gint width,gint pos,gboolean is_query)535 static void HSP_Display_print_ruler(HSP_Display *hd, gint width,
536 gint pos, gboolean is_query){
537 register gint i, adv;
538 if(is_query){
539 if(HSP_target_advance(hd->hsp) == 1)
540 return; /* opposite padding */
541 adv = HSP_target_advance(hd->hsp);
542 } else { /* Is target */
543 if(HSP_query_advance(hd->hsp) == 1)
544 return; /* opposite padding */
545 adv = HSP_query_advance(hd->hsp);
546 }
547 g_print(" ruler:[");
548 for(i = 0; i < width; i++)
549 g_print("%c", HSP_Display_get_ruler_char(hd, pos+i, adv));
550 g_print("]\n");
551 return;
552 }
553
HSP_Display_print(HSP_Display * hd)554 static void HSP_Display_print(HSP_Display *hd){
555 register gint pos, pause, width = 50;
556 g_assert(hd);
557 g_assert(hd->top->len == hd->mid->len);
558 g_assert(hd->mid->len == hd->low->len);
559 for(pos = 0, pause = hd->top->len-width; pos < pause; pos += width){
560 HSP_Display_print_ruler(hd, width, pos, TRUE);
561 g_print(" query:[%.*s]\n [%.*s]\ntarget:[%.*s]\n",
562 width, hd->top->str+pos,
563 width, hd->mid->str+pos,
564 width, hd->low->str+pos);
565 HSP_Display_print_ruler(hd, width, pos, FALSE);
566 g_print("\n");
567 }
568 HSP_Display_print_ruler(hd, hd->top->len-pos, pos, TRUE);
569 g_print(" query:[%.*s]\n [%.*s]\ntarget:[%.*s]\n",
570 (gint)(hd->top->len-pos), hd->top->str+pos,
571 (gint)(hd->mid->len-pos), hd->mid->str+pos,
572 (gint)(hd->low->len-pos), hd->low->str+pos);
573 HSP_Display_print_ruler(hd, hd->top->len-pos, pos, FALSE);
574 g_print("\n");
575 return;
576 }
577
HSP_Display_insert(HSP_Display * hd,gint position)578 static void HSP_Display_insert(HSP_Display *hd, gint position){
579 register gint query_pos, target_pos;
580 register gboolean is_padding,
581 query_valid = TRUE, target_valid = TRUE;
582 register Match *match = hd->hsp->hsp_set->param->match;
583 gchar query_str[4] = {0},
584 target_str[4] = {0},
585 equiv_str[4] = {0};
586 g_assert(hd);
587 query_pos = hd->hsp->query_start
588 + (HSP_query_advance(hd->hsp) * position);
589 target_pos = hd->hsp->target_start
590 + (HSP_target_advance(hd->hsp) * position);
591 /* If outside HSP, then is_padding */
592 is_padding = ((position < 0) || (position >= hd->hsp->length));
593 /* If outside seqs, then invalid */
594 query_valid = ( (query_pos >= 0)
595 &&((query_pos+HSP_query_advance(hd->hsp))
596 <= hd->hsp->hsp_set->query->len));
597 target_valid = ((target_pos >= 0)
598 &&((target_pos+HSP_target_advance(hd->hsp))
599 <= hd->hsp->hsp_set->target->len));
600 /* Get equiv string */
601 if(query_valid && target_valid){
602 g_assert(HSP_check_positions(hd->hsp->hsp_set,
603 query_pos, target_pos));
604 HSP_get_display(hd->hsp, query_pos, target_pos, equiv_str);
605 } else {
606 strncpy(equiv_str, "###", hd->max_advance);
607 equiv_str[hd->max_advance] = '\0';
608 }
609 /* Get query string */
610 if(query_valid){
611 Match_Strand_get_raw(match->query, hd->hsp->hsp_set->query,
612 query_pos, query_str);
613 if((match->query->advance == 1) && (hd->max_advance == 3)){
614 query_str[1] = query_str[0];
615 query_str[0] = query_str[2] = ' ';
616 query_str[3] = '\0';
617 }
618 } else {
619 strncpy(query_str, "###", hd->max_advance);
620 query_str[hd->max_advance] = '\0';
621 }
622 /* Get target string */
623 if(target_valid){
624 Match_Strand_get_raw(match->target, hd->hsp->hsp_set->target,
625 target_pos, target_str);
626 if((match->target->advance == 1) && (hd->max_advance == 3)){
627 target_str[1] = target_str[0];
628 target_str[0] = target_str[2] = ' ';
629 target_str[3] = '\0';
630 }
631 } else {
632 strncpy(target_str, "###", hd->max_advance);
633 target_str[hd->max_advance] = '\0';
634 }
635 /* Make lower case for padding */
636 if(is_padding){
637 g_strdown(query_str);
638 g_strdown(target_str);
639 } else {
640 g_strup(query_str);
641 g_strup(target_str);
642 }
643 HSP_Display_add(hd, query_str, equiv_str, target_str);
644 return;
645 }
646
HSP_print_alignment(HSP * hsp)647 static void HSP_print_alignment(HSP *hsp){
648 register HSP_Display *hd = HSP_Display_create(hsp, 10);
649 register gint i;
650 for(i = 0; i < hd->padding; i++) /* Pre-padding */
651 HSP_Display_insert(hd, i-hd->padding);
652 /* Use pad_length == 3 */
653 HSP_Display_add(hd, " < ", " < ", " < "); /* Start divider */
654 for(i = 0; i < hsp->length; i++) /* The HSP itself */
655 HSP_Display_insert(hd, i);
656 HSP_Display_add(hd, " > ", " > ", " > "); /* End divider */
657 for(i = 0; i < hd->padding; i++) /* Post-padding */
658 HSP_Display_insert(hd, hsp->length+i);
659 HSP_Display_print(hd);
660 HSP_Display_destroy(hd);
661 return;
662 }
663 /*
664 * HSP display style:
665 *
666 * =-=-=- =-=-=-=-=-=-=-=-=- =-=-=-
667 * nnnnnn < ACGACGCCCACGATCGAT > nnn###
668 * ||| < |||:::||| |||||| > |||
669 * ### x < A R N D C Q > x ###
670 * ===--- ===---===---===--- ===---
671 */
672
HSP_print_sugar(HSP * hsp)673 static void HSP_print_sugar(HSP *hsp){
674 g_print("sugar: %s %d %d %c %s %d %d %c %d\n",
675 hsp->hsp_set->query->id,
676 hsp->query_start,
677 hsp->length*HSP_query_advance(hsp),
678 Sequence_get_strand_as_char(hsp->hsp_set->query),
679 hsp->hsp_set->target->id,
680 hsp->target_start,
681 hsp->length*HSP_target_advance(hsp),
682 Sequence_get_strand_as_char(hsp->hsp_set->target),
683 hsp->score);
684 return;
685 }
686 /* Sugar output format:
687 * sugar: <query_id> <query_start> <query_length> <query_strand>
688 * <target_id> <target_start> <target_start> <target_strand>
689 * <score>
690 */
691
HSP_print(HSP * hsp,gchar * name)692 void HSP_print(HSP *hsp, gchar *name){
693 g_print("draw_hsp(%d, %d, %d, %d, %d, %d, \"%s\")\n",
694 hsp->query_start,
695 hsp->target_start,
696 hsp->length,
697 hsp->cobs,
698 HSP_query_advance(hsp),
699 HSP_target_advance(hsp),
700 name);
701 HSP_print_info(hsp);
702 HSP_print_alignment(hsp);
703 HSP_print_sugar(hsp);
704 return;
705 }
706
HSPset_print(HSPset * hsp_set)707 void HSPset_print(HSPset *hsp_set){
708 register gint i;
709 register gchar *name;
710 g_print("HSPset [%p] contains [%d] hsps\n",
711 (gpointer)hsp_set, hsp_set->hsp_list->len);
712 g_print("Comparison of [%s] and [%s]\n",
713 hsp_set->query->id, hsp_set->target->id);
714 for(i = 0; i < hsp_set->hsp_list->len; i++){
715 name = g_strdup_printf("hsp [%d]", i+1);
716 HSP_print(hsp_set->hsp_list->pdata[i], name);
717 g_free(name);
718 }
719 return;
720 }
721
722 /**/
723
HSP_init(HSP * nh)724 static void HSP_init(HSP *nh){
725 register gint i;
726 register gint query_pos, target_pos;
727 g_assert(HSP_check(nh));
728 /* Initial hsp score */
729 query_pos = nh->query_start;
730 target_pos = nh->target_start;
731 nh->score = 0;
732 for(i = 0; i < nh->length; i++){
733 g_assert(HSP_check_positions(nh->hsp_set,
734 query_pos, target_pos));
735 nh->score += HSP_get_score(nh, query_pos, target_pos);
736 query_pos += HSP_query_advance(nh);
737 target_pos += HSP_target_advance(nh);
738 }
739 if(nh->score < 0){
740 HSP_print(nh, "Bad HSP seed");
741 g_error("Initial HSP score [%d] less than zero", nh->score);
742 }
743 g_assert(HSP_check(nh));
744 return;
745 }
746
HSP_extend(HSP * nh,gboolean forbid_masked)747 static void HSP_extend(HSP *nh, gboolean forbid_masked){
748 register Match_Score score, maxscore;
749 register gint query_pos, target_pos;
750 register gint extend, maxext;
751 g_assert(HSP_check(nh));
752 /* extend left */
753 maxscore = score = nh->score;
754 query_pos = nh->query_start-HSP_query_advance(nh);
755 target_pos = nh->target_start-HSP_target_advance(nh);
756 for(extend = 1, maxext = 0;
757 ((query_pos >= 0) && (target_pos >= 0));
758 extend++){
759 g_assert(HSP_check_positions(nh->hsp_set,
760 query_pos, target_pos));
761 if((forbid_masked)
762 && (HSP_query_masked(nh, query_pos)
763 || HSP_target_masked(nh, target_pos)))
764 break;
765 score += HSP_get_score(nh, query_pos, target_pos);
766 if(maxscore <= score){
767 maxscore = score;
768 maxext = extend;
769 } else {
770 if(score < 0) /* See note below */
771 break;
772 if((maxscore-score) >= nh->hsp_set->param->dropoff)
773 break;
774 }
775 query_pos -= HSP_query_advance(nh);
776 target_pos -= HSP_target_advance(nh);
777 }
778 query_pos = HSP_query_end(nh);
779 target_pos = HSP_target_end(nh);
780 nh->query_start -= (maxext * HSP_query_advance(nh));
781 nh->target_start -= (maxext * HSP_target_advance(nh));
782 nh->length += maxext;
783 score = maxscore;
784 /* extend right */
785 for(extend = 1, maxext = 0;
786 ( ((query_pos+HSP_query_advance(nh))
787 <= nh->hsp_set->query->len)
788 && ((target_pos+HSP_target_advance(nh))
789 <= nh->hsp_set->target->len) );
790 extend++){
791 g_assert(HSP_check_positions(nh->hsp_set,
792 query_pos, target_pos));
793 if((forbid_masked)
794 && (HSP_query_masked(nh, query_pos)
795 || HSP_target_masked(nh, target_pos)))
796 break;
797 score += HSP_get_score(nh, query_pos, target_pos);
798 if(maxscore <= score){
799 maxscore = score;
800 maxext = extend;
801 } else {
802 if(score < 0) /* See note below */
803 break;
804 if((maxscore-score) >= nh->hsp_set->param->dropoff)
805 break;
806 }
807 query_pos += HSP_query_advance(nh);
808 target_pos += HSP_target_advance(nh);
809 }
810 nh->score = maxscore;
811 nh->length += maxext;
812 g_assert(HSP_check(nh));
813 return;
814 }
815 /* The score cannot be allowed to drop below zero in the HSP,
816 * as this can result in overlapping HSPs in some circumstances.
817 */
818
HSP_create(HSP * nh)819 static HSP *HSP_create(HSP *nh){
820 register HSP *hsp;
821 #ifdef USE_PTHREADS
822 pthread_mutex_lock(&nh->hsp_set->param->hsp_recycle_lock);
823 #endif /* USE_PTHREADS */
824 hsp = RecycleBin_alloc(nh->hsp_set->param->hsp_recycle);
825 #ifdef USE_PTHREADS
826 pthread_mutex_unlock(&nh->hsp_set->param->hsp_recycle_lock);
827 #endif /* USE_PTHREADS */
828 hsp->hsp_set = nh->hsp_set;
829 hsp->query_start = nh->query_start;
830 hsp->target_start = nh->target_start;
831 hsp->length = nh->length;
832 hsp->score = nh->score;
833 hsp->cobs = nh->cobs; /* Value can be set by HSPset_finalise(); */
834 return hsp;
835 }
836
HSP_destroy(HSP * hsp)837 void HSP_destroy(HSP *hsp){
838 register HSPset *hsp_set = hsp->hsp_set;
839 #ifdef USE_PTHREADS
840 pthread_mutex_lock(&hsp_set->param->hsp_recycle_lock);
841 #endif /* USE_PTHREADS */
842 RecycleBin_recycle(hsp_set->param->hsp_recycle, hsp);
843 #ifdef USE_PTHREADS
844 pthread_mutex_unlock(&hsp_set->param->hsp_recycle_lock);
845 #endif /* USE_PTHREADS */
846 return;
847 }
848
HSP_trim_ends(HSP * hsp)849 static void HSP_trim_ends(HSP *hsp){
850 register gint i;
851 register gint query_pos, target_pos;
852 /* Trim left to first good match */
853 g_assert(HSP_check(hsp));
854 for(i = 0; i < hsp->length; i++){
855 if(HSP_get_score(hsp, hsp->query_start, hsp->target_start) > 0)
856 break;
857 hsp->query_start += HSP_query_advance(hsp);
858 hsp->target_start += HSP_target_advance(hsp);
859 }
860 hsp->length -= i;
861 /**/
862 g_assert(HSP_check(hsp));
863 query_pos = HSP_query_end(hsp) - HSP_query_advance(hsp);
864 target_pos = HSP_target_end(hsp) - HSP_target_advance(hsp);
865 /* Trim right to last good match */
866 while(hsp->length > 0){
867 g_assert(HSP_check_positions(hsp->hsp_set,
868 query_pos, target_pos));
869 if(HSP_get_score(hsp, query_pos, target_pos) > 0)
870 break;
871 hsp->length--;
872 query_pos -= HSP_query_advance(hsp);
873 target_pos -= HSP_target_advance(hsp);
874 }
875 g_assert(HSP_check(hsp));
876 return;
877 }
878 /* This is to remove any unmatching ends from the HSP seed.
879 */
880
HSP_PQueue_comp_func(gpointer low,gpointer high,gpointer user_data)881 static gboolean HSP_PQueue_comp_func(gpointer low, gpointer high,
882 gpointer user_data){
883 register HSP *hsp_low = (HSP*)low, *hsp_high = (HSP*)high;
884 return hsp_low->score - hsp_high->score;
885 }
886
HSP_store(HSP * nascent_hsp)887 static void HSP_store(HSP *nascent_hsp){
888 register HSPset *hsp_set = nascent_hsp->hsp_set;
889 register PQueue *pq;
890 register HSP *hsp = NULL;
891 g_assert(nascent_hsp);
892 if(nascent_hsp->score < hsp_set->param->threshold)
893 return;
894 if(hsp_set->param->has->filter_threshold){ /* If have filter */
895 /* Get cobs value */
896 nascent_hsp->cobs = HSP_find_cobs(nascent_hsp);
897 pq = hsp_set->filter[HSP_query_cobs(nascent_hsp)];
898 if(pq){ /* Put in PQueue if better than worst */
899 if(PQueue_total(pq)
900 < hsp_set->param->has->filter_threshold){
901 hsp = HSP_create(nascent_hsp);
902 PQueue_push(pq, hsp);
903 } else {
904 g_assert(PQueue_total(pq));
905 hsp = PQueue_top(pq);
906 if(hsp->score < nascent_hsp->score){
907 hsp = PQueue_pop(pq);
908 HSP_destroy(hsp);
909 hsp = HSP_create(nascent_hsp);
910 PQueue_push(pq, hsp);
911 }
912 }
913 } else {
914 pq = PQueue_create(hsp_set->pqueue_set,
915 HSP_PQueue_comp_func, NULL);
916 hsp_set->filter[HSP_query_cobs(nascent_hsp)] = pq;
917 hsp = HSP_create(nascent_hsp);
918 PQueue_push(pq, hsp);
919 hsp_set->is_empty = FALSE;
920 }
921 } else {
922 hsp = HSP_create(nascent_hsp);
923 g_ptr_array_add(hsp_set->hsp_list, hsp);
924 hsp_set->is_empty = FALSE;
925 }
926 return;
927 }
928 /* FIXME: optimisation: could store HSPs as a list up until
929 * filter_threshold, then convert to a PQueue
930 */
931
HSPset_seed_hsp(HSPset * hsp_set,guint query_start,guint target_start)932 void HSPset_seed_hsp(HSPset *hsp_set,
933 guint query_start, guint target_start){
934 register gint diag_pos
935 = ((target_start * hsp_set->param->match->query->advance)
936 -(query_start * hsp_set->param->match->target->advance));
937 register gint query_frame = query_start
938 % hsp_set->param->match->query->advance,
939 target_frame = target_start
940 % hsp_set->param->match->target->advance;
941 register gint section_pos = (diag_pos + hsp_set->query->len)
942 % hsp_set->query->len;
943 HSP nascent_hsp;
944 g_assert(!hsp_set->is_finalised);
945 /**/
946 g_assert(section_pos >= 0);
947 g_assert(section_pos < hsp_set->query->len);
948 /* Clear if diag has changed */
949 if(hsp_set->param->seed_repeat > 1){
950 if(hsp_set->horizon[2][section_pos][query_frame][target_frame]
951 != (diag_pos + hsp_set->query->len)){
952 hsp_set->horizon[0][section_pos][query_frame][target_frame] = 0;
953 hsp_set->horizon[1][section_pos][query_frame][target_frame] = 0;
954 hsp_set->horizon[2][section_pos][query_frame][target_frame]
955 = diag_pos + hsp_set->query->len;
956 }
957 }
958 /* Check whether we have seen this HSP already */
959 if(target_start < hsp_set->horizon[0]
960 [section_pos]
961 [query_frame]
962 [target_frame])
963 return;
964 if(hsp_set->param->seed_repeat > 1){
965 if(++hsp_set->horizon[1][section_pos][query_frame][target_frame]
966 < hsp_set->param->seed_repeat)
967 return;
968 hsp_set->horizon[1][section_pos][query_frame][target_frame] = 0;
969 }
970 /* Nascent HSP building: */
971 nascent_hsp.hsp_set = hsp_set;
972 nascent_hsp.query_start = query_start;
973 nascent_hsp.target_start = target_start;
974 nascent_hsp.length = hsp_set->param->seedlen;
975 nascent_hsp.cobs = 0;
976 g_assert(HSP_check(&nascent_hsp));
977 HSP_trim_ends(&nascent_hsp);
978 /* Score is irrelevant before HSP_init() */
979 HSP_init(&nascent_hsp);
980 /* Try to make above threshold HSP using masking */
981 if(hsp_set->param->match->query->mask_func
982 || hsp_set->param->match->target->mask_func){
983 HSP_extend(&nascent_hsp, TRUE);
984 if(nascent_hsp.score < hsp_set->param->threshold){
985 hsp_set->horizon[0][section_pos][query_frame][target_frame]
986 = HSP_target_end(&nascent_hsp);
987 return;
988 }
989 }
990 /* Extend the HSP again ignoring masking */
991 HSP_extend(&nascent_hsp, FALSE);
992 HSP_store(&nascent_hsp);
993 hsp_set->horizon[0][section_pos][query_frame][target_frame]
994 = HSP_target_end(&nascent_hsp);
995 return;
996 }
997
HSPset_add_known_hsp(HSPset * hsp_set,guint query_start,guint target_start,guint length)998 void HSPset_add_known_hsp(HSPset *hsp_set,
999 guint query_start, guint target_start,
1000 guint length){
1001 HSP nascent_hsp;
1002 nascent_hsp.hsp_set = hsp_set;
1003 nascent_hsp.query_start = query_start;
1004 nascent_hsp.target_start = target_start;
1005 nascent_hsp.length = length;
1006 nascent_hsp.cobs = 0;
1007 /* Score is irrelevant before HSP_init() */
1008 HSP_init(&nascent_hsp);
1009 HSP_store(&nascent_hsp);
1010 return;
1011 }
1012
HSPset_seed_hsp_sorted(HSPset * hsp_set,guint query_start,guint target_start,gint *** horizon)1013 static void HSPset_seed_hsp_sorted(HSPset *hsp_set,
1014 guint query_start, guint target_start,
1015 gint ***horizon){
1016 HSP nascent_hsp;
1017 register gint diag_pos
1018 = ((target_start * hsp_set->param->match->query->advance)
1019 -(query_start * hsp_set->param->match->target->advance));
1020 register gint query_frame = query_start
1021 % hsp_set->param->match->query->advance,
1022 target_frame = target_start
1023 % hsp_set->param->match->target->advance;
1024 register gint section_pos = (diag_pos + hsp_set->query->len)
1025 % hsp_set->query->len;
1026 g_assert(!hsp_set->is_finalised);
1027 g_assert(!hsp_set->horizon);
1028 g_assert(section_pos >= 0);
1029 g_assert(section_pos < hsp_set->query->len);
1030 /**/
1031 if(horizon[1][query_frame][target_frame] != section_pos){
1032 horizon[1][query_frame][target_frame] = section_pos;
1033 horizon[0][query_frame][target_frame] = 0;
1034 horizon[2][query_frame][target_frame] = 0;
1035 }
1036 /* FIXME: seedrepeat overflow here */
1037 if(++horizon[2][query_frame][target_frame] < hsp_set->param->seed_repeat)
1038 return;
1039 horizon[2][query_frame][target_frame] = 0;
1040 /* Check whether we have seen this HSP already */
1041 if(target_start < horizon[0][query_frame][target_frame])
1042 return;
1043 /**/
1044 /* Nascent HSP building: */
1045 nascent_hsp.hsp_set = hsp_set;
1046 nascent_hsp.query_start = query_start;
1047 nascent_hsp.target_start = target_start;
1048 nascent_hsp.length = hsp_set->param->seedlen;
1049 nascent_hsp.cobs = 0;
1050 g_assert(HSP_check(&nascent_hsp));
1051 HSP_trim_ends(&nascent_hsp);
1052 /* Score is irrelevant before HSP_init() */
1053 HSP_init(&nascent_hsp);
1054 /* Try to make above threshold HSP using masking */
1055 if(hsp_set->param->match->query->mask_func
1056 || hsp_set->param->match->target->mask_func){
1057 HSP_extend(&nascent_hsp, TRUE);
1058 if(nascent_hsp.score < hsp_set->param->threshold){
1059 horizon[0][query_frame][target_frame]
1060 = HSP_target_end(&nascent_hsp);
1061 return;
1062 }
1063 }
1064 /* Extend the HSP again ignoring masking */
1065 HSP_extend(&nascent_hsp, FALSE);
1066 HSP_store(&nascent_hsp);
1067 /**/
1068 horizon[0][query_frame][target_frame] = HSP_target_end(&nascent_hsp);
1069 return;
1070 }
1071 /* horizon[0] = diag
1072 * horizon[1] = target_end
1073 * horizon[2] = repeat_count
1074 */
1075
1076 /* Need to use the global to pass q,t advance to qsort compare func */
1077 static HSPset *HSPset_seed_compare_hsp_set = NULL;
1078
HSPset_seed_compare(const void * a,const void * b)1079 static int HSPset_seed_compare(const void *a, const void *b){
1080 register guint *seed_a = (guint*)a,
1081 *seed_b = (guint*)b;
1082 register gint diag_a, diag_b;
1083 register HSPset *hsp_set = HSPset_seed_compare_hsp_set;
1084 g_assert(hsp_set);
1085 diag_a = ((seed_a[1] * hsp_set->param->match->query->advance)
1086 - (seed_a[0] * hsp_set->param->match->target->advance)),
1087 diag_b = ((seed_b[1] * hsp_set->param->match->query->advance)
1088 - (seed_b[0] * hsp_set->param->match->target->advance));
1089 if(diag_a == diag_b)
1090 return seed_a[0] - seed_b[0];
1091 return diag_a - diag_b;
1092 }
1093
HSPset_seed_all_hsps(HSPset * hsp_set,guint * seed_list,guint seed_list_len)1094 void HSPset_seed_all_hsps(HSPset *hsp_set,
1095 guint *seed_list, guint seed_list_len){
1096 register gint i;
1097 register gint ***horizon;
1098 register gint qpos, tpos;
1099 if(seed_list_len > 1){
1100 HSPset_seed_compare_hsp_set = hsp_set;
1101 qsort(seed_list, seed_list_len, sizeof(guint) << 1,
1102 HSPset_seed_compare);
1103 HSPset_seed_compare_hsp_set = NULL;
1104 }
1105 if(seed_list_len){
1106 horizon = (gint***)Matrix3d_create(3,
1107 hsp_set->param->match->query->advance,
1108 hsp_set->param->match->target->advance,
1109 sizeof(gint));
1110 for(i = 0; i < seed_list_len; i++){
1111 HSPset_seed_hsp_sorted(hsp_set,
1112 seed_list[(i << 1)],
1113 seed_list[(i << 1) + 1],
1114 horizon);
1115 qpos = seed_list[(i << 1)];
1116 tpos = seed_list[(i << 1) + 1];
1117 }
1118 g_free(horizon);
1119 }
1120 HSPset_finalise(hsp_set);
1121 return;
1122 }
1123
1124 /**/
1125
HSPset_finalise(HSPset * hsp_set)1126 HSPset *HSPset_finalise(HSPset *hsp_set){
1127 register gint i;
1128 register HSP *hsp;
1129 register PQueue *pq;
1130 g_assert(!hsp_set->is_finalised);
1131 hsp_set->is_finalised = TRUE;
1132 if(hsp_set->param->has->filter_threshold && (!hsp_set->is_empty)){
1133 /* Get HSPs from each PQueue */
1134 for(i = 0; i < hsp_set->query->len; i++){
1135 pq = hsp_set->filter[i];
1136 if(pq){
1137 while(PQueue_total(pq)){
1138 hsp = PQueue_pop(pq);
1139 g_ptr_array_add(hsp_set->hsp_list, hsp);
1140 }
1141 }
1142 }
1143 }
1144 /* Set cobs for each HSP */
1145 if(!hsp_set->param->has->filter_threshold){
1146 for(i = 0; i < hsp_set->hsp_list->len; i++){
1147 hsp = hsp_set->hsp_list->pdata[i];
1148 hsp->cobs = HSP_find_cobs(hsp);
1149 }
1150 }
1151 hsp_set->is_finalised = TRUE;
1152 return hsp_set;
1153 }
1154
1155 /**/
1156
HSPset_sort_by_diag_then_query_start(const void * a,const void * b)1157 static int HSPset_sort_by_diag_then_query_start(const void *a,
1158 const void *b){
1159 register HSP **hsp_a = (HSP**)a, **hsp_b = (HSP**)b;
1160 register gint diag_a = HSP_diagonal(*hsp_a),
1161 diag_b = HSP_diagonal(*hsp_b);
1162 if(diag_a == diag_b)
1163 return (*hsp_a)->query_start - (*hsp_b)->query_start;
1164 return diag_a - diag_b;
1165 }
1166
HSP_score_overlap(HSP * left,HSP * right)1167 static Match_Score HSP_score_overlap(HSP *left, HSP *right){
1168 register Match_Score score = 0;
1169 register gint query_pos, target_pos;
1170 g_assert(left->hsp_set == right->hsp_set);
1171 g_assert(HSP_diagonal(left) == HSP_diagonal(right));
1172 query_pos = HSP_query_end(left) - HSP_query_advance(left);
1173 target_pos = HSP_target_end(left) - HSP_target_advance(left);
1174 while(query_pos >= right->query_start){
1175 score += HSP_get_score(left, query_pos, target_pos);
1176 query_pos -= HSP_query_advance(left);
1177 target_pos -= HSP_target_advance(left);
1178 }
1179 query_pos = right->query_start;
1180 target_pos = right->target_start;
1181 while(query_pos < (HSP_query_end(left)- HSP_query_advance(right))){
1182 score += HSP_get_score(right, query_pos, target_pos);
1183 query_pos += HSP_query_advance(right);
1184 target_pos += HSP_target_advance(right);
1185 }
1186 return score;
1187 }
1188 /* Returns score for overlapping region of HSPs on same diagonal */
1189
HSPset_filter_ungapped(HSPset * hsp_set)1190 void HSPset_filter_ungapped(HSPset *hsp_set){
1191 register GPtrArray *new_hsp_list;
1192 register HSP *curr_hsp, *prev_hsp;
1193 register gboolean del_prev, del_curr;
1194 register gint i;
1195 register Match_Score score;
1196 /* Filter strongly overlapping HSPs on same diagonal
1197 * but different frames (happens with 3:3 HSPs only)
1198 */
1199 if((hsp_set->hsp_list->len > 1)
1200 && (hsp_set->param->match->query->advance == 3)
1201 && (hsp_set->param->match->target->advance == 3)){
1202 /* FIXME: should not sort when using all-at-once HSPset */
1203 qsort(hsp_set->hsp_list->pdata, hsp_set->hsp_list->len,
1204 sizeof(gpointer), HSPset_sort_by_diag_then_query_start);
1205 prev_hsp = hsp_set->hsp_list->pdata[0];
1206 del_prev = FALSE;
1207 del_curr = FALSE;
1208 new_hsp_list = g_ptr_array_new();
1209 for(i = 1; i < hsp_set->hsp_list->len; i++){
1210 curr_hsp = hsp_set->hsp_list->pdata[i];
1211 del_curr = FALSE;
1212 if((HSP_diagonal(prev_hsp) == HSP_diagonal(curr_hsp))
1213 && (HSP_query_end(prev_hsp) > curr_hsp->query_start)){
1214 score = HSP_score_overlap(prev_hsp, curr_hsp);
1215 if((score << 1) > (curr_hsp->score + prev_hsp->score)){
1216 /* FIXME: use codon_usage scores here instead */
1217 if(prev_hsp->score < curr_hsp->score){
1218 del_prev = TRUE;
1219 } else {
1220 del_curr = TRUE;
1221 }
1222 }
1223 }
1224 if(del_prev)
1225 HSP_destroy(prev_hsp);
1226 else
1227 g_ptr_array_add(new_hsp_list, prev_hsp);
1228 prev_hsp = curr_hsp;
1229 del_prev = del_curr;
1230 }
1231 if(del_prev)
1232 HSP_destroy(prev_hsp);
1233 else
1234 g_ptr_array_add(new_hsp_list, prev_hsp);
1235 g_ptr_array_free(hsp_set->hsp_list, TRUE);
1236 hsp_set->hsp_list = new_hsp_list;
1237 }
1238 return;
1239 }
1240
1241 /**/
1242
1243 #define HSPset_SList_PAGE_BIT_WIDTH 10
1244 #define HSPset_SList_PAGE_SIZE (1 << HSPset_SList_PAGE_BIT_WIDTH)
1245
HSPset_SList_RecycleBin_create(void)1246 RecycleBin *HSPset_SList_RecycleBin_create(void){
1247 return RecycleBin_create("HSPset_Slist", sizeof(HSPset_SList_Node),
1248 4096);
1249 }
1250
HSPset_SList_append(RecycleBin * recycle_bin,HSPset_SList_Node * next,gint query_pos,gint target_pos)1251 HSPset_SList_Node *HSPset_SList_append(RecycleBin *recycle_bin,
1252 HSPset_SList_Node *next,
1253 gint query_pos, gint target_pos){
1254 register HSPset_SList_Node *node = RecycleBin_alloc(recycle_bin);
1255 node->next = next;
1256 node->query_pos = query_pos;
1257 node->target_pos = target_pos;
1258 return node;
1259 }
1260
1261 #if 0
1262 typedef struct {
1263 gint page_alloc;
1264 gint page_total;
1265 HSPset_SList_Node **diag_page_list;
1266 gint ****horizon;
1267 gint *page_used;
1268 gint page_used_total;
1269 } HSPset_SeedData;
1270
1271 static HSPset_SeedData *HSPset_SeedData_create(HSP_Param *hsp_param,
1272 gint target_len){
1273 register HSPset_SeedData *seed_data = g_new(HSPset_SeedData, 1);
1274 seed_data->page_total = (target_len
1275 >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1276 seed_data->page_alloc = seed_data->page_total;
1277 seed_data->diag_page_list = g_new0(HSPset_SList_Node*, seed_data->page_total);
1278 seed_data->page_used = g_new(gint, seed_data->page_total);
1279 seed_data->horizon = (gint****)Matrix4d_create(
1280 2 + ((hsp_param->seed_repeat > 1)?1:0),
1281 HSPset_SList_PAGE_SIZE,
1282 hsp_param->match->query->advance,
1283 hsp_param->match->target->advance,
1284 sizeof(gint));
1285 seed_data->page_used_total = 0;
1286 return seed_data;
1287 }
1288
1289 static void HSPset_SeedData_destroy(HSPset_SeedData *seed_data){
1290 g_free(seed_data->diag_page_list);
1291 g_free(seed_data->page_used);
1292 g_free(seed_data->horizon);
1293 g_free(seed_data);
1294 return;
1295 }
1296
1297 static void HSPset_SeedData_set_target_len(HSPset_SeedData *seed_data,
1298 HSPset *hsp_set){
1299 register gint new_page_total = (hsp_set->target->len
1300 >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1301 register gint i, a, b, c, d;
1302 seed_data->page_total = new_page_total;
1303 if(seed_data->page_alloc < new_page_total){
1304 seed_data->page_alloc = seed_data->page_total;
1305 g_free(seed_data->diag_page_list);
1306 seed_data->diag_page_list = g_new(HSPset_SList_Node*,
1307 seed_data->page_total);
1308 g_free(seed_data->page_used);
1309 seed_data->page_used = g_new(gint, seed_data->page_total);
1310 }
1311 /* Clear diag_page_list */
1312 for(i = 0; i < seed_data->page_total; i++)
1313 seed_data->diag_page_list[i] = 0;
1314 /* Clear horizon */
1315 for(a = 2 + ((hsp_set->param->seed_repeat > 1)?1:0); a >= 0; a--)
1316 for(b = HSPset_SList_PAGE_SIZE; b >= 0; b--)
1317 for(c = hsp_set->param->match->query->advance; c >= 0; c--)
1318 for(d = hsp_set->param->match->target->advance; d >= 0; d--)
1319 seed_data->horizon[a][b][c][d] = 0;
1320 seed_data->page_used_total = 0;
1321 return;
1322 }
1323 #endif /* 0 */
1324
HSPset_seed_all_qy_sorted(HSPset * hsp_set,HSPset_SList_Node * seed_list)1325 void HSPset_seed_all_qy_sorted(HSPset *hsp_set, HSPset_SList_Node *seed_list){
1326 register gint page_total = (hsp_set->target->len
1327 >> HSPset_SList_PAGE_BIT_WIDTH) + 1;
1328 register HSPset_SList_Node **diag_page_list
1329 = g_new0(HSPset_SList_Node*, page_total);
1330 register gint *page_used = g_new(gint, page_total);
1331 register gint ****horizon = (gint****)Matrix4d_create(
1332 2 + ((hsp_set->param->seed_repeat > 1)?1:0),
1333 HSPset_SList_PAGE_SIZE,
1334 hsp_set->param->match->query->advance,
1335 hsp_set->param->match->target->advance,
1336 sizeof(gint));
1337 /*
1338 register HSPset_SeedData *seed_data = HSPset_SeedData_create(hsp_set->param,
1339 hsp_set->target->len);
1340 */
1341 register gint i, page, diag_pos, query_frame, target_frame,
1342 section_pos, page_pos;
1343 register HSPset_SList_Node *seed;
1344 register gint page_used_total = 0;
1345 HSP nascent_hsp;
1346 /* g_message("[%s] with [%d]", __FUNCTION__, hsp_set->target->len); */
1347 /* HSPset_SeedData_set_target_len(seed_data, hsp_set); */
1348 /* Bin on diagonal into pages */
1349 while(seed_list){
1350 seed = seed_list;
1351 seed_list = seed_list->next;
1352 /**/
1353 diag_pos = (seed->target_pos
1354 * hsp_set->param->match->query->advance)
1355 - (seed->query_pos
1356 * hsp_set->param->match->target->advance);
1357 section_pos = ((diag_pos + hsp_set->target->len)
1358 % hsp_set->target->len);
1359 page = (section_pos >> HSPset_SList_PAGE_BIT_WIDTH);
1360 g_assert(section_pos >= 0);
1361 g_assert(section_pos < hsp_set->target->len);
1362 g_assert(page >= 0);
1363 g_assert(page < page_total);
1364 /**/
1365 if(!diag_page_list[page])
1366 page_used[page_used_total++] = page;
1367 seed->next = diag_page_list[page];
1368 diag_page_list[page] = seed;
1369 }
1370 /* Seed each page using page horizon */
1371 for(i = 0; i < page_used_total; i++){
1372 page = page_used[i];
1373 for(seed = diag_page_list[page]; seed; seed = seed->next){
1374 g_assert((!seed->next)
1375 || (seed->query_pos <= seed->next->query_pos));
1376 diag_pos = (seed->target_pos
1377 * hsp_set->param->match->query->advance)
1378 - (seed->query_pos
1379 * hsp_set->param->match->target->advance);
1380 query_frame = seed->query_pos
1381 % hsp_set->param->match->query->advance;
1382 target_frame = seed->target_pos
1383 % hsp_set->param->match->target->advance;
1384 section_pos = ((diag_pos + hsp_set->target->len)
1385 % hsp_set->target->len);
1386 page_pos = section_pos
1387 - (page << HSPset_SList_PAGE_BIT_WIDTH);
1388 g_assert(page_pos >= 0);
1389 g_assert(page_pos < HSPset_SList_PAGE_SIZE);
1390 /* Clear if page has changed */
1391 if(horizon[1][page_pos][query_frame][target_frame] != page){
1392 horizon[0][page_pos][query_frame][target_frame] = 0;
1393 horizon[1][page_pos][query_frame][target_frame] = page;
1394 if(hsp_set->param->seed_repeat > 1)
1395 horizon[2][page_pos][query_frame][target_frame] = 0;
1396 }
1397 if(seed->query_pos < horizon[0][page_pos][query_frame][target_frame])
1398 continue;
1399 if(hsp_set->param->seed_repeat > 1){
1400 if(++horizon[2][page_pos][query_frame][target_frame]
1401 < hsp_set->param->seed_repeat){
1402 continue;
1403 }
1404 horizon[2][page_pos][query_frame][target_frame] = 0;
1405 }
1406 /* Nascent HSP building: */
1407 nascent_hsp.hsp_set = hsp_set;
1408 nascent_hsp.query_start = seed->query_pos;
1409 nascent_hsp.target_start = seed->target_pos;
1410 nascent_hsp.length = hsp_set->param->seedlen;
1411 nascent_hsp.cobs = 0;
1412 g_assert(HSP_check(&nascent_hsp));
1413 HSP_trim_ends(&nascent_hsp);
1414 /* Score is irrelevant before HSP_init() */
1415 HSP_init(&nascent_hsp);
1416 /* Try to make above threshold HSP using masking */
1417 if(hsp_set->param->match->query->mask_func
1418 || hsp_set->param->match->target->mask_func){
1419 HSP_extend(&nascent_hsp, TRUE);
1420 if(nascent_hsp.score < hsp_set->param->threshold){
1421 horizon[0][page_pos][query_frame][target_frame]
1422 = HSP_query_end(&nascent_hsp);
1423 continue;
1424 }
1425 }
1426 /* Extend the HSP again ignoring masking */
1427 HSP_extend(&nascent_hsp, FALSE);
1428 HSP_store(&nascent_hsp);
1429 /**/
1430 horizon[0][page_pos][query_frame][target_frame]
1431 = HSP_query_end(&nascent_hsp);
1432 }
1433 }
1434 g_free(diag_page_list);
1435 g_free(page_used);
1436 g_free(horizon);
1437 HSPset_finalise(hsp_set);
1438 /* HSPset_SeedData_destroy(seed_data); */
1439 return;
1440 }
1441 /* horizon[horizon][mailbox][seed_repeat]
1442 * [page_size][qadv][tadv]
1443 */
1444
1445 /**/
1446
1447