1 /****************************************************************\
2 * *
3 * Analysis module for exonerate *
4 * *
5 * Guy St.C. Slater.. mailto:guy@ebi.ac.uk *
6 * Copyright (C) 2000-2009. All Rights Reserved. *
7 * *
8 * This source code is distributed under the terms of the *
9 * GNU General Public License, version 3. See the file COPYING *
10 * or http://www.gnu.org/licenses/gpl.txt for details *
11 * *
12 * If you use this code, please keep this notice intact. *
13 * *
14 \****************************************************************/
15
16 #include "analysis.h"
17 #include "ungapped.h"
18 #include "compoundfile.h"
19 #include "hspset.h"
20 #include "lineparse.h"
21
22 #include <stdlib.h> /* For atoi() */
23 #include <unistd.h> /* For sleep() */
24 #include <string.h> /* For strcmp() */
25 #include <ctype.h> /* For isdigit() */
26
27 #include <sys/types.h> /* For stat() */
28 #include <sys/stat.h> /* For stat() */
29 #include <unistd.h> /* For stat() */
30
Analysis_ArgumentSet_create(Argument * arg)31 Analysis_ArgumentSet *Analysis_ArgumentSet_create(Argument *arg){
32 register ArgumentSet *as;
33 static Analysis_ArgumentSet aas;
34 if(arg){
35 as = ArgumentSet_create("Analysis Options");
36 ArgumentSet_add_option(as, 'E', "exhaustive", NULL,
37 "Perform exhaustive alignment (slow)", "FALSE",
38 Argument_parse_boolean, &aas.use_exhaustive);
39 ArgumentSet_add_option(as, 'B', "bigseq", NULL,
40 "Allow rapid comparison between big sequences", "FALSE",
41 Argument_parse_boolean, &aas.use_bigseq);
42 ArgumentSet_add_option(as, 'r', "revcomp", NULL,
43 "Also search reverse complement of query and target", "TRUE",
44 Argument_parse_boolean, &aas.use_revcomp);
45 ArgumentSet_add_option(as, '\0', "forcescan", "[q|t]",
46 "Force FSM scan on query or target sequences", "none",
47 Argument_parse_string, &aas.force_scan);
48 /**/
49 ArgumentSet_add_option(as, 0, "saturatethreshold", "int",
50 "Word saturation threshold", "0",
51 Argument_parse_int, &aas.saturate_threshold);
52 /**/
53 ArgumentSet_add_option(as, 0, "customserver", "command",
54 "Custom command to send non-standard server", "NULL",
55 Argument_parse_string, &aas.custom_server_command);
56 ArgumentSet_add_option(as, 'c', "cores", "number",
57 "Number of cores/CPUs/threads for alignment computation", "1",
58 Argument_parse_int, &aas.thread_count);
59 Argument_absorb_ArgumentSet(arg, as);
60 }
61 return &aas;
62 }
63
64 /**/
65
66 typedef struct {
67 GAM *gam;
68 Comparison *comparison;
69 } Analysis_HeuristicJob;
70
Analysis_HeuristicJob_create(GAM * gam,Comparison * comparison)71 static Analysis_HeuristicJob *Analysis_HeuristicJob_create(GAM *gam,
72 Comparison *comparison){
73 register Analysis_HeuristicJob *ahj = g_new(Analysis_HeuristicJob, 1);
74 ahj->gam = GAM_share(gam);
75 ahj->comparison = Comparison_share(comparison);
76 return ahj;
77 }
78
Analysis_HeuristicJob_destroy(Analysis_HeuristicJob * ahj)79 static void Analysis_HeuristicJob_destroy(Analysis_HeuristicJob *ahj){
80 Comparison_destroy(ahj->comparison);
81 GAM_destroy(ahj->gam);
82 g_free(ahj);
83 return;
84 }
85
Analysis_HeuristicJob_run(gpointer data)86 static void Analysis_HeuristicJob_run(gpointer data){
87 register Analysis_HeuristicJob *ahj = data;
88 register GAM_Result *gam_result
89 = GAM_Result_heuristic_create(ahj->gam, ahj->comparison);
90 if(gam_result){
91 GAM_Result_submit(gam_result);
92 GAM_Result_destroy(gam_result);
93 }
94 Analysis_HeuristicJob_destroy(ahj);
95 return;
96 }
97
Analysis_report_func(Comparison * comparison,gpointer user_data)98 static void Analysis_report_func(Comparison *comparison,
99 gpointer user_data){
100 register Analysis *analysis = user_data;
101 register GAM_Result *gam_result;
102 register Analysis_HeuristicJob *ahj;
103 g_assert(Comparison_has_hsps(comparison));
104 if(analysis->scan_query){
105 /* Swap back query and target after a query scan */
106 Comparison_swap(comparison);
107 } else {
108 /* Revert target scan alignments to +ve query strand when possible */
109 if((comparison->query->alphabet->type == Alphabet_Type_DNA)
110 && (comparison->target->alphabet->type == Alphabet_Type_DNA)
111 && (comparison->query->strand == Sequence_Strand_REVCOMP)
112 && (comparison->target->strand == Sequence_Strand_FORWARD)
113 && (!analysis->gam->translate_both))
114 Comparison_revcomp(comparison);
115 }
116 if(Model_Type_is_gapped(analysis->gam->gas->type)){
117 ahj = Analysis_HeuristicJob_create(analysis->gam, comparison);
118 #ifdef USE_PTHREADS
119 g_assert(analysis->job_queue);
120 /* FIXME: need to use priority here */
121 JobQueue_submit(analysis->job_queue, Analysis_HeuristicJob_run, ahj, 2);
122 #else /* USE_PTHREADS */
123 Analysis_HeuristicJob_run(ahj);
124 #endif /* USE_PTHREADS */
125 } else {
126 gam_result = GAM_Result_ungapped_create(analysis->gam,
127 comparison);
128 if(gam_result){
129 GAM_Result_submit(gam_result);
130 GAM_Result_destroy(gam_result);
131 }
132 }
133 return;
134 }
135
136 /**/
137
Analysis_FastaPipe_Pair_init_func(gpointer user_data)138 static void Analysis_FastaPipe_Pair_init_func(gpointer user_data){
139 register Analysis *analysis = user_data;
140 g_assert(!analysis->curr_query);
141 return;
142 }
143 /* Called before query pipeline loading */
144
Analysis_FastaPipe_Pair_prep_func(gpointer user_data)145 static void Analysis_FastaPipe_Pair_prep_func(gpointer user_data){
146 register Analysis *analysis = user_data;
147 g_assert(analysis->curr_query);
148 return;
149 }
150 /* Called after query pipeline loading */
151
Analysis_FastaPipe_Pair_term_func(gpointer user_data)152 static void Analysis_FastaPipe_Pair_term_func(gpointer user_data){
153 register Analysis *analysis = user_data;
154 FastaDB_Seq_destroy(analysis->curr_query);
155 analysis->curr_query = NULL;
156 return;
157 }
158 /* Called after query pipeline analysis */
159
Analysis_FastaPipe_Pair_query_func(FastaDB_Seq * fdbs,gpointer user_data)160 static gboolean Analysis_FastaPipe_Pair_query_func(FastaDB_Seq *fdbs,
161 gpointer user_data){
162 register Analysis *analysis = user_data;
163 g_assert(!analysis->curr_query);
164 /*
165 if(analysis->aas->use_exhaustive)
166 g_strup(fdbs->seq->seq);
167 */
168 if(analysis->verbosity > 1)
169 g_message("Load query for pairwise comparision [%s] (%d)",
170 fdbs->seq->id, fdbs->seq->len);
171 analysis->curr_query = FastaDB_Seq_share(fdbs);
172 return TRUE; /* take queries one at a time */
173 }
174 /* Called on query loading */
175
Analysis_BSAM_compare(Analysis * analysis,FastaDB_Seq * query,FastaDB_Seq * target)176 static void Analysis_BSAM_compare(Analysis *analysis,
177 FastaDB_Seq *query,
178 FastaDB_Seq *target){
179 register Comparison *comparison = BSAM_compare(analysis->bsam,
180 query->seq,
181 target->seq);
182 if(comparison){
183 if(Comparison_has_hsps(comparison))
184 Analysis_report_func(comparison, analysis);
185 Comparison_destroy(comparison);
186 }
187 return;
188 }
189
190 /**/
191
192 typedef struct {
193 GAM *gam;
194 Sequence *query;
195 Sequence *target;
196 } Analysis_ExhaustiveJob;
197
Analysis_ExhaustiveJob_create(GAM * gam,Sequence * query,Sequence * target)198 static Analysis_ExhaustiveJob *Analysis_ExhaustiveJob_create(GAM *gam,
199 Sequence *query, Sequence *target){
200 register Analysis_ExhaustiveJob *aej = g_new(Analysis_ExhaustiveJob, 1);
201 aej->gam = GAM_share(gam);
202 aej->query = Sequence_share(query);
203 aej->target = Sequence_share(target);
204 return aej;
205 }
206
Analysis_ExhaustiveJob_destroy(Analysis_ExhaustiveJob * aej)207 static void Analysis_ExhaustiveJob_destroy(Analysis_ExhaustiveJob *aej){
208 GAM_destroy(aej->gam);
209 Sequence_destroy(aej->query);
210 Sequence_destroy(aej->target);
211 g_free(aej);
212 return;
213 }
214
Analysis_ExhaustiveJob_run(gpointer data)215 static void Analysis_ExhaustiveJob_run(gpointer data){
216 register Analysis_ExhaustiveJob *aej = data;
217 register GAM_Result *gam_result;
218 gam_result = GAM_Result_exhaustive_create(aej->gam,
219 aej->query, aej->target);
220 if(gam_result){
221 GAM_Result_submit(gam_result);
222 GAM_Result_destroy(gam_result);
223 }
224 Analysis_ExhaustiveJob_destroy(aej);
225 return;
226 }
227
Analysis_Pair_compare(Analysis * analysis,FastaDB_Seq * fdbs)228 static void Analysis_Pair_compare(Analysis *analysis,
229 FastaDB_Seq *fdbs){
230 register Analysis_ExhaustiveJob *aej;
231 if(analysis->aas->use_exhaustive){
232 aej = Analysis_ExhaustiveJob_create(analysis->gam,
233 analysis->curr_query->seq,
234 fdbs->seq);
235 #ifdef USE_PTHREADS
236 g_assert(analysis->job_queue);
237 JobQueue_submit(analysis->job_queue, Analysis_ExhaustiveJob_run, aej, 1);
238 #else /* USE_PTHREADS */
239 Analysis_ExhaustiveJob_run(aej);
240 #endif /* USE_PTHREADS */
241 } else {
242 Analysis_BSAM_compare(analysis, analysis->curr_query, fdbs);
243 }
244 return;
245 }
246
247 /**/
248
Analysis_FastaPipe_Pair_target_func(FastaDB_Seq * fdbs,gpointer user_data)249 static gboolean Analysis_FastaPipe_Pair_target_func(FastaDB_Seq *fdbs,
250 gpointer user_data){
251 register Analysis *analysis = user_data;
252 g_assert(analysis->gam);
253 g_assert(analysis->curr_query);
254 /*
255 if(analysis->aas->use_exhaustive)
256 g_strup(fdbs->seq->seq);
257 */
258 if(analysis->verbosity > 1)
259 g_message("Load target for pairwise comparison [%s] (%d)",
260 fdbs->seq->id, fdbs->seq->len);
261 /**/
262 Analysis_Pair_compare(analysis, fdbs);
263 return FALSE;
264 }
265 /* Called on target loading */
266
267 /**/
268
Analysis_FastaPipe_Seeder_init_func(gpointer user_data)269 static void Analysis_FastaPipe_Seeder_init_func(gpointer user_data){
270 register Analysis *analysis = user_data;
271 g_assert(!analysis->curr_seeder);
272 register Comparison_Param *comparison_param
273 = Comparison_Param_share(analysis->comparison_param);
274 if(analysis->scan_query)
275 comparison_param = Comparison_Param_swap(comparison_param);
276 analysis->curr_seeder = Seeder_create(analysis->verbosity,
277 comparison_param, analysis->aas->saturate_threshold,
278 Analysis_report_func, analysis);
279 Comparison_Param_destroy(comparison_param);
280 return;
281 }
282 /* Called before query pipeline loading */
283
Analysis_FastaPipe_Seeder_prep_func(gpointer user_data)284 static void Analysis_FastaPipe_Seeder_prep_func(gpointer user_data){
285 /* does nothing */
286 return;
287 }
288 /* Called after query pipeline loading */
289
Analysis_FastaPipe_Seeder_term_func(gpointer user_data)290 static void Analysis_FastaPipe_Seeder_term_func(gpointer user_data){
291 register Analysis *analysis = user_data;
292 Seeder_destroy(analysis->curr_seeder);
293 if(analysis->verbosity > 2){
294 g_message("### Seeder destroyed ###");
295 RecycleBin_profile();
296 }
297 analysis->curr_seeder = NULL;
298 return;
299 }
300 /* Called after query pipeline analysis */
301
Analysis_FastaPipe_Seeder_query_func(FastaDB_Seq * fdbs,gpointer user_data)302 static gboolean Analysis_FastaPipe_Seeder_query_func(FastaDB_Seq *fdbs,
303 gpointer user_data){
304 register Analysis *analysis = user_data;
305 if(analysis->verbosity > 1)
306 g_message("Load query for Seeder [%s] (%d)",
307 fdbs->seq->id, fdbs->seq->len);
308 return Seeder_add_query(analysis->curr_seeder, fdbs->seq);
309 }
310 /* Called on query loading */
311
Analysis_FastaPipe_Seeder_target_func(FastaDB_Seq * fdbs,gpointer user_data)312 static gboolean Analysis_FastaPipe_Seeder_target_func(FastaDB_Seq *fdbs,
313 gpointer user_data){
314 register Analysis *analysis = user_data;
315 if(analysis->verbosity > 1)
316 g_message("Load target for Seeder [%s] (%d)",
317 fdbs->seq->id, fdbs->seq->len);
318 Seeder_add_target(analysis->curr_seeder, fdbs->seq);
319 return FALSE;
320 }
321 /* Called on target loading */
322
323 /**/
324
Analysis_decide_scan_query(FastaDB * query_fdb,FastaDB * target_fdb,gchar * force_scan)325 static gboolean Analysis_decide_scan_query(FastaDB *query_fdb,
326 FastaDB *target_fdb,
327 gchar *force_scan){
328 register CompoundFile_Pos query_size, target_size;
329 if(!g_strcasecmp(force_scan, "none")){
330 query_size = CompoundFile_get_length(query_fdb->cf);
331 target_size = CompoundFile_get_length(target_fdb->cf);
332 if((query_size >> 4) < target_size)
333 return FALSE;
334 else
335 return TRUE;
336 } else if((!g_strcasecmp(force_scan, "query"))
337 || (!g_strcasecmp(force_scan, "q"))){
338 return TRUE;
339 } else if((!g_strcasecmp(force_scan, "target"))
340 || (!g_strcasecmp(force_scan, "t"))){
341 return FALSE;
342 } else {
343 g_error("Unknown force_scan command [%s]", force_scan);
344 }
345 return FALSE; /* not reached */
346 }
347
Analysis_find_matches(Analysis * analysis,Match ** dna_match,Match ** protein_match,Match ** codon_match)348 static void Analysis_find_matches(Analysis *analysis,
349 Match **dna_match, Match **protein_match,
350 Match **codon_match){
351 register GPtrArray *transition_list
352 = C4_Model_select_transitions(analysis->gam->model,
353 C4_Label_MATCH);
354 register gint i;
355 register C4_Transition *transition;
356 register Match *match;
357 for(i = 0; i < transition_list->len; i++){
358 transition = transition_list->pdata[i];
359 g_assert(transition);
360 g_assert(transition->label == C4_Label_MATCH);
361 match = transition->label_data;
362 if(match){
363 switch(match->type){
364 case Match_Type_DNA2DNA:
365 if((*dna_match) && ((*dna_match) != match))
366 g_error("Multiple DNA matches not implemented");
367 (*dna_match) = match;
368 break;
369 case Match_Type_PROTEIN2PROTEIN:
370 case Match_Type_DNA2PROTEIN:
371 case Match_Type_PROTEIN2DNA:
372 if((*protein_match) && ((*protein_match) != match))
373 g_error("Multiple protein matches"
374 " not supported");
375 (*protein_match) = match;
376 break;
377 case Match_Type_CODON2CODON:
378 if((*codon_match) && ((*codon_match) != match))
379 g_error("Multiple codon matches"
380 " not implemented");
381 (*codon_match) = match;
382 break;
383 default:
384 break;
385 }
386 }
387 }
388 g_ptr_array_free(transition_list, TRUE);
389 return;
390 }
391
392 /**/
393
Analysis_Client_connect(gchar * path)394 static SocketClient *Analysis_Client_connect(gchar *path){
395 register SocketClient *sc;
396 register gint i, divider = 0, port;
397 register gchar *server;
398 register gint connection_attempts = 10;
399 for(i = 0; path[i]; i++)
400 if(path[i] == ':'){
401 divider = i;
402 break;
403 }
404 if(divider){
405 port = atoi(path+divider+1);
406 server = g_strndup(path, divider);
407 for(i = connection_attempts; i >= 0; i--){
408 sc = SocketClient_create(server, port);
409 if(sc)
410 break;
411 g_warning("Failed connection to server (retrys left: %d", i);
412 sleep(1);
413 }
414 g_free(server);
415 return sc;
416 }
417 return NULL;
418 }
419
Analysis_Client_send(Analysis_Client * aclient,gchar * msg,gchar * expect,gboolean multi_line_reply)420 static gchar *Analysis_Client_send(Analysis_Client *aclient, gchar *msg,
421 gchar *expect, gboolean multi_line_reply){
422 register gchar *reply = SocketClient_send(aclient->sc, msg);
423 register gchar *line, *p = reply, *processed_reply;
424 register gint line_count = 0;
425 register GString *str = g_string_sized_new(64);
426 if(aclient->verbosity >= 3){
427 g_print("Message: client sent message [%s]\n", msg);
428 if((aclient->verbosity == 3) && (strlen(reply) >= 80))
429 g_print("Message: client received reply [%.*s<truncated>] \n",
430 80, reply);
431 else
432 g_print("Message: client received reply [%s]\n", reply);
433 }
434 do {
435 while(*p){
436 if(!isspace(*p)) /* Strip blank lines */
437 break;
438 p++;
439 }
440 line = p;
441 if(!strncmp(p, "error:", 6))
442 g_error("Error from server: [%s]", p);
443 if(!strncmp(p, "warning:", 8)){
444 g_warning("Warning from server: [%s]", p);
445 } else if(!strncmp(p, "linecount:", 8)){
446 while(*p){ /* Skip to end of line */
447 if(*p == '\n')
448 break;
449 p++;
450 }
451 } else if(!strncmp(p, expect, strlen(expect))){
452 while(*p){ /* Skip to end of line */
453 if(*p == '\n')
454 break;
455 p++;
456 }
457 line_count++;
458 *p = '\0';
459 g_string_append(str, line);
460 g_string_append_c(str, '\n');
461 *p = '\n';
462 } else {
463 if(!*p)
464 break;
465 g_error("Unexpected line from server [%s]", p);
466 }
467 } while(TRUE);
468 g_free(reply);
469 if(!line_count)
470 g_error("No reply received from server msg=[%s]", msg);
471 if((!multi_line_reply) && (line_count > 1))
472 g_error("Unexpected multi-line reply from server");
473 processed_reply = str->str;
474 g_string_free(str, FALSE);
475 return processed_reply;
476 }
477
Analysis_Client_set_param(Analysis_Client * aclient,GAM * gam)478 static void Analysis_Client_set_param(Analysis_Client *aclient, GAM *gam){
479 register HSPset_ArgumentSet *has = HSPset_ArgumentSet_create(NULL);
480 register Analysis_ArgumentSet *aas = Analysis_ArgumentSet_create(NULL);
481 register gchar *msg, *reply;
482 /**/
483 if(aas->custom_server_command){
484 msg = g_strdup_printf("%s\n", aas->custom_server_command);
485 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
486 g_free(msg);
487 g_free(reply);
488 }
489 /**/
490 msg = g_strdup_printf("set param seedrepeat %d",
491 has->seed_repeat);
492 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
493 g_free(msg);
494 g_free(reply);
495 /**/
496 msg = g_strdup_printf("set param dnahspthreshold %d",
497 has->dna_hsp_threshold);
498 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
499 g_free(msg);
500 g_free(reply);
501 /**/
502 msg = g_strdup_printf("set param proteinhspthreshold %d",
503 has->protein_hsp_threshold);
504 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
505 g_free(msg);
506 g_free(reply);
507 /**/
508 msg = g_strdup_printf("set param codonhspthreshold %d",
509 has->codon_hsp_threshold);
510 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
511 g_free(msg);
512 g_free(reply);
513 /**/
514 msg = g_strdup_printf("set param dnawordlimit %d",
515 has->dna_word_limit);
516 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
517 g_free(msg);
518 g_free(reply);
519 /**/
520 msg = g_strdup_printf("set param proteinwordlimit %d",
521 has->protein_word_limit);
522 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
523 g_free(msg);
524 g_free(reply);
525 /**/
526 msg = g_strdup_printf("set param codonwordlimit %d",
527 has->codon_word_limit);
528 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
529 g_free(msg);
530 g_free(reply);
531 /**/
532 msg = g_strdup_printf("set param dnahspdropoff %d",
533 has->dna_hsp_dropoff);
534 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
535 g_free(msg);
536 g_free(reply);
537 /**/
538 msg = g_strdup_printf("set param proteinhspdropoff %d",
539 has->protein_hsp_dropoff);
540 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
541 g_free(msg);
542 g_free(reply);
543 /**/
544 msg = g_strdup_printf("set param codonhspdropoff %d",
545 has->codon_hsp_dropoff);
546 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
547 g_free(msg);
548 g_free(reply);
549 /**/
550 msg = g_strdup_printf("set param geneseedthreshold %d",
551 has->geneseed_threshold);
552 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
553 g_free(msg);
554 g_free(reply);
555 /**/
556 msg = g_strdup_printf("set param geneseedrepeat %d",
557 has->geneseed_repeat);
558 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
559 g_free(msg);
560 g_free(reply);
561 /**/
562 msg = g_strdup_printf("set param maxqueryspan %d",
563 gam->max_query_span);
564 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
565 g_free(msg);
566 g_free(reply);
567 /**/
568 msg = g_strdup_printf("set param maxtargetspan %d",
569 gam->max_target_span);
570 reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
571 g_free(msg);
572 g_free(reply);
573 /**/
574 return;
575 }
576
Analysis_Client_info(Analysis_Client * aclient)577 static void Analysis_Client_info(Analysis_Client *aclient){
578 return;
579 }
580
Analysis_Client_create(gchar * path,gint verbosity)581 static Analysis_Client *Analysis_Client_create(gchar *path, gint verbosity){
582 register Analysis_Client *aclient;
583 register SocketClient *sc = Analysis_Client_connect(path);
584 register Alphabet_Type alphabet_type;
585 register gchar *dbinfo, **dbinfo_word;
586 if(!sc)
587 return NULL;
588 aclient = g_new(Analysis_Client, 1);
589 aclient->ref_count = 1;
590 aclient->sc = sc;
591 aclient->verbosity = verbosity;
592 aclient->probe_fdb = NULL;
593 dbinfo = Analysis_Client_send(aclient, "dbinfo", "dbinfo:", FALSE);
594 dbinfo_word = g_strsplit(dbinfo, " ", 8);
595 /**/
596 g_assert(dbinfo_word[0]);
597 g_assert(dbinfo_word[1]);
598 g_assert(dbinfo_word[2]);
599 g_assert(dbinfo_word[3]);
600 g_assert(dbinfo_word[4]);
601 /**/
602 alphabet_type = Alphabet_Type_UNKNOWN;
603 if(!strcmp(dbinfo_word[1], "dna"))
604 alphabet_type = Alphabet_Type_DNA;
605 if(!strcmp(dbinfo_word[1], "protein"))
606 alphabet_type = Alphabet_Type_PROTEIN;
607 /**/
608 aclient->is_masked = FALSE;
609 if(!strcmp(dbinfo_word[2], "softmasked"))
610 aclient->is_masked = TRUE;
611 else if(!strcmp(dbinfo_word[2], "unmasked"))
612 aclient->is_masked = TRUE;
613 else
614 g_error("Unrecognised dbinfo masking state [%s]", dbinfo_word[2]);
615 /**/
616 aclient->server_alphabet = Alphabet_create(alphabet_type, aclient->is_masked);
617 aclient->num_seqs = atoll(dbinfo_word[3]);
618 aclient->max_seq_len = atoll(dbinfo_word[4]);
619 aclient->total_seq_len = atoll(dbinfo_word[5]);
620 /**/
621 g_strfreev(dbinfo_word);
622 g_free(dbinfo);
623 aclient->curr_query = NULL;
624 aclient->seq_cache = g_new0(Sequence*, aclient->num_seqs);
625 Analysis_Client_info(aclient);
626 return aclient;
627 }
628
Analysis_Client_share(Analysis_Client * aclient)629 static Analysis_Client *Analysis_Client_share(Analysis_Client *aclient){
630 aclient->ref_count++;
631 return aclient;
632 }
633
Analysis_Client_destroy(Analysis_Client * aclient)634 static void Analysis_Client_destroy(Analysis_Client *aclient){
635 register gint i;
636 register Sequence *seq;
637 if(--aclient->ref_count)
638 return;
639 if(aclient->curr_query)
640 Sequence_destroy(aclient->curr_query);
641 for(i = 0; i < aclient->num_seqs; i++){
642 seq = aclient->seq_cache[i];
643 if(seq)
644 Sequence_destroy(seq);
645 }
646 g_free(aclient->seq_cache);
647 if(aclient->probe_fdb)
648 FastaDB_close(aclient->probe_fdb);
649 SocketClient_destroy(aclient->sc);
650 Alphabet_destroy(aclient->server_alphabet);
651 g_free(aclient);
652 return;
653 }
654
Analysis_Client_set_probe_fdb(Analysis_Client * aclient,FastaDB * probe_fdb)655 static void Analysis_Client_set_probe_fdb(Analysis_Client *aclient,
656 FastaDB *probe_fdb){
657 g_assert(!aclient->probe_fdb);
658 aclient->probe_fdb = FastaDB_dup(probe_fdb);
659 return;
660 }
661
Analysis_Client_set_query(Analysis_Client * aclient,Sequence * seq)662 static void Analysis_Client_set_query(Analysis_Client *aclient, Sequence *seq){
663 register gchar *seq_str = Sequence_get_str(seq);
664 register gchar *msg = g_strdup_printf("set query %s", seq_str);
665 register gchar *reply = Analysis_Client_send(aclient, msg, "ok:", FALSE);
666 register gchar **word;
667 register gint len, checksum;
668 if(strncmp(reply, "ok:", 3))
669 g_error("Could not set query [%s] on server", seq->id);
670 word = g_strsplit(reply+4, " ", 4);
671 len = atoi(word[0]);
672 checksum = atoi(word[1]);
673 if(seq->len != len)
674 g_error("Query length mismatch on server %d %d", seq->len, len);
675 if(Sequence_checksum(seq) != checksum)
676 g_error("Query checksum mismatch on server %d %d",
677 Sequence_checksum(seq), checksum);
678 g_strfreev(word);
679 g_free(reply);
680 g_free(msg);
681 g_free(seq_str);
682 if(aclient->curr_query)
683 Sequence_destroy(aclient->curr_query);
684 aclient->curr_query = Sequence_share(seq);
685 return;
686 }
687
Analysis_Client_revcomp_query(Analysis_Client * aclient)688 static void Analysis_Client_revcomp_query(Analysis_Client *aclient){
689 register gchar *reply = Analysis_Client_send(aclient,
690 "revcomp query", "ok: query strand revcomp", FALSE);
691 register Sequence *curr_query;
692 curr_query = aclient->curr_query;
693 aclient->curr_query = Sequence_revcomp(aclient->curr_query);
694 Sequence_destroy(curr_query);
695 g_free(reply);
696 return;
697 }
698
Analysis_Client_revcomp_target(Analysis_Client * aclient)699 static void Analysis_Client_revcomp_target(Analysis_Client *aclient){
700 register gchar *reply = Analysis_Client_send(aclient,
701 "revcomp target", "ok: target strand", FALSE);
702 g_free(reply);
703 return;
704 }
705
706 /**/
707
708 typedef struct {
709 Analysis_Client *aclient;
710 gint target_id;
711 gint seq_len;
712 } Analysis_Client_Key;
713
Analysis_Client_Key_create(Analysis_Client * aclient,gint target_id,gint seq_len)714 static Analysis_Client_Key *Analysis_Client_Key_create(Analysis_Client *aclient,
715 gint target_id, gint seq_len){
716 register Analysis_Client_Key *key = g_new(Analysis_Client_Key, 1);
717 key->aclient = Analysis_Client_share(aclient);
718 key->target_id = target_id;
719 key->seq_len = seq_len;
720 return key;
721 }
722
Analysis_Client_Key_destroy(Analysis_Client_Key * key)723 static void Analysis_Client_Key_destroy(Analysis_Client_Key *key){
724 Analysis_Client_destroy(key->aclient);
725 g_free(key);
726 return;
727 }
728
Analysis_Client_SparseCache_get_func(gint pos,gpointer page_data,gpointer user_data)729 static gpointer Analysis_Client_SparseCache_get_func(gint pos,
730 gpointer page_data,
731 gpointer user_data){
732 return GINT_TO_POINTER((gint)((gchar*)page_data)[pos]);
733 }
734
Analysis_Client_SparseCache_fill_func(gint start,gpointer user_data)735 static SparseCache_Page *Analysis_Client_SparseCache_fill_func(gint start,
736 gpointer user_data){
737 register Analysis_Client_Key *key = user_data;
738 register SparseCache_Page *page = g_new(SparseCache_Page, 1);
739 register gint len = MIN(SparseCache_PAGE_SIZE, key->seq_len-start),
740 page_len;
741 register gchar *msg = g_strdup_printf("get subseq %d %d %d",
742 key->target_id, start, len);
743 register gchar *reply = Analysis_Client_send(key->aclient, msg,
744 "subseq:", FALSE);
745 if(strncmp(reply, "subseq:", 7))
746 g_error("Failed to get subseq for target (%d,%d,%d) [%s]",
747 key->target_id, start, len, reply);
748 page->get_func = Analysis_Client_SparseCache_get_func;
749 page->copy_func = NULL;
750 page_len = strlen(reply+8)-1;
751 page->data = g_strndup(reply+8, page_len);
752 page->data_size = sizeof(gchar)*page_len;
753 g_free(msg);
754 g_free(reply);
755 FastaDB_SparseCache_compress(page, page_len);
756 return page;
757 }
758 /* FIXME: move compression stuff to SeqPage in Sequence */
759
Analysis_Client_SparseCache_free_func(gpointer user_data)760 static void Analysis_Client_SparseCache_free_func(gpointer user_data){
761 register Analysis_Client_Key *key = user_data;
762 Analysis_Client_Key_destroy(key);
763 return;
764 }
765
Analysis_Client_get_SparseCache(Analysis_Client * aclient,gint sequence_id,gint len)766 static SparseCache *Analysis_Client_get_SparseCache(
767 Analysis_Client *aclient,
768 gint sequence_id, gint len){
769 register Analysis_Client_Key *key
770 = Analysis_Client_Key_create(aclient, sequence_id, len);
771 return SparseCache_create(len, Analysis_Client_SparseCache_fill_func,
772 NULL, Analysis_Client_SparseCache_free_func, key);
773 }
774
Analysis_Client_get_Sequence(Analysis_Client * aclient,gint sequence_id,gboolean revcomp_target)775 static Sequence *Analysis_Client_get_Sequence(Analysis_Client *aclient,
776 gint sequence_id,
777 gboolean revcomp_target){
778 register gchar *msg, *reply, *id, *def;
779 register SparseCache *cache;
780 register Sequence *seq = aclient->seq_cache[sequence_id];
781 register gint len, checksum;
782 register gchar **seqinfo_word;
783 if(seq){
784 if(revcomp_target)
785 return Sequence_revcomp(seq);
786 return Sequence_share(seq);
787 }
788 msg = g_strdup_printf("get info %d", sequence_id);
789 reply = Analysis_Client_send(aclient, msg, "seqinfo:", FALSE);
790 /**/
791 if(strncmp(reply, "seqinfo:", 8))
792 g_error("Failed to set info for target [%d]", sequence_id);
793 /* parse seqinfo for <len> <checksum> <id> and [<def>] */
794 seqinfo_word = g_strsplit(reply+9, " ", 4);
795 len = atoi(seqinfo_word[0]);
796 checksum = atoi(seqinfo_word[1]);
797 id = seqinfo_word[2];
798 def = seqinfo_word[3];
799 /* Strip any trailing newlines */
800 if(id[strlen(id)-1] == '\n')
801 id[strlen(id)-1] = '\0';
802 if(def && (def[strlen(def)-1] == '\n'))
803 def[strlen(def)-1] = '\0';
804 /**/
805 cache = Analysis_Client_get_SparseCache(aclient, sequence_id, len);
806 seq = Sequence_create_extmem(id, def, len,
807 (aclient->server_alphabet->type == Alphabet_Type_DNA)
808 ?Sequence_Strand_FORWARD:Sequence_Strand_UNKNOWN,
809 aclient->server_alphabet, cache);
810 g_assert(!aclient->seq_cache[sequence_id]);
811 aclient->seq_cache[sequence_id] = seq;
812 g_strfreev(seqinfo_word);
813 SparseCache_destroy(cache);
814 g_free(reply);
815 g_free(msg);
816 if(revcomp_target)
817 return Sequence_revcomp(seq);
818 return Sequence_share(seq);
819 }
820
821 typedef enum {
822 Analysis_Client_HSP_TOKEN_BEGIN_SET,
823 Analysis_Client_HSP_TOKEN_END_SET,
824 Analysis_Client_HSP_TOKEN_INT,
825 Analysis_Client_HSP_TOKEN_FINISH
826 } Analysis_Client_HSP_TOKEN;
827
Analysis_Client_get_hsp_token(gchar * str,gint * pos,gint * intval)828 static Analysis_Client_HSP_TOKEN Analysis_Client_get_hsp_token(gchar *str,
829 gint *pos, gint *intval){
830 register gint ch;
831 gchar *endptr;
832 g_assert(str);
833 while((ch = str[(*pos)])){
834 switch(ch){
835 case 'h':
836 if(strncmp(str+(*pos), "hspset:", 7))
837 g_error("Unexpected string in HSPset list");
838 (*pos) += 7;
839 if(strncmp(str+(*pos), " empty\n", 7)){
840 return Analysis_Client_HSP_TOKEN_BEGIN_SET;
841 } else {
842 (*pos) += 7;
843 break;
844 }
845 case ' ':
846 (*pos)++;
847 break;
848 case '\n':
849 (*pos)++;
850 return Analysis_Client_HSP_TOKEN_END_SET;
851 default:
852 if(isdigit(ch)){
853 (*intval) = strtol(str+(*pos), &endptr, 10);
854 (*pos) = endptr-str;
855 return Analysis_Client_HSP_TOKEN_INT;
856 } else {
857 g_error("Unexpected character [%d] in HSPset list", ch);
858 }
859 break;
860 }
861 }
862 return Analysis_Client_HSP_TOKEN_FINISH;
863 }
864
Analysis_Client_get_hsp_sets(Analysis_Client * aclient,Analysis * analysis,gboolean swap_chains,gboolean revcomp_target)865 static void Analysis_Client_get_hsp_sets(Analysis_Client *aclient,
866 Analysis *analysis,
867 gboolean swap_chains,
868 gboolean revcomp_target){
869 register gchar *reply = Analysis_Client_send(aclient,
870 "get hsps", "hspset:", TRUE);
871 gint pos = 0, intval = 0;
872 register gboolean ok = TRUE;
873 register Analysis_Client_HSP_TOKEN token;
874 register gint target_id = -1, query_pos = -1, target_pos = -1, length;
875 register Comparison *comparison = NULL;
876 register Sequence *target = NULL;
877 register Match_Type match_type
878 = Match_Type_find(aclient->curr_query->alphabet->type,
879 aclient->server_alphabet->type, FALSE);
880 /* FIXME: use Match_Type_find with translate_both for codon alignments */
881 register HSPset *hsp_set = NULL;
882 do {
883 token = Analysis_Client_get_hsp_token(reply, &pos, &intval);
884 switch(token){
885 case Analysis_Client_HSP_TOKEN_BEGIN_SET:
886 break;
887 case Analysis_Client_HSP_TOKEN_INT:
888 if(target_id == -1){
889 target_id = intval;
890 target = Analysis_Client_get_Sequence(aclient,
891 target_id, revcomp_target);
892 g_assert(!comparison);
893 g_assert(aclient->curr_query);
894 /* FIXME: temp : make work with other HSP types */
895 /* FIXME: should take necessary HSP params from server */
896 if(swap_chains)
897 comparison = Comparison_create(analysis->comparison_param,
898 target, aclient->curr_query);
899 else
900 comparison = Comparison_create(analysis->comparison_param,
901 aclient->curr_query, target);
902 /* FIXME: should ensure that the HSPset is created
903 * without a horizon
904 */
905 Sequence_destroy(target);
906 target = NULL;
907 } else if(query_pos == -1){
908 query_pos = intval;
909 } else if(target_pos == -1){
910 target_pos = intval;
911 } else {
912 length = intval;
913 g_assert(comparison);
914 /*
915 g_message("adding one [%d,%d,%d]", query_pos, target_pos,
916 length);
917 */
918 /* FIXME: need fix to work with for other match types */
919 switch(match_type){
920 case Match_Type_DNA2DNA:
921 hsp_set = comparison->dna_hspset;
922 break;
923 case Match_Type_PROTEIN2PROTEIN:
924 case Match_Type_PROTEIN2DNA:
925 case Match_Type_DNA2PROTEIN:
926 hsp_set = comparison->protein_hspset;
927 break;
928 default:
929 g_error("Match_Type not supported [%s]",
930 Match_Type_get_name(match_type));
931 }
932 g_assert(hsp_set);
933 if(swap_chains)
934 HSPset_add_known_hsp(hsp_set, target_pos, query_pos,
935 length);
936 else
937 HSPset_add_known_hsp(hsp_set, query_pos, target_pos,
938 length);
939 query_pos = target_pos = -1;
940 }
941 break;
942 case Analysis_Client_HSP_TOKEN_END_SET:
943 /* FIXME: needs to work for other hsp_set types */
944 Comparison_finalise(comparison);
945 if(Comparison_has_hsps(comparison)){
946 #if 0
947 /* FIXME: move to use scan_query swap in report */
948 if(swap_chains)
949 Comparison_swap(comparison);
950 #endif /* 0 */
951 Analysis_report_func(comparison, analysis);
952 }
953 Comparison_destroy(comparison);
954 comparison = NULL;
955 target_id = -1;
956 break;
957 case Analysis_Client_HSP_TOKEN_FINISH:
958 ok = FALSE;
959 break;
960 }
961 } while(ok);
962 g_assert(target_id == -1);
963 /* format: <HSPSET> <TARGETID> { <QSTART TSTART LEN> } */
964 /* tokens <BEGIN_HSPSET> <INT> <ENDHSPSET> */
965 g_free(reply);
966 return;
967 }
968 /* FIXME: only working for single hspset comparisons */
969
Analysis_Client_process_query(Analysis_Client * aclient,Analysis * analysis,Sequence * query,gboolean swap_chains,gboolean revcomp_target,gint priority)970 static void Analysis_Client_process_query(Analysis_Client *aclient,
971 Analysis *analysis,
972 Sequence *query,
973 gboolean swap_chains,
974 gboolean revcomp_target,
975 gint priority){
976 Analysis_Client_set_query(aclient, query);
977 Analysis_Client_get_hsp_sets(aclient, analysis, swap_chains, revcomp_target);
978 /* Revcomp query if DNA */
979 if(aclient->curr_query->alphabet->type == Alphabet_Type_DNA){
980 Analysis_Client_revcomp_query(aclient);
981 Analysis_Client_get_hsp_sets(aclient, analysis,
982 swap_chains, revcomp_target);
983 }
984 return;
985 }
986
Analysis_Client_process(Analysis_Client * aclient,Analysis * analysis,gboolean swap_chains,gint priority)987 static void Analysis_Client_process(Analysis_Client *aclient, Analysis *analysis,
988 gboolean swap_chains, gint priority){
989 register FastaDB_Seq *fdbs;
990 /* FIXME: need to check for appropriate database type */
991 while((fdbs = FastaDB_next(aclient->probe_fdb, FastaDB_Mask_ALL))){
992 Analysis_Client_process_query(aclient, analysis, fdbs->seq,
993 swap_chains, FALSE, priority);
994 /* Revcomp target if protein vs DNA or translate_both */
995 if(((aclient->curr_query->alphabet->type == Alphabet_Type_PROTEIN)
996 && (aclient->server_alphabet->type == Alphabet_Type_DNA))
997 || analysis->gam->translate_both){
998 Analysis_Client_revcomp_target(aclient);
999 Analysis_Client_process_query(aclient, analysis,
1000 fdbs->seq, swap_chains, TRUE,
1001 priority);
1002 Analysis_Client_revcomp_target(aclient);
1003 }
1004 FastaDB_Seq_destroy(fdbs);
1005 }
1006 return;
1007 }
1008
1009 /**/
1010
Analysis_Server_create(Analysis_Builder * ab,gchar * name,gint priority)1011 static Analysis_Server *Analysis_Server_create(Analysis_Builder *ab,
1012 gchar *name, gint priority){
1013 register Analysis_Server *as = g_new(Analysis_Server, 1);
1014 as->name = g_strdup(name);
1015 as->ab = ab;
1016 as->priority = priority;
1017 return as;
1018 }
1019
Analysis_Server_destroy(Analysis_Server * as)1020 static void Analysis_Server_destroy(Analysis_Server *as){
1021 g_free(as->name);
1022 g_free(as);
1023 return;
1024 }
1025
1026 /**/
1027
Analysis_Builder_create(GPtrArray * path_list,Analysis * analysis,gint verbosity)1028 static Analysis_Builder *Analysis_Builder_create(GPtrArray *path_list,
1029 Analysis *analysis,
1030 gint verbosity){
1031 register Analysis_Builder *ab;
1032 register gint i;
1033 register Analysis_Client *ac;
1034 register Analysis_Server *as;
1035 g_assert(path_list->len);
1036 ac = Analysis_Client_create(path_list->pdata[0], analysis->verbosity);
1037 if(!ac)
1038 return NULL;
1039 /**/
1040 ab = g_new(Analysis_Builder, 1);
1041 ab->verbosity = verbosity;
1042 ab->server_list = g_ptr_array_new();
1043 ab->server_type = ac->server_alphabet->type;
1044 ab->probe_fdb = NULL;
1045 ab->analysis = analysis;
1046 Analysis_Client_destroy(ac);
1047 for(i = 0; i < path_list->len; i++){
1048 as = Analysis_Server_create(ab, path_list->pdata[i], i);
1049 g_ptr_array_add(ab->server_list, as);
1050 }
1051 return ab;
1052 }
1053
Analysis_Builder_destroy(Analysis_Builder * ab)1054 static void Analysis_Builder_destroy(Analysis_Builder *ab){
1055 register gint i;
1056 register Analysis_Server *as;
1057 for(i = 0; i < ab->server_list->len; i++){
1058 as = ab->server_list->pdata[i];
1059 Analysis_Server_destroy(as);
1060 }
1061 g_ptr_array_free(ab->server_list, TRUE);
1062 if(ab->probe_fdb)
1063 FastaDB_close(ab->probe_fdb);
1064 g_free(ab);
1065 return;
1066 }
1067
Analysis_Server_run(gpointer data)1068 static void Analysis_Server_run(gpointer data){
1069 register Analysis_Server *server = data;
1070 register Analysis_Client *client
1071 = Analysis_Client_create(server->name,
1072 server->ab->analysis->verbosity);
1073 if(!client)
1074 g_error("Could not connect to server [%s]", server->name);
1075 Analysis_Client_set_param(client, server->ab->analysis->gam);
1076 Analysis_Client_set_probe_fdb(client, server->ab->probe_fdb);
1077 Analysis_Client_process(client, server->ab->analysis,
1078 server->ab->swap_chains, server->priority);
1079 /**/
1080 Analysis_Client_destroy(client);
1081 return;
1082 }
1083
Analysis_Builder_process(Analysis_Builder * ab,Analysis * analysis,gboolean swap_chains)1084 static void Analysis_Builder_process(Analysis_Builder *ab,
1085 Analysis *analysis, gboolean swap_chains){
1086 register gint i;
1087 register Analysis_Server *as;
1088 ab->swap_chains = swap_chains;
1089 for(i = 0; i < ab->server_list->len; i++){
1090 as = ab->server_list->pdata[i];
1091 #ifdef USE_PTHREADS
1092 g_assert(analysis->job_queue);
1093 JobQueue_submit(analysis->job_queue, Analysis_Server_run, as, i);
1094 #else /* USE_PTHREADS */
1095 Analysis_Server_run(as);
1096 #endif /* USE_PTHREADS */
1097 }
1098 return;
1099 }
1100
Analysis_Builder_set_probe_fdb(Analysis_Builder * ab,FastaDB * probe_fdb)1101 static void Analysis_Builder_set_probe_fdb(Analysis_Builder *ab,
1102 FastaDB *probe_fdb){
1103 g_assert(!ab->probe_fdb);
1104 ab->probe_fdb = FastaDB_share(probe_fdb);
1105 return;
1106 }
1107
1108 /**/
1109
Analysis_path_list_destroy(GPtrArray * path_list)1110 static void Analysis_path_list_destroy(GPtrArray *path_list){
1111 register gint i;
1112 register gchar *path;
1113 for(i = 0; i < path_list->len; i++){
1114 path = path_list->pdata[i];
1115 g_free(path);
1116 }
1117 g_ptr_array_free(path_list, TRUE);
1118 return;
1119 }
1120
Analysis_FOSN_expand_path_list(GPtrArray * path_list)1121 static GPtrArray *Analysis_FOSN_expand_path_list(GPtrArray *path_list){
1122 register gint i;
1123 register gchar *path, *npath;
1124 struct stat buf;
1125 register gboolean expanded_list = FALSE;
1126 register GPtrArray *npath_list = g_ptr_array_new();
1127 register FILE *fp;
1128 register LineParse *lp;
1129 for(i = 0; i < path_list->len; i++){
1130 path = path_list->pdata[i];
1131 if((stat(path, &buf)) /* Cannot read file (ie. servername) */
1132 || (S_ISDIR(buf.st_mode)) /* Is directory */
1133 || (FastaDB_file_is_fasta(path))){ /* 1st non-whitespace char is '>' */
1134 npath = g_strdup(path);
1135 g_ptr_array_add(npath_list, npath);
1136 } else { /* Assume to be fosn */
1137 fp = fopen(path, "r");
1138 if(!fp)
1139 g_error("Could not open FOSN file [%s]", path);
1140 lp = LineParse_create(fp);
1141 while(LineParse_line(lp) != EOF){
1142 g_strstrip(lp->line->str); /* strip whitespace */
1143 if(lp->line->str[0]){
1144 npath = g_strdup(lp->line->str);
1145 g_ptr_array_add(npath_list, npath);
1146 expanded_list = TRUE;
1147 }
1148 }
1149 LineParse_destroy(lp);
1150 fclose(fp);
1151 }
1152 }
1153 if(!expanded_list){
1154 for(i = 0; i < npath_list->len; i++){
1155 npath = npath_list->pdata[i];
1156 g_free(npath);
1157 }
1158 g_ptr_array_free(npath_list, TRUE);
1159 return NULL;
1160 }
1161 return npath_list;
1162 }
1163 /* If members of the path list are files
1164 * where the first non-whitespace character is not '>',
1165 * it is assumed to be a FOSN, and parsed to expand the path_list.
1166 */
1167
Analysis_create(GPtrArray * query_path_list,Alphabet_Type query_type,gint query_chunk_id,gint query_chunk_total,GPtrArray * target_path_list,Alphabet_Type target_type,gint target_chunk_id,gint target_chunk_total,gint verbosity)1168 Analysis *Analysis_create(
1169 GPtrArray *query_path_list, Alphabet_Type query_type,
1170 gint query_chunk_id, gint query_chunk_total,
1171 GPtrArray *target_path_list, Alphabet_Type target_type,
1172 gint target_chunk_id, gint target_chunk_total,
1173 gint verbosity){
1174 register Analysis *analysis = g_new0(Analysis, 1);
1175 register FastaDB *query_fdb = NULL, *target_fdb = NULL,
1176 *seeder_query_fdb, *seeder_target_fdb;
1177 register Match *match;
1178 Match *dna_match = NULL, *protein_match = NULL,
1179 *codon_match = NULL;
1180 register HSP_Param *dna_hsp_param, *protein_hsp_param,
1181 *codon_hsp_param;
1182 register Match_ArgumentSet *mas = Match_ArgumentSet_create(NULL);
1183 register gboolean use_horizon;
1184 register GPtrArray *expanded_query_path_list = NULL,
1185 *expanded_target_path_list = NULL;
1186 g_assert(query_path_list);
1187 g_assert(target_path_list);
1188 g_assert(query_path_list->len);
1189 g_assert(target_path_list->len);
1190 analysis->aas = Analysis_ArgumentSet_create(NULL);
1191 analysis->verbosity = verbosity;
1192 analysis->job_queue = JobQueue_create(analysis->aas->thread_count);
1193 /* Expand FOSN paths */
1194 expanded_query_path_list = Analysis_FOSN_expand_path_list(
1195 query_path_list);
1196 expanded_target_path_list = Analysis_FOSN_expand_path_list(
1197 target_path_list);
1198 if(expanded_query_path_list)
1199 query_path_list = expanded_query_path_list;
1200 if(expanded_target_path_list)
1201 target_path_list = expanded_target_path_list;
1202 /**/
1203 analysis->query_builder = Analysis_Builder_create(query_path_list,
1204 analysis, verbosity);
1205 analysis->target_builder = Analysis_Builder_create(target_path_list,
1206 analysis, verbosity);
1207 /**/
1208 if(query_type == Alphabet_Type_UNKNOWN){
1209 if(analysis->query_builder){
1210 query_type = analysis->query_builder->server_type;
1211 } else {
1212 query_type = FastaDB_guess_type(
1213 (gchar*)query_path_list->pdata[0]);
1214 if(verbosity > 1)
1215 g_message("Guessed query type [%s]",
1216 Alphabet_Type_get_name(query_type));
1217 }
1218 }
1219 if(target_type == Alphabet_Type_UNKNOWN){
1220 if(analysis->target_builder){
1221 target_type = analysis->target_builder->server_type;
1222 } else {
1223 target_type = FastaDB_guess_type(
1224 (gchar*)target_path_list->pdata[0]);
1225 if(verbosity > 1)
1226 g_message("Guessed target type [%s]",
1227 Alphabet_Type_get_name(target_type));
1228 }
1229 }
1230 g_assert((query_type == Alphabet_Type_DNA)
1231 ||(query_type == Alphabet_Type_PROTEIN));
1232 g_assert((target_type == Alphabet_Type_DNA)
1233 ||(target_type == Alphabet_Type_PROTEIN));
1234 if(verbosity > 1)
1235 g_message("Creating analysis with query[%s] target[%s]",
1236 Alphabet_Type_get_name(query_type),
1237 Alphabet_Type_get_name(target_type));
1238 analysis->gam = GAM_create(query_type, target_type,
1239 mas->dna_submat,
1240 mas->protein_submat,
1241 mas->translate,
1242 analysis->aas->use_exhaustive,
1243 verbosity);
1244 /**/
1245 Analysis_find_matches(analysis, &dna_match, &protein_match,
1246 &codon_match);
1247 match = dna_match;
1248 if(!match)
1249 match = protein_match;
1250 if(!match)
1251 match = codon_match;
1252 g_assert(match);
1253 if(!analysis->query_builder)
1254 query_fdb = FastaDB_open_list_with_limit(query_path_list,
1255 match->query->alphabet, query_chunk_id, query_chunk_total);
1256 if(!analysis->target_builder)
1257 target_fdb = FastaDB_open_list_with_limit(target_path_list,
1258 match->target->alphabet, target_chunk_id, target_chunk_total);
1259 if(analysis->aas->use_exhaustive){
1260 if(analysis->query_builder || analysis->target_builder) /* FIXME: ni */
1261 g_error("Exhaustive alignment against server not implemented");
1262 analysis->fasta_pipe = FastaPipe_create(
1263 query_fdb, target_fdb,
1264 Analysis_FastaPipe_Pair_init_func,
1265 Analysis_FastaPipe_Pair_prep_func,
1266 Analysis_FastaPipe_Pair_term_func,
1267 Analysis_FastaPipe_Pair_query_func,
1268 Analysis_FastaPipe_Pair_target_func,
1269 FastaDB_Mask_ALL,
1270 analysis->gam->translate_both,
1271 analysis->aas->use_revcomp);
1272 analysis->curr_query = NULL;
1273 } else { /* Not exhaustive */
1274 use_horizon = (analysis->aas->use_bigseq
1275 || analysis->query_builder
1276 || analysis->target_builder)?FALSE:TRUE;
1277 dna_hsp_param = dna_match
1278 ? HSP_Param_create(dna_match, use_horizon)
1279 : NULL;
1280 protein_hsp_param = protein_match
1281 ? HSP_Param_create(protein_match, use_horizon)
1282 : NULL;
1283 codon_hsp_param = codon_match
1284 ? HSP_Param_create(codon_match, use_horizon)
1285 : NULL;
1286 analysis->comparison_param = Comparison_Param_create(
1287 query_type, target_type,
1288 dna_hsp_param, protein_hsp_param, codon_hsp_param);
1289 if(dna_hsp_param)
1290 HSP_Param_destroy(dna_hsp_param);
1291 if(protein_hsp_param)
1292 HSP_Param_destroy(protein_hsp_param);
1293 if(codon_hsp_param)
1294 HSP_Param_destroy(codon_hsp_param);
1295 /* Raise HSP thresholds to score if ungapped */
1296 if(!Model_Type_is_gapped(analysis->gam->gas->type)){
1297 if(analysis->comparison_param->dna_hsp_param
1298 && (analysis->comparison_param->dna_hsp_param->threshold
1299 < analysis->gam->gas->threshold))
1300 analysis->comparison_param->dna_hsp_param->threshold
1301 = analysis->gam->gas->threshold;
1302 if(analysis->comparison_param->protein_hsp_param
1303 && (analysis->comparison_param->protein_hsp_param->threshold
1304 < analysis->gam->gas->threshold))
1305 analysis->comparison_param->protein_hsp_param->threshold
1306 = analysis->gam->gas->threshold;
1307 if(analysis->comparison_param->codon_hsp_param
1308 && (analysis->comparison_param->codon_hsp_param->threshold
1309 < analysis->gam->gas->threshold))
1310 analysis->comparison_param->codon_hsp_param->threshold
1311 = analysis->gam->gas->threshold;
1312 }
1313 /* Don't need HSP horizon for bigseq comparison */
1314 if(analysis->query_builder || analysis->target_builder){
1315 if(analysis->query_builder && analysis->target_builder)
1316 g_error("Server vs server comparison not impelemented");
1317 analysis->fasta_pipe = NULL;
1318 if(analysis->query_builder){
1319 Analysis_Builder_set_probe_fdb(analysis->query_builder,
1320 target_fdb);
1321 } else {
1322 g_assert(analysis->target_builder);
1323 Analysis_Builder_set_probe_fdb(analysis->target_builder,
1324 query_fdb);
1325 }
1326 } else {
1327 if(analysis->aas->use_bigseq){
1328 analysis->bsam = BSAM_create(analysis->comparison_param,
1329 analysis->aas->saturate_threshold,
1330 verbosity);
1331 analysis->fasta_pipe = FastaPipe_create(
1332 query_fdb, target_fdb,
1333 Analysis_FastaPipe_Pair_init_func,
1334 Analysis_FastaPipe_Pair_prep_func,
1335 Analysis_FastaPipe_Pair_term_func,
1336 Analysis_FastaPipe_Pair_query_func,
1337 Analysis_FastaPipe_Pair_target_func,
1338 FastaDB_Mask_ALL,
1339 analysis->gam->translate_both,
1340 analysis->aas->use_revcomp);
1341 analysis->curr_query = NULL;
1342 } else { /* Use Seeder */
1343 analysis->scan_query = Analysis_decide_scan_query(query_fdb,
1344 target_fdb,
1345 analysis->aas->force_scan);
1346 if(verbosity > 1)
1347 g_message("Applying FSM scan to [%s]",
1348 analysis->scan_query?"query":"target");
1349 /* Swap paths and types
1350 * for query and target when scan on query
1351 */
1352 if(analysis->scan_query){
1353 seeder_query_fdb = target_fdb;
1354 seeder_target_fdb = query_fdb;
1355 } else {
1356 seeder_query_fdb = query_fdb;
1357 seeder_target_fdb = target_fdb;
1358 }
1359 analysis->curr_seeder = NULL;
1360 analysis->fasta_pipe = FastaPipe_create(
1361 seeder_query_fdb, seeder_target_fdb,
1362 Analysis_FastaPipe_Seeder_init_func,
1363 Analysis_FastaPipe_Seeder_prep_func,
1364 Analysis_FastaPipe_Seeder_term_func,
1365 Analysis_FastaPipe_Seeder_query_func,
1366 Analysis_FastaPipe_Seeder_target_func,
1367 FastaDB_Mask_ALL, analysis->gam->translate_both,
1368 analysis->aas->use_revcomp);
1369 }
1370 }
1371 }
1372 if(query_fdb)
1373 FastaDB_close(query_fdb);
1374 if(target_fdb)
1375 FastaDB_close(target_fdb);
1376 /**/
1377 if(expanded_query_path_list)
1378 Analysis_path_list_destroy(expanded_query_path_list);
1379 if(expanded_target_path_list)
1380 Analysis_path_list_destroy(expanded_target_path_list);
1381 return analysis;
1382 }
1383
Analysis_destroy(Analysis * analysis)1384 void Analysis_destroy(Analysis *analysis){
1385 JobQueue_destroy(analysis->job_queue);
1386 if(analysis->fasta_pipe)
1387 FastaPipe_destroy(analysis->fasta_pipe);
1388 if(analysis->curr_query)
1389 FastaDB_Seq_destroy(analysis->curr_query);
1390 if(analysis->curr_seeder)
1391 Seeder_destroy(analysis->curr_seeder);
1392 if(analysis->bsam)
1393 BSAM_destroy(analysis->bsam);
1394 if(analysis->comparison_param)
1395 Comparison_Param_destroy(analysis->comparison_param);
1396 /*
1397 if(analysis->query_ac)
1398 Analysis_Client_destroy(analysis->query_ac);
1399 if(analysis->target_ac)
1400 Analysis_Client_destroy(analysis->target_ac);
1401 */
1402 if(analysis->query_builder)
1403 Analysis_Builder_destroy(analysis->query_builder);
1404 if(analysis->target_builder)
1405 Analysis_Builder_destroy(analysis->target_builder);
1406 GAM_destroy(analysis->gam);
1407 g_free(analysis);
1408 return;
1409 }
1410
Analysis_process(Analysis * analysis)1411 void Analysis_process(Analysis *analysis){
1412 if(analysis->query_builder){
1413 Analysis_Builder_process(analysis->query_builder, analysis, TRUE);
1414 } else if(analysis->target_builder){
1415 Analysis_Builder_process(analysis->target_builder, analysis, FALSE);
1416 } else {
1417 while(FastaPipe_process(analysis->fasta_pipe, analysis));
1418 }
1419 if(analysis->job_queue)
1420 JobQueue_complete(analysis->job_queue);
1421 GAM_report(analysis->gam);
1422 return;
1423 }
1424
1425 /**/
1426
1427