/* Revision-control identification string ("$Id: ... $" keyword form)
   recording the file name, revision, date and author of this source file. */
static char rcsid[] = "$Id: gmap.c 222810 2020-06-03 22:01:50Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #ifdef USE_MPI
7 #include <mpi.h>
8 #include "mpidebug.h"
9 #endif
10
11 #ifdef HAVE_SYS_TYPES_H
12 #include <sys/types.h> /* Needed to define pthread_t on Solaris */
13 #endif
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h> /* For strcpy */
17 #include <strings.h> /* For rindex */
18 #include <ctype.h>
19 #include <math.h> /* For rint */
20 #ifdef HAVE_SSE2
21 #include <emmintrin.h>
22 #endif
23 #ifdef HAVE_SSE4_1
24 #include <smmintrin.h>
25 #endif
26 #if !defined(HAVE_SSE4_2)
27 /* Skip popcnt */
28 #elif defined(HAVE_POPCNT)
29 #include <immintrin.h>
30 #endif
31
32 #if !defined(HAVE_SSE4_2)
33 /* Skip mm_popcnt */
34 #elif defined(HAVE_MM_POPCNT)
35 #include <nmmintrin.h>
36 #endif
37
38
39 #if !defined(HAVE_SSE4_2)
40 /* Skip lzcnt/tzcnt */
41 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
42 #include <immintrin.h>
43 #endif
44
45
46 #ifdef HAVE_PTHREAD
47 #include <pthread.h>
48 #endif
49
50 #include <signal.h>
51
52 #include "except.h"
53 #include "mem.h"
54 #include "bool.h"
55 #include "fopen.h"
56 #include "access.h"
57 #include "filesuffix.h"
58
59 #include "sequence.h"
60 #include "match.h"
61 #include "matchpool.h"
62 #include "pairpool.h"
63 #include "diagpool.h"
64 #include "cellpool.h"
65 #include "stopwatch.h"
66 #include "translation.h" /* For Translation_setup */
67 #include "genome.h"
68 #include "genome-write.h"
69 #include "genome128_hr.h" /* For Genome_hr_setup */
70 #include "genome_sites.h" /* For Genome_sites_setup */
71 #include "compress-write.h"
72 #include "maxent_hr.h" /* For Maxent_hr_setup */
73 #include "stage1.h"
74 #include "gregion.h"
75 #ifdef PMAP
76 #include "oligoindex_pmap.h"
77 #else
78 #include "oligoindex_hr.h" /* For Oligoindex_hr_setup */
79 /* #include "oligoindex_localdb.h" -- For Oligoindex_localdb_setup */
80 #endif
81 #include "stage2.h"
82 #include "splicestringpool.h"
83 #include "splicetrie.h"
84 #include "splicetrie_build.h"
85 #include "dynprog.h"
86 #include "dynprog_single.h"
87 #include "dynprog_genome.h"
88 #include "dynprog_end.h"
89 #include "pair.h"
90 #include "stage3.h"
91 #include "comp.h"
92 #include "chimera.h"
93 #ifdef PMAP
94 #include "oligop.h" /* For Oligop_setup */
95 #include "backtranslation.h"
96 #else
97 #include "oligo.h" /* For Oligo_setup */
98 #endif
99 #include "indexdb.h"
100 #include "result.h"
101 #include "request.h"
102 #include "intlist.h"
103 #include "list.h"
104 #include "iit-read-univ.h"
105 #include "iit-read.h"
106 #include "datadir.h"
107
108 #include "filestring.h"
109 #include "output.h"
110 #include "inbuffer.h"
111 #include "outbuffer.h"
112
113 #include "getopt.h"
114
115
/* Compile-time tuning constants for this file. */

/* Length limits; presumably bound what gets pre-allocated for a query and
   for a genomic segment -- the code that uses them is not in this section. */
#define MAX_QUERYLENGTH_FOR_ALLOC 100000
#define MAX_GENOMICLENGTH_FOR_ALLOC 1000000


/* Size limits for individual stage 1 steps (used outside this section) */
#define STAGE1_FIRSTPAIR_SIZELIMIT 10000
#define STAGE1_STUTTER_SIZELIMIT 100
#define STAGE1_FILLIN_SIZELIMIT 10


/* Thresholds applied by evaluate_query() when classifying a query as
   poor (MAX_BADOLIGOS) or repetitive (MAX_OLIGODEPTH, MAX_REPOLIGOS) */
#define POSSIBLE_OLIGOS 65536 /* 4^8 */
#define MAX_OLIGODEPTH 3.0
#define MAX_BADOLIGOS 0.30 /* Setting to 1.0 effectively turns this check off */
#define MAX_REPOLIGOS 0.40 /* Setting to 1.0 effectively turns this check off */

/* Value of 1 can miss end exons, but values larger than 1 can lead to
   very long (or infinite?) run times when combined with
   --intronlength */
#define MAX_CHIMERA_ITER 3

/* Parameters for chimera detection */
#define CHIMERA_PENALTY 30 /* A small value for chimera_margin will reduce this */
#define CHIMERA_IDENTITY 0.98
#define CHIMERA_PVALUE 0.01
#define CHIMERA_FVALUE 6.634897 /* qnorm(CHIMERA_PVALUE/2)^2 */
#define CHIMERA_SLOP 90 /* in nucleotides */
#define CHIMERA_EXTEND 20 /* Was previously 8, but this missed exon-exon boundaries */

/* Presumably the default for the min_matches variable declared further
   down -- the assignment is not visible in this section */
#define MIN_MATCHES 20


#define MAX_NALIGNMENTS 10
146
147
148 /* #define EXTRACT_GENOMICSEG 1 */
149
150
/* Debugging macros.  Each debugN(x) expands to its argument x when the
   corresponding DEBUGN symbol is defined at compile time, and to nothing
   otherwise, so the wrapped statements vanish from normal builds. */

/* MPI Processing */
#ifdef DEBUGM
#define debugm(x) x
#else
#define debugm(x)
#endif


/* General tracing */
#ifdef DEBUG
#define debug(x) x
#else
#define debug(x)
#endif

/* Chimera detection */
#ifdef DEBUG2
#define debug2(x) x
#else
#define debug2(x)
#endif

/* Chimera detection, details */
#ifdef DEBUG2A
#define debug2a(x) x
#else
#define debug2a(x)
#endif

/* stage3list_remove_duplicates */
#ifdef DEBUG3
#define debug3(x) x
#else
#define debug3(x)
#endif
185
186
187
188 /************************************************************************
189 * Global variables
190 ************************************************************************/
191
/* File-scope state describing the genome, its indexes, and the alignment
   tuning parameters.  Variables without an initializer have static storage
   duration and therefore start out as zero. */

/* Translation settings (--translation-code and --alt-start-codons, per the
   comments in the long_options table) */
static int translation_code = 1;
static bool alt_initiation_codons_p = false;

/* Chromosome-level information about the genome */
static Univ_IIT_T chromosome_iit = NULL;
static Univ_IIT_T altscaffold_iit = NULL;
static Univcoord_T genomelength;
static int circular_typeint = -1;
static int nchromosomes;
static bool *circularp = NULL;
static bool any_circular_p;

/* Alternate-locus information and the genome sequence itself */
static bool *altlocp = NULL;
static Univcoord_T *alias_starts = NULL;
static Univcoord_T *alias_ends = NULL;
static Univ_IIT_T contig_iit = NULL;
static Genome_T genomecomp = NULL;
static Genome_T genomecomp_alt = NULL;
static Genomecomp_T *genomecomp_blocks = NULL;

#ifdef PMAP
static Alphabet_T required_alphabet = AA0;
static Alphabet_T alphabet = AA20; /* Initialize in case we have a usersegment */
static int alphabet_size = 20; /* Initialize in case we have a usersegment */
static Width_T index1part_aa = 7;
#else
static Width_T index1part;
#endif

/* Stage 1 index databases */
static Indexdb_T indexdb_fwd = NULL;
static Indexdb_T indexdb_rev = NULL;

/* static Localdb_T localdb = NULL; */

/* The required_* values correspond to --kmer and --sampling (per the
   long_options comments).
   NOTE(review): index1interval is declared a second time further down in
   this file as "static int index1interval = 3"; the two declarations are
   compatible only if Width_T is int -- confirm. */
static Width_T required_index1part = 0;
static Width_T index1interval;
static Width_T required_index1interval = 0;

/* static Width_T local1part = 8; */
/* static Width_T required_local1part = 0; */
/* static Width_T local1interval; */
/* static Width_T required_local1interval = 0; */

static IIT_T altstrain_iit = NULL;

/* Cmet and AtoI */
static char *user_modedir = NULL; /* user_cmetdir, user_atoidir */
static Mode_T mode = STANDARD;


/* SNP and map IITs (--snpsdir and --use-snps, per the long_options comments) */
static char *user_snpsdir = NULL;
static char *snps_root = (char *) NULL;
static IIT_T map_iit = NULL;
static int *map_divint_crosstable = NULL;

#ifdef PMAP
#if 0
static Width_T minindexsize = 3; /* In stage 2; in aa */
static Width_T maxindexsize = 6; /* In stage 2; in aa */
#endif
/* Now controlled by defect_rate */
static int maxpeelback = 20; /* Needs to be at least indexsize
                                because stage 2 jumps by indexsize.
                                Also should exceed length of
                                repeated nucleotides (e.g., a
                                string of consecutive T's) */
#else
/* Making minindexsize too small can lead to spurious exons in stage 2 */
/* FOOBAR */
#if 0
static Width_T minindexsize = 8; /* In stage 2; in nt. Used if sampling required in stage 1. */
static Width_T maxindexsize = 8; /* In stage 2; in nt */
#endif
static int maxpeelback = 20; /* Needs to be at least indexsize
                                because stage 2 jumps by indexsize.
                                Also should exceed length of
                                repeated nucleotides (e.g., a
                                string of consecutive T's) */
#endif
static int maxpeelback_distalmedial = 100; /* Needs to be longer to fix bad end exons */

/* static int stuttercycles = 2; */
static int stutterhits = 3;
static int sufflookback = 60;
static int nsufflookback = 5;

#if 0
static int maxoligohits = 400; /* Must be smaller than ALLOC in oligoindex.c */
#endif
static int nullgap = 600;
static int extramaterial_end = 10;
static int extramaterial_paired = 8; /* Should be at least indexsize in nt */
static int extraband_single = 6; /* This is in addition to length2 -
                                    length1. If onesidegap is true in
                                    dynprog.c, then this is equivalent
                                    to extraband_single of 0. Needs
                                    to be > 0 to handle default
                                    close_indels_mode. */
static int extraband_end = 6; /* Was 6. Shouldn't differ from 0, since onesidegap is true?
                                 This is only on both sides of main diagonal */
static int extraband_paired = 14; /* This is in addition to length2 - length1 */

static Stopwatch_T stopwatch = NULL;
294
295
296 /************************************************************************
297 * Program options
298 ************************************************************************/
299
/* Input options.  The option names given below come from the comments in
   the long_options table, which pair each option with its variable. */
static char *user_genomedir = NULL; /* -D, --dir */
static char *dbroot = NULL; /* -d, --db */
static char *dbversion = NULL;
static char *user_genomicseg = NULL; /* -g, --gseg */
static bool user_selfalign_p = false; /* -1, --selfalign */
static bool user_pairalign_p = false; /* -2, --pairalign */
static char *user_cmdline = NULL; /* --cmdline */
static Sequence_T global_usersegment = NULL;
static int part_modulus = 0; /* -q, --part (together with part_interval) */
static int part_interval = 1;

static char *read_files_command = NULL; /* --read-files-command */
313
314
/* Compute options */
static int min_matches;

/* These two are not declared static, so they have external linkage and
   can be referenced from other source files */
#ifdef USE_MPI
int nbeyond;
#else
bool multiple_sequences_p = false;
#endif

/* Shared-memory handling (--use-shared-memory, --preload-shared-memory,
   --unload-shared-memory, --expand-offsets, per the long_options comments) */
static bool sharedp = false;
static bool preload_shared_memory_p = false;
static bool unload_shared_memory_p = false;
static bool expand_offsets_p = false;

/* Default access modes for the index and genome files.  With mmap
   available, positions and genome default to USE_MMAP_PRELOAD; otherwise
   everything defaults to USE_ALLOCATE. */
#ifdef HAVE_MMAP
static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
static Access_mode_T positions_access = USE_MMAP_PRELOAD;
static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
static Access_mode_T locpositions_access = USE_ALLOCATE;

static Access_mode_T genome_access = USE_MMAP_PRELOAD;
#else
static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
static Access_mode_T positions_access = USE_ALLOCATE;
static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
static Access_mode_T locpositions_access = USE_ALLOCATE;

static Access_mode_T genome_access = USE_ALLOCATE;
#endif

static int min_intronlength = 9;
static int max_deletionlength = 50;
static int maxtotallen_bound = 2400000;

static bool split_large_introns_p = false;

/* Need to set higher than 200,000 for many human genes, such as ALK */
static int maxintronlen = 500000; /* Was used previously in stage 1. Now used only in stage 2 and Stage3_mergeable. */
static int maxintronlen_ends = 10000; /* Used in stage 3 */

static int minendexon = 12;
static int maxextension = 1000000; /* Used in stage 1. Not adjustable by user */
static int chimera_margin = 30; /* Useful for finding readthroughs */
/* NOTE(review): index1interval is also declared earlier in this file as
   "static Width_T index1interval;" -- the two declarations are compatible
   only if Width_T is int; confirm. */
static int index1interval = 3; /* Stage 1 interval if user provides a genomic segment */
/* static char *referencefile = NULL; */

#if 0
#ifndef PMAP
static bool literalrefp = false;
#endif
#endif

#ifdef USE_MPI
static int nprocs, n_worker_procs, proci, myid;
#endif


/* static bool altstrainp = false; */
/* Worker threads: default is 1 worker with pthreads and 0 without */
#ifdef HAVE_PTHREAD
static pthread_t output_thread_id, *worker_thread_ids;
static pthread_key_t global_request_key;
static int nworkers = 1; /* (int) sysconf(_SC_NPROCESSORS_ONLN) */
#else
static int nworkers = 0; /* (int) sysconf(_SC_NPROCESSORS_ONLN) */
#endif
#ifndef PMAP
static bool prune_poor_p = false;
static bool prune_repetitive_p = false;
#endif
static int canonical_mode = 1;
static bool cross_species_p = false;
/* NOTE(review): declared int but initialized with the bool constant
   false, unlike the neighboring bool flags */
static int homopolymerp = false;

static char *user_chrsubsetname = NULL;
static Univcoord_T chrsubset_start = 0;
/* -1 is converted to Univcoord_T; if that type is unsigned (its
   definition is not shown here) this is its maximum value -- confirm */
static Univcoord_T chrsubset_end = -1;

static int close_indels_mode = +1;
static double microexon_spliceprob = 0.95;
static int suboptimal_score_start = -1; /* Determined by simulations to have minimal effect */
static int suboptimal_score_end = 3; /* Determined by simulations to have diminishing returns above 3 */

static int trim_indel_score = -2; /* was -4 */
399
/* Output options.  Defaults for how results are formatted and reported;
   the long_options table further down names the command-line option that
   corresponds to most of these variables. */
static unsigned int output_buffer_size = 1000;
static Printtype_T printtype = SIMPLE;
static bool exception_raise_p = true;
static bool debug_graphic_p = false;
static bool stage1debug = false;
static bool diag_debug = false;
static Stage3debug_T stage3debug = NO_STAGE3DEBUG;
static bool timingp = false;
static bool checkp = false;
static int maxpaths_report = 5; /* 0 means 1 if nonchimeric, 2 if chimeric */
static bool quiet_if_excessive_p = false;
static double suboptimal_score_float = 0.50;
static bool require_splicedir_p = false;


/* GFF3 */
static bool gff3_separators_p = true;
static bool gff3_phase_swap_p = false;
static GFF3_fasta_annotation_T gff3_fasta_annotation_type = NO_ANNOTATION;
static CDStype_T cdstype = CDS_CDNA;

/* SAM */
/* Applicable to PMAP? */
static bool sam_paired_p = false;
static bool user_quality_shift = false;
static int quality_shift = 0;
static bool sam_headers_p = true;
static char *sam_read_group_id = NULL;
static char *sam_read_group_name = NULL;
static char *sam_read_group_library = NULL;
static char *sam_read_group_platform = NULL;
static bool sam_insert_0M_p = false;
static bool sam_cigar_extended_p = false;
static Cigar_action_T cigar_action = CIGAR_ACTION_WARNING;

/* Ordering, filtering and miscellaneous output flags */
static bool orderedp = false;
static bool failsonlyp = false;
static bool nofailsp = false;
static bool checksump = false;
static int chimera_overlap = 0;
static bool force_xs_direction_p = false;
static bool md_lowercase_variant_p = false;

/* Map file options */
static char *user_mapdir = NULL;
static char *map_iitfile = NULL;
static bool map_exons_p = false;
static bool map_bothstrands_p = false;
static bool print_comment_p = false;
static int nflanking = 0;

/* Alignment options */
static bool fulllengthp = false;
static int cds_startpos = -1;
static bool truncatep = false;
static int sense_try = 0; /* both */
static int sense_filter = 0; /* both */
static double min_trimmed_coverage = 0.0;
static double min_identity = 0.0;
static bool strictp = true;
/* static int proteinmode = 1; */
static bool uncompressedp = false;
static bool nointronlenp = false;
static int invertmode = 0;
static int ngap = 3;
static int wraplength = 50;
467
468
/* Splicing IIT.  State for known and novel splice-site handling; all
   pointers start out NULL and the counts at zero. */
static bool novelsplicingp = true; /* Can be disabled with --nosplicing flag */
static bool knownsplicingp = false;
static bool distances_observed_p = false;
static Chrpos_T shortsplicedist = 2000000;
static char *user_splicingdir = (char *) NULL;
static char *splicing_file = (char *) NULL;
static IIT_T splicing_iit = NULL;
static bool amb_closest_p = false;

static int donor_typeint = -1; /* for splicing_iit */
static int acceptor_typeint = -1; /* for splicing_iit */

static int *splicing_divint_crosstable = NULL;
static Univcoord_T *splicesites = NULL;
static Splicetype_T *splicetypes = NULL;
static Chrpos_T *splicedists = NULL; /* maximum observed splice distance for given splice site */
static List_T *splicestrings = NULL;
static Genomecomp_T *splicefrags_ref = NULL;
static Genomecomp_T *splicefrags_alt = NULL;
static int nsplicesites = 0;

/* Splicing via splicesites */
static int *nsplicepartners_skip = NULL;
static int *nsplicepartners_obs = NULL;
static int *nsplicepartners_max = NULL;

static bool splicetrie_precompute_p = true;
static Trieoffset_T *trieoffsets_obs = NULL;
static Triecontent_T *triecontents_obs = NULL;
static Trieoffset_T *trieoffsets_max = NULL;
static Triecontent_T *triecontents_max = NULL;


/* Input/output */
static char *split_output_root = NULL; /* --split-output */
static char *failedinput_root = NULL; /* --failed-input */
static bool appendp = false; /* --append-output */
static Inbuffer_T inbuffer = NULL;
static Outbuffer_T outbuffer = NULL;
static unsigned int inbuffer_nspaces = 1000; /* --input-buffer-size */
511
512 #ifdef PMAP
513 /* Used alphabetically: 01235789ABbCcDdEefGgHIiKkLlMmNnOoPQRSstuVvwXxYZ */
514 #else
515 /* Used alphabetically: 01235789AaBbCcDdEeFfGgHIijKkLlMmNnOoPpQRSsTtuVvwXxYZ */
516 #endif
517
/* Table of long command-line options, in the struct option layout from
   getopt.h: { name, argument requirement, flag pointer, value }.  The flag
   pointer is 0 throughout.  The value is the equivalent single-character
   short option, or 0 for options that exist only in long form.  The
   trailing comment on each entry names the variable(s) the option
   controls.  The all-zero entry terminates the table.  Presumably handed
   to getopt_long() by the option-parsing code, which is not in this
   section. */
static struct option long_options[] = {
  /* Input options */
  {"dir", required_argument, 0, 'D'}, /* user_genomedir */
  {"db", required_argument, 0, 'd'}, /* dbroot */
#ifdef PMAP
  {"alphabet", required_argument, 0, 'a'}, /* required_alphabet */
#endif
  {"kmer", required_argument, 0, 'k'}, /* required_index1part, index1part */
  {"sampling", required_argument, 0, 0}, /* required_index1interval, index1interval */
  {"genomefull", no_argument, 0, 'G'}, /* uncompressedp. No longer supported. */
  {"gseg", required_argument, 0, 'g'}, /* user_genomicseg */
  {"selfalign", no_argument, 0, '1'}, /* user_selfalign_p */
  {"pairalign", no_argument, 0, '2'}, /* user_pairalign_p */
  {"cmdline", required_argument, 0, 0}, /* user_cmdline */
  {"part", required_argument, 0, 'q'}, /* part_modulus, part_interval */
  {"input-buffer-size", required_argument, 0, 0}, /* inbuffer_nspaces */

  {"read-files-command", required_argument, 0, 0}, /* read_files_command */


  /* Compute options */
  {"use-shared-memory", required_argument, 0, 0}, /* sharedp */
  {"preload-shared-memory", no_argument, 0, 0}, /* preload_shared_memory_p */
  {"unload-shared-memory", no_argument, 0, 0}, /* unload_shared_memory_p */
#ifdef HAVE_MMAP
  {"batch", required_argument, 0, 'B'}, /* offsetsstrm_access, positions_access, genome_access */
#endif
  {"expand-offsets", required_argument, 0, 0}, /* expand_offsets_p */
  {"min-intronlength", required_argument, 0, 0}, /* min_intronlength */

  {"intronlength", required_argument, 0, 'K'}, /* maxintronlen, maxintronlen_ends */
  {"max-intronlength-middle", required_argument, 0, 0}, /* maxintronlen */
  {"max-intronlength-ends", required_argument, 0, 0}, /* maxintronlen_ends */
  {"split-large-introns", no_argument, 0, 0}, /* split_large_introns_p */

  {"trim-end-exons", required_argument, 0, 0}, /* minendexon */
  {"totallength", required_argument, 0, 'L'}, /* maxtotallen_bound */
  {"chimera-margin", required_argument, 0, 'x'}, /* chimera_margin */
  {"no-chimeras", no_argument, 0, 0}, /* chimera_margin */
#if 0
  {"reference", required_argument, 0, 'w'}, /* referencefile */
#else
  {"localsplicedist", required_argument, 0, 'w'}, /* shortsplicedist */
#endif
  {"translation-code", required_argument, 0, 0}, /* translation_code */
  {"alt-start-codons", no_argument, 0, 0}, /* alt_initiation_codons_p */

  {"nthreads", required_argument, 0, 't'}, /* nworkers */
  {"splicingdir", required_argument, 0, 0}, /* user_splicingdir */
  {"nosplicing", no_argument, 0, 0}, /* novelsplicingp */
  {"use-splicing", required_argument, 0, 's'}, /* splicing_iit, knownsplicingp (was previously altstrainp) */
  {"chrsubset", required_argument, 0, 'c'}, /* user_chrsubsetname */
  {"canonical-mode", required_argument, 0, 0}, /* canonical_mode */
  {"cross-species", no_argument, 0, 0}, /* cross_species_p */
  {"homopolymer", no_argument, 0, 0}, /* homopolymerp */
#ifndef PMAP
  {"prunelevel", required_argument, 0, 'p'}, /* prune_poor_p, prune_repetitive_p */
#endif
  {"allow-close-indels", required_argument, 0, 0}, /* close_indels_mode, extraband_single */
  {"microexon-spliceprob", required_argument, 0, 0}, /* microexon_spliceprob */
  {"stage2-start", required_argument, 0, 0}, /* suboptimal_score_start */
  {"stage2-end", required_argument, 0, 0}, /* suboptimal_score_end */

  {"cmetdir", required_argument, 0, 0}, /* user_modedir */
  {"atoidir", required_argument, 0, 0}, /* user_modedir */
  {"mode", required_argument, 0, 0}, /* mode */

  /* Output options */
  {"output-buffer-size", required_argument, 0, 0}, /* output_buffer_size */
  {"summary", no_argument, 0, 'S'}, /* printtype */
  {"align", no_argument, 0, 'A'}, /* printtype */
  {"continuous", no_argument, 0, '3'}, /* printtype */
  {"continuous-by-exon", no_argument, 0, '4'}, /* printtype */
  {"noexceptions", no_argument, 0, '0'}, /* exception_raise_p */
  {"graphic", no_argument, 0, '6'}, /* debug_graphic_p */
  {"stage3debug", required_argument, 0, '8'}, /* stage3debug */
  {"diagnostic", no_argument, 0, '9'}, /* checkp */
  {"npaths", required_argument, 0, 'n'}, /* maxpaths_report */
#if 0
  {"quiet-if-excessive", no_argument, 0, 0}, /* quiet_if_excessive_p */
#endif
  {"format", required_argument, 0, 'f'}, /* printtype */
  {"failsonly", no_argument, 0, 0}, /* failsonlyp */
  {"nofails", no_argument, 0, 0}, /* nofailsp */
  {"split-output", required_argument, 0, 0}, /* split_output_root */
  {"failed-input", required_argument, 0, 0}, /* failedinput_root */
  {"append-output", no_argument, 0, 0}, /* appendp */
  {"suboptimal-score", required_argument, 0, 0}, /* suboptimal_score_float */
  {"require-splicedir", no_argument, 0, 0}, /* require_splicedir_p */

  {"gff3-add-separators", required_argument, 0, 0}, /* gff3_separators_p */
  {"gff3-swap-phase", required_argument, 0, 0}, /* gff3_phase_swap_p */
  {"gff3-fasta-annotation", required_argument, 0, 0}, /* gff3_fasta_annotation_type */
  {"gff3-cds", required_argument, 0, 0}, /* cdstype */

#ifndef PMAP
  {"quality-protocol", required_argument, 0, 0}, /* quality_shift */
  {"quality-print-shift", required_argument, 0, 'j'}, /* quality_shift */
  {"no-sam-headers", no_argument, 0, 0}, /* sam_headers_p */
  {"sam-use-0M", no_argument, 0, 0}, /* sam_insert_0M_p */
  {"sam-extended-cigar", no_argument, 0, 0}, /* sam_cigar_extended_p */
  {"read-group-id", required_argument, 0, 0}, /* sam_read_group_id */
  {"read-group-name", required_argument, 0, 0}, /* sam_read_group_name */
  {"read-group-library", required_argument, 0, 0}, /* sam_read_group_library */
  {"read-group-platform", required_argument, 0, 0}, /* sam_read_group_platform */
  {"force-xs-dir", no_argument, 0, 0}, /* force_xs_direction_p */
  {"md-lowercase-snp", no_argument, 0, 0}, /* md_lowercase_variant_p */
  {"action-if-cigar-error", required_argument, 0, 0}, /* cigar_action */
#endif

  {"compress", no_argument, 0, 'Z'}, /* printtype */
  {"ordered", no_argument, 0, 'O'}, /* orderedp */
  {"md5", no_argument, 0, '5'}, /* checksump */
  {"chimera-overlap", required_argument, 0, 'o'}, /* chimera_overlap */
  {"snpsdir", required_argument, 0, 'V'}, /* user_snpsdir */
  {"use-snps", required_argument, 0, 'v'}, /* snps_root */

  /* Map file options */
  {"mapdir", required_argument, 0, 'M'}, /* user_mapdir */
  {"map", required_argument, 0, 'm'}, /* map_iitfile */
  {"mapexons", no_argument, 0, 'e'}, /* map_exons_p */
  {"mapboth", no_argument, 0, 'b'}, /* map_bothstrands_p */
  {"nflanking", required_argument, 0, 'u'}, /* nflanking */
  {"print-comment", no_argument, 0, 0}, /* print_comment_p */

  /* Alignment options.  Note that 'P', 'Q' and 'a' are bound to different
     long names depending on whether this is a PMAP build. */
  {"exons", required_argument, 0, 'E'}, /* printtype */
#ifdef PMAP
  {"protein_gen", no_argument, 0, 'P'}, /* printtype */
  {"nucleotide", no_argument, 0, 'Q'}, /* printtype */
#else
  {"protein_dna", no_argument, 0, 'P'}, /* printtype */
  {"protein_gen", no_argument, 0, 'Q'}, /* printtype */
  {"fulllength", no_argument, 0, 'F'}, /* fulllengthp */
  {"cdsstart", required_argument, 0, 'a'}, /* cds_startpos */
  {"truncate", no_argument, 0, 'T'}, /* truncatep */
  {"direction", required_argument, 0, 'z'}, /* sense_try, sense_filter */
#endif
  {"tolerant", no_argument, 0, 'Y'}, /* strictp */
  {"nolengths", no_argument, 0, 'N'}, /* nointronlenp */
  {"invertmode", required_argument, 0, 'I'}, /* invertmode */
  {"introngap", required_argument, 0, 'i'}, /* ngap */
  {"wraplength", required_argument, 0, 'l'}, /* wraplength */

  /* Filtering options */
  {"min-trimmed-coverage", required_argument, 0, 0}, /* min_trimmed_coverage */
  {"min-identity", required_argument, 0, 0}, /* min_identity */

  /* Diagnostic options */
  {"time", no_argument, 0, 0}, /* timingp */

  /* Help options */
  {"check", no_argument, 0, 0}, /* check_compiler_assumptions */
  {"version", no_argument, 0, 0}, /* print_program_version */
  {"help", no_argument, 0, 0}, /* print_program_usage */
  {0, 0, 0, 0}
};
675
676
677 static void
print_program_version()678 print_program_version () {
679 char *genomedir;
680
681 fprintf(stdout,"\n");
682 #ifdef PMAP
683 fprintf(stdout,"PMAP: Protein Mapping and Alignment Program\n");
684 #else
685 fprintf(stdout,"GMAP: Genomic Mapping and Alignment Program\n");
686 #endif
687 fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
688 fprintf(stdout,"Build target: %s\n",TARGET);
689 fprintf(stdout,"Features: ");
690 #ifdef HAVE_PTHREAD
691 fprintf(stdout,"pthreads enabled, ");
692 #else
693 fprintf(stdout,"no pthreads, ");
694 #endif
695 #ifdef HAVE_ALLOCA
696 fprintf(stdout,"alloca available, ");
697 #else
698 fprintf(stdout,"no alloca, ");
699 #endif
700 #ifdef HAVE_ZLIB
701 fprintf(stdout,"zlib available, ");
702 #else
703 fprintf(stdout,"no zlib, ");
704 #endif
705 #ifdef HAVE_MMAP
706 fprintf(stdout,"mmap available, ");
707 #else
708 fprintf(stdout,"no mmap, ");
709 #endif
710 #ifdef WORDS_BIGENDIAN
711 fprintf(stdout,"bigendian, ");
712 #else
713 fprintf(stdout,"littleendian, ");
714 #endif
715 #ifdef HAVE_SIGACTION
716 fprintf(stdout,"sigaction available, ");
717 #else
718 fprintf(stdout,"no sigaction, ");
719 #endif
720 #ifdef HAVE_64_BIT
721 fprintf(stdout,"64 bits available");
722 #else
723 fprintf(stdout,"64 bits not available");
724 #endif
725 fprintf(stdout,"\n");
726
727 fprintf(stdout,"Popcnt:");
728 #ifdef HAVE_POPCNT
729 fprintf(stdout," popcnt/lzcnt/tzcnt");
730 #endif
731 #ifdef HAVE_MM_POPCNT
732 fprintf(stdout," mm_popcnt");
733 #endif
734 #ifdef HAVE_BUILTIN_POPCOUNT
735 fprintf(stdout," builtin_popcount");
736 #endif
737 fprintf(stdout,"\n");
738
739 fprintf(stdout,"Builtin functions:");
740 #ifdef HAVE_BUILTIN_CLZ
741 fprintf(stdout," builtin_clz");
742 #endif
743 #ifdef HAVE_BUILTIN_CTZ
744 fprintf(stdout," builtin_ctz");
745 #endif
746 #ifdef HAVE_BUILTIN_POPCOUNT
747 fprintf(stdout," builtin_popcount");
748 #endif
749 fprintf(stdout,"\n");
750
751
752 fprintf(stdout,"SIMD functions compiled:");
753 #ifdef HAVE_ALTIVEC
754 fprintf(stdout," Altivec");
755 #endif
756 #ifdef HAVE_MMX
757 fprintf(stdout," MMX");
758 #endif
759 #ifdef HAVE_SSE
760 fprintf(stdout," SSE");
761 #endif
762 #ifdef HAVE_SSE2
763 fprintf(stdout," SSE2");
764 #endif
765 #ifdef HAVE_SSE3
766 fprintf(stdout," SSE3");
767 #endif
768 #ifdef HAVE_SSSE3
769 fprintf(stdout," SSSE3");
770 #endif
771 #ifdef HAVE_SSE4_1
772 fprintf(stdout," SSE4.1");
773 #endif
774 #ifdef HAVE_SSE4_2
775 fprintf(stdout," SSE4.2");
776 #endif
777 #ifdef HAVE_AVX2
778 fprintf(stdout," AVX2");
779 #endif
780 #ifdef HAVE_AVX512
781 fprintf(stdout," AVX512");
782 #endif
783 #ifdef HAVE_AVX512BW
784 fprintf(stdout," AVX512BW");
785 #endif
786 fprintf(stdout,"\n");
787
788
789 #ifdef PMAP
790 fprintf(stdout,"Stage 1 index size: %d aa\n",index1part_aa);
791 #endif
792 fprintf(stdout,"Sizes: off_t (%d), size_t (%d), unsigned int (%d), long int (%d), long long int (%d)\n",
793 (int) sizeof(off_t),(int) sizeof(size_t),(int) sizeof(unsigned int),(int) sizeof(long int),(int) sizeof(long long int));
794 fprintf(stdout,"Default gmap directory (compiled): %s\n",GMAPDB);
795 genomedir = Datadir_find_genomedir(/*user_genomedir*/NULL);
796 fprintf(stdout,"Default gmap directory (environment): %s\n",genomedir);
797 FREE(genomedir);
798 fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
799 fprintf(stdout,"Contact: twu@gene.com\n");
800 fprintf(stdout,"\n");
801 return;
802 }
803
804 /* This flag is not well-supported, and therefore hidden, but
805 kept for backwards compatibility */
806 /* -R, --rel=STRING Release\n\ */
807
/* Forward declaration only; the definition is not in this part of the
   file.  The long_options table associates it with the --help option. */
static void
print_program_usage ();
810
811
/* Diagnostic for the --check option.  Takes two values from rand() and
   prints to stderr the results of a few SIMD and bit-counting operations
   on them, one section per instruction set enabled at compile time, so
   the output can be inspected to confirm that the compiler options used
   for the build behave as expected.  Returns nothing; all output goes to
   stderr. */
static void
check_compiler_assumptions () {
  unsigned int x = rand(), y = rand();
#ifdef HAVE_SSE2
  int z;
  __m128i a;
#ifdef HAVE_SSE4_1
  char negx, negy;
#endif
#endif


#ifdef HAVE_SSE2
  fprintf(stderr,"Checking compiler assumptions for SSE2: ");
  fprintf(stderr,"%08X %08X",x,y);
  a = _mm_xor_si128(_mm_set1_epi32(x),_mm_set1_epi32(y));
  z = _mm_cvtsi128_si32(a);
  /* %X requires an unsigned argument, but z is a signed int */
  fprintf(stderr," xor=%08X\n",(unsigned int) z);
#endif

#ifdef HAVE_SSE4_1
  /* Force both test bytes to be non-positive before taking the signed max */
  if ((negx = (char) x) > 0) {
    negx = -negx;
  }
  if ((negy = (char) y) > 0) {
    negy = -negy;
  }

  fprintf(stderr,"Checking compiler assumptions for SSE4.1: ");
  fprintf(stderr,"%d %d",negx,negy);
  a = _mm_max_epi8(_mm_set1_epi8(negx),_mm_set1_epi8(negy));
  z = _mm_extract_epi8(a,0);
  fprintf(stderr," max=%d => ",z);
  if (negx > negy) {
    if (z == (int) negx) {
      fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
    } else {
      fprintf(stderr,"compiler zero extends\n");
    }
  } else {
    if (z == (int) negy) {
      fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
    } else {
      fprintf(stderr,"compiler zero extends\n");
    }
  }
#endif

#ifdef HAVE_SSE4_2
  fprintf(stderr,"Checking compiler options for SSE4.2: ");
  fprintf(stderr,"%08X ",x);
  /* The count results are cast to int so that every argument matches its
     %d conversion (_lzcnt_u32 and _tzcnt_u32 return unsigned values) */
#ifdef HAVE_LZCNT
  fprintf(stderr,"_lzcnt_u32=%d ",(int) _lzcnt_u32(x));
#endif
#ifdef HAVE_BUILTIN_CLZ
  /* __builtin_clz has an undefined result for 0, which rand() can return */
  if (x != 0) {
    fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
  }
#endif
#ifdef HAVE_TZCNT
  fprintf(stderr,"_tzcnt_u32=%d ",(int) _tzcnt_u32(x));
#endif
#ifdef HAVE_BUILTIN_CTZ
  /* __builtin_ctz has an undefined result for 0 as well */
  if (x != 0) {
    fprintf(stderr,"__builtin_ctz=%d ",__builtin_ctz(x));
  }
#endif

#ifdef HAVE_POPCNT
  fprintf(stderr,"_popcnt32=%d ",(int) _popcnt32(x));
#endif
#if defined(HAVE_MM_POPCNT)
  fprintf(stderr,"_mm_popcnt_u32=%d ",(int) _mm_popcnt_u32(x));
#endif
#if defined(HAVE_BUILTIN_POPCOUNT)
  fprintf(stderr,"__builtin_popcount=%d ",__builtin_popcount(x));
#endif
  fprintf(stderr,"\n");

#endif

  fprintf(stderr,"Finished checking compiler assumptions\n");

  return;
}
893
894
895 /************************************************************************/
896
897
898 /* Call before Stage1_compute */
899 static Diagnostic_T
evaluate_query(bool * poorp,bool * repetitivep,char * queryuc_ptr,int querylength,Oligoindex_T oligoindex)900 evaluate_query (bool *poorp, bool *repetitivep, char *queryuc_ptr, int querylength,
901 Oligoindex_T oligoindex) {
902 Diagnostic_T diagnostic;
903
904 diagnostic = Diagnostic_new();
905
906 #ifdef PMAP
907 Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
908 &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
909 &diagnostic->query_trim_end,oligoindex,queryuc_ptr,
910 /*querystart*/0,/*queryend*/querylength);
911 *poorp = false;
912 *repetitivep = false;
913 #else
914 diagnostic->query_oligodepth =
915 Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
916 &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
917 &diagnostic->query_trim_end,oligoindex,queryuc_ptr,
918 /*querystart*/0,/*queryend*/querylength,/*trimp*/true);
919
920 debug2(printf("query_trimoligos %d, fraction badoligos %f = %d/%d, oligodepth %f, fraction repoligos %f = %d/%d\n",
921 diagnostic->query_trimoligos,
922 (double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos,
923 diagnostic->query_badoligos,diagnostic->query_trimoligos,
924 diagnostic->query_oligodepth,
925 (double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos,
926 diagnostic->query_repoligos,diagnostic->query_trimoligos));
927
928 if (diagnostic->query_trimoligos == 0) {
929 *poorp = true;
930 } else if (((double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos > MAX_BADOLIGOS) ||
931 (diagnostic->query_trim_end - diagnostic->query_trim_start < 80 && diagnostic->query_badoligos > 0)) {
932 *poorp = true;
933 } else {
934 *poorp = false;
935 }
936
937 if (diagnostic->query_trimoligos == 0) {
938 *repetitivep = false;
939 } else if (diagnostic->query_oligodepth > MAX_OLIGODEPTH ||
940 (double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos > MAX_REPOLIGOS) {
941 *repetitivep = true;
942 } else {
943 *repetitivep = false;
944 }
945 #endif
946
947 return diagnostic;
948 }
949
950
951
952
953 static Stage3_T *
stage3array_from_list(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,List_T stage3list,bool chimerap,bool remove_overlaps_p)954 stage3array_from_list (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
955 List_T stage3list, bool chimerap, bool remove_overlaps_p) {
956 Stage3_T *array1, *array0, x, y;
957 bool *eliminate;
958 int norig_primary, norig_altloc, i_primary, i_altloc, i, j;
959 int threshold_score;
960
961 Univcoord_T alias_start, alias_end;
962
963 debug(printf("Entering stage3array_from_list with %d entries\n",List_length(stage3list)));
964
965 /* Stage3_recompute_goodness(stage3list); -- No longer necessary */
966 Stage3_compute_mapq(stage3list);
967
968 if (stage3list == NULL) {
969 *first_absmq = 0;
970 *second_absmq = 0;
971 *npaths_primary = *npaths_altloc = 0;
972 return (Stage3_T *) NULL;
973
974 #if 0
975 } else if (mergedp == true) {
976 debug(printf("mergedp is true\n"));
977 Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
978 array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
979 List_free(&stage3list);
980 *first_absmq = 0;
981 *second_absmq = 0;
982 *npaths_primary = norig_primary;
983 *npaths_altloc = norig_altloc;
984 return array0;
985 #endif
986
987 } else if (chimerap == true) {
988 debug(printf("chimerap is true\n"));
989 Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
990 array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
991 List_free(&stage3list);
992 *first_absmq = Stage3_absmq_score(array0[0]);
993 if (norig_primary + norig_altloc <= 2) {
994 *second_absmq = 0;
995 } else {
996 qsort(&(array0[2]),norig_primary + norig_altloc - 2,sizeof(Stage3_T),Stage3_cmp);
997 *second_absmq = Stage3_absmq_score(array0[2]);
998 }
999 *npaths_primary = norig_primary;
1000 *npaths_altloc = norig_altloc;
1001 return array0;
1002
1003 } else if (remove_overlaps_p == false) {
1004 debug(printf("remove_overlaps_p is false\n"));
1005 Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
1006 array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
1007 List_free(&stage3list);
1008 qsort(array0,norig_primary + norig_altloc,sizeof(Stage3_T),Stage3_cmp);
1009
1010 if (suboptimal_score_float < 1.0) {
1011 threshold_score = Stage3_goodness(array0[0]) * suboptimal_score_float;
1012 debug(printf("threshold score %d = goodness %d * suboptimal score_float %f\n",
1013 threshold_score,Stage3_goodness(array0[0]),suboptimal_score_float));
1014 } else {
1015 threshold_score = Stage3_goodness(array0[0]) - (int) suboptimal_score_float;
1016 debug(printf("threshold score %d = goodness %d - suboptimal score %d\n",
1017 threshold_score,Stage3_goodness(array0[0]),(int) suboptimal_score_float));
1018 }
1019
1020 if (Stage3_altloc_chr(&alias_start,&alias_end,array0[0]) == false) {
1021 i_primary = 1;
1022 i_altloc = 0;
1023 } else {
1024 i_primary = 0;
1025 i_altloc = 1;
1026 }
1027 i = 1;
1028 while (i < norig_primary + norig_altloc && Stage3_goodness(array0[i]) >= threshold_score) {
1029 if (Stage3_altloc_chr(&alias_start,&alias_end,array0[i]) == false) {
1030 i_primary++;
1031 } else {
1032 i_altloc++;
1033 }
1034 i++;
1035 }
1036 while (i < norig_primary + norig_altloc) {
1037 Stage3_free(&(array0[i]));
1038 i++;
1039 }
1040
1041 *npaths_primary = i_primary;
1042 *npaths_altloc = i_altloc;
1043 *first_absmq = Stage3_absmq_score(array0[0]);
1044 if ((*npaths_primary) + (*npaths_altloc) < 2) {
1045 *second_absmq = 0;
1046 } else {
1047 *second_absmq = Stage3_absmq_score(array0[1]);
1048 }
1049
1050 return array0;
1051
1052 } else {
1053 debug(printf("remove_overlaps_p is true\n"));
1054 Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
1055 eliminate = (bool *) CALLOCA(norig_primary + norig_altloc,sizeof(bool));
1056
1057 /* Initial sort to remove subsumed alignments */
1058 array0 = (Stage3_T *) MALLOCA((norig_primary + norig_altloc) * sizeof(Stage3_T));
1059 List_fill_array_and_free((void **) array0,&stage3list);
1060 qsort(array0,norig_primary + norig_altloc,sizeof(Stage3_T),Stage3_cmp);
1061
1062 for (i = 0; i < norig_primary + norig_altloc; i++) {
1063 x = array0[i];
1064 debug(printf("%d: chr %d:%u..%u, goodness %d, matches %d, npairs %d\n",
1065 i,Stage3_chrnum(x),Stage3_chrstart(x),Stage3_chrend(x),Stage3_goodness(x),Stage3_matches(x),Stage3_npairs(x)));
1066 for (j = i+1; j < norig_primary + norig_altloc; j++) {
1067 y = array0[j];
1068 if (Stage3_overlap(x,y)) {
1069 eliminate[j] = true;
1070 }
1071 }
1072 }
1073
1074
1075 *npaths_primary = *npaths_altloc = 0;
1076 for (i = 0; i < norig_primary + norig_altloc; i++) {
1077 if (eliminate[i] == false) {
1078 if (Stage3_altloc_chr(&alias_start,&alias_end,array0[i]) == false) {
1079 (*npaths_primary)++;
1080 } else {
1081 (*npaths_altloc)++;
1082 }
1083 }
1084 }
1085
1086 array1 = (Stage3_T *) MALLOC_OUT(((*npaths_primary) + (*npaths_altloc)) * sizeof(Stage3_T)); /* Return value */
1087 j = 0;
1088 for (i = 0; i < norig_primary + norig_altloc; i++) {
1089 x = array0[i];
1090 if (eliminate[i] == true) {
1091 Stage3_free(&x);
1092 } else {
1093 array1[j++] = x;
1094 }
1095 }
1096 FREEA(array0);
1097 FREEA(eliminate);
1098
1099 if (suboptimal_score_float < 1.0) {
1100 threshold_score = Stage3_goodness(array1[0]) * suboptimal_score_float;
1101 debug(printf("threshold score %d = goodness %d * suboptimal score %f\n",
1102 threshold_score,Stage3_goodness(array1[0]),suboptimal_score_float));
1103 } else {
1104 threshold_score = Stage3_goodness(array1[0]) - (int) suboptimal_score_float;
1105 debug(printf("threshold score %d = goodness %d - suboptimal score %d\n",
1106 threshold_score,Stage3_goodness(array1[0]),(int) suboptimal_score_float));
1107 }
1108
1109 if (Stage3_altloc_chr(&alias_start,&alias_end,array1[0]) == false) {
1110 i_primary = 1;
1111 i_altloc = 0;
1112 } else {
1113 i_primary = 0;
1114 i_altloc = 1;
1115 }
1116 i = 1;
1117 while (i < (*npaths_primary) + (*npaths_altloc) && Stage3_goodness(array1[i]) >= threshold_score) {
1118 if (Stage3_altloc_chr(&alias_start,&alias_end,array1[i]) == false) {
1119 i_primary++;
1120 } else {
1121 i_altloc++;
1122 }
1123 i++;
1124 }
1125 while (i < (*npaths_primary) + (*npaths_altloc)) {
1126 Stage3_free(&(array1[i]));
1127 i++;
1128 }
1129
1130 *npaths_primary = i_primary;
1131 *npaths_altloc = i_altloc;
1132 *first_absmq = Stage3_absmq_score(array1[0]);
1133 if ((*npaths_primary) + (*npaths_altloc) < 2) {
1134 *second_absmq = 0;
1135 } else {
1136 *second_absmq = Stage3_absmq_score(array1[1]);
1137 }
1138 return array1;
1139 }
1140 }
1141
1142
1143 static List_T
update_stage3middle_list(List_T stage3middle_list,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Chrpos_T chrstart,Chrpos_T chrend,bool watsonp,int genestrand,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1144 update_stage3middle_list (List_T stage3middle_list, Sequence_T queryseq,
1145 #ifdef PMAP
1146 Sequence_T queryntseq,
1147 #endif
1148 Sequence_T queryuc, Stage2_alloc_T stage2_alloc,
1149 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1150 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1151 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1152 Chrpos_T chrstart, Chrpos_T chrend, bool watsonp, int genestrand,
1153 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1154 Stopwatch_T worker_stopwatch) {
1155 /* int stage2_source, stage2_indexsize; */
1156 /* double stage3_runtime; */
1157
1158 #ifdef PMAP
1159 Sequence_T genomicuc = NULL;
1160 char *genomicseg_ptr = NULL, *genomicuc_ptr = NULL;
1161 #elif defined(EXTRACT_GENOMICSEG)
1162 Sequence_T genomicuc = NULL;
1163 #endif
1164
1165 List_T all_stage2results, all_stage3middle_results = NULL, p;
1166 Stage2_T stage2;
1167 Stage3middle_T stage3middle;
1168 #ifdef PMAP
1169 int subseq_offset;
1170 #endif
1171
1172
1173 #ifdef PMAP_OLD
1174 /* Previously used for PMAP */
1175 if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1176 genomicuc = Sequence_alias(genomicseg);
1177 } else {
1178 genomicuc = Sequence_uppercase(genomicseg);
1179 }
1180 genomicseg_ptr = Sequence_fullpointer(genomicseg);
1181 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1182 #elif defined(EXTRACT_GENOMICSEG)
1183 if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1184 genomicuc = Sequence_alias(genomicseg);
1185 } else {
1186 genomicuc = Sequence_uppercase(genomicseg);
1187 }
1188 genomicseg_ptr = Sequence_fullpointer(genomicseg);
1189 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1190 #endif
1191
1192 #if 0
1193 if (canonical_mode == 0) {
1194 do_final_p = false;
1195 } else if (canonical_mode == 1) {
1196 do_final_p = true;
1197 } else if (lowidentityp == false) {
1198 do_final_p = false;
1199 } else {
1200 do_final_p = true;
1201 }
1202 #endif
1203
1204 debug(printf("Entering update_stage3middle_list with %d results\n",List_length(stage3middle_list)));
1205 debug2(printf("Beginning Stage2_compute with chrstart %u and chrend %u and query_subseq_offset %d\n",
1206 chrstart,chrend,Sequence_subseq_offset(queryseq)));
1207 all_stage2results = Stage2_compute(Sequence_trimpointer(queryseq),Sequence_trimpointer(queryuc),
1208 Sequence_trimlength(queryseq),/*query_offset*/0,
1209 chrstart,chrend,chroffset,chrhigh,/*plusp*/watsonp,genestrand,
1210 stage2_alloc,/*proceed_pctcoverage*/0.3,oligoindices_major,
1211 pairpool,diagpool,cellpool,
1212 /*localp*/true,/*skip_repetitive_p*/true,
1213 /*favor_right_p*/false,/*max_nalignments*/MAX_NALIGNMENTS,debug_graphic_p,
1214 worker_stopwatch,diag_debug);
1215 debug(printf("End of Stage2_compute\n"));
1216
1217
1218 for (p = all_stage2results; p != NULL; p = List_next(p)) {
1219 stage2 = (Stage2_T) List_head(p);
1220 stage3middle = Stage3_compute_middle(Stage2_middle(stage2),Stage2_all_starts(stage2),Stage2_all_ends(stage2),
1221 #ifdef PMAP
1222 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1223 /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1224 /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1225 /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1226 #else
1227 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1228 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1229 /*querylength*/Sequence_fulllength(queryseq),
1230 #endif
1231 chrnum,chroffset,chrhigh,chrlength,
1232 watsonp,genestrand,/*jump_late_p*/watsonp ? false : true,maxpeelback,
1233 oligoindices_minor,diagpool,cellpool,
1234 pairpool,dynprogL,dynprogM,dynprogR,sense_try);
1235 Stage2_free(&stage2);
1236 all_stage3middle_results = List_push(all_stage3middle_results,(void *) stage3middle);
1237 }
1238 List_free(&all_stage2results);
1239
1240 return List_append(all_stage3middle_results,stage3middle_list);
1241 }
1242
1243
1244
1245 /* Combination of update_stage3middle_list and Stage3_compute_ends,
1246 Needed for solving middle segments of chimeras */
1247 static List_T
update_stage3list(List_T stage3list,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,int straintype,char * strain,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Chrpos_T chrstart,Chrpos_T chrend,bool watsonp,int genestrand,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1248 update_stage3list (List_T stage3list, Sequence_T queryseq,
1249 #ifdef PMAP
1250 Sequence_T queryntseq,
1251 #endif
1252 Sequence_T queryuc, Stage2_alloc_T stage2_alloc,
1253 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1254 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool, int straintype, char *strain,
1255 Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1256 Chrpos_T chrstart, Chrpos_T chrend, bool watsonp, int genestrand,
1257 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1258 Stopwatch_T worker_stopwatch) {
1259 /* int stage2_source, stage2_indexsize; */
1260 /* double stage3_runtime; */
1261
1262 #ifdef PMAP
1263 Sequence_T genomicuc = NULL;
1264 char *genomicseg_ptr = NULL, *genomicuc_ptr = NULL;
1265 #elif defined(EXTRACT_GENOMICSEG)
1266 Sequence_T genomicuc = NULL;
1267 #endif
1268 List_T all_stage2results, p;
1269 Stage2_T stage2;
1270 Stage3_T stage3;
1271
1272 struct Pair_T *pairarray;
1273 List_T pairs;
1274 int goodness;
1275 int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1276 ncanonical, nsemicanonical, nnoncanonical;
1277 int sensedir;
1278 int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1279 Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1280 double ambig_prob_5, ambig_prob_3;
1281 double min_splice_prob;
1282 #ifdef PMAP
1283 int subseq_offset;
1284 #endif
1285
1286
1287 #ifdef PMAP_OLD
1288 /* Previously used for PMAP */
1289 if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1290 genomicuc = Sequence_alias(genomicseg);
1291 } else {
1292 genomicuc = Sequence_uppercase(genomicseg);
1293 }
1294 genomicseg_ptr = Sequence_fullpointer(genomicseg);
1295 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1296 #elif defined(EXTRACT_GENOMICSEG)
1297 if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1298 genomicuc = Sequence_alias(genomicseg);
1299 } else {
1300 genomicuc = Sequence_uppercase(genomicseg);
1301 }
1302 genomicseg_ptr = Sequence_fullpointer(genomicseg);
1303 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1304 #endif
1305
1306 #if 0
1307 if (canonical_mode == 0) {
1308 do_final_p = false;
1309 } else if (canonical_mode == 1) {
1310 do_final_p = true;
1311 } else if (lowidentityp == false) {
1312 do_final_p = false;
1313 } else {
1314 do_final_p = true;
1315 }
1316 #endif
1317
1318 debug(printf("Entering update_stage3list with %d results\n",List_length(stage3list)));
1319 debug2(printf("Beginning Stage2_compute with chrstart %u and chrend %u and query_subseq_offset %d\n",
1320 chrstart,chrend,Sequence_subseq_offset(queryseq)));
1321 all_stage2results = Stage2_compute(Sequence_trimpointer(queryseq),Sequence_trimpointer(queryuc),
1322 Sequence_trimlength(queryseq),/*query_offset*/0,
1323 chrstart,chrend,chroffset,chrhigh,/*plusp*/watsonp,genestrand,
1324 stage2_alloc,/*proceed_pctcoverage*/0.3,oligoindices_major,
1325 pairpool,diagpool,cellpool,
1326 /*localp*/true,/*skip_repetitive_p*/true,
1327 /*favor_right_p*/false,/*max_nalignments*/MAX_NALIGNMENTS,debug_graphic_p,
1328 worker_stopwatch,diag_debug);
1329
1330 debug(printf("End of Stage2_compute\n"));
1331
1332 for (p = all_stage2results; p != NULL; p = List_next(p)) {
1333 stage2 = (Stage2_T) List_head(p);
1334
1335 /* Stopwatch_start(worker_stopwatch); */
1336 #ifdef PMAP
1337 subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1338 #endif
1339 pairarray = Stage3_compute_one(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1340 &matches,&nmatches_posttrim,&max_match_length,
1341 &ambig_end_length_5,&ambig_end_length_3,
1342 &ambig_splicetype_5,&ambig_splicetype_3,
1343 &ambig_prob_5,&ambig_prob_3,
1344 &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1345 &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1346 Stage2_middle(stage2),Stage2_all_starts(stage2),Stage2_all_ends(stage2),
1347 #ifdef PMAP
1348 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1349 /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1350 /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1351 /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1352 /*skiplength*/Sequence_skiplength(queryntseq),
1353 /*query_subseq_offset*/subseq_offset,
1354 #else
1355 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1356 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1357 /*querylength*/Sequence_fulllength(queryseq),
1358 /*skiplength*/Sequence_skiplength(queryseq),
1359 /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1360 #endif
1361 chrnum,chroffset,chrhigh,
1362 /*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1363 watsonp,genestrand,/*jump_late_p*/watsonp ? false : true,maxpeelback,
1364 oligoindices_minor,diagpool,cellpool,
1365 pairpool,dynprogL,dynprogM,dynprogR,sense_try,sense_filter);
1366 /* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1367 if (pairarray == NULL) {
1368 /* Skip */
1369 } else if (matches < min_matches) {
1370 FREE_OUT(pairarray);
1371 } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1372 matches,unknowns,mismatches,
1373 qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1374 chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
1375 /*querylength*/Sequence_fulllength(queryseq),
1376 /*skiplength*/Sequence_skiplength(queryseq),
1377 /*trimlength*/Sequence_trimlength(queryseq),
1378 straintype,strain,altstrain_iit)) != NULL) {
1379 debug(printf("Pushing %p onto stage3list\n",stage3));
1380 stage3list = List_push(stage3list,(void *) stage3);
1381 }
1382
1383 Stage2_free(&stage2);
1384 }
1385
1386 List_free(&all_stage2results);
1387
1388 #ifdef PMAP_OLD
1389 Sequence_free(&genomicuc);
1390 #elif defined(EXTRACT_GENOMICSEG)
1391 Sequence_free(&genomicuc);
1392 #endif
1393
1394 return stage3list;
1395 }
1396
1397
1398
#if 0
/* This code is duplicated in get-genome.c */
/* qsort comparator over int indices into altstrain_iit.  Orders by
   ascending interval type; within one type, by descending low position. */
static int
index_compare (const void *a, const void *b) {
  const int index1 = * (const int *) a;
  const int index2 = * (const int *) b;
  const int type1 = Interval_type(IIT_interval(altstrain_iit,index1));
  const int type2 = Interval_type(IIT_interval(altstrain_iit,index2));
  Chrpos_T pos1, pos2;

  if (type1 != type2) {
    return (type1 < type2) ? -1 : +1;
  }

  /* Store in descending genomic position, so right shifting works
     in Genome_patch_strain */
  pos1 = Interval_low(IIT_interval(altstrain_iit,index1));
  pos2 = Interval_low(IIT_interval(altstrain_iit,index2));

  if (pos1 == pos2) {
    return 0;
  }
  return (pos1 > pos2) ? -1 : +1;
}
#endif
1431
1432
1433 /* Not sure how to treat genestrand for usersegment */
1434 static Stage3_T *
stage3_from_usersegment(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1435 stage3_from_usersegment (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
1436 Sequence_T queryseq, Sequence_T queryuc,
1437 #ifdef PMAP
1438 Sequence_T queryntseq,
1439 #endif
1440 Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
1441 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1442 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1443 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1444 Stopwatch_T worker_stopwatch) {
1445 List_T stage3list, stage3middle_list, p;
1446 Stage3middle_T stage3middle;
1447 Stage3_T stage3;
1448 bool watsonp;
1449
1450 struct Pair_T *pairarray;
1451 List_T pairs;
1452 int goodness;
1453 int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1454 ncanonical, nsemicanonical, nnoncanonical;
1455 int sensedir;
1456 int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1457 Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1458 double ambig_prob_5, ambig_prob_3;
1459 double min_splice_prob;
1460 #ifdef PMAP
1461 int subseq_offset;
1462 #endif
1463
1464 Univcoord_T chroffset, chrhigh;
1465 Chrpos_T chrlength, chrpos;
1466 Chrnum_T chrnum = 0;
1467
1468 #ifdef PMAP
1469 Sequence_T revcomp;
1470 #endif
1471
1472 chroffset = chrpos = 0U;
1473 chrhigh = chrlength = Sequence_fulllength(usersegment);
1474
1475 stage3middle_list = update_stage3middle_list(/*stage3middle_list*/NULL,queryseq,
1476 #ifdef PMAP
1477 queryntseq,
1478 #endif
1479 queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1480 pairpool,diagpool,cellpool,chrnum,chroffset,chrhigh,chrlength,
1481 /*chrstart*/0,/*chrend*/chrhigh,/*watsonp*/true,
1482 /*genestrand for usersegment*/0,
1483 dynprogL,dynprogM,dynprogR,worker_stopwatch);
1484
1485 #ifdef PMAP
1486 revcomp = Sequence_revcomp(usersegment);
1487 #endif
1488
1489 stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1490 #ifdef PMAP
1491 queryntseq,
1492 #endif
1493 queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1494 pairpool,diagpool,cellpool,chrnum,chroffset,chrhigh,chrlength,
1495 /*chrstart*/0,/*chrend*/chrhigh,/*watsonp*/false,
1496 /*genestrand for usersegment*/0,
1497 dynprogL,dynprogM,dynprogR,worker_stopwatch);
1498
1499 #ifdef PMAP
1500 Sequence_free(&revcomp);
1501 #endif
1502
1503 if (stage3middle_list == NULL) {
1504 *npaths_primary = *npaths_altloc = 0;
1505 return (Stage3_T *) NULL;
1506
1507 } else {
1508 stage3list = (List_T) NULL;
1509 for (p = stage3middle_list; p != NULL; p = List_next(p)) {
1510 stage3middle = (Stage3middle_T) List_head(p);
1511 watsonp = Stage3middle_watsonp(stage3middle);
1512
1513 #ifdef PMAP
1514 subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1515 #endif
1516 pairarray = Stage3_compute_ends(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1517 &matches,&nmatches_posttrim,&max_match_length,
1518 &ambig_end_length_5,&ambig_end_length_3,
1519 &ambig_splicetype_5,&ambig_splicetype_3,
1520 &ambig_prob_5,&ambig_prob_3,
1521 &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1522 &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1523 stage3middle,
1524 #ifdef PMAP
1525 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1526 /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1527 /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1528 /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1529 /*skiplength*/Sequence_skiplength(queryntseq),
1530 /*query_subseq_offset*/subseq_offset,
1531 #else
1532 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1533 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1534 /*querylength*/Sequence_fulllength(queryseq),
1535 /*skiplength*/Sequence_skiplength(queryseq),
1536 /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1537 #endif
1538 /*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1539 maxpeelback,pairpool,dynprogL,dynprogM,dynprogR,
1540 sense_filter,oligoindices_minor,diagpool,cellpool);
1541 /* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1542 if (pairarray == NULL) {
1543 /* Skip */
1544 } else if (matches < min_matches) {
1545 FREE_OUT(pairarray);
1546 } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1547 matches,unknowns,mismatches,
1548 qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1549 chrnum,chroffset,chrhigh,chrlength,watsonp,/*genestrand for usersegment*/0,
1550 /*querylength*/Sequence_fulllength(queryseq),
1551 /*skiplength*/Sequence_skiplength(queryseq),
1552 /*trimlength*/Sequence_trimlength(queryseq),
1553 /*straintype*/0,/*strain*/NULL,altstrain_iit)) != NULL) {
1554 debug(printf("Pushing %p onto stage3list\n",stage3));
1555 stage3list = List_push(stage3list,(void *) stage3);
1556 }
1557 Stage3middle_free(&stage3middle);
1558 }
1559 List_free(&stage3middle_list);
1560
1561 return stage3array_from_list(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),
1562 stage3list,/*chimerap*/false,/*remove_overlaps_p*/true);
1563 }
1564 }
1565
1566
#if 0
/* Collapses runs of entries that compare equal under Stage3_position_cmp.
   From each run, keeps the entry with the lowest Stage3_goodness, breaking
   ties by the shorter Stage3_genomiclength, and frees the others.  The
   input list cells are freed; lists of length 0 or 1 are returned as is
   (NULL for length 0). */
static List_T
stage3list_remove_duplicates (List_T stage3list) {
  List_T unique = NULL;
  Stage3_T *array;
  int best_score, score;
  Chrpos_T shortest_genomiclength, genomiclength;
  int n, besti, run_start, run_end, k;

  n = List_length(stage3list);
  if (n == 0) {
    return (List_T) NULL;
  }
  if (n == 1) {
    return stage3list;
  }

  array = (Stage3_T *) List_to_array(stage3list,NULL);
  List_free(&stage3list);
  qsort(array,n,sizeof(Stage3_T),Stage3_position_cmp);

  for (run_start = 0; run_start < n; run_start = run_end) {
    best_score = Stage3_goodness(array[run_start]);
    shortest_genomiclength = Stage3_genomiclength(array[run_start]);
    besti = run_start;
    debug3(printf("i = %d, score %d, genomiclength %u\n",
                  run_start,best_score,shortest_genomiclength));

    /* Extend the run while entries share the position of its first member */
    for (run_end = run_start + 1;
         run_end < n && Stage3_position_cmp(&(array[run_start]),&(array[run_end])) == 0;
         run_end++) {
      debug3(printf("  j = %d, score %d, genomiclength %u\n",
                    run_end,Stage3_goodness(array[run_end]),Stage3_genomiclength(array[run_end])));

      score = Stage3_goodness(array[run_end]);
      genomiclength = Stage3_genomiclength(array[run_end]);
      if (score < best_score ||
          (score == best_score && genomiclength < shortest_genomiclength)) {
        best_score = score;
        shortest_genomiclength = genomiclength;
        besti = run_end;
      }
    }
    debug3(printf("  => besti = %d, score %d, genomiclength %u\n",
                  besti,best_score,shortest_genomiclength));

    for (k = run_start; k < run_end; k++) {
      if (k != besti) {
        Stage3_free(&(array[k]));
      } else {
        unique = List_push(unique,(void *) array[besti]);
      }
    }
  }

  FREE(array);

  return unique;
}
#endif
1632
1633
#if 0
/* Returns a new list holding only the Stage3_T entries of stage3list for
   which Stage3_pairs returns non-NULL.  Entries with no pairs are freed,
   and the cells of the input list are released, so the caller must use
   only the returned list afterwards. */
static List_T
stage3list_remove_empties (List_T stage3list) {
  List_T nonempty = NULL, p;
  Stage3_T stage3;

  for (p = stage3list; p != NULL; p = List_next(p)) {
    stage3 = (Stage3_T) List_head(p);
    /* Test the pairs of this entry; comparing the bare name Stage3_pairs
       against NULL would never be true */
    if (Stage3_pairs(stage3) == NULL) {
      debug2(printf("Removing empty stage3 %p\n",stage3));
      Stage3_free(&stage3);
    } else {
      nonempty = List_push(nonempty,(void *) stage3);
    }
  }

  /* Every entry was either freed or moved, so the old cells are no longer needed */
  List_free(&stage3list);

  return nonempty;
}
#endif
1653
1654
1655 static List_T
stage3list_sort(List_T stage3list)1656 stage3list_sort (List_T stage3list) {
1657 List_T sorted = NULL;
1658 Stage3_T *array;
1659 int n, i;
1660
1661 if ((n = List_length(stage3list)) == 0) {
1662 return (List_T) NULL;
1663 } else if (n == 1) {
1664 return stage3list;
1665 } else {
1666 array = (Stage3_T *) List_to_array(stage3list,NULL);
1667 List_free(&stage3list);
1668 qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1669 for (i = n-1; i >= 0; i--) {
1670 sorted = List_push(sorted,(void *) array[i]);
1671 }
1672 FREE(array);
1673
1674 return sorted;
1675 }
1676 }
1677
1678
1679 static List_T
stage3list_filter_and_sort(Chimera_T * chimera,List_T stage3list)1680 stage3list_filter_and_sort (Chimera_T *chimera, List_T stage3list) {
1681 List_T sorted = NULL;
1682 Stage3_T *array, stage3;
1683 int n, i;
1684
1685 if ((n = List_length(stage3list)) == 0) {
1686 return (List_T) NULL;
1687
1688 } else if (n == 1) {
1689 stage3 = (Stage3_T) List_head(stage3list);
1690 if (Stage3_passes_filter(stage3,min_trimmed_coverage,min_identity) == false) {
1691 Stage3_free(&stage3);
1692 List_free(&stage3list);
1693 return (List_T) NULL;
1694 } else {
1695 return stage3list;
1696 }
1697
1698 } else if (*chimera == NULL) {
1699 array = (Stage3_T *) List_to_array(stage3list,NULL);
1700 List_free(&stage3list);
1701 qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1702 for (i = n-1; i >= 0; i--) {
1703 if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1704 Stage3_free(&(array[i]));
1705 } else {
1706 sorted = List_push(sorted,(void *) array[i]);
1707 }
1708 }
1709 FREE(array);
1710 return sorted;
1711
1712 } else if (Stage3_passes_filter_chimera(*chimera,min_trimmed_coverage,min_identity) == true) {
1713 array = (Stage3_T *) List_to_array(stage3list,NULL);
1714 List_free(&stage3list);
1715 qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1716 for (i = n-1; i >= 0; i--) {
1717 if (Stage3_chimera_left_p(array[i]) == true) {
1718 sorted = List_push(sorted,(void *) array[i]);
1719 } else if (Stage3_chimera_right_p(array[i]) == true) {
1720 sorted = List_push(sorted,(void *) array[i]);
1721 } else if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1722 Stage3_free(&(array[i]));
1723 } else {
1724 sorted = List_push(sorted,(void *) array[i]);
1725 }
1726 }
1727 FREE(array);
1728 return sorted;
1729
1730 } else {
1731 array = (Stage3_T *) List_to_array(stage3list,NULL);
1732 List_free(&stage3list);
1733 qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1734 for (i = n-1; i >= 0; i--) {
1735 if (Stage3_chimera_left_p(array[i]) == true) {
1736 Stage3_free(&(array[i]));
1737 } else if (Stage3_chimera_right_p(array[i]) == true) {
1738 Stage3_free(&(array[i]));
1739 } else if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1740 Stage3_free(&(array[i]));
1741 } else {
1742 sorted = List_push(sorted,(void *) array[i]);
1743 }
1744 }
1745 FREE(array);
1746
1747 Chimera_free(&(*chimera));
1748 *chimera = (Chimera_T) NULL;
1749
1750 return sorted;
1751 }
1752 }
1753
1754
1755 /* Each gregion has its own genestrand */
1756 static List_T
stage3_from_gregions(List_T stage3list,List_T gregions,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1757 stage3_from_gregions (List_T stage3list, List_T gregions,
1758 Sequence_T queryseq, Sequence_T queryuc,
1759 #ifdef PMAP
1760 Sequence_T queryntseq,
1761 #endif
1762 Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
1763 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1764 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1765 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1766 Stopwatch_T worker_stopwatch) {
1767 Gregion_T gregion, *gregion_array;
1768 int ngregions, ncovered, max_ncovered, stage2_source;
1769 int n, i;
1770
1771 List_T stage3middle_list = NULL;
1772 Stage3middle_T stage3middle, *stage3middle_array;
1773 Stage3_T stage3;
1774 bool watsonp;
1775 int genestrand;
1776
1777 Chrnum_T chrnum;
1778 Univcoord_T chroffset, chrhigh;
1779 Chrpos_T chrlength;
1780
1781 struct Pair_T *pairarray;
1782 List_T pairs;
1783 int goodness, best_score;
1784 int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1785 ncanonical, nsemicanonical, nnoncanonical;
1786 int sensedir;
1787 int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1788 Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1789 double ambig_prob_5, ambig_prob_3;
1790 double min_splice_prob;
1791 #ifdef PMAP
1792 int subseq_offset;
1793 #endif
1794
1795 #if 0
1796 int *indexarray, nindices, straintype, j;
1797 #endif
1798 void *item;
1799
1800 #ifdef EXTRACT_GENOMICSEG
1801 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1802 Sequence_T genomicseg = NULL, genomicuc = NULL;
1803 #endif
1804
1805 if (usersegment == NULL && (ngregions = List_length(gregions)) > 0) {
1806 gregion_array = (Gregion_T *) List_to_array(gregions,NULL);
1807 List_free(&gregions);
1808
1809 for (i = 0; i < ngregions; i++) {
1810 gregion = gregion_array[i];
1811
1812 #if defined(EXTRACT_GENOMICSEG)
1813 genomicseg = Genome_get_segment(genome,Gregion_genomicstart(gregion),Gregion_genomiclength(gregion),
1814 /*chromosome_iit*/NULL,Gregion_revcompp(gregion));
1815 genomicuc = Sequence_uppercase(genomicseg);
1816 genomicuc_ptr = Sequence_fullpointer(genomicuc);
1817 #endif
1818 ncovered = Stage2_scan(&stage2_source,Sequence_trimpointer(queryuc),Sequence_trimlength(queryseq),
1819 Gregion_chrstart(gregion),Gregion_chrend(gregion),
1820 Gregion_chroffset(gregion),Gregion_chrhigh(gregion),
1821 /*plusp*/Gregion_revcompp(gregion) ? false : true,Gregion_genestrand(gregion),
1822 stage2_alloc,oligoindices_major,diagpool,debug_graphic_p);
1823 Gregion_set_ncovered(gregion,ncovered,stage2_source);
1824 #if defined(EXTRACT_GENOMICSEG)
1825 Sequence_free(&genomicuc);
1826 Sequence_free(&genomicseg);
1827 #endif
1828 }
1829 qsort(gregion_array,ngregions,sizeof(Gregion_T),Gregion_cmp);
1830 max_ncovered = Gregion_ncovered(gregion_array[0]);
1831 debug(printf("max_ncovered of gregion_array[0] = %d\n",max_ncovered));
1832 if (max_ncovered < 0.10*Sequence_fulllength(queryseq)) {
1833 debug(printf("coverage is too short, so skipping\n"));
1834 for (i = 0; i < ngregions; i++) {
1835 Gregion_free(&(gregion_array[i]));
1836 }
1837 FREE(gregion_array);
1838
1839 } else {
1840 gregions = (List_T) NULL;
1841 i = 0;
1842 while (i < ngregions && Gregion_ncovered(gregion_array[i]) > 0.25*max_ncovered) {
1843 debug(printf("Keeping %d ncovered relative to %d\n",Gregion_ncovered(gregion_array[i]),max_ncovered));
1844 gregions = List_push(gregions,(void *) gregion_array[i]);
1845 i++;
1846 }
1847 while (i < ngregions) {
1848 debug(printf("Discarding array %d with ncovered = %d\n",i,Gregion_ncovered(gregion_array[i])));
1849 Gregion_free(&(gregion_array[i]));
1850 i++;
1851 }
1852 FREE(gregion_array);
1853 }
1854
1855 while (gregions != NULL) {
1856 gregions = List_pop(gregions,&item);
1857 gregion = (Gregion_T) item;
1858
1859 /* if (Match_usep(match) == true) { */
1860 if (1) {
1861 if (usersegment != NULL) {
1862 /* chrlength = Sequence_fulllength(usersegment); */
1863 /* strain = NULL; */
1864 stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1865 #ifdef PMAP
1866 queryntseq,
1867 #endif
1868 queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1869 pairpool,diagpool,cellpool,Gregion_chrnum(gregion),
1870 Gregion_chroffset(gregion),Gregion_chrhigh(gregion),Gregion_chrlength(gregion),
1871 Gregion_chrstart(gregion),Gregion_chrend(gregion),
1872 Gregion_plusp(gregion),Gregion_genestrand(gregion),
1873 dynprogL,dynprogM,dynprogR,worker_stopwatch);
1874 } else {
1875 stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1876 #ifdef PMAP
1877 queryntseq,
1878 #endif
1879 queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1880 pairpool,diagpool,cellpool,Gregion_chrnum(gregion),
1881 Gregion_chroffset(gregion),Gregion_chrhigh(gregion),Gregion_chrlength(gregion),
1882 Gregion_chrstart(gregion),Gregion_chrend(gregion),
1883 Gregion_plusp(gregion),Gregion_genestrand(gregion),
1884 dynprogL,dynprogM,dynprogR,worker_stopwatch);
1885 }
1886 }
1887 Gregion_free(&gregion);
1888 }
1889
1890 if (stage3middle_list != NULL) {
1891 stage3middle_array = (Stage3middle_T *) List_to_array_n(&n,stage3middle_list);
1892 qsort(stage3middle_array,n,sizeof(Stage3middle_T),Stage3middle_cmp);
1893 List_free(&stage3middle_list);
1894
1895 best_score = Stage3middle_goodness(stage3middle_array[0]);
1896 i = 0;
1897
1898 while (i < n && Stage3middle_goodness(stage3middle_array[i]) > best_score - 20) {
1899 stage3middle = stage3middle_array[i];
1900 debug(printf("Processing stage3middle %d with goodness %d\n",i,Stage3middle_goodness(stage3middle)));
1901
1902 chrnum = Stage3middle_chrnum(stage3middle);
1903 chroffset = Stage3middle_chroffset(stage3middle);
1904 chrhigh = Stage3middle_chrhigh(stage3middle);
1905 chrlength = Stage3middle_chrlength(stage3middle);
1906 watsonp = Stage3middle_watsonp(stage3middle);
1907 genestrand = Stage3middle_genestrand(stage3middle);
1908
1909 #ifdef PMAP
1910 subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1911 #endif
1912 pairarray = Stage3_compute_ends(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1913 &matches,&nmatches_posttrim,&max_match_length,
1914 &ambig_end_length_5,&ambig_end_length_3,
1915 &ambig_splicetype_5,&ambig_splicetype_3,
1916 &ambig_prob_5,&ambig_prob_3,
1917 &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1918 &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1919 stage3middle,
1920 #ifdef PMAP
1921 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1922 /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1923 /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1924 /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1925 /*skiplength*/Sequence_skiplength(queryntseq),
1926 /*query_subseq_offset*/subseq_offset,
1927 #else
1928 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1929 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1930 /*querylength*/Sequence_fulllength(queryseq),
1931 /*skiplength*/Sequence_skiplength(queryseq),
1932 /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1933 #endif
1934 /*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1935 maxpeelback,pairpool,dynprogL,dynprogM,dynprogR,
1936 sense_filter,oligoindices_minor,diagpool,cellpool);
1937 /* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1938 if (pairarray == NULL) {
1939 /* Skip */
1940 } else if (matches < min_matches) {
1941 FREE_OUT(pairarray);
1942 } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1943 matches,unknowns,mismatches,
1944 qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1945 chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
1946 /*querylength*/Sequence_fulllength(queryseq),
1947 /*skiplength*/Sequence_skiplength(queryseq),
1948 /*trimlength*/Sequence_trimlength(queryseq),
1949 /*straintype*/0,/*strain*/NULL,altstrain_iit)) != NULL) {
1950 debug(printf("Pushing %p onto stage3list\n",stage3));
1951 stage3list = List_push(stage3list,(void *) stage3);
1952 }
1953 Stage3middle_free(&stage3middle);
1954 i++;
1955 }
1956
1957 while (i < n) {
1958 stage3middle = stage3middle_array[i];
1959 debug(printf("Ignoring stage3middle %d with goodness %d\n",i,Stage3middle_goodness(stage3middle)));
1960 Stage3middle_free(&stage3middle);
1961 i++;
1962 }
1963
1964 FREE(stage3middle_array);
1965 }
1966 }
1967
1968 #ifdef PMAP_OLD
1969 Sequence_free(&genomicuc);
1970 #elif defined(EXTRACT_GENOMICSEG)
1971 Sequence_free(&genomicuc);
1972 #endif
1973
1974 return stage3list;
1975 }
1976
1977
1978 static bool
middle_piece_local_p(int * querystart,int * queryend,Chrpos_T * chrstart,Chrpos_T * chrend,Chrnum_T * chrnum,Univcoord_T * chroffset,Univcoord_T * chrhigh,Chrpos_T * chrlength,bool * plusp,int * genestrand,Stage3_T from,Stage3_T to)1979 middle_piece_local_p (int *querystart, int *queryend,
1980 Chrpos_T *chrstart, Chrpos_T *chrend,
1981 Chrnum_T *chrnum, Univcoord_T *chroffset, Univcoord_T *chrhigh,
1982 Chrpos_T *chrlength, bool *plusp, int *genestrand,
1983 Stage3_T from, Stage3_T to) {
1984
1985 debug2(printf("? middle_piece_local_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
1986 from,Stage3_querystart(from),Stage3_queryend(from),
1987 Stage3_chrstart(from),Stage3_chrend(from),
1988 to,Stage3_querystart(to),Stage3_queryend(to),
1989 Stage3_chrstart(to),Stage3_chrend(to)));
1990
1991 if (Stage3_chimera_right_p(from) == true) {
1992 debug2(printf("false, because from is already part of a chimera on its right\n"));
1993 return false;
1994
1995 } else if (Stage3_chimera_left_p(to) == true) {
1996 debug2(printf("false, because to is already part of a chimera on its left\n"));
1997 return false;
1998
1999 } else if ((*chrnum = Stage3_chrnum(from)) != Stage3_chrnum(to)) {
2000 /* Different chromosomes */
2001 debug2(printf("different chromosomes\n"));
2002 return false;
2003
2004 } else if (Stage3_watsonp(from) != Stage3_watsonp(to)) {
2005 /* Different strands */
2006 debug2(printf("different strands\n"));
2007 return false;
2008
2009 } else if (Stage3_genestrand(from) != Stage3_genestrand(to)) {
2010 /* Different genestrands */
2011 debug2(printf("different genestrands\n"));
2012 return false;
2013
2014 } else if (Stage3_querystart(to) <= Stage3_queryend(from) + CHIMERA_SLOP) {
2015 /* Already joinable */
2016 debug2(printf("wrong query order or already joinable\n"));
2017 return false;
2018
2019 } else if ((*plusp = Stage3_watsonp(from)) == true) {
2020 if (Stage3_chrend(from) < Stage3_chrstart(to) &&
2021 Stage3_chrend(from) + 1000000 > Stage3_chrstart(to)) {
2022 debug2(printf("true, because %u < %u and %u + %u > %u\n",
2023 Stage3_chrend(from),Stage3_chrstart(to),
2024 Stage3_chrend(from),1000000,Stage3_chrstart(to)));
2025 Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),chromosome_iit,
2026 *chrnum,circular_typeint);
2027 *querystart = Stage3_queryend(from);
2028 *queryend = Stage3_querystart(to);
2029 *chrstart = Stage3_chrend(from);
2030 *chrend = Stage3_chrstart(to);
2031 *genestrand = Stage3_genestrand(from);
2032 return true;
2033 } else {
2034 debug2(printf("false, watsonp true, from_end %u, to start %u\n",
2035 Stage3_chrend(from),Stage3_chrstart(to)));
2036 return false;
2037 }
2038
2039 } else {
2040 if (Stage3_chrstart(to) < Stage3_chrend(from) &&
2041 Stage3_chrstart(to) + 1000000 > Stage3_chrend(from)) {
2042 debug2(printf("true, because %u < %u and %u + %u > %u\n",
2043 Stage3_chrstart(to),Stage3_chrend(from),
2044 Stage3_chrstart(to),1000000,Stage3_chrend(from)));
2045 Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),chromosome_iit,
2046 *chrnum,circular_typeint);
2047 *querystart = Stage3_queryend(from);
2048 *queryend = Stage3_querystart(to);
2049 *chrstart = Stage3_chrstart(to);
2050 *chrend = Stage3_chrend(from);
2051 *genestrand = Stage3_genestrand(from);
2052 return true;
2053 } else {
2054 debug2(printf("false, watsonp false, from_end %u, to start %u\n",
2055 Stage3_chrend(from),Stage3_chrstart(to)));
2056 return false;
2057 }
2058 }
2059 }
2060
2061
2062 static bool
middle_piece_chimera_p(int * querystart,int * queryend,Stage3_T from,Stage3_T to)2063 middle_piece_chimera_p (int *querystart, int *queryend, Stage3_T from, Stage3_T to) {
2064
2065 debug2(printf("? middle_piece_chimera_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
2066 from,Stage3_querystart(from),Stage3_queryend(from),
2067 Stage3_chrstart(from),Stage3_chrend(from),
2068 to,Stage3_querystart(to),Stage3_queryend(to),
2069 Stage3_chrstart(to),Stage3_chrend(to)));
2070
2071 if (Stage3_chimera_right_p(from) == true) {
2072 debug2(printf("false, because from is already part of a chimera on its right\n"));
2073 return false;
2074
2075 } else if (Stage3_chimera_left_p(to) == true) {
2076 debug2(printf("false, because to is already part of a chimera on its left\n"));
2077 return false;
2078
2079 } else if (Stage3_querystart(to) <= Stage3_queryend(from) + CHIMERA_SLOP) {
2080 /* Already joinable */
2081 debug2(printf("wrong query order or already joinable\n"));
2082 return false;
2083
2084 } else {
2085 *querystart = Stage3_queryend(from);
2086 *queryend = Stage3_querystart(to);
2087 return true;
2088 }
2089 }
2090
2091
2092 /* Does not alter stage3list. Puts Stage3_T objects into stage3array_sub1, stage3array_sub2, or both */
2093 static void
local_separate_paths(Stage3_T ** stage3array_sub1,int * npaths_sub1,Stage3_T ** stage3array_sub2,int * npaths_sub2,List_T stage3list)2094 local_separate_paths (Stage3_T **stage3array_sub1, int *npaths_sub1,
2095 Stage3_T **stage3array_sub2, int *npaths_sub2,
2096 List_T stage3list) {
2097 List_T p;
2098 Stage3_T from, to, stage3;
2099 Stage3_T *by_queryend, *by_querystart;
2100 Chrnum_T chrnum;
2101 int npaths, i, j, k, kstart, kend;
2102 int queryend;
2103
2104 debug2(printf("local_separate_paths called with list length %d\n",List_length(stage3list)));
2105 #ifdef DEBUG2
2106 for (p = stage3list; p != NULL; p = List_next(p)) {
2107 printf("%p\n",List_head(p));
2108 }
2109 #endif
2110
2111 if (stage3list == NULL) {
2112 *stage3array_sub1 = (Stage3_T *) NULL;
2113 *npaths_sub1 = 0;
2114 *stage3array_sub2 = (Stage3_T *) NULL;
2115 *npaths_sub2 = 0;
2116 return;
2117
2118 } else {
2119 for (p = stage3list; p != NULL; p = List_next(p)) {
2120 stage3 = (Stage3_T) List_head(p);
2121 Stage3_clear_joinable(stage3);
2122 }
2123 }
2124
2125 by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2126 qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_chrnum_queryend_cmp);
2127
2128 by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2129 qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_chrnum_querystart_cmp);
2130
2131 #ifdef DEBUG2
2132 for (i = 0; i < npaths; i++) {
2133 stage3 = (Stage3_T) by_queryend[i];
2134 printf("from: %p query %d..%d, chrnum %d, genomic %u..%u\t",
2135 stage3,Stage3_querystart(stage3),Stage3_queryend(stage3),
2136 Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2137
2138 stage3 = (Stage3_T) by_querystart[i];
2139 printf("to: %p query %d..%d, chrnum %d, genomic %u..%u\n",
2140 stage3,Stage3_querystart(stage3),Stage3_queryend(stage3),
2141 Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2142 }
2143 #endif
2144
2145 kend = 0;
2146 for (i = 0; i < npaths; i++) {
2147 debug2(printf("queryend %d:",i));
2148 from = by_queryend[i];
2149
2150 /* Find matching chromosomal bounds for querystart */
2151 chrnum = Stage3_chrnum(from);
2152 while (kend < npaths && Stage3_chrnum(by_querystart[kend]) == chrnum) {
2153 kend++;
2154 }
2155 kstart = kend - 1;
2156 while (kstart >= 0 && Stage3_chrnum(by_querystart[kstart]) == chrnum) {
2157 kstart--;
2158 }
2159 kstart++;
2160 debug2(printf(" querystart bounded by %d..%d:",kstart,kend));
2161
2162
2163 /* Find matching querystart */
2164 queryend = Stage3_queryend(from);
2165 j = kstart;
2166 while (j < kend && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2167 j++;
2168 }
2169 j--;
2170
2171 while (j >= kstart && Stage3_querystart(by_querystart[j]) > queryend - CHIMERA_SLOP) {
2172 j--;
2173 }
2174 j++;
2175
2176 while (j < kend && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2177 to = by_querystart[j];
2178
2179 debug2(printf(" %d",j));
2180 if (Chimera_local_join_p(from,to,CHIMERA_SLOP) == true) {
2181 debug2(printf("(to %d)",i));
2182 Stage3_set_joinable_left(from);
2183 Stage3_set_joinable_right(to);
2184 }
2185
2186 j++;
2187 }
2188 debug2(printf("\n"));
2189 }
2190
2191 FREE(by_querystart);
2192 FREE(by_queryend);
2193
2194
2195 *npaths_sub1 = *npaths_sub2 = 0;
2196 for (p = stage3list; p != NULL; p = List_next(p)) {
2197 stage3 = (Stage3_T) List_head(p);
2198 debug2(printf("Stage3 %p. joinable_left_p %d, joinable_right_p %d\n",
2199 stage3,Stage3_joinable_left_p(stage3),Stage3_joinable_right_p(stage3)));
2200 if (Stage3_joinable_left_p(stage3) == true) {
2201 debug2(printf("Putting stage3 %p into local sub1\n",stage3));
2202 (*npaths_sub1)++;
2203 }
2204 if (Stage3_joinable_right_p(stage3) == true) {
2205 debug2(printf("Putting stage3 %p into local sub2\n",stage3));
2206 (*npaths_sub2)++;
2207 }
2208 }
2209
2210 if (*npaths_sub1 == 0 || *npaths_sub2 == 0) {
2211 *stage3array_sub1 = (Stage3_T *) NULL;
2212 *npaths_sub1 = 0;
2213 *stage3array_sub2 = (Stage3_T *) NULL;
2214 *npaths_sub2 = 0;
2215
2216 } else {
2217 *stage3array_sub1 = (Stage3_T *) MALLOC((*npaths_sub1) * sizeof(Stage3_T)); /* Return value */
2218 *stage3array_sub2 = (Stage3_T *) MALLOC((*npaths_sub2) * sizeof(Stage3_T)); /* Return value */
2219 j = k = 0;
2220 for (p = stage3list; p != NULL; p = List_next(p)) {
2221 stage3 = (Stage3_T) List_head(p);
2222 /* Note: it is possible that the same stage3 object gets put into both lists */
2223 if (Stage3_joinable_left_p(stage3) == true) {
2224 debug2(printf("Putting %p into sub1\n",stage3));
2225 (*stage3array_sub1)[j++] = stage3;
2226 }
2227 if (Stage3_joinable_right_p(stage3) == true) {
2228 debug2(printf("Putting %p into sub2\n",stage3));
2229 (*stage3array_sub2)[k++] = stage3;
2230 }
2231 }
2232 }
2233
2234 debug2(printf("local_separate_paths returning %d paths\n",List_length(stage3list)));
2235 #ifdef DEBUG2
2236 for (p = stage3list; p != NULL; p = List_next(p)) {
2237 stage3 = (Stage3_T) List_head(p);
2238 printf("%p %p\n",stage3,Stage3_pairs(stage3));
2239 }
2240 #endif
2241
2242 return;
2243 }
2244
2245
2246 /* Does not alter stage3list. Puts Stage3_T objects into stage3array_sub1, stage3array_sub2, or both */
2247 static void
distant_separate_paths(Stage3_T ** stage3array_sub1,int * npaths_sub1,Stage3_T ** stage3array_sub2,int * npaths_sub2,List_T stage3list)2248 distant_separate_paths (Stage3_T **stage3array_sub1, int *npaths_sub1,
2249 Stage3_T **stage3array_sub2, int *npaths_sub2,
2250 List_T stage3list) {
2251 List_T p;
2252 Stage3_T from, to, stage3;
2253 Stage3_T *by_queryend, *by_querystart;
2254 int npaths, i, j, k;
2255 int queryend;
2256
2257 debug2(printf("distant_separate_paths called with list length %d\n",List_length(stage3list)));
2258 #ifdef DEBUG2
2259 for (p = stage3list; p != NULL; p = List_next(p)) {
2260 stage3 = (Stage3_T) List_head(p);
2261 printf("%p %p\n",stage3,Stage3_pairs(stage3));
2262 }
2263 #endif
2264
2265
2266 if (stage3list == NULL) {
2267 *stage3array_sub1 = (Stage3_T *) NULL;
2268 *npaths_sub1 = 0;
2269 *stage3array_sub2 = (Stage3_T *) NULL;
2270 *npaths_sub2 = 0;
2271 return;
2272
2273 } else {
2274 for (p = stage3list; p != NULL; p = List_next(p)) {
2275 stage3 = (Stage3_T) List_head(p);
2276 Stage3_clear_joinable(stage3);
2277 }
2278 }
2279
2280 by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2281 qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
2282
2283 by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2284 qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
2285
2286 j = 0;
2287 for (i = 0; i < npaths; i++) {
2288 from = by_queryend[i];
2289 queryend = Stage3_queryend(from);
2290
2291 while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2292 j++;
2293 }
2294 j--;
2295
2296 while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend - CHIMERA_SLOP) {
2297 j--;
2298 }
2299 j++;
2300
2301 while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2302 to = by_querystart[j];
2303
2304 if (Chimera_distant_join_p(from,to,CHIMERA_SLOP) == true) {
2305 debug2(printf("Found distant join from %d to %d\n",i,j));
2306 Stage3_set_joinable_left(from);
2307 Stage3_set_joinable_right(to);
2308 }
2309
2310 j++;
2311 }
2312 }
2313
2314 FREE(by_querystart);
2315 FREE(by_queryend);
2316
2317
2318 *npaths_sub1 = *npaths_sub2 = 0;
2319 for (p = stage3list; p != NULL; p = List_next(p)) {
2320 stage3 = (Stage3_T) List_head(p);
2321 if (Stage3_joinable_left_p(stage3) == true) {
2322 (*npaths_sub1)++;
2323 }
2324 if (Stage3_joinable_right_p(stage3) == true) {
2325 (*npaths_sub2)++;
2326 }
2327 }
2328
2329 if (*npaths_sub1 == 0 || *npaths_sub2 == 0) {
2330 *stage3array_sub1 = (Stage3_T *) NULL;
2331 *npaths_sub1 = 0;
2332 *stage3array_sub2 = (Stage3_T *) NULL;
2333 *npaths_sub2 = 0;
2334 } else {
2335 *stage3array_sub1 = (Stage3_T *) MALLOC((*npaths_sub1) * sizeof(Stage3_T)); /* Return value */
2336 *stage3array_sub2 = (Stage3_T *) MALLOC((*npaths_sub2) * sizeof(Stage3_T)); /* Return value */
2337 j = k = 0;
2338 for (p = stage3list; p != NULL; p = List_next(p)) {
2339 stage3 = (Stage3_T) List_head(p);
2340 /* Note: it is possible that the same stage3 object gets put into both lists */
2341 if (Stage3_joinable_left_p(stage3) == true) {
2342 debug2(printf("Putting stage3 %p into distant sub1\n",stage3));
2343 (*stage3array_sub1)[j++] = stage3;
2344 }
2345 if (Stage3_joinable_right_p(stage3) == true) {
2346 debug2(printf("Putting stage3 %p into distant sub2\n",stage3));
2347 (*stage3array_sub2)[k++] = stage3;
2348 }
2349 }
2350 }
2351
2352 debug2(printf("distant_separate_paths returning %d paths\n",List_length(stage3list)));
2353 #ifdef DEBUG2
2354 for (p = stage3list; p != NULL; p = List_next(p)) {
2355 stage3 = (Stage3_T) List_head(p);
2356 printf("%p %p\n",stage3,Stage3_pairs(stage3));
2357 }
2358 #endif
2359
2360 return;
2361 }
2362
2363
2364 static List_T
merge_left_and_right_readthrough(bool * mergedp,List_T stage3list,Stage3_T * stage3array_sub1,int bestfrom,Stage3_T * stage3array_sub2,int bestto,int breakpoint,int queryntlength,char * queryaaseq_ptr,Sequence_T queryseq,char * queryseq_ptr,char * queryuc_ptr,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool)2365 merge_left_and_right_readthrough (bool *mergedp, List_T stage3list,
2366 Stage3_T *stage3array_sub1, int bestfrom,
2367 Stage3_T *stage3array_sub2, int bestto,
2368 int breakpoint, int queryntlength,
2369 #ifdef PMAP
2370 char *queryaaseq_ptr,
2371 #endif
2372 Sequence_T queryseq, char *queryseq_ptr, char *queryuc_ptr,
2373 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
2374 Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool) {
2375 Stage3_T stage3, best0, best1;
2376
2377 best0 = stage3array_sub1[bestfrom];
2378 best1 = stage3array_sub2[bestto];
2379
2380 debug2(printf("\nEntering merge_left_and_right_readthrough with bestfrom %d: %p, bestto %d: %p\n",
2381 bestfrom,best0,bestto,best1));
2382 debug2(printf("Running Stage3_merge_local\n"));
2383
2384 if ((stage3 = Stage3_merge_local(best0,best1,/*minpos1*/0,/*maxpos1*/breakpoint,
2385 /*minpos2*/breakpoint+1,/*maxpos2*/queryntlength,queryseq,
2386 #ifdef PMAP
2387 queryaaseq_ptr,
2388 #endif
2389 queryseq_ptr,queryuc_ptr,
2390 oligoindices_minor,diagpool,cellpool,
2391 pairpool,dynprogL,dynprogM,dynprogR,maxpeelback)) == NULL) {
2392 *mergedp = false;
2393
2394 } else {
2395 debug2(printf("Changing genomicend of merged stage3 from %u to %u\n",Stage3_genomicend(stage3),Stage3_genomicend(best1)));
2396 Stage3_set_genomicend(stage3,Stage3_genomicend(best1));
2397 debug(printf("Pushing %p onto newstage3list\n",stage3));
2398 stage3list = List_push(stage3list,(void *) stage3);
2399 debug2(Stage3_print_ends(stage3));
2400 *mergedp = true;
2401 }
2402
2403 return stage3list;
2404 }
2405
2406
#if 0
/* Disabled (compiled out by the enclosing #if 0).

   Builds a new list that starts with stage3array_sub1[bestfrom] and
   stage3array_sub2[bestto], followed by the other members of stage3list
   that still have pairs; members with no pairs are freed.  The cells of
   stage3list are freed and the new list is returned in that order.

   NOTE(review): the temporary array is sorted with Stage3_identity_cmp and
   then released without being read, so the sort has no effect on the
   result.  Also, when best0 or best1 has no pairs it is freed before the
   loop, which then calls Stage3_npairs on every list member -- confirm
   that the freed object cannot still be in stage3list before re-enabling. */
static List_T
merge_left_and_right_transloc (Stage3_T *stage3array_sub1, int npaths_sub1, int bestfrom,
                               Stage3_T *stage3array_sub2, int npaths_sub2, int bestto,
                               List_T stage3list) {
  List_T result = (List_T) NULL, cell;
  Stage3_T best0, best1, path, *others;
  int nothers;

  best0 = stage3array_sub1[bestfrom];
  best1 = stage3array_sub2[bestto];

  debug2(printf("\nEntering merge_left_and_right_transloc with bestfrom %d: %p, bestto %d: %p, and stage3list %d\n",
                bestfrom,best0,bestto,best1,List_length(stage3list)));

  debug2(printf("Before Stage3_merge_chimera, best0 is %p, query %d..%d\n",
                best0,Stage3_querystart(best0),Stage3_queryend(best0)));
  debug2(Stage3_print_ends(best0));
  debug2(printf("Before Stage3_merge_chimera, best1 is %p, query %d..%d\n",
                best1,Stage3_querystart(best1),Stage3_queryend(best1)));
  debug2(Stage3_print_ends(best1));

  debug2(printf("Rearranging paths\n"));

  debug(printf("Pushing %p onto newstage3list\n",best0));
  debug(printf("Pushing %p onto newstage3list\n",best1));

  /* The two chosen parts go first, unless they turned out to be empty */
  if (Stage3_npairs(best0) != 0) {
    result = List_push(result,(void *) best0);
    debug2(Stage3_print_ends(best0));
  } else {
    Stage3_free(&best0);
    best0 = (Stage3_T) NULL;
  }

  if (Stage3_npairs(best1) != 0) {
    result = List_push(result,(void *) best1);
    debug2(Stage3_print_ends(best1));
  } else {
    Stage3_free(&best1);
    best1 = (Stage3_T) NULL;
  }

  if (List_length(stage3list) > 2) {
    /* Push rest of results, taking care not to have duplicates */
    others = (Stage3_T *) MALLOCA((List_length(stage3list) - 2) * sizeof(Stage3_T));
    nothers = 0;
    for (cell = stage3list; cell != NULL; cell = List_next(cell)) {
      path = (Stage3_T) List_head(cell);
      if (Stage3_npairs(path) == 0) {
        Stage3_free(&path);
        continue;
      }
      if (path == best0 || path == best1) {
        /* Already at the front of the new list */
        continue;
      }
      others[nothers++] = path;
      debug(printf("Pushing %p onto newstage3list\n",path));
      result = List_push(result,(void *) path);
    }
    qsort(others,nothers,sizeof(Stage3_T),Stage3_identity_cmp);
    FREEA(others);
  }

  List_free(&stage3list);
  return List_reverse(result);
}
#endif
2474
2475
2476 static int
find_breakpoint(int * cdna_direction,int * chimerapos,int * chimeraequivpos,int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T from,Stage3_T to,Sequence_T queryntseq,Sequence_T queryseq,Sequence_T queryuc,int queryntlength,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,Pairpool_T pairpool)2477 find_breakpoint (int *cdna_direction, int *chimerapos, int *chimeraequivpos, int *exonexonpos,
2478 char *donor1, char *donor2, char *acceptor2, char *acceptor1,
2479 bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
2480 Stage3_T from, Stage3_T to,
2481 #ifdef PMAP
2482 Sequence_T queryntseq,
2483 #endif
2484 Sequence_T queryseq, Sequence_T queryuc,
2485 int queryntlength, Genome_T genome, Genome_T genomealt,
2486 Univ_IIT_T chromosome_iit, Pairpool_T pairpool) {
2487 int breakpoint, rangelow, rangehigh, leftpos, rightpos, midpos;
2488 int maxpeelback_from, maxpeelback_to;
2489 int found_cdna_direction, try_cdna_direction;
2490 char comp; /* Not really used anywhere */
2491
2492 int queryjump;
2493 int genomejump;
2494 bool max_extend_p;
2495 Chrpos_T left_chrlength, right_chrlength;
2496 Univcoord_T chroffset, chrhigh;
2497
2498 if (Stage3_queryend(from) < Stage3_querystart(to)) {
2499 /* Gap exists between the two parts */
2500 if ((leftpos = Stage3_queryend(from) - CHIMERA_EXTEND) < 0) {
2501 leftpos = 0;
2502 }
2503 if ((rightpos = Stage3_querystart(to) + CHIMERA_EXTEND) >= queryntlength) {
2504 rightpos = queryntlength - 1;
2505 }
2506 maxpeelback_from = CHIMERA_EXTEND;
2507 maxpeelback_to = CHIMERA_EXTEND;
2508 debug2(printf("overlap: leftpos %d, rightpos %d, queryntlength %d, maxpeelback_from %d, maxpeelback_to %d\n",
2509 leftpos,rightpos,queryntlength,maxpeelback_from,maxpeelback_to));
2510
2511 if (Stage3_watsonp(from) == true && Stage3_watsonp(to) == true) {
2512 queryjump = Stage3_querystart(to) - Stage3_queryend(from) - 1;
2513 genomejump = Stage3_genomicstart(to) - Stage3_genomicend(from) - 1U;
2514 max_extend_p = ((int) genomejump == queryjump) ? false : true;
2515 debug2(printf("gap exists: genomejump = %u, queryjump = %d, max_extend_p = %d\n",genomejump,queryjump,max_extend_p));
2516 } else if (Stage3_watsonp(from) == false && Stage3_watsonp(to) == false) {
2517 queryjump = Stage3_querystart(to) - Stage3_queryend(from) - 1;
2518 genomejump = Stage3_genomicend(from) - Stage3_genomicstart(to) - 1U;
2519 max_extend_p = ((int) genomejump == queryjump) ? false : true;
2520 debug2(printf("gap exists: genomejump = %u, queryjump = %d, max_extend_p = %d\n",genomejump,queryjump,max_extend_p));
2521 } else {
2522 max_extend_p = false;
2523 }
2524
2525 } else {
2526 /* Two parts overlap */
2527 if ((leftpos = Stage3_querystart(to) - CHIMERA_EXTEND) < 0) {
2528 leftpos = 0;
2529 }
2530 if ((rightpos = Stage3_queryend(from) + CHIMERA_EXTEND) >= queryntlength) {
2531 rightpos = queryntlength - 1;
2532 }
2533 midpos = (leftpos+rightpos)/2;
2534 /* maxpeelback_from = rightpos - Stage3_querystart(to); */
2535 /* maxpeelback_to = Stage3_queryend(from) - leftpos; */
2536 maxpeelback_from = rightpos - midpos;
2537 maxpeelback_to = midpos - leftpos;
2538 debug2(printf("overlap: leftpos %d, rightpos %d, midpos %d, queryntlength %d, maxpeelback_from %d, maxpeelback_to %d\n",
2539 leftpos,rightpos,midpos,queryntlength,maxpeelback_from,maxpeelback_to));
2540 #if 0
2541 if (Stage3_watsonp(from) == true && Stage3_watsonp(to) == true) {
2542 queryjump = Stage3_queryend(from) - Stage3_querystart(to) - 1;
2543 genomejump = Stage3_genomicend(from) - Stage3_genomicstart(to) - 1U;
2544 max_extend_p = (genomejump == queryjump) ? false : true;
2545 } else if (Stage3_watsonp(from) == false && Stage3_watsonp(to) == false) {
2546 queryjump = Stage3_queryend(from) - Stage3_querystart(to) - 1;
2547 genomejump = Stage3_genomicstart(to) - Stage3_genomicend(from) - 1U;
2548 max_extend_p = (genomejump == queryjump) ? false : true;
2549 } else {
2550 max_extend_p = false;
2551 }
2552 #else
2553 debug2(printf("parts overlap: max_extend_p is false\n"));
2554 max_extend_p = false;
2555 #endif
2556 }
2557
2558 debug2(printf("Before Stage3_extend_right, bestfrom is %p, query %d..%d, rightpos %d, pairs %p\n",
2559 from,Stage3_querystart(from),Stage3_queryend(from),rightpos,Stage3_pairs(from)));
2560 debug2(Stage3_print_ends(from));
2561 debug2(printf("Before Stage3_extend_left, bestto is %p, query %d..%d, leftpos %d, pairs %p\n",
2562 to,Stage3_querystart(to),Stage3_queryend(to),leftpos,Stage3_pairs(to)));
2563 debug2(Stage3_print_ends(to));
2564
2565 Stage3_extend_right(from,/*goal*/rightpos,
2566 #ifdef PMAP
2567 /*querylength*/Sequence_fulllength(queryntseq),
2568 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
2569 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
2570 #else
2571 /*querylength*/Sequence_fulllength(queryseq),
2572 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2573 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
2574 #endif
2575 max_extend_p,pairpool,Stage3_genestrand(from),maxpeelback_from);
2576
2577 Stage3_extend_left(to,/*goal*/leftpos,
2578 #ifdef PMAP
2579 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
2580 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
2581 #else
2582 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2583 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
2584 #endif
2585 max_extend_p,pairpool,Stage3_genestrand(to),maxpeelback_to);
2586
2587 debug2(printf("Before Chimera_find_breakpoint, bestfrom is %p, query %d..%d, pairs %p\n",
2588 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
2589 debug2(Stage3_print_ends(from));
2590 debug2(printf("Before Chimera_find_breakpoint, bestto is %p, query %d..%d, pairs %p\n",
2591 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
2592 debug2(Stage3_print_ends(to));
2593
2594 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&left_chrlength,chromosome_iit,Stage3_chrnum(from),circular_typeint);
2595 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&right_chrlength,chromosome_iit,Stage3_chrnum(to),circular_typeint);
2596
2597 if ((*chimerapos = Chimera_find_breakpoint(&(*chimeraequivpos),&rangelow,&rangehigh,
2598 &(*donor1),&(*donor2),&(*acceptor2),&(*acceptor1),
2599 from,to,queryntlength,genome,left_chrlength,right_chrlength)) < 0) {
2600 /* TODO: Allow finding a breakpoint for DNA-Seq, which needs no donor or acceptor nucleotides */
2601 debug2(printf("Chimera_find_breakpoint returns no value\n"));
2602 *donor_prob = *acceptor_prob = 0.0;
2603 *donor_watsonp = *acceptor_watsonp = true;
2604 *cdna_direction = 0;
2605 return -1;
2606
2607 } else {
2608 debug2(printf("Chimera_find_breakpoint has chimerapos %d..%d\n",*chimerapos,*chimeraequivpos));
2609
2610 Stage3_trim_right(from,/*goal*/rangehigh,
2611 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2612 pairpool);
2613
2614 Stage3_trim_left(to,/*goal*/rangelow,
2615 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2616 pairpool);
2617
2618 debug2(printf("Before Chimera_find_exonexon, bestfrom is %p, query %d..%d, pairs %p\n",
2619 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
2620 debug2(printf("Before Chimera_find_exonexon, bestto is %p, query %d..%d, pairs %p\n",
2621 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
2622
2623 if ((*exonexonpos = Chimera_find_exonexon(&found_cdna_direction,&try_cdna_direction,
2624 &(*donor1),&(*donor2),&(*acceptor2),&(*acceptor1),
2625 &comp,&(*donor_watsonp),&(*acceptor_watsonp),&(*donor_prob),&(*acceptor_prob),
2626 /*left_part*/from,/*right_part*/to,genome,genomealt ? genomealt : genome,
2627 chromosome_iit,/*breakpoint_start*/Stage3_querystart(to),
2628 /*breakpoint_end*/Stage3_queryend(from))) <= 0) {
2629 /* Couldn't find a good exon-exon junction, so rely on sequence */
2630 *donor_prob = *acceptor_prob = 0.0;
2631 *donor_watsonp = *acceptor_watsonp = true;
2632
2633 debug2(printf("Chimera_find_breakpoint returns boundary at %d..%d (switch can occur at %d..%d)\n",
2634 *chimerapos,*chimeraequivpos,(*chimerapos)-1,*chimeraequivpos));
2635
2636 breakpoint = ((*chimerapos) + (*chimeraequivpos))/2;
2637 *cdna_direction = try_cdna_direction;
2638 debug2(printf("Exon-exon boundary not found, but setting breakpoint to be %d\n",breakpoint));
2639 return breakpoint;
2640
2641 } else {
2642 /* Use the exon-exon solution */
2643 breakpoint = *chimerapos = *chimeraequivpos = *exonexonpos;
2644 *cdna_direction = found_cdna_direction;
2645 debug2(printf("Exon-exon boundary found at %d, which is breakpoint. Comp = %c\n",
2646 *exonexonpos,comp));
2647 return breakpoint;
2648 }
2649 }
2650 }
2651
2652
2653 /* Can potentially include a larger stage3list */
2654 static List_T
check_for_local(bool * mergedp,List_T stage3list,int effective_start,int effective_end,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)2655 check_for_local (bool *mergedp, List_T stage3list, int effective_start, int effective_end,
2656 Sequence_T queryseq, Sequence_T queryuc,
2657 #ifdef PMAP
2658 Sequence_T queryntseq,
2659 #endif
2660 int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
2661 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
2662 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
2663 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
2664 List_T gregions = NULL, p;
2665 Stage3_T *stage3array_sub1 = NULL, *stage3array_sub2 = NULL, from, to, stage3;
2666 Sequence_T querysubseq = NULL, querysubuc = NULL;
2667 Diagnostic_T diagnostic;
2668 int bestfrom, bestto;
2669 int five_margin, three_margin, five_score = 0, three_score = 0;
2670 int extension;
2671 int npaths_sub1 = 0, npaths_sub2 = 0;
2672 bool lowidentityp, poorp, repetitivep;
2673
2674 int max_single_goodness;
2675 int breakpoint, chimerapos, chimeraequivpos, exonexonpos;
2676 int chimera_cdna_direction;
2677 char donor1, donor2, acceptor2, acceptor1;
2678 bool donor_watsonp, acceptor_watsonp;
2679 double donor_prob, acceptor_prob;
2680
2681 int kstart1, kstart2, kend1, kend2;
2682 Chrnum_T chrnum;
2683 #ifdef DEBUG2
2684 int k;
2685 #endif
2686
2687
2688 #ifdef PMAP
2689 five_margin = effective_start - 3*Sequence_trim_start(queryseq);
2690 three_margin = 3*Sequence_trim_end(queryseq) - effective_end;
2691 debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
2692 five_margin,effective_start,3*Sequence_trim_start(queryseq),
2693 three_margin,3*Sequence_trim_end(queryseq),effective_end));
2694 #else
2695 five_margin = effective_start - Sequence_trim_start(queryseq);
2696 three_margin = Sequence_trim_end(queryseq) - effective_end;
2697 debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
2698 five_margin,effective_start,Sequence_trim_start(queryseq),
2699 three_margin,Sequence_trim_end(queryseq),effective_end));
2700 #endif
2701
2702 #ifdef DEBUG2A
2703 for (p = stage3list; p != NULL; p = List_next(p)) {
2704 stage3 = (Stage3_T) List_head(p);
2705 Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
2706 printf("\n");
2707 }
2708 #endif
2709
2710 /* Stage3_recompute_goodness(stage3list); */
2711 max_single_goodness = 0;
2712 for (p = stage3list; p != NULL; p = List_next(p)) {
2713 stage3 = (Stage3_T) List_head(p);
2714 if (Stage3_goodness(stage3) > max_single_goodness) {
2715 max_single_goodness = Stage3_goodness(stage3);
2716 }
2717 }
2718 debug2(printf("max single goodness = %d\n",max_single_goodness));
2719
2720
2721 debug2(printf("Running local_separate_paths\n"));
2722 local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2723 stage3list);
2724 debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2725 npaths_sub1,npaths_sub2,List_length(stage3list)));
2726
2727 if (npaths_sub1 == 0 && npaths_sub2 == 0) {
2728 /* Need to compute on margin explicitly */
2729 if (five_margin < chimera_margin && three_margin < chimera_margin) {
2730 debug2(printf("Insufficient margins\n"));
2731 } else if (five_margin > three_margin) {
2732 #if 0
2733 /* extension makes it harder to find the other alignment. The merging process will help fill in any gap. */
2734 extension = CHIMERA_SLOP;
2735 debug2(printf("Comparing extension %d with %d = (effective_start %d)/2\n",
2736 extension,effective_start/2,effective_start));
2737 if (extension > effective_start/2) {
2738 /* Extension occupies more than 1/3 of sequence */
2739 debug2(printf("Proposed extension of %d is too long relative to effective_start %d\n",extension,effective_start));
2740 extension = effective_start/3;
2741 }
2742 #else
2743 extension = 0;
2744 #endif
2745 if ((querysubseq = Sequence_subsequence(queryseq,0,effective_start+extension)) != NULL) {
2746 if ((querysubuc = Sequence_subsequence(queryuc,0,effective_start+extension)) != NULL) {
2747 debug2(printf("5 margin > 3 margin. "));
2748 debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
2749 effective_start,0,effective_start+extension));
2750 debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2751
2752 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2753 Oligoindex_array_elt(oligoindices_major,0));
2754 if (poorp == true || repetitivep == true) {
2755 debug2(printf("Subsequence is poor or repetitive\n"));
2756 } else {
2757 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2758 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2759 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2760 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2761 } else {
2762 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2763 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2764 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2765 }
2766 debug2(printf("A. Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
2767 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2768 #ifdef PMAP
2769 queryntseq,
2770 #endif
2771 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2772 pairpool,diagpool,cellpool,
2773 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2774 #ifdef DEBUG2
2775 for (p = stage3list; p != NULL; p = List_next(p)) {
2776 stage3 = (Stage3_T) List_head(p);
2777 printf("%d..%d, %u..%u\n",
2778 Stage3_querystart(stage3),Stage3_queryend(stage3),
2779 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2780 }
2781 #endif
2782 }
2783 Diagnostic_free(&diagnostic);
2784
2785 /* Above function frees gregions */
2786 Sequence_free(&querysubuc);
2787 }
2788 Sequence_free(&querysubseq);
2789 }
2790
2791 /* And recompute on original part, just in case stage 1 was led astray by the ends */
2792 if ((querysubseq = Sequence_subsequence(queryseq,effective_start,queryntlength)) != NULL) {
2793 if ((querysubuc = Sequence_subsequence(queryuc,effective_start,queryntlength)) != NULL) {
2794 debug2(printf("Recomputing on original part. "));
2795 debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
2796 effective_start,effective_start,queryntlength));
2797 debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2798
2799 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2800 Oligoindex_array_elt(oligoindices_major,0));
2801 if (poorp == true || repetitivep == true) {
2802 debug2(printf("Subsequence is poor or repetitive\n"));
2803 } else {
2804 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2805 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2806 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2807 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2808 } else {
2809 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2810 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2811 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2812 }
2813 debug2(printf("B. Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
2814 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2815 #ifdef PMAP
2816 queryntseq,
2817 #endif
2818 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2819 pairpool,diagpool,cellpool,
2820 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2821 #ifdef DEBUG2
2822 for (p = stage3list; p != NULL; p = List_next(p)) {
2823 stage3 = (Stage3_T) List_head(p);
2824 printf("%d..%d, %u..%u\n",
2825 Stage3_querystart(stage3),Stage3_queryend(stage3),
2826 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2827 }
2828 #endif
2829 }
2830 Diagnostic_free(&diagnostic);
2831
2832 /* Above function frees gregions */
2833 Sequence_free(&querysubuc);
2834 }
2835 Sequence_free(&querysubseq);
2836 }
2837
2838 debug2(printf("Running local_separate_paths\n"));
2839 local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2840 stage3list);
2841 debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2842 npaths_sub1,npaths_sub2,List_length(stage3list)));
2843
2844 } else {
2845 #if 0
2846 /* extension makes it harder to find the other alignment. The merging process will help fill in any gap. */
2847 extension = CHIMERA_SLOP;
2848 debug2(printf("Comparing extension %d with %d = (queryntlength %d - effective_end %d)/2\n",
2849 extension,(queryntlength-effective_end)/2,queryntlength,effective_end));
2850 if (extension > (queryntlength - effective_end)/2) {
2851 /* Extension occupies more than 1/3 of sequence */
2852 debug2(printf("Proposed extension of %d is too long relative to queryntlength %d and effective_end %d\n",
2853 extension,queryntlength,effective_end));
2854 extension = (queryntlength - effective_end)/3;
2855 }
2856 #else
2857 extension = 0;
2858 #endif
2859 if ((querysubseq = Sequence_subsequence(queryseq,effective_end-extension,queryntlength)) != NULL) {
2860 if ((querysubuc = Sequence_subsequence(queryuc,effective_end-extension,queryntlength)) != NULL) {
2861 debug2(printf("5 margin <= 3 margin. "));
2862 debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d) (extension %d)\n",
2863 effective_end,effective_end-extension,queryntlength,extension));
2864 debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2865
2866 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2867 Oligoindex_array_elt(oligoindices_major,0));
2868 if (poorp == true || repetitivep == true) {
2869 debug2(printf("Subsequence is poor or repetitive\n"));
2870 } else {
2871 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2872 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2873 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2874 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2875 } else {
2876 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2877 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2878 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2879 }
2880 debug2(printf("C. Performing Stage 3 with list length %d\n",List_length(stage3list)));
2881 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2882 #ifdef PMAP
2883 queryntseq,
2884 #endif
2885 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2886 pairpool,diagpool,cellpool,
2887 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2888 #ifdef DEBUG2
2889 for (p = stage3list; p != NULL; p = List_next(p)) {
2890 stage3 = (Stage3_T) List_head(p);
2891 printf("%d..%d, %u..%u\n",
2892 Stage3_querystart(stage3),Stage3_queryend(stage3),
2893 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2894 }
2895 #endif
2896 }
2897 Diagnostic_free(&diagnostic);
2898
2899 /* Above function frees gregions */
2900 Sequence_free(&querysubuc);
2901 }
2902 Sequence_free(&querysubseq);
2903 }
2904
2905 /* And recompute on original part, just in case stage 1 was led astray by the ends */
2906 if ((querysubseq = Sequence_subsequence(queryseq,0,effective_end)) != NULL) {
2907 if ((querysubuc = Sequence_subsequence(queryuc,0,effective_end)) != NULL) {
2908 debug2(printf("Recomputing on original part. "));
2909 debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d), extension %d\n",
2910 effective_end,0,effective_end,extension));
2911 debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2912
2913 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2914 Oligoindex_array_elt(oligoindices_major,0));
2915 if (poorp == true || repetitivep == true) {
2916 debug2(printf("Subsequence is poor or repetitive\n"));
2917 } else {
2918 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2919 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2920 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2921 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2922 } else {
2923 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2924 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2925 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2926 }
2927 debug2(printf("D. Performing Stage 3 with list length %d\n",List_length(stage3list)));
2928 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2929 #ifdef PMAP
2930 queryntseq,
2931 #endif
2932 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2933 pairpool,diagpool,cellpool,
2934 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2935 #ifdef DEBUG2
2936 for (p = stage3list; p != NULL; p = List_next(p)) {
2937 stage3 = (Stage3_T) List_head(p);
2938 printf("%d..%d, %u..%u\n",
2939 Stage3_querystart(stage3),Stage3_queryend(stage3),
2940 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2941 }
2942 #endif
2943 }
2944 Diagnostic_free(&diagnostic);
2945
2946 /* Above function frees gregions */
2947 Sequence_free(&querysubuc);
2948
2949 }
2950 Sequence_free(&querysubseq);
2951 }
2952
2953 debug2(printf("Running local_separate_paths\n"));
2954 local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2955 stage3list);
2956 debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2957 npaths_sub1,npaths_sub2,List_length(stage3list)));
2958 }
2959 }
2960
2961 *mergedp = false;
2962 if (npaths_sub1 == 0 && npaths_sub2 == 0) {
2963 /* Skip */
2964
2965 } else if (npaths_sub1 == 0) {
2966 /* Skip */
2967 FREE(stage3array_sub2);
2968
2969 } else if (npaths_sub2 == 0) {
2970 /* Skip */
2971 FREE(stage3array_sub1);
2972
2973 } else {
2974 /* Iterate for each chromosome */
2975 qsort(stage3array_sub1,npaths_sub1,sizeof(Stage3_T),Stage3_chrnum_cmp);
2976 qsort(stage3array_sub2,npaths_sub2,sizeof(Stage3_T),Stage3_chrnum_cmp);
2977
2978
2979 kend1 = kend2 = 0;
2980 *mergedp = false;
2981 /* List_free(&stage3list); */
2982
2983 while (kend1 < npaths_sub1 && kend2 < npaths_sub2) {
2984 kstart1 = kend1;
2985 kstart2 = kend2;
2986 chrnum = Stage3_chrnum(stage3array_sub1[kstart1]);
2987 while (kend1 < npaths_sub1 && Stage3_chrnum(stage3array_sub1[kend1]) == chrnum) {
2988 kend1++;
2989 }
2990 while (kend2 < npaths_sub2 && Stage3_chrnum(stage3array_sub2[kend2]) == chrnum) {
2991 kend2++;
2992 }
2993
2994 #ifdef DEBUG2
2995 printf("Chimera_bestpath left\n");
2996 for (k = kstart1; k < kend1; k++) {
2997 stage3 = stage3array_sub1[k];
2998 printf("%d..%d, %d:%u..%u\n",
2999 Stage3_querystart(stage3),Stage3_queryend(stage3),
3000 Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3001 }
3002 printf("Chimera_bestpath right\n");
3003 for (k = kstart2; k < kend2; k++) {
3004 stage3 = stage3array_sub2[k];
3005 printf("%d..%d, %d:%u..%u\n",
3006 Stage3_querystart(stage3),Stage3_queryend(stage3),
3007 Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3008 }
3009 #endif
3010
3011 if (Chimera_bestpath(&five_score,&three_score,&chimerapos,&chimeraequivpos,&bestfrom,&bestto,
3012 &(stage3array_sub1[kstart1]),/*npaths1*/kend1-kstart1,
3013 &(stage3array_sub2[kstart2]),/*npaths2*/kend2-kstart2,
3014 queryntlength,CHIMERA_SLOP,/*circularp*/NULL,/*localp*/true) == false) {
3015 /* Skip */
3016 debug2(printf("Chimera_bestpath returns false\n"));
3017
3018 } else {
3019 from = stage3array_sub1[kstart1 + bestfrom];
3020 to = stage3array_sub2[kstart2 + bestto];
3021 debug2(printf("Chimera_bestpath returns bestfrom %d (%d..%d, %u..%u) to bestto %d (%d..%d, %u..%u)\n",
3022 bestfrom,Stage3_querystart(from),Stage3_queryend(from),Stage3_genomicstart(from),Stage3_genomicend(from),
3023 bestto,Stage3_querystart(to),Stage3_queryend(to),Stage3_genomicstart(to),Stage3_genomicend(to)));
3024
3025 if ((breakpoint = find_breakpoint(&chimera_cdna_direction,&chimerapos,&chimeraequivpos,&exonexonpos,
3026 &donor1,&donor2,&acceptor2,&acceptor1,
3027 &donor_watsonp,&acceptor_watsonp,&donor_prob,&acceptor_prob,from,to,
3028 #ifdef PMAP
3029 queryntseq,
3030 #endif
3031 queryseq,queryuc,queryntlength,
3032 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3033 debug2(printf("Cannot find breakpoint\n"));
3034
3035 } else {
3036 debug2(printf("find_breakpoint returns %d\n",breakpoint));
3037
3038 /* Check to see if we can merge chimeric parts */
3039 debug2(printf("Before Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3040 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3041 debug2(printf("Before Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3042 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3043
3044 if (Stage3_mergeable(from,to,breakpoint,queryntlength) == true) {
3045 debug2(printf("Mergeable! -- Merging left and right as a readthrough\n"));
3046 stage3list = merge_left_and_right_readthrough(&(*mergedp),stage3list,
3047 &(stage3array_sub1[kstart1]),/*npaths1:kend1-kstart1,*/bestfrom,
3048 &(stage3array_sub2[kstart2]),/*npaths2:kend2-kstart2,*/bestto,
3049 breakpoint,queryntlength,queryseq,
3050 #ifdef PMAP
3051 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3052 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3053 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3054 #else
3055 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3056 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3057 #endif
3058 pairpool,dynprogL,dynprogM,dynprogR,
3059 oligoindices_minor,diagpool,cellpool);
3060
3061 debug2(printf("After merge_left_and_right_readthrough, bestfrom is %p, query %d..%d, pairs %p\n",
3062 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3063 debug2(printf("After merge_left_and_right_readthrough, bestto is %p, query %d..%d, pairs %p\n",
3064 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3065 }
3066 }
3067 }
3068 }
3069
3070 FREE(stage3array_sub2);
3071 FREE(stage3array_sub1);
3072
3073 /* stage3list = List_reverse(stage3list); */
3074 }
3075
3076 debug2(printf("check_for_local returning list of length %d\n",List_length(stage3list)));
3077 #ifdef DEBUG2
3078 for (p = stage3list; p != NULL; p = List_next(p)) {
3079 stage3 = (Stage3_T) List_head(p);
3080 printf("%p %p\n",stage3,Stage3_pairs(stage3));
3081 }
3082 #endif
3083
3084 /* stage3list = stage3list_remove_empties(stage3list); */
3085
3086 #if 0
3087 /* Should be handled by apply_stage3 loop */
3088 /* Needed after calls to stage3_from_gregions */
3089 Stage3_recompute_goodness(stage3list);
3090 stage3list = stage3list_remove_duplicates(stage3list);
3091 #endif
3092
3093 return stage3list;
3094 }
3095
3096
3097 static List_T
check_for_chimera(bool * mergedp,Chimera_T * chimera,List_T stage3list,int effective_start,int effective_end,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3098 check_for_chimera (bool *mergedp, Chimera_T *chimera, List_T stage3list, int effective_start, int effective_end,
3099 Sequence_T queryseq, Sequence_T queryuc,
3100 #ifdef PMAP
3101 Sequence_T queryntseq,
3102 #endif
3103 int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
3104 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3105 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3106 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3107 List_T gregions = NULL, p;
3108 Stage3_T new_left, new_right;
3109 Stage3_T *stage3array_sub1 = NULL, *stage3array_sub2 = NULL, from, to, stage3;
3110 Sequence_T querysubseq = NULL, querysubuc = NULL;
3111 Diagnostic_T diagnostic;
3112 int bestfrom, bestto;
3113 int five_margin, three_margin, five_score = 0, three_score = 0;
3114 int extension;
3115 int npaths_sub1 = 0, npaths_sub2 = 0;
3116 bool lowidentityp, poorp, repetitivep;
3117
3118 int max_single_goodness, chimeric_goodness, penalty, matches0, matches1;
3119 int breakpoint, chimerapos, chimeraequivpos, exonexonpos;
3120 int chimera_cdna_direction;
3121 char donor1, donor2, acceptor2, acceptor1;
3122 bool donor_watsonp, acceptor_watsonp;
3123 double donor_prob, acceptor_prob;
3124
3125
3126 debug2(printf("check_for_chimera called with %d paths\n",List_length(stage3list)));
3127 #ifdef DEBUG2
3128 for (p = stage3list; p != NULL; p = List_next(p)) {
3129 stage3 = (Stage3_T) List_head(p);
3130 printf("%p %p\n",stage3,Stage3_pairs(stage3));
3131 }
3132 #endif
3133
3134
3135
3136 #ifdef PMAP
3137 five_margin = effective_start - 3*Sequence_trim_start(queryseq);
3138 three_margin = 3*Sequence_trim_end(queryseq) - effective_end;
3139 debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
3140 five_margin,effective_start,3*Sequence_trim_start(queryseq),
3141 three_margin,3*Sequence_trim_end(queryseq),effective_end));
3142 #else
3143 five_margin = effective_start - Sequence_trim_start(queryseq);
3144 three_margin = Sequence_trim_end(queryseq) - effective_end;
3145 debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
3146 five_margin,effective_start,Sequence_trim_start(queryseq),
3147 three_margin,Sequence_trim_end(queryseq),effective_end));
3148 #endif
3149
3150 #ifdef DEBUG2A
3151 for (p = stage3list; p != NULL; p = List_next(p)) {
3152 stage3 = (Stage3_T) List_head(p);
3153 Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3154 printf("\n");
3155 }
3156 #endif
3157
3158 /* Stage3_recompute_goodness(stage3list); */
3159 max_single_goodness = 0;
3160 for (p = stage3list; p != NULL; p = List_next(p)) {
3161 stage3 = (Stage3_T) List_head(p);
3162 if (Stage3_goodness(stage3) > max_single_goodness) {
3163 max_single_goodness = Stage3_goodness(stage3);
3164 }
3165 }
3166 debug2(printf("max single goodness = %d\n",max_single_goodness));
3167
3168
3169 debug2(printf("Running distant_separate_paths\n"));
3170 distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3171 stage3list);
3172 debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3173 npaths_sub1,npaths_sub2,List_length(stage3list)));
3174
3175 if (npaths_sub1 == 0 && npaths_sub2 == 0) {
3176 /* Need to compute on margin explicitly */
3177 if (five_margin < chimera_margin && three_margin < chimera_margin) {
3178 debug2(printf("Insufficient margins\n"));
3179 } else if (five_margin > three_margin) {
3180 extension = CHIMERA_SLOP;
3181 debug2(printf("Comparing extension %d with %d = (effective_start %d)/2\n",
3182 extension,effective_start/2,effective_start));
3183 if (extension > effective_start/2) {
3184 /* Extension occupies more than 1/3 of sequence */
3185 debug2(printf("Proposed extension of %d is too long relative to effective_start %d\n",extension,effective_start));
3186 extension = effective_start/3;
3187 }
3188 if ((querysubseq = Sequence_subsequence(queryseq,0,effective_start+extension)) != NULL) {
3189 if ((querysubuc = Sequence_subsequence(queryuc,0,effective_start+extension)) != NULL) {
3190 debug2(printf("5 margin > 3 margin. "));
3191 debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
3192 effective_start,0,effective_start+extension));
3193 debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3194
3195 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3196 Oligoindex_array_elt(oligoindices_major,0));
3197 if (poorp == true || repetitivep == true) {
3198 debug2(printf("Subsequence is poor or repetitive\n"));
3199 } else {
3200 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3201 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3202 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3203 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3204 } else {
3205 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3206 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3207 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3208 }
3209 debug2(printf("A. Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3210 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3211 #ifdef PMAP
3212 queryntseq,
3213 #endif
3214 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3215 pairpool,diagpool,cellpool,
3216 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3217 #ifdef DEBUG2
3218 for (p = stage3list; p != NULL; p = List_next(p)) {
3219 stage3 = (Stage3_T) List_head(p);
3220 printf("%d..%d, %u..%u\n",
3221 Stage3_querystart(stage3),Stage3_queryend(stage3),
3222 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3223 }
3224 #endif
3225 }
3226 Diagnostic_free(&diagnostic);
3227
3228 /* Above function frees gregions */
3229 Sequence_free(&querysubuc);
3230 }
3231 Sequence_free(&querysubseq);
3232 }
3233
3234 /* And recompute on original part, just in case stage 1 was led astray by the ends */
3235 if ((querysubseq = Sequence_subsequence(queryseq,effective_start,queryntlength)) != NULL) {
3236 if ((querysubuc = Sequence_subsequence(queryuc,effective_start,queryntlength)) != NULL) {
3237 debug2(printf("Recomputing on original part. "));
3238 debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
3239 effective_start,effective_start,queryntlength));
3240 debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3241
3242 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3243 Oligoindex_array_elt(oligoindices_major,0));
3244 if (poorp == true || repetitivep == true) {
3245 debug2(printf("Subsequence is poor or repetitive\n"));
3246 } else {
3247 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3248 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3249 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3250 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3251 } else {
3252 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3253 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3254 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3255 }
3256 debug2(printf("B. Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3257 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3258 #ifdef PMAP
3259 queryntseq,
3260 #endif
3261 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3262 pairpool,diagpool,cellpool,
3263 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3264 #ifdef DEBUG2
3265 for (p = stage3list; p != NULL; p = List_next(p)) {
3266 stage3 = (Stage3_T) List_head(p);
3267 printf("%d..%d, %u..%u\n",
3268 Stage3_querystart(stage3),Stage3_queryend(stage3),
3269 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3270 }
3271 #endif
3272 }
3273 Diagnostic_free(&diagnostic);
3274
3275 /* Above function frees gregions */
3276 Sequence_free(&querysubuc);
3277 }
3278 Sequence_free(&querysubseq);
3279 }
3280
3281 debug2(printf("Running distant_separate_paths\n"));
3282 distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3283 stage3list);
3284 debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3285 npaths_sub1,npaths_sub2,List_length(stage3list)));
3286
3287 } else {
3288 extension = CHIMERA_SLOP;
3289 debug2(printf("Comparing extension %d with %d = (queryntlength %d - effective_end %d)/2\n",
3290 extension,(queryntlength-effective_end)/2,queryntlength,effective_end));
3291 if (extension > (queryntlength - effective_end)/2) {
3292 /* Extension occupies more than 1/3 of sequence */
3293 debug2(printf("Proposed extension of %d is too long relative to queryntlength %d and effective_end %d\n",
3294 extension,queryntlength,effective_end));
3295 extension = (queryntlength - effective_end)/3;
3296 }
3297 if ((querysubseq = Sequence_subsequence(queryseq,effective_end-extension,queryntlength)) != NULL) {
3298 if ((querysubuc = Sequence_subsequence(queryuc,effective_end-extension,queryntlength)) != NULL) {
3299 debug2(printf("5 margin <= 3 margin. "));
3300 debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
3301 effective_end,effective_end-extension,queryntlength));
3302 debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3303
3304 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3305 Oligoindex_array_elt(oligoindices_major,0));
3306 if (poorp == true || repetitivep == true) {
3307 debug2(printf("Subsequence is poor or repetitive\n"));
3308 } else {
3309 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3310 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3311 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3312 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3313 } else {
3314 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3315 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3316 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3317 }
3318 debug2(printf("C. Performing Stage 3 with list length %d\n",List_length(stage3list)));
3319 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3320 #ifdef PMAP
3321 queryntseq,
3322 #endif
3323 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3324 pairpool,diagpool,cellpool,
3325 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3326 #ifdef DEBUG2
3327 for (p = stage3list; p != NULL; p = List_next(p)) {
3328 stage3 = (Stage3_T) List_head(p);
3329 printf("%d..%d, %u..%u\n",
3330 Stage3_querystart(stage3),Stage3_queryend(stage3),
3331 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3332 }
3333 #endif
3334 }
3335 Diagnostic_free(&diagnostic);
3336
3337 /* Above function frees gregions */
3338 Sequence_free(&querysubuc);
3339 }
3340 Sequence_free(&querysubseq);
3341 }
3342
3343 /* And recompute on original part, just in case stage 1 was led astray by the ends */
3344 if ((querysubseq = Sequence_subsequence(queryseq,0,effective_end)) != NULL) {
3345 if ((querysubuc = Sequence_subsequence(queryuc,0,effective_end)) != NULL) {
3346 debug2(printf("Recomputing on original part. "));
3347 debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
3348 effective_end,0,effective_end));
3349 debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3350
3351 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3352 Oligoindex_array_elt(oligoindices_major,0));
3353 if (poorp == true || repetitivep == true) {
3354 debug2(printf("Subsequence is poor or repetitive\n"));
3355 } else {
3356 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3357 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3358 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3359 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3360 } else {
3361 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3362 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3363 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3364 }
3365 debug2(printf("D. Performing Stage 3 with list length %d\n",List_length(stage3list)));
3366 stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3367 #ifdef PMAP
3368 queryntseq,
3369 #endif
3370 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3371 pairpool,diagpool,cellpool,
3372 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3373 #ifdef DEBUG2
3374 for (p = stage3list; p != NULL; p = List_next(p)) {
3375 stage3 = (Stage3_T) List_head(p);
3376 printf("%d..%d, %u..%u\n",
3377 Stage3_querystart(stage3),Stage3_queryend(stage3),
3378 Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3379 }
3380 #endif
3381 }
3382 Diagnostic_free(&diagnostic);
3383
3384 /* Above function frees gregions */
3385 Sequence_free(&querysubuc);
3386
3387 }
3388 Sequence_free(&querysubseq);
3389 }
3390
3391 debug2(printf("Running distant_separate_paths\n"));
3392 distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3393 stage3list);
3394 debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3395 npaths_sub1,npaths_sub2,List_length(stage3list)));
3396 }
3397 }
3398
3399 *mergedp = false;
3400 *chimera = (Chimera_T) NULL;
3401 if (npaths_sub1 == 0 || npaths_sub2 == 0) {
3402 /* Skip */
3403
3404 } else if (Chimera_bestpath(&five_score,&three_score,&chimerapos,&chimeraequivpos,&bestfrom,&bestto,
3405 stage3array_sub1,npaths_sub1,stage3array_sub2,npaths_sub2,queryntlength,
3406 CHIMERA_SLOP,circularp,/*localp*/false) == false) {
3407 /* Skip */
3408 debug2(printf("Chimera_bestpath returns false, so skipping\n"));
3409 FREE(stage3array_sub2);
3410 FREE(stage3array_sub1);
3411
3412 } else {
3413 from = stage3array_sub1[bestfrom];
3414 to = stage3array_sub2[bestto];
3415 debug2(printf("Chimera_bestpath returns bestfrom %d (%d..%d, %u..%u) to bestto %d (%d..%d, %u..%u)\n",
3416 bestfrom,Stage3_querystart(from),Stage3_queryend(from),Stage3_genomicstart(from),Stage3_genomicend(from),
3417 bestto,Stage3_querystart(to),Stage3_queryend(to),Stage3_genomicstart(to),Stage3_genomicend(to)));
3418
3419 chimeric_goodness = Stage3_chimeric_goodness(&matches0,&matches1,from,to,chimerapos);
3420 debug2(printf("chimeric goodness = %d\n",chimeric_goodness));
3421
3422 penalty = CHIMERA_PENALTY;
3423 if (chimera_margin < penalty) {
3424 /* User is looking for higher sensitivity */
3425 penalty = chimera_margin;
3426 }
3427
3428 if (chimeric_goodness < max_single_goodness + penalty) {
3429 debug2(printf("chimeric goodness not good enough relative to max_single_goodness %d and penalty %d\n",
3430 max_single_goodness,penalty));
3431
3432 } else if ((breakpoint = find_breakpoint(&chimera_cdna_direction,&chimerapos,&chimeraequivpos,&exonexonpos,
3433 &donor1,&donor2,&acceptor2,&acceptor1,
3434 &donor_watsonp,&acceptor_watsonp,&donor_prob,&acceptor_prob,from,to,
3435 #ifdef PMAP
3436 queryntseq,
3437 #endif
3438 queryseq,queryuc,queryntlength,
3439 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3440 debug2(printf("find_breakpoint returns no value\n"));
3441
3442 } else {
3443 debug2(printf("find_breakpoint returns %d\n",breakpoint));
3444
3445 /* Check to see if we can merge chimeric parts */
3446 debug2(printf("Before Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3447 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3448 debug2(printf("Before Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3449 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3450
3451 if (maxpaths_report != 1 && /* if maxpaths_report == 1, then don't want distant chimeras */
3452 Stage3_mergeable(from,to,breakpoint,queryntlength) == false &&
3453 Stage3_test_bounds(from,0,chimeraequivpos+chimera_overlap) == true &&
3454 Stage3_test_bounds(to,chimerapos+1-chimera_overlap,queryntlength) == true &&
3455 Stage3_merge_chimera(&new_left,&new_right,/*best0*/from,/*best1*/to,
3456 /*minpos1*/0,/*maxpos1*/breakpoint,
3457 /*minpos2*/breakpoint+1,/*maxpos2*/queryntlength,queryseq,
3458 #ifdef PMAP
3459 Sequence_fullpointer(queryntseq),Sequence_fullpointer(queryntseq),
3460 #else
3461 Sequence_fullpointer(queryseq),Sequence_fullpointer(queryuc),
3462 #endif
3463 pairpool,dynprogL,dynprogR,maxpeelback) == true) {
3464
3465 debug2(printf("Not mergeable -- Merging left and right as a transloc\n"));
3466 *chimera = Chimera_new(new_left,new_right,chimerapos,chimeraequivpos,exonexonpos,chimera_cdna_direction,
3467 donor1,donor2,acceptor2,acceptor1,donor_watsonp,acceptor_watsonp,
3468 donor_prob,acceptor_prob);
3469
3470 debug2(printf("Before merge_left_and_right_transloc, bestfrom is %p, query %d..%d\n",
3471 from,Stage3_querystart(from),Stage3_queryend(from)));
3472 debug2(printf("Before merge_left_and_right_transloc, bestto is %p, query %d..%d\n",
3473 to,Stage3_querystart(to),Stage3_queryend(to)));
3474
3475 /* Used to call merge_left_and_right_transloc */
3476 for (p = stage3list; p != NULL; p = List_next(p)) {
3477 stage3 = (Stage3_T) List_head(p);
3478 Stage3_free(&stage3);
3479 }
3480 List_free(&stage3list);
3481
3482 stage3list = List_push(NULL,(void *) new_right);
3483 stage3list = List_push(stage3list,(void *) new_left);
3484 }
3485
3486 debug2(printf("After Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3487 from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3488 debug2(printf("After Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3489 to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3490 }
3491
3492 FREE(stage3array_sub2);
3493 FREE(stage3array_sub1);
3494 }
3495
3496 debug2(printf("check_for_chimera returning list of length %d\n",List_length(stage3list)));
3497 #ifdef DEBUG2
3498 for (p = stage3list; p != NULL; p = List_next(p)) {
3499 stage3 = (Stage3_T) List_head(p);
3500 printf("%p %p\n",stage3,Stage3_pairs(stage3));
3501 }
3502 #endif
3503
3504 #if 0
3505 /* Should be handled by apply_stage3 loop */
3506 /* Needed after calls to stage3_from_gregions */
3507 Stage3_recompute_goodness(stage3list);
3508 stage3list = stage3list_remove_duplicates(stage3list);
3509 #endif
3510
3511 return stage3list;
3512 }
3513
3514
3515 /* Needs to guarantee that all elements of stage3list and middlepieces end up in result */
3516 /* The Stage3_T objects from and to come from stage3list */
3517 /* The Stage3_T object middle does not come from stage3list (but from middlepieces in caller) */
3518 static List_T
merge_middlepieces(List_T stage3list,Stage3_T from,Stage3_T to,Stage3_T middle,bool mergeableAp,bool mergeableBp,int breakpointA,int breakpointB,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,int queryntlength,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool)3519 merge_middlepieces (List_T stage3list, Stage3_T from, Stage3_T to, Stage3_T middle,
3520 bool mergeableAp, bool mergeableBp,
3521 int breakpointA, int breakpointB, Sequence_T queryseq,
3522 #ifdef PMAP
3523 Sequence_T queryntseq,
3524 #endif
3525 Sequence_T queryuc, int queryntlength,
3526 Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
3527 Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool) {
3528 bool mergedAp, mergedBp;
3529 /* List_T r, p; */
3530 /* Stage3_T stage3; */
3531
3532
3533 if (mergeableAp == true && mergeableBp == true) {
3534 stage3list = merge_left_and_right_readthrough(&mergedAp,stage3list,
3535 /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3536 /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3537 breakpointA,queryntlength,queryseq,
3538 #ifdef PMAP
3539 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3540 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3541 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3542 #else
3543 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3544 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3545 #endif
3546 pairpool,dynprogL,dynprogM,dynprogR,
3547 oligoindices_minor,diagpool,cellpool);
3548 /* List_free(&merged); */
3549
3550 stage3list = merge_left_and_right_readthrough(&mergedBp,stage3list,
3551 /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3552 /*stage3array_sub2*/&to,/*npaths_sub2:1,*//*bestto*/0,
3553 breakpointB,queryntlength,queryseq,
3554 #ifdef PMAP
3555 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3556 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3557 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3558 #else
3559 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3560 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3561 #endif
3562 pairpool,dynprogL,dynprogM,dynprogR,
3563 oligoindices_minor,diagpool,cellpool);
3564
3565 #ifndef PMAP
3566 Stage3_guess_cdna_direction(from);
3567 #endif
3568
3569 } else if (mergeableBp == true) {
3570 stage3list = merge_left_and_right_readthrough(&mergedBp,stage3list,
3571 /*stage3array_sub1*/&middle,/*npaths_sub1:1,*//*bestfrom*/0,
3572 /*stage3array_sub2*/&to,/*npaths_sub2:1,*//*bestto*/0,
3573 breakpointB,queryntlength,queryseq,
3574 #ifdef PMAP
3575 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3576 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3577 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3578 #else
3579 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3580 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3581 #endif
3582 pairpool,dynprogL,dynprogM,dynprogR,
3583 oligoindices_minor,diagpool,cellpool);
3584 #ifndef PMAP
3585 Stage3_guess_cdna_direction(middle);
3586 #endif
3587
3588 } else if (mergeableAp == true) {
3589 stage3list = merge_left_and_right_readthrough(&mergedAp,stage3list,
3590 /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3591 /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3592 breakpointA,queryntlength,queryseq,
3593 #ifdef PMAP
3594 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3595 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3596 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3597 #else
3598 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3599 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3600 #endif
3601 pairpool,dynprogL,dynprogM,dynprogR,
3602 oligoindices_minor,diagpool,cellpool);
3603
3604 #ifndef PMAP
3605 Stage3_guess_cdna_direction(from);
3606 #endif
3607 }
3608
3609 return stage3list;
3610 }
3611
3612
3613
3614 /* Returns stage3list with additional merged alignments and middle pieces */
3615 static List_T
check_middle_piece_local(bool * foundp,List_T stage3list,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3616 check_middle_piece_local (bool *foundp, List_T stage3list, Sequence_T queryseq, Sequence_T queryuc,
3617 #ifdef PMAP
3618 Sequence_T queryntseq,
3619 #endif
3620 int queryntlength, Stage2_alloc_T stage2_alloc,
3621 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3622 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3623 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3624 Sequence_T querysubseq = NULL, querysubuc = NULL;
3625 int npaths, i, j;
3626 Stage3_T from = NULL, to = NULL, middle = NULL;
3627 Stage3_T *by_queryend, *by_querystart;
3628 List_T r;
3629 bool plusp;
3630 int genestrand;
3631
3632 int querystart, queryend;
3633 Chrpos_T chrstart, chrend, chrlength;
3634 Univcoord_T chroffset, chrhigh;
3635 Chrnum_T chrnum;
3636
3637 int breakpointA = 0, chimeraposA, chimeraequivposA, exonexonposA;
3638 char donorA1, donorA2, acceptorA2, acceptorA1;
3639 bool donor_watsonp_A, acceptor_watsonp_A;
3640 double donor_prob_A, acceptor_prob_A;
3641
3642 int breakpointB = 0, chimeraposB, chimeraequivposB, exonexonposB;
3643 char donorB1, donorB2, acceptorB2, acceptorB1;
3644 bool donor_watsonp_B, acceptor_watsonp_B;
3645 double donor_prob_B, acceptor_prob_B;
3646
3647 int chimera_cdna_direction_A, chimera_cdna_direction_B;
3648 bool mergeableAp, mergeableBp;
3649
3650 List_T all_middlepieces = NULL, middlepieces;
3651 #ifdef DEBUG2A
3652 List_T p;
3653 Stage3_T stage3;
3654 #endif
3655
3656
3657 #ifdef DEBUG2A
3658 for (p = stage3list; p != NULL; p = List_next(p)) {
3659 stage3 = (Stage3_T) List_head(p);
3660 Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3661 printf("\n");
3662 }
3663 #endif
3664
3665 *foundp = false;
3666
3667 by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3668 qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
3669
3670 by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3671 qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
3672
3673 j = 0;
3674 for (i = 0; i < npaths && *foundp == false; i++) {
3675 from = by_queryend[i];
3676 queryend = Stage3_queryend(from);
3677
3678 while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend) {
3679 j++;
3680 }
3681 j--;
3682
3683 while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend) {
3684 j--;
3685 }
3686 j++;
3687
3688 for ( ; j < npaths && *foundp == false; j++) {
3689 to = by_querystart[j];
3690
3691 if (middle_piece_local_p(&querystart,&queryend,&chrstart,&chrend,
3692 &chrnum,&chroffset,&chrhigh,&chrlength,&plusp,&genestrand,
3693 from,to) == true) {
3694 debug2(printf("Found middle piece missing from %d to %d\n",i,j));
3695
3696 if ((querysubseq = Sequence_subsequence(queryseq,querystart,queryend)) != NULL) {
3697 if ((querysubuc = Sequence_subsequence(queryuc,querystart,queryend)) != NULL) {
3698 debug2(printf("Performing Stage 3 on %d..%d against %u..%u\n",
3699 querystart,queryend,chrstart,chrend));
3700 if ((middlepieces = update_stage3list(/*stage3list*/NULL,querysubseq,
3701 #ifdef PMAP
3702 queryntseq,
3703 #endif
3704 querysubuc,stage2_alloc,oligoindices_major,oligoindices_minor,
3705 pairpool,diagpool,cellpool,
3706 /*straintype*/0,/*strain*/NULL,chrnum,
3707 chroffset,chrhigh,chrlength,chrstart,chrend,plusp,genestrand,
3708 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL)) != NULL) {
3709 middlepieces = stage3list_sort(middlepieces);
3710
3711 /* 1. Look first for middle piece that joins locally on both ends */
3712 r = middlepieces;
3713 mergeableAp = mergeableBp = false;
3714 while (r != NULL && (mergeableAp == false || mergeableBp == false)) {
3715 middle = (Stage3_T) List_head(r);
3716 if (Chimera_local_join_p(from,middle,CHIMERA_SLOP) == true && Chimera_local_join_p(middle,to,CHIMERA_SLOP) == true) {
3717 if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3718 &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3719 &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3720 from,/*to*/middle,
3721 #ifdef PMAP
3722 queryntseq,
3723 #endif
3724 queryseq,queryuc,queryntlength,
3725 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3726 mergeableAp = false;
3727 } else {
3728 mergeableAp = Stage3_mergeable(from,/*to*/middle,breakpointA,queryntlength);
3729 }
3730
3731 if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3732 &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3733 &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3734 /*from*/middle,to,
3735 #ifdef PMAP
3736 queryntseq,
3737 #endif
3738 queryseq,queryuc,queryntlength,
3739 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3740 mergeableBp = false;
3741 } else {
3742 mergeableBp = Stage3_mergeable(/*from*/middle,to,breakpointB,queryntlength);
3743 }
3744 }
3745 r = List_next(r);
3746 } /* End of while loop looking for dual merge */
3747
3748 if (mergeableAp == true && mergeableBp == true) {
3749 debug2(printf("Middle segment %p found and mergeable locally with both! -- Merging three as a readthrough.\n",middle));
3750 *foundp = true;
3751 } else {
3752 /* 2. Look for middle piece that joins locally on one end */
3753 r = middlepieces;
3754 mergeableAp = mergeableBp = false;
3755 while (r != NULL && mergeableAp == false && mergeableBp == false) {
3756 middle = (Stage3_T) List_head(r);
3757 if (Chimera_local_join_p(from,middle,CHIMERA_SLOP) == true && Chimera_local_join_p(middle,to,CHIMERA_SLOP) == true) {
3758 if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3759 &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3760 &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3761 from,/*to*/middle,
3762 #ifdef PMAP
3763 queryntseq,
3764 #endif
3765 queryseq,queryuc,queryntlength,
3766 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3767 mergeableAp = false;
3768 } else {
3769 mergeableAp = Stage3_mergeable(from,/*to*/middle,breakpointA,queryntlength);
3770 }
3771
3772 if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3773 &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3774 &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3775 /*from*/middle,to,
3776 #ifdef PMAP
3777 queryntseq,
3778 #endif
3779 queryseq,queryuc,queryntlength,
3780 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3781 mergeableBp = false;
3782 } else {
3783 mergeableBp = Stage3_mergeable(/*from*/middle,to,breakpointB,queryntlength);
3784 }
3785 }
3786 r = List_next(r);
3787 } /* End of while loop looking for single merge */
3788
3789 if (mergeableAp == true || mergeableBp == true) {
3790 *foundp = true;
3791 }
3792 }
3793
3794 stage3list = merge_middlepieces(stage3list,from,to,middle,mergeableAp,mergeableBp,
3795 breakpointA,breakpointB,queryseq,
3796 #ifdef PMAP
3797 queryntseq,
3798 #endif
3799 queryuc,queryntlength,pairpool,dynprogL,dynprogM,dynprogR,
3800 oligoindices_minor,diagpool,cellpool);
3801 all_middlepieces = List_append(all_middlepieces,middlepieces);
3802 }
3803
3804 Sequence_free(&querysubuc);
3805 }
3806 Sequence_free(&querysubseq);
3807 }
3808 }
3809 }
3810 }
3811
3812 FREE(by_querystart);
3813 FREE(by_queryend);
3814
3815 stage3list = List_append(stage3list,all_middlepieces);
3816
3817 return stage3list;
3818 }
3819
3820
3821 /* Returns stage3list with additional merged alignments and middle pieces */
3822 static List_T
check_middle_piece_chimera(bool * foundp,List_T stage3list,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3823 check_middle_piece_chimera (bool *foundp, List_T stage3list, Sequence_T queryseq, Sequence_T queryuc,
3824 #ifdef PMAP
3825 Sequence_T queryntseq,
3826 #endif
3827 int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
3828 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3829 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3830 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3831 Sequence_T querysubseq = NULL, querysubuc = NULL;
3832 int npaths, i, j;
3833 Stage3_T bestfrom, bestto, from, to, middle;
3834 Stage3_T *by_queryend, *by_querystart;
3835 List_T r;
3836 int querystart, queryend, maxdist, dist;
3837
3838 int breakpointA, chimeraposA, chimeraequivposA, exonexonposA;
3839 char donorA1, donorA2, acceptorA2, acceptorA1;
3840 bool donor_watsonp_A, acceptor_watsonp_A;
3841 double donor_prob_A, acceptor_prob_A;
3842
3843 int breakpointB, chimeraposB, chimeraequivposB, exonexonposB;
3844 char donorB1, donorB2, acceptorB2, acceptorB1;
3845 bool donor_watsonp_B, acceptor_watsonp_B;
3846 double donor_prob_B, acceptor_prob_B;
3847
3848 int chimera_cdna_direction_A, chimera_cdna_direction_B;
3849 bool mergeableAp, mergeableBp, mergedAp, mergedBp;
3850
3851 List_T middlepieces = NULL;
3852 Diagnostic_T diagnostic;
3853 List_T gregions;
3854 bool lowidentityp, poorp, repetitivep;
3855
3856
3857 #ifdef DEBUG2A
3858 for (p = stage3list; p != NULL; p = List_next(p)) {
3859 stage3 = (Stage3_T) List_head(p);
3860 Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3861 printf("\n");
3862 }
3863 #endif
3864
3865 by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3866 qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
3867
3868 by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3869 qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
3870
3871 maxdist = 0;
3872 j = 0;
3873 for (i = 0; i < npaths; i++) {
3874 from = by_queryend[i];
3875 queryend = Stage3_queryend(from);
3876
3877 while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend) {
3878 j++;
3879 }
3880 j--;
3881
3882 while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend) {
3883 j--;
3884 }
3885 j++;
3886
3887 if (j < npaths) {
3888 /* Should have the first querystart just after queryend */
3889 to = by_querystart[j];
3890
3891 if ((dist = Stage3_queryend(to) - Stage3_querystart(from)) > maxdist) {
3892 bestfrom = from;
3893 bestto = to;
3894 maxdist = dist;
3895 }
3896 }
3897 }
3898
3899 FREE(by_querystart);
3900 FREE(by_queryend);
3901
3902
3903 *foundp = false;
3904 if (maxdist < CHIMERA_SLOP) {
3905 debug2(printf("maxdist %d < CHIMERA_SLOP %d\n",maxdist,CHIMERA_SLOP));
3906 } else {
3907 if (middle_piece_chimera_p(&querystart,&queryend,bestfrom,bestto) == true) {
3908 if ((querysubseq = Sequence_subsequence(queryseq,querystart,queryend)) != NULL) {
3909 if ((querysubuc = Sequence_subsequence(queryuc,querystart,queryend)) != NULL) {
3910 debug2(printf("Performing Stage 3 on %d..%d\n",querystart,queryend));
3911
3912 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),
3913 Sequence_fulllength(querysubuc),Oligoindex_array_elt(oligoindices_major,0));
3914 if (poorp == true || repetitivep == true) {
3915 debug2(printf("Subsequence is poor or repetitive\n"));
3916 } else {
3917 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3918 gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3919 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3920 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3921 } else {
3922 gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3923 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3924 stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3925 }
3926 debug2(printf("Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3927 middlepieces = stage3_from_gregions(/*stage3list*/NULL,gregions,querysubseq,querysubuc,
3928 #ifdef PMAP
3929 queryntseq,
3930 #endif
3931 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3932 pairpool,diagpool,cellpool,
3933 dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3934 }
3935 Diagnostic_free(&diagnostic);
3936
3937 /* Above function frees gregions */
3938 Sequence_free(&querysubuc);
3939 }
3940 Sequence_free(&querysubseq);
3941 }
3942 }
3943
3944 if (middlepieces != NULL) {
3945 middlepieces = stage3list_sort(middlepieces);
3946
3947 r = middlepieces;
3948 mergeableAp = mergeableBp = false;
3949 while (r != NULL && mergeableAp == false && mergeableBp == false) {
3950 middle = (Stage3_T) List_head(r);
3951 if (middle != bestfrom && middle != bestto) {
3952 if (Chimera_local_join_p(bestfrom,middle,CHIMERA_SLOP) == true) {
3953 if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3954 &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3955 &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3956 bestfrom,/*to*/middle,
3957 #ifdef PMAP
3958 queryntseq,
3959 #endif
3960 queryseq,queryuc,queryntlength,
3961 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3962 mergeableAp = false;
3963 } else {
3964 mergeableAp = Stage3_mergeable(bestfrom,/*to*/middle,breakpointA,queryntlength);
3965 }
3966 }
3967 if (Chimera_local_join_p(middle,bestto,CHIMERA_SLOP) == true) {
3968 if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3969 &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3970 &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3971 /*from*/middle,to,
3972 #ifdef PMAP
3973 queryntseq,
3974 #endif
3975 queryseq,queryuc,queryntlength,
3976 genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3977 mergeableBp = false;
3978 } else {
3979 mergeableBp = Stage3_mergeable(/*from*/middle,bestto,breakpointB,queryntlength);
3980 }
3981 }
3982 }
3983 r = List_next(r);
3984 }
3985
3986 if (mergeableAp == true) {
3987 debug2(printf("Middle segment %p found and mergeable locally with from! -- Merging as a readthrough. cdna_direction = %d\n",
3988 middle,chimera_cdna_direction_A));
3989 stage3list =
3990 merge_left_and_right_readthrough(&mergedAp,stage3list,
3991 /*stage3array_sub1*/&bestfrom,/*npaths_sub1:1,*//*bestfrom*/0,
3992 /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3993 breakpointA,queryntlength,queryseq,
3994 #ifdef PMAP
3995 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3996 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3997 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3998 #else
3999 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
4000 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
4001 #endif
4002 pairpool,dynprogL,dynprogM,dynprogR,
4003 oligoindices_minor,diagpool,cellpool);
4004
4005 #ifndef PMAP
4006 Stage3_guess_cdna_direction(from);
4007 #endif
4008
4009 if (mergedAp == true) {
4010 *foundp = true;
4011 }
4012
4013 } else if (mergeableBp == true) {
4014 debug2(printf("Middle segment %p found and mergeable locally with to! -- Merging as a readthrough. cdna_direction = %d\n",
4015 middle,chimera_cdna_direction_B));
4016 stage3list =
4017 merge_left_and_right_readthrough(&mergedBp,stage3list,
4018 /*stage3array_sub1*/&middle,/*npaths_sub1:1,*//*bestfrom*/0,
4019 /*stage3array_sub2*/&bestto,/*npaths_sub2:1,*//*bestto*/0,
4020 breakpointB,queryntlength,queryseq,
4021 #ifdef PMAP
4022 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
4023 /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
4024 /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
4025 #else
4026 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
4027 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
4028 #endif
4029 pairpool,dynprogL,dynprogM,dynprogR,
4030 oligoindices_minor,diagpool,cellpool);
4031 #ifndef PMAP
4032 Stage3_guess_cdna_direction(middle);
4033 #endif
4034
4035 if (mergedBp == true) {
4036 *foundp = true;
4037 }
4038
4039 } else {
4040 debug2(printf("Middle segment found but notmergeable\n"));
4041 }
4042
4043 }
4044 }
4045
4046 stage3list = List_append(stage3list,middlepieces);
4047
4048 return stage3list;
4049 }
4050
4051
4052
4053 static List_T
apply_stage3(bool * mergedp,Chimera_T * chimera,List_T gregions,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)4054 apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, Sequence_T queryseq, Sequence_T queryuc,
4055 #ifdef PMAP
4056 Sequence_T queryntseq,
4057 #endif
4058 Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
4059 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
4060 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
4061 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, Stopwatch_T worker_stopwatch) {
4062 List_T stage3list, newstage3list, split_objects, p, q;
4063 Stage3_T nonchimericbest, chimera1, chimera2, stage3, newstage3;
4064 bool testlocalp, testchimerap, foundp;
4065 int effective_start, effective_end;
4066 int queryntlength;
4067 int iter;
4068
4069 Chrnum_T chrnum;
4070 Univcoord_T chroffset, chrhigh;
4071 Chrpos_T chrlength;
4072 List_T pairs_below, pairs_above;
4073 bool watsonp;
4074 int cdna_direction, genestrand, sensedir;
4075
4076
4077 *mergedp = false;
4078 *chimera = NULL;
4079
4080 debug(printf("Calling stage3_from_gregions\n"));
4081 stage3list = stage3_from_gregions(/*stage3list*/(List_T) NULL,gregions,queryseq,queryuc,
4082 #ifdef PMAP
4083 queryntseq,
4084 #endif
4085 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4086 pairpool,diagpool,cellpool,
4087 dynprogL,dynprogM,dynprogR,worker_stopwatch);
4088
4089 debug2(printf("Initial search gives stage3list of length %d\n",List_length(stage3list)));
4090 #ifdef DEBUG2
4091 for (p = stage3list; p != NULL; p = List_next(p)) {
4092 Stage3_print_ends(List_head(p));
4093 }
4094 #endif
4095
4096 if (diag_debug == true) {
4097 return stage3list; /* really diagonals */
4098 }
4099
4100 queryntlength = Sequence_ntlength(queryseq);
4101
4102 if (stage3list != NULL) {
4103 iter = 0;
4104 testlocalp = true;
4105 while (testlocalp == true && iter++ < MAX_CHIMERA_ITER) {
4106 debug2(printf("\n\n*** Testing for local on %d Stage3_T objects, iter %d ***\n",
4107 List_length(stage3list),iter));
4108
4109 /* Stage3_recompute_goodness(stage3list); */
4110 /* stage3list = stage3list_remove_duplicates(stage3list); */
4111 stage3list = stage3list_sort(stage3list);
4112
4113 #ifdef DEBUG2
4114 for (p = stage3list; p != NULL; p = List_next(p)) {
4115 Stage3_print_ends(List_head(p));
4116 }
4117 printf("\n");
4118 #endif
4119 nonchimericbest = (Stage3_T) List_head(stage3list);
4120 debug2(printf("nonchimericbest is %p\n",nonchimericbest));
4121
4122 #if 0
4123 if (List_length(stage3list) <= 1) {
4124 debug2(printf("Only 0 or 1 alignments, so won't look for local\n"));
4125 testlocalp = false;
4126 }
4127 else
4128 #endif
4129
4130 if (Stage3_domain(nonchimericbest) < chimera_margin) {
4131 debug2(printf("Existing alignment is too short, so won't look for local\n"));
4132 testlocalp = false;
4133
4134 #if 0
4135 } else if (Stage3_fracidentity(nonchimericbest) < CHIMERA_IDENTITY &&
4136 Chimera_alignment_break(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq),CHIMERA_FVALUE) >= chimera_margin
4137 ) {
4138 debug2(printf("Break in alignment quality at %d..%d detected, so will look for local\n",
4139 effective_start,effective_end));
4140 testlocalp = true;
4141 #endif
4142
4143 } else if (Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)) >= chimera_margin) {
4144 debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for local\n",
4145 effective_start,effective_end,Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)),chimera_margin));
4146 testlocalp = true;
4147
4148 } else {
4149 debug2(printf("Good alignment already with identity %f, so won't look for local\n",
4150 Stage3_fracidentity(nonchimericbest)));
4151 testlocalp = false;
4152 }
4153
4154 if (testlocalp == true) {
4155 testlocalp = false;
4156 debug2(printf("Checking for local, starting with list length %d, effective_start %d, effective_end %d\n",
4157 List_length(stage3list),effective_start,effective_end));
4158 stage3list = check_for_local(&(*mergedp),stage3list,effective_start,effective_end,
4159 queryseq,queryuc,
4160 #ifdef PMAP
4161 queryntseq,
4162 #endif
4163 queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4164 matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4165 debug2(printf("After check for local, we still have %d paths\n",List_length(stage3list)));
4166
4167 #if 0
4168 /* For some reason, we need to filter out cases where npairs is 0 */
4169 old = stage3list;
4170 stage3list = (List_T) NULL;
4171 for (p = old; p != NULL; p = List_next(p)) {
4172 stage3 = (Stage3_T) List_head(p);
4173 if (Stage3_npairs(stage3) == 0) {
4174 Stage3_free(&stage3);
4175 } else {
4176 stage3list = List_push(stage3list,(void *) stage3);
4177 }
4178 }
4179 List_free(&old);
4180 #endif
4181
4182 if (*mergedp == true) {
4183 testlocalp = true; /* Local merge */
4184 } else if (iter == 1) {
4185 /* Check for middle pieces only on first iteration */
4186 debug2(printf("Checking for middle piece local, starting with list length %d\n",List_length(stage3list)));
4187 stage3list = check_middle_piece_local(&foundp,stage3list,queryseq,queryuc,
4188 #ifdef PMAP
4189 queryntseq,
4190 #endif
4191 queryntlength,stage2_alloc,oligoindices_major,oligoindices_minor,
4192 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4193 if (foundp == true) {
4194 /* Iterate */
4195 testlocalp = true;
4196 }
4197 } else {
4198 testlocalp = false;
4199 }
4200 }
4201 }
4202 }
4203
4204 if (stage3list != NULL) {
4205 iter = 0;
4206 testchimerap = true;
4207 while (testchimerap == true && iter++ < MAX_CHIMERA_ITER) {
4208 debug2(printf("\n\n*** Testing for chimera on %d Stage3_T objects, iter %d ***\n",
4209 List_length(stage3list),iter));
4210
4211 /* Stage3_recompute_goodness(stage3list); */
4212 /* stage3list = stage3list_remove_duplicates(stage3list); */
4213 stage3list = stage3list_sort(stage3list);
4214
4215 #ifdef DEBUG2
4216 for (p = stage3list; p != NULL; p = List_next(p)) {
4217 Stage3_print_ends(List_head(p));
4218 }
4219 printf("\n");
4220 #endif
4221 nonchimericbest = (Stage3_T) List_head(stage3list);
4222 debug2(printf("nonchimericbest is %p\n",nonchimericbest));
4223
4224 if (novelsplicingp == false) {
4225 testchimerap = false;
4226
4227 } else if (chimera_margin <= 0) {
4228 debug2(printf("turned off\n"));
4229 testchimerap = false;
4230
4231 } else if (maxpaths_report == 1) {
4232 debug2(printf("maxpaths set to 1\n"));
4233 testchimerap = false;
4234
4235 } else if (Stage3_domain(nonchimericbest) < chimera_margin) {
4236 debug2(printf("Existing alignment is too short, so won't look for chimera\n"));
4237 testchimerap = false;
4238
4239 #if 0
4240 } else if (Stage3_fracidentity(nonchimericbest) < CHIMERA_IDENTITY &&
4241 Chimera_alignment_break(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq),CHIMERA_FVALUE) >= chimera_margin
4242 ) {
4243 debug2(printf("Break in alignment quality at %d..%d detected, so will look for chimera\n",
4244 effective_start,effective_end));
4245 testchimerap = true;
4246 #endif
4247
4248 } else if (Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)) >= chimera_margin) {
4249 debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for chimera\n",
4250 effective_start,effective_end,Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)),chimera_margin));
4251 testchimerap = true;
4252
4253 } else {
4254 debug2(printf("Good alignment already with identity %f, so won't look for chimera\n",
4255 Stage3_fracidentity(nonchimericbest)));
4256 testchimerap = false;
4257 }
4258
4259 if (testchimerap == true) {
4260 testchimerap = false;
4261 debug2(printf("Checking for chimera, starting with list length %d, effective_start %d, effective_end %d\n",
4262 List_length(stage3list),effective_start,effective_end));
4263 stage3list = check_for_chimera(&(*mergedp),&(*chimera),stage3list,effective_start,effective_end,
4264 queryseq,queryuc,
4265 #ifdef PMAP
4266 queryntseq,
4267 #endif
4268 queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4269 matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4270 debug2(printf("chimera is %p\n",*chimera));
4271 if (*chimera != NULL) {
4272 testchimerap = false;
4273 } else {
4274 if (*mergedp == true) {
4275 testchimerap = true; /* Local merge */
4276 } else if (iter == 1) {
4277 /* Check for middle pieces only on first iteration */
4278 debug2(printf("Checking for middle piece chimera, starting with list length %d\n",List_length(stage3list)));
4279 stage3list = check_middle_piece_chimera(&foundp,stage3list,queryseq,queryuc,
4280 #ifdef PMAP
4281 queryntseq,
4282 #endif
4283 queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4284 matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4285 if (foundp == true) {
4286 /* Iterate */
4287 testchimerap = true;
4288 } else {
4289 testchimerap = false;
4290 }
4291 } else {
4292 testchimerap = false;
4293 }
4294 }
4295 debug2(printf("testchimerap is %d\n",testchimerap));
4296 }
4297 }
4298 }
4299
4300 debug2(printf("apply_stage3 returning list of length %d\n",List_length(stage3list)));
4301 #ifdef DEBUG2
4302 for (p = stage3list; p != NULL; p = List_next(p)) {
4303 stage3 = (Stage3_T) List_head(p);
4304 printf("%p %p\n",stage3,Stage3_pairs(stage3));
4305 }
4306 #endif
4307
4308 /* Split on large introns (may need to check whether stage3 belongs
4309 to a chimera) */
4310 if (split_large_introns_p == true) {
4311 newstage3list = (List_T) NULL;
4312 for (p = stage3list; p != NULL; p = List_next(p)) {
4313 stage3 = (Stage3_T) List_head(p);
4314 if (Stage3_npairs(stage3) == 0) {
4315 Stage3_free(&stage3);
4316 } else if ((split_objects = Stage3_split(stage3,queryseq,pairpool)) == NULL) {
4317 debug(printf("Pushing %p onto newstage3list\n",stage3));
4318 newstage3list = List_push(newstage3list,(void *) stage3);
4319 } else {
4320 for (q = split_objects; q != NULL; q = List_next(q)) {
4321 newstage3 = (Stage3_T) List_head(q);
4322 debug(printf("Pushing %p onto newstage3list\n",newstage3));
4323 newstage3list = List_push(newstage3list,(void *) newstage3);
4324 }
4325 List_free(&split_objects);
4326 Stage3_free(&stage3);
4327 }
4328 }
4329 List_free(&stage3list);
4330 stage3list = newstage3list;
4331 }
4332
4333
4334 /* Split circular alignments (need to guarantee that chimeras do not
4335 contain alignments to circular chromosomes) */
4336 newstage3list = (List_T) NULL;
4337 for (p = stage3list; p != NULL; p = List_next(p)) {
4338 stage3 = (Stage3_T) List_head(p);
4339 chrnum = Stage3_chrnum(stage3);
4340 if (circularp[chrnum] == false) {
4341 newstage3list = List_push(newstage3list,(void *) stage3);
4342 } else {
4343 chroffset = Stage3_chroffset(stage3);
4344 chrhigh = Stage3_chrhigh(stage3);
4345 chrlength = Stage3_chrlength(stage3);
4346 watsonp = Stage3_watsonp(stage3);
4347
4348 Pair_split_circular(&pairs_below,&pairs_above,Stage3_pairs(stage3),
4349 chrlength,pairpool,watsonp);
4350 #if 0
4351 printf("PAIRS BELOW\n");
4352 Pair_dump_list(pairs_below,true);
4353 printf("PAIRS ABOVE\n");
4354 Pair_dump_list(pairs_above,true);
4355 #endif
4356
4357 cdna_direction = Stage3_cdna_direction(stage3);
4358 genestrand = Stage3_genestrand(stage3);
4359 sensedir = Stage3_sensedir(stage3);
4360
4361 if ((newstage3 = Stage3_new_from_pairs(pairs_below,cdna_direction,watsonp,genestrand,sensedir,
4362 pairpool,queryseq,/*query_subseq_offset*/0,
4363 chrnum,chroffset,chrhigh,chrlength)) != NULL) {
4364 debug(printf("Pushing %p onto stage3list\n",newstage3));
4365 newstage3list = List_push(newstage3list,(void *) newstage3);
4366 }
4367 if ((newstage3 = Stage3_new_from_pairs(pairs_above,cdna_direction,watsonp,genestrand,sensedir,
4368 pairpool,queryseq,/*query_subseq_offset*/0,
4369 chrnum,chroffset,chrhigh,chrlength)) != NULL) {
4370 debug(printf("Pushing %p onto stage3list\n",newstage3));
4371 newstage3list = List_push(newstage3list,(void *) newstage3);
4372 }
4373 Stage3_free(&stage3);
4374 }
4375 }
4376 List_free(&stage3list);
4377 stage3list = newstage3list;
4378
4379
4380 /* Needed after call to stage3_from_gregions */
4381 /* Stage3_recompute_goodness(stage3list); */
4382
4383 /* Final call, so do both filtering and sorting */
4384 Stage3_recompute_coverage(stage3list,queryseq);
4385 stage3list = stage3list_filter_and_sort(&(*chimera),stage3list);
4386 debug2(printf("After filter and sort, have %d paths\n",List_length(stage3list)));
4387
4388 if (*chimera != NULL && List_length(stage3list) > 2) {
4389 /* Compare chimera against non-chimeric alignments */
4390 chimera1 = (Stage3_T) List_head(stage3list);
4391 chimera2 = (Stage3_T) List_head(List_next(stage3list));
4392 nonchimericbest = (Stage3_T) List_head(List_next(List_next(stage3list)));
4393 debug2(printf("chimera1 %d, chimera2 %d\n",Stage3_goodness(chimera1),Stage3_goodness(chimera2)));
4394 debug2(printf("%p non-chimeric %d %d..%d\n",
4395 nonchimericbest,Stage3_goodness(nonchimericbest),Stage3_querystart(nonchimericbest),Stage3_queryend(nonchimericbest)));
4396
4397 if (Stage3_queryend(nonchimericbest) > (Stage3_querystart(chimera2) + Stage3_queryend(chimera2))/2 &&
4398 Stage3_querystart(nonchimericbest) < (Stage3_querystart(chimera1) + Stage3_queryend(chimera1))/2) {
4399 stage3list = List_pop(stage3list,(void **) &chimera1);
4400 stage3list = List_pop(stage3list,(void **) &chimera2);
4401 Stage3_free(&chimera1);
4402 Stage3_free(&chimera2);
4403 Chimera_free(&(*chimera));
4404 *chimera = (Chimera_T) NULL;
4405 }
4406 }
4407
4408 debug2(printf("apply_stage3 returning %d paths\n",List_length(stage3list)));
4409 #ifdef DEBUG2
4410 for (p = stage3list; p != NULL; p = List_next(p)) {
4411 stage3 = (Stage3_T) List_head(p);
4412 printf("%p %p\n",stage3,Stage3_pairs(stage3));
4413 }
4414 #endif
4415
4416 return stage3list;
4417 }
4418
4419
4420 static Filestring_T
process_request(Filestring_T * fp_failedinput,double * worker_runtime,Request_T request,Sequence_T usersegment,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)4421 process_request (Filestring_T *fp_failedinput, double *worker_runtime, Request_T request, Sequence_T usersegment,
4422 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
4423 Stage2_alloc_T stage2_alloc, Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
4424 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
4425 Stopwatch_T worker_stopwatch) {
4426 Filestring_T fp;
4427 Result_T result;
4428 int jobid;
4429 Diagnostic_T diagnostic;
4430 Sequence_T queryseq, queryuc;
4431 Chimera_T chimera = NULL;
4432 bool mergedp, lowidentityp;
4433 bool repetitivep = false, poorp = false;
4434
4435 List_T gregions = NULL, stage3list;
4436 Stage3_T *stage3array;
4437 int npaths_primary, npaths_altloc, first_absmq, second_absmq;
4438 #ifdef PMAP
4439 Sequence_T queryntseq;
4440 #endif
4441
4442 jobid = Request_id(request);
4443 queryseq = Request_queryseq(request);
4444 Matchpool_reset(matchpool);
4445 Pairpool_reset(pairpool);
4446 Diagpool_reset(diagpool);
4447 Cellpool_reset(cellpool);
4448
4449
4450 if (worker_stopwatch != NULL) {
4451 Stopwatch_start(worker_stopwatch);
4452 }
4453
4454 if (Sequence_fulllength_given(queryseq) <= 0) {
4455 result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4456 /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4457 /*diagnostic*/NULL,EMPTY_SEQUENCE);
4458
4459 } else if (Sequence_fulllength_given(queryseq) <
4460 #ifdef PMAP
4461 index1part_aa
4462 #else
4463 index1part
4464 #endif
4465 ) {
4466 result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4467 /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4468 /*diagnostic*/NULL,SHORT_SEQUENCE);
4469
4470 } else { /* Sequence_fulllength_given(queryseq) > 0 */
4471 queryuc = Sequence_uppercase(queryseq);
4472 #ifdef PMAP
4473 queryntseq = Sequence_convert_to_nucleotides(queryseq);
4474 #endif
4475
4476 diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(queryuc),
4477 Sequence_fulllength(queryuc),Oligoindex_array_elt(oligoindices_major,0));
4478
4479 #ifndef PMAP
4480 if (poorp == true && prune_poor_p == true) {
4481 result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4482 /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4483 diagnostic,POOR_SEQUENCE);
4484 } else if (repetitivep == true && prune_repetitive_p == true) {
4485 result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4486 /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4487 diagnostic,REPETITIVE);
4488 }
4489 #endif
4490
4491 if (usersegment != NULL) {
4492 #ifndef PMAP
4493 #if 0
4494 /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
4495 Sequence_trim(queryseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
4496 Sequence_trim(queryuc,diagnostic->query_trim_start,diagnostic->query_trim_end);
4497 #endif
4498 #endif
4499 stage3array = stage3_from_usersegment(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,queryseq,queryuc,
4500 #ifdef PMAP
4501 queryntseq,
4502 #endif
4503 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4504 pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,worker_stopwatch);
4505 result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,stage3array,npaths_primary,npaths_altloc,
4506 first_absmq,second_absmq,diagnostic,NO_FAILURE);
4507
4508 } else { /* Not user segment and not maponly */
4509 #ifndef PMAP
4510 #if 0
4511 /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
4512 Sequence_trim(queryseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
4513 Sequence_trim(queryuc,diagnostic->query_trim_start,diagnostic->query_trim_end);
4514 #endif
4515 #endif
4516
4517 debug(printf("Calling stage 1\n"));
4518 if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
4519 gregions = Stage1_compute_nonstranded(&lowidentityp,queryuc,indexdb_fwd,indexdb_fwd,
4520 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
4521 stutterhits,diagnostic,worker_stopwatch,/*nbest*/10);
4522
4523 } else {
4524 gregions = Stage1_compute(&lowidentityp,queryuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
4525 chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
4526 stutterhits,diagnostic,worker_stopwatch,/*nbest*/10);
4527 }
4528 debug(printf("Got %d gregions\n",List_length(gregions)));
4529
4530 if (stage1debug == true) {
4531 /* result = Result_new_stage1debug(jobid,gregions,diagnostic,NO_FAILURE); */
4532 abort();
4533 } else {
4534 debug(printf("Applying stage 3\n"));
4535 stage3list = apply_stage3(&mergedp,&chimera,gregions,queryseq,queryuc,
4536 #ifdef PMAP
4537 queryntseq,
4538 #endif
4539 usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4540 matchpool,pairpool,diagpool,cellpool,
4541 dynprogL,dynprogM,dynprogR,worker_stopwatch);
4542 if (diag_debug == true) {
4543 #if 0
4544 result = Result_new_diag_debug(jobid,/*diagonals*/stage3list,diagnostic,NO_FAILURE);
4545 #endif
4546 abort();
4547 } else if (stage3list == NULL) {
4548 result = Result_new(jobid,mergedp,chimera,/*stage3array*/NULL,/*npaths_primary*/0,/*npaths_altloc*/0,
4549 /*first_absmq*/0,/*second_absmq*/0,diagnostic,NO_FAILURE);
4550 } else if (chimera == NULL) {
4551 stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
4552 stage3list,/*chimerap*/false,/*remove_overlaps_p*/true);
4553 debug2(printf("chimera is NULL. npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));
4554 result = Result_new(jobid,mergedp,/*chimera*/NULL,stage3array,npaths_primary,npaths_altloc,
4555 first_absmq,second_absmq,diagnostic,NO_FAILURE);
4556 } else {
4557 stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
4558 stage3list,/*chimerap*/true,/*remove_overlaps_p*/false);
4559 debug2(printf("chimera is not NULL. npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));
4560 result = Result_new(jobid,mergedp,chimera,stage3array,npaths_primary,npaths_altloc,
4561 first_absmq,second_absmq,diagnostic,NO_FAILURE);
4562 }
4563 }
4564
4565 Oligoindex_clear_inquery(Oligoindex_array_elt(oligoindices_major,0),/*queryuc_ptr*/Sequence_fullpointer(queryuc),
4566 /*querystart*/0,/*queryend*/Sequence_fulllength(queryuc));
4567
4568 } /* Matches not user segment and not maponly */
4569
4570 #ifdef PMAP
4571 Sequence_free(&queryntseq);
4572 #endif
4573 Sequence_free(&queryuc);
4574 } /* Matches sequence length > 0 */
4575
4576 fp = Output_filestring_fromresult(&(*fp_failedinput),result,request,
4577 /*headerseq*/user_pairalign_p == true ? usersegment : queryseq);
4578 *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
4579 Result_free(&result);
4580 return fp;
4581 }
4582
4583
#ifdef HAVE_SIGACTION
static const Except_T sigfpe_error = {"SIGFPE--arithmetic exception"};
static const Except_T sigsegv_error = {"SIGSEGV--segmentation violation"};
static const Except_T sigtrap_error = {"SIGTRAP--hardware fault"};
static const Except_T misc_signal_error = {"Miscellaneous signal"};

/* Handler for the signals listed below.  SIGPIPE is ignored (the
   handler returns at once).  For any other signal it reports the
   signal name on stderr when the signal is one of the known ones, adds
   an explanatory hint for SIGILL, calls Access_emergency_cleanup, waits
   on an MPI barrier in MPI builds, reports the query attached to the
   current thread in pthread builds, and terminates the process with
   exit status 9. */
static void
signal_handler (int sig) {
  const char *signame = NULL;
#ifdef HAVE_PTHREAD
  Request_T request;
  Sequence_T queryseq;
#endif

  if (sig == SIGPIPE) {
    /* Allow pipe */
    return;
  }

  /* Map the signal number to its printable name; unknown signals stay unnamed */
  switch (sig) {
  case SIGABRT: signame = "SIGABRT"; break;
  case SIGFPE: signame = "SIGFPE"; break;
  case SIGHUP: signame = "SIGHUP"; break;
  case SIGILL: signame = "SIGILL"; break;
  case SIGINT: signame = "SIGINT"; break;
  case SIGQUIT: signame = "SIGQUIT"; break;
  case SIGSEGV: signame = "SIGSEGV"; break;
  case SIGSYS: signame = "SIGSYS"; break;
  case SIGTERM: signame = "SIGTERM"; break;
  case SIGTRAP: signame = "SIGTRAP"; break;
  case SIGXCPU: signame = "SIGXCPU"; break;
  case SIGXFSZ: signame = "SIGXFSZ"; break;
  default: break;
  }

  if (signame != NULL) {
    fprintf(stderr,"Signal received: %s\n",signame);
  }

  if (sig == SIGILL) {
    fprintf(stderr,"An illegal instruction means that this program is being run on a computer\n");
    fprintf(stderr,"  with different features than the computer used to compile the program\n");
    fprintf(stderr,"You may need to re-compile the program on the same computer type as the target machine\n");
    fprintf(stderr,"  or re-compile with fewer features by doing something like\n");
    fprintf(stderr,"  ./configure --disable-simd\n");
  }

  Access_emergency_cleanup();

#ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif

#ifdef HAVE_PTHREAD
  request = (Request_T) pthread_getspecific(global_request_key);
  if (request != NULL) {
    queryseq = Request_queryseq(request);
    if (queryseq != NULL) {
      fprintf(stderr,"Problem sequence: ");
      fprintf(stderr,"%s (%d bp)\n",Sequence_accession(queryseq),Sequence_fulllength(queryseq));
    } else {
      fprintf(stderr,"Unable to retrieve queryseq for request\n");
    }
  }
  /* A NULL request is passed over silently */
#endif

  exit(9);
}
#endif
4649
4650
#define POOL_FREE_INTERVAL 200

#ifdef USE_MPI
/* Worker loop for an MPI rank that has no worker threads.

   The worker allocates its own alignment resources, announces itself to
   rank 0 with a synchronous send of filestring id -1, and then repeats:
     1. receive a request id from rank 0;
     2. skip forward in inbuffer (reads with skipp true) until nread
        reaches that id;
     3. read the query itself; if the input ran out during step 2 or
        here, the loop ends;
     4. run process_request on it (in user_pairalign_p mode the genome
        structures are first rebuilt from the current usersegment and
        released afterwards);
     5. send the resulting filestring id and filestring to rank 0, plus
        the failed-input filestring when failedinput_root is non-NULL.
   A final synchronous send of id -1 tells rank 0 that the worker is
   done, after which all per-worker resources are freed.

   Any failed MPI_SSEND, or an exception raised inside process_request,
   terminates the process with exit status 9. */
static void
worker_mpi_process (int worker_id, Inbuffer_T inbuffer) {
  bool donep = false;
  int nread = 0;
  MPI_Status status;

  Stage2_alloc_T stage2_alloc;
  Oligoindex_array_T oligoindices_major, oligoindices_minor;
  Dynprog_T dynprogL, dynprogM, dynprogR;
  Matchpool_T matchpool;
  Pairpool_T pairpool;
  Diagpool_T diagpool;
  Cellpool_T cellpool;
  Stopwatch_T worker_stopwatch;
  Request_T request;
  Filestring_T fp, fp_failedinput;
  Sequence_T queryseq, usersegment, pairalign_segment;
  int filestringid, requestid;
  int ret;
  int worker_jobid = 0;
  double worker_runtime;

#ifdef MEMUSAGE
  /* queryseq is already declared above; declaring it again here would not compile */
  long int memusage_constant = 0, memusage;
  char procname[32];            /* Room for "proc-" plus any int value */
  char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];

  snprintf(procname,sizeof procname,"proc-%d",worker_id);
  Mem_usage_set_threadname(procname);
#endif

  stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
  oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
                         /*doublep*/true);
  dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
                         /*doublep*/false);
  dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
                         /*doublep*/true);
  matchpool = Matchpool_new();
  pairpool = Pairpool_new();
  diagpool = Diagpool_new();
  cellpool = Cellpool_new();
  worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;

  usersegment = global_usersegment;

  /* Except_stack_create(); -- no worker threads, so no need to store request in global_request_key */

#ifdef MEMUSAGE
  memusage_constant += Mem_usage_report_std_heap();
  Genomicpos_commafmt_fill(comma0,memusage_constant);
  Mem_usage_reset_heap_baseline(0);
#endif

  /* Initial message to say that we are ready for a request */
  filestringid = -1;

  /* Use a synchronized send here to make sure outbuffer is ready */
  if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
    fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
    MPI_Finalize();
    exit(9);
  }

  while (donep == false) {
    MPI_RECV(&requestid,1,MPI_INT,/*source*/0,/*tag*/MPI_ANY_TAG,MPI_COMM_WORLD,&status);
    debugm(printf("worker_id %d got request %d\n",worker_id,requestid));

    /* Skip over the queries that precede the requested one */
    while (nread < requestid &&
           (queryseq = Inbuffer_read(&pairalign_segment,inbuffer,/*skipp*/true)) != NULL) {
      /* No need to free queryseq */
      nread++;
    }

    if (nread < requestid) {
      debugm(printf("because nread %d < requestid %d, worker_id %d is done\n",nread,requestid,worker_id));
      donep = true;

    } else if ((queryseq = Inbuffer_read(&pairalign_segment,inbuffer,/*skipp*/false)) == NULL) {
      debugm(printf("because final read is NULL, worker_id %d is done\n",worker_id));
      donep = true;

    } else {
      debugm(printf("worker_id %d starting to process request %d\n",worker_id,requestid));
      request = Request_new(requestid,queryseq);
      nread++;

      if (user_pairalign_p == true) {
        /* Rebuild the genome structures from the current user segment */
        genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
        Genome_user_setup(genomecomp_blocks,genomelength);
        Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
        Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
#ifdef PMAP
        Oligoindex_pmap_setup(genomecomp);
#else
        Oligoindex_hr_setup(genomecomp_blocks,mode);
        /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
#endif
      }

#ifdef MEMUSAGE
      queryseq = Request_queryseq(request);
      fprintf(stderr,"Proc %d starting %s\n",worker_id,Sequence_accession(queryseq));
      Mem_usage_reset_stack_max();
      Mem_usage_reset_heap_max();
#endif

      TRY
        fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
                             matchpool,pairpool,diagpool,cellpool,
                             stage2_alloc,oligoindices_major,oligoindices_minor,
                             dynprogL,dynprogM,dynprogR,worker_stopwatch);

      ELSE
        queryseq = Request_queryseq(request);
        if (Sequence_accession(queryseq) == NULL) {
          fprintf(stderr,"Problem with unnamed sequence (%d bp)\n",Sequence_fulllength_given(queryseq));
        } else {
          fprintf(stderr,"Problem with sequence %s (%d bp)\n",
                  Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
        }
        fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
        fprintf(stderr,"Exiting...\n");
        exit(9);
      RERAISE;
      END_TRY;

      if (user_pairalign_p == true) {
        /* The segment read alongside this query becomes the user segment for the next one */
        usersegment = pairalign_segment;
        FREE(genomecomp_blocks);
      }

      filestringid = Filestring_id(fp);
      debugm(printf("worker proc %d sending filestring %d...",worker_id,filestringid));

      /* Use a synchronized send here to make sure outbuffer is ready */
      if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
        fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
        MPI_Finalize();
        exit(9);
      }
      Filestring_Send(fp,/*dest*/0,/*tag*/MPI_TAG_DEFAULT,MPI_COMM_WORLD);
      if (failedinput_root != NULL) {
        Filestring_Send(fp_failedinput,/*dest*/0,/*tag*/MPI_TAG_DEFAULT,MPI_COMM_WORLD);
      }
      debugm(printf("done with filestring %d\n",filestringid));

      /* NOTE(review): worker_jobid is never incremented in this
         function, so this test is always true and the pools are
         released after every request rather than once per
         POOL_FREE_INTERVAL.  Left as is, because the MEMUSAGE check
         below compares the heap against zero after each request --
         confirm whether a counter increment was intended. */
      if (worker_jobid % POOL_FREE_INTERVAL == 0) {
        Pairpool_free_memory(pairpool);
        Diagpool_free_memory(diagpool);
        Cellpool_free_memory(cellpool);
        Matchpool_free_memory(matchpool);
      }

#ifdef MEMUSAGE
      /* Copy acc before we free the request */
      queryseq = Request_queryseq(request);
      strncpy(acc,Sequence_accession(queryseq),100);
      acc[100] = '\0';
#endif

      Request_free(&request);

#ifdef MEMUSAGE
      Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
      Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
      Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
      Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
      Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());

      fprintf(stderr,"Acc %s, proc %d: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
              acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);

      if ((memusage = Mem_usage_report_std_heap()) != 0) {
        fprintf(stderr,"Memory leak in proc %d of %ld bytes\n",worker_id,memusage);
        fflush(stdout);
        MPI_Finalize();
        exit(9);
      }
#endif
    }
  }

  /* Final message to say that we are done with all requests */
  debugm(printf("worker_id %d sending final message to say it is done\n",worker_id));
  filestringid = -1;
  if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
    fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
    MPI_Finalize();
    exit(9);
  }

#ifdef MEMUSAGE
  Mem_usage_std_heap_add(memusage_constant);
#endif

  /* Except_stack_destroy(); */

  /* Release resources in the reverse order of their creation */
  Stopwatch_free(&worker_stopwatch);
  Cellpool_free(&cellpool);
  Diagpool_free(&diagpool);
  Pairpool_free(&pairpool);
  Matchpool_free(&matchpool);
  Dynprog_free(&dynprogR);
  Dynprog_free(&dynprogM);
  Dynprog_free(&dynprogL);
  Oligoindex_array_free(&oligoindices_minor);
  Oligoindex_array_free(&oligoindices_major);
  Stage2_alloc_free(&stage2_alloc);

#ifdef MEMUSAGE
  Mem_usage_set_threadname("main");
#endif

  debugm(printf("worker_id %d is now returning\n",worker_id));
  return;
}
#endif
4872
4873
4874 static void
single_thread()4875 single_thread () {
4876 Stage2_alloc_T stage2_alloc;
4877 Oligoindex_array_T oligoindices_major, oligoindices_minor;
4878 Dynprog_T dynprogL, dynprogM, dynprogR;
4879 Matchpool_T matchpool;
4880 Pairpool_T pairpool;
4881 Diagpool_T diagpool;
4882 Cellpool_T cellpool;
4883 Stopwatch_T worker_stopwatch;
4884 Request_T request;
4885 Sequence_T usersegment, pairalign_segment;
4886 Filestring_T fp, fp_failedinput;
4887 Sequence_T queryseq;
4888 int jobid = 0;
4889 double worker_runtime;
4890
4891 #ifdef MEMUSAGE
4892 long int memusage, memusage_constant = 0;
4893 char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
4894 #endif
4895
4896 stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
4897 oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
4898 oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
4899 dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4900 /*doublep*/true);
4901 dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4902 /*doublep*/false);
4903 dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4904 /*doublep*/true);
4905 matchpool = Matchpool_new();
4906 pairpool = Pairpool_new();
4907 diagpool = Diagpool_new();
4908 cellpool = Cellpool_new();
4909 worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
4910
4911 usersegment = global_usersegment;
4912
4913 /* Except_stack_create(); -- requires pthreads */
4914
4915 #ifdef MEMUSAGE
4916 memusage_constant += Mem_usage_report_std_heap();
4917 Genomicpos_commafmt_fill(comma0,memusage_constant);
4918 Mem_usage_reset_heap_baseline(0);
4919 #endif
4920
4921 while ((request = Inbuffer_get_request(&pairalign_segment,inbuffer)) != NULL) {
4922
4923 if (user_pairalign_p == true) {
4924 genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
4925 Genome_user_setup(genomecomp_blocks,genomelength);
4926 Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
4927 Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
4928 #ifdef PMAP
4929 Oligoindex_pmap_setup(genomecomp);
4930 #else
4931 Oligoindex_hr_setup(genomecomp_blocks,mode);
4932 /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
4933 #endif
4934 }
4935
4936 #ifdef MEMUSAGE
4937 queryseq = Request_queryseq(request);
4938 fprintf(stderr,"Single thread starting %s\n",Sequence_accession(queryseq));
4939 Mem_usage_reset_stack_max();
4940 Mem_usage_reset_heap_max();
4941 #endif
4942
4943 TRY
4944 fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
4945 matchpool,pairpool,diagpool,cellpool,
4946 stage2_alloc,oligoindices_major,oligoindices_minor,
4947 dynprogL,dynprogM,dynprogR,worker_stopwatch);
4948 if (timingp == true) {
4949 queryseq = Request_queryseq(request);
4950 fprintf(stderr,"%s\t%.6f\n",Sequence_accession(queryseq),worker_runtime);
4951 }
4952
4953 ELSE
4954 queryseq = Request_queryseq(request);
4955 if (Sequence_accession(queryseq) == NULL) {
4956 fprintf(stderr,"Problem with unnamed sequence (%d bp)\n",Sequence_fulllength_given(queryseq));
4957 } else {
4958 fprintf(stderr,"Problem with sequence %s (%d bp)\n",
4959 Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
4960 }
4961 fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
4962 fprintf(stderr,"Exiting...\n");
4963 exit(9);
4964 RERAISE;
4965 END_TRY;
4966
4967 if (user_pairalign_p == true) {
4968 usersegment = pairalign_segment;
4969 FREE(genomecomp_blocks);
4970 }
4971
4972 Outbuffer_print_filestrings(fp,fp_failedinput);
4973
4974 if (jobid % POOL_FREE_INTERVAL == 0) {
4975 Pairpool_free_memory(pairpool);
4976 Diagpool_free_memory(diagpool);
4977 Cellpool_free_memory(cellpool);
4978 Matchpool_free_memory(matchpool);
4979 }
4980
4981 #ifdef MEMUSAGE
4982 /* Copy acc before we free the request */
4983 queryseq = Request_queryseq(request);
4984 strncpy(acc,Sequence_accession(queryseq),100);
4985 acc[100] = '\0';
4986 #endif
4987
4988 Request_free(&request);
4989
4990 #ifdef MEMUSAGE
4991 Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
4992 Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
4993 Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
4994 Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
4995 Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
4996
4997 fprintf(stderr,"Acc %s: constant %s max %s std %s keep %s in %s out %s\n",
4998 acc,comma0,comma1,comma2,comma3,comma4,comma5);
4999
5000 if ((memusage = Mem_usage_report_std_heap()) != 0) {
5001 fprintf(stderr,"Memory leak in single thread of %ld bytes\n",memusage);
5002 fflush(stdout);
5003 exit(9);
5004 }
5005 #endif
5006 }
5007
5008 #ifdef MEMUSAGE
5009 Mem_usage_std_heap_add(memusage_constant);
5010 #endif
5011
5012 /* Except_stack_destroy(); -- requires pthreads */
5013
5014 if (worker_stopwatch != NULL) {
5015 Stopwatch_free(&worker_stopwatch);
5016 }
5017 Cellpool_free(&cellpool);
5018 Diagpool_free(&diagpool);
5019 Pairpool_free(&pairpool);
5020 Matchpool_free(&matchpool);
5021 Dynprog_free(&dynprogR);
5022 Dynprog_free(&dynprogM);
5023 Dynprog_free(&dynprogL);
5024 Oligoindex_array_free(&oligoindices_minor);
5025 Oligoindex_array_free(&oligoindices_major);
5026 Stage2_alloc_free(&stage2_alloc);
5027
5028 #ifdef MEMUSAGE
5029 Mem_usage_set_threadname("main");
5030 #endif
5031
5032 return;
5033 }
5034
5035
5036 #ifdef HAVE_PTHREAD
5037 static void *
worker_thread(void * data)5038 worker_thread (void *data) {
5039 Stage2_alloc_T stage2_alloc;
5040 Oligoindex_array_T oligoindices_major, oligoindices_minor;
5041 Dynprog_T dynprogL, dynprogM, dynprogR;
5042 Matchpool_T matchpool;
5043 Pairpool_T pairpool;
5044 Diagpool_T diagpool;
5045 Cellpool_T cellpool;
5046 Stopwatch_T worker_stopwatch;
5047 Request_T request;
5048 Filestring_T fp, fp_failedinput;
5049 Sequence_T queryseq, usersegment, pairalign_segment;
5050 int worker_jobid = 0;
5051 double worker_runtime;
5052 #if defined(DEBUG) || defined(MEMUSAGE)
5053 long int worker_id = (long int) data;
5054 #endif
5055
5056 #ifdef MEMUSAGE
5057 long int memusage_constant = 0, memusage, max_memusage;
5058 char threadname[12];
5059 char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
5060 sprintf(threadname,"thread-%ld",worker_id);
5061 Mem_usage_set_threadname(threadname);
5062 #endif
5063
5064 /* Thread-specific data and storage */
5065 stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
5066 oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
5067 oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
5068 dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
5069 /*doublep*/true);
5070 dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
5071 /*doublep*/false);
5072 dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
5073 /*doublep*/true);
5074 matchpool = Matchpool_new();
5075 pairpool = Pairpool_new();
5076 diagpool = Diagpool_new();
5077 cellpool = Cellpool_new();
5078 worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
5079
5080 usersegment = global_usersegment;
5081
5082 Except_stack_create();
5083
5084 #ifdef MEMUSAGE
5085 memusage_constant += Mem_usage_report_std_heap();
5086 Genomicpos_commafmt_fill(comma0,memusage_constant);
5087 Mem_usage_reset_heap_baseline(0);
5088 #endif
5089
5090 while ((request = Inbuffer_get_request(&pairalign_segment,inbuffer)) != NULL) {
5091 debug(printf("worker_thread %ld got request %d\n",worker_id,Request_id(request)));
5092 pthread_setspecific(global_request_key,(void *) request);
5093
5094 if (user_pairalign_p == true) {
5095 genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
5096 Genome_user_setup(genomecomp_blocks,genomelength);
5097 Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
5098 Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
5099 #ifdef PMAP
5100 Oligoindex_pmap_setup(genomecomp);
5101 #else
5102 Oligoindex_hr_setup(genomecomp_blocks,mode);
5103 /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
5104 #endif
5105 }
5106
5107 #ifdef MEMUSAGE
5108 queryseq = Request_queryseq(request);
5109 fprintf(stderr,"Thread %d starting %s\n",worker_id,Sequence_accession(queryseq));
5110 Mem_usage_reset_stack_max();
5111 Mem_usage_reset_heap_max();
5112 #endif
5113
5114 TRY
5115 fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
5116 matchpool,pairpool,diagpool,cellpool,
5117 stage2_alloc,oligoindices_major,oligoindices_minor,
5118 dynprogL,dynprogM,dynprogR,worker_stopwatch);
5119 if (timingp == true) {
5120 queryseq = Request_queryseq(request);
5121 fprintf(stderr,"%s\t%.6f\n",Sequence_accession(queryseq),worker_runtime);
5122 }
5123
5124 ELSE
5125 queryseq = Request_queryseq(request);
5126 if (queryseq == NULL) {
5127 fprintf(stderr,"NULL");
5128 } else if (Sequence_accession(queryseq) == NULL) {
5129 fprintf(stderr,"unnamed (%d bp)",Sequence_fulllength_given(queryseq));
5130 } else {
5131 fprintf(stderr,"%s (%d bp)",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
5132 }
5133 fprintf(stderr,"\n");
5134 fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
5135
5136 fprintf(stderr,"Exiting...\n");
5137 exit(9);
5138 RERAISE;
5139 END_TRY;
5140
5141 if (user_pairalign_p == true) {
5142 usersegment = pairalign_segment;
5143 FREE(genomecomp_blocks);
5144 }
5145
5146 debug(printf("worker_thread %ld putting filestring %d\n",worker_id,Filestring_id(fp)));
5147 Outbuffer_put_filestrings(outbuffer,fp,fp_failedinput);
5148
5149 if (worker_jobid % POOL_FREE_INTERVAL == 0) {
5150 Pairpool_free_memory(pairpool);
5151 Diagpool_free_memory(diagpool);
5152 Cellpool_free_memory(cellpool);
5153 Matchpool_free_memory(matchpool);
5154 }
5155
5156 #ifdef MEMUSAGE
5157 /* Copy acc before we free the request */
5158 queryseq = Request_queryseq(request);
5159 strncpy(acc,Sequence_accession(queryseq),100);
5160 acc[100] = '\0';
5161 #endif
5162
5163 Request_free(&request);
5164
5165 #ifdef MEMUSAGE
5166 Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
5167 Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
5168 Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
5169 Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
5170 Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
5171
5172 fprintf(stderr,"Acc %s, thread %d: constant %s max %s std %s keep %s in %s out %s\n",
5173 acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);
5174
5175 if ((memusage = Mem_usage_report_std_heap()) != 0) {
5176 fprintf(stderr,"Memory leak in worker thread %ld of %ld bytes\n",worker_id,memusage);
5177 fflush(stdout);
5178 exit(9);
5179 }
5180 #endif
5181 }
5182
5183 #ifdef MEMUSAGE
5184 Mem_usage_std_heap_add(memusage_constant);
5185 #endif
5186
5187 Except_stack_destroy();
5188
5189 if (worker_stopwatch != NULL) {
5190 Stopwatch_free(&worker_stopwatch);
5191 }
5192 Cellpool_free(&cellpool);
5193 Diagpool_free(&diagpool);
5194 Pairpool_free(&pairpool);
5195 Matchpool_free(&matchpool);
5196 Dynprog_free(&dynprogR);
5197 Dynprog_free(&dynprogM);
5198 Dynprog_free(&dynprogL);
5199 Oligoindex_array_free(&oligoindices_minor);
5200 Oligoindex_array_free(&oligoindices_major);
5201 Stage2_alloc_free(&stage2_alloc);
5202
5203 #ifdef MEMUSAGE
5204 Mem_usage_set_threadname("main");
5205 #endif
5206
5207 return (void *) NULL;
5208 }
5209 #endif
5210
5211
5212 #if 0
5213
/* Disabled code: this definition sits inside an "#if 0" region and is not
 * compiled.
 *
 * As written, it first aligns referenceseq against the genome (Stage1_compute
 * or Stage1_compute_nonstranded, then apply_stage3), keeps the first
 * resulting alignment as stage3ref, and fetches the genomic segment covered
 * by that alignment.  It then aligns queryseq, and every further sequence
 * returned by Sequence_read_multifile(), against that segment with
 * stage3_from_usersegment() and prints the result relative to stage3ref.
 *
 * NOTE(review): the body appears to have drifted from the live code in this
 * file and would need updating before being re-enabled -- verify:
 *   - Oligoindex_array_new_major/minor and Dynprog_new are called with
 *     different argument lists than in single_thread()/worker_thread().
 *   - fp is used as an output stream but is not declared in this function.
 *   - chrsubet_end is spelled differently from chrsubset_start.
 *   - stage2_alloc is declared but never assigned or used.
 *   - matchpool is created but not freed at the end.
 *   - Stopwatch_free is called without the NULL guard used by the live
 *     functions, although stopwatch is NULL when timingp is not true. */
static void
align_relative (FILE *input, char **files, int nfiles, int nextchar,
                Sequence_T queryseq, Sequence_T referenceseq) {
  Stage2_alloc_T stage2_alloc;
  Oligoindex_array_T oligoindices_major, oligoindices_minor;
  Diagnostic_T diagnostic;
  bool lowidentityp;
#ifndef PMAP
  bool poorp, repetitivep;
#endif
  Dynprog_T dynprogL, dynprogM, dynprogR;
  Matchpool_T matchpool;
  Pairpool_T pairpool;
  Diagpool_T diagpool;
  Cellpool_T cellpool;
  Stopwatch_T stopwatch;

  Chrpos_T genomicstart, genomiclength;
  Sequence_T genomicseg, queryuc, referenceuc;
  int jobid = 0;

  Chimera_T chimera = NULL;
  List_T gregions, stage3list;
  Stage3_T *stage3array, stage3, stage3ref;
  int npaths_primary, npaths_altloc, i;

  oligoindices_major = Oligoindex_array_new_major(&noligoindices_major);
  oligoindices_minor = Oligoindex_array_new_minor(&noligoindices_minor);
  dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
  dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
  dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
  matchpool = Matchpool_new();
  pairpool = Pairpool_new();
  diagpool = Diagpool_new();
  cellpool = Cellpool_new();
  stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;

  Matchpool_reset(matchpool);
  Pairpool_reset(pairpool);
  Diagpool_reset(diagpool);
  Cellpool_reset(cellpool);

  /* Step 1: align the reference sequence against the genome */
  referenceuc = Sequence_uppercase(referenceseq);

  /* Do not trim the mutation refseq */
  diagnostic = Diagnostic_new();
  Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,&diagnostic->query_trimoligos,
                         &diagnostic->query_trim_start,&diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
                         Sequence_fullpointer(referenceuc),Sequence_fulllength(referenceuc),/*trimp*/false);
#ifndef PMAP
#if 0
  /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
  Sequence_trim(referenceseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
#endif
#endif
  if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
    gregions = Stage1_compute_nonstranded(&lowidentityp,referenceuc,indexdb_fwd,indexdb_rev,
                                          chromosome_iit,chrsubset_start,chrsubet_end,matchpool,
                                          stutterhits,diagnostic,/*stopwatch*/NULL);
  } else {
    gregions = Stage1_compute(&lowidentityp,referenceuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
                              chromosome_iit,chrsubset_start,chrsubet_end,matchpool,
                              stutterhits,diagnostic,/*stopwatch*/NULL);
  }
  stage3list = apply_stage3(&chimera,gregions,referenceseq,referenceuc,/*usersegment*/NULL,
                            oligoindices_major,oligoindices_minor,
                            matchpool,pairpool,diagpool,cellpool,
                            dynprogL,dynprogM,dynprogR,stopwatch);
  if (stage3list == NULL) {
    npaths_primary = npaths_altloc = 0;
    stage3array = (Stage3_T *) NULL;
  } else {
    stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,stage3list,
                                        /*chimerap*/false,/*remove_overlaps_p*/true);
  }
  debug2(printf("npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));

  Diagnostic_free(&diagnostic);

  /* chimera should be NULL */
  /* Keep only the first alignment (index 0) as the reference; free the rest */
  for (i = 1; i < npaths_primary + npaths_altloc; i++) {
    stage3 = stage3array[i];
    Stage3_free(&stage3);
  }
  if (npaths_primary + npaths_altloc > 0) {
    stage3ref = stage3array[0];
#ifdef PMAP
    Stage3_translate_cdna(stage3ref,queryseq,strictp);
    Stage3_backtranslate_cdna(stage3ref,/*diagnosticp*/false);
#else
    Stage3_translate_genomic(stage3ref,/*fulllengthp*/true,/*cds_startpos*/-1,
                             Sequence_fulllength_given(queryseq),/*truncatep*/false,strictp);
#endif
    FREE(stage3array);

    /* Step 2: fetch the genomic segment covered by the reference alignment */
    Stage3_genomicbounds(&genomicstart,&genomiclength,stage3ref);
    if (genomealt != NULL) {
      genomicseg = Genome_get_segment(genomealt,genomicstart,genomiclength,chromosome_iit,/*revcomp*/false);
    } else {
      genomicseg = Genome_get_segment(genome,genomicstart,genomiclength,chromosome_iit,/*revcomp*/false);
    }

    /* Step 3: the first pass (jobid == 0) uses the queryseq argument; later
       passes read the next sequence from the input files */
    while (jobid == 0 || (queryseq = Sequence_read_multifile(&nextchar,&input,read_files_command,&files,&nfiles)) != NULL) {
      Matchpool_reset(matchpool);
      Pairpool_reset(pairpool);
      Diagpool_reset(diagpool);
      Cellpool_reset(cellpool);

      fprintf(fp,">");
      Sequence_print_header(stdout,queryseq,checksump);
      diagnostic = Diagnostic_new();
      if (Sequence_fulllength_given(queryseq) <= 0) {
        print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,EMPTY_SEQUENCE);

      } else if (Sequence_fulllength_given(queryseq) <
#ifdef PMAP
                 index1part_aa
#else
                 index1part
#endif
                 ) {
        print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,SHORT_SEQUENCE);

      } else {

        queryuc = Sequence_uppercase(queryseq);
#ifdef PMAP
        Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
                               &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
                               &diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
                               Sequence_fullpointer(queryuc),Sequence_fulllength(queryuc),/*trimp*/false);
#else
        diagnostic->query_oligodepth =
          Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
                                 &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
                                 &diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
                                 Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/Sequence_fulllength(queryuc),
                                 /*trimp*/true);

        /* Classify the query as poor from its bad-oligo counts and trimmed length */
        if (diagnostic->query_trimoligos == 0) {
          poorp = true;
        } else if (((double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos > MAX_BADOLIGOS) ||
                   (diagnostic->query_trim_end - diagnostic->query_trim_start < 80 && diagnostic->query_badoligos > 0)) {
          poorp = true;
        } else {
          poorp = false;
        }
#if 0
        if (diagnostic->query_trimoligos == 0) {
          repetitivep = false;
        } else if (diagnostic->query_oligodepth > MAX_OLIGODEPTH ||
                   (double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos > MAX_REPOLIGOS) {
          repetitivep = true;
        } else {
          repetitivep = false;
        }
#endif
        /* The repetitive test above is disabled, so repetitivep is always false */
        repetitivep = false;

        if (poorp == true && prune_poor_p == true) {
          print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,POOR_SEQUENCE);
        } else if (repetitivep == true && prune_repetitive_p == true) {
          print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,REPETITIVE);
        } else {
#endif /* PMAP */
          /* The brace opened by the "else" just above exists only in non-PMAP
             builds; it is closed inside the "#ifndef PMAP" further down */
          stage3array = stage3_from_usersegment(&npaths_primary,&npaths_altloc,queryseq,queryuc,genomicseg,
                                                oligoindices_major,oligoindices_minor,
                                                pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,stopwatch);

          if (npaths_primary + npaths_altloc == 0) {
            print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,NO_FAILURE);
          } else if (printtype == COORDS) {
            Stage3_print_coordinates(fp,stage3array[0],chromosome_iit,invertmode);

          } else {
            /* Usual output */
            print_npaths(fp,1,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,NO_FAILURE);
#ifndef PMAP
            Stage3_translate_cdna_via_reference(stage3array[0],stage3ref);
#endif
            Stage3_fix_cdna_direction(stage3array[0],stage3ref);
            Stage3_print_mutations(stage3array[0],stage3ref,chromosome_iit,queryseq,
                                   dbversion,printtype,proteinmode,
                                   invertmode,nointronlenp,wraplength,
                                   /*snps_p*/snp_blocks ? true : false);
            for (i = 0; i < npaths_primary + npaths_altloc; i++) {
              stage3 = stage3array[i];
              Stage3_free(&stage3);
            }
            FREE(stage3array);

          }

#ifndef PMAP
        }
#endif

        Oligoindex_clear_inquery(Oligoindex_array_elt(oligoindices_major,0));

        Sequence_free(&queryuc);
      }
      Sequence_free(&queryseq);
      jobid++;
    }
    Sequence_free(&genomicseg);
    Stage3_free(&stage3ref);
  }

  Stopwatch_free(&stopwatch);
  Cellpool_free(&cellpool);
  Diagpool_free(&diagpool);
  Pairpool_free(&pairpool);
  Dynprog_free(&dynprogR);
  Dynprog_free(&dynprogM);
  Dynprog_free(&dynprogL);
  Oligoindex_array_free(&oligoindices_minor);
  Oligoindex_array_free(&oligoindices_major);

  return;
}
5434
5435 #endif
5436
5437 void
check_map_iit(IIT_T map_iit,Univ_IIT_T chromosome_iit)5438 check_map_iit (IIT_T map_iit, Univ_IIT_T chromosome_iit) {
5439 char *typestring, *lookup, *p;
5440 int type, destranded_len;
5441 bool errorp = false;
5442
5443 for (type = 1; type < IIT_ntypes(map_iit); type++) {
5444 lookup = typestring = IIT_typestring(map_iit,type);
5445 if ((p = rindex(typestring,'+')) != NULL) {
5446 destranded_len = (p - typestring)/sizeof(char);
5447 lookup = (char *) MALLOC((destranded_len+1)*sizeof(char));
5448 strncpy(lookup,typestring,destranded_len);
5449 lookup[destranded_len] = '\0';
5450
5451 } else if ((p = rindex(typestring,'-')) != NULL) {
5452 destranded_len = (p - typestring)/sizeof(char);
5453 lookup = (char *) MALLOC((destranded_len+1)*sizeof(char));
5454 strncpy(lookup,typestring,destranded_len);
5455 lookup[destranded_len] = '\0';
5456 }
5457
5458 if (Univ_IIT_find_one(chromosome_iit,lookup) < 0) {
5459 if (p != NULL) {
5460 fprintf(stderr,"Warning: In %s, type %s (without the %s) does not correspond to a known chromosome in %s.\n",
5461 map_iitfile,typestring,p,dbversion);
5462 } else {
5463 fprintf(stderr,"Warning: In %s, type %s does not correspond to a known chromosome in %s.\n",
5464 map_iitfile,typestring,dbversion);
5465 }
5466 errorp = true;
5467 }
5468
5469 if (p != NULL) {
5470 FREE(lookup);
5471 }
5472 }
5473 if (errorp == true) {
5474 fprintf(stderr,"Known chromosomes: ");
5475 Univ_IIT_dump_labels(stderr,chromosome_iit);
5476 }
5477 return;
5478 }
5479
5480
/* Parses a batch specification of the form "<batch><separator><nbatches>",
 * for example "3/10".
 *
 * part_modulus   receives the first integer (the batch number)
 * part_interval  receives the second integer (the number of batches)
 * string         the text to parse; it is not modified
 *
 * The separator may be any run of non-digit characters.  On any error
 * (unparsable integer, zero interval, negative batch number, or batch number
 * not less than the interval) a message is written to stderr and the program
 * exits with status 9. */
void
parse_part (int *part_modulus, int *part_interval, char *string) {
  char *p = string;

  if (sscanf(p,"%d",part_modulus) < 1) {
    fprintf(stderr,"Cannot parse first integer from %s\n",string);
    exit(9);
  }

  /* Step over exactly what sscanf consumed: leading whitespace, an optional
     sign and the digits.  Without the first two steps, a signed or
     space-prefixed first number was parsed a second time as the interval. */
  while (isspace((unsigned char) *p)) {
    p++;
  }
  if (*p == '+' || *p == '-') {
    p++;
  }
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  /* Step over the separator */
  while (*p != '\0' && !isdigit((unsigned char) *p)) {
    p++;
  }
  if (sscanf(p,"%d",part_interval) < 1) {
    fprintf(stderr,"Cannot parse second integer from %s\n",string);
    exit(9);
  }

  /* Check for a zero interval first; otherwise the comparison below would
     report it with a misleading message */
  if (*part_interval == 0) {
    fprintf(stderr,"Bad batch specification %s. Batch interval cannot be 0.\n",string);
    exit(9);
  }
  if (*part_modulus < 0) {
    fprintf(stderr,"In %s, batch number %d cannot be negative\n",string,*part_modulus);
    exit(9);
  }
  if ((*part_modulus) >= (*part_interval)) {
    fprintf(stderr,"In %s, batch number %d must be less than the number of batches %d\n",
            string,*part_modulus,*part_interval);
    exit(9);
  }

  return;
}
5512
5513
/* Checks that string has the form [+-]digits[e[+]digits] and returns string
 * unchanged if so.  Otherwise an error is written to stderr and the program
 * exits with status 9, so callers never see a NULL result.
 *
 * NOTE(review): callers in this file pass the result to atoi(), which stops
 * converting at the 'e'; a value such as "2e3" passes this check but would
 * be read as 2. */
static char *
check_valid_int (char *string) {
  char *p = string;

  if (*p == '+' || *p == '-') {
    p++;
  }

  /* At least one digit is required */
  if (!isdigit((unsigned char) *p)) {
    fprintf(stderr,"value %s is not a valid int\n",string);
    exit(9);
    return NULL;
  }
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  /* Optional exponent; only a non-negative one can denote an integer */
  if (*p == 'e') {
    p++;
    if (*p == '+') {
      p++;
    }
    if (!isdigit((unsigned char) *p)) {
      /* Previously this path did "return false", i.e. silently returned a
         null pointer for the caller to dereference */
      fprintf(stderr,"value %s is not a valid int\n",string);
      exit(9);
      return NULL;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  if (*p == '\0') {
    return string;
  } else {
    fprintf(stderr,"value %s is not a valid int\n",string);
    exit(9);
    return NULL;
  }
}
5552
5553
/* Validates string as a number between 0.0 and 1.0 inclusive and returns its
 * value.
 *
 * string  text to check; accepted forms are [+-]digits and
 *         [+-][digits][.]digits[e[+-]digits]
 * option  option name, used only in error messages
 *
 * On a syntax error or an out-of-range value, a message is written to stderr
 * and the program exits with status 9.  An empty or sign-only string is now
 * rejected; previously it was accepted and converted to 0.0. */
static double
check_valid_float (char *string, const char *option) {
  double value;
  char *p = string;
  char *mantissa;

  if (*p == '+' || *p == '-') {
    p++;
  }

  mantissa = p;
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  if (*p == '\0') {
    /* Plain integer form: needs at least one digit */
    if (p == mantissa) {
      goto invalid;
    }

  } else {
    if (*p == '.') {
      p++;
    }

    /* Digits are required here, so "1." and "1e3" are not accepted */
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }

    if (*p == 'e') {
      p++;
      if (*p == '+' || *p == '-') {
        p++;
      }
      if (!isdigit((unsigned char) *p)) {
        goto invalid;
      }
      while (isdigit((unsigned char) *p)) {
        p++;
      }
    }

    if (*p != '\0') {
      goto invalid;
    }
  }

  value = atof(string);
  if (value > 1.0 || value < 0.0) {
    fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
    exit(9);
  }
  return value;

 invalid:
  fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
  exit(9);
  return 0.0;
}
5616
/* Checks that string is either [+-]digits or
 * [+-][digits][.]digits[e[+-]digits] and returns string unchanged if so.
 * Otherwise an error is written to stderr and the program exits with
 * status 9, so callers never see a NULL result.  No range check is done.
 *
 * An empty or sign-only string is now rejected; previously it was accepted
 * and returned as valid. */
static char *
check_valid_float_or_int (char *string) {
  char *p = string;
  char *mantissa;

  if (*p == '+' || *p == '-') {
    p++;
  }

  mantissa = p;
  while (isdigit((unsigned char) *p)) {
    p++;
  }
  if (*p == '\0') {
    /* Plain integer form: needs at least one digit */
    if (p == mantissa) {
      fprintf(stderr,"value %s is not a valid float\n",string);
      exit(9);
      return NULL;
    }
    return string;
  }

  if (*p == '.') {
    p++;
  }

  /* Digits are required here, so "1." and "1e3" are not accepted */
  if (!isdigit((unsigned char) *p)) {
    fprintf(stderr,"value %s is not a valid float\n",string);
    exit(9);
    return NULL;
  }
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  if (*p == 'e') {
    p++;
    if (*p == '+' || *p == '-') {
      p++;
    }
    if (!isdigit((unsigned char) *p)) {
      fprintf(stderr,"value %s is not a valid float\n",string);
      exit(9);
      return NULL;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  if (*p == '\0') {
    return string;
  } else {
    fprintf(stderr,"value %s is not a valid float\n",string);
    exit(9);
    return NULL;
  }
}
5668
5669
5670 static int
parse_command_line(int argc,char * argv[],int optind)5671 parse_command_line (int argc, char *argv[], int optind) {
5672 int opt, c;
5673 extern char *optarg;
5674 int long_option_index = 0;
5675 const char *long_name;
5676 char **argstart;
5677
5678 int len;
5679 int user_ngap = -1;
5680
5681
5682 fprintf(stderr,"GMAP version %s called with args:",PACKAGE_VERSION);
5683 argstart = &(argv[-optind]);
5684 for (c = 1; c < argc + optind; c++) {
5685 fprintf(stderr," %s",argstart[c]);
5686 }
5687 fprintf(stderr,"\n");
5688
5689 while ((opt = getopt_long(argc,argv,
5690 #ifdef PMAP
5691 "q:D:a:d:k:g:2B:K:w:L:x:1t:s:c:SA03468:9n:f:ZO5o:V:v:M:m:ebu:E:PQYNI:i:l:",
5692 #else
5693 "q:D:d:k:g:2B:K:w:L:x:1t:s:c:p:SA03468:9n:f:ZO5o:V:v:M:m:ebu:E:PQFa:Tz:j:YNI:i:l:",
5694 #endif
5695 long_options, &long_option_index)) != -1) {
5696 switch (opt) {
5697 case 0:
5698 long_name = long_options[long_option_index].name;
5699 if (!strcmp(long_name,"version")) {
5700 print_program_version();
5701 return 1;
5702 } else if (!strcmp(long_name,"check")) {
5703 check_compiler_assumptions();
5704 return 1;
5705 } else if (!strcmp(long_name,"help")) {
5706 print_program_usage();
5707 return 1;
5708
5709 } else if (!strcmp(long_name,"time")) {
5710 timingp = true;
5711
5712 } else if (!strcmp(long_name,"use-shared-memory")) {
5713 if (!strcmp(optarg,"1")) {
5714 sharedp = true;
5715 } else if (!strcmp(optarg,"0")) {
5716 sharedp = false;
5717 } else {
5718 fprintf(stderr,"--use-shared-memory flag must be 0 or 1\n");
5719 return 9;
5720 }
5721
5722 } else if (!strcmp(long_name,"preload-shared-memory")) {
5723 preload_shared_memory_p = true;
5724
5725 } else if (!strcmp(long_name,"unload-shared-memory")) {
5726 unload_shared_memory_p = true;
5727
5728 } else if (!strcmp(long_name,"expand-offsets")) {
5729 fprintf(stderr,"Note: --expand-offsets flag is no longer supported. With the latest algorithms, it doesn't improve speed much. Ignoring this flag");
5730
5731 } else if (!strcmp(long_name,"sampling")) {
5732 required_index1interval = atoi(check_valid_int(optarg));
5733
5734 } else if (!strcmp(long_name,"cmdline")) {
5735 user_cmdline = optarg;
5736
5737 } else if (!strcmp(long_name,"suboptimal-score")) {
5738 suboptimal_score_float = atof(check_valid_float_or_int(optarg));
5739 if (suboptimal_score_float > 1.0 && suboptimal_score_float != rint(suboptimal_score_float)) {
5740 fprintf(stderr,"Cannot specify fractional value %f for --suboptimal-score except between 0.0 and 1.0\n",
5741 suboptimal_score_float);
5742 return 9;
5743 }
5744
5745 } else if (!strcmp(long_name,"require-splicedir")) {
5746 require_splicedir_p = true;
5747
5748 } else if (!strcmp(long_name,"splicingdir")) {
5749 user_splicingdir = optarg;
5750
5751 } else if (!strcmp(long_name,"nosplicing")) {
5752 novelsplicingp = false;
5753
5754 } else if (!strcmp(long_name,"no-chimeras")) {
5755 chimera_margin = 0;
5756
5757 } else if (!strcmp(long_name,"translation-code")) {
5758 translation_code = atoi(check_valid_int(optarg));
5759
5760 } else if (!strcmp(long_name,"alt-start-codons")) {
5761 alt_initiation_codons_p = true;
5762
5763 } else if (!strcmp(long_name,"min-intronlength")) {
5764 min_intronlength = atoi(check_valid_int(optarg));
5765
5766 } else if (!strcmp(long_name,"max-intronlength-middle")) {
5767 maxintronlen = atoi(check_valid_int(optarg));
5768
5769 } else if (!strcmp(long_name,"max-intronlength-ends")) {
5770 maxintronlen_ends = atoi(check_valid_int(optarg));
5771
5772 } else if (!strcmp(long_name,"split-large-introns")) {
5773 split_large_introns_p = true;
5774
5775 } else if (!strcmp(long_name,"trim-end-exons")) {
5776 minendexon = atoi(check_valid_int(optarg));
5777
5778 } else if (!strcmp(long_name,"allow-close-indels")) {
5779 if (!strcmp(optarg,"0")) {
5780 /* Disallow */
5781 close_indels_mode = -1;
5782 extraband_single = 0;
5783 } else if (!strcmp(optarg,"1")) {
5784 /* Always allow */
5785 close_indels_mode = +1;
5786 extraband_single = 3;
5787 } else if (!strcmp(optarg,"2")) {
5788 /* Allow for high-quality alignments */
5789 close_indels_mode = 0;
5790 extraband_single = 3;
5791 } else {
5792 fprintf(stderr,"allow-close-indels argument %s not recognized. Only allow 0, 1, or 2. Run 'gsnap --help' for more information.\n",optarg);
5793 return 9;
5794 }
5795 } else if (!strcmp(long_name,"microexon-spliceprob")) {
5796 microexon_spliceprob = check_valid_float(optarg,long_name);
5797 } else if (!strcmp(long_name,"stage2-start")) {
5798 suboptimal_score_start = atoi(check_valid_int(optarg));
5799 } else if (!strcmp(long_name,"stage2-end")) {
5800 suboptimal_score_end = atoi(check_valid_int(optarg));
5801
5802 } else if (!strcmp(long_name,"canonical-mode")) {
5803 if (!strcmp(optarg,"0")) {
5804 canonical_mode = 0;
5805 } else if (!strcmp(optarg,"1")) {
5806 canonical_mode = 1;
5807 } else if (!strcmp(optarg,"2")) {
5808 canonical_mode = 2;
5809 } else {
5810 fprintf(stderr,"Canonical level %s not recognized.\n",optarg);
5811 fprintf(stderr,"0=low reward for canonical introns, 1=high reward for canonical introns (default)\n");
5812 fprintf(stderr,"2=low reward for high-identity seqs, high reward otherwise\n");
5813 return 9;
5814 }
5815
5816 } else if (!strcmp(long_name,"cross-species")) {
5817 cross_species_p = true;
5818
5819 } else if (!strcmp(long_name,"homopolymer")) {
5820 homopolymerp = true;
5821
5822 } else if (!strcmp(long_name,"cmetdir")) {
5823 user_modedir = optarg;
5824
5825 } else if (!strcmp(long_name,"atoidir")) {
5826 user_modedir = optarg;
5827
5828 } else if (!strcmp(long_name,"mode")) {
5829 if (!strcmp(optarg,"standard")) {
5830 mode = STANDARD;
5831 } else if (!strcmp(optarg,"cmet-stranded")) {
5832 mode = CMET_STRANDED;
5833 } else if (!strcmp(optarg,"cmet-nonstranded")) {
5834 mode = CMET_NONSTRANDED;
5835 fprintf(stderr,"Non-stranded mode not yet working properly\n");
5836 exit(9);
5837 } else if (!strcmp(optarg,"atoi-stranded")) {
5838 mode = ATOI_STRANDED;
5839 } else if (!strcmp(optarg,"atoi-nonstranded")) {
5840 mode = ATOI_NONSTRANDED;
5841 fprintf(stderr,"Non-stranded mode not yet working properly\n");
5842 exit(9);
5843 } else if (!strcmp(optarg,"ttoc-stranded")) {
5844 mode = TTOC_STRANDED;
5845 } else if (!strcmp(optarg,"ttoc-nonstranded")) {
5846 mode = TTOC_NONSTRANDED;
5847 fprintf(stderr,"Non-stranded mode not yet working properly\n");
5848 exit(9);
5849 } else {
5850 fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
5851 return 9;
5852 }
5853
5854 } else if (!strcmp(long_name,"min-trimmed-coverage")) {
5855 min_trimmed_coverage = check_valid_float(optarg,long_name);
5856 } else if (!strcmp(long_name,"min-identity")) {
5857 min_identity = check_valid_float(optarg,long_name);
5858
5859 } else if (!strcmp(long_name,"read-files-command")) {
5860 read_files_command = optarg;
5861
5862 } else if (!strcmp(long_name,"input-buffer-size")) {
5863 inbuffer_nspaces = atoi(check_valid_int(optarg));
5864 } else if (!strcmp(long_name,"output-buffer-size")) {
5865 output_buffer_size = atoi(check_valid_int(optarg));
5866 } else if (!strcmp(long_name,"print-comment")) {
5867 print_comment_p = true;
5868 } else if (!strcmp(long_name,"failsonly")) {
5869 if (nofailsp == true) {
5870 fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
5871 return 9;
5872 } else {
5873 failsonlyp = true;
5874 }
5875 } else if (!strcmp(long_name,"failed-input")) {
5876 failedinput_root = optarg;
5877 #if 0
5878 } else if (!strcmp(long_name,"quiet-if-excessive")) {
5879 quiet_if_excessive_p = true;
5880 #endif
5881 } else if (!strcmp(long_name,"nofails")) {
5882 if (failsonlyp == true) {
5883 fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
5884 return 9;
5885 } else {
5886 nofailsp = true;
5887 }
5888 } else if (!strcmp(long_name,"split-output")) {
5889 split_output_root = optarg;
5890 } else if (!strcmp(long_name,"append-output")) {
5891 appendp = true;
5892
5893 } else if (!strcmp(long_name,"gff3-add-separators")) {
5894 if (!strcmp(optarg,"1")) {
5895 gff3_separators_p = true;
5896 } else if (!strcmp(optarg,"0")) {
5897 gff3_separators_p = false;
5898 } else {
5899 fprintf(stderr,"--gff3-add-separators flag must be 0 or 1\n");
5900 return 9;
5901 }
5902
5903 } else if (!strcmp(long_name,"gff3-swap-phase")) {
5904 if (!strcmp(optarg,"1")) {
5905 gff3_phase_swap_p = true;
5906 } else if (!strcmp(optarg,"0")) {
5907 gff3_phase_swap_p = false;
5908 } else {
5909 fprintf(stderr,"--gff3-swap-phase flag must be 0 or 1\n");
5910 return 9;
5911 }
5912
5913 } else if (!strcmp(long_name,"gff3-fasta-annotation")) {
5914 if (!strcmp(optarg,"0")) {
5915 gff3_fasta_annotation_type = NO_ANNOTATION;
5916 } else if (!strcmp(optarg,"1")) {
5917 gff3_fasta_annotation_type = INSERT_ANNOTATION;
5918 } else if (!strcmp(optarg,"2")) {
5919 gff3_fasta_annotation_type = KEYVALUE_ANNOTATION;
5920 } else {
5921 fprintf(stderr,"--gff3-fasta-annotation flag must be 0, 1, or 2\n");
5922 return 9;
5923 }
5924
5925 } else if (!strcmp(long_name,"gff3-cds")) {
5926 if (!strcmp(optarg,"cdna")) {
5927 cdstype = CDS_CDNA;
5928 } else if (!strcmp(optarg,"genomic")) {
5929 cdstype = CDS_GENOMIC;
5930 } else {
5931 fprintf(stderr,"--gff3-cds flag must be cdna or genomic\n");
5932 return 9;
5933 }
5934
5935 #ifndef PMAP
5936 } else if (!strcmp(long_name,"no-sam-headers")) {
5937 sam_headers_p = false;
5938 } else if (!strcmp(long_name,"sam-use-0M")) {
5939 sam_insert_0M_p = true;
5940 } else if (!strcmp(long_name,"sam-extended-cigar")) {
5941 sam_cigar_extended_p = true;
5942 } else if (!strcmp(long_name,"quality-protocol")) {
5943 if (user_quality_shift == true) {
5944 fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
5945 return 9;
5946 } else if (!strcmp(optarg,"illumina")) {
5947 quality_shift = -31;
5948 user_quality_shift = true;
5949 } else if (!strcmp(optarg,"sanger")) {
5950 quality_shift = 0;
5951 user_quality_shift = true;
5952 } else {
5953 fprintf(stderr,"The only values allowed for --quality-protocol are illumina or sanger\n");
5954 return 9;
5955 }
5956
5957 } else if (!strcmp(long_name,"force-xs-dir")) {
5958 force_xs_direction_p = true;
5959
5960 } else if (!strcmp(long_name,"md-lowercase-snp")) {
5961 md_lowercase_variant_p = true;
5962
5963 } else if (!strcmp(long_name,"action-if-cigar-error")) {
5964 if (!strcmp(optarg,"ignore")) {
5965 cigar_action = CIGAR_ACTION_IGNORE;
5966 } else if (!strcmp(optarg,"warning")) {
5967 cigar_action = CIGAR_ACTION_WARNING;
5968 } else if (!strcmp(optarg,"noprint")) {
5969 cigar_action = CIGAR_ACTION_NOPRINT;
5970 } else if (!strcmp(optarg,"abort")) {
5971 cigar_action = CIGAR_ACTION_ABORT;
5972 } else {
5973 fprintf(stderr,"The only values allowed for --action-if-cigar-error are ignore, warning, noprint, abort\n");
5974 return 9;
5975 }
5976
5977 } else if (!strcmp(long_name,"read-group-id")) {
5978 sam_read_group_id = optarg;
5979 } else if (!strcmp(long_name,"read-group-name")) {
5980 sam_read_group_name = optarg;
5981 } else if (!strcmp(long_name,"read-group-library")) {
5982 sam_read_group_library = optarg;
5983 } else if (!strcmp(long_name,"read-group-platform")) {
5984 sam_read_group_platform = optarg;
5985 #endif
5986 } else {
5987 /* Shouldn't reach here */
5988 fprintf(stderr,"Don't recognize option %s. For usage, run 'gmap --help'",long_name);
5989 return 9;
5990 }
5991 break;
5992
5993 case 'q': parse_part(&part_modulus,&part_interval,optarg); break;
5994 case 'D': user_genomedir = optarg; break;
5995 case 'd':
5996 dbroot = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
5997 strcpy(dbroot,optarg);
5998 break;
5999 #ifdef PMAP
6000 case 'a':
6001 if ((required_alphabet = Alphabet_find(optarg)) == AA0) {
6002 return 9;
6003 }
6004 break;
6005 case 'k': required_index1part = atoi(check_valid_int(optarg)); break;
6006 #else
6007 case 'k':
6008 required_index1part = atoi(check_valid_int(optarg));
6009 if (required_index1part > MAXIMUM_KMER) {
6010 fprintf(stderr,"The value for k-mer size must be %d or less\n",MAXIMUM_KMER);
6011 return 9;
6012 }
6013 break;
6014 #endif
6015 #if 0
6016 case 'G': uncompressedp = true; break;
6017 #endif
6018 case 'g': user_genomicseg = optarg; break;
6019 case '1': user_selfalign_p = true; break;
6020 case '2': user_pairalign_p = true; break;
6021
6022 case 'B':
6023 if (!strcmp(optarg,"5")) {
6024 fprintf(stderr,"Note: Batch mode 5 is now the same as batch mode 4.\n");
6025 offsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
6026 positions_access = USE_ALLOCATE;
6027 locoffsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
6028 locpositions_access = USE_ALLOCATE;
6029
6030 genome_access = USE_ALLOCATE;
6031
6032 } else if (!strcmp(optarg,"4")) {
6033 offsetsstrm_access = USE_ALLOCATE;
6034 positions_access = USE_ALLOCATE;
6035 locoffsetsstrm_access = USE_ALLOCATE;
6036 locpositions_access = USE_ALLOCATE;
6037
6038 genome_access = USE_ALLOCATE;
6039 #ifdef HAVE_MMAP
6040
6041 } else if (!strcmp(optarg,"3")) {
6042 offsetsstrm_access = USE_ALLOCATE;
6043 positions_access = USE_ALLOCATE;
6044 locoffsetsstrm_access = USE_ALLOCATE;
6045 locpositions_access = USE_ALLOCATE;
6046
6047 genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
6048
6049 } else if (!strcmp(optarg,"2")) {
6050 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6051 positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6052 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6053 locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6054
6055 genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
6056
6057 } else if (!strcmp(optarg,"1")) {
6058 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6059 positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6060 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6061 locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6062
6063 genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
6064
6065 } else if (!strcmp(optarg,"0")) {
6066 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6067 positions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
6068 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6069 locpositions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
6070
6071 genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
6072 #endif
6073
6074 } else {
6075 #ifdef HAVE_MMAP
6076 fprintf(stderr,"Batch mode %s not recognized. Only allow 0-5. Run 'gmap --help' for more information.\n",optarg);
6077 #else
6078 fprintf(stderr,"Batch mode %s not recognized. Only allow 4-5, since mmap is disabled. Run 'gmap --help' for more information.\n",optarg);
6079 #endif
6080 return 9;
6081 }
6082 break;
6083
6084 case 'K': maxintronlen = maxintronlen_ends = atoi(check_valid_int(optarg)); break;
6085
6086 case 'w': shortsplicedist = strtoul(check_valid_int(optarg),NULL,10); break;
6087
6088 case 'L': maxtotallen_bound = atoi(check_valid_int(optarg)); break;
6089 case 'x':
6090 #ifdef PMAP
6091 chimera_margin = atoi(check_valid_int(optarg))/3;
6092 #else
6093 chimera_margin = atoi(check_valid_int(optarg));
6094 #endif
6095 if (chimera_margin <= 0) {
6096 /* Disable finding of chimeras */
6097 #if 0
6098 } else if (chimera_margin < CHIMERA_SLOP) {
6099 /* Not sure why chimera_margin should be tied to CHIMERA_SLOP */
6100 chimera_margin = CHIMERA_SLOP;
6101 #endif
6102 }
6103 break;
6104 /* case 'w': referencefile = optarg; break; */
6105
6106 #ifdef HAVE_PTHREAD
6107 case 't': nworkers = atoi(check_valid_int(optarg)); break;
6108 #else
6109 case 't': fprintf(stderr,"This version of GMAP has pthreads disabled, so ignoring the value of %s for -t\n",optarg); break;
6110 #endif
6111
6112 case 's': splicing_file = optarg; knownsplicingp = true; break;
6113 case 'c': user_chrsubsetname = optarg; break;
6114
6115 #ifndef PMAP
6116 case 'p': switch (atoi(check_valid_int(optarg))) {
6117 case 0: prune_poor_p = false, prune_repetitive_p = false; break;
6118 case 1: prune_poor_p = true; prune_repetitive_p = false; break;
6119 case 2: prune_poor_p = false; prune_repetitive_p = true; break;
6120 case 3: prune_poor_p = true; prune_repetitive_p = true; break;
6121 default: fprintf(stderr,"Prune level %s not recognized.\n",optarg);
6122 fprintf(stderr,"0=no pruning, 1=poor seqs, 2=repetitive seqs, 3=both poor and repetitive seqs (default)\n");
6123 return 9;
6124 }
6125 break;
6126 #endif
6127
6128 case 'S': printtype = SUMMARY; break;
6129 case 'A': printtype = ALIGNMENT; break;
6130 case '0': exception_raise_p = false; break; /* Allows signals to pass through */
6131 case '3': printtype = CONTINUOUS; break;
6132 case '4': printtype = CONTINUOUS_BY_EXON; break;
6133 case '6': debug_graphic_p = true; break;
6134 case '8':
6135 if (!strcmp(optarg,"stage1")) {
6136 stage1debug = true;
6137 } else if (!strcmp(optarg,"diag")) {
6138 diag_debug = true;
6139 } else if (!strcmp(optarg,"stage2")) {
6140 stage3debug = POST_STAGE2;
6141 } else if (!strcmp(optarg,"singles")) {
6142 stage3debug = POST_SINGLES;
6143 } else if (!strcmp(optarg,"introns")) {
6144 stage3debug = POST_INTRONS;
6145 } else if (!strcmp(optarg,"hmm")) {
6146 stage3debug = POST_HMM;
6147 } else if (!strcmp(optarg,"smoothing")) {
6148 stage3debug = POST_SMOOTHING;
6149 } else if (!strcmp(optarg,"dualintrons")) {
6150 stage3debug = POST_DUAL_INTRONS;
6151 } else if (!strcmp(optarg,"cycles")) {
6152 stage3debug = POST_CYCLES;
6153 } else if (!strcmp(optarg,"dualbreaks")) {
6154 stage3debug = POST_DUAL_BREAKS;
6155 } else if (!strcmp(optarg,"middle")) {
6156 stage3debug = POST_MIDDLE;
6157 } else if (!strcmp(optarg,"ends")) {
6158 stage3debug = POST_ENDS;
6159 } else if (!strcmp(optarg,"canonical")) {
6160 stage3debug = POST_CANONICAL;
6161 } else if (!strcmp(optarg,"trim")) {
6162 stage3debug = POST_CANONICAL;
6163 } else if (!strcmp(optarg,"changepoint")) {
6164 stage3debug = POST_CHANGEPOINT;
6165 } else if (!strcmp(optarg,"distalmedial")) {
6166 stage3debug = POST_DISTAL_MEDIAL;
6167 } else {
6168 fprintf(stderr,"Allowed arguments for -8 flag are stage2, smoothing, singles, introns, hmm, dualbreaks, cycles, canonical, changepoint, distalmedial\n");
6169 return 9;
6170 }
6171 break;
6172 case '9': checkp = true; break;
6173 case 'n':
6174 maxpaths_report = atoi(check_valid_int(optarg));
6175 if (maxpaths_report == 1) {
6176 fprintf(stderr,"Note: -n 1 will not report chimeric alignments. If you want a single alignment plus chimeras, use -n 0 instead.\n");
6177 }
6178 break;
6179 case 'f':
6180 if (!strcmp(optarg,"1") || !strcmp(optarg,"psl_nt")) {
6181 printtype = PSL_NT;
6182 #ifdef PMAP
6183 } else if (!strcmp(optarg,"0") || !strcmp(optarg,"psl_pro")) {
6184 printtype = PSL_PRO;
6185 #else
6186 } else if (!strcmp(optarg,"psl")) {
6187 printtype = PSL_NT;
6188 } else if (!strcmp(optarg,"6") || !strcmp(optarg,"splicesites")) {
6189 printtype = SPLICESITES;
6190 } else if (!strcmp(optarg,"introns")) {
6191 printtype = INTRONS;
6192 } else if (!strcmp(optarg,"mask_introns")) {
6193 printtype = MASK_INTRONS;
6194 } else if (!strcmp(optarg,"mask_utr_introns")) {
6195 printtype = MASK_UTR_INTRONS;
6196 } else if (!strcmp(optarg,"samse")) {
6197 printtype = SAM;
6198 sam_paired_p = false;
6199 } else if (!strcmp(optarg,"sampe")) {
6200 printtype = SAM;
6201 sam_paired_p = true;
6202 } else if (!strcmp(optarg,"bedpe")) {
6203 printtype = BEDPE;
6204 #endif
6205 } else if (!strcmp(optarg,"2") || !strcmp(optarg,"gff3_gene")) {
6206 printtype = GFF3_GENE;
6207 } else if (!strcmp(optarg,"3") || !strcmp(optarg,"gff3_match_cdna")) {
6208 printtype = GFF3_MATCH_CDNA;
6209 } else if (!strcmp(optarg,"4") || !strcmp(optarg,"gff3_match_est")) {
6210 printtype = GFF3_MATCH_EST;
6211 } else if (!strcmp(optarg,"7") || !strcmp(optarg,"map_exons")) {
6212 printtype = MAP_EXONS;
6213 } else if (!strcmp(optarg,"8") || !strcmp(optarg,"map_ranges")) {
6214 printtype = MAP_RANGES;
6215 } else if (!strcmp(optarg,"9") || !strcmp(optarg,"coords")) {
6216 printtype = COORDS;
6217 } else {
6218 fprintf(stderr,"Output format \"%s\" not recognized. Allowed formats are:\n",optarg);
6219 fprintf(stderr," psl_nt (1)\n");
6220 #ifdef PMAP
6221 fprintf(stderr," psl_pro (0)\n");
6222 #else
6223 fprintf(stderr," psl\n");
6224 fprintf(stderr," splicesites (6)\n");
6225 fprintf(stderr," introns\n");
6226 fprintf(stderr," mask_introns\n");
6227 fprintf(stderr," mask_utr_introns\n");
6228 fprintf(stderr," samse\n");
6229 fprintf(stderr," sampe\n");
6230 fprintf(stderr," bedpe\n");
6231 #endif
6232 fprintf(stderr," gff3_gene (2)\n");
6233 fprintf(stderr," gff3_match_cdna (3)\n");
6234 fprintf(stderr," gff3_match_est (4)\n");
6235 fprintf(stderr," map_exons (7)\n");
6236 fprintf(stderr," map_ranges (8)\n");
6237 fprintf(stderr," coords (9)\n");
6238 return 9;
6239 }
6240 break;
6241 case 'Z': printtype = COMPRESSED; break;
6242 case 'O': orderedp = true; break;
6243 case '5': checksump = true; break;
6244 case 'o': chimera_overlap = atoi(check_valid_int(optarg)); break;
6245
6246 case 'V': user_snpsdir = optarg; break;
6247 case 'v': snps_root = optarg; break;
6248
6249 case 'M': user_mapdir = optarg; break;
6250 case 'm':
6251 map_iitfile = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
6252 strcpy(map_iitfile,optarg);
6253 if ((len = strlen(map_iitfile)) > 4 && strcmp(&(map_iitfile[len-4]),".iit") == 0) {
6254 map_iitfile[len-4] = '\0';
6255 }
6256 break;
6257
6258 case 'e': map_exons_p = true; break;
6259 case 'b': map_bothstrands_p = true; break;
6260 case 'u': nflanking = atoi(check_valid_int(optarg)); break;
6261
6262 case 'E':
6263 if (!strcmp(optarg,"cdna")) {
6264 printtype = EXONS_CDNA;
6265 } else if (!strcmp(optarg,"genomic")) {
6266 printtype = EXONS_GENOMIC;
6267 } else if (!strcmp(optarg,"cdna+introns")) {
6268 printtype = EXONS_CDNA_WINTRONS;
6269 } else if (!strcmp(optarg,"genomic+introns")) {
6270 printtype = EXONS_GENOMIC_WINTRONS;
6271 } else {
6272 fprintf(stderr,"Argument to -E flag must be either \"cdna\" or \"genomic\"\n");
6273 return 9;
6274 }
6275 break;
6276
6277 #ifdef PMAP
6278 case 'P': printtype = PROTEIN_GENOMIC; break;
6279 case 'Q': printtype = CDNA; break;
6280 #else
6281 case 'P': printtype = CDNA; break;
6282 case 'Q': printtype = PROTEIN_GENOMIC; break;
6283 case 'F': fulllengthp = true; break;
6284 case 'a': cds_startpos = atoi(check_valid_int(optarg)); break;
6285 case 'T': truncatep = true; fulllengthp = true; break;
6286 case 'z':
6287 if (!strcmp(optarg,"sense_force")) {
6288 sense_try = +1;
6289 sense_filter = 0;
6290 } else if (!strcmp(optarg,"antisense_force")) {
6291 sense_try = -1;
6292 sense_filter = 0;
6293 } else if (!strcmp(optarg,"sense_filter")) {
6294 sense_try = 0;
6295 sense_filter = +1;
6296 } else if (!strcmp(optarg,"antisense_filter")) {
6297 sense_try = 0;
6298 sense_filter = -1;
6299 } else if (!strcmp(optarg,"auto")) {
6300 sense_try = 0;
6301 sense_filter = 0;
6302 } else {
6303 fprintf(stderr,"direction %s not recognized. Must be sense_force, antisense_force, sense_filter, antisense_filter, or auto\n",optarg);
6304 return 9;
6305 }
6306 break;
6307
6308 case 'j':
6309 if (user_quality_shift == true) {
6310 fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
6311 return 9;
6312 } else {
6313 quality_shift = atoi(check_valid_int(optarg));
6314 user_quality_shift = true;
6315 }
6316 break;
6317
6318 #endif
6319 case 'Y': strictp = false; break;
6320 case 'N': nointronlenp = true; break;
6321 case 'I': invertmode = atoi(check_valid_int(optarg)); break;
6322 case 'i': user_ngap = atoi(check_valid_int(optarg)); break;
6323 case 'l': wraplength = atoi(check_valid_int(optarg)); break;
6324
6325 case '?': fprintf(stderr,"For usage, run 'gmap --help'\n"); return 9;
6326 default: return 9;
6327 }
6328 }
6329
6330 if (printtype == SPLICESITES || printtype == INTRONS) {
6331 if (maxpaths_report > 1 || (sense_try != +1 && sense_filter != +1)) {
6332 fprintf(stderr,"For splicesites or introns output, you should probably add flags '-n 1' and either '-z sense_force' or '-z sense_filter'.\n");
6333 }
6334 }
6335
6336 if (user_ngap >= 0) {
6337 ngap = user_ngap;
6338 } else if (printtype == EXONS_CDNA || printtype == EXONS_GENOMIC) {
6339 /* If user didn't specify, then set to zero */
6340 ngap = 0;
6341 } else if (printtype == EXONS_CDNA_WINTRONS || printtype == EXONS_GENOMIC_WINTRONS) {
6342 /* If user didn't specify, then set to infinity */
6343 ngap = 2147483647; /* INT_MAX */
6344 };
6345
6346 if (maxintronlen > maxtotallen_bound) {
6347 maxintronlen = maxtotallen_bound;
6348 }
6349
6350 #ifdef HAVE_PTHREAD
6351 #ifdef USE_DIAGPOOL
6352 if (diag_debug == true && nworkers > 0) {
6353 fprintf(stderr,"For diag output, must specify 0 threads\n");
6354 exit(9);
6355 }
6356 #endif
6357 #endif
6358
6359 if (user_cmdline != NULL) {
6360 part_modulus = 0;
6361 part_interval = 1;
6362 inbuffer_nspaces = 0;
6363 nchromosomes = 1;
6364 dbroot = (char *) NULL;
6365 } else if (user_selfalign_p == true) {
6366 nchromosomes = 1;
6367 dbroot = (char *) NULL;
6368 } else if (user_pairalign_p == true) {
6369 nchromosomes = 1;
6370 dbroot = (char *) NULL;
6371 } else if (user_genomicseg != NULL) {
6372 /* Ignore -D and -d flags */
6373 nchromosomes = 1;
6374 dbroot = (char *) NULL;
6375 } else if (dbroot == NULL) {
6376 fprintf(stderr,"Need to specify the -d, -g, -1, -2, or --cmdline flag\n");
6377 print_program_usage();
6378 return 9;
6379 } else if (!strcmp(dbroot,"?")) {
6380 Datadir_avail_gmap_databases(stdout,user_genomedir);
6381 return 1;
6382 }
6383
6384 #ifndef PMAP
6385 if (printtype == SAM) {
6386 if (sam_read_group_id == NULL && sam_read_group_name != NULL) {
6387 sam_read_group_id = sam_read_group_name;
6388 } else if (sam_read_group_id != NULL && sam_read_group_name == NULL) {
6389 sam_read_group_name = sam_read_group_id;
6390 }
6391 }
6392 #endif
6393
6394 return 0;
6395 }
6396
6397
6398 static Inbuffer_T
open_input_stream(int * nread,Sequence_T * usersegment,int argc,char ** argv)6399 open_input_stream (int *nread, Sequence_T *usersegment, int argc, char **argv) {
6400 Inbuffer_T inbuffer;
6401 int nextchar = '\0';
6402 FILE *input = NULL;
6403 char **files;
6404 int nfiles;
6405
6406 Request_T request;
6407 char *p;
6408
6409 /* Read user segment before rest of sequences, because of shared usage of sequence.c */
6410 if (user_cmdline != NULL) {
6411 p = user_cmdline;
6412 while (*p != '\0' && *p != ',') {
6413 p++;
6414 }
6415 if (*p == '\0') {
6416 fprintf(stderr,"--cmdline requires two strings separated by a comma");
6417 exit(9);
6418 } else {
6419 *usersegment = global_usersegment = Sequence_genomic_new(user_cmdline,(int) (p - user_cmdline),/*copyp*/true);
6420 if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6421 min_matches = MIN_MATCHES;
6422 }
6423 p++;
6424 }
6425
6426 } else if (user_selfalign_p == true) {
6427 /* usersegment will be assigned to query sequence below */
6428
6429 } else if (user_pairalign_p == true) {
6430 /* Unfortunately, this procedure reads header of queryseq */
6431 *usersegment = global_usersegment = Sequence_read_unlimited(&nextchar,stdin);
6432 if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6433 min_matches = MIN_MATCHES;
6434 }
6435
6436 } else if (user_genomicseg != NULL) {
6437 if ((input = FOPEN_READ_TEXT(user_genomicseg)) == NULL) {
6438 fprintf(stderr,"Can't open file %s\n",user_genomicseg);
6439 exit(9);
6440 }
6441 if ((*usersegment = global_usersegment = Sequence_read_unlimited(&nextchar,input)) == NULL) {
6442 fprintf(stderr,"File %s is empty\n",user_genomicseg);
6443 exit(9);
6444 } else {
6445 genomelength = (Univcoord_T) Sequence_fulllength(*usersegment);
6446 }
6447
6448 if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6449 min_matches = MIN_MATCHES;
6450 }
6451 fclose(input);
6452
6453 } else {
6454 min_matches = MIN_MATCHES;
6455 }
6456
6457 Inbuffer_setup(/*filter_if_both_p*/false,user_pairalign_p,global_usersegment,
6458 part_modulus,part_interval);
6459 if (user_cmdline != NULL) {
6460 inbuffer = Inbuffer_cmdline(p,strlen(p));
6461 *nread = 1;
6462
6463 } else if (user_selfalign_p == true) {
6464 input = stdin;
6465 files = (char **) NULL;
6466 nfiles = 0;
6467
6468 /* Read in first batch of sequences */
6469 inbuffer = Inbuffer_new(nextchar,input,read_files_command,files,nfiles,inbuffer_nspaces);
6470 *nread = Inbuffer_fill_init(inbuffer);
6471 request = Inbuffer_first_request(inbuffer); /* Need usersegment, not the request itself */
6472 *usersegment = Request_queryseq(request);
6473
6474 } else {
6475 /* Open input stream and peek at first char */
6476 if (user_pairalign_p == true) {
6477 input = stdin;
6478 files = (char **) NULL;
6479 nfiles = 0;
6480 inbuffer_nspaces = 1;
6481 } else if (argc == 0) {
6482 fprintf(stderr,"Reading from stdin\n");
6483 input = stdin;
6484 files = (char **) NULL;
6485 nfiles = 0;
6486 } else {
6487 input = NULL;
6488 files = argv;
6489 nfiles = argc;
6490 }
6491
6492 /* Read in first batch of sequences */
6493 inbuffer = Inbuffer_new(nextchar,input,read_files_command,files,nfiles,inbuffer_nspaces);
6494 #ifdef USE_MPI
6495 *nread = 0;
6496 #else
6497 *nread = Inbuffer_fill_init(inbuffer);
6498 #endif
6499 }
6500
6501 return inbuffer;
6502 }
6503
6504
6505 int
main(int argc,char * argv[])6506 main (int argc, char *argv[]) {
6507 int cmdline_status;
6508
6509 char *genomesubdir = NULL, *snpsdir = NULL, *modedir = NULL, *mapdir = NULL, *iitfile = NULL, *fileroot = NULL;
6510 char *idx_filesuffix1, *idx_filesuffix2;
6511 int divno;
6512 Univinterval_T interval;
6513 Sequence_T usersegment = NULL;
6514
6515 int nread;
6516 double runtime;
6517
6518 Splicestringpool_T splicestringpool;
6519
6520 #ifdef HAVE_PTHREAD
6521 int ret, i;
6522 pthread_attr_t thread_attr_join;
6523 #ifdef WORKER_DETACH
6524 pthread_attr_t thread_attr_detach;
6525 #endif
6526 #endif
6527
6528 #ifdef HAVE_SIGACTION
6529 struct sigaction signal_action;
6530 #endif
6531
6532 extern int optind;
6533
6534 #ifdef MEMUSAGE
6535 Mem_usage_init();
6536 Mem_usage_set_threadname("main");
6537 #endif
6538
6539
6540 #ifdef USE_MPI
6541 MPI_Init(&argc,&argv);
6542 MPI_Comm_rank(MPI_COMM_WORLD,&myid);
6543 MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
6544
6545 if ((n_worker_procs = nprocs - 1) == 0) {
6546 if (myid == 0) {
6547 fprintf(stderr,"Need at least 2 processes for MPI version\n");
6548 }
6549 MPI_Finalize();
6550 exit(0);
6551
6552 } else {
6553 MPI_Debug_setup(myid);
6554 }
6555 #endif
6556
6557 cmdline_status = parse_command_line(argc,argv,optind);
6558 argc -= optind;
6559 argv += optind;
6560
6561 if (cmdline_status == 0) {
6562 /* okay to continue */
6563 } else if (cmdline_status == 1) {
6564 /* only information needed */
6565 #ifdef USE_MPI
6566 MPI_Finalize();
6567 #endif
6568 exit(0);
6569 } else {
6570 #ifdef USE_MPI
6571 MPI_Finalize();
6572 #endif
6573 exit(cmdline_status);
6574 }
6575
6576 check_compiler_assumptions();
6577
6578 if (exception_raise_p == false) {
6579 fprintf(stderr,"Allowing signals and exceptions to pass through. If using shared memory, need to remove segments manually.\n");
6580 Except_inactivate();
6581 } else {
6582 #ifdef HAVE_SIGACTION
6583 signal_action.sa_handler = signal_handler;
6584 signal_action.sa_flags = 0;
6585 sigfillset(&signal_action.sa_mask); /* After first signal, block all other signals */
6586
6587 /* Note: SIGKILL and SIGSTOP cannot be caught */
6588
6589 sigaction(SIGFPE,&signal_action,NULL);
6590 sigaction(SIGSEGV,&signal_action,NULL);
6591 sigaction(SIGTRAP,&signal_action,NULL);
6592 sigaction(SIGUSR1,&signal_action,NULL);
6593 sigaction(SIGABRT,&signal_action,NULL); /* abnormal termination (abort) */
6594 sigaction(SIGBUS,&signal_action,NULL); /* bus error */
6595 sigaction(SIGFPE,&signal_action,NULL); /* arithmetic exception */
6596 sigaction(SIGHUP,&signal_action,NULL); /* hangup */
6597 sigaction(SIGILL,&signal_action,NULL); /* illegal hardware instruction */
6598 sigaction(SIGINT,&signal_action,NULL); /* terminal interruption (control-C) */
6599 sigaction(SIGPIPE,&signal_action,NULL); /* write to pipe with no readers */
6600 sigaction(SIGQUIT,&signal_action,NULL); /* terminal quit (control-backslash) */
6601 sigaction(SIGSEGV,&signal_action,NULL); /* invalid memory reference */
6602 sigaction(SIGSYS,&signal_action,NULL); /* invalid system call */
6603 sigaction(SIGTERM,&signal_action,NULL); /* Unix kill command */
6604 sigaction(SIGTRAP,&signal_action,NULL); /* hardware fault */
6605 sigaction(SIGXCPU,&signal_action,NULL); /* CPU limit exceeded */
6606 sigaction(SIGXFSZ,&signal_action,NULL); /* file size limit exceeded */
6607 #endif
6608 }
6609
6610 #ifdef USE_MPI
6611 if (myid > 0) {
6612 inbuffer = open_input_stream(&nread,&usersegment,argc,argv);
6613 }
6614
6615 #else
6616 inbuffer = open_input_stream(&nread,&usersegment,argc,argv);
6617
6618 if (nread > 1) {
6619 multiple_sequences_p = true;
6620 #if 0
6621 #ifdef HAVE_MMAP
6622 if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE) {
6623 fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
6624 fprintf(stderr," GMAP can run slowly at first while the computer starts to accumulate\n");
6625 fprintf(stderr," pages from the hard disk into its cache. To copy index files into RAM\n");
6626 fprintf(stderr," instead of memory mapping, use -B 3, -B 4, or -B 5, if you have enough RAM.\n");
6627 #ifdef HAVE_PTHREAD
6628 fprintf(stderr," For more speed, also try multiple threads (-t <int>), if you have multiple processors or cores.");
6629 #endif
6630 fprintf(stderr,"\n");
6631 #endif
6632 }
6633 #endif
6634
6635 } else {
6636 /* fprintf(stderr,"Note: only 1 sequence detected. Ignoring batch (-B) command\n"); */
6637 multiple_sequences_p = false;
6638 expand_offsets_p = false;
6639 #ifdef HAVE_MMAP
6640 offsetsstrm_access = USE_MMAP_ONLY;
6641 positions_access = USE_MMAP_ONLY;
6642 genome_access = USE_MMAP_ONLY;
6643 #else
6644 offsetsstrm_access = USE_ALLOCATE;
6645 positions_access = USE_ALLOCATE;
6646 genome_access = USE_ALLOCATE;
6647 #endif
6648 }
6649
6650 #endif
6651
6652
6653 if (dbroot != NULL) {
6654 /* Prepare genomic data */
6655 genomesubdir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_genomedir,dbroot);
6656
6657 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
6658 strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
6659 sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
6660 if ((chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
6661 fprintf(stderr,"IIT file %s is not valid\n",iitfile);
6662 exit(9);
6663 #ifdef LARGE_GENOMES
6664 } else if (Univ_IIT_coord_values_8p(chromosome_iit) == false) {
6665 fprintf(stderr,"This program gmapl is designed for large genomes.\n");
6666 fprintf(stderr,"For small genomes of less than 2^32 (4 billion) bp, please run gmap instead.\n");
6667 exit(9);
6668 #endif
6669 } else {
6670 FREE(iitfile);
6671 nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
6672 circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
6673
6674 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
6675 strlen(fileroot)+strlen(".altscaffold.iit")+1,sizeof(char));
6676 sprintf(iitfile,"%s/%s.altscaffold.iit",genomesubdir,fileroot);
6677 if ((altscaffold_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
6678 /* fprintf(stderr,"No altscaffold file found\n"); */
6679 altlocp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
6680 alias_starts = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
6681 alias_ends = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
6682
6683 } else {
6684 fprintf(stderr,"Found altscaffold file found\n");
6685 altlocp = Univ_IIT_altlocp(&alias_starts,&alias_ends,chromosome_iit,altscaffold_iit);
6686 Univ_IIT_free(&altscaffold_iit);
6687 }
6688 FREE(iitfile);
6689 }
6690
6691 genomelength = Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/true);
6692 }
6693
6694 #ifdef USE_MPI
6695 /* Can prevent loading of files by rank 0 process */
6696 #endif
6697
6698 if (map_iitfile == NULL) {
6699 /* Skip */
6700 } else if (!strcmp(map_iitfile,"?")) {
6701 Datadir_avail_maps(stdout,user_mapdir,genomesubdir,fileroot);
6702 exit(0);
6703 } else {
6704 mapdir = Datadir_find_mapdir(user_mapdir,genomesubdir,fileroot);
6705 iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
6706 strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
6707 sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
6708 if ((map_iit = IIT_read(iitfile,/*name*/map_iitfile,/*readonlyp*/true,/*divread*/READ_ALL,
6709 /*divstring*/NULL,/*add_iit_p*/true)) == NULL) {
6710 fprintf(stderr,"Map file %s.iit not found in %s. Available files:\n",map_iitfile,mapdir);
6711 Datadir_list_directory(stderr,mapdir);
6712 fprintf(stderr,"Either install file %s.iit or specify a directory for the IIT file\n",iitfile);
6713 fprintf(stderr,"using the -M flag.\n");
6714 exit(9);
6715 } else {
6716 map_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,map_iit);
6717 }
6718
6719 check_map_iit(map_iit,chromosome_iit);
6720
6721 FREE(iitfile);
6722 FREE(mapdir);
6723 FREE(map_iitfile);
6724 }
6725
6726 if (splicing_file != NULL) {
6727 if (user_splicingdir == NULL) {
6728 if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6729 /*divstring*/NULL,/*add_iit_p*/false)) != NULL) {
6730 fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
6731 } else {
6732 iitfile = (char *) CALLOC(strlen(user_splicingdir)+strlen("/")+strlen(splicing_file)+1,sizeof(char));
6733 sprintf(iitfile,"%s/%s",user_splicingdir,splicing_file);
6734 if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6735 /*divstring*/NULL,/*add_iit_p*/false)) != NULL) {
6736 fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
6737 FREE(iitfile);
6738 }
6739 }
6740 }
6741
6742 if (splicing_iit == NULL) {
6743 mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,fileroot);
6744 iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
6745 strlen(splicing_file)+1,sizeof(char));
6746 sprintf(iitfile,"%s/%s",mapdir,splicing_file);
6747 if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6748 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
6749 fprintf(stderr,"Reading splicing file %s...",iitfile);
6750 FREE(iitfile);
6751 FREE(mapdir);
6752 } else {
6753 fprintf(stderr,"Splicing file %s.iit not found locally or in %s. Available files:\n",splicing_file,mapdir);
6754 Datadir_list_directory(stderr,mapdir);
6755 fprintf(stderr,"Either install file %s or specify a full directory path\n",splicing_file);
6756 exit(9);
6757 }
6758 }
6759 }
6760
6761 /* Complement_init(); */
6762 Dynprog_init(mode);
6763 #ifdef PMAP
6764 Backtranslation_init();
6765 #endif
6766
6767 if (user_pairalign_p == true) {
6768 /* maxpaths_report = 1; -- no; could have different paths against the user segment. */
6769
6770 genomecomp = (Genome_T) NULL;
6771 genomecomp_alt = (Genome_T) NULL;
6772 dbversion = (char *) NULL;
6773 altlocp = (bool *) MALLOC(sizeof(bool));
6774 altlocp[0] = false;
6775 alias_starts = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6776 alias_ends = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6777
6778 /* Create genomecomp_blocks for each usersegment */
6779
6780 } else if (global_usersegment != NULL) {
6781 /* Map against user-provided genomic segment */
6782 /* maxpaths_report = 1; -- no; could have different paths against the user segment. */
6783
6784 genomecomp = (Genome_T) NULL;
6785 genomecomp_alt = (Genome_T) NULL;
6786 dbversion = (char *) NULL;
6787 genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(global_usersegment),Sequence_fulllength(global_usersegment));
6788 altlocp = (bool *) MALLOC(sizeof(bool));
6789 altlocp[0] = false;
6790 alias_starts = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6791 alias_ends = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6792
6793 if (Sequence_fulllength(global_usersegment) > 1000000) {
6794 fprintf(stderr,"Genomic sequence is unusually long (%d bp). GMAP handles genomes better when\n",
6795 Sequence_fulllength(global_usersegment));
6796 fprintf(stderr," they are converted into gmap databases first using gmap_build, and then accessed\n");
6797 fprintf(stderr," with the -d flag.\n");
6798 }
6799
6800 } else {
6801 if (snps_root == NULL) {
6802 genomecomp = Genome_new(genomesubdir,fileroot,/*snps_root*/NULL,/*genometype*/GENOME_OLIGOS,
6803 uncompressedp,genome_access,sharedp);
6804 genomecomp_blocks = Genome_blocks(genomecomp);
6805 genomecomp_alt = (Genome_T) NULL;
6806
6807 } else {
6808 /* Map against genome with SNPs */
6809 if (user_snpsdir == NULL) {
6810 snpsdir = genomesubdir;
6811 } else {
6812 snpsdir = user_snpsdir;
6813 }
6814
6815 genomecomp = Genome_new(genomesubdir,fileroot,/*snps_root*/NULL,/*genometype*/GENOME_OLIGOS,
6816 uncompressedp,genome_access,sharedp);
6817 genomecomp_blocks = Genome_blocks(genomecomp);
6818 genomecomp_alt = Genome_new(snpsdir,fileroot,snps_root,/*genometype*/GENOME_OLIGOS,
6819 uncompressedp,genome_access,sharedp);
6820 }
6821
6822 if (user_modedir != NULL) {
6823 modedir = user_modedir;
6824 } else {
6825 modedir = genomesubdir;
6826 }
6827
6828 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6829 idx_filesuffix1 = "metct";
6830 idx_filesuffix2 = "metga";
6831 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
6832 idx_filesuffix1 = "a2iag";
6833 idx_filesuffix2 = "a2itc";
6834 } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
6835 idx_filesuffix1 = "a2itc";
6836 idx_filesuffix2 = "a2iag";
6837 } else {
6838 idx_filesuffix1 = IDX_FILESUFFIX; /* "ref" */
6839 idx_filesuffix2 = (char *) NULL;
6840 }
6841
6842 if ((indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
6843 modedir,fileroot,idx_filesuffix1,snps_root,
6844 required_index1part,required_index1interval,
6845 offsetsstrm_access,positions_access,
6846 sharedp,multiple_sequences_p,/*preload_shared_memory_p*/false,
6847 /*unload_shared_memory_p*/false)) == NULL) {
6848
6849 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6850 fprintf(stderr,"Cannot find %s index file. Need to run cmetindex first\n",idx_filesuffix1);
6851 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED ||
6852 mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
6853 fprintf(stderr,"Cannot find %s index file. Need to run atoiindex first\n",idx_filesuffix1);
6854 } else {
6855 fprintf(stderr,"Cannot find %s index file\n",idx_filesuffix1);
6856 }
6857 exit(9);
6858 }
6859
6860 if (idx_filesuffix2 == NULL) {
6861 indexdb_rev = indexdb_fwd;
6862 } else if ((indexdb_rev = Indexdb_new_genome(&index1part,&index1interval,
6863 modedir,fileroot,idx_filesuffix2,snps_root,
6864 required_index1part,required_index1interval,
6865 offsetsstrm_access,positions_access,
6866 sharedp,multiple_sequences_p,/*preload_shared_memory_p*/false,
6867 /*unload_shared_memory_p*/false)) == NULL) {
6868 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6869 fprintf(stderr,"Cannot find %s index file. Need to run cmetindex first\n",idx_filesuffix2);
6870 } else {
6871 fprintf(stderr,"Cannot find %s index file. Need to run atoiindex first\n",idx_filesuffix2);
6872 }
6873 exit(9);
6874 }
6875
6876
6877 if (user_chrsubsetname != NULL) {
6878 if ((divno = Univ_IIT_find_one(chromosome_iit,user_chrsubsetname)) < 0) {
6879 fprintf(stderr,"Cannot find chrsubset %s in chromosome IIT file. Ignoring.\n",user_chrsubsetname);
6880 } else {
6881 interval = Univ_IIT_interval(chromosome_iit,divno);
6882 chrsubset_start = Univinterval_low(interval);
6883 chrsubset_end = Univinterval_high(interval);
6884 }
6885 }
6886 }
6887
6888 FREE(genomesubdir);
6889 FREE(fileroot);
6890 FREE(dbroot);
6891
6892
6893 if (splicing_file != NULL && genomecomp != NULL) {
6894 if (Genome_blocks(genomecomp) == NULL) {
6895 fprintf(stderr,"known splicing can be used only with compressed genome\n");
6896 } else {
6897 /* TODO: Handle case for observed distances */
6898 /* min_extra_end no longer used by gregion.c */
6899 /* min_extra_end = shortsplicedist; */
6900
6901 splicing_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,splicing_iit);
6902 if ((donor_typeint = IIT_typeint(splicing_iit,"donor")) >= 0 &&
6903 (acceptor_typeint = IIT_typeint(splicing_iit,"acceptor")) >= 0) {
6904 fprintf(stderr,"found donor and acceptor tags, so treating as splicesites file\n");
6905 splicestringpool = Splicestringpool_new();
6906 splicesites = Splicetrie_retrieve_via_splicesites(&distances_observed_p,&splicetypes,&splicedists,
6907 &splicestrings,&splicefrags_ref,&splicefrags_alt,
6908 &nsplicesites,splicing_iit,splicing_divint_crosstable,
6909 donor_typeint,acceptor_typeint,chromosome_iit,
6910 genomecomp,genomecomp_alt/*can be NULL*/,shortsplicedist,
6911 splicestringpool);
6912 if (nsplicesites == 0) {
6913 fprintf(stderr,"\nWarning: No splicesites observed for genome %s. Are you sure this splicesite file was built for this genome? Please compare chromosomes below:\n",
6914 dbroot);
6915 fprintf(stderr,"Chromosomes in the genome: ");
6916 Univ_IIT_dump_labels(stderr,chromosome_iit);
6917 fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
6918 IIT_dump_divstrings(stderr,splicing_iit);
6919 exit(9);
6920
6921 } else {
6922 Splicetrie_npartners(&nsplicepartners_skip,&nsplicepartners_obs,&nsplicepartners_max,splicesites,splicetypes,splicedists,
6923 splicestrings,nsplicesites,chromosome_iit,shortsplicedist,distances_observed_p);
6924 Splicetrie_build_via_splicesites(&triecontents_obs,&trieoffsets_obs,&triecontents_max,&trieoffsets_max,
6925 nsplicepartners_skip,nsplicepartners_obs,nsplicepartners_max,splicetypes,
6926 splicestrings,nsplicesites);
6927 FREE(nsplicepartners_max);
6928 FREE(nsplicepartners_obs);
6929 FREE(nsplicepartners_skip);
6930 /* Splicestring_gc(splicestrings,nsplicesites); */
6931 FREE(splicestrings);
6932 }
6933 Splicestringpool_free(&splicestringpool);
6934
6935 } else {
6936 fprintf(stderr,"no donor or acceptor tags found, so treating as introns file\n");
6937 splicestringpool = Splicestringpool_new();
6938 splicesites = Splicetrie_retrieve_via_introns(&splicetypes,&splicedists,
6939 &splicestrings,&splicefrags_ref,&splicefrags_alt,
6940 &nsplicesites,splicing_iit,splicing_divint_crosstable,
6941 chromosome_iit,genomecomp,genomecomp_alt/*can be NULL*/,
6942 splicestringpool);
6943 if (nsplicesites == 0) {
6944 fprintf(stderr,"\nWarning: No splicesites observed for genome %s. Are you sure this splicesite file was built for this genome? Please compare chromosomes below:\n",
6945 dbroot);
6946 fprintf(stderr,"Chromosomes in the genome: ");
6947 Univ_IIT_dump_labels(stderr,chromosome_iit);
6948 fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
6949 IIT_dump_divstrings(stderr,splicing_iit);
6950 exit(9);
6951 } else {
6952 Splicetrie_build_via_introns(&triecontents_obs,&trieoffsets_obs,splicesites,splicetypes,
6953 splicestrings,nsplicesites,chromosome_iit,splicing_iit,splicing_divint_crosstable);
6954 triecontents_max = (Triecontent_T *) NULL;
6955 trieoffsets_max = (Trieoffset_T *) NULL;
6956 /* Splicestring_gc(splicestrings,nsplicesites); */
6957 FREE(splicestrings);
6958 }
6959 Splicestringpool_free(&splicestringpool);
6960
6961 }
6962 }
6963
6964 fprintf(stderr,"done\n");
6965 }
6966
6967
6968 Translation_setup(translation_code,alt_initiation_codons_p);
6969
6970 if (user_pairalign_p == true) {
6971 /* Creation of genomecomp and initialization done within single_thread() for each input sequence */
6972 any_circular_p = false;
6973 circularp = (bool *) MALLOC(1*sizeof(bool));
6974 circularp[0] = false;
6975
6976 } else if (usersegment != NULL) {
6977 any_circular_p = false;
6978 circularp = (bool *) MALLOC(1*sizeof(bool));
6979 circularp[0] = false;
6980
6981 Genome_user_setup(genomecomp_blocks,genomelength);
6982 Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
6983 Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
6984 #ifdef PMAP
6985 Oligoindex_pmap_setup(genomecomp);
6986 #else
6987 Oligoindex_hr_setup(genomecomp_blocks,mode);
6988 /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
6989 #endif
6990
6991 } else if (genomecomp != NULL) {
6992 circularp = Univ_IIT_circularp(&any_circular_p,chromosome_iit);
6993
6994 Genome_setup(genomecomp,genomecomp_alt/*can be NULL*/,genomelength,mode,circular_typeint);
6995 Genome_sites_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
6996 Maxent_hr_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
6997 #ifdef PMAP
6998 Alphabet_setup(alphabet,alphabet_size,index1part_aa);
6999 Oligoindex_pmap_setup(genomecomp);
7000 Oligop_setup(alphabet,alphabet_size,index1part_aa);
7001 Indexdb_setup(index1part_aa);
7002 Stage1_setup(index1part_aa,maxextension,maxtotallen_bound,circular_typeint);
7003 #else
7004 Oligoindex_hr_setup(Genome_blocks(genomecomp),mode);
7005 /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
7006 Oligo_setup(index1part,mode);
7007 Indexdb_setup(index1part);
7008 Stage1_setup(index1part,maxextension,maxtotallen_bound,circular_typeint);
7009 #endif
7010 }
7011
7012 Stage2_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,cross_species_p,
7013 suboptimal_score_start,suboptimal_score_end,sufflookback,nsufflookback,maxintronlen,mode,
7014 /*snps_p*/genomecomp_alt ? true : false);
7015 Dynprog_single_setup(homopolymerp);
7016 Dynprog_genome_setup(novelsplicingp,splicing_iit,splicing_divint_crosstable,
7017 donor_typeint,acceptor_typeint);
7018 Dynprog_end_setup(splicesites,splicetypes,splicedists,nsplicesites,
7019 trieoffsets_obs,triecontents_obs,trieoffsets_max,triecontents_max);
7020 Pair_setup(novelsplicingp,splicing_iit,trim_indel_score,
7021 gff3_separators_p,sam_insert_0M_p,force_xs_direction_p,
7022 md_lowercase_variant_p,/*snps_p*/genomecomp_alt ? true : false,
7023 gff3_phase_swap_p,cdstype,sam_cigar_extended_p,cigar_action);
7024
7025 Stage3_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,novelsplicingp,
7026 require_splicedir_p,shortsplicedist,splicing_iit,splicing_divint_crosstable,
7027 donor_typeint,acceptor_typeint,splicesites,circularp,altlocp,alias_starts,alias_ends,
7028 min_intronlength,max_deletionlength,/*min_indel_end_matches*/6,
7029 maxpeelback_distalmedial,nullgap,extramaterial_end,extramaterial_paired,
7030 extraband_single,extraband_end,extraband_paired,
7031 ngap,maxintronlen,maxintronlen_ends,minendexon,homopolymerp,gff3_fasta_annotation_type,
7032 stage3debug,/*genome_totallength*/genomelength);
7033
7034 Splicetrie_setup(splicesites,splicefrags_ref,splicefrags_alt,
7035 trieoffsets_obs,triecontents_obs,trieoffsets_max,triecontents_max,
7036 /*snpp*/false,amb_closest_p,/*amb_clip_p*/true,/*min_shortend*/2);
7037 Output_setup(chromosome_iit,nofailsp,failsonlyp,quiet_if_excessive_p,maxpaths_report,
7038 failedinput_root,quality_shift,
7039 printtype,invertmode,wraplength,ngap,nointronlenp,sam_paired_p,cds_startpos,
7040 fulllengthp,truncatep,strictp,checksump,genomecomp,usersegment,user_genomicseg,
7041 dbversion,user_chrsubsetname,contig_iit,altstrain_iit,
7042 /*chimeras_allowed_p*/chimera_margin > 0 ? true : false,
7043 map_iit,map_divint_crosstable,map_exons_p,map_bothstrands_p,
7044 nflanking,print_comment_p,sam_read_group_id);
7045
7046 #ifdef USE_MPI
7047 if (myid == 0) {
7048 Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
7049 nworkers,orderedp,quiet_if_excessive_p,
7050 printtype,usersegment,sam_headers_p,sam_read_group_id,sam_read_group_name,
7051 sam_read_group_library,sam_read_group_platform,
7052 appendp,/*output_file*/NULL,split_output_root,failedinput_root);
7053 outbuffer = Outbuffer_new(output_buffer_size,/*nread*/0);
7054 /* Inbuffer_set_outbuffer(inbuffer,outbuffer); */
7055
7056 fprintf(stderr,"Starting alignment\n");
7057 stopwatch = Stopwatch_new();
7058 Stopwatch_start(stopwatch);
7059 }
7060 #else
7061 Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
7062 nworkers,orderedp,quiet_if_excessive_p,
7063 printtype,usersegment,sam_headers_p,sam_read_group_id,sam_read_group_name,
7064 sam_read_group_library,sam_read_group_platform,
7065 appendp,/*output_file*/NULL,split_output_root,failedinput_root);
7066 outbuffer = Outbuffer_new(output_buffer_size,nread);
7067 Inbuffer_set_outbuffer(inbuffer,outbuffer);
7068
7069 fprintf(stderr,"Starting alignment\n");
7070 stopwatch = Stopwatch_new();
7071 Stopwatch_start(stopwatch);
7072 #endif
7073
7074
7075 #ifdef USE_MPI
7076 /* MPI version */
7077 if (myid == 0) {
7078 #ifdef WORKER_DETACH
7079 pthread_attr_init(&thread_attr_detach);
7080 if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
7081 fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7082 exit(1);
7083 }
7084 #endif
7085 pthread_attr_init(&thread_attr_join);
7086 if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
7087 fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7088 exit(1);
7089 }
7090
7091 Except_init_pthread();
7092 /* pthread_key_create(&global_request_key,NULL); */
7093
7094 if (orderedp == true) {
7095 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
7096 (void *) outbuffer);
7097 } else {
7098 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
7099 (void *) outbuffer);
7100 }
7101
7102 Outbuffer_mpi_process(outbuffer,/*n_worker_procs*/nprocs - 1,part_modulus,part_interval);
7103 pthread_join(output_thread_id,NULL);
7104
7105 /* pthread_key_delete(global_request_key); */
7106 /* Except_term_pthread(); */
7107
7108 } else {
7109 worker_mpi_process(/*worker_id*/myid,inbuffer);
7110 }
7111
7112 #elif !defined(HAVE_PTHREAD)
7113 /* Serial version */
7114 single_thread();
7115
7116 #else
7117 /* Pthreads version */
7118 if (nworkers == 0) {
7119 single_thread();
7120
7121 } else if (multiple_sequences_p == false) {
7122 single_thread();
7123
7124 } else {
7125 #ifdef WORKER_DETACH
7126 pthread_attr_init(&thread_attr_detach);
7127 if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
7128 fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7129 exit(1);
7130 }
7131 #endif
7132 pthread_attr_init(&thread_attr_join);
7133 if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
7134 fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7135 exit(1);
7136 }
7137
7138 worker_thread_ids = (pthread_t *) CALLOC(nworkers,sizeof(pthread_t));
7139 Except_init_pthread();
7140 pthread_key_create(&global_request_key,NULL);
7141
7142 if (orderedp == true) {
7143 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
7144 (void *) outbuffer);
7145 } else {
7146 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
7147 (void *) outbuffer);
7148 }
7149
7150 for (i = 0; i < nworkers; i++) {
7151 #ifdef WORKER_DETACH
7152 pthread_create(&(worker_thread_ids[i]),&thread_attr_detach,worker_thread,(void *) NULL);
7153 #else
7154 /* Need to have worker threads finish before we call Inbuffer_free() */
7155 pthread_create(&(worker_thread_ids[i]),&thread_attr_join,worker_thread,(void *) NULL);
7156 #endif
7157 }
7158
7159 pthread_join(output_thread_id,NULL);
7160 for (i = 0; i < nworkers; i++) {
7161 pthread_join(worker_thread_ids[i],NULL);
7162 }
7163
7164 pthread_key_delete(global_request_key);
7165 /* Do not delete global_except_key, because worker threads might still need it */
7166 /* Except_term_pthread(); */
7167
7168 FREE(worker_thread_ids);
7169
7170 }
7171 #endif /* HAVE_PTHREAD */
7172
7173
7174 #ifdef USE_MPI
7175 if (myid == 0) {
7176 runtime = Stopwatch_stop(stopwatch);
7177 Stopwatch_free(&stopwatch);
7178
7179 nread = Outbuffer_nread(outbuffer);
7180 nbeyond = Outbuffer_nbeyond(outbuffer);
7181 fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
7182 nread-nbeyond,runtime,(double) nread/runtime);
7183
7184 Outbuffer_free(&outbuffer);
7185 Inbuffer_free(&inbuffer); /* Also closes inputs */
7186 }
7187
7188 Outbuffer_close_files(); /* All ranks have to close the files */
7189
7190 #else
7191 /* Single CPU or Pthreads version */
7192 runtime = Stopwatch_stop(stopwatch);
7193 Stopwatch_free(&stopwatch);
7194
7195 nread = Outbuffer_nread(outbuffer);
7196 /* nbeyond = Outbuffer_nbeyond(outbuffer); */
7197 fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
7198 nread,runtime,(double) nread/runtime);
7199
7200 Outbuffer_free(&outbuffer);
7201 Inbuffer_free(&inbuffer); /* Also closes inputs */
7202
7203 Outbuffer_close_files();
7204 #endif
7205
7206 #ifdef PMAP
7207 Backtranslation_term();
7208 #endif
7209 Dynprog_term(mode);
7210
7211
7212 if (nsplicesites > 0) {
7213 if (splicetrie_precompute_p == true) {
7214 FREE(triecontents_max);
7215 FREE(trieoffsets_max);
7216 FREE(triecontents_obs);
7217 FREE(trieoffsets_obs);
7218 } else {
7219 FREE(nsplicepartners_max);
7220 FREE(nsplicepartners_obs);
7221 FREE(nsplicepartners_skip);
7222 /* Splicestring_gc(splicestrings,nsplicesites); */
7223 FREE(splicestrings);
7224 }
7225 FREE(splicefrags_ref);
7226 FREE(splicedists);
7227 FREE(splicetypes);
7228 FREE(splicesites);
7229 }
7230
7231 if (splicing_iit != NULL) {
7232 FREE(splicing_divint_crosstable);
7233 IIT_free(&splicing_iit);
7234 }
7235
7236 #if 0
7237 /* Oligoindex_localdb_cleanup(); */
7238 if (localdb != NULL) {
7239 Localdb_free(&localdb);
7240 }
7241 #endif
7242
7243 #ifdef PMAP
7244 if (indexdb_rev != NULL) {
7245 Indexdb_free(&indexdb_rev);
7246 }
7247 if (indexdb_fwd != NULL) {
7248 Indexdb_free(&indexdb_fwd);
7249 }
7250 #else
7251 if (indexdb_rev != indexdb_fwd) {
7252 Indexdb_free(&indexdb_rev);
7253 }
7254 if (indexdb_fwd != NULL) {
7255 Indexdb_free(&indexdb_fwd);
7256 }
7257 #endif
7258 if (dbversion != NULL) {
7259 FREE(dbversion);
7260 }
7261 if (altstrain_iit != NULL) {
7262 IIT_free(&altstrain_iit);
7263 }
7264 if (genomecomp_alt != NULL) {
7265 Genome_free(&genomecomp_alt);
7266 }
7267 if (user_pairalign_p == true) {
7268 /* genomecomp_blocks freed within single_thread */
7269 } else if (usersegment != NULL) {
7270 FREE(genomecomp_blocks);
7271 } else if (genomecomp != NULL) {
7272 Genome_free(&genomecomp);
7273 }
7274
7275 if (map_iit != NULL) {
7276 IIT_free(&map_iit);
7277 }
7278 if (contig_iit != NULL) {
7279 Univ_IIT_free(&contig_iit);
7280 }
7281 if (altlocp != NULL) {
7282 FREE(alias_ends);
7283 FREE(alias_starts);
7284 FREE(altlocp);
7285 }
7286 if (circularp != NULL) {
7287 FREE(circularp);
7288 }
7289 if (chromosome_iit != NULL) {
7290 Univ_IIT_free(&chromosome_iit);
7291 }
7292
7293 if (user_selfalign_p == true) {
7294 /* Do not free usersegment */
7295 } else if (usersegment != NULL) {
7296 Sequence_free(&usersegment);
7297 }
7298
7299 Outbuffer_cleanup();
7300
7301 Access_controlled_cleanup();
7302
7303 #ifdef USE_MPI
7304 MPI_Barrier(MPI_COMM_WORLD); /* Make sure all processes have cleaned up */
7305 MPI_Finalize();
7306 #endif
7307
7308 return 0;
7309 }
7310
7311
7312 static void
print_program_usage()7313 print_program_usage () {
7314 #ifdef PMAP
7315 fprintf(stdout,"\
7316 Usage: pmap [OPTIONS...] <FASTA files...>, or\n\
7317 cat <FASTA files...> | pmap [OPTIONS...]\n\
7318 ");
7319 #else
7320 fprintf(stdout,"\
7321 Usage: gmap [OPTIONS...] <FASTA files...>, or\n\
7322 cat <FASTA files...> | gmap [OPTIONS...]\n\
7323 ");
7324 #endif
7325 fprintf(stdout,"\n");
7326
7327 fprintf(stdout,"Input options (must include -d or -g)\n");
7328 fprintf(stdout,"\
7329 -D, --dir=directory Genome directory. Default (as specified by --with-gmapdb to the configure program) is\n \
7330 %s\n\
7331 ",GMAPDB);
7332 fprintf(stdout,"\
7333 -d, --db=STRING Genome database. If argument is '?' (with\n\
7334 the quotes), this command lists available databases.\n\
7335 ");
7336 fprintf(stdout,"\n");
7337
7338 #ifdef PMAP
7339 fprintf(stdout,"\
7340 -a, --alphabet=STRING Alphabet to use in PMAP genome database\n\
7341 (allowed values in order of preference: 20, 15a, 12a).\n\
7342 If not specified, the program will find the first available\n\
7343 alphabet in the genome database in preference order\n\
7344 ");
7345 #endif
7346
7347 #if 0
7348 /* No longer supported */
7349 fprintf(stdout,"\
7350 -G, --genomefull Use full genome (all ASCII chars allowed;\n \
7351 built explicitly during setup), not\n\
7352 compressed version\n\
7353 ");
7354 #endif
7355
7356 fprintf(stdout,"\
7357 -k, --kmer=INT kmer size to use in genome database (allowed values: 16 or less).\n\
7358 If not specified, the program will find the highest available\n\
7359 kmer size in the genome database\n\
7360 --sampling=INT Sampling to use in genome database. If not specified, the program\n\
7361 will find the smallest available sampling value in the genome database\n\
7362 within selected k-mer size\n\
7363 -g, --gseg=filename User-supplied genomic segment\n\
7364 -1, --selfalign Align one sequence against itself in FASTA format via stdin\n\
7365 (Useful for getting protein translation of a nucleotide sequence)\n\
7366 -2, --pairalign Align two sequences in FASTA format via stdin, first one being\n\
7367 genomic and second one being cDNA\n\
7368 --cmdline=STRING,STRING Align these two sequences provided on the command line,\n\
7369 first one being genomic and second one being cDNA\n\
7370 -q, --part=INT/INT Process only the i-th out of every n sequences\n\
7371 e.g., 0/100 or 99/100 (useful for distributing jobs\n\
7372 to a computer farm).\n\
7373 ");
7374 fprintf(stdout,"\
7375 --input-buffer-size=INT Size of input buffer (program reads this many sequences\n\
7376 at a time for efficiency) (default %d)\n\
7377 ",inbuffer_nspaces);
7378 fprintf(stdout,"\n");
7379
7380 fprintf(stdout,"Computation options\n");
7381 #ifdef HAVE_MMAP
7382 fprintf(stdout,"\
7383 -B, --batch=INT Batch mode (default = 2)\n\
7384 Mode Positions Genome\n\
7385 0 mmap mmap\n\
7386 1 mmap & preload mmap\n\
7387 (default) 2 mmap & preload mmap & preload\n\
7388 3 allocate mmap & preload\n\
7389 4 allocate allocate\n\
7390 5 allocate allocate (same as 4)\n\
7391 Note: For a single sequence, all data structures use mmap\n\
7392 If mmap not available and allocate not chosen, then will use fileio (very slow)\n\
7393 ");
7394 #else
7395 fprintf(stdout,"\
7396 -B, --batch=INT Batch mode (default = 4, modes 0-3 disallowed because program configured without mmap)\n\
7397 Mode Positions Genome\n\
7398 (default) 4 allocate allocate\n\
7399 5 allocate allocate (same as 4)\n\
7400 ");
7401 #endif
7402
7403 fprintf(stdout,"\
7404 --use-shared-memory=INT If 1, then allocated memory is shared among all processes on this node\n\
7405 If 0 (default), then each process has private allocated memory\n\
7406 ");
7407
7408 fprintf(stdout,"\
7409 --nosplicing Turns off splicing (useful for aligning genomic sequences\n\
7410 onto a genome)\n\
7411 ");
7412 fprintf(stdout,"\
7413 --min-intronlength=INT Min length for one internal intron (default %d). Below this size,\n\
7414 a genomic gap will be considered a deletion rather than an intron.\n\
7415 ",min_intronlength);
7416 fprintf(stdout,"\
7417 --max-intronlength-middle=INT Max length for one internal intron (default %d). Note: for backward\n\
7418 compatibility, the -K or --intronlength flag will set both\n\
7419 --max-intronlength-middle and --max-intronlength-ends.\n\
7420 Also see --split-large-introns below.\n\
7421 ",maxintronlen);
7422 fprintf(stdout,"\
7423 --max-intronlength-ends=INT Max length for first or last intron (default %d). Note: for backward\n\
7424 compatibility, the -K or --intronlength flag will set both\n\
7425 --max-intronlength-middle and --max-intronlength-ends.\n\
7426 ",maxintronlen_ends);
7427 fprintf(stdout,"\
7428 --split-large-introns Sometimes GMAP will exceed the value for --max-intronlength-middle,\n\
7429 if it finds a good single alignment. However, you can force GMAP\n\
7430 to split such alignments by using this flag\n\
7431 ");
7432 fprintf(stdout,"\
7433 --trim-end-exons=INT Trim end exons with fewer than given number of matches\n\
7434 (in nt, default %d)\n\
7435 ",minendexon);
7436 fprintf(stdout,"\
7437 -w, --localsplicedist=INT Max length for known splice sites at ends of sequence\n\
7438 (default %d)\n\
7439 ",shortsplicedist);
7440 fprintf(stdout,"\
7441 -L, --totallength=INT Max total intron length (default %d)\n\
7442 ",maxtotallen_bound);
7443 fprintf(stdout,"\
7444 -x, --chimera-margin=INT Amount of unaligned sequence that triggers\n\
7445 search for the remaining sequence (default %d).\n\
7446 Enables alignment of chimeric reads, and may help\n\
7447 with some non-chimeric reads. To turn off, set to\n\
7448 zero.\n\
7449 ",chimera_margin);
7450 fprintf(stdout,"\
7451 --no-chimeras Turns off finding of chimeras. Same effect as --chimera-margin=0\n\
7452 ");
7453
7454 #if 0
7455 fprintf(stdout,"\
7456 -w, --reference=filename Reference cDNA sequence for relative alignment\n\
7457 ");
7458 #endif
7459
7460 #ifdef HAVE_PTHREAD
7461 fprintf(stdout,"\
7462 -t, --nthreads=INT Number of worker threads\n\
7463 ");
7464 #else
7465 fprintf(stdout,"\
7466 -t, --nthreads=INT Number of worker threads. Flag is ignored in this version of GMAP, which has pthreads disabled\n\
7467 ");
7468 #endif
7469 fprintf(stdout,"\
7470 -c, --chrsubset=string Limit search to given chromosome\n\
7471 -z, --direction=STRING cDNA direction (sense_force, antisense_force,\n\
7472 sense_filter, antisense_filter,or auto (default))\n\
7473 ");
7474 fprintf(stdout,"\
7475 --canonical-mode=INT Reward for canonical and semi-canonical introns\n\
7476 0=low reward, 1=high reward (default), 2=low reward for\n\
7477 high-identity sequences and high reward otherwise\n\
7478 --cross-species Use a more sensitive search for canonical splicing, which helps especially\n\
7479 for cross-species alignments and other difficult cases\n\
7480 --allow-close-indels=INT Allow an insertion and deletion close to each other\n\
7481 (0=no, 1=yes (default), 2=only for high-quality alignments)\n\
7482 ");
7483 fprintf(stdout,"\
7484 --microexon-spliceprob=FLOAT Allow microexons only if one of the splice site probabilities is\n\
7485 greater than this value (default %.2f)\n\
7486 ",microexon_spliceprob);
7487
7488 #if 0
7489 fprintf(stdout,"\
7490 --homopolymer In dynamic programming, favor indels in regions of homopolymers,\n\
7491 e.g., AAAAAA. Useful for some platforms, such as Pacific Biosciences\n\
7492 ");
7493 #endif
7494
7495 #ifndef PMAP
7496 fprintf(stdout,"\
7497 --cmetdir=STRING Directory for methylcytosine index files (created using cmetindex)\n\
7498 (default is location of genome index files specified using -D, -V, and -d)\n\
7499 --atoidir=STRING Directory for A-to-I RNA editing index files (created using atoiindex)\n\
7500 (default is location of genome index files specified using -D, -V, and -d)\n\
7501 --mode=STRING Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
7502 atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
7503 Non-standard modes requires you to have previously run the cmetindex\n\
7504 or atoiindex programs (which also cover the ttoc modes) on the genome\n\
7505 ");
7506 #endif
7507
7508 #if 0
7509 /* Causes seg faults, so do not advertise */
7510 fprintf(stdout,"\
7511 -s, --splicing=STRING Look for splicing involving known sites\n\
7512 (in <STRING>.iit)\n\
7513 ");
7514 #endif
7515
7516 #ifndef PMAP
7517 fprintf(stdout,"\
7518 -p, --prunelevel Pruning level: 0=no pruning (default), 1=poor seqs,\n\
7519 2=repetitive seqs, 3=poor and repetitive\n\
7520 ");
7521 #endif
7522 fprintf(stdout,"\n");
7523
7524 fprintf(stdout,"\
7525 Output types\n\
7526 -S, --summary Show summary of alignments only\n\
7527 -A, --align Show alignments\n\
7528 -3, --continuous Show alignment in three continuous lines\n\
7529 -4, --continuous-by-exon Show alignment in three lines per exon\n\
7530 -Z, --compress Print output in compressed format\n\
7531 -E, --exons=STRING Print exons (\"cdna\" or \"genomic\")\n\
7532 Will also print introns with \"cdna+introns\" or\n\
7533 \"genomic+introns\"\n\
7534 ");
7535
7536 #ifdef PMAP
7537 fprintf(stdout,"\
7538 -P, --protein_gen Print protein sequence (genomic)\n\
7539 -Q, --nucleotide Print inferred nucleotide sequence from protein\n\
7540 ");
7541 #else
7542 fprintf(stdout,"\
7543 -P, --protein_dna Print protein sequence (cDNA)\n\
7544 -Q, --protein_gen Print protein sequence (genomic)\n\
7545 ");
7546 #endif
7547
7548 #ifdef PMAP
7549 fprintf(stdout,"\
7550 -f, --format=INT Other format for output (also note the -A and -S options\n\
7551 and other options listed under Output types):\n\
7552 mask_introns,\n\
7553 mask_utr_introns,\n\
7554 psl_pro (or 0) = PSL format in protein coords,\n\
7555 psl_nt (or 1) = PSL format in nucleotide coords,\n\
7556 gff3_gene (or 2) = GFF3 gene format,\n\
7557 gff3_match_cdna (or 3) = GFF3 cDNA_match format,\n\
7558 gff3_match_est (or 4) = GFF3 EST_match format,\n\
7559 map_exons (or 7) = IIT FASTA exon map format,\n\
7560 map_ranges (or 8) = IIT FASTA range map format,\n\
7561 coords (or 9) = coords in table format\n\
7562 ");
7563 #else
7564 fprintf(stdout,"\
7565 -f, --format=INT Other format for output (also note the -A and -S options\n\
7566 and other options listed under Output types):\n\
7567 mask_introns,\n\
7568 mask_utr_introns,\n\
7569 psl (or 1) = PSL (BLAT) format,\n\
7570 gff3_gene (or 2) = GFF3 gene format,\n\
7571 gff3_match_cdna (or 3) = GFF3 cDNA_match format,\n\
7572 gff3_match_est (or 4) = GFF3 EST_match format,\n\
7573 splicesites (or 6) = splicesites output (for GSNAP splicing file),\n\
7574 introns = introns output (for GSNAP splicing file),\n\
7575 map_exons (or 7) = IIT FASTA exon map format,\n\
7576 map_ranges (or 8) = IIT FASTA range map format,\n\
7577 coords (or 9) = coords in table format,\n\
7578 sampe = SAM format (setting paired_read bit in flag),\n\
7579 samse = SAM format (without setting paired_read bit),\n\
7580 bedpe = indels and gaps in BEDPE format\n\
7581 ");
7582 #endif
7583 fprintf(stdout,"\n");
7584
7585 fprintf(stdout,"\
7586 Output options\n\
7587 -n, --npaths=INT Maximum number of paths to show (default %d). If set to 1, GMAP\n\
7588 will not report chimeric alignments, since those imply\n\
7589 two paths. If you want a single alignment plus chimeric\n\
7590 alignments, then set this to be 0.\n\
7591 ",maxpaths_report);
7592 fprintf(stdout,"\
7593 --suboptimal-score=FLOAT Report only paths whose score is within this value of the\n\
7594 best path.\n\
7595 If specified between 0.0 and 1.0, then treated as a fraction\n\
7596 of the score of the best alignment (matches minus penalties for\n\
7597 mismatches and indels). Otherwise, treated as an integer\n\
7598 number to be subtracted from the score of the best alignment.\n\
7599 Default value is 0.50.\n\
7600 -O, --ordered Print output in same order as input (relevant\n\
7601 only if there is more than one worker thread)\n\
7602 -5, --md5 Print MD5 checksum for each query sequence\n\
7603 -o, --chimera-overlap Overlap to show, if any, at chimera breakpoint\n\
7604 --failsonly Print only failed alignments, those with no results\n\
7605 --nofails Exclude printing of failed alignments\n\
7606 \n\
7607 -V, --snpsdir=STRING Directory for SNPs index files (created using snpindex) (default is\n\
7608 location of genome index files specified using -D and -d)\n \
7609 -v, --use-snps=STRING Use database containing known SNPs (in <STRING>.iit, built\n\
7610 previously using snpindex) for tolerance to SNPs\n\
7611 ");
7612
7613 fprintf(stdout,"\
7614 --split-output=STRING Basename for multiple-file output, separately for nomapping,\n\
7615 uniq, mult, (and chimera, if --chimera-margin is selected)\n\
7616 --failed-input=STRING Print completely failed alignments as input FASTA or FASTQ format\n\
7617 to the given file. If the --split-output flag is also given, this file\n\
7618 is generated in addition to the output in the .nomapping file.\n\
7619 --append-output When --split-output or --failedinput is given, this flag will append output\n\
7620 to the existing files. Otherwise, the default is to create new files.\n\
7621 ");
7622 fprintf(stdout,"\
7623 --output-buffer-size=INT Buffer size, in queries, for output thread (default %d). When the number\n\
7624 of results to be printed exceeds this size, the worker threads are halted\n\
7625 until the backlog is cleared\n\
7626 ",output_buffer_size);
7627
7628
7629 fprintf(stdout,"\
7630 --translation-code=INT Genetic code used for translating codons to amino acids and computing CDS\n\
7631 Integer value (default=1) corresponds to an available code at\n\
7632 http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi\n\
7633 --alt-start-codons Also, use the alternate initiation codons shown in the above Web site\n\
7634 By default, without this option, only ATG is considered an initiation codon\n\
7635 ");
7636
7637 #ifdef PMAP
7638 fprintf(stdout,"\
7639 -Y, --tolerant Translates genome with corrections for frameshifts\n\
7640 ");
7641 #else
7642 fprintf(stdout,"\
7643 -F, --fulllength Assume full-length protein, starting with Met\n\
7644 -a, --cdsstart=INT Translate codons from given nucleotide (1-based)\n\
7645 -T, --truncate Truncate alignment around full-length protein, Met to Stop\n\
7646 Implies -F flag.\n\
7647 -Y, --tolerant Translates cDNA with corrections for frameshifts\n\
7648 ");
7649 #endif
7650
7651 fprintf(stdout,"\n");
7652
7653 #ifndef PMAP
7654 fprintf(stdout,"Options for GFF3 output\n");
7655 fprintf(stdout,"\
7656 --gff3-add-separators=INT Whether to add a ### separator after each query sequence\n\
7657 Values: 0 (no), 1 (yes, default)\n\
7658 --gff3-swap-phase=INT Whether to swap phase (0 => 0, 1 => 2, 2 => 1) in gff3_gene format\n\
7659 Needed by some analysis programs, but deviates from GFF3 specification\n\
7660 Values: 0 (no, default), 1 (yes)\n\
7661 --gff3-fasta-annotation=INT Whether to include annotation from the FASTA header into the GFF3 output\n\
7662 Values: 0 (default): Do not include\n\
7663 1: Wrap all annotation as Annot=\"<header>\"\n\
7664 2: Include key=value pairs, replacing brackets with quotation marks\n\
7665 and replacing spaces between key=value pairs with semicolons\n\
7666 --gff3-cds=STRING Whether to use cDNA or genomic translation for the CDS coordinates\n\
7667 Values: cdna (default), genomic\n\
7668 ");
7669 fprintf(stdout,"\n");
7670
7671 fprintf(stdout,"Options for SAM output\n");
7672 fprintf(stdout,"\
7673 --no-sam-headers Do not print headers beginning with '@'\n\
7674 --sam-use-0M Insert 0M in CIGAR between adjacent insertions and deletions\n\
7675 Required by Picard, but can cause errors in other tools\n\
7676 --sam-extended-cigar Use extended CIGAR format (using X and = symbols instead of M,\n\
7677 to indicate matches and mismatches, respectively\n\
7678 --force-xs-dir For RNA-Seq alignments, disallows XS:A:? when the sense direction\n\
7679 is unclear, and replaces this value arbitrarily with XS:A:+.\n\
7680 May be useful for some programs, such as Cufflinks, that cannot\n\
7681 handle XS:A:?. However, if you use this flag, the reported value\n\
7682 of XS:A:+ in these cases will not be meaningful.\n\
7683 --md-lowercase-snp In MD string, when known SNPs are given by the -v flag,\n\
7684 prints difference nucleotides as lower-case when they,\n\
7685 differ from reference but match a known alternate allele\n\
7686 --action-if-cigar-error Action to take if there is a disagreement between CIGAR length and sequence length\n\
7687 Allowed values: ignore, warning (default), noprint, abort\n\
7688 Note that the noprint option does not print the CIGAR string at all if there\n\
7689 is an error, so it may break a SAM parser\n\
7690 --read-group-id=STRING Value to put into read-group id (RG-ID) field\n\
7691 --read-group-name=STRING Value to put into read-group name (RG-SM) field\n\
7692 --read-group-library=STRING Value to put into read-group library (RG-LB) field\n\
7693 --read-group-platform=STRING Value to put into read-group library (RG-PL) field\n\
7694 ");
7695 fprintf(stdout,"\n");
7696
7697 /* Quality score options */
7698 fprintf(stdout,"Options for quality scores\n");
7699 fprintf(stdout,"\
7700 --quality-protocol=STRING Protocol for input quality scores. Allowed values:\n\
7701 illumina (ASCII 64-126) (equivalent to -J 64 -j -31)\n\
7702 sanger (ASCII 33-126) (equivalent to -J 33 -j 0)\n\
7703 Default is sanger (no quality print shift)\n\
7704 SAM output files should have quality scores in sanger protocol\n\
7705 \n\
7706 Or you can specify the print shift with this flag:\n\
7707 -j, --quality-print-shift=INT Shift FASTQ quality scores by this amount in output\n\
7708 (default is 0 for sanger protocol; to change Illumina input\n\
7709 to Sanger output, select -31)\n\
7710 ");
7711 #endif
7712
7713 fprintf(stdout,"\
7714 External map file options\n\
7715 -M, --mapdir=directory Map directory\n\
7716 -m, --map=iitfile Map file. If argument is '?' (with the quotes),\n\
7717 this lists available map files.\n\
7718 -e, --mapexons Map each exon separately\n\
7719 -b, --mapboth Report hits from both strands of genome\n\
7720 -u, --flanking=INT Show flanking hits (default 0)\n\
7721 --print-comment Show comment line for each hit\n\
7722 ");
7723 fprintf(stdout,"\n");
7724
7725 fprintf(stdout,"\
7726 Alignment output options\n\
7727 -N, --nolengths No intron lengths in alignment\n\
7728 -I, --invertmode=INT Mode for alignments to genomic (-) strand:\n\
7729 0=Don't invert the cDNA (default)\n\
7730 1=Invert cDNA and print genomic (-) strand\n\
7731 2=Invert cDNA and print genomic (+) strand\n\
7732 ");
7733 fprintf(stdout,"\
7734 -i, --introngap=INT Nucleotides to show on each end of intron (default %d)\n\
7735 ",ngap);
7736 fprintf(stdout,"\
7737 -l, --wraplength=INT Wrap length for alignment (default %d)\n\
7738 ",wraplength);
7739 fprintf(stdout,"\n");
7740
7741 fprintf(stdout,"\
7742 Filtering output options\n\
7743 --min-trimmed-coverage=FLOAT Do not print alignments with trimmed coverage less\n\
7744 this value (default=0.0, which means no filtering)\n\
7745 Note that chimeric alignments will be output regardless\n\
7746 of this filter\n\
7747 --min-identity=FLOAT Do not print alignments with identity less\n\
7748 this value (default=0.0, which means no filtering)\n\
7749 Note that chimeric alignments will be output regardless\n\
7750 of this filter\n\
7751 \n\
7752 Help options\n\
7753 --check Check compiler assumptions\n\
7754 --version Show version\n\
7755 --help Show this help message\n\
7756 ");
7757
7758 return;
7759 }
7760