1 static char rcsid[] = "$Id: gmap.c 222810 2020-06-03 22:01:50Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #ifdef USE_MPI
7 #include <mpi.h>
8 #include "mpidebug.h"
9 #endif
10 
11 #ifdef HAVE_SYS_TYPES_H
12 #include <sys/types.h>		/* Needed to define pthread_t on Solaris */
13 #endif
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>		/* For strcpy */
17 #include <strings.h>		/* For rindex */
18 #include <ctype.h>
19 #include <math.h>		/* For rint */
20 #ifdef HAVE_SSE2
21 #include <emmintrin.h>
22 #endif
23 #ifdef HAVE_SSE4_1
24 #include <smmintrin.h>
25 #endif
26 #if !defined(HAVE_SSE4_2)
27 /* Skip popcnt */
28 #elif defined(HAVE_POPCNT)
29 #include <immintrin.h>
30 #endif
31 
32 #if !defined(HAVE_SSE4_2)
33 /* Skip mm_popcnt */
34 #elif defined(HAVE_MM_POPCNT)
35 #include <nmmintrin.h>
36 #endif
37 
38 
39 #if !defined(HAVE_SSE4_2)
40 /* Skip lzcnt/tzcnt */
41 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
42 #include <immintrin.h>
43 #endif
44 
45 
46 #ifdef HAVE_PTHREAD
47 #include <pthread.h>
48 #endif
49 
50 #include <signal.h>
51 
52 #include "except.h"
53 #include "mem.h"
54 #include "bool.h"
55 #include "fopen.h"
56 #include "access.h"
57 #include "filesuffix.h"
58 
59 #include "sequence.h"
60 #include "match.h"
61 #include "matchpool.h"
62 #include "pairpool.h"
63 #include "diagpool.h"
64 #include "cellpool.h"
65 #include "stopwatch.h"
66 #include "translation.h"	/* For Translation_setup */
67 #include "genome.h"
68 #include "genome-write.h"
69 #include "genome128_hr.h"	/* For Genome_hr_setup */
70 #include "genome_sites.h"	/* For Genome_sites_setup */
71 #include "compress-write.h"
72 #include "maxent_hr.h"		/* For Maxent_hr_setup */
73 #include "stage1.h"
74 #include "gregion.h"
75 #ifdef PMAP
76 #include "oligoindex_pmap.h"
77 #else
78 #include "oligoindex_hr.h"	/* For Oligoindex_hr_setup */
79 /* #include "oligoindex_localdb.h" --  For Oligoindex_localdb_setup */
80 #endif
81 #include "stage2.h"
82 #include "splicestringpool.h"
83 #include "splicetrie.h"
84 #include "splicetrie_build.h"
85 #include "dynprog.h"
86 #include "dynprog_single.h"
87 #include "dynprog_genome.h"
88 #include "dynprog_end.h"
89 #include "pair.h"
90 #include "stage3.h"
91 #include "comp.h"
92 #include "chimera.h"
93 #ifdef PMAP
94 #include "oligop.h"		/* For Oligop_setup */
95 #include "backtranslation.h"
96 #else
97 #include "oligo.h"		/* For Oligo_setup */
98 #endif
99 #include "indexdb.h"
100 #include "result.h"
101 #include "request.h"
102 #include "intlist.h"
103 #include "list.h"
104 #include "iit-read-univ.h"
105 #include "iit-read.h"
106 #include "datadir.h"
107 
108 #include "filestring.h"
109 #include "output.h"
110 #include "inbuffer.h"
111 #include "outbuffer.h"
112 
113 #include "getopt.h"
114 
115 
/* Compile-time limits and tuning constants for this program.
   MAX_OLIGODEPTH, MAX_BADOLIGOS, and MAX_REPOLIGOS are the thresholds
   applied by evaluate_query when it classifies a query as poor or
   repetitive. */
#define MAX_QUERYLENGTH_FOR_ALLOC    100000
#define MAX_GENOMICLENGTH_FOR_ALLOC 1000000


#define STAGE1_FIRSTPAIR_SIZELIMIT 10000
#define STAGE1_STUTTER_SIZELIMIT 100
#define STAGE1_FILLIN_SIZELIMIT 10


#define POSSIBLE_OLIGOS 65536	/* 4^8 */
#define MAX_OLIGODEPTH 3.0	/* Compared against query_oligodepth in evaluate_query */
#define MAX_BADOLIGOS 0.30	/* Setting to 1.0 effectively turns this check off */
#define MAX_REPOLIGOS 0.40	/* Setting to 1.0 effectively turns this check off */

/* Value of 1 can miss end exons, but values larger than 1 can lead to
   very long (or infinite?) run times when combined with
   --intronlength */
#define MAX_CHIMERA_ITER 3

#define CHIMERA_PENALTY 30	/* A small value for chimera_margin will reduce this  */
#define CHIMERA_IDENTITY 0.98
#define CHIMERA_PVALUE 0.01
#define CHIMERA_FVALUE 6.634897	/* qnorm(CHIMERA_PVALUE/2)^2; must be recomputed if CHIMERA_PVALUE changes */
#define CHIMERA_SLOP 90	/* in nucleotides */
#define CHIMERA_EXTEND 20	/* Was previously 8, but this missed exon-exon boundaries */

#define MIN_MATCHES 20


#define MAX_NALIGNMENTS 10
146 
147 
148 /* #define EXTRACT_GENOMICSEG 1 */
149 
150 
/* Debugging macros.  Each macro below expands to its argument when the
   matching symbol (DEBUGM, DEBUG, DEBUG2, DEBUG2A, DEBUG3) is defined
   at compile time, and to nothing otherwise, so the wrapped statements
   are compiled out of normal builds. */

/* MPI Processing */
#ifdef DEBUGM
#define debugm(x) x
#else
#define debugm(x)
#endif


/* General debugging */
#ifdef DEBUG
#define debug(x) x
#else
#define debug(x)
#endif

/* Chimera detection */
#ifdef DEBUG2
#define debug2(x) x
#else
#define debug2(x)
#endif

/* Chimera detection, details */
#ifdef DEBUG2A
#define debug2a(x) x
#else
#define debug2a(x)
#endif

/* stage3list_remove_duplicates */
#ifdef DEBUG3
#define debug3(x) x
#else
#define debug3(x)
#endif
185 
186 
187 
188 /************************************************************************
189  *   Global variables
190  ************************************************************************/
191 
/* Set by --translation-code and --alt-start-codons (see long_options) */
static int translation_code = 1;
static bool alt_initiation_codons_p = false;

/* Genome-wide data structures; all start out empty (NULL) or unset */
static Univ_IIT_T chromosome_iit = NULL;
static Univ_IIT_T altscaffold_iit = NULL;
static Univcoord_T genomelength;
static int circular_typeint = -1;
static int nchromosomes;
static bool *circularp = NULL;
static bool any_circular_p;

static bool *altlocp = NULL;
static Univcoord_T *alias_starts = NULL;
static Univcoord_T *alias_ends = NULL;
static Univ_IIT_T contig_iit = NULL;
static Genome_T genomecomp = NULL;
static Genome_T genomecomp_alt = NULL;
static Genomecomp_T *genomecomp_blocks = NULL;

#ifdef PMAP
static Alphabet_T required_alphabet = AA0;
static Alphabet_T alphabet = AA20; /* Initialize in case we have a usersegment */
static int alphabet_size = 20;	   /* Initialize in case we have a usersegment */
static Width_T index1part_aa = 7;
#else
static Width_T index1part;
#endif

static Indexdb_T indexdb_fwd = NULL;
static Indexdb_T indexdb_rev = NULL;

/* static Localdb_T localdb = NULL; */

static Width_T required_index1part = 0;
/* NOTE(review): index1interval is defined a second time further down,
   among the compute options, as "static int index1interval = 3".  The
   two declarations are compatible only if Width_T is int -- confirm,
   and consider keeping a single definition. */
static Width_T index1interval;
static Width_T required_index1interval = 0;

/* static Width_T local1part = 8; */
/* static Width_T required_local1part = 0; */
/* static Width_T local1interval; */
/* static Width_T required_local1interval = 0; */

static IIT_T altstrain_iit = NULL;

/* Cmet and AtoI */
static char *user_modedir = NULL; /* user_cmetdir, user_atoidir */
static Mode_T mode = STANDARD;


static char *user_snpsdir = NULL;
static char *snps_root = (char *) NULL;
static IIT_T map_iit = NULL;
static int *map_divint_crosstable = NULL;

/* maxpeelback has the same value (20) in PMAP and non-PMAP builds; the
   two branches differ only in the disabled (#if 0) index-size
   variables. */
#ifdef PMAP
#if 0
static Width_T minindexsize = 3;	/* In stage 2; in aa */
static Width_T maxindexsize = 6;	/* In stage 2; in aa */
#endif
/* Now controlled by defect_rate */
static int maxpeelback = 20;	/* Needs to be at least indexsize
				   because stage 2 jumps by indexsize.
				   Also should exceed length of
				   repeated nucleotides (e.g., a
				   string of consecutive T's) */
#else
/* Making minindexsize too small can lead to spurious exons in stage 2 */
/* FOOBAR */
#if 0
static Width_T minindexsize = 8;	/* In stage 2; in nt.  Used if sampling required in stage 1. */
static Width_T maxindexsize = 8;	/* In stage 2; in nt */
#endif
static int maxpeelback = 20;	/* Needs to be at least indexsize
				   because stage 2 jumps by indexsize.
				   Also should exceed length of
				   repeated nucleotides (e.g., a
				   string of consecutive T's) */
#endif
static int maxpeelback_distalmedial = 100; /* Needs to be longer to fix bad end exons */

/* static int stuttercycles = 2; */
static int stutterhits = 3;
static int sufflookback = 60;
static int nsufflookback = 5;

#if 0
static int maxoligohits = 400; /* Must be smaller than ALLOC in oligoindex.c */
#endif
static int nullgap = 600;
static int extramaterial_end = 10;
static int extramaterial_paired = 8; /* Should be at least indexsize in nt */
static int extraband_single = 6; /* This is in addition to length2 -
				    length1.  If onesidegap is true in
				    dynprog.c, then this is equivalent
				    to extraband_single of 0.  Needs
				    to be > 0 to handle default
				    close_indels_mode. */
static int extraband_end = 6; /* Was 6.  Shouldn't differ from 0, since onesidegapp is true?
				 This is only on both sides of main diagonal */
static int extraband_paired = 14; /* This is in addition to length2 - length1 */

static Stopwatch_T stopwatch = NULL;
294 
295 
296 /************************************************************************
297  *   Program options
298  ************************************************************************/
299 
300 /* Input options */
/* The variables in this section hold the values of command-line
   options; the comment on each long_options entry names the variable(s)
   that option sets.  The initializers here are the defaults. */
static char *user_genomedir = NULL;
static char *dbroot = NULL;
static char *dbversion = NULL;
static char *user_genomicseg = NULL;
static bool user_selfalign_p = false;
static bool user_pairalign_p = false;
static char *user_cmdline = NULL;
static Sequence_T global_usersegment = NULL;
static int part_modulus = 0;
static int part_interval = 1;

static char *read_files_command = NULL;


/* Compute options */
static int min_matches;

/* NOTE(review): unlike their neighbors, these two variables are not
   declared static, so they have external linkage.  Confirm whether
   another translation unit refers to them before changing that. */
#ifdef USE_MPI
  int nbeyond;
#else
  bool multiple_sequences_p = false;
#endif

static bool sharedp = false;
static bool preload_shared_memory_p = false;
static bool unload_shared_memory_p = false;
static bool expand_offsets_p = false;

/* Default access modes.  With mmap available, positions and genome
   default to USE_MMAP_PRELOAD; without it, everything is
   USE_ALLOCATE. */
#ifdef HAVE_MMAP
static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
static Access_mode_T positions_access = USE_MMAP_PRELOAD;
static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
static Access_mode_T locpositions_access = USE_ALLOCATE;

static Access_mode_T genome_access = USE_MMAP_PRELOAD;
#else
static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
static Access_mode_T positions_access = USE_ALLOCATE;
static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
static Access_mode_T locpositions_access = USE_ALLOCATE;

static Access_mode_T genome_access = USE_ALLOCATE;
#endif

static int min_intronlength = 9;
static int max_deletionlength = 50;
static int maxtotallen_bound = 2400000;

static bool split_large_introns_p = false;

/* Need to set higher than 200,000 for many human genes, such as ALK */
static int maxintronlen = 500000; /* Was used previously in stage 1.  Now used only in stage 2 and Stage3_mergeable. */
static int maxintronlen_ends = 10000; /* Used in stage 3 */

static int minendexon = 12;
static int maxextension = 1000000; /* Used in stage 1.  Not adjustable by user */
static int chimera_margin = 30;	/* Useful for finding readthroughs */
/* NOTE(review): index1interval is also declared earlier, among the
   global variables, as "static Width_T index1interval;".  This second
   definition is legal only if Width_T is int -- confirm. */
static int index1interval = 3; /* Stage 1 interval if user provides a genomic segment */
/* static char *referencefile = NULL; */

#if 0
#ifndef PMAP
static bool literalrefp = false;
#endif
#endif

#ifdef USE_MPI
static int nprocs, n_worker_procs, proci, myid;
#endif


/* static bool altstrainp = false; */
/* Default number of worker threads: 1 with pthreads, 0 without */
#ifdef HAVE_PTHREAD
static pthread_t output_thread_id, *worker_thread_ids;
static pthread_key_t global_request_key;
static int nworkers = 1;	/* (int) sysconf(_SC_NPROCESSORS_ONLN) */
#else
static int nworkers = 0;	/* (int) sysconf(_SC_NPROCESSORS_ONLN) */
#endif
#ifndef PMAP
static bool prune_poor_p = false;
static bool prune_repetitive_p = false;
#endif
static int canonical_mode = 1;
static bool cross_species_p = false;
static int homopolymerp = false; /* NOTE(review): declared int although used as a flag; the other flags are bool */

static char *user_chrsubsetname = NULL;
static Univcoord_T chrsubset_start = 0;
static Univcoord_T chrsubset_end = -1; /* Becomes the largest value if Univcoord_T is unsigned; presumably means "no upper limit" -- confirm */

static int close_indels_mode = +1;
static double microexon_spliceprob = 0.95;
static int suboptimal_score_start = -1; /* Determined by simulations to have minimal effect */
static int suboptimal_score_end = 3; /* Determined by simulations to have diminishing returns above 3 */

static int trim_indel_score = -2; /* was -4 */


/* Output options */
static unsigned int output_buffer_size = 1000;
static Printtype_T printtype = SIMPLE;
static bool exception_raise_p = true;
static bool debug_graphic_p = false;
static bool stage1debug = false;
static bool diag_debug = false;
static Stage3debug_T stage3debug = NO_STAGE3DEBUG;
static bool timingp = false;
static bool checkp = false;
static int maxpaths_report = 5;	/* 0 means 1 if nonchimeric, 2 if chimeric */
static bool quiet_if_excessive_p = false;
static double suboptimal_score_float = 0.50;
static bool require_splicedir_p = false;


/* GFF3 */
static bool gff3_separators_p = true;
static bool gff3_phase_swap_p = false;
static GFF3_fasta_annotation_T gff3_fasta_annotation_type = NO_ANNOTATION;
static CDStype_T cdstype = CDS_CDNA;

/* SAM */
/* Applicable to PMAP? */
static bool sam_paired_p = false;
static bool user_quality_shift = false;
static int quality_shift = 0;
static bool sam_headers_p = true;
static char *sam_read_group_id = NULL;
static char *sam_read_group_name = NULL;
static char *sam_read_group_library = NULL;
static char *sam_read_group_platform = NULL;
static bool sam_insert_0M_p = false;
static bool sam_cigar_extended_p = false;
static Cigar_action_T cigar_action = CIGAR_ACTION_WARNING;

static bool orderedp = false;
static bool failsonlyp = false;
static bool nofailsp = false;
static bool checksump = false;
static int chimera_overlap = 0;
static bool force_xs_direction_p = false;
static bool md_lowercase_variant_p = false;

/* Map file options */
static char *user_mapdir = NULL;
static char *map_iitfile = NULL;
static bool map_exons_p = false;
static bool map_bothstrands_p = false;
static bool print_comment_p = false;
static int nflanking = 0;

/* Alignment options */
static bool fulllengthp = false;
static int cds_startpos = -1;
static bool truncatep = false;
static int sense_try = 0;		/* both */
static int sense_filter = 0;		/* both */
static double min_trimmed_coverage = 0.0;
static double min_identity = 0.0;
static bool strictp = true;
/* static int proteinmode = 1; */
static bool uncompressedp = false;
static bool nointronlenp = false;
static int invertmode = 0;
static int ngap = 3;
static int wraplength = 50;


/* Splicing IIT */
static bool novelsplicingp = true; /* Can be disabled with --nosplicing flag */
static bool knownsplicingp = false;
static bool distances_observed_p = false;
static Chrpos_T shortsplicedist = 2000000;
static char *user_splicingdir = (char *) NULL;
static char *splicing_file = (char *) NULL;
static IIT_T splicing_iit = NULL;
static bool amb_closest_p = false;

static int donor_typeint = -1;		/* for splicing_iit */
static int acceptor_typeint = -1;	/* for splicing_iit */

static int *splicing_divint_crosstable = NULL;
static Univcoord_T *splicesites = NULL;
static Splicetype_T *splicetypes = NULL;
static Chrpos_T *splicedists = NULL; /* maximum observed splice distance for given splice site */
static List_T *splicestrings = NULL;
static Genomecomp_T *splicefrags_ref = NULL;
static Genomecomp_T *splicefrags_alt = NULL;
static int nsplicesites = 0;

/* Splicing via splicesites */
static int *nsplicepartners_skip = NULL;
static int *nsplicepartners_obs = NULL;
static int *nsplicepartners_max = NULL;

static bool splicetrie_precompute_p = true;
static Trieoffset_T *trieoffsets_obs = NULL;
static Triecontent_T *triecontents_obs = NULL;
static Trieoffset_T *trieoffsets_max = NULL;
static Triecontent_T *triecontents_max = NULL;


/* Input/output */
static char *split_output_root = NULL;
static char *failedinput_root = NULL;
static bool appendp = false;
static Inbuffer_T inbuffer = NULL;
static Outbuffer_T outbuffer = NULL;
static unsigned int inbuffer_nspaces = 1000;
511 
/* Single-character option letters already taken, listed so that a free
   letter can be found when adding an option */
#ifdef PMAP
/* Used alphabetically: 01235789ABbCcDdEefGgHIiKkLlMmNnOoPQRSstuVvwXxYZ */
#else
/* Used alphabetically: 01235789AaBbCcDdEeFfGgHIijKkLlMmNnOoPpQRSsTtuVvwXxYZ */
#endif

/* Table of long command-line options in the layout expected by
   getopt_long: {name, has_arg, flag, val}.  The flag field is 0 in
   every entry, so getopt_long returns val, which is the equivalent
   single-character option where one exists and 0 for options that have
   only a long form.  The comment on each entry names the variable(s)
   the option sets.  The all-zero entry at the end terminates the
   table. */
static struct option long_options[] = {
  /* Input options */
  {"dir", required_argument, 0, 'D'},	/* user_genomedir */
  {"db", required_argument, 0, 'd'}, /* dbroot */
#ifdef PMAP
  {"alphabet", required_argument, 0, 'a'}, /* required_alphabet */
#endif
  {"kmer", required_argument, 0, 'k'}, /* required_index1part, index1part */
  {"sampling", required_argument, 0, 0}, /* required_index1interval, index1interval */
  {"genomefull", no_argument, 0, 'G'}, /* uncompressedp.  No longer supported. */
  {"gseg", required_argument, 0, 'g'}, /* user_genomicseg */
  {"selfalign", no_argument, 0, '1'}, /* user_selfalign_p */
  {"pairalign", no_argument, 0, '2'}, /* user_pairalign_p */
  {"cmdline", required_argument, 0, 0}, /* user_cmdline */
  {"part", required_argument, 0, 'q'}, /* part_modulus, part_interval */
  {"input-buffer-size", required_argument, 0, 0}, /* inbuffer_nspaces */

  {"read-files-command", required_argument, 0, 0}, /* read_files_command */


  /* Compute options */
  {"use-shared-memory", required_argument, 0, 0}, /* sharedp */
  {"preload-shared-memory", no_argument, 0, 0},	  /* preload_shared_memory_p */
  {"unload-shared-memory", no_argument, 0, 0},	  /* unload_shared_memory_p */
#ifdef HAVE_MMAP
  {"batch", required_argument, 0, 'B'}, /* offsetsstrm_access, positions_access, genome_access */
#endif
  {"expand-offsets", required_argument, 0, 0}, /* expand_offsets_p */
  {"min-intronlength", required_argument, 0, 0}, /* min_intronlength */

  {"intronlength", required_argument, 0, 'K'},		/* maxintronlen, maxintronlen_ends */
  {"max-intronlength-middle", required_argument, 0, 0}, /* maxintronlen */
  {"max-intronlength-ends", required_argument, 0, 0}, /* maxintronlen_ends */
  {"split-large-introns", no_argument, 0, 0},	      /* split_large_introns_p */

  {"trim-end-exons", required_argument, 0, 0}, /* minendexon */
  {"totallength", required_argument, 0, 'L'}, /* maxtotallen_bound */
  {"chimera-margin", required_argument, 0, 'x'}, /* chimera_margin */
  {"no-chimeras", no_argument, 0, 0},		 /* chimera_margin */
#if 0
  {"reference", required_argument, 0, 'w'}, /* referencefile */
#else
  {"localsplicedist", required_argument, 0, 'w'}, /* shortsplicedist */
#endif
  {"translation-code", required_argument, 0, 0}, /* translation_code */
  {"alt-start-codons", no_argument, 0, 0}, /* alt_initiation_codons_p */

  {"nthreads", required_argument, 0, 't'}, /* nworkers */
  {"splicingdir", required_argument, 0, 0}, /* user_splicingdir */
  {"nosplicing", no_argument, 0, 0},	    /* novelsplicingp */
  {"use-splicing", required_argument, 0, 's'}, /* splicing_iit, knownsplicingp (was previously altstrainp) */
  {"chrsubset", required_argument, 0, 'c'}, /* user_chrsubsetname */
  {"canonical-mode", required_argument, 0, 0}, /* canonical_mode */
  {"cross-species", no_argument, 0, 0}, /* cross_species_p */
  {"homopolymer", no_argument, 0, 0},	/* homopolymerp */
#ifndef PMAP
  {"prunelevel", required_argument, 0, 'p'}, /* prune_poor_p, prune_repetitive_p */
#endif
  {"allow-close-indels", required_argument, 0, 0}, /* close_indels_mode, extraband_single */
  {"microexon-spliceprob", required_argument, 0, 0}, /* microexon_spliceprob */
  {"stage2-start", required_argument, 0, 0},	     /* suboptimal_score_start */
  {"stage2-end", required_argument, 0, 0},	     /* suboptimal_score_end */

  {"cmetdir", required_argument, 0, 0}, /* user_modedir */
  {"atoidir", required_argument, 0, 0}, /* user_modedir */
  {"mode", required_argument, 0, 0}, /* mode */

  /* Output options */
  {"output-buffer-size", required_argument, 0, 0}, /* output_buffer_size */
  {"summary", no_argument, 0, 'S'}, /* printtype */
  {"align", no_argument, 0, 'A'}, /* printtype */
  {"continuous", no_argument, 0, '3'}, /* printtype */
  {"continuous-by-exon", no_argument, 0, '4'}, /* printtype */
  {"noexceptions", no_argument, 0, '0'}, /* exception_raise_p */
  {"graphic", no_argument, 0, '6'}, /* debug_graphic_p */
  {"stage3debug", required_argument, 0, '8'}, /* stage3debug */
  {"diagnostic", no_argument, 0, '9'}, /* checkp */
  {"npaths", required_argument, 0, 'n'}, /* maxpaths_report */
#if 0
  {"quiet-if-excessive", no_argument, 0, 0}, /* quiet_if_excessive_p */
#endif
  {"format", required_argument, 0, 'f'}, /* printtype */
  {"failsonly", no_argument, 0, 0}, /* failsonlyp */
  {"nofails", no_argument, 0, 0}, /* nofailsp */
  {"split-output", required_argument, 0, 0}, /* split_output_root */
  {"failed-input", required_argument, 0, 0}, /* failedinput_root */
  {"append-output", no_argument, 0, 0},	     /* appendp */
  {"suboptimal-score", required_argument, 0, 0}, /* suboptimal_score_float */
  {"require-splicedir", no_argument, 0, 0}, /* require_splicedir_p */

  {"gff3-add-separators", required_argument, 0, 0}, /* gff3_separators_p */
  {"gff3-swap-phase", required_argument, 0, 0}, /* gff3_phase_swap_p */
  {"gff3-fasta-annotation", required_argument, 0, 0}, /* gff3_fasta_annotation_type */
  {"gff3-cds", required_argument, 0, 0}, /* cdstype */

#ifndef PMAP
  {"quality-protocol", required_argument, 0, 0}, /* quality_shift */
  {"quality-print-shift", required_argument, 0, 'j'}, /* quality_shift */
  {"no-sam-headers", no_argument, 0, 0},	/* sam_headers_p */
  {"sam-use-0M", no_argument, 0, 0},		/* sam_insert_0M_p */
  {"sam-extended-cigar", no_argument, 0, 0},	/* sam_cigar_extended_p */
  {"read-group-id", required_argument, 0, 0},	/* sam_read_group_id */
  {"read-group-name", required_argument, 0, 0},	/* sam_read_group_name */
  {"read-group-library", required_argument, 0, 0}, /* sam_read_group_library */
  {"read-group-platform", required_argument, 0, 0}, /* sam_read_group_platform */
  {"force-xs-dir", no_argument, 0, 0},		    /* force_xs_direction_p */
  {"md-lowercase-snp", no_argument, 0, 0},	    /* md_lowercase_variant_p */
  {"action-if-cigar-error", required_argument, 0, 0},	/* cigar_action */
#endif

  {"compress", no_argument, 0, 'Z'}, /* printtype */
  {"ordered", no_argument, 0, 'O'}, /* orderedp */
  {"md5", no_argument, 0, '5'}, /* checksump */
  {"chimera-overlap", required_argument, 0, 'o'}, /* chimera_overlap */
  {"snpsdir", required_argument, 0, 'V'},   /* user_snpsdir */
  {"use-snps", required_argument, 0, 'v'}, /* snps_root */

  /* Map file options */
  {"mapdir", required_argument, 0, 'M'}, /* user_mapdir */
  {"map", required_argument, 0, 'm'},	/* map_iitfile */
  {"mapexons", no_argument, 0, 'e'}, /* map_exons_p */
  {"mapboth", no_argument, 0, 'b'}, /* map_bothstrands_p */
  {"nflanking", required_argument, 0, 'u'}, /* nflanking */
  {"print-comment", no_argument, 0, 0},	    /* print_comment_p */

  /* Alignment options */
  {"exons", required_argument, 0, 'E'}, /* printtype */
  /* Note that 'P' and 'Q' select different long names in PMAP and
     non-PMAP builds */
#ifdef PMAP
  {"protein_gen", no_argument, 0, 'P'}, /* printtype */
  {"nucleotide", no_argument, 0, 'Q'}, /* printtype */
#else
  {"protein_dna", no_argument, 0, 'P'}, /* printtype */
  {"protein_gen", no_argument, 0, 'Q'}, /* printtype */
  {"fulllength", no_argument, 0, 'F'}, /* fulllengthp */
  {"cdsstart", required_argument, 0, 'a'}, /* cds_startpos */
  {"truncate", no_argument, 0, 'T'}, /* truncatep */
  {"direction", required_argument, 0, 'z'}, /* sense_try, sense_filter */
#endif
  {"tolerant", no_argument, 0, 'Y'}, /* strictp */
  {"nolengths", no_argument, 0, 'N'},	/* nointronlenp */
  {"invertmode", required_argument, 0, 'I'}, /* invertmode */
  {"introngap", required_argument, 0, 'i'}, /* ngap */
  {"wraplength", required_argument, 0, 'l'}, /* wraplength */

  /* Filtering options */
  {"min-trimmed-coverage", required_argument, 0, 0}, /* min_trimmed_coverage */
  {"min-identity", required_argument, 0, 0},	/* min_identity */

  /* Diagnostic options */
  {"time", no_argument, 0, 0},	/* timingp */

  /* Help options */
  {"check", no_argument, 0, 0}, /* check_compiler_assumptions */
  {"version", no_argument, 0, 0}, /* print_program_version */
  {"help", no_argument, 0, 0}, /* print_program_usage */
  {0, 0, 0, 0}
};
675 
676 
677 static void
print_program_version()678 print_program_version () {
679   char *genomedir;
680 
681   fprintf(stdout,"\n");
682 #ifdef PMAP
683   fprintf(stdout,"PMAP: Protein Mapping and Alignment Program\n");
684 #else
685   fprintf(stdout,"GMAP: Genomic Mapping and Alignment Program\n");
686 #endif
687   fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
688   fprintf(stdout,"Build target: %s\n",TARGET);
689   fprintf(stdout,"Features: ");
690 #ifdef HAVE_PTHREAD
691   fprintf(stdout,"pthreads enabled, ");
692 #else
693   fprintf(stdout,"no pthreads, ");
694 #endif
695 #ifdef HAVE_ALLOCA
696   fprintf(stdout,"alloca available, ");
697 #else
698   fprintf(stdout,"no alloca, ");
699 #endif
700 #ifdef HAVE_ZLIB
701   fprintf(stdout,"zlib available, ");
702 #else
703   fprintf(stdout,"no zlib, ");
704 #endif
705 #ifdef HAVE_MMAP
706   fprintf(stdout,"mmap available, ");
707 #else
708   fprintf(stdout,"no mmap, ");
709 #endif
710 #ifdef WORDS_BIGENDIAN
711   fprintf(stdout,"bigendian, ");
712 #else
713   fprintf(stdout,"littleendian, ");
714 #endif
715 #ifdef HAVE_SIGACTION
716   fprintf(stdout,"sigaction available, ");
717 #else
718   fprintf(stdout,"no sigaction, ");
719 #endif
720 #ifdef HAVE_64_BIT
721   fprintf(stdout,"64 bits available");
722 #else
723   fprintf(stdout,"64 bits not available");
724 #endif
725   fprintf(stdout,"\n");
726 
727   fprintf(stdout,"Popcnt:");
728 #ifdef HAVE_POPCNT
729   fprintf(stdout," popcnt/lzcnt/tzcnt");
730 #endif
731 #ifdef HAVE_MM_POPCNT
732   fprintf(stdout," mm_popcnt");
733 #endif
734 #ifdef HAVE_BUILTIN_POPCOUNT
735   fprintf(stdout," builtin_popcount");
736 #endif
737   fprintf(stdout,"\n");
738 
739   fprintf(stdout,"Builtin functions:");
740 #ifdef HAVE_BUILTIN_CLZ
741   fprintf(stdout," builtin_clz");
742 #endif
743 #ifdef HAVE_BUILTIN_CTZ
744   fprintf(stdout," builtin_ctz");
745 #endif
746 #ifdef HAVE_BUILTIN_POPCOUNT
747   fprintf(stdout," builtin_popcount");
748 #endif
749   fprintf(stdout,"\n");
750 
751 
752   fprintf(stdout,"SIMD functions compiled:");
753 #ifdef HAVE_ALTIVEC
754   fprintf(stdout," Altivec");
755 #endif
756 #ifdef HAVE_MMX
757   fprintf(stdout," MMX");
758 #endif
759 #ifdef HAVE_SSE
760   fprintf(stdout," SSE");
761 #endif
762 #ifdef HAVE_SSE2
763   fprintf(stdout," SSE2");
764 #endif
765 #ifdef HAVE_SSE3
766   fprintf(stdout," SSE3");
767 #endif
768 #ifdef HAVE_SSSE3
769   fprintf(stdout," SSSE3");
770 #endif
771 #ifdef HAVE_SSE4_1
772   fprintf(stdout," SSE4.1");
773 #endif
774 #ifdef HAVE_SSE4_2
775   fprintf(stdout," SSE4.2");
776 #endif
777 #ifdef HAVE_AVX2
778   fprintf(stdout," AVX2");
779 #endif
780 #ifdef HAVE_AVX512
781   fprintf(stdout," AVX512");
782 #endif
783 #ifdef HAVE_AVX512BW
784   fprintf(stdout," AVX512BW");
785 #endif
786   fprintf(stdout,"\n");
787 
788 
789 #ifdef PMAP
790   fprintf(stdout,"Stage 1 index size: %d aa\n",index1part_aa);
791 #endif
792   fprintf(stdout,"Sizes: off_t (%d), size_t (%d), unsigned int (%d), long int (%d), long long int (%d)\n",
793 	  (int) sizeof(off_t),(int) sizeof(size_t),(int) sizeof(unsigned int),(int) sizeof(long int),(int) sizeof(long long int));
794   fprintf(stdout,"Default gmap directory (compiled): %s\n",GMAPDB);
795   genomedir = Datadir_find_genomedir(/*user_genomedir*/NULL);
796   fprintf(stdout,"Default gmap directory (environment): %s\n",genomedir);
797   FREE(genomedir);
798   fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
799   fprintf(stdout,"Contact: twu@gene.com\n");
800   fprintf(stdout,"\n");
801   return;
802 }
803 
804 /* This flag is not well-supported, and therefore hidden, but
805    kept for backwards compatibility */
806 /*  -R, --rel=STRING               Release\n\ */
807 
/* Forward declaration.  Because the function is static, its body must
   be defined later in this same file. */
static void
print_program_usage ();
810 
811 
/* Prints to stderr the results of a few SIMD and bit-counting
   operations applied to two values taken from rand(), so that a user
   can see how the compiler and CPU behave for the instruction sets
   that were configured (HAVE_SSE2, HAVE_SSE4_1, HAVE_SSE4_2 and the
   related HAVE_* symbols).  With none of them configured, only the
   final "Finished" line is printed.  Always consumes exactly two
   values from rand().  Takes no arguments and returns nothing. */
static void
check_compiler_assumptions () {
  unsigned int x = rand(), y = rand();
#ifdef HAVE_SSE2
  int z;
  __m128i a;
#ifdef HAVE_SSE4_1
  char negx, negy, expected;
#endif
#endif

  /* x and y are otherwise unused when no SIMD level is configured */
  (void) x;
  (void) y;

#ifdef HAVE_SSE2
  fprintf(stderr,"Checking compiler assumptions for SSE2: ");
  fprintf(stderr,"%08X %08X",x,y);
  a = _mm_xor_si128(_mm_set1_epi32(x),_mm_set1_epi32(y));
  z = _mm_cvtsi128_si32(a);
  /* %X requires an unsigned argument */
  fprintf(stderr," xor=%08X\n",(unsigned int) z);
#endif

#ifdef HAVE_SSE4_1
  /* Force both test bytes to be non-positive, so that the value
     extracted from the vector shows whether it comes back sign-extended
     or zero-extended */
  if ((negx = (char) x) > 0) {
    negx = -negx;
  }
  if ((negy = (char) y) > 0) {
    negy = -negy;
  }

  fprintf(stderr,"Checking compiler assumptions for SSE4.1: ");
  fprintf(stderr,"%d %d",negx,negy);
  a = _mm_max_epi8(_mm_set1_epi8(negx),_mm_set1_epi8(negy));
  z = _mm_extract_epi8(a,0);
  fprintf(stderr," max=%d => ",z);

  /* The larger of the two signed bytes is what _mm_max_epi8 should have selected */
  expected = (negx > negy) ? negx : negy;
  if (z == (int) expected) {
    fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
  } else {
    fprintf(stderr,"compiler zero extends\n");
  }
#endif

#ifdef HAVE_SSE4_2
  fprintf(stderr,"Checking compiler options for SSE4.2: ");
  fprintf(stderr,"%08X ",x);
#ifdef HAVE_LZCNT
  /* _lzcnt_u32 returns unsigned; cast to match %d */
  fprintf(stderr,"_lzcnt_u32=%d ",(int) _lzcnt_u32(x));
#endif
#ifdef HAVE_BUILTIN_CLZ
  /* __builtin_clz is undefined for 0, which rand() is allowed to return */
  if (x != 0) {
    fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
  }
#endif
#ifdef HAVE_TZCNT
  /* _tzcnt_u32 returns unsigned; cast to match %d */
  fprintf(stderr,"_tzcnt_u32=%d ",(int) _tzcnt_u32(x));
#endif
#ifdef HAVE_BUILTIN_CTZ
  /* __builtin_ctz is undefined for 0, which rand() is allowed to return */
  if (x != 0) {
    fprintf(stderr,"__builtin_ctz=%d ",__builtin_ctz(x));
  }
#endif

#ifdef HAVE_POPCNT
  fprintf(stderr,"_popcnt32=%d ",_popcnt32(x));
#endif
#if defined(HAVE_MM_POPCNT)
  fprintf(stderr,"_mm_popcnt_u32=%d ",_mm_popcnt_u32(x));
#endif
#if defined(HAVE_BUILTIN_POPCOUNT)
  fprintf(stderr,"__builtin_popcount=%d ",__builtin_popcount(x));
#endif
  fprintf(stderr,"\n");

#endif

  fprintf(stderr,"Finished checking compiler assumptions\n");

  return;
}
893 
894 
895 /************************************************************************/
896 
897 
898 /* Call before Stage1_compute */
899 static Diagnostic_T
evaluate_query(bool * poorp,bool * repetitivep,char * queryuc_ptr,int querylength,Oligoindex_T oligoindex)900 evaluate_query (bool *poorp, bool *repetitivep, char *queryuc_ptr, int querylength,
901 		Oligoindex_T oligoindex) {
902   Diagnostic_T diagnostic;
903 
904   diagnostic = Diagnostic_new();
905 
906 #ifdef PMAP
907   Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
908 			 &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
909 			 &diagnostic->query_trim_end,oligoindex,queryuc_ptr,
910 			 /*querystart*/0,/*queryend*/querylength);
911   *poorp = false;
912   *repetitivep = false;
913 #else
914   diagnostic->query_oligodepth =
915     Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
916 			   &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
917 			   &diagnostic->query_trim_end,oligoindex,queryuc_ptr,
918 			   /*querystart*/0,/*queryend*/querylength,/*trimp*/true);
919 
920   debug2(printf("query_trimoligos %d, fraction badoligos %f = %d/%d, oligodepth %f, fraction repoligos %f = %d/%d\n",
921 		diagnostic->query_trimoligos,
922 		(double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos,
923 		diagnostic->query_badoligos,diagnostic->query_trimoligos,
924 		diagnostic->query_oligodepth,
925 		(double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos,
926 		diagnostic->query_repoligos,diagnostic->query_trimoligos));
927 
928   if (diagnostic->query_trimoligos == 0) {
929     *poorp = true;
930   } else if (((double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos > MAX_BADOLIGOS) ||
931 	     (diagnostic->query_trim_end - diagnostic->query_trim_start < 80 && diagnostic->query_badoligos > 0)) {
932     *poorp = true;
933   } else {
934     *poorp = false;
935   }
936 
937   if (diagnostic->query_trimoligos == 0) {
938     *repetitivep = false;
939   } else if (diagnostic->query_oligodepth > MAX_OLIGODEPTH ||
940 	     (double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos > MAX_REPOLIGOS) {
941     *repetitivep = true;
942   } else {
943     *repetitivep = false;
944   }
945 #endif
946 
947   return diagnostic;
948 }
949 
950 
951 
952 
953 static Stage3_T *
stage3array_from_list(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,List_T stage3list,bool chimerap,bool remove_overlaps_p)954 stage3array_from_list (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
955 		       List_T stage3list, bool chimerap, bool remove_overlaps_p) {
956   Stage3_T *array1, *array0, x, y;
957   bool *eliminate;
958   int norig_primary, norig_altloc, i_primary, i_altloc, i, j;
959   int threshold_score;
960 
961   Univcoord_T alias_start, alias_end;
962 
963   debug(printf("Entering stage3array_from_list with %d entries\n",List_length(stage3list)));
964 
965   /* Stage3_recompute_goodness(stage3list); -- No longer necessary */
966   Stage3_compute_mapq(stage3list);
967 
968   if (stage3list == NULL) {
969     *first_absmq = 0;
970     *second_absmq = 0;
971     *npaths_primary = *npaths_altloc = 0;
972     return (Stage3_T *) NULL;
973 
974 #if 0
975   } else if (mergedp == true) {
976     debug(printf("mergedp is true\n"));
977     Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
978     array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
979     List_free(&stage3list);
980     *first_absmq = 0;
981     *second_absmq = 0;
982     *npaths_primary = norig_primary;
983     *npaths_altloc = norig_altloc;
984     return array0;
985 #endif
986 
987   } else if (chimerap == true) {
988     debug(printf("chimerap is true\n"));
989     Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
990     array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
991     List_free(&stage3list);
992     *first_absmq = Stage3_absmq_score(array0[0]);
993     if (norig_primary + norig_altloc <= 2) {
994       *second_absmq = 0;
995     } else {
996       qsort(&(array0[2]),norig_primary + norig_altloc - 2,sizeof(Stage3_T),Stage3_cmp);
997       *second_absmq = Stage3_absmq_score(array0[2]);
998     }
999     *npaths_primary = norig_primary;
1000     *npaths_altloc = norig_altloc;
1001     return array0;
1002 
1003   } else if (remove_overlaps_p == false) {
1004     debug(printf("remove_overlaps_p is false\n"));
1005     Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
1006     array0 = (Stage3_T *) List_to_array_out(stage3list,NULL);
1007     List_free(&stage3list);
1008     qsort(array0,norig_primary + norig_altloc,sizeof(Stage3_T),Stage3_cmp);
1009 
1010     if (suboptimal_score_float < 1.0) {
1011       threshold_score = Stage3_goodness(array0[0]) * suboptimal_score_float;
1012       debug(printf("threshold score %d = goodness %d * suboptimal score_float %f\n",
1013 		   threshold_score,Stage3_goodness(array0[0]),suboptimal_score_float));
1014     } else {
1015       threshold_score = Stage3_goodness(array0[0]) - (int) suboptimal_score_float;
1016       debug(printf("threshold score %d = goodness %d - suboptimal score %d\n",
1017 		   threshold_score,Stage3_goodness(array0[0]),(int) suboptimal_score_float));
1018     }
1019 
1020     if (Stage3_altloc_chr(&alias_start,&alias_end,array0[0]) == false) {
1021       i_primary = 1;
1022       i_altloc = 0;
1023     } else {
1024       i_primary = 0;
1025       i_altloc = 1;
1026     }
1027     i = 1;
1028     while (i < norig_primary + norig_altloc && Stage3_goodness(array0[i]) >= threshold_score) {
1029       if (Stage3_altloc_chr(&alias_start,&alias_end,array0[i]) == false) {
1030 	i_primary++;
1031       } else {
1032 	i_altloc++;
1033       }
1034       i++;
1035     }
1036     while (i < norig_primary + norig_altloc) {
1037       Stage3_free(&(array0[i]));
1038       i++;
1039     }
1040 
1041     *npaths_primary = i_primary;
1042     *npaths_altloc = i_altloc;
1043     *first_absmq = Stage3_absmq_score(array0[0]);
1044     if ((*npaths_primary) + (*npaths_altloc) < 2) {
1045       *second_absmq = 0;
1046     } else {
1047       *second_absmq = Stage3_absmq_score(array0[1]);
1048     }
1049 
1050     return array0;
1051 
1052   } else {
1053     debug(printf("remove_overlaps_p is true\n"));
1054     Stage3_count_paths(&norig_primary,&norig_altloc,stage3list);
1055     eliminate = (bool *) CALLOCA(norig_primary + norig_altloc,sizeof(bool));
1056 
1057     /* Initial sort to remove subsumed alignments */
1058     array0 = (Stage3_T *) MALLOCA((norig_primary + norig_altloc) * sizeof(Stage3_T));
1059     List_fill_array_and_free((void **) array0,&stage3list);
1060     qsort(array0,norig_primary + norig_altloc,sizeof(Stage3_T),Stage3_cmp);
1061 
1062     for (i = 0; i < norig_primary + norig_altloc; i++) {
1063       x = array0[i];
1064       debug(printf("%d: chr %d:%u..%u, goodness %d, matches %d, npairs %d\n",
1065 		   i,Stage3_chrnum(x),Stage3_chrstart(x),Stage3_chrend(x),Stage3_goodness(x),Stage3_matches(x),Stage3_npairs(x)));
1066       for (j = i+1; j < norig_primary + norig_altloc; j++) {
1067 	y = array0[j];
1068 	if (Stage3_overlap(x,y)) {
1069 	  eliminate[j] = true;
1070 	}
1071       }
1072     }
1073 
1074 
1075     *npaths_primary = *npaths_altloc = 0;
1076     for (i = 0; i < norig_primary + norig_altloc; i++) {
1077       if (eliminate[i] == false) {
1078 	if (Stage3_altloc_chr(&alias_start,&alias_end,array0[i]) == false) {
1079 	  (*npaths_primary)++;
1080 	} else {
1081 	  (*npaths_altloc)++;
1082 	}
1083       }
1084     }
1085 
1086     array1 = (Stage3_T *) MALLOC_OUT(((*npaths_primary) + (*npaths_altloc)) * sizeof(Stage3_T)); /* Return value */
1087     j = 0;
1088     for (i = 0; i < norig_primary + norig_altloc; i++) {
1089       x = array0[i];
1090       if (eliminate[i] == true) {
1091 	Stage3_free(&x);
1092       } else {
1093 	array1[j++] = x;
1094       }
1095     }
1096     FREEA(array0);
1097     FREEA(eliminate);
1098 
1099     if (suboptimal_score_float < 1.0) {
1100       threshold_score = Stage3_goodness(array1[0]) * suboptimal_score_float;
1101       debug(printf("threshold score %d = goodness %d * suboptimal score %f\n",
1102 		   threshold_score,Stage3_goodness(array1[0]),suboptimal_score_float));
1103     } else {
1104       threshold_score = Stage3_goodness(array1[0]) - (int) suboptimal_score_float;
1105       debug(printf("threshold score %d = goodness %d - suboptimal score %d\n",
1106 		   threshold_score,Stage3_goodness(array1[0]),(int) suboptimal_score_float));
1107     }
1108 
1109     if (Stage3_altloc_chr(&alias_start,&alias_end,array1[0]) == false) {
1110       i_primary = 1;
1111       i_altloc = 0;
1112     } else {
1113       i_primary = 0;
1114       i_altloc = 1;
1115     }
1116     i = 1;
1117     while (i < (*npaths_primary) + (*npaths_altloc) && Stage3_goodness(array1[i]) >= threshold_score) {
1118       if (Stage3_altloc_chr(&alias_start,&alias_end,array1[i]) == false) {
1119 	i_primary++;
1120       } else {
1121 	i_altloc++;
1122       }
1123       i++;
1124     }
1125     while (i < (*npaths_primary) + (*npaths_altloc)) {
1126       Stage3_free(&(array1[i]));
1127       i++;
1128     }
1129 
1130     *npaths_primary = i_primary;
1131     *npaths_altloc = i_altloc;
1132     *first_absmq = Stage3_absmq_score(array1[0]);
1133     if ((*npaths_primary) + (*npaths_altloc) < 2) {
1134       *second_absmq = 0;
1135     } else {
1136       *second_absmq = Stage3_absmq_score(array1[1]);
1137     }
1138     return array1;
1139   }
1140 }
1141 
1142 
1143 static List_T
update_stage3middle_list(List_T stage3middle_list,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Chrpos_T chrstart,Chrpos_T chrend,bool watsonp,int genestrand,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1144 update_stage3middle_list (List_T stage3middle_list, Sequence_T queryseq,
1145 #ifdef PMAP
1146 			  Sequence_T queryntseq,
1147 #endif
1148 			  Sequence_T queryuc, Stage2_alloc_T stage2_alloc,
1149 			  Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1150 			  Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1151 			  Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1152 			  Chrpos_T chrstart, Chrpos_T chrend, bool watsonp, int genestrand,
1153 			  Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1154 			  Stopwatch_T worker_stopwatch) {
1155   /* int stage2_source, stage2_indexsize; */
1156   /* double stage3_runtime; */
1157 
1158 #ifdef PMAP
1159   Sequence_T genomicuc = NULL;
1160   char *genomicseg_ptr = NULL, *genomicuc_ptr = NULL;
1161 #elif defined(EXTRACT_GENOMICSEG)
1162   Sequence_T genomicuc = NULL;
1163 #endif
1164 
1165   List_T all_stage2results, all_stage3middle_results = NULL, p;
1166   Stage2_T stage2;
1167   Stage3middle_T stage3middle;
1168 #ifdef PMAP
1169   int subseq_offset;
1170 #endif
1171 
1172 
1173 #ifdef PMAP_OLD
1174   /* Previously used for PMAP */
1175   if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1176     genomicuc = Sequence_alias(genomicseg);
1177   } else {
1178     genomicuc = Sequence_uppercase(genomicseg);
1179   }
1180   genomicseg_ptr = Sequence_fullpointer(genomicseg);
1181   genomicuc_ptr = Sequence_fullpointer(genomicuc);
1182 #elif defined(EXTRACT_GENOMICSEG)
1183   if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1184     genomicuc = Sequence_alias(genomicseg);
1185   } else {
1186     genomicuc = Sequence_uppercase(genomicseg);
1187   }
1188   genomicseg_ptr = Sequence_fullpointer(genomicseg);
1189   genomicuc_ptr = Sequence_fullpointer(genomicuc);
1190 #endif
1191 
1192 #if 0
1193   if (canonical_mode == 0) {
1194     do_final_p = false;
1195   } else if (canonical_mode == 1) {
1196     do_final_p = true;
1197   } else if (lowidentityp == false) {
1198     do_final_p = false;
1199   } else {
1200     do_final_p = true;
1201   }
1202 #endif
1203 
1204   debug(printf("Entering update_stage3middle_list with %d results\n",List_length(stage3middle_list)));
1205   debug2(printf("Beginning Stage2_compute with chrstart %u and chrend %u and query_subseq_offset %d\n",
1206 		chrstart,chrend,Sequence_subseq_offset(queryseq)));
1207   all_stage2results = Stage2_compute(Sequence_trimpointer(queryseq),Sequence_trimpointer(queryuc),
1208 				     Sequence_trimlength(queryseq),/*query_offset*/0,
1209 				     chrstart,chrend,chroffset,chrhigh,/*plusp*/watsonp,genestrand,
1210 				     stage2_alloc,/*proceed_pctcoverage*/0.3,oligoindices_major,
1211 				     pairpool,diagpool,cellpool,
1212 				     /*localp*/true,/*skip_repetitive_p*/true,
1213 				     /*favor_right_p*/false,/*max_nalignments*/MAX_NALIGNMENTS,debug_graphic_p,
1214 				     worker_stopwatch,diag_debug);
1215   debug(printf("End of Stage2_compute\n"));
1216 
1217 
1218   for (p = all_stage2results; p != NULL; p = List_next(p)) {
1219     stage2 = (Stage2_T) List_head(p);
1220     stage3middle = Stage3_compute_middle(Stage2_middle(stage2),Stage2_all_starts(stage2),Stage2_all_ends(stage2),
1221 #ifdef PMAP
1222 					 /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1223 					 /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1224 					 /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1225 					 /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1226 #else
1227 					 /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1228 					 /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1229 					 /*querylength*/Sequence_fulllength(queryseq),
1230 #endif
1231 					 chrnum,chroffset,chrhigh,chrlength,
1232 					 watsonp,genestrand,/*jump_late_p*/watsonp ? false : true,maxpeelback,
1233 					 oligoindices_minor,diagpool,cellpool,
1234 					 pairpool,dynprogL,dynprogM,dynprogR,sense_try);
1235     Stage2_free(&stage2);
1236     all_stage3middle_results = List_push(all_stage3middle_results,(void *) stage3middle);
1237   }
1238   List_free(&all_stage2results);
1239 
1240   return List_append(all_stage3middle_results,stage3middle_list);
1241 }
1242 
1243 
1244 
1245 /* Combination of update_stage3middle_list and Stage3_compute_ends,
1246    Needed for solving middle segments of chimeras */
1247 static List_T
update_stage3list(List_T stage3list,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,int straintype,char * strain,Chrnum_T chrnum,Univcoord_T chroffset,Univcoord_T chrhigh,Chrpos_T chrlength,Chrpos_T chrstart,Chrpos_T chrend,bool watsonp,int genestrand,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1248 update_stage3list (List_T stage3list, Sequence_T queryseq,
1249 #ifdef PMAP
1250 		   Sequence_T queryntseq,
1251 #endif
1252 		   Sequence_T queryuc, Stage2_alloc_T stage2_alloc,
1253 		   Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1254 		   Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool, int straintype, char *strain,
1255 		   Chrnum_T chrnum, Univcoord_T chroffset, Univcoord_T chrhigh, Chrpos_T chrlength,
1256 		   Chrpos_T chrstart, Chrpos_T chrend, bool watsonp, int genestrand,
1257 		   Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1258 		   Stopwatch_T worker_stopwatch) {
1259   /* int stage2_source, stage2_indexsize; */
1260   /* double stage3_runtime; */
1261 
1262 #ifdef PMAP
1263   Sequence_T genomicuc = NULL;
1264   char *genomicseg_ptr = NULL, *genomicuc_ptr = NULL;
1265 #elif defined(EXTRACT_GENOMICSEG)
1266   Sequence_T genomicuc = NULL;
1267 #endif
1268   List_T all_stage2results, p;
1269   Stage2_T stage2;
1270   Stage3_T stage3;
1271 
1272   struct Pair_T *pairarray;
1273   List_T pairs;
1274   int goodness;
1275   int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1276     ncanonical, nsemicanonical, nnoncanonical;
1277   int sensedir;
1278   int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1279   Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1280   double ambig_prob_5, ambig_prob_3;
1281   double min_splice_prob;
1282 #ifdef PMAP
1283   int subseq_offset;
1284 #endif
1285 
1286 
1287 #ifdef PMAP_OLD
1288   /* Previously used for PMAP */
1289   if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1290     genomicuc = Sequence_alias(genomicseg);
1291   } else {
1292     genomicuc = Sequence_uppercase(genomicseg);
1293   }
1294   genomicseg_ptr = Sequence_fullpointer(genomicseg);
1295   genomicuc_ptr = Sequence_fullpointer(genomicuc);
1296 #elif defined(EXTRACT_GENOMICSEG)
1297   if (user_genomicseg == NULL && uncompressedp == false && straintype == 0) {
1298     genomicuc = Sequence_alias(genomicseg);
1299   } else {
1300     genomicuc = Sequence_uppercase(genomicseg);
1301   }
1302   genomicseg_ptr = Sequence_fullpointer(genomicseg);
1303   genomicuc_ptr = Sequence_fullpointer(genomicuc);
1304 #endif
1305 
1306 #if 0
1307   if (canonical_mode == 0) {
1308     do_final_p = false;
1309   } else if (canonical_mode == 1) {
1310     do_final_p = true;
1311   } else if (lowidentityp == false) {
1312     do_final_p = false;
1313   } else {
1314     do_final_p = true;
1315   }
1316 #endif
1317 
1318   debug(printf("Entering update_stage3list with %d results\n",List_length(stage3list)));
1319   debug2(printf("Beginning Stage2_compute with chrstart %u and chrend %u and query_subseq_offset %d\n",
1320 		chrstart,chrend,Sequence_subseq_offset(queryseq)));
1321   all_stage2results = Stage2_compute(Sequence_trimpointer(queryseq),Sequence_trimpointer(queryuc),
1322 				     Sequence_trimlength(queryseq),/*query_offset*/0,
1323 				     chrstart,chrend,chroffset,chrhigh,/*plusp*/watsonp,genestrand,
1324 				     stage2_alloc,/*proceed_pctcoverage*/0.3,oligoindices_major,
1325 				     pairpool,diagpool,cellpool,
1326 				     /*localp*/true,/*skip_repetitive_p*/true,
1327 				     /*favor_right_p*/false,/*max_nalignments*/MAX_NALIGNMENTS,debug_graphic_p,
1328 				     worker_stopwatch,diag_debug);
1329 
1330   debug(printf("End of Stage2_compute\n"));
1331 
1332   for (p = all_stage2results; p != NULL; p = List_next(p)) {
1333     stage2 = (Stage2_T) List_head(p);
1334 
1335     /* Stopwatch_start(worker_stopwatch); */
1336 #ifdef PMAP
1337     subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1338 #endif
1339     pairarray = Stage3_compute_one(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1340 				   &matches,&nmatches_posttrim,&max_match_length,
1341 				   &ambig_end_length_5,&ambig_end_length_3,
1342 				   &ambig_splicetype_5,&ambig_splicetype_3,
1343 				   &ambig_prob_5,&ambig_prob_3,
1344 				   &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1345 				   &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1346 				   Stage2_middle(stage2),Stage2_all_starts(stage2),Stage2_all_ends(stage2),
1347 #ifdef PMAP
1348 				   /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1349 				   /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1350 				   /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1351 				   /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1352 				   /*skiplength*/Sequence_skiplength(queryntseq),
1353 				   /*query_subseq_offset*/subseq_offset,
1354 #else
1355 				   /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1356 				   /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1357 				   /*querylength*/Sequence_fulllength(queryseq),
1358 				   /*skiplength*/Sequence_skiplength(queryseq),
1359 				   /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1360 #endif
1361 				   chrnum,chroffset,chrhigh,
1362 				   /*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1363 				   watsonp,genestrand,/*jump_late_p*/watsonp ? false : true,maxpeelback,
1364 				   oligoindices_minor,diagpool,cellpool,
1365 				   pairpool,dynprogL,dynprogM,dynprogR,sense_try,sense_filter);
1366     /* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1367     if (pairarray == NULL) {
1368       /* Skip */
1369     } else if (matches < min_matches) {
1370       FREE_OUT(pairarray);
1371     } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1372 				    matches,unknowns,mismatches,
1373 				    qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1374 				    chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
1375 				    /*querylength*/Sequence_fulllength(queryseq),
1376 				    /*skiplength*/Sequence_skiplength(queryseq),
1377 				    /*trimlength*/Sequence_trimlength(queryseq),
1378 				    straintype,strain,altstrain_iit)) != NULL) {
1379       debug(printf("Pushing %p onto stage3list\n",stage3));
1380       stage3list = List_push(stage3list,(void *) stage3);
1381     }
1382 
1383     Stage2_free(&stage2);
1384   }
1385 
1386   List_free(&all_stage2results);
1387 
1388 #ifdef PMAP_OLD
1389   Sequence_free(&genomicuc);
1390 #elif defined(EXTRACT_GENOMICSEG)
1391   Sequence_free(&genomicuc);
1392 #endif
1393 
1394   return stage3list;
1395 }
1396 
1397 
1398 
#if 0
/* This code is duplicated in get-genome.c */
/* qsort-style comparator over int indices into altstrain_iit.  Orders
   by ascending interval type, and within the same type by descending
   low position. */
static int
index_compare (const void *a, const void *b) {
  int lhs = * (int *) a;
  int rhs = * (int *) b;
  int lhs_type, rhs_type;
  Chrpos_T lhs_low, rhs_low;

  lhs_type = Interval_type(IIT_interval(altstrain_iit,lhs));
  rhs_type = Interval_type(IIT_interval(altstrain_iit,rhs));

  if (lhs_type < rhs_type) {
    return -1;
  }
  if (lhs_type > rhs_type) {
    return +1;
  }

  /* Store in descending genomic position, so right shifting works
     in Genome_patch_strain */
  lhs_low = Interval_low(IIT_interval(altstrain_iit,lhs));
  rhs_low = Interval_low(IIT_interval(altstrain_iit,rhs));

  if (lhs_low > rhs_low) {
    return -1;
  }
  if (lhs_low < rhs_low) {
    return +1;
  }
  return 0;
}
#endif
1431 
1432 
1433 /* Not sure how to treat genestrand for usersegment */
1434 static Stage3_T *
stage3_from_usersegment(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1435 stage3_from_usersegment (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
1436 			 Sequence_T queryseq, Sequence_T queryuc,
1437 #ifdef PMAP
1438 			 Sequence_T queryntseq,
1439 #endif
1440 			 Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
1441 			 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1442 			 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1443 			 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1444 			 Stopwatch_T worker_stopwatch) {
1445   List_T stage3list, stage3middle_list, p;
1446   Stage3middle_T stage3middle;
1447   Stage3_T stage3;
1448   bool watsonp;
1449 
1450   struct Pair_T *pairarray;
1451   List_T pairs;
1452   int goodness;
1453   int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1454     ncanonical, nsemicanonical, nnoncanonical;
1455   int sensedir;
1456   int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1457   Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1458   double ambig_prob_5, ambig_prob_3;
1459   double min_splice_prob;
1460 #ifdef PMAP
1461   int subseq_offset;
1462 #endif
1463 
1464   Univcoord_T chroffset, chrhigh;
1465   Chrpos_T chrlength, chrpos;
1466   Chrnum_T chrnum = 0;
1467 
1468 #ifdef PMAP
1469   Sequence_T revcomp;
1470 #endif
1471 
1472   chroffset = chrpos = 0U;
1473   chrhigh = chrlength = Sequence_fulllength(usersegment);
1474 
1475   stage3middle_list = update_stage3middle_list(/*stage3middle_list*/NULL,queryseq,
1476 #ifdef PMAP
1477 					       queryntseq,
1478 #endif
1479 					       queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1480 					       pairpool,diagpool,cellpool,chrnum,chroffset,chrhigh,chrlength,
1481 					       /*chrstart*/0,/*chrend*/chrhigh,/*watsonp*/true,
1482 					       /*genestrand for usersegment*/0,
1483 					       dynprogL,dynprogM,dynprogR,worker_stopwatch);
1484 
1485 #ifdef PMAP
1486   revcomp = Sequence_revcomp(usersegment);
1487 #endif
1488 
1489   stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1490 #ifdef PMAP
1491 					       queryntseq,
1492 #endif
1493 					       queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1494 					       pairpool,diagpool,cellpool,chrnum,chroffset,chrhigh,chrlength,
1495 					       /*chrstart*/0,/*chrend*/chrhigh,/*watsonp*/false,
1496 					       /*genestrand for usersegment*/0,
1497 					       dynprogL,dynprogM,dynprogR,worker_stopwatch);
1498 
1499 #ifdef PMAP
1500   Sequence_free(&revcomp);
1501 #endif
1502 
1503   if (stage3middle_list == NULL) {
1504     *npaths_primary = *npaths_altloc = 0;
1505     return (Stage3_T *) NULL;
1506 
1507   } else {
1508     stage3list = (List_T) NULL;
1509     for (p = stage3middle_list; p != NULL; p = List_next(p)) {
1510       stage3middle = (Stage3middle_T) List_head(p);
1511       watsonp = Stage3middle_watsonp(stage3middle);
1512 
1513 #ifdef PMAP
1514       subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1515 #endif
1516       pairarray = Stage3_compute_ends(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1517 				      &matches,&nmatches_posttrim,&max_match_length,
1518 				      &ambig_end_length_5,&ambig_end_length_3,
1519 				      &ambig_splicetype_5,&ambig_splicetype_3,
1520 				      &ambig_prob_5,&ambig_prob_3,
1521 				      &unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1522 				      &ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1523 				      stage3middle,
1524 #ifdef PMAP
1525 				      /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1526 				      /*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1527 				      /*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1528 				      /*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1529 				      /*skiplength*/Sequence_skiplength(queryntseq),
1530 				      /*query_subseq_offset*/subseq_offset,
1531 #else
1532 				      /*queryseq_ptr*/Sequence_fullpointer(queryseq),
1533 				      /*queryuc_ptr*/Sequence_fullpointer(queryuc),
1534 				      /*querylength*/Sequence_fulllength(queryseq),
1535 				      /*skiplength*/Sequence_skiplength(queryseq),
1536 				      /*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1537 #endif
1538 				      /*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1539 				      maxpeelback,pairpool,dynprogL,dynprogM,dynprogR,
1540 				      sense_filter,oligoindices_minor,diagpool,cellpool);
1541       /* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1542       if (pairarray == NULL) {
1543 	/* Skip */
1544       } else if (matches < min_matches) {
1545 	FREE_OUT(pairarray);
1546       } else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1547 				      matches,unknowns,mismatches,
1548 				      qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1549 				      chrnum,chroffset,chrhigh,chrlength,watsonp,/*genestrand for usersegment*/0,
1550 				      /*querylength*/Sequence_fulllength(queryseq),
1551 				      /*skiplength*/Sequence_skiplength(queryseq),
1552 				      /*trimlength*/Sequence_trimlength(queryseq),
1553 				      /*straintype*/0,/*strain*/NULL,altstrain_iit)) != NULL) {
1554 	debug(printf("Pushing %p onto stage3list\n",stage3));
1555 	stage3list = List_push(stage3list,(void *) stage3);
1556       }
1557       Stage3middle_free(&stage3middle);
1558     }
1559     List_free(&stage3middle_list);
1560 
1561     return stage3array_from_list(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),
1562 				 stage3list,/*chimerap*/false,/*remove_overlaps_p*/true);
1563   }
1564 }
1565 
1566 
#if 0
/* Consumes stage3list and returns a list with one Stage3_T per group
   of entries that compare equal under Stage3_position_cmp.  Within a
   group, the entry with the lowest Stage3_goodness value is kept, with
   ties broken by the shorter Stage3_genomiclength; the others are
   freed.  Lists of length 0 or 1 are returned without sorting. */
static List_T
stage3list_remove_duplicates (List_T stage3list) {
  List_T unique = NULL;
  Stage3_T *array;
  int best_score;
  Chrpos_T shortest_genomiclength;
  int n, besti, i, j, k;

  n = List_length(stage3list);
  if (n == 0) {
    return (List_T) NULL;
  }
  if (n == 1) {
    return stage3list;
  }

  array = (Stage3_T *) List_to_array(stage3list,NULL);
  List_free(&stage3list);
  qsort(array,n,sizeof(Stage3_T),Stage3_position_cmp);

  /* Each pass handles one group [i, j) of position-equal entries */
  for (i = 0; i < n; i = j) {
    best_score = Stage3_goodness(array[i]);
    shortest_genomiclength = Stage3_genomiclength(array[i]);
    besti = i;
    debug3(printf("i = %d, score %d, genomiclength %u\n",
                  i,best_score,shortest_genomiclength));

    for (j = i + 1; j < n && Stage3_position_cmp(&(array[i]),&(array[j])) == 0; j++) {
      debug3(printf("  j = %d, score %d, genomiclength %u\n",
                    j,Stage3_goodness(array[j]),Stage3_genomiclength(array[j])));

      if (Stage3_goodness(array[j]) < best_score ||
          (Stage3_goodness(array[j]) == best_score &&
           Stage3_genomiclength(array[j]) < shortest_genomiclength)) {
        best_score = Stage3_goodness(array[j]);
        shortest_genomiclength = Stage3_genomiclength(array[j]);
        besti = j;
      }
    }
    debug3(printf("  => besti = %d, score %d, genomiclength %u\n",
                  besti,best_score,shortest_genomiclength));

    /* Keep the chosen entry of the group; free the rest */
    for (k = i; k < j; k++) {
      if (k != besti) {
        Stage3_free(&(array[k]));
      } else {
        unique = List_push(unique,(void *) array[besti]);
      }
    }
  }

  FREE(array);

  return unique;
}
#endif
1632 
1633 
#if 0
/* Consumes stage3list and returns a new list holding only the entries
   that have pairs; entries without pairs are freed.

   NOTE(review): the previous test was "Stage3_pairs == NULL", which
   compares the bare identifier rather than a value taken from stage3.
   It is changed here to call Stage3_pairs(stage3), assuming a
   one-argument accessor like Stage3_npairs -- confirm against stage3.h
   before re-enabling this code. */
static List_T
stage3list_remove_empties (List_T stage3list) {
  List_T nonempty = NULL, p;
  Stage3_T stage3;

  for (p = stage3list; p != NULL; p = List_next(p)) {
    stage3 = (Stage3_T) List_head(p);
    if (Stage3_pairs(stage3) == NULL) {
      debug2(printf("Removing empty stage3 %p\n",stage3));
      Stage3_free(&stage3);
    } else {
      nonempty = List_push(nonempty,(void *) stage3);
    }
  }

  /* Release the cells of the input list, as stage3list_sort does */
  List_free(&stage3list);

  return nonempty;
}
#endif
1653 
1654 
1655 static List_T
stage3list_sort(List_T stage3list)1656 stage3list_sort (List_T stage3list) {
1657   List_T sorted = NULL;
1658   Stage3_T *array;
1659   int n, i;
1660 
1661   if ((n = List_length(stage3list)) == 0) {
1662     return (List_T) NULL;
1663   } else if (n == 1) {
1664     return stage3list;
1665   } else {
1666     array = (Stage3_T *) List_to_array(stage3list,NULL);
1667     List_free(&stage3list);
1668     qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1669     for (i = n-1; i >= 0; i--) {
1670       sorted = List_push(sorted,(void *) array[i]);
1671     }
1672     FREE(array);
1673 
1674     return sorted;
1675   }
1676 }
1677 
1678 
1679 static List_T
stage3list_filter_and_sort(Chimera_T * chimera,List_T stage3list)1680 stage3list_filter_and_sort (Chimera_T *chimera, List_T stage3list) {
1681   List_T sorted = NULL;
1682   Stage3_T *array, stage3;
1683   int n, i;
1684 
1685   if ((n = List_length(stage3list)) == 0) {
1686     return (List_T) NULL;
1687 
1688   } else if (n == 1) {
1689     stage3 = (Stage3_T) List_head(stage3list);
1690     if (Stage3_passes_filter(stage3,min_trimmed_coverage,min_identity) == false) {
1691       Stage3_free(&stage3);
1692       List_free(&stage3list);
1693       return (List_T) NULL;
1694     } else {
1695       return stage3list;
1696     }
1697 
1698   } else if (*chimera == NULL) {
1699     array = (Stage3_T *) List_to_array(stage3list,NULL);
1700     List_free(&stage3list);
1701     qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1702     for (i = n-1; i >= 0; i--) {
1703       if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1704 	Stage3_free(&(array[i]));
1705       } else {
1706 	sorted = List_push(sorted,(void *) array[i]);
1707       }
1708     }
1709     FREE(array);
1710     return sorted;
1711 
1712   } else if (Stage3_passes_filter_chimera(*chimera,min_trimmed_coverage,min_identity) == true) {
1713     array = (Stage3_T *) List_to_array(stage3list,NULL);
1714     List_free(&stage3list);
1715     qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1716     for (i = n-1; i >= 0; i--) {
1717       if (Stage3_chimera_left_p(array[i]) == true) {
1718 	sorted = List_push(sorted,(void *) array[i]);
1719       } else if (Stage3_chimera_right_p(array[i]) == true) {
1720 	sorted = List_push(sorted,(void *) array[i]);
1721       } else if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1722 	Stage3_free(&(array[i]));
1723       } else {
1724 	sorted = List_push(sorted,(void *) array[i]);
1725       }
1726     }
1727     FREE(array);
1728     return sorted;
1729 
1730   } else {
1731     array = (Stage3_T *) List_to_array(stage3list,NULL);
1732     List_free(&stage3list);
1733     qsort(array,n,sizeof(Stage3_T),Stage3_cmp);
1734     for (i = n-1; i >= 0; i--) {
1735       if (Stage3_chimera_left_p(array[i]) == true) {
1736 	Stage3_free(&(array[i]));
1737       } else if (Stage3_chimera_right_p(array[i]) == true) {
1738 	Stage3_free(&(array[i]));
1739       } else if (Stage3_passes_filter(array[i],min_trimmed_coverage,min_identity) == false) {
1740 	Stage3_free(&(array[i]));
1741       } else {
1742 	sorted = List_push(sorted,(void *) array[i]);
1743       }
1744     }
1745     FREE(array);
1746 
1747     Chimera_free(&(*chimera));
1748     *chimera = (Chimera_T) NULL;
1749 
1750     return sorted;
1751   }
1752 }
1753 
1754 
1755 /* Each gregion has its own genestrand */
1756 static List_T
stage3_from_gregions(List_T stage3list,List_T gregions,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)1757 stage3_from_gregions (List_T stage3list, List_T gregions,
1758 		      Sequence_T queryseq, Sequence_T queryuc,
1759 #ifdef PMAP
1760 		      Sequence_T queryntseq,
1761 #endif
1762 		      Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
1763 		      Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
1764 		      Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1765 		      Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1766 		      Stopwatch_T worker_stopwatch) {
1767   Gregion_T gregion, *gregion_array;
1768   int ngregions, ncovered, max_ncovered, stage2_source;
1769   int n, i;
1770 
1771   List_T stage3middle_list = NULL;
1772   Stage3middle_T stage3middle, *stage3middle_array;
1773   Stage3_T stage3;
1774   bool watsonp;
1775   int genestrand;
1776 
1777   Chrnum_T chrnum;
1778   Univcoord_T chroffset, chrhigh;
1779   Chrpos_T chrlength;
1780 
1781   struct Pair_T *pairarray;
1782   List_T pairs;
1783   int goodness, best_score;
1784   int npairs, cdna_direction, matches, unknowns, mismatches, qopens, qindels, topens, tindels,
1785     ncanonical, nsemicanonical, nnoncanonical;
1786   int sensedir;
1787   int nmatches_posttrim, max_match_length, ambig_end_length_5, ambig_end_length_3;
1788   Splicetype_T ambig_splicetype_5, ambig_splicetype_3;
1789   double ambig_prob_5, ambig_prob_3;
1790   double min_splice_prob;
1791 #ifdef PMAP
1792   int subseq_offset;
1793 #endif
1794 
1795 #if 0
1796   int *indexarray, nindices, straintype, j;
1797 #endif
1798   void *item;
1799 
1800 #ifdef EXTRACT_GENOMICSEG
1801   genomicuc_ptr = Sequence_fullpointer(genomicuc);
1802   Sequence_T genomicseg = NULL, genomicuc = NULL;
1803 #endif
1804 
1805   if (usersegment == NULL && (ngregions = List_length(gregions)) > 0) {
1806     gregion_array = (Gregion_T *) List_to_array(gregions,NULL);
1807     List_free(&gregions);
1808 
1809     for (i = 0; i < ngregions; i++) {
1810       gregion = gregion_array[i];
1811 
1812 #if defined(EXTRACT_GENOMICSEG)
1813       genomicseg = Genome_get_segment(genome,Gregion_genomicstart(gregion),Gregion_genomiclength(gregion),
1814 				      /*chromosome_iit*/NULL,Gregion_revcompp(gregion));
1815       genomicuc = Sequence_uppercase(genomicseg);
1816       genomicuc_ptr = Sequence_fullpointer(genomicuc);
1817 #endif
1818       ncovered = Stage2_scan(&stage2_source,Sequence_trimpointer(queryuc),Sequence_trimlength(queryseq),
1819 			     Gregion_chrstart(gregion),Gregion_chrend(gregion),
1820 			     Gregion_chroffset(gregion),Gregion_chrhigh(gregion),
1821 			     /*plusp*/Gregion_revcompp(gregion) ? false : true,Gregion_genestrand(gregion),
1822 			     stage2_alloc,oligoindices_major,diagpool,debug_graphic_p);
1823       Gregion_set_ncovered(gregion,ncovered,stage2_source);
1824 #if defined(EXTRACT_GENOMICSEG)
1825       Sequence_free(&genomicuc);
1826       Sequence_free(&genomicseg);
1827 #endif
1828     }
1829     qsort(gregion_array,ngregions,sizeof(Gregion_T),Gregion_cmp);
1830     max_ncovered = Gregion_ncovered(gregion_array[0]);
1831     debug(printf("max_ncovered of gregion_array[0] = %d\n",max_ncovered));
1832     if (max_ncovered < 0.10*Sequence_fulllength(queryseq)) {
1833       debug(printf("coverage is too short, so skipping\n"));
1834       for (i = 0; i < ngregions; i++) {
1835 	Gregion_free(&(gregion_array[i]));
1836       }
1837       FREE(gregion_array);
1838 
1839     } else {
1840       gregions = (List_T) NULL;
1841       i = 0;
1842       while (i < ngregions && Gregion_ncovered(gregion_array[i]) > 0.25*max_ncovered) {
1843 	debug(printf("Keeping %d ncovered relative to %d\n",Gregion_ncovered(gregion_array[i]),max_ncovered));
1844 	gregions = List_push(gregions,(void *) gregion_array[i]);
1845 	i++;
1846       }
1847       while (i < ngregions) {
1848 	debug(printf("Discarding array %d with ncovered = %d\n",i,Gregion_ncovered(gregion_array[i])));
1849 	Gregion_free(&(gregion_array[i]));
1850 	i++;
1851       }
1852       FREE(gregion_array);
1853     }
1854 
1855     while (gregions != NULL) {
1856       gregions = List_pop(gregions,&item);
1857       gregion = (Gregion_T) item;
1858 
1859       /* if (Match_usep(match) == true) { */
1860       if (1) {
1861 	if (usersegment != NULL) {
1862 	  /* chrlength = Sequence_fulllength(usersegment); */
1863 	  /* strain = NULL; */
1864 	  stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1865 #ifdef PMAP
1866 						       queryntseq,
1867 #endif
1868 						       queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1869 						       pairpool,diagpool,cellpool,Gregion_chrnum(gregion),
1870 						       Gregion_chroffset(gregion),Gregion_chrhigh(gregion),Gregion_chrlength(gregion),
1871 						       Gregion_chrstart(gregion),Gregion_chrend(gregion),
1872 						       Gregion_plusp(gregion),Gregion_genestrand(gregion),
1873 						       dynprogL,dynprogM,dynprogR,worker_stopwatch);
1874 	} else {
1875 	  stage3middle_list = update_stage3middle_list(stage3middle_list,queryseq,
1876 #ifdef PMAP
1877 						       queryntseq,
1878 #endif
1879 						       queryuc,stage2_alloc,oligoindices_major,oligoindices_minor,
1880 						       pairpool,diagpool,cellpool,Gregion_chrnum(gregion),
1881 						       Gregion_chroffset(gregion),Gregion_chrhigh(gregion),Gregion_chrlength(gregion),
1882 						       Gregion_chrstart(gregion),Gregion_chrend(gregion),
1883 						       Gregion_plusp(gregion),Gregion_genestrand(gregion),
1884 						       dynprogL,dynprogM,dynprogR,worker_stopwatch);
1885 	}
1886       }
1887       Gregion_free(&gregion);
1888     }
1889 
1890     if (stage3middle_list != NULL) {
1891       stage3middle_array = (Stage3middle_T *) List_to_array_n(&n,stage3middle_list);
1892       qsort(stage3middle_array,n,sizeof(Stage3middle_T),Stage3middle_cmp);
1893       List_free(&stage3middle_list);
1894 
1895       best_score = Stage3middle_goodness(stage3middle_array[0]);
1896       i = 0;
1897 
1898       while (i < n && Stage3middle_goodness(stage3middle_array[i]) > best_score - 20) {
1899 	stage3middle = stage3middle_array[i];
1900 	debug(printf("Processing stage3middle %d with goodness %d\n",i,Stage3middle_goodness(stage3middle)));
1901 
1902 	chrnum = Stage3middle_chrnum(stage3middle);
1903 	chroffset = Stage3middle_chroffset(stage3middle);
1904 	chrhigh = Stage3middle_chrhigh(stage3middle);
1905 	chrlength = Stage3middle_chrlength(stage3middle);
1906 	watsonp = Stage3middle_watsonp(stage3middle);
1907 	genestrand = Stage3middle_genestrand(stage3middle);
1908 
1909 #ifdef PMAP
1910 	subseq_offset = Sequence_subseq_offset(queryseq); /* in nucleotides */
1911 #endif
1912 	pairarray = Stage3_compute_ends(&cdna_direction,&sensedir,&pairs,&npairs,&goodness,
1913 					&matches,&nmatches_posttrim,&max_match_length,
1914 					&ambig_end_length_5,&ambig_end_length_3,
1915 					&ambig_splicetype_5,&ambig_splicetype_3,
1916 					&ambig_prob_5,&ambig_prob_3,
1917 					&unknowns,&mismatches,&qopens,&qindels,&topens,&tindels,
1918 					&ncanonical,&nsemicanonical,&nnoncanonical,&min_splice_prob,
1919 					stage3middle,
1920 #ifdef PMAP
1921 					/*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
1922 					/*queryseq_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1923 					/*queryuc_ptr*/Sequence_subseq_pointer(queryntseq,subseq_offset),
1924 					/*querylength*/Sequence_subseq_length(queryntseq,subseq_offset),
1925 					/*skiplength*/Sequence_skiplength(queryntseq),
1926 					/*query_subseq_offset*/subseq_offset,
1927 #else
1928 					/*queryseq_ptr*/Sequence_fullpointer(queryseq),
1929 					/*queryuc_ptr*/Sequence_fullpointer(queryuc),
1930 					/*querylength*/Sequence_fulllength(queryseq),
1931 					/*skiplength*/Sequence_skiplength(queryseq),
1932 					/*query_subseq_offset*/Sequence_subseq_offset(queryseq),
1933 #endif
1934 					/*knownsplice_limit_low*/0U,/*knownsplice_limit_high*/-1U,
1935 					maxpeelback,pairpool,dynprogL,dynprogM,dynprogR,
1936 					sense_filter,oligoindices_minor,diagpool,cellpool);
1937 	/* stage3_runtime = Stopwatch_stop(worker_stopwatch); */
1938 	if (pairarray == NULL) {
1939 	  /* Skip */
1940 	} else if (matches < min_matches) {
1941 	  FREE_OUT(pairarray);
1942 	} else if ((stage3 = Stage3_new(pairarray,pairs,npairs,goodness,cdna_direction,sensedir,
1943 					matches,unknowns,mismatches,
1944 					qopens,qindels,topens,tindels,ncanonical,nsemicanonical,nnoncanonical,
1945 					chrnum,chroffset,chrhigh,chrlength,watsonp,genestrand,
1946 					/*querylength*/Sequence_fulllength(queryseq),
1947 					/*skiplength*/Sequence_skiplength(queryseq),
1948 					/*trimlength*/Sequence_trimlength(queryseq),
1949 					/*straintype*/0,/*strain*/NULL,altstrain_iit)) != NULL) {
1950 	  debug(printf("Pushing %p onto stage3list\n",stage3));
1951 	  stage3list = List_push(stage3list,(void *) stage3);
1952 	}
1953 	Stage3middle_free(&stage3middle);
1954 	i++;
1955       }
1956 
1957       while (i < n) {
1958 	stage3middle = stage3middle_array[i];
1959 	debug(printf("Ignoring stage3middle %d with goodness %d\n",i,Stage3middle_goodness(stage3middle)));
1960 	Stage3middle_free(&stage3middle);
1961 	i++;
1962       }
1963 
1964       FREE(stage3middle_array);
1965     }
1966   }
1967 
1968 #ifdef PMAP_OLD
1969   Sequence_free(&genomicuc);
1970 #elif defined(EXTRACT_GENOMICSEG)
1971   Sequence_free(&genomicuc);
1972 #endif
1973 
1974   return stage3list;
1975 }
1976 
1977 
1978 static bool
middle_piece_local_p(int * querystart,int * queryend,Chrpos_T * chrstart,Chrpos_T * chrend,Chrnum_T * chrnum,Univcoord_T * chroffset,Univcoord_T * chrhigh,Chrpos_T * chrlength,bool * plusp,int * genestrand,Stage3_T from,Stage3_T to)1979 middle_piece_local_p (int *querystart, int *queryend,
1980 		      Chrpos_T *chrstart, Chrpos_T *chrend,
1981 		      Chrnum_T *chrnum, Univcoord_T *chroffset, Univcoord_T *chrhigh,
1982 		      Chrpos_T *chrlength, bool *plusp, int *genestrand,
1983 		      Stage3_T from, Stage3_T to) {
1984 
1985   debug2(printf("? middle_piece_local_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
1986 		from,Stage3_querystart(from),Stage3_queryend(from),
1987 		Stage3_chrstart(from),Stage3_chrend(from),
1988 		to,Stage3_querystart(to),Stage3_queryend(to),
1989 		Stage3_chrstart(to),Stage3_chrend(to)));
1990 
1991   if (Stage3_chimera_right_p(from) == true) {
1992     debug2(printf("false, because from is already part of a chimera on its right\n"));
1993     return false;
1994 
1995   } else if (Stage3_chimera_left_p(to) == true) {
1996     debug2(printf("false, because to is already part of a chimera on its left\n"));
1997     return false;
1998 
1999   } else if ((*chrnum = Stage3_chrnum(from)) != Stage3_chrnum(to)) {
2000     /* Different chromosomes */
2001     debug2(printf("different chromosomes\n"));
2002     return false;
2003 
2004   } else if (Stage3_watsonp(from) != Stage3_watsonp(to)) {
2005     /* Different strands */
2006     debug2(printf("different strands\n"));
2007     return false;
2008 
2009   } else if (Stage3_genestrand(from) != Stage3_genestrand(to)) {
2010     /* Different genestrands */
2011     debug2(printf("different genestrands\n"));
2012     return false;
2013 
2014   } else if (Stage3_querystart(to) <= Stage3_queryend(from) + CHIMERA_SLOP) {
2015     /* Already joinable */
2016     debug2(printf("wrong query order or already joinable\n"));
2017     return false;
2018 
2019   } else if ((*plusp = Stage3_watsonp(from)) == true) {
2020     if (Stage3_chrend(from) < Stage3_chrstart(to) &&
2021 	Stage3_chrend(from) + 1000000 > Stage3_chrstart(to)) {
2022       debug2(printf("true, because %u < %u and %u + %u > %u\n",
2023 		    Stage3_chrend(from),Stage3_chrstart(to),
2024 		    Stage3_chrend(from),1000000,Stage3_chrstart(to)));
2025       Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),chromosome_iit,
2026 			       *chrnum,circular_typeint);
2027       *querystart = Stage3_queryend(from);
2028       *queryend = Stage3_querystart(to);
2029       *chrstart = Stage3_chrend(from);
2030       *chrend = Stage3_chrstart(to);
2031       *genestrand = Stage3_genestrand(from);
2032       return true;
2033     } else {
2034       debug2(printf("false, watsonp true, from_end %u, to start %u\n",
2035 		    Stage3_chrend(from),Stage3_chrstart(to)));
2036       return false;
2037     }
2038 
2039   } else {
2040     if (Stage3_chrstart(to) < Stage3_chrend(from) &&
2041 	Stage3_chrstart(to) + 1000000 > Stage3_chrend(from)) {
2042       debug2(printf("true, because %u < %u and %u + %u > %u\n",
2043 		    Stage3_chrstart(to),Stage3_chrend(from),
2044 		    Stage3_chrstart(to),1000000,Stage3_chrend(from)));
2045       Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),chromosome_iit,
2046 			       *chrnum,circular_typeint);
2047       *querystart = Stage3_queryend(from);
2048       *queryend = Stage3_querystart(to);
2049       *chrstart = Stage3_chrstart(to);
2050       *chrend = Stage3_chrend(from);
2051       *genestrand = Stage3_genestrand(from);
2052       return true;
2053     } else {
2054       debug2(printf("false, watsonp false, from_end %u, to start %u\n",
2055 		    Stage3_chrend(from),Stage3_chrstart(to)));
2056       return false;
2057     }
2058   }
2059 }
2060 
2061 
2062 static bool
middle_piece_chimera_p(int * querystart,int * queryend,Stage3_T from,Stage3_T to)2063 middle_piece_chimera_p (int *querystart, int *queryend, Stage3_T from, Stage3_T to) {
2064 
2065   debug2(printf("? middle_piece_chimera_p from [%p] %d..%d (%u..%u) -> to [%p] %d..%d (%u..%u) => ",
2066 		from,Stage3_querystart(from),Stage3_queryend(from),
2067 		Stage3_chrstart(from),Stage3_chrend(from),
2068 		to,Stage3_querystart(to),Stage3_queryend(to),
2069 		Stage3_chrstart(to),Stage3_chrend(to)));
2070 
2071   if (Stage3_chimera_right_p(from) == true) {
2072     debug2(printf("false, because from is already part of a chimera on its right\n"));
2073     return false;
2074 
2075   } else if (Stage3_chimera_left_p(to) == true) {
2076     debug2(printf("false, because to is already part of a chimera on its left\n"));
2077     return false;
2078 
2079   } else if (Stage3_querystart(to) <= Stage3_queryend(from) + CHIMERA_SLOP) {
2080     /* Already joinable */
2081     debug2(printf("wrong query order or already joinable\n"));
2082     return false;
2083 
2084   } else {
2085     *querystart = Stage3_queryend(from);
2086     *queryend = Stage3_querystart(to);
2087     return true;
2088   }
2089 }
2090 
2091 
2092 /* Does not alter stage3list.  Puts Stage3_T objects into stage3array_sub1, stage3array_sub2, or both */
2093 static void
local_separate_paths(Stage3_T ** stage3array_sub1,int * npaths_sub1,Stage3_T ** stage3array_sub2,int * npaths_sub2,List_T stage3list)2094 local_separate_paths (Stage3_T **stage3array_sub1, int *npaths_sub1,
2095 		      Stage3_T **stage3array_sub2, int *npaths_sub2,
2096 		      List_T stage3list) {
2097   List_T p;
2098   Stage3_T from, to, stage3;
2099   Stage3_T *by_queryend, *by_querystart;
2100   Chrnum_T chrnum;
2101   int npaths, i, j, k, kstart, kend;
2102   int queryend;
2103 
2104   debug2(printf("local_separate_paths called with list length %d\n",List_length(stage3list)));
2105 #ifdef DEBUG2
2106   for (p = stage3list; p != NULL; p = List_next(p)) {
2107     printf("%p\n",List_head(p));
2108   }
2109 #endif
2110 
2111   if (stage3list == NULL) {
2112     *stage3array_sub1 = (Stage3_T *) NULL;
2113     *npaths_sub1 = 0;
2114     *stage3array_sub2 = (Stage3_T *) NULL;
2115     *npaths_sub2 = 0;
2116     return;
2117 
2118   } else {
2119     for (p = stage3list; p != NULL; p = List_next(p)) {
2120       stage3 = (Stage3_T) List_head(p);
2121       Stage3_clear_joinable(stage3);
2122     }
2123   }
2124 
2125   by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2126   qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_chrnum_queryend_cmp);
2127 
2128   by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2129   qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_chrnum_querystart_cmp);
2130 
2131 #ifdef DEBUG2
2132   for (i = 0; i < npaths; i++) {
2133     stage3 = (Stage3_T) by_queryend[i];
2134     printf("from: %p query %d..%d, chrnum %d, genomic %u..%u\t",
2135 	   stage3,Stage3_querystart(stage3),Stage3_queryend(stage3),
2136 	   Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2137 
2138     stage3 = (Stage3_T) by_querystart[i];
2139     printf("to: %p query %d..%d, chrnum %d, genomic %u..%u\n",
2140 	   stage3,Stage3_querystart(stage3),Stage3_queryend(stage3),
2141 	   Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2142   }
2143 #endif
2144 
2145   kend = 0;
2146   for (i = 0; i < npaths; i++) {
2147     debug2(printf("queryend %d:",i));
2148     from = by_queryend[i];
2149 
2150     /* Find matching chromosomal bounds for querystart */
2151     chrnum = Stage3_chrnum(from);
2152     while (kend < npaths && Stage3_chrnum(by_querystart[kend]) == chrnum) {
2153       kend++;
2154     }
2155     kstart = kend - 1;
2156     while (kstart >= 0 && Stage3_chrnum(by_querystart[kstart]) == chrnum) {
2157       kstart--;
2158     }
2159     kstart++;
2160     debug2(printf(" querystart bounded by %d..%d:",kstart,kend));
2161 
2162 
2163     /* Find matching querystart */
2164     queryend = Stage3_queryend(from);
2165     j = kstart;
2166     while (j < kend && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2167       j++;
2168     }
2169     j--;
2170 
2171     while (j >= kstart && Stage3_querystart(by_querystart[j]) > queryend - CHIMERA_SLOP) {
2172       j--;
2173     }
2174     j++;
2175 
2176     while (j < kend && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2177       to = by_querystart[j];
2178 
2179       debug2(printf(" %d",j));
2180       if (Chimera_local_join_p(from,to,CHIMERA_SLOP) == true) {
2181 	debug2(printf("(to %d)",i));
2182 	Stage3_set_joinable_left(from);
2183 	Stage3_set_joinable_right(to);
2184       }
2185 
2186       j++;
2187     }
2188     debug2(printf("\n"));
2189   }
2190 
2191   FREE(by_querystart);
2192   FREE(by_queryend);
2193 
2194 
2195   *npaths_sub1 = *npaths_sub2 = 0;
2196   for (p = stage3list; p != NULL; p = List_next(p)) {
2197     stage3 = (Stage3_T) List_head(p);
2198     debug2(printf("Stage3 %p.  joinable_left_p %d, joinable_right_p %d\n",
2199 		  stage3,Stage3_joinable_left_p(stage3),Stage3_joinable_right_p(stage3)));
2200     if (Stage3_joinable_left_p(stage3) == true) {
2201       debug2(printf("Putting stage3 %p into local sub1\n",stage3));
2202       (*npaths_sub1)++;
2203     }
2204     if (Stage3_joinable_right_p(stage3) == true) {
2205       debug2(printf("Putting stage3 %p into local sub2\n",stage3));
2206       (*npaths_sub2)++;
2207     }
2208   }
2209 
2210   if (*npaths_sub1 == 0 || *npaths_sub2 == 0) {
2211     *stage3array_sub1 = (Stage3_T *) NULL;
2212     *npaths_sub1 = 0;
2213     *stage3array_sub2 = (Stage3_T *) NULL;
2214     *npaths_sub2 = 0;
2215 
2216   } else {
2217     *stage3array_sub1 = (Stage3_T *) MALLOC((*npaths_sub1) * sizeof(Stage3_T)); /* Return value */
2218     *stage3array_sub2 = (Stage3_T *) MALLOC((*npaths_sub2) * sizeof(Stage3_T)); /* Return value */
2219     j = k = 0;
2220     for (p = stage3list; p != NULL; p = List_next(p)) {
2221       stage3 = (Stage3_T) List_head(p);
2222       /* Note: it is possible that the same stage3 object gets put into both lists */
2223       if (Stage3_joinable_left_p(stage3) == true) {
2224 	debug2(printf("Putting %p into sub1\n",stage3));
2225 	(*stage3array_sub1)[j++] = stage3;
2226       }
2227       if (Stage3_joinable_right_p(stage3) == true) {
2228 	debug2(printf("Putting %p into sub2\n",stage3));
2229 	(*stage3array_sub2)[k++] = stage3;
2230       }
2231     }
2232   }
2233 
2234   debug2(printf("local_separate_paths returning %d paths\n",List_length(stage3list)));
2235 #ifdef DEBUG2
2236   for (p = stage3list; p != NULL; p = List_next(p)) {
2237     stage3 = (Stage3_T) List_head(p);
2238     printf("%p %p\n",stage3,Stage3_pairs(stage3));
2239   }
2240 #endif
2241 
2242   return;
2243 }
2244 
2245 
2246 /* Does not alter stage3list.  Puts Stage3_T objects into stage3array_sub1, stage3array_sub2, or both */
2247 static void
distant_separate_paths(Stage3_T ** stage3array_sub1,int * npaths_sub1,Stage3_T ** stage3array_sub2,int * npaths_sub2,List_T stage3list)2248 distant_separate_paths (Stage3_T **stage3array_sub1, int *npaths_sub1,
2249 			Stage3_T **stage3array_sub2, int *npaths_sub2,
2250 			List_T stage3list) {
2251   List_T p;
2252   Stage3_T from, to, stage3;
2253   Stage3_T *by_queryend, *by_querystart;
2254   int npaths, i, j, k;
2255   int queryend;
2256 
2257   debug2(printf("distant_separate_paths called with list length %d\n",List_length(stage3list)));
2258 #ifdef DEBUG2
2259   for (p = stage3list; p != NULL; p = List_next(p)) {
2260     stage3 = (Stage3_T) List_head(p);
2261     printf("%p %p\n",stage3,Stage3_pairs(stage3));
2262   }
2263 #endif
2264 
2265 
2266   if (stage3list == NULL) {
2267     *stage3array_sub1 = (Stage3_T *) NULL;
2268     *npaths_sub1 = 0;
2269     *stage3array_sub2 = (Stage3_T *) NULL;
2270     *npaths_sub2 = 0;
2271     return;
2272 
2273   } else {
2274     for (p = stage3list; p != NULL; p = List_next(p)) {
2275       stage3 = (Stage3_T) List_head(p);
2276       Stage3_clear_joinable(stage3);
2277     }
2278   }
2279 
2280   by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2281   qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
2282 
2283   by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
2284   qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
2285 
2286   j = 0;
2287   for (i = 0; i < npaths; i++) {
2288     from = by_queryend[i];
2289     queryend = Stage3_queryend(from);
2290 
2291     while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2292       j++;
2293     }
2294     j--;
2295 
2296     while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend - CHIMERA_SLOP) {
2297       j--;
2298     }
2299     j++;
2300 
2301     while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend + CHIMERA_SLOP) {
2302       to = by_querystart[j];
2303 
2304       if (Chimera_distant_join_p(from,to,CHIMERA_SLOP) == true) {
2305 	debug2(printf("Found distant join from %d to %d\n",i,j));
2306 	Stage3_set_joinable_left(from);
2307 	Stage3_set_joinable_right(to);
2308       }
2309 
2310       j++;
2311     }
2312   }
2313 
2314   FREE(by_querystart);
2315   FREE(by_queryend);
2316 
2317 
2318   *npaths_sub1 = *npaths_sub2 = 0;
2319   for (p = stage3list; p != NULL; p = List_next(p)) {
2320     stage3 = (Stage3_T) List_head(p);
2321     if (Stage3_joinable_left_p(stage3) == true) {
2322       (*npaths_sub1)++;
2323     }
2324     if (Stage3_joinable_right_p(stage3) == true) {
2325       (*npaths_sub2)++;
2326     }
2327   }
2328 
2329   if (*npaths_sub1 == 0 || *npaths_sub2 == 0) {
2330     *stage3array_sub1 = (Stage3_T *) NULL;
2331     *npaths_sub1 = 0;
2332     *stage3array_sub2 = (Stage3_T *) NULL;
2333     *npaths_sub2 = 0;
2334   } else {
2335     *stage3array_sub1 = (Stage3_T *) MALLOC((*npaths_sub1) * sizeof(Stage3_T)); /* Return value */
2336     *stage3array_sub2 = (Stage3_T *) MALLOC((*npaths_sub2) * sizeof(Stage3_T)); /* Return value */
2337     j = k = 0;
2338     for (p = stage3list; p != NULL; p = List_next(p)) {
2339       stage3 = (Stage3_T) List_head(p);
2340       /* Note: it is possible that the same stage3 object gets put into both lists */
2341       if (Stage3_joinable_left_p(stage3) == true) {
2342 	debug2(printf("Putting stage3 %p into distant sub1\n",stage3));
2343 	(*stage3array_sub1)[j++] = stage3;
2344       }
2345       if (Stage3_joinable_right_p(stage3) == true) {
2346 	debug2(printf("Putting stage3 %p into distant sub2\n",stage3));
2347 	(*stage3array_sub2)[k++] = stage3;
2348       }
2349     }
2350   }
2351 
2352   debug2(printf("distant_separate_paths returning %d paths\n",List_length(stage3list)));
2353 #ifdef DEBUG2
2354   for (p = stage3list; p != NULL; p = List_next(p)) {
2355     stage3 = (Stage3_T) List_head(p);
2356     printf("%p %p\n",stage3,Stage3_pairs(stage3));
2357   }
2358 #endif
2359 
2360   return;
2361 }
2362 
2363 
2364 static List_T
merge_left_and_right_readthrough(bool * mergedp,List_T stage3list,Stage3_T * stage3array_sub1,int bestfrom,Stage3_T * stage3array_sub2,int bestto,int breakpoint,int queryntlength,char * queryaaseq_ptr,Sequence_T queryseq,char * queryseq_ptr,char * queryuc_ptr,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool)2365 merge_left_and_right_readthrough (bool *mergedp, List_T stage3list,
2366 				  Stage3_T *stage3array_sub1, int bestfrom,
2367 				  Stage3_T *stage3array_sub2, int bestto,
2368 				  int breakpoint, int queryntlength,
2369 #ifdef PMAP
2370 				  char *queryaaseq_ptr,
2371 #endif
2372 				  Sequence_T queryseq, char *queryseq_ptr, char *queryuc_ptr,
2373 				  Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
2374 				  Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool) {
2375   Stage3_T stage3, best0, best1;
2376 
2377   best0 = stage3array_sub1[bestfrom];
2378   best1 = stage3array_sub2[bestto];
2379 
2380   debug2(printf("\nEntering merge_left_and_right_readthrough with bestfrom %d: %p, bestto %d: %p\n",
2381 		bestfrom,best0,bestto,best1));
2382   debug2(printf("Running Stage3_merge_local\n"));
2383 
2384   if ((stage3 = Stage3_merge_local(best0,best1,/*minpos1*/0,/*maxpos1*/breakpoint,
2385 				   /*minpos2*/breakpoint+1,/*maxpos2*/queryntlength,queryseq,
2386 #ifdef PMAP
2387 				   queryaaseq_ptr,
2388 #endif
2389 				   queryseq_ptr,queryuc_ptr,
2390 				   oligoindices_minor,diagpool,cellpool,
2391 				   pairpool,dynprogL,dynprogM,dynprogR,maxpeelback)) == NULL) {
2392     *mergedp = false;
2393 
2394   } else {
2395     debug2(printf("Changing genomicend of merged stage3 from %u to %u\n",Stage3_genomicend(stage3),Stage3_genomicend(best1)));
2396     Stage3_set_genomicend(stage3,Stage3_genomicend(best1));
2397     debug(printf("Pushing %p onto newstage3list\n",stage3));
2398     stage3list = List_push(stage3list,(void *) stage3);
2399     debug2(Stage3_print_ends(stage3));
2400     *mergedp = true;
2401   }
2402 
2403   return stage3list;
2404 }
2405 
2406 
2407 #if 0
2408 /* Returns a list with only two Stage3_T objects */
2409 static List_T
2410 merge_left_and_right_transloc (Stage3_T *stage3array_sub1, int npaths_sub1, int bestfrom,
2411 			       Stage3_T *stage3array_sub2, int npaths_sub2, int bestto,
2412 			       List_T stage3list) {
2413   List_T newstage3list, p;
2414   Stage3_T best0, best1, stage3, *array;
2415   int i, k;
2416 
2417   best0 = stage3array_sub1[bestfrom];
2418   best1 = stage3array_sub2[bestto];
2419 
2420   debug2(printf("\nEntering merge_left_and_right_transloc with bestfrom %d: %p, bestto %d: %p, and stage3list %d\n",
2421 		bestfrom,best0,bestto,best1,List_length(stage3list)));
2422 
2423   debug2(printf("Before Stage3_merge_chimera, best0 is %p, query %d..%d\n",
2424 		best0,Stage3_querystart(best0),Stage3_queryend(best0)));
2425   debug2(Stage3_print_ends(best0));
2426   debug2(printf("Before Stage3_merge_chimera, best1 is %p, query %d..%d\n",
2427 		best1,Stage3_querystart(best1),Stage3_queryend(best1)));
2428   debug2(Stage3_print_ends(best1));
2429 
2430   debug2(printf("Rearranging paths\n"));
2431   newstage3list = (List_T) NULL;
2432 
2433   debug(printf("Pushing %p onto newstage3list\n",best0));
2434   debug(printf("Pushing %p onto newstage3list\n",best1));
2435   if (Stage3_npairs(best0) == 0) {
2436     Stage3_free(&best0);
2437     best0 = (Stage3_T) NULL;
2438   } else {
2439     newstage3list = List_push(newstage3list,(void *) best0);
2440     debug2(Stage3_print_ends(best0));
2441   }
2442   if (Stage3_npairs(best1) == 0) {
2443     Stage3_free(&best1);
2444     best1 = (Stage3_T) NULL;
2445   } else {
2446     newstage3list = List_push(newstage3list,(void *) best1);
2447     debug2(Stage3_print_ends(best1));
2448   }
2449 
2450   if (List_length(stage3list) > 2) {
2451     /* Push rest of results, taking care not to have duplicates */
2452     array = (Stage3_T *) MALLOCA((List_length(stage3list) - 2) * sizeof(Stage3_T));
2453     k = 0;
2454     for (p = stage3list; p != NULL; p = List_next(p)) {
2455       stage3 = (Stage3_T) List_head(p);
2456       if (Stage3_npairs(stage3) == 0) {
2457 	Stage3_free(&stage3);
2458       } else if (stage3 == best0 || stage3 == best1) {
2459 	/* Skip */
2460       } else {
2461 	array[k++] = stage3;
2462 	debug(printf("Pushing %p onto newstage3list\n",stage3));
2463 	newstage3list = List_push(newstage3list,(void *) stage3);
2464       }
2465     }
2466     qsort(array,k,sizeof(Stage3_T),Stage3_identity_cmp);
2467     FREEA(array);
2468   }
2469 
2470   List_free(&stage3list);
2471   return List_reverse(newstage3list);
2472 }
2473 #endif
2474 
2475 
2476 static int
find_breakpoint(int * cdna_direction,int * chimerapos,int * chimeraequivpos,int * exonexonpos,char * donor1,char * donor2,char * acceptor2,char * acceptor1,bool * donor_watsonp,bool * acceptor_watsonp,double * donor_prob,double * acceptor_prob,Stage3_T from,Stage3_T to,Sequence_T queryntseq,Sequence_T queryseq,Sequence_T queryuc,int queryntlength,Genome_T genome,Genome_T genomealt,Univ_IIT_T chromosome_iit,Pairpool_T pairpool)2477 find_breakpoint (int *cdna_direction, int *chimerapos, int *chimeraequivpos, int *exonexonpos,
2478 		 char *donor1, char *donor2, char *acceptor2, char *acceptor1,
2479 		 bool *donor_watsonp, bool *acceptor_watsonp, double *donor_prob, double *acceptor_prob,
2480 		 Stage3_T from, Stage3_T to,
2481 #ifdef PMAP
2482 		 Sequence_T queryntseq,
2483 #endif
2484 		 Sequence_T queryseq, Sequence_T queryuc,
2485 		 int queryntlength, Genome_T genome, Genome_T genomealt,
2486 		 Univ_IIT_T chromosome_iit, Pairpool_T pairpool) {
2487   int breakpoint, rangelow, rangehigh, leftpos, rightpos, midpos;
2488   int maxpeelback_from, maxpeelback_to;
2489   int found_cdna_direction, try_cdna_direction;
2490   char comp;			/* Not really used anywhere */
2491 
2492   int queryjump;
2493   int genomejump;
2494   bool max_extend_p;
2495   Chrpos_T left_chrlength, right_chrlength;
2496   Univcoord_T chroffset, chrhigh;
2497 
2498   if (Stage3_queryend(from) < Stage3_querystart(to)) {
2499     /* Gap exists between the two parts */
2500     if ((leftpos = Stage3_queryend(from) - CHIMERA_EXTEND) < 0) {
2501       leftpos = 0;
2502     }
2503     if ((rightpos = Stage3_querystart(to) + CHIMERA_EXTEND) >= queryntlength) {
2504       rightpos = queryntlength - 1;
2505     }
2506     maxpeelback_from = CHIMERA_EXTEND;
2507     maxpeelback_to = CHIMERA_EXTEND;
2508     debug2(printf("overlap: leftpos %d, rightpos %d, queryntlength %d, maxpeelback_from %d, maxpeelback_to %d\n",
2509 		  leftpos,rightpos,queryntlength,maxpeelback_from,maxpeelback_to));
2510 
2511     if (Stage3_watsonp(from) == true && Stage3_watsonp(to) == true) {
2512       queryjump = Stage3_querystart(to) - Stage3_queryend(from) - 1;
2513       genomejump = Stage3_genomicstart(to) - Stage3_genomicend(from) - 1U;
2514       max_extend_p = ((int) genomejump == queryjump) ? false : true;
2515       debug2(printf("gap exists: genomejump = %u, queryjump = %d, max_extend_p = %d\n",genomejump,queryjump,max_extend_p));
2516     } else if (Stage3_watsonp(from) == false && Stage3_watsonp(to) == false) {
2517       queryjump = Stage3_querystart(to) - Stage3_queryend(from) - 1;
2518       genomejump = Stage3_genomicend(from) - Stage3_genomicstart(to) - 1U;
2519       max_extend_p = ((int) genomejump == queryjump) ? false : true;
2520       debug2(printf("gap exists: genomejump = %u, queryjump = %d, max_extend_p = %d\n",genomejump,queryjump,max_extend_p));
2521     } else {
2522       max_extend_p = false;
2523     }
2524 
2525   } else {
2526     /* Two parts overlap */
2527     if ((leftpos = Stage3_querystart(to) - CHIMERA_EXTEND) < 0) {
2528       leftpos = 0;
2529     }
2530     if ((rightpos = Stage3_queryend(from) + CHIMERA_EXTEND) >= queryntlength) {
2531       rightpos = queryntlength - 1;
2532     }
2533     midpos = (leftpos+rightpos)/2;
2534     /* maxpeelback_from = rightpos - Stage3_querystart(to); */
2535     /* maxpeelback_to = Stage3_queryend(from) - leftpos; */
2536     maxpeelback_from = rightpos - midpos;
2537     maxpeelback_to = midpos - leftpos;
2538     debug2(printf("overlap: leftpos %d, rightpos %d, midpos %d, queryntlength %d, maxpeelback_from %d, maxpeelback_to %d\n",
2539 		  leftpos,rightpos,midpos,queryntlength,maxpeelback_from,maxpeelback_to));
2540 #if 0
2541     if (Stage3_watsonp(from) == true && Stage3_watsonp(to) == true) {
2542       queryjump = Stage3_queryend(from) - Stage3_querystart(to) - 1;
2543       genomejump = Stage3_genomicend(from) - Stage3_genomicstart(to) - 1U;
2544       max_extend_p = (genomejump == queryjump) ? false : true;
2545     } else if (Stage3_watsonp(from) == false && Stage3_watsonp(to) == false) {
2546       queryjump = Stage3_queryend(from) - Stage3_querystart(to) - 1;
2547       genomejump = Stage3_genomicstart(to) - Stage3_genomicend(from) - 1U;
2548       max_extend_p = (genomejump == queryjump) ? false : true;
2549     } else {
2550       max_extend_p = false;
2551     }
2552 #else
2553     debug2(printf("parts overlap: max_extend_p is false\n"));
2554     max_extend_p = false;
2555 #endif
2556   }
2557 
2558   debug2(printf("Before Stage3_extend_right, bestfrom is %p, query %d..%d, rightpos %d, pairs %p\n",
2559 		from,Stage3_querystart(from),Stage3_queryend(from),rightpos,Stage3_pairs(from)));
2560   debug2(Stage3_print_ends(from));
2561   debug2(printf("Before Stage3_extend_left, bestto is %p, query %d..%d, leftpos %d, pairs %p\n",
2562 		to,Stage3_querystart(to),Stage3_queryend(to),leftpos,Stage3_pairs(to)));
2563   debug2(Stage3_print_ends(to));
2564 
2565   Stage3_extend_right(from,/*goal*/rightpos,
2566 #ifdef PMAP
2567 		      /*querylength*/Sequence_fulllength(queryntseq),
2568 		      /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
2569 		      /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
2570 #else
2571 		      /*querylength*/Sequence_fulllength(queryseq),
2572 		      /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2573 		      /*queryuc_ptr*/Sequence_fullpointer(queryuc),
2574 #endif
2575 		      max_extend_p,pairpool,Stage3_genestrand(from),maxpeelback_from);
2576 
2577   Stage3_extend_left(to,/*goal*/leftpos,
2578 #ifdef PMAP
2579 		     /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
2580 		     /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
2581 #else
2582 		     /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2583 		     /*queryuc_ptr*/Sequence_fullpointer(queryuc),
2584 #endif
2585 		     max_extend_p,pairpool,Stage3_genestrand(to),maxpeelback_to);
2586 
2587   debug2(printf("Before Chimera_find_breakpoint, bestfrom is %p, query %d..%d, pairs %p\n",
2588                  from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
2589   debug2(Stage3_print_ends(from));
2590   debug2(printf("Before Chimera_find_breakpoint, bestto is %p, query %d..%d, pairs %p\n",
2591                  to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
2592   debug2(Stage3_print_ends(to));
2593 
2594   Univ_IIT_interval_bounds(&chroffset,&chrhigh,&left_chrlength,chromosome_iit,Stage3_chrnum(from),circular_typeint);
2595   Univ_IIT_interval_bounds(&chroffset,&chrhigh,&right_chrlength,chromosome_iit,Stage3_chrnum(to),circular_typeint);
2596 
2597   if ((*chimerapos = Chimera_find_breakpoint(&(*chimeraequivpos),&rangelow,&rangehigh,
2598 					     &(*donor1),&(*donor2),&(*acceptor2),&(*acceptor1),
2599 					     from,to,queryntlength,genome,left_chrlength,right_chrlength)) < 0) {
2600     /* TODO: Allow finding a breakpoint for DNA-Seq, which needs no donor or acceptor nucleotides */
2601     debug2(printf("Chimera_find_breakpoint returns no value\n"));
2602     *donor_prob = *acceptor_prob = 0.0;
2603     *donor_watsonp = *acceptor_watsonp = true;
2604     *cdna_direction = 0;
2605     return -1;
2606 
2607   } else {
2608     debug2(printf("Chimera_find_breakpoint has chimerapos %d..%d\n",*chimerapos,*chimeraequivpos));
2609 
2610     Stage3_trim_right(from,/*goal*/rangehigh,
2611 		      /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2612 		      pairpool);
2613 
2614     Stage3_trim_left(to,/*goal*/rangelow,
2615 		     /*queryseq_ptr*/Sequence_fullpointer(queryseq),
2616 		     pairpool);
2617 
2618     debug2(printf("Before Chimera_find_exonexon, bestfrom is %p, query %d..%d, pairs %p\n",
2619                   from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
2620     debug2(printf("Before Chimera_find_exonexon, bestto is %p, query %d..%d, pairs %p\n",
2621                   to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
2622 
2623     if ((*exonexonpos = Chimera_find_exonexon(&found_cdna_direction,&try_cdna_direction,
2624 					      &(*donor1),&(*donor2),&(*acceptor2),&(*acceptor1),
2625 					      &comp,&(*donor_watsonp),&(*acceptor_watsonp),&(*donor_prob),&(*acceptor_prob),
2626 					      /*left_part*/from,/*right_part*/to,genome,genomealt ? genomealt : genome,
2627 					      chromosome_iit,/*breakpoint_start*/Stage3_querystart(to),
2628 					      /*breakpoint_end*/Stage3_queryend(from))) <= 0) {
2629       /* Couldn't find a good exon-exon junction, so rely on sequence */
2630       *donor_prob = *acceptor_prob = 0.0;
2631       *donor_watsonp = *acceptor_watsonp = true;
2632 
2633       debug2(printf("Chimera_find_breakpoint returns boundary at %d..%d (switch can occur at %d..%d)\n",
2634 		    *chimerapos,*chimeraequivpos,(*chimerapos)-1,*chimeraequivpos));
2635 
2636       breakpoint = ((*chimerapos) + (*chimeraequivpos))/2;
2637       *cdna_direction = try_cdna_direction;
2638       debug2(printf("Exon-exon boundary not found, but setting breakpoint to be %d\n",breakpoint));
2639       return breakpoint;
2640 
2641     } else {
2642       /* Use the exon-exon solution */
2643       breakpoint = *chimerapos = *chimeraequivpos = *exonexonpos;
2644       *cdna_direction = found_cdna_direction;
2645       debug2(printf("Exon-exon boundary found at %d, which is breakpoint.  Comp = %c\n",
2646 		    *exonexonpos,comp));
2647       return breakpoint;
2648     }
2649   }
2650 }
2651 
2652 
2653 /* Can potentially include a larger stage3list */
2654 static List_T
check_for_local(bool * mergedp,List_T stage3list,int effective_start,int effective_end,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)2655 check_for_local (bool *mergedp, List_T stage3list, int effective_start, int effective_end,
2656 		 Sequence_T queryseq, Sequence_T queryuc,
2657 #ifdef PMAP
2658 		 Sequence_T queryntseq,
2659 #endif
2660 		 int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
2661 		 Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
2662 		 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
2663 		 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
2664   List_T gregions = NULL, p;
2665   Stage3_T *stage3array_sub1 = NULL, *stage3array_sub2 = NULL, from, to, stage3;
2666   Sequence_T querysubseq = NULL, querysubuc = NULL;
2667   Diagnostic_T diagnostic;
2668   int bestfrom, bestto;
2669   int five_margin, three_margin, five_score = 0, three_score = 0;
2670   int extension;
2671   int npaths_sub1 = 0, npaths_sub2 = 0;
2672   bool lowidentityp, poorp, repetitivep;
2673 
2674   int max_single_goodness;
2675   int breakpoint, chimerapos, chimeraequivpos, exonexonpos;
2676   int chimera_cdna_direction;
2677   char donor1, donor2, acceptor2, acceptor1;
2678   bool donor_watsonp, acceptor_watsonp;
2679   double donor_prob, acceptor_prob;
2680 
2681   int kstart1, kstart2, kend1, kend2;
2682   Chrnum_T chrnum;
2683 #ifdef DEBUG2
2684   int k;
2685 #endif
2686 
2687 
2688 #ifdef PMAP
2689   five_margin = effective_start - 3*Sequence_trim_start(queryseq);
2690   three_margin = 3*Sequence_trim_end(queryseq) - effective_end;
2691   debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
2692 		five_margin,effective_start,3*Sequence_trim_start(queryseq),
2693 		three_margin,3*Sequence_trim_end(queryseq),effective_end));
2694 #else
2695   five_margin = effective_start - Sequence_trim_start(queryseq);
2696   three_margin = Sequence_trim_end(queryseq) - effective_end;
2697   debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
2698 		five_margin,effective_start,Sequence_trim_start(queryseq),
2699 		three_margin,Sequence_trim_end(queryseq),effective_end));
2700 #endif
2701 
2702 #ifdef DEBUG2A
2703   for (p = stage3list; p != NULL; p = List_next(p)) {
2704     stage3 = (Stage3_T) List_head(p);
2705     Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
2706     printf("\n");
2707   }
2708 #endif
2709 
2710   /* Stage3_recompute_goodness(stage3list); */
2711   max_single_goodness = 0;
2712   for (p = stage3list; p != NULL; p = List_next(p)) {
2713     stage3 = (Stage3_T) List_head(p);
2714     if (Stage3_goodness(stage3) > max_single_goodness) {
2715       max_single_goodness = Stage3_goodness(stage3);
2716     }
2717   }
2718   debug2(printf("max single goodness = %d\n",max_single_goodness));
2719 
2720 
2721   debug2(printf("Running local_separate_paths\n"));
2722   local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2723 		       stage3list);
2724   debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2725 		npaths_sub1,npaths_sub2,List_length(stage3list)));
2726 
2727   if (npaths_sub1 == 0 && npaths_sub2 == 0) {
2728     /* Need to compute on margin explicitly */
2729     if (five_margin < chimera_margin && three_margin < chimera_margin) {
2730       debug2(printf("Insufficient margins\n"));
2731     } else if (five_margin > three_margin) {
2732 #if 0
2733       /* extension makes it harder to find the other alignment.  The merging process will help fill in any gap. */
2734       extension = CHIMERA_SLOP;
2735       debug2(printf("Comparing extension %d with %d = (effective_start %d)/2\n",
2736 		    extension,effective_start/2,effective_start));
2737       if (extension > effective_start/2) {
2738 	/* Extension occupies more than 1/3 of sequence */
2739 	debug2(printf("Proposed extension of %d is too long relative to effective_start %d\n",extension,effective_start));
2740 	extension = effective_start/3;
2741       }
2742 #else
2743       extension = 0;
2744 #endif
2745       if ((querysubseq = Sequence_subsequence(queryseq,0,effective_start+extension)) != NULL) {
2746 	if ((querysubuc = Sequence_subsequence(queryuc,0,effective_start+extension)) != NULL) {
2747 	  debug2(printf("5 margin > 3 margin.  "));
2748 	  debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
2749 			effective_start,0,effective_start+extension));
2750 	  debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2751 
2752 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2753 				      Oligoindex_array_elt(oligoindices_major,0));
2754 	  if (poorp == true || repetitivep == true) {
2755 	    debug2(printf("Subsequence is poor or repetitive\n"));
2756 	  } else {
2757 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2758 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2759 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2760 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2761 	    } else {
2762 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2763 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2764 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2765 	    }
2766 	    debug2(printf("A.  Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
2767 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2768 #ifdef PMAP
2769 					      queryntseq,
2770 #endif
2771 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2772 					      pairpool,diagpool,cellpool,
2773 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2774 #ifdef DEBUG2
2775 	    for (p = stage3list; p != NULL; p = List_next(p)) {
2776 	      stage3 = (Stage3_T) List_head(p);
2777 	      printf("%d..%d, %u..%u\n",
2778 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
2779 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2780 	    }
2781 #endif
2782 	  }
2783 	  Diagnostic_free(&diagnostic);
2784 
2785 	  /* Above function frees gregions */
2786 	  Sequence_free(&querysubuc);
2787 	}
2788 	Sequence_free(&querysubseq);
2789       }
2790 
2791       /* And recompute on original part, just in case stage 1 was led astray by the ends */
2792       if ((querysubseq = Sequence_subsequence(queryseq,effective_start,queryntlength)) != NULL) {
2793 	if ((querysubuc = Sequence_subsequence(queryuc,effective_start,queryntlength)) != NULL) {
2794 	  debug2(printf("Recomputing on original part.  "));
2795 	  debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
2796 			effective_start,effective_start,queryntlength));
2797 	  debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2798 
2799 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2800 				      Oligoindex_array_elt(oligoindices_major,0));
2801 	  if (poorp == true || repetitivep == true) {
2802 	    debug2(printf("Subsequence is poor or repetitive\n"));
2803 	  } else {
2804 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2805 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2806 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2807 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2808 	    } else {
2809 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2810 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2811 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2812 	    }
2813 	    debug2(printf("B.  Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
2814 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2815 #ifdef PMAP
2816 					      queryntseq,
2817 #endif
2818 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2819 					      pairpool,diagpool,cellpool,
2820 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2821 #ifdef DEBUG2
2822 	    for (p = stage3list; p != NULL; p = List_next(p)) {
2823 	      stage3 = (Stage3_T) List_head(p);
2824 	      printf("%d..%d, %u..%u\n",
2825 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
2826 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2827 	    }
2828 #endif
2829 	  }
2830 	  Diagnostic_free(&diagnostic);
2831 
2832 	  /* Above function frees gregions */
2833 	  Sequence_free(&querysubuc);
2834 	}
2835 	Sequence_free(&querysubseq);
2836       }
2837 
2838       debug2(printf("Running local_separate_paths\n"));
2839       local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2840 			   stage3list);
2841       debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2842 		    npaths_sub1,npaths_sub2,List_length(stage3list)));
2843 
2844     } else {
2845 #if 0
2846       /* extension makes it harder to find the other alignment.  The merging process will help fill in any gap. */
2847       extension = CHIMERA_SLOP;
2848       debug2(printf("Comparing extension %d with %d = (queryntlength %d - effective_end %d)/2\n",
2849 		    extension,(queryntlength-effective_end)/2,queryntlength,effective_end));
2850       if (extension > (queryntlength - effective_end)/2) {
2851 	/* Extension occupies more than 1/3 of sequence */
2852 	debug2(printf("Proposed extension of %d is too long relative to queryntlength %d and effective_end %d\n",
2853 		      extension,queryntlength,effective_end));
2854 	extension = (queryntlength - effective_end)/3;
2855       }
2856 #else
2857       extension = 0;
2858 #endif
2859       if ((querysubseq = Sequence_subsequence(queryseq,effective_end-extension,queryntlength)) != NULL) {
2860 	if ((querysubuc = Sequence_subsequence(queryuc,effective_end-extension,queryntlength)) != NULL) {
2861 	  debug2(printf("5 margin <= 3 margin.  "));
2862 	  debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d) (extension %d)\n",
2863 			effective_end,effective_end-extension,queryntlength,extension));
2864 	  debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2865 
2866 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2867 				      Oligoindex_array_elt(oligoindices_major,0));
2868 	  if (poorp == true || repetitivep == true) {
2869 	    debug2(printf("Subsequence is poor or repetitive\n"));
2870 	  } else {
2871 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2872 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2873 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2874 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2875 	    } else {
2876 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2877 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2878 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2879 	    }
2880 	    debug2(printf("C.  Performing Stage 3 with list length %d\n",List_length(stage3list)));
2881 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2882 #ifdef PMAP
2883 					      queryntseq,
2884 #endif
2885 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2886 					      pairpool,diagpool,cellpool,
2887 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2888 #ifdef DEBUG2
2889 	    for (p = stage3list; p != NULL; p = List_next(p)) {
2890 	      stage3 = (Stage3_T) List_head(p);
2891 	      printf("%d..%d, %u..%u\n",
2892 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
2893 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2894 	    }
2895 #endif
2896 	  }
2897 	  Diagnostic_free(&diagnostic);
2898 
2899 	  /* Above function frees gregions */
2900 	  Sequence_free(&querysubuc);
2901 	}
2902 	Sequence_free(&querysubseq);
2903       }
2904 
2905       /* And recompute on original part, just in case stage 1 was led astray by the ends */
2906       if ((querysubseq = Sequence_subsequence(queryseq,0,effective_end)) != NULL) {
2907 	if ((querysubuc = Sequence_subsequence(queryuc,0,effective_end)) != NULL) {
2908 	  debug2(printf("Recomputing on original part.  "));
2909 	  debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d), extension %d\n",
2910 			effective_end,0,effective_end,extension));
2911 	  debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
2912 
2913 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
2914 				      Oligoindex_array_elt(oligoindices_major,0));
2915 	  if (poorp == true || repetitivep == true) {
2916 	    debug2(printf("Subsequence is poor or repetitive\n"));
2917 	  } else {
2918 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2919 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
2920 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2921 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2922 	    } else {
2923 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
2924 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2925 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
2926 	    }
2927 	    debug2(printf("D.  Performing Stage 3 with list length %d\n",List_length(stage3list)));
2928 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
2929 #ifdef PMAP
2930 					      queryntseq,
2931 #endif
2932 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
2933 					      pairpool,diagpool,cellpool,
2934 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
2935 #ifdef DEBUG2
2936 	    for (p = stage3list; p != NULL; p = List_next(p)) {
2937 	      stage3 = (Stage3_T) List_head(p);
2938 	      printf("%d..%d, %u..%u\n",
2939 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
2940 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
2941 	    }
2942 #endif
2943 	  }
2944 	  Diagnostic_free(&diagnostic);
2945 
2946 	  /* Above function frees gregions */
2947 	  Sequence_free(&querysubuc);
2948 
2949 	}
2950 	Sequence_free(&querysubseq);
2951       }
2952 
2953       debug2(printf("Running local_separate_paths\n"));
2954       local_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
2955 			   stage3list);
2956       debug2(printf("local: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
2957 		    npaths_sub1,npaths_sub2,List_length(stage3list)));
2958     }
2959   }
2960 
2961   *mergedp = false;
2962   if (npaths_sub1 == 0 && npaths_sub2 == 0) {
2963     /* Skip */
2964 
2965   } else if (npaths_sub1 == 0) {
2966     /* Skip */
2967     FREE(stage3array_sub2);
2968 
2969   } else if (npaths_sub2 == 0) {
2970     /* Skip */
2971     FREE(stage3array_sub1);
2972 
2973   } else {
2974     /* Iterate for each chromosome */
2975     qsort(stage3array_sub1,npaths_sub1,sizeof(Stage3_T),Stage3_chrnum_cmp);
2976     qsort(stage3array_sub2,npaths_sub2,sizeof(Stage3_T),Stage3_chrnum_cmp);
2977 
2978 
2979     kend1 = kend2 = 0;
2980     *mergedp = false;
2981     /* List_free(&stage3list); */
2982 
2983     while (kend1 < npaths_sub1 && kend2 < npaths_sub2) {
2984       kstart1 = kend1;
2985       kstart2 = kend2;
2986       chrnum = Stage3_chrnum(stage3array_sub1[kstart1]);
2987       while (kend1 < npaths_sub1 && Stage3_chrnum(stage3array_sub1[kend1]) == chrnum) {
2988 	kend1++;
2989       }
2990       while (kend2 < npaths_sub2 && Stage3_chrnum(stage3array_sub2[kend2]) == chrnum) {
2991 	kend2++;
2992       }
2993 
2994 #ifdef DEBUG2
2995       printf("Chimera_bestpath left\n");
2996       for (k = kstart1; k < kend1; k++) {
2997 	stage3 = stage3array_sub1[k];
2998 	printf("%d..%d, %d:%u..%u\n",
2999 	       Stage3_querystart(stage3),Stage3_queryend(stage3),
3000 	       Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3001       }
3002       printf("Chimera_bestpath right\n");
3003       for (k = kstart2; k < kend2; k++) {
3004 	stage3 = stage3array_sub2[k];
3005 	printf("%d..%d, %d:%u..%u\n",
3006 	       Stage3_querystart(stage3),Stage3_queryend(stage3),
3007 	       Stage3_chrnum(stage3),Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3008       }
3009 #endif
3010 
3011       if (Chimera_bestpath(&five_score,&three_score,&chimerapos,&chimeraequivpos,&bestfrom,&bestto,
3012 			   &(stage3array_sub1[kstart1]),/*npaths1*/kend1-kstart1,
3013 			   &(stage3array_sub2[kstart2]),/*npaths2*/kend2-kstart2,
3014 			   queryntlength,CHIMERA_SLOP,/*circularp*/NULL,/*localp*/true) == false) {
3015 	/* Skip */
3016 	debug2(printf("Chimera_bestpath returns false\n"));
3017 
3018       } else {
3019 	from = stage3array_sub1[kstart1 + bestfrom];
3020 	to = stage3array_sub2[kstart2 + bestto];
3021 	debug2(printf("Chimera_bestpath returns bestfrom %d (%d..%d, %u..%u) to bestto %d (%d..%d, %u..%u)\n",
3022 		      bestfrom,Stage3_querystart(from),Stage3_queryend(from),Stage3_genomicstart(from),Stage3_genomicend(from),
3023 		      bestto,Stage3_querystart(to),Stage3_queryend(to),Stage3_genomicstart(to),Stage3_genomicend(to)));
3024 
3025 	if ((breakpoint = find_breakpoint(&chimera_cdna_direction,&chimerapos,&chimeraequivpos,&exonexonpos,
3026 					  &donor1,&donor2,&acceptor2,&acceptor1,
3027 					  &donor_watsonp,&acceptor_watsonp,&donor_prob,&acceptor_prob,from,to,
3028 #ifdef PMAP
3029 					  queryntseq,
3030 #endif
3031 					  queryseq,queryuc,queryntlength,
3032 					  genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3033 	  debug2(printf("Cannot find breakpoint\n"));
3034 
3035 	} else {
3036 	  debug2(printf("find_breakpoint returns %d\n",breakpoint));
3037 
3038 	  /* Check to see if we can merge chimeric parts */
3039 	  debug2(printf("Before Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3040 			from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3041 	  debug2(printf("Before Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3042 			to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3043 
3044 	  if (Stage3_mergeable(from,to,breakpoint,queryntlength) == true) {
3045 	    debug2(printf("Mergeable! -- Merging left and right as a readthrough\n"));
3046 	    stage3list = merge_left_and_right_readthrough(&(*mergedp),stage3list,
3047 							  &(stage3array_sub1[kstart1]),/*npaths1:kend1-kstart1,*/bestfrom,
3048 							  &(stage3array_sub2[kstart2]),/*npaths2:kend2-kstart2,*/bestto,
3049 							  breakpoint,queryntlength,queryseq,
3050 #ifdef PMAP
3051 							  /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3052 							  /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3053 							  /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3054 #else
3055 							  /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3056 							  /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3057 #endif
3058 							  pairpool,dynprogL,dynprogM,dynprogR,
3059 							  oligoindices_minor,diagpool,cellpool);
3060 
3061 	    debug2(printf("After merge_left_and_right_readthrough, bestfrom is %p, query %d..%d, pairs %p\n",
3062 			  from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3063 	    debug2(printf("After merge_left_and_right_readthrough, bestto is %p, query %d..%d, pairs %p\n",
3064 			  to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3065 	  }
3066 	}
3067       }
3068     }
3069 
3070     FREE(stage3array_sub2);
3071     FREE(stage3array_sub1);
3072 
3073     /* stage3list = List_reverse(stage3list); */
3074   }
3075 
3076   debug2(printf("check_for_local returning list of length %d\n",List_length(stage3list)));
3077 #ifdef DEBUG2
3078   for (p = stage3list; p != NULL; p = List_next(p)) {
3079     stage3 = (Stage3_T) List_head(p);
3080     printf("%p %p\n",stage3,Stage3_pairs(stage3));
3081   }
3082 #endif
3083 
3084   /* stage3list = stage3list_remove_empties(stage3list); */
3085 
3086 #if 0
3087   /* Should be handled by apply_stage3 loop */
3088   /* Needed after calls to stage3_from_gregions */
3089   Stage3_recompute_goodness(stage3list);
3090   stage3list = stage3list_remove_duplicates(stage3list);
3091 #endif
3092 
3093   return stage3list;
3094 }
3095 
3096 
3097 static List_T
check_for_chimera(bool * mergedp,Chimera_T * chimera,List_T stage3list,int effective_start,int effective_end,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3098 check_for_chimera (bool *mergedp, Chimera_T *chimera, List_T stage3list, int effective_start, int effective_end,
3099 		   Sequence_T queryseq, Sequence_T queryuc,
3100 #ifdef PMAP
3101 		   Sequence_T queryntseq,
3102 #endif
3103 		   int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
3104 		   Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3105 		   Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3106 		   Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3107   List_T gregions = NULL, p;
3108   Stage3_T new_left, new_right;
3109   Stage3_T *stage3array_sub1 = NULL, *stage3array_sub2 = NULL, from, to, stage3;
3110   Sequence_T querysubseq = NULL, querysubuc = NULL;
3111   Diagnostic_T diagnostic;
3112   int bestfrom, bestto;
3113   int five_margin, three_margin, five_score = 0, three_score = 0;
3114   int extension;
3115   int npaths_sub1 = 0, npaths_sub2 = 0;
3116   bool lowidentityp, poorp, repetitivep;
3117 
3118   int max_single_goodness, chimeric_goodness, penalty, matches0, matches1;
3119   int breakpoint, chimerapos, chimeraequivpos, exonexonpos;
3120   int chimera_cdna_direction;
3121   char donor1, donor2, acceptor2, acceptor1;
3122   bool donor_watsonp, acceptor_watsonp;
3123   double donor_prob, acceptor_prob;
3124 
3125 
3126   debug2(printf("check_for_chimera called with %d paths\n",List_length(stage3list)));
3127 #ifdef DEBUG2
3128   for (p = stage3list; p != NULL; p = List_next(p)) {
3129     stage3 = (Stage3_T) List_head(p);
3130     printf("%p %p\n",stage3,Stage3_pairs(stage3));
3131   }
3132 #endif
3133 
3134 
3135 
3136 #ifdef PMAP
3137   five_margin = effective_start - 3*Sequence_trim_start(queryseq);
3138   three_margin = 3*Sequence_trim_end(queryseq) - effective_end;
3139   debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
3140 		five_margin,effective_start,3*Sequence_trim_start(queryseq),
3141 		three_margin,3*Sequence_trim_end(queryseq),effective_end));
3142 #else
3143   five_margin = effective_start - Sequence_trim_start(queryseq);
3144   three_margin = Sequence_trim_end(queryseq) - effective_end;
3145   debug2(printf("Margins are %d = %d - %d on the 5' end and %d = %d - %d on the 3' end\n",
3146 		five_margin,effective_start,Sequence_trim_start(queryseq),
3147 		three_margin,Sequence_trim_end(queryseq),effective_end));
3148 #endif
3149 
3150 #ifdef DEBUG2A
3151   for (p = stage3list; p != NULL; p = List_next(p)) {
3152     stage3 = (Stage3_T) List_head(p);
3153     Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3154     printf("\n");
3155   }
3156 #endif
3157 
3158   /* Stage3_recompute_goodness(stage3list); */
3159   max_single_goodness = 0;
3160   for (p = stage3list; p != NULL; p = List_next(p)) {
3161     stage3 = (Stage3_T) List_head(p);
3162     if (Stage3_goodness(stage3) > max_single_goodness) {
3163       max_single_goodness = Stage3_goodness(stage3);
3164     }
3165   }
3166   debug2(printf("max single goodness = %d\n",max_single_goodness));
3167 
3168 
3169   debug2(printf("Running distant_separate_paths\n"));
3170   distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3171 			 stage3list);
3172   debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3173 		npaths_sub1,npaths_sub2,List_length(stage3list)));
3174 
3175   if (npaths_sub1 == 0 && npaths_sub2 == 0) {
3176     /* Need to compute on margin explicitly */
3177     if (five_margin < chimera_margin && three_margin < chimera_margin) {
3178       debug2(printf("Insufficient margins\n"));
3179     } else if (five_margin > three_margin) {
3180       extension = CHIMERA_SLOP;
3181       debug2(printf("Comparing extension %d with %d = (effective_start %d)/2\n",
3182 		    extension,effective_start/2,effective_start));
3183       if (extension > effective_start/2) {
3184 	/* Extension occupies more than 1/3 of sequence */
3185 	debug2(printf("Proposed extension of %d is too long relative to effective_start %d\n",extension,effective_start));
3186 	extension = effective_start/3;
3187       }
3188       if ((querysubseq = Sequence_subsequence(queryseq,0,effective_start+extension)) != NULL) {
3189 	if ((querysubuc = Sequence_subsequence(queryuc,0,effective_start+extension)) != NULL) {
3190 	  debug2(printf("5 margin > 3 margin.  "));
3191 	  debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
3192 			effective_start,0,effective_start+extension));
3193 	  debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3194 
3195 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3196 				      Oligoindex_array_elt(oligoindices_major,0));
3197 	  if (poorp == true || repetitivep == true) {
3198 	    debug2(printf("Subsequence is poor or repetitive\n"));
3199 	  } else {
3200 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3201 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3202 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3203 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3204 	    } else {
3205 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3206 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3207 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3208 	    }
3209 	    debug2(printf("A.  Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3210 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3211 #ifdef PMAP
3212 					      queryntseq,
3213 #endif
3214 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3215 					      pairpool,diagpool,cellpool,
3216 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3217 #ifdef DEBUG2
3218 	    for (p = stage3list; p != NULL; p = List_next(p)) {
3219 	      stage3 = (Stage3_T) List_head(p);
3220 	      printf("%d..%d, %u..%u\n",
3221 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
3222 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3223 	    }
3224 #endif
3225 	  }
3226 	  Diagnostic_free(&diagnostic);
3227 
3228 	  /* Above function frees gregions */
3229 	  Sequence_free(&querysubuc);
3230 	}
3231 	Sequence_free(&querysubseq);
3232       }
3233 
3234       /* And recompute on original part, just in case stage 1 was led astray by the ends */
3235       if ((querysubseq = Sequence_subsequence(queryseq,effective_start,queryntlength)) != NULL) {
3236 	if ((querysubuc = Sequence_subsequence(queryuc,effective_start,queryntlength)) != NULL) {
3237 	  debug2(printf("Recomputing on original part.  "));
3238 	  debug2(printf("Beginning Stage1_compute on 5' margin from effective_start %d (%d..%d)\n",
3239 			effective_start,effective_start,queryntlength));
3240 	  debug2a(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3241 
3242 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3243 				      Oligoindex_array_elt(oligoindices_major,0));
3244 	  if (poorp == true || repetitivep == true) {
3245 	    debug2(printf("Subsequence is poor or repetitive\n"));
3246 	  } else {
3247 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3248 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3249 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3250 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3251 	    } else {
3252 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3253 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3254 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3255 	    }
3256 	    debug2(printf("B.  Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3257 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3258 #ifdef PMAP
3259 					      queryntseq,
3260 #endif
3261 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3262 					      pairpool,diagpool,cellpool,
3263 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3264 #ifdef DEBUG2
3265 	    for (p = stage3list; p != NULL; p = List_next(p)) {
3266 	      stage3 = (Stage3_T) List_head(p);
3267 	      printf("%d..%d, %u..%u\n",
3268 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
3269 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3270 	    }
3271 #endif
3272 	  }
3273 	  Diagnostic_free(&diagnostic);
3274 
3275 	  /* Above function frees gregions */
3276 	  Sequence_free(&querysubuc);
3277 	}
3278 	Sequence_free(&querysubseq);
3279       }
3280 
3281       debug2(printf("Running distant_separate_paths\n"));
3282       distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3283 			     stage3list);
3284       debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3285 		    npaths_sub1,npaths_sub2,List_length(stage3list)));
3286 
3287     } else {
3288       extension = CHIMERA_SLOP;
3289       debug2(printf("Comparing extension %d with %d = (queryntlength %d - effective_end %d)/2\n",
3290 		    extension,(queryntlength-effective_end)/2,queryntlength,effective_end));
3291       if (extension > (queryntlength - effective_end)/2) {
3292 	/* Extension occupies more than 1/3 of sequence */
3293 	debug2(printf("Proposed extension of %d is too long relative to queryntlength %d and effective_end %d\n",
3294 		      extension,queryntlength,effective_end));
3295 	extension = (queryntlength - effective_end)/3;
3296       }
3297       if ((querysubseq = Sequence_subsequence(queryseq,effective_end-extension,queryntlength)) != NULL) {
3298 	if ((querysubuc = Sequence_subsequence(queryuc,effective_end-extension,queryntlength)) != NULL) {
3299 	  debug2(printf("5 margin <= 3 margin.  "));
3300 	  debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
3301 			effective_end,effective_end-extension,queryntlength));
3302 	  debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3303 
3304 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3305 				      Oligoindex_array_elt(oligoindices_major,0));
3306 	  if (poorp == true || repetitivep == true) {
3307 	    debug2(printf("Subsequence is poor or repetitive\n"));
3308 	  } else {
3309 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3310 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3311 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3312 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3313 	    } else {
3314 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3315 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3316 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3317 	    }
3318 	    debug2(printf("C.  Performing Stage 3 with list length %d\n",List_length(stage3list)));
3319 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3320 #ifdef PMAP
3321 					      queryntseq,
3322 #endif
3323 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3324 					      pairpool,diagpool,cellpool,
3325 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3326 #ifdef DEBUG2
3327 	    for (p = stage3list; p != NULL; p = List_next(p)) {
3328 	      stage3 = (Stage3_T) List_head(p);
3329 	      printf("%d..%d, %u..%u\n",
3330 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
3331 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3332 	    }
3333 #endif
3334 	  }
3335 	  Diagnostic_free(&diagnostic);
3336 
3337 	  /* Above function frees gregions */
3338 	  Sequence_free(&querysubuc);
3339 	}
3340 	Sequence_free(&querysubseq);
3341       }
3342 
3343       /* And recompute on original part, just in case stage 1 was led astray by the ends */
3344       if ((querysubseq = Sequence_subsequence(queryseq,0,effective_end)) != NULL) {
3345 	if ((querysubuc = Sequence_subsequence(queryuc,0,effective_end)) != NULL) {
3346 	  debug2(printf("Recomputing on original part.  "));
3347 	  debug2(printf("Beginning Stage1_compute on 3' margin from effective_end %d (%d..%d)\n",
3348 			effective_end,0,effective_end));
3349 	  debug2(Sequence_stdout(querysubseq,/*uppercasep*/true,wraplength,/*trimmedp*/true));
3350 
3351 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),Sequence_fulllength(querysubuc),
3352 				      Oligoindex_array_elt(oligoindices_major,0));
3353 	  if (poorp == true || repetitivep == true) {
3354 	    debug2(printf("Subsequence is poor or repetitive\n"));
3355 	  } else {
3356 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3357 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3358 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3359 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3360 	    } else {
3361 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3362 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3363 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3364 	    }
3365 	    debug2(printf("D.  Performing Stage 3 with list length %d\n",List_length(stage3list)));
3366 	    stage3list = stage3_from_gregions(stage3list,gregions,querysubseq,querysubuc,
3367 #ifdef PMAP
3368 					      queryntseq,
3369 #endif
3370 					      usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3371 					      pairpool,diagpool,cellpool,
3372 					      dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3373 #ifdef DEBUG2
3374 	    for (p = stage3list; p != NULL; p = List_next(p)) {
3375 	      stage3 = (Stage3_T) List_head(p);
3376 	      printf("%d..%d, %u..%u\n",
3377 		     Stage3_querystart(stage3),Stage3_queryend(stage3),
3378 		     Stage3_genomicstart(stage3),Stage3_genomicend(stage3));
3379 	    }
3380 #endif
3381 	  }
3382 	  Diagnostic_free(&diagnostic);
3383 
3384 	  /* Above function frees gregions */
3385 	  Sequence_free(&querysubuc);
3386 
3387 	}
3388 	Sequence_free(&querysubseq);
3389       }
3390 
3391       debug2(printf("Running distant_separate_paths\n"));
3392       distant_separate_paths(&stage3array_sub1,&npaths_sub1,&stage3array_sub2,&npaths_sub2,
3393 			     stage3list);
3394       debug2(printf("chimera: npaths_sub1 %d, npaths_sub2 %d, stage3list %d\n",
3395 		    npaths_sub1,npaths_sub2,List_length(stage3list)));
3396     }
3397   }
3398 
3399   *mergedp = false;
3400   *chimera = (Chimera_T) NULL;
3401   if (npaths_sub1 == 0 || npaths_sub2 == 0) {
3402     /* Skip */
3403 
3404   } else if (Chimera_bestpath(&five_score,&three_score,&chimerapos,&chimeraequivpos,&bestfrom,&bestto,
3405 			      stage3array_sub1,npaths_sub1,stage3array_sub2,npaths_sub2,queryntlength,
3406 			      CHIMERA_SLOP,circularp,/*localp*/false) == false) {
3407     /* Skip */
3408     debug2(printf("Chimera_bestpath returns false, so skipping\n"));
3409     FREE(stage3array_sub2);
3410     FREE(stage3array_sub1);
3411 
3412   } else {
3413     from = stage3array_sub1[bestfrom];
3414     to = stage3array_sub2[bestto];
3415     debug2(printf("Chimera_bestpath returns bestfrom %d (%d..%d, %u..%u) to bestto %d (%d..%d, %u..%u)\n",
3416 		  bestfrom,Stage3_querystart(from),Stage3_queryend(from),Stage3_genomicstart(from),Stage3_genomicend(from),
3417 		  bestto,Stage3_querystart(to),Stage3_queryend(to),Stage3_genomicstart(to),Stage3_genomicend(to)));
3418 
3419     chimeric_goodness = Stage3_chimeric_goodness(&matches0,&matches1,from,to,chimerapos);
3420     debug2(printf("chimeric goodness = %d\n",chimeric_goodness));
3421 
3422     penalty = CHIMERA_PENALTY;
3423     if (chimera_margin < penalty) {
3424       /* User is looking for higher sensitivity */
3425       penalty = chimera_margin;
3426     }
3427 
3428     if (chimeric_goodness < max_single_goodness + penalty) {
3429       debug2(printf("chimeric goodness not good enough relative to max_single_goodness %d and penalty %d\n",
3430 		    max_single_goodness,penalty));
3431 
3432     } else if ((breakpoint = find_breakpoint(&chimera_cdna_direction,&chimerapos,&chimeraequivpos,&exonexonpos,
3433 					     &donor1,&donor2,&acceptor2,&acceptor1,
3434 					     &donor_watsonp,&acceptor_watsonp,&donor_prob,&acceptor_prob,from,to,
3435 #ifdef PMAP
3436 					     queryntseq,
3437 #endif
3438 					     queryseq,queryuc,queryntlength,
3439 					     genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3440       debug2(printf("find_breakpoint returns no value\n"));
3441 
3442     } else {
3443       debug2(printf("find_breakpoint returns %d\n",breakpoint));
3444 
3445       /* Check to see if we can merge chimeric parts */
3446       debug2(printf("Before Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3447 		    from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3448       debug2(printf("Before Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3449 		    to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3450 
3451       if (maxpaths_report != 1 && /* if maxpaths_report == 1, then don't want distant chimeras */
3452 	  Stage3_mergeable(from,to,breakpoint,queryntlength) == false &&
3453 	  Stage3_test_bounds(from,0,chimeraequivpos+chimera_overlap) == true &&
3454 	  Stage3_test_bounds(to,chimerapos+1-chimera_overlap,queryntlength) == true &&
3455 	  Stage3_merge_chimera(&new_left,&new_right,/*best0*/from,/*best1*/to,
3456 			       /*minpos1*/0,/*maxpos1*/breakpoint,
3457 			       /*minpos2*/breakpoint+1,/*maxpos2*/queryntlength,queryseq,
3458 #ifdef PMAP
3459 			       Sequence_fullpointer(queryntseq),Sequence_fullpointer(queryntseq),
3460 #else
3461 			       Sequence_fullpointer(queryseq),Sequence_fullpointer(queryuc),
3462 #endif
3463 			       pairpool,dynprogL,dynprogR,maxpeelback) == true) {
3464 
3465 	debug2(printf("Not mergeable -- Merging left and right as a transloc\n"));
3466 	*chimera = Chimera_new(new_left,new_right,chimerapos,chimeraequivpos,exonexonpos,chimera_cdna_direction,
3467 			       donor1,donor2,acceptor2,acceptor1,donor_watsonp,acceptor_watsonp,
3468 			       donor_prob,acceptor_prob);
3469 
3470 	debug2(printf("Before merge_left_and_right_transloc, bestfrom is %p, query %d..%d\n",
3471 		      from,Stage3_querystart(from),Stage3_queryend(from)));
3472 	debug2(printf("Before merge_left_and_right_transloc, bestto is %p, query %d..%d\n",
3473 		      to,Stage3_querystart(to),Stage3_queryend(to)));
3474 
3475 	/* Used to call merge_left_and_right_transloc */
3476 	for (p = stage3list; p != NULL; p = List_next(p)) {
3477 	  stage3 = (Stage3_T) List_head(p);
3478 	  Stage3_free(&stage3);
3479 	}
3480 	List_free(&stage3list);
3481 
3482 	stage3list = List_push(NULL,(void *) new_right);
3483 	stage3list = List_push(stage3list,(void *) new_left);
3484       }
3485 
3486       debug2(printf("After Stage3_mergeable, bestfrom is %p, query %d..%d, pairs %p\n",
3487 		    from,Stage3_querystart(from),Stage3_queryend(from),Stage3_pairs(from)));
3488       debug2(printf("After Stage3_mergeable, bestto is %p, query %d..%d, pairs %p\n",
3489 		    to,Stage3_querystart(to),Stage3_queryend(to),Stage3_pairs(to)));
3490     }
3491 
3492     FREE(stage3array_sub2);
3493     FREE(stage3array_sub1);
3494   }
3495 
3496   debug2(printf("check_for_chimera returning list of length %d\n",List_length(stage3list)));
3497 #ifdef DEBUG2
3498   for (p = stage3list; p != NULL; p = List_next(p)) {
3499     stage3 = (Stage3_T) List_head(p);
3500     printf("%p %p\n",stage3,Stage3_pairs(stage3));
3501   }
3502 #endif
3503 
3504 #if 0
3505   /* Should be handled by apply_stage3 loop */
3506   /* Needed after calls to stage3_from_gregions */
3507   Stage3_recompute_goodness(stage3list);
3508   stage3list = stage3list_remove_duplicates(stage3list);
3509 #endif
3510 
3511   return stage3list;
3512 }
3513 
3514 
3515 /* Needs to guarantee that all elements of stage3list and middlepieces end up in result */
3516 /* The Stage3_T objects from and to come from stage3list */
3517 /* The Stage3_T object middle does not come from stage3list (but from middlepieces in caller) */
3518 static List_T
merge_middlepieces(List_T stage3list,Stage3_T from,Stage3_T to,Stage3_T middle,bool mergeableAp,bool mergeableBp,int breakpointA,int breakpointB,Sequence_T queryseq,Sequence_T queryntseq,Sequence_T queryuc,int queryntlength,Pairpool_T pairpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Oligoindex_array_T oligoindices_minor,Diagpool_T diagpool,Cellpool_T cellpool)3519 merge_middlepieces (List_T stage3list, Stage3_T from, Stage3_T to, Stage3_T middle,
3520 		    bool mergeableAp, bool mergeableBp,
3521 		    int breakpointA, int breakpointB, Sequence_T queryseq,
3522 #ifdef PMAP
3523 		    Sequence_T queryntseq,
3524 #endif
3525 		    Sequence_T queryuc, int queryntlength,
3526 		    Pairpool_T pairpool, Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
3527 		    Oligoindex_array_T oligoindices_minor, Diagpool_T diagpool, Cellpool_T cellpool) {
3528   bool mergedAp, mergedBp;
3529   /* List_T r, p; */
3530   /* Stage3_T stage3; */
3531 
3532 
3533   if (mergeableAp == true && mergeableBp == true) {
3534     stage3list = merge_left_and_right_readthrough(&mergedAp,stage3list,
3535 						  /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3536 						  /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3537 						  breakpointA,queryntlength,queryseq,
3538 #ifdef PMAP
3539 						  /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3540 						  /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3541 						  /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3542 #else
3543 						  /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3544 						  /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3545 #endif
3546 						  pairpool,dynprogL,dynprogM,dynprogR,
3547 						  oligoindices_minor,diagpool,cellpool);
3548     /* List_free(&merged); */
3549 
3550     stage3list = merge_left_and_right_readthrough(&mergedBp,stage3list,
3551 						  /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3552 						  /*stage3array_sub2*/&to,/*npaths_sub2:1,*//*bestto*/0,
3553 						  breakpointB,queryntlength,queryseq,
3554 #ifdef PMAP
3555 						  /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3556 						  /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3557 						  /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3558 #else
3559 						  /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3560 						  /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3561 #endif
3562 						  pairpool,dynprogL,dynprogM,dynprogR,
3563 						  oligoindices_minor,diagpool,cellpool);
3564 
3565 #ifndef PMAP
3566     Stage3_guess_cdna_direction(from);
3567 #endif
3568 
3569   } else if (mergeableBp == true) {
3570     stage3list = merge_left_and_right_readthrough(&mergedBp,stage3list,
3571 						  /*stage3array_sub1*/&middle,/*npaths_sub1:1,*//*bestfrom*/0,
3572 						  /*stage3array_sub2*/&to,/*npaths_sub2:1,*//*bestto*/0,
3573 						  breakpointB,queryntlength,queryseq,
3574 #ifdef PMAP
3575 						  /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3576 						  /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3577 						  /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3578 #else
3579 						  /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3580 						  /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3581 #endif
3582 						  pairpool,dynprogL,dynprogM,dynprogR,
3583 						  oligoindices_minor,diagpool,cellpool);
3584 #ifndef PMAP
3585     Stage3_guess_cdna_direction(middle);
3586 #endif
3587 
3588   } else if (mergeableAp == true) {
3589     stage3list = merge_left_and_right_readthrough(&mergedAp,stage3list,
3590 						  /*stage3array_sub1*/&from,/*npaths_sub1:1,*//*bestfrom*/0,
3591 						  /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3592 						  breakpointA,queryntlength,queryseq,
3593 #ifdef PMAP
3594 						  /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3595 						  /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3596 						  /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3597 #else
3598 						  /*queryseq_ptr*/Sequence_fullpointer(queryseq),
3599 						  /*queryuc_ptr*/Sequence_fullpointer(queryuc),
3600 #endif
3601 						  pairpool,dynprogL,dynprogM,dynprogR,
3602 						  oligoindices_minor,diagpool,cellpool);
3603 
3604 #ifndef PMAP
3605     Stage3_guess_cdna_direction(from);
3606 #endif
3607   }
3608 
3609   return stage3list;
3610 }
3611 
3612 
3613 
3614 /* Returns stage3list with additional merged alignments and middle pieces */
3615 static List_T
check_middle_piece_local(bool * foundp,List_T stage3list,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3616 check_middle_piece_local (bool *foundp, List_T stage3list, Sequence_T queryseq, Sequence_T queryuc,
3617 #ifdef PMAP
3618 			  Sequence_T queryntseq,
3619 #endif
3620 			  int queryntlength, Stage2_alloc_T stage2_alloc,
3621 			  Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3622 			  Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3623 			  Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3624   Sequence_T querysubseq = NULL, querysubuc = NULL;
3625   int npaths, i, j;
3626   Stage3_T from = NULL, to = NULL, middle = NULL;
3627   Stage3_T *by_queryend, *by_querystart;
3628   List_T r;
3629   bool plusp;
3630   int genestrand;
3631 
3632   int querystart, queryend;
3633   Chrpos_T chrstart, chrend, chrlength;
3634   Univcoord_T chroffset, chrhigh;
3635   Chrnum_T chrnum;
3636 
3637   int breakpointA = 0, chimeraposA, chimeraequivposA, exonexonposA;
3638   char donorA1, donorA2, acceptorA2, acceptorA1;
3639   bool donor_watsonp_A, acceptor_watsonp_A;
3640   double donor_prob_A, acceptor_prob_A;
3641 
3642   int breakpointB = 0, chimeraposB, chimeraequivposB, exonexonposB;
3643   char donorB1, donorB2, acceptorB2, acceptorB1;
3644   bool donor_watsonp_B, acceptor_watsonp_B;
3645   double donor_prob_B, acceptor_prob_B;
3646 
3647   int chimera_cdna_direction_A, chimera_cdna_direction_B;
3648   bool mergeableAp, mergeableBp;
3649 
3650   List_T all_middlepieces = NULL, middlepieces;
3651 #ifdef DEBUG2A
3652   List_T p;
3653   Stage3_T stage3;
3654 #endif
3655 
3656 
3657 #ifdef DEBUG2A
3658   for (p = stage3list; p != NULL; p = List_next(p)) {
3659     stage3 = (Stage3_T) List_head(p);
3660     Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3661     printf("\n");
3662   }
3663 #endif
3664 
3665   *foundp = false;
3666 
3667   by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3668   qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
3669 
3670   by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3671   qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
3672 
3673   j = 0;
3674   for (i = 0; i < npaths && *foundp == false; i++) {
3675     from = by_queryend[i];
3676     queryend = Stage3_queryend(from);
3677 
3678     while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend) {
3679       j++;
3680     }
3681     j--;
3682 
3683     while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend) {
3684       j--;
3685     }
3686     j++;
3687 
3688     for ( ; j < npaths && *foundp == false; j++) {
3689       to = by_querystart[j];
3690 
3691       if (middle_piece_local_p(&querystart,&queryend,&chrstart,&chrend,
3692 			       &chrnum,&chroffset,&chrhigh,&chrlength,&plusp,&genestrand,
3693 			       from,to) == true) {
3694 	debug2(printf("Found middle piece missing from %d to %d\n",i,j));
3695 
3696 	if ((querysubseq = Sequence_subsequence(queryseq,querystart,queryend)) != NULL) {
3697 	  if ((querysubuc = Sequence_subsequence(queryuc,querystart,queryend)) != NULL) {
3698 	    debug2(printf("Performing Stage 3 on %d..%d against %u..%u\n",
3699 			  querystart,queryend,chrstart,chrend));
3700 	    if ((middlepieces = update_stage3list(/*stage3list*/NULL,querysubseq,
3701 #ifdef PMAP
3702 						  queryntseq,
3703 #endif
3704 						  querysubuc,stage2_alloc,oligoindices_major,oligoindices_minor,
3705 						  pairpool,diagpool,cellpool,
3706 						  /*straintype*/0,/*strain*/NULL,chrnum,
3707 						  chroffset,chrhigh,chrlength,chrstart,chrend,plusp,genestrand,
3708 						  dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL)) != NULL) {
3709 	      middlepieces = stage3list_sort(middlepieces);
3710 
3711 	      /* 1.  Look first for middle piece that joins locally on both ends */
3712 	      r = middlepieces;
3713 	      mergeableAp = mergeableBp = false;
3714 	      while (r != NULL && (mergeableAp == false || mergeableBp == false)) {
3715 		middle = (Stage3_T) List_head(r);
3716 		if (Chimera_local_join_p(from,middle,CHIMERA_SLOP) == true && Chimera_local_join_p(middle,to,CHIMERA_SLOP) == true) {
3717 		  if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3718 						     &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3719 						     &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3720 						     from,/*to*/middle,
3721 #ifdef PMAP
3722 						     queryntseq,
3723 #endif
3724 						     queryseq,queryuc,queryntlength,
3725 						     genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3726 		    mergeableAp = false;
3727 		  } else {
3728 		    mergeableAp = Stage3_mergeable(from,/*to*/middle,breakpointA,queryntlength);
3729 		  }
3730 
3731 		  if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3732 						     &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3733 						     &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3734 						     /*from*/middle,to,
3735 #ifdef PMAP
3736 						     queryntseq,
3737 #endif
3738 						     queryseq,queryuc,queryntlength,
3739 						     genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3740 		    mergeableBp = false;
3741 		  } else {
3742 		    mergeableBp = Stage3_mergeable(/*from*/middle,to,breakpointB,queryntlength);
3743 		  }
3744 		}
3745 		r = List_next(r);
3746 	      }	/* End of while loop looking for dual merge */
3747 
3748 	      if (mergeableAp == true && mergeableBp == true) {
3749 		debug2(printf("Middle segment %p found and mergeable locally with both! -- Merging three as a readthrough.\n",middle));
3750 		*foundp = true;
3751 	      } else {
3752 		/* 2.  Look for middle piece that joins locally on one end */
3753 		r = middlepieces;
3754 		mergeableAp = mergeableBp = false;
3755 		while (r != NULL && mergeableAp == false && mergeableBp == false) {
3756 		  middle = (Stage3_T) List_head(r);
3757 		  if (Chimera_local_join_p(from,middle,CHIMERA_SLOP) == true && Chimera_local_join_p(middle,to,CHIMERA_SLOP) == true) {
3758 		    if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3759 						       &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3760 						       &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3761 						       from,/*to*/middle,
3762 #ifdef PMAP
3763 						       queryntseq,
3764 #endif
3765 						       queryseq,queryuc,queryntlength,
3766 						       genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3767 		      mergeableAp = false;
3768 		    } else {
3769 		      mergeableAp = Stage3_mergeable(from,/*to*/middle,breakpointA,queryntlength);
3770 		    }
3771 
3772 		    if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3773 						       &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3774 						       &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3775 						       /*from*/middle,to,
3776 #ifdef PMAP
3777 						       queryntseq,
3778 #endif
3779 						       queryseq,queryuc,queryntlength,
3780 						       genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3781 		      mergeableBp = false;
3782 		    } else {
3783 		      mergeableBp = Stage3_mergeable(/*from*/middle,to,breakpointB,queryntlength);
3784 		    }
3785 		  }
3786 		  r = List_next(r);
3787 		} /* End of while loop looking for single merge */
3788 
3789 		if (mergeableAp == true || mergeableBp == true) {
3790 		  *foundp = true;
3791 		}
3792 	      }
3793 
3794 	      stage3list = merge_middlepieces(stage3list,from,to,middle,mergeableAp,mergeableBp,
3795 					      breakpointA,breakpointB,queryseq,
3796 #ifdef PMAP
3797 					      queryntseq,
3798 #endif
3799 					      queryuc,queryntlength,pairpool,dynprogL,dynprogM,dynprogR,
3800 					      oligoindices_minor,diagpool,cellpool);
3801 	      all_middlepieces = List_append(all_middlepieces,middlepieces);
3802 	    }
3803 
3804 	    Sequence_free(&querysubuc);
3805 	  }
3806 	  Sequence_free(&querysubseq);
3807 	}
3808       }
3809     }
3810   }
3811 
3812   FREE(by_querystart);
3813   FREE(by_queryend);
3814 
3815   stage3list = List_append(stage3list,all_middlepieces);
3816 
3817   return stage3list;
3818 }
3819 
3820 
3821 /* Returns stage3list with additional merged alignments and middle pieces */
3822 static List_T
check_middle_piece_chimera(bool * foundp,List_T stage3list,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,int queryntlength,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR)3823 check_middle_piece_chimera (bool *foundp, List_T stage3list, Sequence_T queryseq, Sequence_T queryuc,
3824 #ifdef PMAP
3825 			    Sequence_T queryntseq,
3826 #endif
3827 			    int queryntlength, Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
3828 			    Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
3829 			    Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
3830 			    Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR) {
3831   Sequence_T querysubseq = NULL, querysubuc = NULL;
3832   int npaths, i, j;
3833   Stage3_T bestfrom, bestto, from, to, middle;
3834   Stage3_T *by_queryend, *by_querystart;
3835   List_T r;
3836   int querystart, queryend, maxdist, dist;
3837 
3838   int breakpointA, chimeraposA, chimeraequivposA, exonexonposA;
3839   char donorA1, donorA2, acceptorA2, acceptorA1;
3840   bool donor_watsonp_A, acceptor_watsonp_A;
3841   double donor_prob_A, acceptor_prob_A;
3842 
3843   int breakpointB, chimeraposB, chimeraequivposB, exonexonposB;
3844   char donorB1, donorB2, acceptorB2, acceptorB1;
3845   bool donor_watsonp_B, acceptor_watsonp_B;
3846   double donor_prob_B, acceptor_prob_B;
3847 
3848   int chimera_cdna_direction_A, chimera_cdna_direction_B;
3849   bool mergeableAp, mergeableBp, mergedAp, mergedBp;
3850 
3851   List_T middlepieces = NULL;
3852   Diagnostic_T diagnostic;
3853   List_T gregions;
3854   bool lowidentityp, poorp, repetitivep;
3855 
3856 
3857 #ifdef DEBUG2A
3858   for (p = stage3list; p != NULL; p = List_next(p)) {
3859     stage3 = (Stage3_T) List_head(p);
3860     Pair_dump_array(Stage3_pairarray(stage3),Stage3_npairs(stage3),/*zerobasedp*/true);
3861     printf("\n");
3862   }
3863 #endif
3864 
3865   by_queryend = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3866   qsort(by_queryend,npaths,sizeof(Stage3_T),Stage3_queryend_cmp);
3867 
3868   by_querystart = (Stage3_T *) List_to_array_out_n(&npaths,stage3list);
3869   qsort(by_querystart,npaths,sizeof(Stage3_T),Stage3_querystart_cmp);
3870 
3871   maxdist = 0;
3872   j = 0;
3873   for (i = 0; i < npaths; i++) {
3874     from = by_queryend[i];
3875     queryend = Stage3_queryend(from);
3876 
3877     while (j < npaths && Stage3_querystart(by_querystart[j]) < queryend) {
3878       j++;
3879     }
3880     j--;
3881 
3882     while (j >= 0 && Stage3_querystart(by_querystart[j]) > queryend) {
3883       j--;
3884     }
3885     j++;
3886 
3887     if (j < npaths) {
3888       /* Should have the first querystart just after queryend */
3889       to = by_querystart[j];
3890 
3891       if ((dist = Stage3_queryend(to) - Stage3_querystart(from)) > maxdist) {
3892 	bestfrom = from;
3893 	bestto = to;
3894 	maxdist = dist;
3895       }
3896     }
3897   }
3898 
3899   FREE(by_querystart);
3900   FREE(by_queryend);
3901 
3902 
3903   *foundp = false;
3904   if (maxdist < CHIMERA_SLOP) {
3905     debug2(printf("maxdist %d < CHIMERA_SLOP %d\n",maxdist,CHIMERA_SLOP));
3906   } else {
3907     if (middle_piece_chimera_p(&querystart,&queryend,bestfrom,bestto) == true) {
3908       if ((querysubseq = Sequence_subsequence(queryseq,querystart,queryend)) != NULL) {
3909 	if ((querysubuc = Sequence_subsequence(queryuc,querystart,queryend)) != NULL) {
3910 	  debug2(printf("Performing Stage 3 on %d..%d\n",querystart,queryend));
3911 
3912 	  diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(querysubuc),
3913 				      Sequence_fulllength(querysubuc),Oligoindex_array_elt(oligoindices_major,0));
3914 	  if (poorp == true || repetitivep == true) {
3915 	    debug2(printf("Subsequence is poor or repetitive\n"));
3916 	  } else {
3917 	    if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
3918 	      gregions = Stage1_compute_nonstranded(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,
3919 						    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3920 						    stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3921 	    } else {
3922 	      gregions = Stage1_compute(&lowidentityp,querysubuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
3923 					chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
3924 					stutterhits,diagnostic,/*worker_stopwatch*/NULL,/*nbest*/10);
3925 	    }
3926 	    debug2(printf("Performing Stage 3 starting with list length %d\n",List_length(stage3list)));
3927 	    middlepieces = stage3_from_gregions(/*stage3list*/NULL,gregions,querysubseq,querysubuc,
3928 #ifdef PMAP
3929 						queryntseq,
3930 #endif
3931 						usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
3932 						pairpool,diagpool,cellpool,
3933 						dynprogL,dynprogM,dynprogR,/*worker_stopwatch*/NULL);
3934 	  }
3935 	  Diagnostic_free(&diagnostic);
3936 
3937 	  /* Above function frees gregions */
3938 	  Sequence_free(&querysubuc);
3939 	}
3940 	Sequence_free(&querysubseq);
3941       }
3942     }
3943 
3944     if (middlepieces != NULL) {
3945       middlepieces = stage3list_sort(middlepieces);
3946 
3947       r = middlepieces;
3948       mergeableAp = mergeableBp = false;
3949       while (r != NULL && mergeableAp == false && mergeableBp == false) {
3950 	middle = (Stage3_T) List_head(r);
3951 	if (middle != bestfrom && middle != bestto) {
3952 	  if (Chimera_local_join_p(bestfrom,middle,CHIMERA_SLOP) == true) {
3953 	    if ((breakpointA = find_breakpoint(&chimera_cdna_direction_A,&chimeraposA,&chimeraequivposA,&exonexonposA,
3954 					       &donorA1,&donorA2,&acceptorA2,&acceptorA1,
3955 					       &donor_watsonp_A,&acceptor_watsonp_A,&donor_prob_A,&acceptor_prob_A,
3956 					       bestfrom,/*to*/middle,
3957 #ifdef PMAP
3958 					       queryntseq,
3959 #endif
3960 					       queryseq,queryuc,queryntlength,
3961 					       genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3962 	      mergeableAp = false;
3963 	    } else {
3964 	      mergeableAp = Stage3_mergeable(bestfrom,/*to*/middle,breakpointA,queryntlength);
3965 	    }
3966 	  }
3967 	  if (Chimera_local_join_p(middle,bestto,CHIMERA_SLOP) == true) {
3968 	    if ((breakpointB = find_breakpoint(&chimera_cdna_direction_B,&chimeraposB,&chimeraequivposB,&exonexonposB,
3969 					       &donorB1,&donorB2,&acceptorB2,&acceptorB1,
3970 					       &donor_watsonp_B,&acceptor_watsonp_B,&donor_prob_B,&acceptor_prob_B,
3971 					       /*from*/middle,to,
3972 #ifdef PMAP
3973 					       queryntseq,
3974 #endif
3975 					       queryseq,queryuc,queryntlength,
3976 					       genomecomp,genomecomp_alt,chromosome_iit,pairpool)) <= 0) {
3977 	      mergeableBp = false;
3978 	    } else {
3979 	      mergeableBp = Stage3_mergeable(/*from*/middle,bestto,breakpointB,queryntlength);
3980 	    }
3981 	  }
3982 	}
3983 	r = List_next(r);
3984       }
3985 
3986       if (mergeableAp == true) {
3987 	debug2(printf("Middle segment %p found and mergeable locally with from! -- Merging as a readthrough.  cdna_direction = %d\n",
3988  	              middle,chimera_cdna_direction_A));
3989 	stage3list =
3990 	  merge_left_and_right_readthrough(&mergedAp,stage3list,
3991 					   /*stage3array_sub1*/&bestfrom,/*npaths_sub1:1,*//*bestfrom*/0,
3992 					   /*stage3array_sub2*/&middle,/*npaths_sub2:1,*//*bestto*/0,
3993 					   breakpointA,queryntlength,queryseq,
3994 #ifdef PMAP
3995 					   /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
3996 					   /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
3997 					   /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
3998 #else
3999 					   /*queryseq_ptr*/Sequence_fullpointer(queryseq),
4000 					   /*queryuc_ptr*/Sequence_fullpointer(queryuc),
4001 #endif
4002 					   pairpool,dynprogL,dynprogM,dynprogR,
4003 					   oligoindices_minor,diagpool,cellpool);
4004 
4005 #ifndef PMAP
4006 	Stage3_guess_cdna_direction(from);
4007 #endif
4008 
4009 	if (mergedAp == true) {
4010 	  *foundp = true;
4011 	}
4012 
4013       } else if (mergeableBp == true) {
4014 	debug2(printf("Middle segment %p found and mergeable locally with to! -- Merging as a readthrough.  cdna_direction = %d\n",
4015 		      middle,chimera_cdna_direction_B));
4016 	stage3list =
4017 	  merge_left_and_right_readthrough(&mergedBp,stage3list,
4018 					   /*stage3array_sub1*/&middle,/*npaths_sub1:1,*//*bestfrom*/0,
4019 					   /*stage3array_sub2*/&bestto,/*npaths_sub2:1,*//*bestto*/0,
4020 					   breakpointB,queryntlength,queryseq,
4021 #ifdef PMAP
4022 					   /*queryaaseq_ptr*/Sequence_fullpointer(queryseq),
4023 					   /*queryseq_ptr*/Sequence_fullpointer(queryntseq),
4024 					   /*queryuc_ptr*/Sequence_fullpointer(queryntseq),
4025 #else
4026 					   /*queryseq_ptr*/Sequence_fullpointer(queryseq),
4027 					   /*queryuc_ptr*/Sequence_fullpointer(queryuc),
4028 #endif
4029 					   pairpool,dynprogL,dynprogM,dynprogR,
4030 					   oligoindices_minor,diagpool,cellpool);
4031 #ifndef PMAP
4032 	Stage3_guess_cdna_direction(middle);
4033 #endif
4034 
4035 	if (mergedBp == true) {
4036 	  *foundp = true;
4037 	}
4038 
4039       } else {
4040 	debug2(printf("Middle segment found but notmergeable\n"));
4041       }
4042 
4043     }
4044   }
4045 
4046   stage3list = List_append(stage3list,middlepieces);
4047 
4048   return stage3list;
4049 }
4050 
4051 
4052 
4053 static List_T
apply_stage3(bool * mergedp,Chimera_T * chimera,List_T gregions,Sequence_T queryseq,Sequence_T queryuc,Sequence_T queryntseq,Sequence_T usersegment,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)4054 apply_stage3 (bool *mergedp, Chimera_T *chimera, List_T gregions, Sequence_T queryseq, Sequence_T queryuc,
4055 #ifdef PMAP
4056 	      Sequence_T queryntseq,
4057 #endif
4058 	      Sequence_T usersegment, Stage2_alloc_T stage2_alloc,
4059 	      Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
4060 	      Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
4061 	      Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR, Stopwatch_T worker_stopwatch) {
4062   List_T stage3list, newstage3list, split_objects, p, q;
4063   Stage3_T nonchimericbest, chimera1, chimera2, stage3, newstage3;
4064   bool testlocalp, testchimerap, foundp;
4065   int effective_start, effective_end;
4066   int queryntlength;
4067   int iter;
4068 
4069   Chrnum_T chrnum;
4070   Univcoord_T chroffset, chrhigh;
4071   Chrpos_T chrlength;
4072   List_T pairs_below, pairs_above;
4073   bool watsonp;
4074   int cdna_direction, genestrand, sensedir;
4075 
4076 
4077   *mergedp = false;
4078   *chimera = NULL;
4079 
4080   debug(printf("Calling stage3_from_gregions\n"));
4081   stage3list = stage3_from_gregions(/*stage3list*/(List_T) NULL,gregions,queryseq,queryuc,
4082 #ifdef PMAP
4083 				    queryntseq,
4084 #endif
4085 				    usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4086 				    pairpool,diagpool,cellpool,
4087 				    dynprogL,dynprogM,dynprogR,worker_stopwatch);
4088 
4089   debug2(printf("Initial search gives stage3list of length %d\n",List_length(stage3list)));
4090 #ifdef DEBUG2
4091   for (p = stage3list; p != NULL; p = List_next(p)) {
4092     Stage3_print_ends(List_head(p));
4093   }
4094 #endif
4095 
4096   if (diag_debug == true) {
4097     return stage3list;		/* really diagonals */
4098   }
4099 
4100   queryntlength = Sequence_ntlength(queryseq);
4101 
4102   if (stage3list != NULL) {
4103     iter = 0;
4104     testlocalp = true;
4105     while (testlocalp == true && iter++ < MAX_CHIMERA_ITER) {
4106       debug2(printf("\n\n*** Testing for local on %d Stage3_T objects, iter %d ***\n",
4107 		    List_length(stage3list),iter));
4108 
4109       /* Stage3_recompute_goodness(stage3list); */
4110       /* stage3list = stage3list_remove_duplicates(stage3list); */
4111       stage3list = stage3list_sort(stage3list);
4112 
4113 #ifdef DEBUG2
4114       for (p = stage3list; p != NULL; p = List_next(p)) {
4115 	Stage3_print_ends(List_head(p));
4116       }
4117       printf("\n");
4118 #endif
4119       nonchimericbest = (Stage3_T) List_head(stage3list);
4120       debug2(printf("nonchimericbest is %p\n",nonchimericbest));
4121 
4122 #if 0
4123       if (List_length(stage3list) <= 1) {
4124 	debug2(printf("Only 0 or 1 alignments, so won't look for local\n"));
4125 	testlocalp = false;
4126       }
4127       else
4128 #endif
4129 
4130       if (Stage3_domain(nonchimericbest) < chimera_margin) {
4131 	debug2(printf("Existing alignment is too short, so won't look for local\n"));
4132 	testlocalp = false;
4133 
4134 #if 0
4135       } else if (Stage3_fracidentity(nonchimericbest) < CHIMERA_IDENTITY &&
4136 		 Chimera_alignment_break(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq),CHIMERA_FVALUE) >= chimera_margin
4137 		 ) {
4138 	debug2(printf("Break in alignment quality at %d..%d detected, so will look for local\n",
4139 		      effective_start,effective_end));
4140 	testlocalp = true;
4141 #endif
4142 
4143       } else if (Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)) >= chimera_margin) {
4144 	debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for local\n",
4145 		      effective_start,effective_end,Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)),chimera_margin));
4146 	testlocalp = true;
4147 
4148       } else {
4149 	debug2(printf("Good alignment already with identity %f, so won't look for local\n",
4150 		      Stage3_fracidentity(nonchimericbest)));
4151 	testlocalp = false;
4152       }
4153 
4154       if (testlocalp == true) {
4155 	testlocalp = false;
4156 	debug2(printf("Checking for local, starting with list length %d, effective_start %d, effective_end %d\n",
4157 		      List_length(stage3list),effective_start,effective_end));
4158 	stage3list = check_for_local(&(*mergedp),stage3list,effective_start,effective_end,
4159 				     queryseq,queryuc,
4160 #ifdef PMAP
4161 				     queryntseq,
4162 #endif
4163 				     queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4164 				     matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4165 	debug2(printf("After check for local, we still have %d paths\n",List_length(stage3list)));
4166 
4167 #if 0
4168 	/* For some reason, we need to filter out cases where npairs is 0 */
4169 	old = stage3list;
4170 	stage3list = (List_T) NULL;
4171 	for (p = old; p != NULL; p = List_next(p)) {
4172           stage3 = (Stage3_T) List_head(p);
4173           if (Stage3_npairs(stage3) == 0) {
4174             Stage3_free(&stage3);
4175           } else {
4176             stage3list = List_push(stage3list,(void *) stage3);
4177           }
4178         }
4179 	List_free(&old);
4180 #endif
4181 
4182 	if (*mergedp == true) {
4183 	  testlocalp = true;	/* Local merge */
4184 	} else if (iter == 1) {
4185 	  /* Check for middle pieces only on first iteration */
4186 	  debug2(printf("Checking for middle piece local, starting with list length %d\n",List_length(stage3list)));
4187 	  stage3list = check_middle_piece_local(&foundp,stage3list,queryseq,queryuc,
4188 #ifdef PMAP
4189 						queryntseq,
4190 #endif
4191 						queryntlength,stage2_alloc,oligoindices_major,oligoindices_minor,
4192 						pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4193 	  if (foundp == true) {
4194 	    /* Iterate */
4195 	    testlocalp = true;
4196 	  }
4197 	} else {
4198 	  testlocalp = false;
4199 	}
4200       }
4201     }
4202   }
4203 
4204   if (stage3list != NULL) {
4205     iter = 0;
4206     testchimerap = true;
4207     while (testchimerap == true && iter++ < MAX_CHIMERA_ITER) {
4208       debug2(printf("\n\n*** Testing for chimera on %d Stage3_T objects, iter %d ***\n",
4209 		    List_length(stage3list),iter));
4210 
4211       /* Stage3_recompute_goodness(stage3list); */
4212       /* stage3list = stage3list_remove_duplicates(stage3list); */
4213       stage3list = stage3list_sort(stage3list);
4214 
4215 #ifdef DEBUG2
4216       for (p = stage3list; p != NULL; p = List_next(p)) {
4217 	Stage3_print_ends(List_head(p));
4218       }
4219       printf("\n");
4220 #endif
4221       nonchimericbest = (Stage3_T) List_head(stage3list);
4222       debug2(printf("nonchimericbest is %p\n",nonchimericbest));
4223 
4224       if (novelsplicingp == false) {
4225 	testchimerap = false;
4226 
4227       } else if (chimera_margin <= 0) {
4228 	debug2(printf("turned off\n"));
4229 	testchimerap = false;
4230 
4231       } else if (maxpaths_report == 1) {
4232 	debug2(printf("maxpaths set to 1\n"));
4233 	testchimerap = false;
4234 
4235       } else if (Stage3_domain(nonchimericbest) < chimera_margin) {
4236 	debug2(printf("Existing alignment is too short, so won't look for chimera\n"));
4237 	testchimerap = false;
4238 
4239 #if 0
4240       } else if (Stage3_fracidentity(nonchimericbest) < CHIMERA_IDENTITY &&
4241 		 Chimera_alignment_break(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq),CHIMERA_FVALUE) >= chimera_margin
4242 		 ) {
4243 	debug2(printf("Break in alignment quality at %d..%d detected, so will look for chimera\n",
4244 		      effective_start,effective_end));
4245 	testchimerap = true;
4246 #endif
4247 
4248       } else if (Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)) >= chimera_margin) {
4249 	debug2(printf("Large margin at %d..%d detected (%d >= %d), so will look for chimera\n",
4250 		      effective_start,effective_end,Stage3_largemargin(&effective_start,&effective_end,nonchimericbest,Sequence_ntlength(queryseq)),chimera_margin));
4251 	testchimerap = true;
4252 
4253       } else {
4254 	debug2(printf("Good alignment already with identity %f, so won't look for chimera\n",
4255 		      Stage3_fracidentity(nonchimericbest)));
4256 	testchimerap = false;
4257       }
4258 
4259       if (testchimerap == true) {
4260 	testchimerap = false;
4261 	debug2(printf("Checking for chimera, starting with list length %d, effective_start %d, effective_end %d\n",
4262 		      List_length(stage3list),effective_start,effective_end));
4263 	stage3list = check_for_chimera(&(*mergedp),&(*chimera),stage3list,effective_start,effective_end,
4264 				       queryseq,queryuc,
4265 #ifdef PMAP
4266 				       queryntseq,
4267 #endif
4268 				       queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4269 				       matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4270 	debug2(printf("chimera is %p\n",*chimera));
4271 	if (*chimera != NULL) {
4272 	  testchimerap = false;
4273 	} else {
4274 	  if (*mergedp == true) {
4275 	    testchimerap = true;	/* Local merge */
4276 	  } else if (iter == 1) {
4277 	    /* Check for middle pieces only on first iteration */
4278 	    debug2(printf("Checking for middle piece chimera, starting with list length %d\n",List_length(stage3list)));
4279 	    stage3list = check_middle_piece_chimera(&foundp,stage3list,queryseq,queryuc,
4280 #ifdef PMAP
4281 						    queryntseq,
4282 #endif
4283 						    queryntlength,usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4284 						    matchpool,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR);
4285 	    if (foundp == true) {
4286 	      /* Iterate */
4287 	      testchimerap = true;
4288 	    } else {
4289 	      testchimerap = false;
4290 	    }
4291 	  } else {
4292 	    testchimerap = false;
4293 	  }
4294 	}
4295 	debug2(printf("testchimerap is %d\n",testchimerap));
4296       }
4297     }
4298   }
4299 
4300   debug2(printf("apply_stage3 returning list of length %d\n",List_length(stage3list)));
4301 #ifdef DEBUG2
4302   for (p = stage3list; p != NULL; p = List_next(p)) {
4303     stage3 = (Stage3_T) List_head(p);
4304     printf("%p %p\n",stage3,Stage3_pairs(stage3));
4305   }
4306 #endif
4307 
4308   /* Split on large introns (may need to check whether stage3 belongs
4309      to a chimera) */
4310   if (split_large_introns_p == true) {
4311     newstage3list = (List_T) NULL;
4312     for (p = stage3list; p != NULL; p = List_next(p)) {
4313       stage3 = (Stage3_T) List_head(p);
4314       if (Stage3_npairs(stage3) == 0) {
4315 	Stage3_free(&stage3);
4316       } else if ((split_objects = Stage3_split(stage3,queryseq,pairpool)) == NULL) {
4317 	debug(printf("Pushing %p onto newstage3list\n",stage3));
4318 	newstage3list = List_push(newstage3list,(void *) stage3);
4319       } else {
4320 	for (q = split_objects; q != NULL; q = List_next(q)) {
4321 	  newstage3 = (Stage3_T) List_head(q);
4322 	  debug(printf("Pushing %p onto newstage3list\n",newstage3));
4323 	  newstage3list = List_push(newstage3list,(void *) newstage3);
4324 	}
4325 	List_free(&split_objects);
4326 	Stage3_free(&stage3);
4327       }
4328     }
4329     List_free(&stage3list);
4330     stage3list = newstage3list;
4331   }
4332 
4333 
4334   /* Split circular alignments (need to guarantee that chimeras do not
4335      contain alignments to circular chromosomes) */
4336   newstage3list = (List_T) NULL;
4337   for (p = stage3list; p != NULL; p = List_next(p)) {
4338     stage3 = (Stage3_T) List_head(p);
4339     chrnum = Stage3_chrnum(stage3);
4340     if (circularp[chrnum] == false) {
4341       newstage3list = List_push(newstage3list,(void *) stage3);
4342     } else {
4343       chroffset = Stage3_chroffset(stage3);
4344       chrhigh = Stage3_chrhigh(stage3);
4345       chrlength = Stage3_chrlength(stage3);
4346       watsonp = Stage3_watsonp(stage3);
4347 
4348       Pair_split_circular(&pairs_below,&pairs_above,Stage3_pairs(stage3),
4349 			  chrlength,pairpool,watsonp);
4350 #if 0
4351       printf("PAIRS BELOW\n");
4352       Pair_dump_list(pairs_below,true);
4353       printf("PAIRS ABOVE\n");
4354       Pair_dump_list(pairs_above,true);
4355 #endif
4356 
4357       cdna_direction = Stage3_cdna_direction(stage3);
4358       genestrand = Stage3_genestrand(stage3);
4359       sensedir = Stage3_sensedir(stage3);
4360 
4361       if ((newstage3 = Stage3_new_from_pairs(pairs_below,cdna_direction,watsonp,genestrand,sensedir,
4362 					     pairpool,queryseq,/*query_subseq_offset*/0,
4363 					     chrnum,chroffset,chrhigh,chrlength)) != NULL) {
4364 	debug(printf("Pushing %p onto stage3list\n",newstage3));
4365 	newstage3list = List_push(newstage3list,(void *) newstage3);
4366       }
4367       if ((newstage3 = Stage3_new_from_pairs(pairs_above,cdna_direction,watsonp,genestrand,sensedir,
4368 					     pairpool,queryseq,/*query_subseq_offset*/0,
4369 					     chrnum,chroffset,chrhigh,chrlength)) != NULL) {
4370 	debug(printf("Pushing %p onto stage3list\n",newstage3));
4371 	newstage3list = List_push(newstage3list,(void *) newstage3);
4372       }
4373       Stage3_free(&stage3);
4374     }
4375   }
4376   List_free(&stage3list);
4377   stage3list = newstage3list;
4378 
4379 
4380   /* Needed after call to stage3_from_gregions */
4381   /* Stage3_recompute_goodness(stage3list); */
4382 
4383   /* Final call, so do both filtering and sorting */
4384   Stage3_recompute_coverage(stage3list,queryseq);
4385   stage3list = stage3list_filter_and_sort(&(*chimera),stage3list);
4386   debug2(printf("After filter and sort, have %d paths\n",List_length(stage3list)));
4387 
4388   if (*chimera != NULL && List_length(stage3list) > 2) {
4389     /* Compare chimera against non-chimeric alignments */
4390     chimera1 = (Stage3_T) List_head(stage3list);
4391     chimera2 = (Stage3_T) List_head(List_next(stage3list));
4392     nonchimericbest = (Stage3_T) List_head(List_next(List_next(stage3list)));
4393     debug2(printf("chimera1 %d, chimera2 %d\n",Stage3_goodness(chimera1),Stage3_goodness(chimera2)));
4394     debug2(printf("%p non-chimeric %d %d..%d\n",
4395 		  nonchimericbest,Stage3_goodness(nonchimericbest),Stage3_querystart(nonchimericbest),Stage3_queryend(nonchimericbest)));
4396 
4397     if (Stage3_queryend(nonchimericbest) > (Stage3_querystart(chimera2) + Stage3_queryend(chimera2))/2 &&
4398 	Stage3_querystart(nonchimericbest) < (Stage3_querystart(chimera1) + Stage3_queryend(chimera1))/2) {
4399       stage3list = List_pop(stage3list,(void **) &chimera1);
4400       stage3list = List_pop(stage3list,(void **) &chimera2);
4401       Stage3_free(&chimera1);
4402       Stage3_free(&chimera2);
4403       Chimera_free(&(*chimera));
4404       *chimera = (Chimera_T) NULL;
4405     }
4406   }
4407 
4408   debug2(printf("apply_stage3 returning %d paths\n",List_length(stage3list)));
4409 #ifdef DEBUG2
4410   for (p = stage3list; p != NULL; p = List_next(p)) {
4411     stage3 = (Stage3_T) List_head(p);
4412     printf("%p %p\n",stage3,Stage3_pairs(stage3));
4413   }
4414 #endif
4415 
4416   return stage3list;
4417 }
4418 
4419 
4420 static Filestring_T
process_request(Filestring_T * fp_failedinput,double * worker_runtime,Request_T request,Sequence_T usersegment,Matchpool_T matchpool,Pairpool_T pairpool,Diagpool_T diagpool,Cellpool_T cellpool,Stage2_alloc_T stage2_alloc,Oligoindex_array_T oligoindices_major,Oligoindex_array_T oligoindices_minor,Dynprog_T dynprogL,Dynprog_T dynprogM,Dynprog_T dynprogR,Stopwatch_T worker_stopwatch)4421 process_request (Filestring_T *fp_failedinput, double *worker_runtime, Request_T request, Sequence_T usersegment,
4422 		 Matchpool_T matchpool, Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
4423 		 Stage2_alloc_T stage2_alloc, Oligoindex_array_T oligoindices_major, Oligoindex_array_T oligoindices_minor,
4424 		 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
4425 		 Stopwatch_T worker_stopwatch) {
4426   Filestring_T fp;
4427   Result_T result;
4428   int jobid;
4429   Diagnostic_T diagnostic;
4430   Sequence_T queryseq, queryuc;
4431   Chimera_T chimera = NULL;
4432   bool mergedp, lowidentityp;
4433   bool repetitivep = false, poorp = false;
4434 
4435   List_T gregions = NULL, stage3list;
4436   Stage3_T *stage3array;
4437   int npaths_primary, npaths_altloc, first_absmq, second_absmq;
4438 #ifdef PMAP
4439   Sequence_T queryntseq;
4440 #endif
4441 
4442   jobid = Request_id(request);
4443   queryseq = Request_queryseq(request);
4444   Matchpool_reset(matchpool);
4445   Pairpool_reset(pairpool);
4446   Diagpool_reset(diagpool);
4447   Cellpool_reset(cellpool);
4448 
4449 
4450   if (worker_stopwatch != NULL) {
4451     Stopwatch_start(worker_stopwatch);
4452   }
4453 
4454   if (Sequence_fulllength_given(queryseq) <= 0) {
4455     result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4456 			/*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4457 			/*diagnostic*/NULL,EMPTY_SEQUENCE);
4458 
4459   } else if (Sequence_fulllength_given(queryseq) <
4460 #ifdef PMAP
4461 	     index1part_aa
4462 #else
4463 	     index1part
4464 #endif
4465 	     ) {
4466     result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4467 			/*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4468 			/*diagnostic*/NULL,SHORT_SEQUENCE);
4469 
4470   } else {			/* Sequence_fulllength_given(queryseq) > 0 */
4471     queryuc = Sequence_uppercase(queryseq);
4472 #ifdef PMAP
4473     queryntseq = Sequence_convert_to_nucleotides(queryseq);
4474 #endif
4475 
4476     diagnostic = evaluate_query(&poorp,&repetitivep,Sequence_fullpointer(queryuc),
4477 				Sequence_fulllength(queryuc),Oligoindex_array_elt(oligoindices_major,0));
4478 
4479 #ifndef PMAP
4480     if (poorp == true && prune_poor_p == true) {
4481       result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4482 			  /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4483 			  diagnostic,POOR_SEQUENCE);
4484     } else if (repetitivep == true && prune_repetitive_p == true) {
4485       result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,(Stage3_T *) NULL,
4486 			  /*npaths_primary*/0,/*npaths_altloc*/0,/*first_absmq*/0,/*second_absmq*/0,
4487 			  diagnostic,REPETITIVE);
4488     }
4489 #endif
4490 
4491     if (usersegment != NULL) {
4492 #ifndef PMAP
4493 #if 0
4494       /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
4495       Sequence_trim(queryseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
4496       Sequence_trim(queryuc,diagnostic->query_trim_start,diagnostic->query_trim_end);
4497 #endif
4498 #endif
4499       stage3array = stage3_from_usersegment(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,queryseq,queryuc,
4500 #ifdef PMAP
4501 					    queryntseq,
4502 #endif
4503 					    usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4504 					    pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,worker_stopwatch);
4505       result = Result_new(jobid,/*mergedp*/false,(Chimera_T) NULL,stage3array,npaths_primary,npaths_altloc,
4506 			  first_absmq,second_absmq,diagnostic,NO_FAILURE);
4507 
4508     } else {		/* Not user segment and not maponly */
4509 #ifndef PMAP
4510 #if 0
4511       /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
4512       Sequence_trim(queryseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
4513       Sequence_trim(queryuc,diagnostic->query_trim_start,diagnostic->query_trim_end);
4514 #endif
4515 #endif
4516 
4517       debug(printf("Calling stage 1\n"));
4518       if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
4519 	gregions = Stage1_compute_nonstranded(&lowidentityp,queryuc,indexdb_fwd,indexdb_fwd,
4520 					      chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
4521 					      stutterhits,diagnostic,worker_stopwatch,/*nbest*/10);
4522 
4523       } else {
4524 	gregions = Stage1_compute(&lowidentityp,queryuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
4525 				  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
4526 				  stutterhits,diagnostic,worker_stopwatch,/*nbest*/10);
4527       }
4528       debug(printf("Got %d gregions\n",List_length(gregions)));
4529 
4530       if (stage1debug == true) {
4531 	/* result = Result_new_stage1debug(jobid,gregions,diagnostic,NO_FAILURE); */
4532 	abort();
4533       } else {
4534 	debug(printf("Applying stage 3\n"));
4535 	stage3list = apply_stage3(&mergedp,&chimera,gregions,queryseq,queryuc,
4536 #ifdef PMAP
4537 				  queryntseq,
4538 #endif
4539 				  usersegment,stage2_alloc,oligoindices_major,oligoindices_minor,
4540 				  matchpool,pairpool,diagpool,cellpool,
4541 				  dynprogL,dynprogM,dynprogR,worker_stopwatch);
4542 	if (diag_debug == true) {
4543 #if 0
4544 	  result = Result_new_diag_debug(jobid,/*diagonals*/stage3list,diagnostic,NO_FAILURE);
4545 #endif
4546 	  abort();
4547 	} else if (stage3list == NULL) {
4548 	  result = Result_new(jobid,mergedp,chimera,/*stage3array*/NULL,/*npaths_primary*/0,/*npaths_altloc*/0,
4549 			      /*first_absmq*/0,/*second_absmq*/0,diagnostic,NO_FAILURE);
4550 	} else if (chimera == NULL) {
4551 	  stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
4552 					      stage3list,/*chimerap*/false,/*remove_overlaps_p*/true);
4553 	  debug2(printf("chimera is NULL.  npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));
4554 	  result = Result_new(jobid,mergedp,/*chimera*/NULL,stage3array,npaths_primary,npaths_altloc,
4555 			      first_absmq,second_absmq,diagnostic,NO_FAILURE);
4556 	} else {
4557 	  stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
4558 					      stage3list,/*chimerap*/true,/*remove_overlaps_p*/false);
4559 	  debug2(printf("chimera is not NULL.  npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));
4560 	  result = Result_new(jobid,mergedp,chimera,stage3array,npaths_primary,npaths_altloc,
4561 			      first_absmq,second_absmq,diagnostic,NO_FAILURE);
4562 	}
4563       }
4564 
4565       Oligoindex_clear_inquery(Oligoindex_array_elt(oligoindices_major,0),/*queryuc_ptr*/Sequence_fullpointer(queryuc),
4566 			       /*querystart*/0,/*queryend*/Sequence_fulllength(queryuc));
4567 
4568     } /* Matches not user segment and not maponly */
4569 
4570 #ifdef PMAP
4571     Sequence_free(&queryntseq);
4572 #endif
4573     Sequence_free(&queryuc);
4574   } /* Matches sequence length > 0 */
4575 
4576   fp = Output_filestring_fromresult(&(*fp_failedinput),result,request,
4577 				    /*headerseq*/user_pairalign_p == true ? usersegment : queryseq);
4578   *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
4579   Result_free(&result);
4580   return fp;
4581 }
4582 
4583 
#ifdef HAVE_SIGACTION
static const Except_T sigfpe_error = {"SIGFPE--arithmetic exception"};
static const Except_T sigsegv_error = {"SIGSEGV--segmentation violation"};
static const Except_T sigtrap_error = {"SIGTRAP--hardware fault"};
static const Except_T misc_signal_error = {"Miscellaneous signal"};

/* Handler for fatal signals.  SIGPIPE is ignored (the handler simply
   returns).  For every other signal, the handler reports the signal
   name on stderr, calls Access_emergency_cleanup(), and, when built
   with pthreads, reports the query sequence attached to the current
   thread's request, then terminates the process with exit status 9.
   This function does not return for any signal other than SIGPIPE. */
static void
signal_handler (int sig) {
#ifdef HAVE_PTHREAD
  /* Only needed for the "problem sequence" report below */
  Request_T request;
  Sequence_T queryseq;
#endif

  if (sig == SIGPIPE) {
    /* Allow pipe */
    return;
  }

  /* SIGPIPE has already been handled above, so it needs no case here */
  switch (sig) {
  case SIGABRT: fprintf(stderr,"Signal received: SIGABRT\n"); break;
  case SIGFPE: fprintf(stderr,"Signal received: SIGFPE\n"); break;
  case SIGHUP: fprintf(stderr,"Signal received: SIGHUP\n"); break;
  case SIGILL:
    fprintf(stderr,"Signal received: SIGILL\n");
    fprintf(stderr,"An illegal instruction means that this program is being run on a computer\n");
    fprintf(stderr,"  with different features than the computer used to compile the program\n");
    fprintf(stderr,"You may need to re-compile the program on the same computer type as the target machine\n");
    fprintf(stderr,"  or re-compile with fewer features by doing something like\n");
    fprintf(stderr,"  ./configure --disable-simd\n");
    break;
  case SIGINT: fprintf(stderr,"Signal received: SIGINT\n"); break;
  case SIGQUIT: fprintf(stderr,"Signal received: SIGQUIT\n"); break;
  case SIGSEGV: fprintf(stderr,"Signal received: SIGSEGV\n"); break;
  case SIGSYS: fprintf(stderr,"Signal received: SIGSYS\n"); break;
  case SIGTERM: fprintf(stderr,"Signal received: SIGTERM\n"); break;
  case SIGTRAP: fprintf(stderr,"Signal received: SIGTRAP\n"); break;
  case SIGXCPU: fprintf(stderr,"Signal received: SIGXCPU\n"); break;
  case SIGXFSZ: fprintf(stderr,"Signal received: SIGXFSZ\n"); break;
  default: break;		/* Other signals are not named */
  }

  Access_emergency_cleanup();

#ifdef USE_MPI
  MPI_Barrier(MPI_COMM_WORLD);
#endif

#ifdef HAVE_PTHREAD
  /* The worker thread stores its current request under global_request_key */
  request = (Request_T) pthread_getspecific(global_request_key);
  if (request == NULL) {
    /* fprintf(stderr,"Unable to retrieve request for thread\n"); */
  } else {
    queryseq = Request_queryseq(request);
    if (queryseq == NULL) {
      fprintf(stderr,"Unable to retrieve queryseq for request\n");
    } else {
      fprintf(stderr,"Problem sequence: ");
      if (Sequence_accession(queryseq) == NULL) {
	/* Same wording as the worker loops use for sequences without an accession */
	fprintf(stderr,"unnamed (%d bp)\n",Sequence_fulllength(queryseq));
      } else {
	fprintf(stderr,"%s (%d bp)\n",Sequence_accession(queryseq),Sequence_fulllength(queryseq));
      }
    }
  }
#endif

  exit(9);
}
#endif
4649 
4650 
/* Interval constant consulted by the worker loops below when deciding
   whether to call the *_free_memory routines on the pair, diag, cell
   and match pools after a request has been processed. */
#define POOL_FREE_INTERVAL 200
4652 
#ifdef USE_MPI
/* Worker loop for an MPI process.
 *
 *   worker_id: identifier of this process, used only in diagnostics
 *   inbuffer:  input buffer from which query sequences are read
 *
 * Protocol, as implemented below: the worker first sends filestringid
 * -1 to rank 0 (tag MPI_TAG_FILESTRING_AVAIL) to say it is ready.  It
 * then repeatedly receives a request id from rank 0, skips input until
 * it has read that many sequences, processes the next one with
 * process_request(), and sends the resulting filestring (plus the
 * failed-input filestring when failedinput_root is set) back to rank 0.
 * When the input is exhausted it sends a final filestringid of -1 and
 * releases all of its per-worker storage.
 *
 * A failure of MPI_SSEND, an exception from process_request(), or
 * (under MEMUSAGE) a detected leak terminates the process with status 9.
 */
static void
worker_mpi_process (int worker_id, Inbuffer_T inbuffer) {
  bool donep = false;
  int nread = 0;
  MPI_Status status;

  Stage2_alloc_T stage2_alloc;
  Oligoindex_array_T oligoindices_major, oligoindices_minor;
  Dynprog_T dynprogL, dynprogM, dynprogR;
  Matchpool_T matchpool;
  Pairpool_T pairpool;
  Diagpool_T diagpool;
  Cellpool_T cellpool;
  Stopwatch_T worker_stopwatch;
  Request_T request;
  Filestring_T fp, fp_failedinput;
  Sequence_T queryseq, usersegment, pairalign_segment;
  int filestringid, requestid;
  int ret;
  int worker_jobid = 0;		/* Requests handled since the pools were last freed */
  double worker_runtime;

#ifdef MEMUSAGE
  long int memusage_constant = 0, memusage;
  char procname[32];
  char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
  /* worker_id is an int, so it must be printed with %d; snprintf bounds the write */
  snprintf(procname,sizeof(procname),"proc-%d",worker_id);
  Mem_usage_set_threadname(procname);
#endif

  /* Per-worker storage, reused across requests */
  stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
  oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/true);
  dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/false);
  dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/true);
  matchpool = Matchpool_new();
  pairpool = Pairpool_new();
  diagpool = Diagpool_new();
  cellpool = Cellpool_new();
  worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;

  usersegment = global_usersegment;

  /* Except_stack_create(); -- no worker threads, so no need to store request in global_request_key */

#ifdef MEMUSAGE
  memusage_constant += Mem_usage_report_std_heap();
  Genomicpos_commafmt_fill(comma0,memusage_constant);
  Mem_usage_reset_heap_baseline(0);
#endif

  /* Initial message to say that we are ready for a request */
  filestringid = -1;

  /* Use a synchronized send here to make sure outbuffer is ready */
  if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
    fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
    MPI_Finalize();
    exit(9);
  }

  while (donep == false) {
    MPI_RECV(&requestid,1,MPI_INT,/*source*/0,/*tag*/MPI_ANY_TAG,MPI_COMM_WORLD,&status);
    debugm(printf("worker_id %d got request %d\n",worker_id,requestid));

    /* Skip over the sequences that precede the requested one */
    while (nread < requestid &&
	   (queryseq = Inbuffer_read(&pairalign_segment,inbuffer,/*skipp*/true)) != NULL) {
      /* No need to free queryseq */
      nread++;
    }

    if (nread < requestid) {
      debugm(printf("because nread %d < requestid %d, worker_id %d is done\n",nread,requestid,worker_id));
      donep = true;
    } else if ((queryseq = Inbuffer_read(&pairalign_segment,inbuffer,/*skipp*/false)) == NULL) {
      debugm(printf("because final read is NULL, worker_id %d is done\n",worker_id));
      donep = true;
    } else {
      debugm(printf("worker_id %d starting to process request %d\n",worker_id,requestid));
      request = Request_new(requestid,queryseq);
      nread++;

      if (user_pairalign_p == true) {
	/* Rebuild the genome structures from the current user segment */
	genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
	Genome_user_setup(genomecomp_blocks,genomelength);
	Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
	Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
#ifdef PMAP
	Oligoindex_pmap_setup(genomecomp);
#else
	Oligoindex_hr_setup(genomecomp_blocks,mode);
	/* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
#endif
      }

#ifdef MEMUSAGE
      queryseq = Request_queryseq(request);
      fprintf(stderr,"Proc %d starting %s\n",worker_id,Sequence_accession(queryseq));
      Mem_usage_reset_stack_max();
      Mem_usage_reset_heap_max();
#endif

      TRY
        fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
			     matchpool,pairpool,diagpool,cellpool,
			     stage2_alloc,oligoindices_major,oligoindices_minor,
			     dynprogL,dynprogM,dynprogR,worker_stopwatch);

      ELSE
	queryseq = Request_queryseq(request);
        if (Sequence_accession(queryseq) == NULL) {
	  fprintf(stderr,"Problem with unnamed sequence (%d bp)\n",Sequence_fulllength_given(queryseq));
	} else {
	  fprintf(stderr,"Problem with sequence %s (%d bp)\n",
		  Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
	}
	fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
	fprintf(stderr,"Exiting...\n");
	exit(9);
      RERAISE;
      END_TRY;

      if (user_pairalign_p == true) {
	/* The segment read along with this query is used for the next request */
	usersegment = pairalign_segment;
	FREE(genomecomp_blocks);
      }

      filestringid = Filestring_id(fp);
      debugm(printf("worker proc %d sending filestring %d...",worker_id,filestringid));

      /* Use a synchronized send here to make sure outbuffer is ready */
      if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
	fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
	MPI_Finalize();
	exit(9);
      }
      Filestring_Send(fp,/*dest*/0,/*tag*/MPI_TAG_DEFAULT,MPI_COMM_WORLD);
      if (failedinput_root != NULL) {
	Filestring_Send(fp_failedinput,/*dest*/0,/*tag*/MPI_TAG_DEFAULT,MPI_COMM_WORLD);
      }
      debugm(printf("done with filestring %d\n",filestringid));

      /* Release pool memory once every POOL_FREE_INTERVAL requests.  The
	 counter is reset rather than left to grow, so it cannot overflow. */
      if (++worker_jobid >= POOL_FREE_INTERVAL) {
	worker_jobid = 0;
	Pairpool_free_memory(pairpool);
	Diagpool_free_memory(diagpool);
	Cellpool_free_memory(cellpool);
	Matchpool_free_memory(matchpool);
      }

#ifdef MEMUSAGE
      /* Copy acc before we free the request */
      queryseq = Request_queryseq(request);
      strncpy(acc,Sequence_accession(queryseq),100);
      acc[100] = '\0';
#endif

      Request_free(&request);

#ifdef MEMUSAGE
      Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
      Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
      Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
      Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
      Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());

      fprintf(stderr,"Acc %s, proc %d: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
	      acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);

      if ((memusage = Mem_usage_report_std_heap()) != 0) {
	fprintf(stderr,"Memory leak in proc %d of %ld bytes\n",worker_id,memusage);
	fflush(stdout);
	MPI_Finalize();
	exit(9);
      }
#endif
    }
  }

  /* Final message to say that we are done with all requests */
  debugm(printf("worker_id %d sending final message to say it is done\n",worker_id));
  filestringid = -1;
  if ((ret = MPI_SSEND(&filestringid,1,MPI_INT,/*dest*/0,/*tag*/MPI_TAG_FILESTRING_AVAIL,MPI_COMM_WORLD)) != 0) {
    fprintf(stderr,"MPI_SSEND returns error %d\n",ret);
    MPI_Finalize();
    exit(9);
  }

#ifdef MEMUSAGE
  Mem_usage_std_heap_add(memusage_constant);
#endif

  /* Except_stack_destroy(); */

  /* The stopwatch exists only when timingp is true */
  if (worker_stopwatch != NULL) {
    Stopwatch_free(&worker_stopwatch);
  }
  Cellpool_free(&cellpool);
  Diagpool_free(&diagpool);
  Pairpool_free(&pairpool);
  Matchpool_free(&matchpool);
  Dynprog_free(&dynprogR);
  Dynprog_free(&dynprogM);
  Dynprog_free(&dynprogL);
  Oligoindex_array_free(&oligoindices_minor);
  Oligoindex_array_free(&oligoindices_major);
  Stage2_alloc_free(&stage2_alloc);

#ifdef MEMUSAGE
  Mem_usage_set_threadname("main");
#endif

  debugm(printf("worker_id %d is now returning\n",worker_id));
  return;
}
#endif
4872 
4873 
4874 static void
single_thread()4875 single_thread () {
4876   Stage2_alloc_T stage2_alloc;
4877   Oligoindex_array_T oligoindices_major, oligoindices_minor;
4878   Dynprog_T dynprogL, dynprogM, dynprogR;
4879   Matchpool_T matchpool;
4880   Pairpool_T pairpool;
4881   Diagpool_T diagpool;
4882   Cellpool_T cellpool;
4883   Stopwatch_T worker_stopwatch;
4884   Request_T request;
4885   Sequence_T usersegment, pairalign_segment;
4886   Filestring_T fp, fp_failedinput;
4887   Sequence_T queryseq;
4888   int jobid = 0;
4889   double worker_runtime;
4890 
4891 #ifdef MEMUSAGE
4892   long int memusage, memusage_constant = 0;
4893   char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
4894 #endif
4895 
4896   stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
4897   oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
4898   oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
4899   dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4900 			 /*doublep*/true);
4901   dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4902 			 /*doublep*/false);
4903   dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
4904 			 /*doublep*/true);
4905   matchpool = Matchpool_new();
4906   pairpool = Pairpool_new();
4907   diagpool = Diagpool_new();
4908   cellpool = Cellpool_new();
4909   worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
4910 
4911   usersegment = global_usersegment;
4912 
4913   /* Except_stack_create(); -- requires pthreads */
4914 
4915 #ifdef MEMUSAGE
4916   memusage_constant += Mem_usage_report_std_heap();
4917   Genomicpos_commafmt_fill(comma0,memusage_constant);
4918   Mem_usage_reset_heap_baseline(0);
4919 #endif
4920 
4921   while ((request = Inbuffer_get_request(&pairalign_segment,inbuffer)) != NULL) {
4922 
4923     if (user_pairalign_p == true) {
4924       genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
4925       Genome_user_setup(genomecomp_blocks,genomelength);
4926       Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
4927       Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
4928 #ifdef PMAP
4929       Oligoindex_pmap_setup(genomecomp);
4930 #else
4931       Oligoindex_hr_setup(genomecomp_blocks,mode);
4932       /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
4933 #endif
4934     }
4935 
4936 #ifdef MEMUSAGE
4937     queryseq = Request_queryseq(request);
4938     fprintf(stderr,"Single thread starting %s\n",Sequence_accession(queryseq));
4939     Mem_usage_reset_stack_max();
4940     Mem_usage_reset_heap_max();
4941 #endif
4942 
4943     TRY
4944       fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
4945 			   matchpool,pairpool,diagpool,cellpool,
4946 			   stage2_alloc,oligoindices_major,oligoindices_minor,
4947 			   dynprogL,dynprogM,dynprogR,worker_stopwatch);
4948       if (timingp == true) {
4949         queryseq = Request_queryseq(request);
4950         fprintf(stderr,"%s\t%.6f\n",Sequence_accession(queryseq),worker_runtime);
4951       }
4952 
4953     ELSE
4954       queryseq = Request_queryseq(request);
4955       if (Sequence_accession(queryseq) == NULL) {
4956         fprintf(stderr,"Problem with unnamed sequence (%d bp)\n",Sequence_fulllength_given(queryseq));
4957       } else {
4958         fprintf(stderr,"Problem with sequence %s (%d bp)\n",
4959   	      Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
4960       }
4961       fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
4962       fprintf(stderr,"Exiting...\n");
4963       exit(9);
4964     RERAISE;
4965     END_TRY;
4966 
4967     if (user_pairalign_p == true) {
4968       usersegment = pairalign_segment;
4969       FREE(genomecomp_blocks);
4970     }
4971 
4972     Outbuffer_print_filestrings(fp,fp_failedinput);
4973 
4974     if (jobid % POOL_FREE_INTERVAL == 0) {
4975       Pairpool_free_memory(pairpool);
4976       Diagpool_free_memory(diagpool);
4977       Cellpool_free_memory(cellpool);
4978       Matchpool_free_memory(matchpool);
4979     }
4980 
4981 #ifdef MEMUSAGE
4982     /* Copy acc before we free the request */
4983     queryseq = Request_queryseq(request);
4984     strncpy(acc,Sequence_accession(queryseq),100);
4985     acc[100] = '\0';
4986 #endif
4987 
4988     Request_free(&request);
4989 
4990 #ifdef MEMUSAGE
4991     Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
4992     Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
4993     Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
4994     Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
4995     Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
4996 
4997     fprintf(stderr,"Acc %s: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
4998 	    acc,comma0,comma1,comma2,comma3,comma4,comma5);
4999 
5000     if ((memusage = Mem_usage_report_std_heap()) != 0) {
5001       fprintf(stderr,"Memory leak in single thread of %ld bytes\n",memusage);
5002       fflush(stdout);
5003       exit(9);
5004     }
5005 #endif
5006   }
5007 
5008 #ifdef MEMUSAGE
5009   Mem_usage_std_heap_add(memusage_constant);
5010 #endif
5011 
5012   /* Except_stack_destroy(); -- requires pthreads */
5013 
5014   if (worker_stopwatch != NULL) {
5015     Stopwatch_free(&worker_stopwatch);
5016   }
5017   Cellpool_free(&cellpool);
5018   Diagpool_free(&diagpool);
5019   Pairpool_free(&pairpool);
5020   Matchpool_free(&matchpool);
5021   Dynprog_free(&dynprogR);
5022   Dynprog_free(&dynprogM);
5023   Dynprog_free(&dynprogL);
5024   Oligoindex_array_free(&oligoindices_minor);
5025   Oligoindex_array_free(&oligoindices_major);
5026   Stage2_alloc_free(&stage2_alloc);
5027 
5028 #ifdef MEMUSAGE
5029   Mem_usage_set_threadname("main");
5030 #endif
5031 
5032   return;
5033 }
5034 
5035 
#ifdef HAVE_PTHREAD
/* Start routine for a pthread worker.
 *
 *   data: the worker's numeric id, passed as a pointer-sized integer;
 *         it is read only in DEBUG or MEMUSAGE builds
 *
 * Allocates per-thread working storage, then pulls requests from the
 * file-level inbuffer until none remain.  The current request is
 * published under global_request_key while it is being processed.
 * Each request is run through process_request() and the resulting
 * filestrings are handed to the file-level outbuffer.  An exception
 * raised while processing a request is reported on stderr and ends the
 * program with status 9.  Always returns NULL.
 */
static void *
worker_thread (void *data) {
  Stage2_alloc_T stage2_alloc;
  Oligoindex_array_T oligoindices_major, oligoindices_minor;
  Dynprog_T dynprogL, dynprogM, dynprogR;
  Matchpool_T matchpool;
  Pairpool_T pairpool;
  Diagpool_T diagpool;
  Cellpool_T cellpool;
  Stopwatch_T worker_stopwatch;
  Request_T request;
  Filestring_T fp, fp_failedinput;
  Sequence_T queryseq, usersegment, pairalign_segment;
  int worker_jobid = 0;		/* Requests handled since the pools were last freed */
  double worker_runtime;
#if defined(DEBUG) || defined(MEMUSAGE)
  long int worker_id = (long int) data;
#endif

#ifdef MEMUSAGE
  long int memusage_constant = 0, memusage;
  char threadname[32];
  char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
  /* snprintf bounds the write whatever the value of worker_id */
  snprintf(threadname,sizeof(threadname),"thread-%ld",worker_id);
  Mem_usage_set_threadname(threadname);
#endif

  /* Thread-specific data and storage */
  stage2_alloc = Stage2_alloc_new(MAX_QUERYLENGTH_FOR_ALLOC);
  oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
  dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/true);
  dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/false);
  dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
			 /*doublep*/true);
  matchpool = Matchpool_new();
  pairpool = Pairpool_new();
  diagpool = Diagpool_new();
  cellpool = Cellpool_new();
  worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;

  usersegment = global_usersegment;

  Except_stack_create();

#ifdef MEMUSAGE
  memusage_constant += Mem_usage_report_std_heap();
  Genomicpos_commafmt_fill(comma0,memusage_constant);
  Mem_usage_reset_heap_baseline(0);
#endif

  while ((request = Inbuffer_get_request(&pairalign_segment,inbuffer)) != NULL) {
    debug(printf("worker_thread %ld got request %d\n",worker_id,Request_id(request)));
    /* Publish the request so that signal_handler can name the problem sequence */
    pthread_setspecific(global_request_key,(void *) request);

    if (user_pairalign_p == true) {
      /* Rebuild the genome structures from the current user segment */
      genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(usersegment),Sequence_fulllength(usersegment));
      Genome_user_setup(genomecomp_blocks,genomelength);
      Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
      Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
#ifdef PMAP
      Oligoindex_pmap_setup(genomecomp);
#else
      Oligoindex_hr_setup(genomecomp_blocks,mode);
      /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
#endif
    }

#ifdef MEMUSAGE
    queryseq = Request_queryseq(request);
    fprintf(stderr,"Thread %ld starting %s\n",worker_id,Sequence_accession(queryseq));
    Mem_usage_reset_stack_max();
    Mem_usage_reset_heap_max();
#endif

    TRY
      fp = process_request(&fp_failedinput,&worker_runtime,request,usersegment,
			   matchpool,pairpool,diagpool,cellpool,
			   stage2_alloc,oligoindices_major,oligoindices_minor,
			   dynprogL,dynprogM,dynprogR,worker_stopwatch);
      if (timingp == true) {
        queryseq = Request_queryseq(request);
        fprintf(stderr,"%s\t%.6f\n",Sequence_accession(queryseq),worker_runtime);
      }

    ELSE
      queryseq = Request_queryseq(request);
      if (queryseq == NULL) {
	fprintf(stderr,"NULL");
      } else if (Sequence_accession(queryseq) == NULL) {
	fprintf(stderr,"unnamed (%d bp)",Sequence_fulllength_given(queryseq));
      } else {
	fprintf(stderr,"%s (%d bp)",Sequence_accession(queryseq),Sequence_fulllength_given(queryseq));
      }
      fprintf(stderr,"\n");
      fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");

      fprintf(stderr,"Exiting...\n");
      exit(9);
    RERAISE;
    END_TRY;

    if (user_pairalign_p == true) {
      /* The segment returned with this request is used for the next one */
      usersegment = pairalign_segment;
      FREE(genomecomp_blocks);
    }

    debug(printf("worker_thread %ld putting filestring %d\n",worker_id,Filestring_id(fp)));
    Outbuffer_put_filestrings(outbuffer,fp,fp_failedinput);

    /* Release pool memory once every POOL_FREE_INTERVAL requests.  The
       counter is reset rather than left to grow, so it cannot overflow. */
    if (++worker_jobid >= POOL_FREE_INTERVAL) {
      worker_jobid = 0;
      Pairpool_free_memory(pairpool);
      Diagpool_free_memory(diagpool);
      Cellpool_free_memory(cellpool);
      Matchpool_free_memory(matchpool);
    }

#ifdef MEMUSAGE
    /* Copy acc before we free the request */
    queryseq = Request_queryseq(request);
    strncpy(acc,Sequence_accession(queryseq),100);
    acc[100] = '\0';
#endif

    /* Withdraw the published pointer first, so that a signal arriving
       after the free cannot make signal_handler read a freed request */
    pthread_setspecific(global_request_key,NULL);
    Request_free(&request);

#ifdef MEMUSAGE
    Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
    Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
    Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
    Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
    Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());

    fprintf(stderr,"Acc %s, thread %ld: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
	    acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);

    if ((memusage = Mem_usage_report_std_heap()) != 0) {
      fprintf(stderr,"Memory leak in worker thread %ld of %ld bytes\n",worker_id,memusage);
      fflush(stdout);
      exit(9);
    }
#endif
  }

#ifdef MEMUSAGE
  Mem_usage_std_heap_add(memusage_constant);
#endif

  Except_stack_destroy();

  /* The stopwatch exists only when timingp is true */
  if (worker_stopwatch != NULL) {
    Stopwatch_free(&worker_stopwatch);
  }
  Cellpool_free(&cellpool);
  Diagpool_free(&diagpool);
  Pairpool_free(&pairpool);
  Matchpool_free(&matchpool);
  Dynprog_free(&dynprogR);
  Dynprog_free(&dynprogM);
  Dynprog_free(&dynprogL);
  Oligoindex_array_free(&oligoindices_minor);
  Oligoindex_array_free(&oligoindices_major);
  Stage2_alloc_free(&stage2_alloc);

#ifdef MEMUSAGE
  Mem_usage_set_threadname("main");
#endif

  return (void *) NULL;
}
#endif
5210 
5211 
5212 #if 0
5213 
5214 static void
5215 align_relative (FILE *input, char **files, int nfiles, int nextchar,
5216 		Sequence_T queryseq, Sequence_T referenceseq) {
5217   Stage2_alloc_T stage2_alloc;
5218   Oligoindex_array_T oligoindices_major, oligoindices_minor;
5219   Diagnostic_T diagnostic;
5220   bool lowidentityp;
5221 #ifndef PMAP
5222   bool poorp, repetitivep;
5223 #endif
5224   Dynprog_T dynprogL, dynprogM, dynprogR;
5225   Matchpool_T matchpool;
5226   Pairpool_T pairpool;
5227   Diagpool_T diagpool;
5228   Cellpool_T cellpool;
5229   Stopwatch_T stopwatch;
5230 
5231   Chrpos_T genomicstart, genomiclength;
5232   Sequence_T genomicseg, queryuc, referenceuc;
5233   int jobid = 0;
5234 
5235   Chimera_T chimera = NULL;
5236   List_T gregions, stage3list;
5237   Stage3_T *stage3array, stage3, stage3ref;
5238   int npaths_primary, npaths_altloc, i;
5239 
5240   oligoindices_major = Oligoindex_array_new_major(&noligoindices_major);
5241   oligoindices_minor = Oligoindex_array_new_minor(&noligoindices_minor);
5242   dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
5243   dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
5244   dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired);
5245   matchpool = Matchpool_new();
5246   pairpool = Pairpool_new();
5247   diagpool = Diagpool_new();
5248   cellpool = Cellpool_new();
5249   stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
5250 
5251   Matchpool_reset(matchpool);
5252   Pairpool_reset(pairpool);
5253   Diagpool_reset(diagpool);
5254   Cellpool_reset(cellpool);
5255 
5256   referenceuc = Sequence_uppercase(referenceseq);
5257 
5258   /* Do not trim the mutation refseq */
5259   diagnostic = Diagnostic_new();
5260   Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,&diagnostic->query_trimoligos,
5261 			 &diagnostic->query_trim_start,&diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
5262 			 Sequence_fullpointer(referenceuc),Sequence_fulllength(referenceuc),/*trimp*/false);
5263 #ifndef PMAP
5264 #if 0
5265   /* Don't do Sequence_trim, because it affects sequences like NM_018406 */
5266   Sequence_trim(referenceseq,diagnostic->query_trim_start,diagnostic->query_trim_end);
5267 #endif
5268 #endif
5269   if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
5270     gregions = Stage1_compute_nonstranded(&lowidentityp,referenceuc,indexdb_fwd,indexdb_rev,
5271 					  chromosome_iit,chrsubset_start,chrsubet_end,matchpool,
5272 					  stutterhits,diagnostic,/*stopwatch*/NULL);
5273   } else {
5274     gregions = Stage1_compute(&lowidentityp,referenceuc,indexdb_fwd,indexdb_rev,/*genestrand*/0,
5275 			      chromosome_iit,chrsubset_start,chrsubet_end,matchpool,
5276 			      stutterhits,diagnostic,/*stopwatch*/NULL);
5277   }
5278   stage3list = apply_stage3(&chimera,gregions,referenceseq,referenceuc,/*usersegment*/NULL,
5279 			    oligoindices_major,oligoindices_minor,
5280 			    matchpool,pairpool,diagpool,cellpool,
5281 			    dynprogL,dynprogM,dynprogR,stopwatch);
5282   if (stage3list == NULL) {
5283     npaths_primary = npaths_altloc = 0;
5284     stage3array = (Stage3_T *) NULL;
5285   } else {
5286     stage3array = stage3array_from_list(&npaths_primary,&npaths_altloc,stage3list,
5287 					/*chimerap*/false,/*remove_overlaps_p*/true);
5288   }
5289   debug2(printf("npaths_primary %d, npaths_altloc %d\n",npaths_primary,npaths_altloc));
5290 
5291   Diagnostic_free(&diagnostic);
5292 
5293   /* chimera should be NULL */
5294   for (i = 1; i < npaths_primary + npaths_altloc; i++) {
5295     stage3 = stage3array[i];
5296     Stage3_free(&stage3);
5297   }
5298   if (npaths_primary + npaths_altloc > 0) {
5299     stage3ref = stage3array[0];
5300 #ifdef PMAP
5301     Stage3_translate_cdna(stage3ref,queryseq,strictp);
5302     Stage3_backtranslate_cdna(stage3ref,/*diagnosticp*/false);
5303 #else
5304     Stage3_translate_genomic(stage3ref,/*fulllengthp*/true,/*cds_startpos*/-1,
5305 			     Sequence_fulllength_given(queryseq),/*truncatep*/false,strictp);
5306 #endif
5307     FREE(stage3array);
5308 
5309     Stage3_genomicbounds(&genomicstart,&genomiclength,stage3ref);
5310     if (genomealt != NULL) {
5311       genomicseg = Genome_get_segment(genomealt,genomicstart,genomiclength,chromosome_iit,/*revcomp*/false);
5312     } else {
5313       genomicseg = Genome_get_segment(genome,genomicstart,genomiclength,chromosome_iit,/*revcomp*/false);
5314     }
5315 
5316     while (jobid == 0 || (queryseq = Sequence_read_multifile(&nextchar,&input,read_files_command,&files,&nfiles)) != NULL) {
5317       Matchpool_reset(matchpool);
5318       Pairpool_reset(pairpool);
5319       Diagpool_reset(diagpool);
5320       Cellpool_reset(cellpool);
5321 
5322       fprintf(fp,">");
5323       Sequence_print_header(stdout,queryseq,checksump);
5324       diagnostic = Diagnostic_new();
5325       if (Sequence_fulllength_given(queryseq) <= 0) {
5326 	print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,EMPTY_SEQUENCE);
5327 
5328       } else if (Sequence_fulllength_given(queryseq) <
5329 #ifdef PMAP
5330 		 index1part_aa
5331 #else
5332 		 index1part
5333 #endif
5334 		 ) {
5335 	print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,SHORT_SEQUENCE);
5336 
5337       } else {
5338 
5339 	queryuc = Sequence_uppercase(queryseq);
5340 #ifdef PMAP
5341 	Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
5342 			       &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
5343 			       &diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
5344 			       Sequence_fullpointer(queryuc),Sequence_fulllength(queryuc),/*trimp*/false);
5345 #else
5346 	diagnostic->query_oligodepth =
5347 	  Oligoindex_set_inquery(&diagnostic->query_badoligos,&diagnostic->query_repoligos,
5348 				 &diagnostic->query_trimoligos,&diagnostic->query_trim_start,
5349 				 &diagnostic->query_trim_end,Oligoindex_array_elt(oligoindices_major,0),
5350 				 Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/Sequence_fulllength(queryuc),
5351 				 /*trimp*/true);
5352 
5353 	if (diagnostic->query_trimoligos == 0) {
5354 	  poorp = true;
5355 	} else if (((double) diagnostic->query_badoligos/(double) diagnostic->query_trimoligos > MAX_BADOLIGOS) ||
5356 		   (diagnostic->query_trim_end - diagnostic->query_trim_start < 80 && diagnostic->query_badoligos > 0)) {
5357 	  poorp = true;
5358 	} else {
5359 	  poorp = false;
5360 	}
5361 #if 0
5362 	if (diagnostic->query_trimoligos == 0) {
5363 	  repetitivep = false;
5364 	} else if (diagnostic->query_oligodepth > MAX_OLIGODEPTH ||
5365 		   (double) diagnostic->query_repoligos/(double) diagnostic->query_trimoligos > MAX_REPOLIGOS) {
5366 	  repetitivep = true;
5367 	} else {
5368 	  repetitivep = false;
5369 	}
5370 #endif
5371 	repetitivep = false;
5372 
5373 	if (poorp == true && prune_poor_p == true) {
5374 	  print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,POOR_SEQUENCE);
5375 	} else if (repetitivep == true && prune_repetitive_p == true) {
5376 	  print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,REPETITIVE);
5377 	} else {
5378 #endif /* PMAP */
5379 	  stage3array = stage3_from_usersegment(&npaths_primary,&npaths_altloc,queryseq,queryuc,genomicseg,
5380 						oligoindices_major,oligoindices_minor,
5381 						pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,stopwatch);
5382 
5383 	  if (npaths_primary + npaths_altloc == 0) {
5384 	    print_npaths(fp,0,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,NO_FAILURE);
5385 	  } else if (printtype == COORDS) {
5386 	    Stage3_print_coordinates(fp,stage3array[0],chromosome_iit,invertmode);
5387 
5388 	  } else {
5389 	    /* Usual output */
5390 	    print_npaths(fp,1,diagnostic,/*usersegment*/NULL,chrsubset,/*chimera*/NULL,NO_FAILURE);
5391 #ifndef PMAP
5392 	    Stage3_translate_cdna_via_reference(stage3array[0],stage3ref);
5393 #endif
5394 	    Stage3_fix_cdna_direction(stage3array[0],stage3ref);
5395 	    Stage3_print_mutations(stage3array[0],stage3ref,chromosome_iit,queryseq,
5396 				   dbversion,printtype,proteinmode,
5397 				   invertmode,nointronlenp,wraplength,
5398 				   /*snps_p*/snp_blocks ? true : false);
5399 	    for (i = 0; i < npaths_primary + npaths_altloc; i++) {
5400 	      stage3 = stage3array[i];
5401 	      Stage3_free(&stage3);
5402 	    }
5403 	    FREE(stage3array);
5404 
5405 	  }
5406 
5407 #ifndef PMAP
5408 	}
5409 #endif
5410 
5411 	Oligoindex_clear_inquery(Oligoindex_array_elt(oligoindices_major,0));
5412 
5413 	Sequence_free(&queryuc);
5414       }
5415       Sequence_free(&queryseq);
5416       jobid++;
5417     }
5418     Sequence_free(&genomicseg);
5419     Stage3_free(&stage3ref);
5420   }
5421 
5422   Stopwatch_free(&stopwatch);
5423   Cellpool_free(&cellpool);
5424   Diagpool_free(&diagpool);
5425   Pairpool_free(&pairpool);
5426   Dynprog_free(&dynprogR);
5427   Dynprog_free(&dynprogM);
5428   Dynprog_free(&dynprogL);
5429   Oligoindex_array_free(&oligoindices_minor);
5430   Oligoindex_array_free(&oligoindices_major);
5431 
5432   return;
5433 }
5434 
5435 #endif
5436 
5437 void
check_map_iit(IIT_T map_iit,Univ_IIT_T chromosome_iit)5438 check_map_iit (IIT_T map_iit, Univ_IIT_T chromosome_iit) {
5439   char *typestring, *lookup, *p;
5440   int type, destranded_len;
5441   bool errorp = false;
5442 
5443   for (type = 1; type < IIT_ntypes(map_iit); type++) {
5444     lookup = typestring = IIT_typestring(map_iit,type);
5445     if ((p = rindex(typestring,'+')) != NULL) {
5446       destranded_len = (p - typestring)/sizeof(char);
5447       lookup = (char *) MALLOC((destranded_len+1)*sizeof(char));
5448       strncpy(lookup,typestring,destranded_len);
5449       lookup[destranded_len] = '\0';
5450 
5451     } else if ((p = rindex(typestring,'-')) != NULL) {
5452       destranded_len = (p - typestring)/sizeof(char);
5453       lookup = (char *) MALLOC((destranded_len+1)*sizeof(char));
5454       strncpy(lookup,typestring,destranded_len);
5455       lookup[destranded_len] = '\0';
5456     }
5457 
5458     if (Univ_IIT_find_one(chromosome_iit,lookup) < 0) {
5459       if (p != NULL) {
5460 	fprintf(stderr,"Warning: In %s, type %s (without the %s) does not correspond to a known chromosome in %s.\n",
5461 		map_iitfile,typestring,p,dbversion);
5462       } else {
5463 	fprintf(stderr,"Warning: In %s, type %s does not correspond to a known chromosome in %s.\n",
5464 		map_iitfile,typestring,dbversion);
5465       }
5466       errorp = true;
5467     }
5468 
5469     if (p != NULL) {
5470       FREE(lookup);
5471     }
5472   }
5473   if (errorp == true) {
5474     fprintf(stderr,"Known chromosomes: ");
5475     Univ_IIT_dump_labels(stderr,chromosome_iit);
5476   }
5477   return;
5478 }
5479 
5480 
/* Parses a batch specification of the form "<batch>/<nbatches>" (any
   run of non-digit characters is accepted as the separator) into
   *part_modulus and *part_interval.

   On any error, prints a message to stderr and calls exit(9):
     - either integer cannot be parsed
     - the number of batches is 0
     - the batch number is negative
     - the batch number is not less than the number of batches

   The position after the first integer is taken from sscanf itself
   (%n), so that leading whitespace or a sign consumed by %d cannot
   cause the first integer to be re-read as the second one. */
void
parse_part (int *part_modulus, int *part_interval, char *string) {
  char *p = string;
  int nconsumed = 0;

  if (sscanf(p,"%d%n",part_modulus,&nconsumed) < 1) {
    fprintf(stderr,"Cannot parse first integer from %s\n",string);
    exit(9);
  }
  p += nconsumed;

  /* Skip the separator.  The cast avoids undefined behavior in isdigit
     for characters with negative values. */
  while (*p != '\0' && !isdigit((unsigned char) *p)) {
    p++;
  }
  if (sscanf(p,"%d",part_interval) < 1) {
    fprintf(stderr,"Cannot parse second integer from %s\n",string);
    exit(9);
  }

  /* Check for a zero interval first, so that "0/0" gets the specific message */
  if (*part_interval == 0) {
    fprintf(stderr,"Bad batch specification %s.  Batch interval cannot be 0.\n",string);
    exit(9);
  }
  if (*part_modulus < 0) {
    fprintf(stderr,"In %s, batch number %d cannot be negative\n",
	    string,*part_modulus);
    exit(9);
  }
  if ((*part_modulus) >= (*part_interval)) {
    fprintf(stderr,"In %s, batch number %d must be less than the number of batches %d\n",
	    string,*part_modulus,*part_interval);
    exit(9);
  }

  return;
}
5512 
5513 
/* Validates that string has the form
       [+-] digits [ e [+] digits ]
   and returns string itself if so.  Otherwise, prints a message to
   stderr and calls exit(9); the function never returns NULL to its
   caller, which matters because callers pass the result directly to
   atoi() or strtoul().

   NOTE(review): an exponent is accepted here, but atoi() and strtoul()
   stop converting at the 'e', so "1e5" is read as 1 by such callers --
   confirm whether the exponent form is still wanted. */
static char *
check_valid_int (char *string) {
  char *p = string;

  if (*p == '+' || *p == '-') {
    p++;
  }

  /* Need at least one digit.  The casts avoid undefined behavior in
     isdigit for characters with negative values. */
  if (!isdigit((unsigned char) *p)) {
    goto invalid;
  }
  while (isdigit((unsigned char) *p)) {
    p++;
  }

  if (*p == 'e') {
    p++;
    if (*p == '+') {
      p++;
    }
    /* An exponent without digits is an error like any other */
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  if (*p == '\0') {
    return string;
  }

 invalid:
  fprintf(stderr,"value %s is not a valid int\n",string);
  exit(9);
  return NULL;
}
5552 
5553 
/* Validates that string is a number of the form
       [+-] ( digits | digits . digits | . digits ) [ e [+-] digits ]
   whose value lies between 0.0 and 1.0 inclusive, and returns that
   value.  The option name is used only in the error messages.

   On a malformed string or an out-of-range value, prints a message to
   stderr and calls exit(9).

   At least one digit is required in the mantissa, so an empty string
   or a lone sign is rejected rather than being read as 0.0.  A decimal
   point must be followed by a digit.  An exponent may follow an
   integer mantissa (e.g., "5e-1"). */
static double
check_valid_float (char *string, const char *option) {
  double value;
  char *p = string;
  int nintdigits = 0;

  if (*p == '+' || *p == '-') {
    p++;
  }

  /* Integer part (optional if a fraction follows).  The casts avoid
     undefined behavior in isdigit for characters with negative values. */
  while (isdigit((unsigned char) *p)) {
    nintdigits++;
    p++;
  }

  if (*p == '.') {
    p++;
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  } else if (nintdigits == 0) {
    /* Neither an integer part nor a fraction */
    goto invalid;
  }

  if (*p == 'e') {
    p++;
    if (*p == '+' || *p == '-') {
      p++;
    }
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  if (*p != '\0') {
    goto invalid;
  }

  if ((value = atof(string)) > 1.0 || value < 0.0) {
    fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
    exit(9);
    return 0.0;
  }
  return value;

 invalid:
  fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
  exit(9);
  return 0.0;
}
5616 
/* Validates that string is a number of the form
       [+-] ( digits | digits . digits | . digits ) [ e [+-] digits ]
   and returns string itself if so.  No range check is done here.
   Otherwise, prints a message to stderr and calls exit(9).

   At least one digit is required in the mantissa, so an empty string
   or a lone sign is rejected.  A decimal point must be followed by a
   digit.  An exponent may follow an integer mantissa (e.g., "1e2"). */
static char *
check_valid_float_or_int (char *string) {
  char *p = string;
  int nintdigits = 0;

  if (*p == '+' || *p == '-') {
    p++;
  }

  /* Integer part (optional if a fraction follows).  The casts avoid
     undefined behavior in isdigit for characters with negative values. */
  while (isdigit((unsigned char) *p)) {
    nintdigits++;
    p++;
  }

  if (*p == '.') {
    p++;
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  } else if (nintdigits == 0) {
    /* Neither an integer part nor a fraction */
    goto invalid;
  }

  if (*p == 'e') {
    p++;
    if (*p == '+' || *p == '-') {
      p++;
    }
    if (!isdigit((unsigned char) *p)) {
      goto invalid;
    }
    while (isdigit((unsigned char) *p)) {
      p++;
    }
  }

  if (*p == '\0') {
    return string;
  }

 invalid:
  fprintf(stderr,"value %s is not a valid float\n",string);
  exit(9);
  return NULL;
}
5668 
5669 
5670 static int
parse_command_line(int argc,char * argv[],int optind)5671 parse_command_line (int argc, char *argv[], int optind) {
5672   int opt, c;
5673   extern char *optarg;
5674   int long_option_index = 0;
5675   const char *long_name;
5676   char **argstart;
5677 
5678   int len;
5679   int user_ngap = -1;
5680 
5681 
5682   fprintf(stderr,"GMAP version %s called with args:",PACKAGE_VERSION);
5683   argstart = &(argv[-optind]);
5684   for (c = 1; c < argc + optind; c++) {
5685     fprintf(stderr," %s",argstart[c]);
5686   }
5687   fprintf(stderr,"\n");
5688 
5689   while ((opt = getopt_long(argc,argv,
5690 #ifdef PMAP
5691 			    "q:D:a:d:k:g:2B:K:w:L:x:1t:s:c:SA03468:9n:f:ZO5o:V:v:M:m:ebu:E:PQYNI:i:l:",
5692 #else
5693 			    "q:D:d:k:g:2B:K:w:L:x:1t:s:c:p:SA03468:9n:f:ZO5o:V:v:M:m:ebu:E:PQFa:Tz:j:YNI:i:l:",
5694 #endif
5695 			    long_options, &long_option_index)) != -1) {
5696     switch (opt) {
5697     case 0:
5698       long_name = long_options[long_option_index].name;
5699       if (!strcmp(long_name,"version")) {
5700 	print_program_version();
5701 	return 1;
5702       } else if (!strcmp(long_name,"check")) {
5703 	check_compiler_assumptions();
5704 	return 1;
5705       } else if (!strcmp(long_name,"help")) {
5706 	print_program_usage();
5707 	return 1;
5708 
5709       } else if (!strcmp(long_name,"time")) {
5710 	timingp = true;
5711 
5712       } else if (!strcmp(long_name,"use-shared-memory")) {
5713 	if (!strcmp(optarg,"1")) {
5714 	  sharedp = true;
5715 	} else if (!strcmp(optarg,"0")) {
5716 	  sharedp = false;
5717 	} else {
5718 	  fprintf(stderr,"--use-shared-memory flag must be 0 or 1\n");
5719 	  return 9;
5720 	}
5721 
5722       } else if (!strcmp(long_name,"preload-shared-memory")) {
5723 	preload_shared_memory_p = true;
5724 
5725       } else if (!strcmp(long_name,"unload-shared-memory")) {
5726 	unload_shared_memory_p = true;
5727 
5728       } else if (!strcmp(long_name,"expand-offsets")) {
5729 	fprintf(stderr,"Note: --expand-offsets flag is no longer supported.  With the latest algorithms, it doesn't improve speed much.  Ignoring this flag");
5730 
5731       } else if (!strcmp(long_name,"sampling")) {
5732 	required_index1interval = atoi(check_valid_int(optarg));
5733 
5734       } else if (!strcmp(long_name,"cmdline")) {
5735 	user_cmdline = optarg;
5736 
5737       } else if (!strcmp(long_name,"suboptimal-score")) {
5738 	suboptimal_score_float = atof(check_valid_float_or_int(optarg));
5739 	if (suboptimal_score_float > 1.0 && suboptimal_score_float != rint(suboptimal_score_float)) {
5740 	  fprintf(stderr,"Cannot specify fractional value %f for --suboptimal-score except between 0.0 and 1.0\n",
5741 		  suboptimal_score_float);
5742 	  return 9;
5743 	}
5744 
5745       } else if (!strcmp(long_name,"require-splicedir")) {
5746 	require_splicedir_p = true;
5747 
5748       } else if (!strcmp(long_name,"splicingdir")) {
5749 	user_splicingdir = optarg;
5750 
5751       } else if (!strcmp(long_name,"nosplicing")) {
5752 	novelsplicingp = false;
5753 
5754       } else if (!strcmp(long_name,"no-chimeras")) {
5755 	chimera_margin = 0;
5756 
5757       } else if (!strcmp(long_name,"translation-code")) {
5758 	translation_code = atoi(check_valid_int(optarg));
5759 
5760       } else if (!strcmp(long_name,"alt-start-codons")) {
5761 	alt_initiation_codons_p = true;
5762 
5763       } else if (!strcmp(long_name,"min-intronlength")) {
5764 	min_intronlength = atoi(check_valid_int(optarg));
5765 
5766       } else if (!strcmp(long_name,"max-intronlength-middle")) {
5767 	maxintronlen = atoi(check_valid_int(optarg));
5768 
5769       } else if (!strcmp(long_name,"max-intronlength-ends")) {
5770 	maxintronlen_ends = atoi(check_valid_int(optarg));
5771 
5772       } else if (!strcmp(long_name,"split-large-introns")) {
5773 	split_large_introns_p = true;
5774 
5775       } else if (!strcmp(long_name,"trim-end-exons")) {
5776 	minendexon = atoi(check_valid_int(optarg));
5777 
5778       } else if (!strcmp(long_name,"allow-close-indels")) {
5779 	if (!strcmp(optarg,"0")) {
5780 	  /* Disallow */
5781 	  close_indels_mode = -1;
5782 	  extraband_single = 0;
5783 	} else if (!strcmp(optarg,"1")) {
5784 	  /* Always allow */
5785 	  close_indels_mode = +1;
5786 	  extraband_single = 3;
5787 	} else if (!strcmp(optarg,"2")) {
5788 	  /* Allow for high-quality alignments */
5789 	  close_indels_mode = 0;
5790 	  extraband_single = 3;
5791 	} else {
5792 	  fprintf(stderr,"allow-close-indels argument %s not recognized.  Only allow 0, 1, or 2.  Run 'gsnap --help' for more information.\n",optarg);
5793 	  return 9;
5794 	}
5795       } else if (!strcmp(long_name,"microexon-spliceprob")) {
5796 	microexon_spliceprob = check_valid_float(optarg,long_name);
5797       } else if (!strcmp(long_name,"stage2-start")) {
5798 	suboptimal_score_start = atoi(check_valid_int(optarg));
5799       } else if (!strcmp(long_name,"stage2-end")) {
5800 	suboptimal_score_end = atoi(check_valid_int(optarg));
5801 
5802       } else if (!strcmp(long_name,"canonical-mode")) {
5803 	if (!strcmp(optarg,"0")) {
5804 	  canonical_mode = 0;
5805 	} else if (!strcmp(optarg,"1")) {
5806 	  canonical_mode = 1;
5807 	} else if (!strcmp(optarg,"2")) {
5808 	  canonical_mode = 2;
5809 	} else {
5810 	  fprintf(stderr,"Canonical level %s not recognized.\n",optarg);
5811 	  fprintf(stderr,"0=low reward for canonical introns, 1=high reward for canonical introns (default)\n");
5812 	  fprintf(stderr,"2=low reward for high-identity seqs, high reward otherwise\n");
5813 	  return 9;
5814 	}
5815 
5816       } else if (!strcmp(long_name,"cross-species")) {
5817 	cross_species_p = true;
5818 
5819       } else if (!strcmp(long_name,"homopolymer")) {
5820 	homopolymerp = true;
5821 
5822       } else if (!strcmp(long_name,"cmetdir")) {
5823 	user_modedir = optarg;
5824 
5825       } else if (!strcmp(long_name,"atoidir")) {
5826 	user_modedir = optarg;
5827 
5828       } else if (!strcmp(long_name,"mode")) {
5829 	if (!strcmp(optarg,"standard")) {
5830 	  mode = STANDARD;
5831 	} else if (!strcmp(optarg,"cmet-stranded")) {
5832 	  mode = CMET_STRANDED;
5833 	} else if (!strcmp(optarg,"cmet-nonstranded")) {
5834 	  mode = CMET_NONSTRANDED;
5835 	  fprintf(stderr,"Non-stranded mode not yet working properly\n");
5836 	  exit(9);
5837 	} else if (!strcmp(optarg,"atoi-stranded")) {
5838 	  mode = ATOI_STRANDED;
5839 	} else if (!strcmp(optarg,"atoi-nonstranded")) {
5840 	  mode = ATOI_NONSTRANDED;
5841 	  fprintf(stderr,"Non-stranded mode not yet working properly\n");
5842 	  exit(9);
5843 	} else if (!strcmp(optarg,"ttoc-stranded")) {
5844 	  mode = TTOC_STRANDED;
5845 	} else if (!strcmp(optarg,"ttoc-nonstranded")) {
5846 	  mode = TTOC_NONSTRANDED;
5847 	  fprintf(stderr,"Non-stranded mode not yet working properly\n");
5848 	  exit(9);
5849 	} else {
5850 	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
5851 	  return 9;
5852 	}
5853 
5854       } else if (!strcmp(long_name,"min-trimmed-coverage")) {
5855 	min_trimmed_coverage = check_valid_float(optarg,long_name);
5856       } else if (!strcmp(long_name,"min-identity")) {
5857 	min_identity = check_valid_float(optarg,long_name);
5858 
5859       } else if (!strcmp(long_name,"read-files-command")) {
5860 	read_files_command = optarg;
5861 
5862       } else if (!strcmp(long_name,"input-buffer-size")) {
5863 	inbuffer_nspaces = atoi(check_valid_int(optarg));
5864       } else if (!strcmp(long_name,"output-buffer-size")) {
5865 	output_buffer_size = atoi(check_valid_int(optarg));
5866       } else if (!strcmp(long_name,"print-comment")) {
5867 	print_comment_p = true;
5868       } else if (!strcmp(long_name,"failsonly")) {
5869 	if (nofailsp == true) {
5870 	  fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
5871 	  return 9;
5872 	} else {
5873 	  failsonlyp = true;
5874 	}
5875       } else if (!strcmp(long_name,"failed-input")) {
5876 	failedinput_root = optarg;
5877 #if 0
5878       } else if (!strcmp(long_name,"quiet-if-excessive")) {
5879 	quiet_if_excessive_p = true;
5880 #endif
5881       } else if (!strcmp(long_name,"nofails")) {
5882 	if (failsonlyp == true) {
5883 	  fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
5884 	  return 9;
5885 	} else {
5886 	  nofailsp = true;
5887 	}
5888       } else if (!strcmp(long_name,"split-output")) {
5889 	split_output_root = optarg;
5890       } else if (!strcmp(long_name,"append-output")) {
5891 	appendp = true;
5892 
5893       } else if (!strcmp(long_name,"gff3-add-separators")) {
5894 	if (!strcmp(optarg,"1")) {
5895 	  gff3_separators_p = true;
5896 	} else if (!strcmp(optarg,"0")) {
5897 	  gff3_separators_p = false;
5898 	} else {
5899 	  fprintf(stderr,"--gff3-add-separators flag must be 0 or 1\n");
5900 	  return 9;
5901 	}
5902 
5903       } else if (!strcmp(long_name,"gff3-swap-phase")) {
5904 	if (!strcmp(optarg,"1")) {
5905 	  gff3_phase_swap_p = true;
5906 	} else if (!strcmp(optarg,"0")) {
5907 	  gff3_phase_swap_p = false;
5908 	} else {
5909 	  fprintf(stderr,"--gff3-swap-phase flag must be 0 or 1\n");
5910 	  return 9;
5911 	}
5912 
5913       } else if (!strcmp(long_name,"gff3-fasta-annotation")) {
5914 	if (!strcmp(optarg,"0")) {
5915 	  gff3_fasta_annotation_type = NO_ANNOTATION;
5916 	} else if (!strcmp(optarg,"1")) {
5917 	  gff3_fasta_annotation_type = INSERT_ANNOTATION;
5918 	} else if (!strcmp(optarg,"2")) {
5919 	  gff3_fasta_annotation_type = KEYVALUE_ANNOTATION;
5920 	} else {
5921 	  fprintf(stderr,"--gff3-fasta-annotation flag must be 0, 1, or 2\n");
5922 	  return 9;
5923 	}
5924 
5925       } else if (!strcmp(long_name,"gff3-cds")) {
5926 	if (!strcmp(optarg,"cdna")) {
5927 	  cdstype = CDS_CDNA;
5928 	} else if (!strcmp(optarg,"genomic")) {
5929 	  cdstype = CDS_GENOMIC;
5930 	} else {
5931 	  fprintf(stderr,"--gff3-cds flag must be cdna or genomic\n");
5932 	  return 9;
5933 	}
5934 
5935 #ifndef PMAP
5936       } else if (!strcmp(long_name,"no-sam-headers")) {
5937 	sam_headers_p = false;
5938       } else if (!strcmp(long_name,"sam-use-0M")) {
5939 	sam_insert_0M_p = true;
5940       } else if (!strcmp(long_name,"sam-extended-cigar")) {
5941 	sam_cigar_extended_p = true;
5942       } else if (!strcmp(long_name,"quality-protocol")) {
5943 	if (user_quality_shift == true) {
5944 	  fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
5945 	  return 9;
5946 	} else if (!strcmp(optarg,"illumina")) {
5947 	  quality_shift = -31;
5948 	  user_quality_shift = true;
5949 	} else if (!strcmp(optarg,"sanger")) {
5950 	  quality_shift = 0;
5951 	  user_quality_shift = true;
5952 	} else {
5953 	  fprintf(stderr,"The only values allowed for --quality-protocol are illumina or sanger\n");
5954 	  return 9;
5955 	}
5956 
5957       } else if (!strcmp(long_name,"force-xs-dir")) {
5958 	force_xs_direction_p = true;
5959 
5960       } else if (!strcmp(long_name,"md-lowercase-snp")) {
5961 	md_lowercase_variant_p = true;
5962 
5963       } else if (!strcmp(long_name,"action-if-cigar-error")) {
5964 	if (!strcmp(optarg,"ignore")) {
5965 	  cigar_action = CIGAR_ACTION_IGNORE;
5966 	} else if (!strcmp(optarg,"warning")) {
5967 	  cigar_action = CIGAR_ACTION_WARNING;
5968 	} else if (!strcmp(optarg,"noprint")) {
5969 	  cigar_action = CIGAR_ACTION_NOPRINT;
5970 	} else if (!strcmp(optarg,"abort")) {
5971 	  cigar_action = CIGAR_ACTION_ABORT;
5972 	} else {
5973 	  fprintf(stderr,"The only values allowed for --action-if-cigar-error are ignore, warning, noprint, abort\n");
5974 	  return 9;
5975 	}
5976 
5977       } else if (!strcmp(long_name,"read-group-id")) {
5978 	sam_read_group_id = optarg;
5979       } else if (!strcmp(long_name,"read-group-name")) {
5980 	sam_read_group_name = optarg;
5981       } else if (!strcmp(long_name,"read-group-library")) {
5982 	sam_read_group_library = optarg;
5983       } else if (!strcmp(long_name,"read-group-platform")) {
5984 	sam_read_group_platform = optarg;
5985 #endif
5986       } else {
5987 	/* Shouldn't reach here */
5988 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'gmap --help'",long_name);
5989 	return 9;
5990       }
5991       break;
5992 
5993     case 'q': parse_part(&part_modulus,&part_interval,optarg); break;
5994     case 'D': user_genomedir = optarg; break;
5995     case 'd':
5996       dbroot = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
5997       strcpy(dbroot,optarg);
5998       break;
5999 #ifdef PMAP
6000     case 'a':
6001       if ((required_alphabet = Alphabet_find(optarg)) == AA0) {
6002 	return 9;
6003       }
6004       break;
6005     case 'k': required_index1part = atoi(check_valid_int(optarg)); break;
6006 #else
6007     case 'k':
6008       required_index1part = atoi(check_valid_int(optarg));
6009       if (required_index1part > MAXIMUM_KMER) {
6010 	fprintf(stderr,"The value for k-mer size must be %d or less\n",MAXIMUM_KMER);
6011 	return 9;
6012       }
6013       break;
6014 #endif
6015 #if 0
6016     case 'G': uncompressedp = true; break;
6017 #endif
6018     case 'g': user_genomicseg = optarg; break;
6019     case '1': user_selfalign_p = true; break;
6020     case '2': user_pairalign_p = true; break;
6021 
6022     case 'B':
6023       if (!strcmp(optarg,"5")) {
6024 	fprintf(stderr,"Note: Batch mode 5 is now the same as batch mode 4.\n");
6025 	offsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
6026 	positions_access = USE_ALLOCATE;
6027 	locoffsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
6028 	locpositions_access = USE_ALLOCATE;
6029 
6030 	genome_access = USE_ALLOCATE;
6031 
6032       } else if (!strcmp(optarg,"4")) {
6033 	offsetsstrm_access = USE_ALLOCATE;
6034 	positions_access = USE_ALLOCATE;
6035 	locoffsetsstrm_access = USE_ALLOCATE;
6036 	locpositions_access = USE_ALLOCATE;
6037 
6038 	genome_access = USE_ALLOCATE;
6039 #ifdef HAVE_MMAP
6040 
6041       } else if (!strcmp(optarg,"3")) {
6042 	offsetsstrm_access = USE_ALLOCATE;
6043 	positions_access = USE_ALLOCATE;
6044 	locoffsetsstrm_access = USE_ALLOCATE;
6045 	locpositions_access = USE_ALLOCATE;
6046 
6047 	genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
6048 
6049       } else if (!strcmp(optarg,"2")) {
6050 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6051 	positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6052 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6053 	locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6054 
6055 	genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
6056 
6057       } else if (!strcmp(optarg,"1")) {
6058 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6059 	positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6060 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6061 	locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
6062 
6063 	genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
6064 
6065       } else if (!strcmp(optarg,"0")) {
6066 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6067 	positions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
6068 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
6069 	locpositions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
6070 
6071 	genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
6072 #endif
6073 
6074       } else {
6075 #ifdef HAVE_MMAP
6076 	fprintf(stderr,"Batch mode %s not recognized.  Only allow 0-5.  Run 'gmap --help' for more information.\n",optarg);
6077 #else
6078 	fprintf(stderr,"Batch mode %s not recognized.  Only allow 4-5, since mmap is disabled.  Run 'gmap --help' for more information.\n",optarg);
6079 #endif
6080 	return 9;
6081       }
6082       break;
6083 
6084     case 'K': maxintronlen = maxintronlen_ends = atoi(check_valid_int(optarg)); break;
6085 
6086     case 'w': shortsplicedist = strtoul(check_valid_int(optarg),NULL,10); break;
6087 
6088     case 'L': maxtotallen_bound = atoi(check_valid_int(optarg)); break;
6089     case 'x':
6090 #ifdef PMAP
6091       chimera_margin = atoi(check_valid_int(optarg))/3;
6092 #else
6093       chimera_margin = atoi(check_valid_int(optarg));
6094 #endif
6095       if (chimera_margin <= 0) {
6096 	/* Disable finding of chimeras */
6097 #if 0
6098       } else if (chimera_margin < CHIMERA_SLOP) {
6099 	/* Not sure why chimera_margin should be tied to CHIMERA_SLOP */
6100 	chimera_margin = CHIMERA_SLOP;
6101 #endif
6102       }
6103       break;
6104       /* case 'w': referencefile = optarg; break; */
6105 
6106 #ifdef HAVE_PTHREAD
6107     case 't': nworkers = atoi(check_valid_int(optarg)); break;
6108 #else
6109     case 't': fprintf(stderr,"This version of GMAP has pthreads disabled, so ignoring the value of %s for -t\n",optarg); break;
6110 #endif
6111 
6112     case 's': splicing_file = optarg; knownsplicingp = true; break;
6113     case 'c': user_chrsubsetname = optarg; break;
6114 
6115 #ifndef PMAP
6116     case 'p': switch (atoi(check_valid_int(optarg))) {
6117       case 0: prune_poor_p = false, prune_repetitive_p = false; break;
6118       case 1: prune_poor_p = true; prune_repetitive_p = false; break;
6119       case 2: prune_poor_p = false; prune_repetitive_p = true; break;
6120       case 3: prune_poor_p = true; prune_repetitive_p = true; break;
6121       default: fprintf(stderr,"Prune level %s not recognized.\n",optarg);
6122 	fprintf(stderr,"0=no pruning, 1=poor seqs, 2=repetitive seqs, 3=both poor and repetitive seqs (default)\n");
6123 	return 9;
6124       }
6125       break;
6126 #endif
6127 
6128     case 'S': printtype = SUMMARY; break;
6129     case 'A': printtype = ALIGNMENT; break;
6130     case '0': exception_raise_p = false; break; /* Allows signals to pass through */
6131     case '3': printtype = CONTINUOUS; break;
6132     case '4': printtype = CONTINUOUS_BY_EXON; break;
6133     case '6': debug_graphic_p = true; break;
6134     case '8':
6135       if (!strcmp(optarg,"stage1")) {
6136 	stage1debug = true;
6137       } else if (!strcmp(optarg,"diag")) {
6138 	diag_debug = true;
6139       } else if (!strcmp(optarg,"stage2")) {
6140 	stage3debug = POST_STAGE2;
6141       } else if (!strcmp(optarg,"singles")) {
6142 	stage3debug = POST_SINGLES;
6143       } else if (!strcmp(optarg,"introns")) {
6144 	stage3debug = POST_INTRONS;
6145       } else if (!strcmp(optarg,"hmm")) {
6146 	stage3debug = POST_HMM;
6147       } else if (!strcmp(optarg,"smoothing")) {
6148 	stage3debug = POST_SMOOTHING;
6149       } else if (!strcmp(optarg,"dualintrons")) {
6150 	stage3debug = POST_DUAL_INTRONS;
6151       } else if (!strcmp(optarg,"cycles")) {
6152 	stage3debug = POST_CYCLES;
6153       } else if (!strcmp(optarg,"dualbreaks")) {
6154 	stage3debug = POST_DUAL_BREAKS;
6155       } else if (!strcmp(optarg,"middle")) {
6156 	stage3debug = POST_MIDDLE;
6157       } else if (!strcmp(optarg,"ends")) {
6158 	stage3debug = POST_ENDS;
6159       } else if (!strcmp(optarg,"canonical")) {
6160 	stage3debug = POST_CANONICAL;
6161       } else if (!strcmp(optarg,"trim")) {
6162 	stage3debug = POST_CANONICAL;
6163       } else if (!strcmp(optarg,"changepoint")) {
6164 	stage3debug = POST_CHANGEPOINT;
6165       } else if (!strcmp(optarg,"distalmedial")) {
6166 	stage3debug = POST_DISTAL_MEDIAL;
6167       } else {
6168 	fprintf(stderr,"Allowed arguments for -8 flag are stage2, smoothing, singles, introns, hmm, dualbreaks, cycles, canonical, changepoint, distalmedial\n");
6169 	return 9;
6170       }
6171       break;
6172     case '9': checkp = true; break;
6173     case 'n':
6174       maxpaths_report = atoi(check_valid_int(optarg));
6175       if (maxpaths_report == 1) {
6176 	fprintf(stderr,"Note: -n 1 will not report chimeric alignments.  If you want a single alignment plus chimeras, use -n 0 instead.\n");
6177       }
6178       break;
6179     case 'f':
6180       if (!strcmp(optarg,"1") || !strcmp(optarg,"psl_nt")) {
6181 	printtype = PSL_NT;
6182 #ifdef PMAP
6183       } else if (!strcmp(optarg,"0") || !strcmp(optarg,"psl_pro")) {
6184 	printtype = PSL_PRO;
6185 #else
6186       } else if (!strcmp(optarg,"psl")) {
6187 	printtype = PSL_NT;
6188       } else if (!strcmp(optarg,"6") || !strcmp(optarg,"splicesites")) {
6189 	printtype = SPLICESITES;
6190       } else if (!strcmp(optarg,"introns")) {
6191 	printtype = INTRONS;
6192       } else if (!strcmp(optarg,"mask_introns")) {
6193 	printtype = MASK_INTRONS;
6194       } else if (!strcmp(optarg,"mask_utr_introns")) {
6195 	printtype = MASK_UTR_INTRONS;
6196       } else if (!strcmp(optarg,"samse")) {
6197 	printtype = SAM;
6198 	sam_paired_p = false;
6199       } else if (!strcmp(optarg,"sampe")) {
6200 	printtype = SAM;
6201 	sam_paired_p = true;
6202       } else if (!strcmp(optarg,"bedpe")) {
6203 	printtype = BEDPE;
6204 #endif
6205       } else if (!strcmp(optarg,"2") || !strcmp(optarg,"gff3_gene")) {
6206 	printtype = GFF3_GENE;
6207       } else if (!strcmp(optarg,"3") || !strcmp(optarg,"gff3_match_cdna")) {
6208 	printtype = GFF3_MATCH_CDNA;
6209       } else if (!strcmp(optarg,"4") || !strcmp(optarg,"gff3_match_est")) {
6210 	printtype = GFF3_MATCH_EST;
6211       } else if (!strcmp(optarg,"7") || !strcmp(optarg,"map_exons")) {
6212 	printtype = MAP_EXONS;
6213       } else if (!strcmp(optarg,"8") || !strcmp(optarg,"map_ranges")) {
6214 	printtype = MAP_RANGES;
6215       } else if (!strcmp(optarg,"9") || !strcmp(optarg,"coords")) {
6216 	printtype = COORDS;
6217       } else {
6218 	fprintf(stderr,"Output format \"%s\" not recognized.  Allowed formats are:\n",optarg);
6219 	fprintf(stderr,"  psl_nt (1)\n");
6220 #ifdef PMAP
6221 	fprintf(stderr,"  psl_pro (0)\n");
6222 #else
6223 	fprintf(stderr,"  psl\n");
6224 	fprintf(stderr,"  splicesites (6)\n");
6225 	fprintf(stderr,"  introns\n");
6226 	fprintf(stderr,"  mask_introns\n");
6227 	fprintf(stderr,"  mask_utr_introns\n");
6228 	fprintf(stderr,"  samse\n");
6229 	fprintf(stderr,"  sampe\n");
6230 	fprintf(stderr,"  bedpe\n");
6231 #endif
6232 	fprintf(stderr,"  gff3_gene (2)\n");
6233 	fprintf(stderr,"  gff3_match_cdna (3)\n");
6234 	fprintf(stderr,"  gff3_match_est (4)\n");
6235 	fprintf(stderr,"  map_exons (7)\n");
6236 	fprintf(stderr,"  map_ranges (8)\n");
6237 	fprintf(stderr,"  coords (9)\n");
6238 	return 9;
6239       }
6240       break;
6241     case 'Z': printtype = COMPRESSED; break;
6242     case 'O': orderedp = true; break;
6243     case '5': checksump = true; break;
6244     case 'o': chimera_overlap = atoi(check_valid_int(optarg)); break;
6245 
6246     case 'V': user_snpsdir = optarg; break;
6247     case 'v': snps_root = optarg; break;
6248 
6249     case 'M': user_mapdir = optarg; break;
6250     case 'm':
6251       map_iitfile = (char *) CALLOC(strlen(optarg)+1,sizeof(char));
6252       strcpy(map_iitfile,optarg);
6253       if ((len = strlen(map_iitfile)) > 4 && strcmp(&(map_iitfile[len-4]),".iit") == 0) {
6254 	map_iitfile[len-4] = '\0';
6255       }
6256       break;
6257 
6258     case 'e': map_exons_p = true; break;
6259     case 'b': map_bothstrands_p = true; break;
6260     case 'u': nflanking = atoi(check_valid_int(optarg)); break;
6261 
6262     case 'E':
6263       if (!strcmp(optarg,"cdna")) {
6264 	printtype = EXONS_CDNA;
6265       } else if (!strcmp(optarg,"genomic")) {
6266 	printtype = EXONS_GENOMIC;
6267       } else if (!strcmp(optarg,"cdna+introns")) {
6268 	printtype = EXONS_CDNA_WINTRONS;
6269       } else if (!strcmp(optarg,"genomic+introns")) {
6270 	printtype = EXONS_GENOMIC_WINTRONS;
6271       } else {
6272 	fprintf(stderr,"Argument to -E flag must be either \"cdna\" or \"genomic\"\n");
6273 	return 9;
6274       }
6275       break;
6276 
6277 #ifdef PMAP
6278     case 'P': printtype = PROTEIN_GENOMIC; break;
6279     case 'Q': printtype = CDNA; break;
6280 #else
6281     case 'P': printtype = CDNA; break;
6282     case 'Q': printtype = PROTEIN_GENOMIC; break;
6283     case 'F': fulllengthp = true; break;
6284     case 'a': cds_startpos = atoi(check_valid_int(optarg)); break;
6285     case 'T': truncatep = true; fulllengthp = true; break;
6286     case 'z':
6287       if (!strcmp(optarg,"sense_force")) {
6288 	sense_try = +1;
6289 	sense_filter = 0;
6290       } else if (!strcmp(optarg,"antisense_force")) {
6291 	sense_try = -1;
6292 	sense_filter = 0;
6293       } else if (!strcmp(optarg,"sense_filter")) {
6294 	sense_try = 0;
6295 	sense_filter = +1;
6296       } else if (!strcmp(optarg,"antisense_filter")) {
6297 	sense_try = 0;
6298 	sense_filter = -1;
6299       } else if (!strcmp(optarg,"auto")) {
6300 	sense_try = 0;
6301 	sense_filter = 0;
6302       } else {
6303 	fprintf(stderr,"direction %s not recognized.  Must be sense_force, antisense_force, sense_filter, antisense_filter, or auto\n",optarg);
6304 	return 9;
6305       }
6306       break;
6307 
6308     case 'j':
6309       if (user_quality_shift == true) {
6310 	fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
6311 	return 9;
6312       } else {
6313 	quality_shift = atoi(check_valid_int(optarg));
6314 	user_quality_shift = true;
6315       }
6316       break;
6317 
6318 #endif
6319     case 'Y': strictp = false; break;
6320     case 'N': nointronlenp = true; break;
6321     case 'I': invertmode = atoi(check_valid_int(optarg)); break;
6322     case 'i': user_ngap = atoi(check_valid_int(optarg)); break;
6323     case 'l': wraplength = atoi(check_valid_int(optarg)); break;
6324 
6325     case '?': fprintf(stderr,"For usage, run 'gmap --help'\n"); return 9;
6326     default: return 9;
6327     }
6328   }
6329 
6330   if (printtype == SPLICESITES || printtype == INTRONS) {
6331     if (maxpaths_report > 1 || (sense_try != +1 && sense_filter != +1)) {
6332       fprintf(stderr,"For splicesites or introns output, you should probably add flags '-n 1' and either '-z sense_force' or '-z sense_filter'.\n");
6333     }
6334   }
6335 
6336   if (user_ngap >= 0) {
6337     ngap = user_ngap;
6338   } else if (printtype == EXONS_CDNA || printtype == EXONS_GENOMIC) {
6339     /* If user didn't specify, then set to zero */
6340     ngap = 0;
6341   } else if (printtype == EXONS_CDNA_WINTRONS || printtype == EXONS_GENOMIC_WINTRONS) {
6342     /* If user didn't specify, then set to infinity */
6343     ngap = 2147483647;		/* INT_MAX */
6344   };
6345 
6346   if (maxintronlen > maxtotallen_bound) {
6347     maxintronlen = maxtotallen_bound;
6348   }
6349 
6350 #ifdef HAVE_PTHREAD
6351 #ifdef USE_DIAGPOOL
6352   if (diag_debug == true && nworkers > 0) {
6353     fprintf(stderr,"For diag output, must specify 0 threads\n");
6354     exit(9);
6355   }
6356 #endif
6357 #endif
6358 
6359   if (user_cmdline != NULL) {
6360     part_modulus = 0;
6361     part_interval = 1;
6362     inbuffer_nspaces = 0;
6363     nchromosomes = 1;
6364     dbroot = (char *) NULL;
6365   } else if (user_selfalign_p == true) {
6366     nchromosomes = 1;
6367     dbroot = (char *) NULL;
6368   } else if (user_pairalign_p == true) {
6369     nchromosomes = 1;
6370     dbroot = (char *) NULL;
6371   } else if (user_genomicseg != NULL) {
6372     /* Ignore -D and -d flags */
6373     nchromosomes = 1;
6374     dbroot = (char *) NULL;
6375   } else if (dbroot == NULL) {
6376     fprintf(stderr,"Need to specify the -d, -g, -1, -2, or --cmdline flag\n");
6377     print_program_usage();
6378     return 9;
6379   } else if (!strcmp(dbroot,"?")) {
6380     Datadir_avail_gmap_databases(stdout,user_genomedir);
6381     return 1;
6382   }
6383 
6384 #ifndef PMAP
6385   if (printtype == SAM) {
6386     if (sam_read_group_id == NULL && sam_read_group_name != NULL) {
6387       sam_read_group_id = sam_read_group_name;
6388     } else if (sam_read_group_id != NULL && sam_read_group_name == NULL) {
6389       sam_read_group_name = sam_read_group_id;
6390     }
6391   }
6392 #endif
6393 
6394   return 0;
6395 }
6396 
6397 
6398 static Inbuffer_T
open_input_stream(int * nread,Sequence_T * usersegment,int argc,char ** argv)6399 open_input_stream (int *nread, Sequence_T *usersegment, int argc, char **argv) {
6400   Inbuffer_T inbuffer;
6401   int nextchar = '\0';
6402   FILE *input = NULL;
6403   char **files;
6404   int nfiles;
6405 
6406   Request_T request;
6407   char *p;
6408 
6409   /* Read user segment before rest of sequences, because of shared usage of sequence.c */
6410   if (user_cmdline != NULL) {
6411     p = user_cmdline;
6412     while (*p != '\0' && *p != ',') {
6413       p++;
6414     }
6415     if (*p == '\0') {
6416       fprintf(stderr,"--cmdline requires two strings separated by a comma");
6417       exit(9);
6418     } else {
6419       *usersegment = global_usersegment = Sequence_genomic_new(user_cmdline,(int) (p - user_cmdline),/*copyp*/true);
6420       if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6421 	min_matches = MIN_MATCHES;
6422       }
6423       p++;
6424     }
6425 
6426   } else if (user_selfalign_p == true) {
6427     /* usersegment will be assigned to query sequence below */
6428 
6429   } else if (user_pairalign_p == true) {
6430     /* Unfortunately, this procedure reads header of queryseq */
6431     *usersegment = global_usersegment = Sequence_read_unlimited(&nextchar,stdin);
6432     if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6433       min_matches = MIN_MATCHES;
6434     }
6435 
6436   } else if (user_genomicseg != NULL) {
6437     if ((input = FOPEN_READ_TEXT(user_genomicseg)) == NULL) {
6438       fprintf(stderr,"Can't open file %s\n",user_genomicseg);
6439       exit(9);
6440     }
6441     if ((*usersegment = global_usersegment = Sequence_read_unlimited(&nextchar,input)) == NULL) {
6442       fprintf(stderr,"File %s is empty\n",user_genomicseg);
6443       exit(9);
6444     } else {
6445       genomelength = (Univcoord_T) Sequence_fulllength(*usersegment);
6446     }
6447 
6448     if ((min_matches = Sequence_fulllength(*usersegment)/2) > MIN_MATCHES) {
6449       min_matches = MIN_MATCHES;
6450     }
6451     fclose(input);
6452 
6453   } else {
6454     min_matches = MIN_MATCHES;
6455   }
6456 
6457   Inbuffer_setup(/*filter_if_both_p*/false,user_pairalign_p,global_usersegment,
6458 		 part_modulus,part_interval);
6459   if (user_cmdline != NULL) {
6460     inbuffer = Inbuffer_cmdline(p,strlen(p));
6461     *nread = 1;
6462 
6463   } else if (user_selfalign_p == true) {
6464       input = stdin;
6465       files = (char **) NULL;
6466       nfiles = 0;
6467 
6468       /* Read in first batch of sequences */
6469       inbuffer = Inbuffer_new(nextchar,input,read_files_command,files,nfiles,inbuffer_nspaces);
6470       *nread = Inbuffer_fill_init(inbuffer);
6471       request = Inbuffer_first_request(inbuffer); /* Need usersegment, not the request itself */
6472       *usersegment = Request_queryseq(request);
6473 
6474   } else {
6475     /* Open input stream and peek at first char */
6476     if (user_pairalign_p == true) {
6477       input = stdin;
6478       files = (char **) NULL;
6479       nfiles = 0;
6480       inbuffer_nspaces = 1;
6481     } else if (argc == 0) {
6482       fprintf(stderr,"Reading from stdin\n");
6483       input = stdin;
6484       files = (char **) NULL;
6485       nfiles = 0;
6486     } else {
6487       input = NULL;
6488       files = argv;
6489       nfiles = argc;
6490     }
6491 
6492     /* Read in first batch of sequences */
6493     inbuffer = Inbuffer_new(nextchar,input,read_files_command,files,nfiles,inbuffer_nspaces);
6494 #ifdef USE_MPI
6495     *nread = 0;
6496 #else
6497     *nread = Inbuffer_fill_init(inbuffer);
6498 #endif
6499   }
6500 
6501   return inbuffer;
6502 }
6503 
6504 
6505 int
main(int argc,char * argv[])6506 main (int argc, char *argv[]) {
6507   int cmdline_status;
6508 
6509   char *genomesubdir = NULL, *snpsdir = NULL, *modedir = NULL, *mapdir = NULL, *iitfile = NULL, *fileroot = NULL;
6510   char *idx_filesuffix1, *idx_filesuffix2;
6511   int divno;
6512   Univinterval_T interval;
6513   Sequence_T usersegment = NULL;
6514 
6515   int nread;
6516   double runtime;
6517 
6518   Splicestringpool_T splicestringpool;
6519 
6520 #ifdef HAVE_PTHREAD
6521   int ret, i;
6522   pthread_attr_t thread_attr_join;
6523 #ifdef WORKER_DETACH
6524   pthread_attr_t thread_attr_detach;
6525 #endif
6526 #endif
6527 
6528 #ifdef HAVE_SIGACTION
6529   struct sigaction signal_action;
6530 #endif
6531 
6532   extern int optind;
6533 
6534 #ifdef MEMUSAGE
6535   Mem_usage_init();
6536   Mem_usage_set_threadname("main");
6537 #endif
6538 
6539 
6540 #ifdef USE_MPI
6541   MPI_Init(&argc,&argv);
6542   MPI_Comm_rank(MPI_COMM_WORLD,&myid);
6543   MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
6544 
6545   if ((n_worker_procs = nprocs - 1) == 0) {
6546     if (myid == 0) {
6547       fprintf(stderr,"Need at least 2 processes for MPI version\n");
6548     }
6549     MPI_Finalize();
6550     exit(0);
6551 
6552   } else {
6553     MPI_Debug_setup(myid);
6554   }
6555 #endif
6556 
6557   cmdline_status = parse_command_line(argc,argv,optind);
6558   argc -= optind;
6559   argv += optind;
6560 
6561   if (cmdline_status == 0) {
6562     /* okay to continue */
6563   } else if (cmdline_status == 1) {
6564     /* only information needed */
6565 #ifdef USE_MPI
6566     MPI_Finalize();
6567 #endif
6568     exit(0);
6569   } else {
6570 #ifdef USE_MPI
6571     MPI_Finalize();
6572 #endif
6573     exit(cmdline_status);
6574   }
6575 
6576   check_compiler_assumptions();
6577 
6578   if (exception_raise_p == false) {
6579     fprintf(stderr,"Allowing signals and exceptions to pass through.  If using shared memory, need to remove segments manually.\n");
6580     Except_inactivate();
6581   } else {
6582 #ifdef HAVE_SIGACTION
6583     signal_action.sa_handler = signal_handler;
6584     signal_action.sa_flags = 0;
6585     sigfillset(&signal_action.sa_mask); /* After first signal, block all other signals */
6586 
6587     /* Note: SIGKILL and SIGSTOP cannot be caught */
6588 
6589     sigaction(SIGFPE,&signal_action,NULL);
6590     sigaction(SIGSEGV,&signal_action,NULL);
6591     sigaction(SIGTRAP,&signal_action,NULL);
6592     sigaction(SIGUSR1,&signal_action,NULL);
6593     sigaction(SIGABRT,&signal_action,NULL); /* abnormal termination (abort) */
6594     sigaction(SIGBUS,&signal_action,NULL);  /* bus error */
6595     sigaction(SIGFPE,&signal_action,NULL);  /* arithmetic exception */
6596     sigaction(SIGHUP,&signal_action,NULL);  /* hangup */
6597     sigaction(SIGILL,&signal_action,NULL);  /* illegal hardware instruction */
6598     sigaction(SIGINT,&signal_action,NULL);  /* terminal interruption (control-C) */
6599     sigaction(SIGPIPE,&signal_action,NULL);  /* write to pipe with no readers */
6600     sigaction(SIGQUIT,&signal_action,NULL);  /* terminal quit (control-backslash) */
6601     sigaction(SIGSEGV,&signal_action,NULL);  /* invalid memory reference */
6602     sigaction(SIGSYS,&signal_action,NULL);  /* invalid system call */
6603     sigaction(SIGTERM,&signal_action,NULL);  /* Unix kill command */
6604     sigaction(SIGTRAP,&signal_action,NULL);  /* hardware fault */
6605     sigaction(SIGXCPU,&signal_action,NULL);  /* CPU limit exceeded */
6606     sigaction(SIGXFSZ,&signal_action,NULL);  /* file size limit exceeded */
6607 #endif
6608   }
6609 
6610 #ifdef USE_MPI
6611   if (myid > 0) {
6612     inbuffer = open_input_stream(&nread,&usersegment,argc,argv);
6613   }
6614 
6615 #else
6616   inbuffer = open_input_stream(&nread,&usersegment,argc,argv);
6617 
6618   if (nread > 1) {
6619     multiple_sequences_p = true;
6620 #if 0
6621 #ifdef HAVE_MMAP
6622     if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE) {
6623       fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
6624       fprintf(stderr,"  GMAP can run slowly at first while the computer starts to accumulate\n");
6625       fprintf(stderr,"  pages from the hard disk into its cache.  To copy index files into RAM\n");
6626       fprintf(stderr,"  instead of memory mapping, use -B 3, -B 4, or -B 5, if you have enough RAM.\n");
6627 #ifdef HAVE_PTHREAD
6628       fprintf(stderr,"  For more speed, also try multiple threads (-t <int>), if you have multiple processors or cores.");
6629 #endif
6630       fprintf(stderr,"\n");
6631 #endif
6632     }
6633 #endif
6634 
6635   } else {
6636     /* fprintf(stderr,"Note: only 1 sequence detected.  Ignoring batch (-B) command\n"); */
6637     multiple_sequences_p = false;
6638     expand_offsets_p = false;
6639 #ifdef HAVE_MMAP
6640     offsetsstrm_access = USE_MMAP_ONLY;
6641     positions_access = USE_MMAP_ONLY;
6642     genome_access = USE_MMAP_ONLY;
6643 #else
6644     offsetsstrm_access = USE_ALLOCATE;
6645     positions_access = USE_ALLOCATE;
6646     genome_access = USE_ALLOCATE;
6647 #endif
6648   }
6649 
6650 #endif
6651 
6652 
6653   if (dbroot != NULL) {
6654     /* Prepare genomic data */
6655     genomesubdir = Datadir_find_genomesubdir(&fileroot,&dbversion,user_genomedir,dbroot);
6656 
6657     iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
6658 			      strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
6659     sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
6660     if ((chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
6661       fprintf(stderr,"IIT file %s is not valid\n",iitfile);
6662       exit(9);
6663 #ifdef LARGE_GENOMES
6664     } else if (Univ_IIT_coord_values_8p(chromosome_iit) == false) {
6665       fprintf(stderr,"This program gmapl is designed for large genomes.\n");
6666       fprintf(stderr,"For small genomes of less than 2^32 (4 billion) bp, please run gmap instead.\n");
6667       exit(9);
6668 #endif
6669     } else {
6670       FREE(iitfile);
6671       nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
6672       circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
6673 
6674       iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
6675 				strlen(fileroot)+strlen(".altscaffold.iit")+1,sizeof(char));
6676       sprintf(iitfile,"%s/%s.altscaffold.iit",genomesubdir,fileroot);
6677       if ((altscaffold_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
6678 	/* fprintf(stderr,"No altscaffold file found\n"); */
6679 	altlocp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
6680 	alias_starts = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
6681 	alias_ends = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
6682 
6683       } else {
6684 	fprintf(stderr,"Found altscaffold file found\n");
6685 	altlocp = Univ_IIT_altlocp(&alias_starts,&alias_ends,chromosome_iit,altscaffold_iit);
6686 	Univ_IIT_free(&altscaffold_iit);
6687       }
6688       FREE(iitfile);
6689     }
6690 
6691     genomelength = Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias*/true);
6692   }
6693 
6694 #ifdef USE_MPI
6695   /* Can prevent loading of files by rank 0 process */
6696 #endif
6697 
6698   if (map_iitfile == NULL) {
6699     /* Skip */
6700   } else if (!strcmp(map_iitfile,"?")) {
6701     Datadir_avail_maps(stdout,user_mapdir,genomesubdir,fileroot);
6702     exit(0);
6703   } else {
6704     mapdir = Datadir_find_mapdir(user_mapdir,genomesubdir,fileroot);
6705     iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
6706 			      strlen(map_iitfile)+strlen(".iit")+1,sizeof(char));
6707     sprintf(iitfile,"%s/%s.iit",mapdir,map_iitfile);
6708     if ((map_iit = IIT_read(iitfile,/*name*/map_iitfile,/*readonlyp*/true,/*divread*/READ_ALL,
6709 			    /*divstring*/NULL,/*add_iit_p*/true)) == NULL) {
6710       fprintf(stderr,"Map file %s.iit not found in %s.  Available files:\n",map_iitfile,mapdir);
6711       Datadir_list_directory(stderr,mapdir);
6712       fprintf(stderr,"Either install file %s.iit or specify a directory for the IIT file\n",iitfile);
6713       fprintf(stderr,"using the -M flag.\n");
6714       exit(9);
6715     } else {
6716       map_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,map_iit);
6717     }
6718 
6719     check_map_iit(map_iit,chromosome_iit);
6720 
6721     FREE(iitfile);
6722     FREE(mapdir);
6723     FREE(map_iitfile);
6724   }
6725 
6726   if (splicing_file != NULL) {
6727     if (user_splicingdir == NULL) {
6728       if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6729 				   /*divstring*/NULL,/*add_iit_p*/false)) != NULL) {
6730 	fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
6731       } else {
6732 	iitfile = (char *) CALLOC(strlen(user_splicingdir)+strlen("/")+strlen(splicing_file)+1,sizeof(char));
6733 	sprintf(iitfile,"%s/%s",user_splicingdir,splicing_file);
6734 	if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6735 				     /*divstring*/NULL,/*add_iit_p*/false)) != NULL) {
6736 	  fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
6737 	  FREE(iitfile);
6738 	}
6739       }
6740     }
6741 
6742     if (splicing_iit == NULL) {
6743       mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,fileroot);
6744       iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
6745 				strlen(splicing_file)+1,sizeof(char));
6746       sprintf(iitfile,"%s/%s",mapdir,splicing_file);
6747       if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
6748 				   /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
6749 	fprintf(stderr,"Reading splicing file %s...",iitfile);
6750 	FREE(iitfile);
6751 	FREE(mapdir);
6752       } else {
6753 	fprintf(stderr,"Splicing file %s.iit not found locally or in %s.  Available files:\n",splicing_file,mapdir);
6754 	Datadir_list_directory(stderr,mapdir);
6755 	fprintf(stderr,"Either install file %s or specify a full directory path\n",splicing_file);
6756 	exit(9);
6757       }
6758     }
6759   }
6760 
6761   /* Complement_init(); */
6762   Dynprog_init(mode);
6763 #ifdef PMAP
6764   Backtranslation_init();
6765 #endif
6766 
6767   if (user_pairalign_p == true) {
6768     /* maxpaths_report = 1; -- no; could have different paths against the user segment. */
6769 
6770     genomecomp = (Genome_T) NULL;
6771     genomecomp_alt = (Genome_T) NULL;
6772     dbversion = (char *) NULL;
6773     altlocp = (bool *) MALLOC(sizeof(bool));
6774     altlocp[0] = false;
6775     alias_starts = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6776     alias_ends = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6777 
6778     /* Create genomecomp_blocks for each usersegment */
6779 
6780   } else if (global_usersegment != NULL) {
6781     /* Map against user-provided genomic segment */
6782     /* maxpaths_report = 1; -- no; could have different paths against the user segment. */
6783 
6784     genomecomp = (Genome_T) NULL;
6785     genomecomp_alt = (Genome_T) NULL;
6786     dbversion = (char *) NULL;
6787     genomecomp_blocks = Compress_create_blocks_comp(Sequence_fullpointer(global_usersegment),Sequence_fulllength(global_usersegment));
6788     altlocp = (bool *) MALLOC(sizeof(bool));
6789     altlocp[0] = false;
6790     alias_starts = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6791     alias_ends = (Univcoord_T *) CALLOC(1,sizeof(Univcoord_T));
6792 
6793     if (Sequence_fulllength(global_usersegment) > 1000000) {
6794       fprintf(stderr,"Genomic sequence is unusually long (%d bp).  GMAP handles genomes better when\n",
6795 	      Sequence_fulllength(global_usersegment));
6796       fprintf(stderr,"  they are converted into gmap databases first using gmap_build, and then accessed\n");
6797       fprintf(stderr,"  with the -d flag.\n");
6798     }
6799 
6800   } else {
6801     if (snps_root == NULL) {
6802       genomecomp = Genome_new(genomesubdir,fileroot,/*snps_root*/NULL,/*genometype*/GENOME_OLIGOS,
6803 			      uncompressedp,genome_access,sharedp);
6804       genomecomp_blocks = Genome_blocks(genomecomp);
6805       genomecomp_alt = (Genome_T) NULL;
6806 
6807     } else {
6808       /* Map against genome with SNPs */
6809       if (user_snpsdir == NULL) {
6810 	snpsdir = genomesubdir;
6811       } else {
6812 	snpsdir = user_snpsdir;
6813       }
6814 
6815       genomecomp = Genome_new(genomesubdir,fileroot,/*snps_root*/NULL,/*genometype*/GENOME_OLIGOS,
6816 			      uncompressedp,genome_access,sharedp);
6817       genomecomp_blocks = Genome_blocks(genomecomp);
6818       genomecomp_alt = Genome_new(snpsdir,fileroot,snps_root,/*genometype*/GENOME_OLIGOS,
6819 				  uncompressedp,genome_access,sharedp);
6820     }
6821 
6822     if (user_modedir != NULL) {
6823       modedir = user_modedir;
6824     } else {
6825       modedir = genomesubdir;
6826     }
6827 
6828     if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6829       idx_filesuffix1 = "metct";
6830       idx_filesuffix2 = "metga";
6831     } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
6832       idx_filesuffix1 = "a2iag";
6833       idx_filesuffix2 = "a2itc";
6834     } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
6835       idx_filesuffix1 = "a2itc";
6836       idx_filesuffix2 = "a2iag";
6837     } else {
6838       idx_filesuffix1 = IDX_FILESUFFIX; /* "ref" */
6839       idx_filesuffix2 = (char *) NULL;
6840     }
6841 
6842     if ((indexdb_fwd = Indexdb_new_genome(&index1part,&index1interval,
6843 					  modedir,fileroot,idx_filesuffix1,snps_root,
6844 					  required_index1part,required_index1interval,
6845 					  offsetsstrm_access,positions_access,
6846 					  sharedp,multiple_sequences_p,/*preload_shared_memory_p*/false,
6847 					  /*unload_shared_memory_p*/false)) == NULL) {
6848 
6849       if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6850 	fprintf(stderr,"Cannot find %s index file.  Need to run cmetindex first\n",idx_filesuffix1);
6851       } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED ||
6852 		 mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
6853 	fprintf(stderr,"Cannot find %s index file.  Need to run atoiindex first\n",idx_filesuffix1);
6854       } else {
6855 	fprintf(stderr,"Cannot find %s index file\n",idx_filesuffix1);
6856       }
6857       exit(9);
6858     }
6859 
6860     if (idx_filesuffix2 == NULL) {
6861       indexdb_rev = indexdb_fwd;
6862     } else if ((indexdb_rev = Indexdb_new_genome(&index1part,&index1interval,
6863 						 modedir,fileroot,idx_filesuffix2,snps_root,
6864 						 required_index1part,required_index1interval,
6865 						 offsetsstrm_access,positions_access,
6866 						 sharedp,multiple_sequences_p,/*preload_shared_memory_p*/false,
6867 						 /*unload_shared_memory_p*/false)) == NULL) {
6868       if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
6869 	fprintf(stderr,"Cannot find %s index file.  Need to run cmetindex first\n",idx_filesuffix2);
6870       } else {
6871 	fprintf(stderr,"Cannot find %s index file.  Need to run atoiindex first\n",idx_filesuffix2);
6872       }
6873       exit(9);
6874     }
6875 
6876 
6877     if (user_chrsubsetname != NULL) {
6878       if ((divno = Univ_IIT_find_one(chromosome_iit,user_chrsubsetname)) < 0) {
6879 	fprintf(stderr,"Cannot find chrsubset %s in chromosome IIT file.  Ignoring.\n",user_chrsubsetname);
6880       } else {
6881 	interval = Univ_IIT_interval(chromosome_iit,divno);
6882 	chrsubset_start = Univinterval_low(interval);
6883 	chrsubset_end = Univinterval_high(interval);
6884       }
6885     }
6886   }
6887 
6888   FREE(genomesubdir);
6889   FREE(fileroot);
6890   FREE(dbroot);
6891 
6892 
6893   if (splicing_file != NULL && genomecomp != NULL) {
6894     if (Genome_blocks(genomecomp) == NULL) {
6895       fprintf(stderr,"known splicing can be used only with compressed genome\n");
6896     } else {
6897       /* TODO: Handle case for observed distances */
6898       /* min_extra_end no longer used by gregion.c */
6899       /* min_extra_end = shortsplicedist; */
6900 
6901       splicing_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,splicing_iit);
6902       if ((donor_typeint = IIT_typeint(splicing_iit,"donor")) >= 0 &&
6903 	  (acceptor_typeint = IIT_typeint(splicing_iit,"acceptor")) >= 0) {
6904 	fprintf(stderr,"found donor and acceptor tags, so treating as splicesites file\n");
6905 	splicestringpool = Splicestringpool_new();
6906 	splicesites = Splicetrie_retrieve_via_splicesites(&distances_observed_p,&splicetypes,&splicedists,
6907 							  &splicestrings,&splicefrags_ref,&splicefrags_alt,
6908 							  &nsplicesites,splicing_iit,splicing_divint_crosstable,
6909 							  donor_typeint,acceptor_typeint,chromosome_iit,
6910 							  genomecomp,genomecomp_alt/*can be NULL*/,shortsplicedist,
6911 							  splicestringpool);
6912 	if (nsplicesites == 0) {
6913 	  fprintf(stderr,"\nWarning: No splicesites observed for genome %s.  Are you sure this splicesite file was built for this genome?  Please compare chromosomes below:\n",
6914 		  dbroot);
6915 	  fprintf(stderr,"Chromosomes in the genome: ");
6916 	  Univ_IIT_dump_labels(stderr,chromosome_iit);
6917 	  fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
6918 	  IIT_dump_divstrings(stderr,splicing_iit);
6919 	  exit(9);
6920 
6921 	} else {
6922 	  Splicetrie_npartners(&nsplicepartners_skip,&nsplicepartners_obs,&nsplicepartners_max,splicesites,splicetypes,splicedists,
6923 			       splicestrings,nsplicesites,chromosome_iit,shortsplicedist,distances_observed_p);
6924 	  Splicetrie_build_via_splicesites(&triecontents_obs,&trieoffsets_obs,&triecontents_max,&trieoffsets_max,
6925 					   nsplicepartners_skip,nsplicepartners_obs,nsplicepartners_max,splicetypes,
6926 					   splicestrings,nsplicesites);
6927 	  FREE(nsplicepartners_max);
6928 	  FREE(nsplicepartners_obs);
6929 	  FREE(nsplicepartners_skip);
6930 	  /* Splicestring_gc(splicestrings,nsplicesites); */
6931 	  FREE(splicestrings);
6932 	}
6933 	Splicestringpool_free(&splicestringpool);
6934 
6935       } else {
6936 	fprintf(stderr,"no donor or acceptor tags found, so treating as introns file\n");
6937 	splicestringpool = Splicestringpool_new();
6938 	splicesites = Splicetrie_retrieve_via_introns(&splicetypes,&splicedists,
6939 						      &splicestrings,&splicefrags_ref,&splicefrags_alt,
6940 						      &nsplicesites,splicing_iit,splicing_divint_crosstable,
6941 						      chromosome_iit,genomecomp,genomecomp_alt/*can be NULL*/,
6942 						      splicestringpool);
6943 	if (nsplicesites == 0) {
6944 	  fprintf(stderr,"\nWarning: No splicesites observed for genome %s.  Are you sure this splicesite file was built for this genome?  Please compare chromosomes below:\n",
6945 		  dbroot);
6946 	  fprintf(stderr,"Chromosomes in the genome: ");
6947 	  Univ_IIT_dump_labels(stderr,chromosome_iit);
6948 	  fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
6949 	  IIT_dump_divstrings(stderr,splicing_iit);
6950 	  exit(9);
6951 	} else {
6952 	  Splicetrie_build_via_introns(&triecontents_obs,&trieoffsets_obs,splicesites,splicetypes,
6953 				       splicestrings,nsplicesites,chromosome_iit,splicing_iit,splicing_divint_crosstable);
6954 	  triecontents_max = (Triecontent_T *) NULL;
6955 	  trieoffsets_max =  (Trieoffset_T *) NULL;
6956 	  /* Splicestring_gc(splicestrings,nsplicesites); */
6957 	  FREE(splicestrings);
6958 	}
6959 	Splicestringpool_free(&splicestringpool);
6960 
6961       }
6962     }
6963 
6964     fprintf(stderr,"done\n");
6965   }
6966 
6967 
6968   Translation_setup(translation_code,alt_initiation_codons_p);
6969 
6970   if (user_pairalign_p == true) {
6971     /* Creation of genomecomp and initialization done within single_thread() for each input sequence */
6972     any_circular_p = false;
6973     circularp = (bool *) MALLOC(1*sizeof(bool));
6974     circularp[0] = false;
6975 
6976   } else if (usersegment != NULL) {
6977     any_circular_p = false;
6978     circularp = (bool *) MALLOC(1*sizeof(bool));
6979     circularp[0] = false;
6980 
6981     Genome_user_setup(genomecomp_blocks,genomelength);
6982     Genome_sites_setup(genomecomp_blocks,/*snp_blocks*/NULL);
6983     Maxent_hr_setup(genomecomp_blocks,/*genomealt_blocks*/genomecomp_blocks);
6984 #ifdef PMAP
6985     Oligoindex_pmap_setup(genomecomp);
6986 #else
6987     Oligoindex_hr_setup(genomecomp_blocks,mode);
6988     /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
6989 #endif
6990 
6991   } else if (genomecomp != NULL) {
6992     circularp = Univ_IIT_circularp(&any_circular_p,chromosome_iit);
6993 
6994     Genome_setup(genomecomp,genomecomp_alt/*can be NULL*/,genomelength,mode,circular_typeint);
6995     Genome_sites_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
6996     Maxent_hr_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
6997 #ifdef PMAP
6998     Alphabet_setup(alphabet,alphabet_size,index1part_aa);
6999     Oligoindex_pmap_setup(genomecomp);
7000     Oligop_setup(alphabet,alphabet_size,index1part_aa);
7001     Indexdb_setup(index1part_aa);
7002     Stage1_setup(index1part_aa,maxextension,maxtotallen_bound,circular_typeint);
7003 #else
7004     Oligoindex_hr_setup(Genome_blocks(genomecomp),mode);
7005     /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
7006     Oligo_setup(index1part,mode);
7007     Indexdb_setup(index1part);
7008     Stage1_setup(index1part,maxextension,maxtotallen_bound,circular_typeint);
7009 #endif
7010   }
7011 
7012   Stage2_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,cross_species_p,
7013 	       suboptimal_score_start,suboptimal_score_end,sufflookback,nsufflookback,maxintronlen,mode,
7014 	       /*snps_p*/genomecomp_alt ? true : false);
7015   Dynprog_single_setup(homopolymerp);
7016   Dynprog_genome_setup(novelsplicingp,splicing_iit,splicing_divint_crosstable,
7017 		       donor_typeint,acceptor_typeint);
7018   Dynprog_end_setup(splicesites,splicetypes,splicedists,nsplicesites,
7019 		    trieoffsets_obs,triecontents_obs,trieoffsets_max,triecontents_max);
7020   Pair_setup(novelsplicingp,splicing_iit,trim_indel_score,
7021 	     gff3_separators_p,sam_insert_0M_p,force_xs_direction_p,
7022 	     md_lowercase_variant_p,/*snps_p*/genomecomp_alt ? true : false,
7023 	     gff3_phase_swap_p,cdstype,sam_cigar_extended_p,cigar_action);
7024 
7025   Stage3_setup(/*splicingp*/novelsplicingp == true || knownsplicingp == true,novelsplicingp,
7026 	       require_splicedir_p,shortsplicedist,splicing_iit,splicing_divint_crosstable,
7027 	       donor_typeint,acceptor_typeint,splicesites,circularp,altlocp,alias_starts,alias_ends,
7028 	       min_intronlength,max_deletionlength,/*min_indel_end_matches*/6,
7029 	       maxpeelback_distalmedial,nullgap,extramaterial_end,extramaterial_paired,
7030 	       extraband_single,extraband_end,extraband_paired,
7031 	       ngap,maxintronlen,maxintronlen_ends,minendexon,homopolymerp,gff3_fasta_annotation_type,
7032 	       stage3debug,/*genome_totallength*/genomelength);
7033 
7034   Splicetrie_setup(splicesites,splicefrags_ref,splicefrags_alt,
7035 		   trieoffsets_obs,triecontents_obs,trieoffsets_max,triecontents_max,
7036 		   /*snpp*/false,amb_closest_p,/*amb_clip_p*/true,/*min_shortend*/2);
7037   Output_setup(chromosome_iit,nofailsp,failsonlyp,quiet_if_excessive_p,maxpaths_report,
7038 	       failedinput_root,quality_shift,
7039 	       printtype,invertmode,wraplength,ngap,nointronlenp,sam_paired_p,cds_startpos,
7040 	       fulllengthp,truncatep,strictp,checksump,genomecomp,usersegment,user_genomicseg,
7041 	       dbversion,user_chrsubsetname,contig_iit,altstrain_iit,
7042 	       /*chimeras_allowed_p*/chimera_margin > 0 ? true : false,
7043 	       map_iit,map_divint_crosstable,map_exons_p,map_bothstrands_p,
7044 	       nflanking,print_comment_p,sam_read_group_id);
7045 
7046 #ifdef USE_MPI
7047   if (myid == 0) {
7048     Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
7049 		    nworkers,orderedp,quiet_if_excessive_p,
7050 		    printtype,usersegment,sam_headers_p,sam_read_group_id,sam_read_group_name,
7051 		    sam_read_group_library,sam_read_group_platform,
7052 		    appendp,/*output_file*/NULL,split_output_root,failedinput_root);
7053     outbuffer = Outbuffer_new(output_buffer_size,/*nread*/0);
7054     /* Inbuffer_set_outbuffer(inbuffer,outbuffer); */
7055 
7056     fprintf(stderr,"Starting alignment\n");
7057     stopwatch = Stopwatch_new();
7058     Stopwatch_start(stopwatch);
7059   }
7060 #else
7061   Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
7062 		  nworkers,orderedp,quiet_if_excessive_p,
7063 		  printtype,usersegment,sam_headers_p,sam_read_group_id,sam_read_group_name,
7064 		  sam_read_group_library,sam_read_group_platform,
7065 		  appendp,/*output_file*/NULL,split_output_root,failedinput_root);
7066   outbuffer = Outbuffer_new(output_buffer_size,nread);
7067   Inbuffer_set_outbuffer(inbuffer,outbuffer);
7068 
7069   fprintf(stderr,"Starting alignment\n");
7070   stopwatch = Stopwatch_new();
7071   Stopwatch_start(stopwatch);
7072 #endif
7073 
7074 
7075 #ifdef USE_MPI
7076   /* MPI version */
7077   if (myid == 0) {
7078 #ifdef WORKER_DETACH
7079     pthread_attr_init(&thread_attr_detach);
7080     if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
7081       fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7082       exit(1);
7083     }
7084 #endif
7085     pthread_attr_init(&thread_attr_join);
7086     if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
7087       fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7088       exit(1);
7089     }
7090 
7091     Except_init_pthread();
7092     /* pthread_key_create(&global_request_key,NULL); */
7093 
7094     if (orderedp == true) {
7095       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
7096 		     (void *) outbuffer);
7097     } else {
7098       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
7099 		     (void *) outbuffer);
7100     }
7101 
7102     Outbuffer_mpi_process(outbuffer,/*n_worker_procs*/nprocs - 1,part_modulus,part_interval);
7103     pthread_join(output_thread_id,NULL);
7104 
7105     /* pthread_key_delete(global_request_key); */
7106     /* Except_term_pthread(); */
7107 
7108   } else {
7109     worker_mpi_process(/*worker_id*/myid,inbuffer);
7110   }
7111 
7112 #elif !defined(HAVE_PTHREAD)
7113   /* Serial version */
7114   single_thread();
7115 
7116 #else
7117   /* Pthreads version */
7118   if (nworkers == 0) {
7119     single_thread();
7120 
7121   } else if (multiple_sequences_p == false) {
7122     single_thread();
7123 
7124   } else {
7125 #ifdef WORKER_DETACH
7126     pthread_attr_init(&thread_attr_detach);
7127     if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
7128       fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7129       exit(1);
7130     }
7131 #endif
7132     pthread_attr_init(&thread_attr_join);
7133     if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
7134       fprintf(stderr,"ERROR: pthread_attr_setdetachstate %d\n",ret);
7135       exit(1);
7136     }
7137 
7138     worker_thread_ids = (pthread_t *) CALLOC(nworkers,sizeof(pthread_t));
7139     Except_init_pthread();
7140     pthread_key_create(&global_request_key,NULL);
7141 
7142     if (orderedp == true) {
7143       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
7144 		     (void *) outbuffer);
7145     } else {
7146       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
7147 		     (void *) outbuffer);
7148     }
7149 
7150     for (i = 0; i < nworkers; i++) {
7151 #ifdef WORKER_DETACH
7152       pthread_create(&(worker_thread_ids[i]),&thread_attr_detach,worker_thread,(void *) NULL);
7153 #else
7154       /* Need to have worker threads finish before we call Inbuffer_free() */
7155       pthread_create(&(worker_thread_ids[i]),&thread_attr_join,worker_thread,(void *) NULL);
7156 #endif
7157     }
7158 
7159     pthread_join(output_thread_id,NULL);
7160     for (i = 0; i < nworkers; i++) {
7161       pthread_join(worker_thread_ids[i],NULL);
7162     }
7163 
7164     pthread_key_delete(global_request_key);
7165     /* Do not delete global_except_key, because worker threads might still need it */
7166     /* Except_term_pthread(); */
7167 
7168     FREE(worker_thread_ids);
7169 
7170   }
7171 #endif /* HAVE_PTHREAD */
7172 
7173 
7174 #ifdef USE_MPI
7175   if (myid == 0) {
7176     runtime = Stopwatch_stop(stopwatch);
7177     Stopwatch_free(&stopwatch);
7178 
7179     nread = Outbuffer_nread(outbuffer);
7180     nbeyond = Outbuffer_nbeyond(outbuffer);
7181     fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
7182 	    nread-nbeyond,runtime,(double) nread/runtime);
7183 
7184     Outbuffer_free(&outbuffer);
7185     Inbuffer_free(&inbuffer);	/* Also closes inputs */
7186   }
7187 
7188   Outbuffer_close_files();	/* All ranks have to close the files */
7189 
7190 #else
7191   /* Single CPU or Pthreads version */
7192   runtime = Stopwatch_stop(stopwatch);
7193   Stopwatch_free(&stopwatch);
7194 
7195   nread = Outbuffer_nread(outbuffer);
7196   /* nbeyond = Outbuffer_nbeyond(outbuffer); */
7197   fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
7198 	  nread,runtime,(double) nread/runtime);
7199 
7200   Outbuffer_free(&outbuffer);
7201   Inbuffer_free(&inbuffer);	/* Also closes inputs */
7202 
7203   Outbuffer_close_files();
7204 #endif
7205 
7206 #ifdef PMAP
7207   Backtranslation_term();
7208 #endif
7209   Dynprog_term(mode);
7210 
7211 
7212   if (nsplicesites > 0) {
7213     if (splicetrie_precompute_p == true) {
7214       FREE(triecontents_max);
7215       FREE(trieoffsets_max);
7216       FREE(triecontents_obs);
7217       FREE(trieoffsets_obs);
7218     } else {
7219       FREE(nsplicepartners_max);
7220       FREE(nsplicepartners_obs);
7221       FREE(nsplicepartners_skip);
7222       /* Splicestring_gc(splicestrings,nsplicesites); */
7223       FREE(splicestrings);
7224     }
7225     FREE(splicefrags_ref);
7226     FREE(splicedists);
7227     FREE(splicetypes);
7228     FREE(splicesites);
7229   }
7230 
7231   if (splicing_iit != NULL) {
7232     FREE(splicing_divint_crosstable);
7233     IIT_free(&splicing_iit);
7234   }
7235 
7236 #if 0
7237   /* Oligoindex_localdb_cleanup(); */
7238   if (localdb != NULL) {
7239     Localdb_free(&localdb);
7240   }
7241 #endif
7242 
7243 #ifdef PMAP
7244  if (indexdb_rev != NULL) {
7245     Indexdb_free(&indexdb_rev);
7246   }
7247   if (indexdb_fwd != NULL) {
7248     Indexdb_free(&indexdb_fwd);
7249   }
7250 #else
7251   if (indexdb_rev != indexdb_fwd) {
7252     Indexdb_free(&indexdb_rev);
7253   }
7254   if (indexdb_fwd != NULL) {
7255     Indexdb_free(&indexdb_fwd);
7256   }
7257 #endif
7258   if (dbversion != NULL) {
7259     FREE(dbversion);
7260   }
7261   if (altstrain_iit != NULL) {
7262     IIT_free(&altstrain_iit);
7263   }
7264   if (genomecomp_alt != NULL) {
7265     Genome_free(&genomecomp_alt);
7266   }
7267   if (user_pairalign_p == true) {
7268     /* genomecomp_blocks freed within single_thread */
7269   } else if (usersegment != NULL) {
7270     FREE(genomecomp_blocks);
7271   } else if (genomecomp != NULL) {
7272     Genome_free(&genomecomp);
7273   }
7274 
7275   if (map_iit != NULL) {
7276     IIT_free(&map_iit);
7277   }
7278   if (contig_iit != NULL) {
7279     Univ_IIT_free(&contig_iit);
7280   }
7281   if (altlocp != NULL) {
7282     FREE(alias_ends);
7283     FREE(alias_starts);
7284     FREE(altlocp);
7285   }
7286   if (circularp != NULL) {
7287     FREE(circularp);
7288   }
7289   if (chromosome_iit != NULL) {
7290     Univ_IIT_free(&chromosome_iit);
7291   }
7292 
7293   if (user_selfalign_p == true) {
7294     /* Do not free usersegment */
7295   } else if (usersegment != NULL) {
7296     Sequence_free(&usersegment);
7297   }
7298 
7299   Outbuffer_cleanup();
7300 
7301   Access_controlled_cleanup();
7302 
7303 #ifdef USE_MPI
7304   MPI_Barrier(MPI_COMM_WORLD);	/* Make sure all processes have cleaned up */
7305   MPI_Finalize();
7306 #endif
7307 
7308   return 0;
7309 }
7310 
7311 
7312 static void
print_program_usage()7313 print_program_usage () {
7314 #ifdef PMAP
7315     fprintf(stdout,"\
7316 Usage: pmap [OPTIONS...] <FASTA files...>, or\n\
7317        cat <FASTA files...> | pmap [OPTIONS...]\n\
7318 ");
7319 #else
7320     fprintf(stdout,"\
7321 Usage: gmap [OPTIONS...] <FASTA files...>, or\n\
7322        cat <FASTA files...> | gmap [OPTIONS...]\n\
7323 ");
7324 #endif
7325     fprintf(stdout,"\n");
7326 
7327     fprintf(stdout,"Input options (must include -d or -g)\n");
7328     fprintf(stdout,"\
7329   -D, --dir=directory            Genome directory.  Default (as specified by --with-gmapdb to the configure program) is\n \
7330                                    %s\n\
7331 ",GMAPDB);
7332     fprintf(stdout,"\
7333   -d, --db=STRING                Genome database.  If argument is '?' (with\n\
7334                                    the quotes), this command lists available databases.\n\
7335 ");
7336     fprintf(stdout,"\n");
7337 
7338 #ifdef PMAP
7339     fprintf(stdout,"\
7340   -a, --alphabet=STRING          Alphabet to use in PMAP genome database\n\
7341                                    (allowed values in order of preference: 20, 15a, 12a).\n\
7342                                    If not specified, the program will find the first available\n\
7343                                    alphabet in the genome database in preference order\n\
7344 ");
7345 #endif
7346 
7347 #if 0
7348     /* No longer supported */
7349     fprintf(stdout,"\
7350     -G, --genomefull               Use full genome (all ASCII chars allowed;\n \
7351                                    built explicitly during setup), not\n\
7352                                    compressed version\n\
7353 ");
7354 #endif
7355 
7356     fprintf(stdout,"\
7357   -k, --kmer=INT                 kmer size to use in genome database (allowed values: 16 or less).\n\
7358                                    If not specified, the program will find the highest available\n\
7359                                    kmer size in the genome database\n\
7360   --sampling=INT                 Sampling to use in genome database.  If not specified, the program\n\
7361                                    will find the smallest available sampling value in the genome database\n\
7362                                    within selected k-mer size\n\
7363   -g, --gseg=filename            User-supplied genomic segment\n\
7364   -1, --selfalign                Align one sequence against itself in FASTA format via stdin\n\
7365                                    (Useful for getting protein translation of a nucleotide sequence)\n\
7366   -2, --pairalign                Align two sequences in FASTA format via stdin, first one being\n\
7367                                    genomic and second one being cDNA\n\
7368   --cmdline=STRING,STRING        Align these two sequences provided on the command line,\n\
7369                                    first one being genomic and second one being cDNA\n\
7370   -q, --part=INT/INT             Process only the i-th out of every n sequences\n\
7371                                    e.g., 0/100 or 99/100 (useful for distributing jobs\n\
7372                                    to a computer farm).\n\
7373 ");
7374     fprintf(stdout,"\
7375   --input-buffer-size=INT        Size of input buffer (program reads this many sequences\n\
7376                                    at a time for efficiency) (default %d)\n\
7377 ",inbuffer_nspaces);
7378     fprintf(stdout,"\n");
7379 
7380     fprintf(stdout,"Computation options\n");
7381 #ifdef HAVE_MMAP
7382     fprintf(stdout,"\
7383   -B, --batch=INT                Batch mode (default = 2)\n\
7384                                  Mode     Positions       Genome\n\
7385                                    0      mmap            mmap\n\
7386                                    1      mmap & preload  mmap\n\
7387                       (default)    2      mmap & preload  mmap & preload\n\
7388                                    3      allocate        mmap & preload\n\
7389                                    4      allocate        allocate\n\
7390                                    5      allocate        allocate     (same as 4)\n\
7391                            Note: For a single sequence, all data structures use mmap\n\
7392                            If mmap not available and allocate not chosen, then will use fileio (very slow)\n\
7393 ");
7394 #else
7395     fprintf(stdout,"\
7396   -B, --batch=INT                Batch mode (default = 4, modes 0-3 disallowed because program configured without mmap)\n\
7397                                  Mode     Positions       Genome\n\
7398                       (default)    4      allocate        allocate\n\
7399                                    5      allocate        allocate     (same as 4)\n\
7400 ");
7401 #endif
7402 
7403   fprintf(stdout,"\
7404   --use-shared-memory=INT        If 1, then allocated memory is shared among all processes on this node\n\
7405                                    If 0 (default), then each process has private allocated memory\n\
7406 ");
7407 
7408     fprintf(stdout,"\
7409   --nosplicing                   Turns off splicing (useful for aligning genomic sequences\n\
7410                                    onto a genome)\n\
7411 ");
7412     fprintf(stdout,"\
7413   --min-intronlength=INT         Min length for one internal intron (default %d).  Below this size,\n\
7414                                    a genomic gap will be considered a deletion rather than an intron.\n\
7415 ",min_intronlength);
7416     fprintf(stdout,"\
7417   --max-intronlength-middle=INT  Max length for one internal intron (default %d).  Note: for backward\n\
7418                                    compatibility, the -K or --intronlength flag will set both\n\
7419                                    --max-intronlength-middle and --max-intronlength-ends.\n\
7420                                    Also see --split-large-introns below.\n\
7421 ",maxintronlen);
7422     fprintf(stdout,"\
7423   --max-intronlength-ends=INT    Max length for first or last intron (default %d).  Note: for backward\n\
7424                                    compatibility, the -K or --intronlength flag will set both\n\
7425                                    --max-intronlength-middle and --max-intronlength-ends.\n\
7426 ",maxintronlen_ends);
7427     fprintf(stdout,"\
7428   --split-large-introns          Sometimes GMAP will exceed the value for --max-intronlength-middle,\n\
7429                                    if it finds a good single alignment.  However, you can force GMAP\n\
7430                                    to split such alignments by using this flag\n\
7431 ");
7432     fprintf(stdout,"\
7433   --trim-end-exons=INT           Trim end exons with fewer than given number of matches\n\
7434                                    (in nt, default %d)\n\
7435 ",minendexon);
7436     fprintf(stdout,"\
7437   -w, --localsplicedist=INT      Max length for known splice sites at ends of sequence\n\
7438                                    (default %d)\n\
7439 ",shortsplicedist);
7440     fprintf(stdout,"\
7441   -L, --totallength=INT          Max total intron length (default %d)\n\
7442 ",maxtotallen_bound);
7443     fprintf(stdout,"\
7444   -x, --chimera-margin=INT       Amount of unaligned sequence that triggers\n\
7445                                    search for the remaining sequence (default %d).\n\
7446                                    Enables alignment of chimeric reads, and may help\n\
7447                                    with some non-chimeric reads.  To turn off, set to\n\
7448                                    zero.\n\
7449 ",chimera_margin);
7450     fprintf(stdout,"\
7451   --no-chimeras                  Turns off finding of chimeras.  Same effect as --chimera-margin=0\n\
7452 ");
7453 
7454 #if 0
7455     fprintf(stdout,"\
7456   -w, --reference=filename       Reference cDNA sequence for relative alignment\n\
7457 ");
7458 #endif
7459 
7460 #ifdef HAVE_PTHREAD
7461     fprintf(stdout,"\
7462   -t, --nthreads=INT             Number of worker threads\n\
7463 ");
7464 #else
7465   fprintf(stdout,"\
7466   -t, --nthreads=INT             Number of worker threads.  Flag is ignored in this version of GMAP, which has pthreads disabled\n\
7467 ");
7468 #endif
7469     fprintf(stdout,"\
7470   -c, --chrsubset=string         Limit search to given chromosome\n\
7471   -z, --direction=STRING         cDNA direction (sense_force, antisense_force,\n\
7472                                    sense_filter, antisense_filter,or auto (default))\n\
7473 ");
7474     fprintf(stdout,"\
7475   --canonical-mode=INT           Reward for canonical and semi-canonical introns\n\
7476                                    0=low reward, 1=high reward (default), 2=low reward for\n\
7477                                    high-identity sequences and high reward otherwise\n\
7478   --cross-species                Use a more sensitive search for canonical splicing, which helps especially\n\
7479                                    for cross-species alignments and other difficult cases\n\
7480   --allow-close-indels=INT       Allow an insertion and deletion close to each other\n\
7481                                    (0=no, 1=yes (default), 2=only for high-quality alignments)\n\
7482 ");
7483     fprintf(stdout,"\
7484   --microexon-spliceprob=FLOAT   Allow microexons only if one of the splice site probabilities is\n\
7485                                    greater than this value (default %.2f)\n\
7486 ",microexon_spliceprob);
7487 
7488 #if 0
7489     fprintf(stdout,"\
7490   --homopolymer                  In dynamic programming, favor indels in regions of homopolymers,\n\
7491                                    e.g., AAAAAA.  Useful for some platforms, such as Pacific Biosciences\n\
7492 ");
7493 #endif
7494 
7495 #ifndef PMAP
7496     fprintf(stdout,"\
7497   --cmetdir=STRING               Directory for methylcytosine index files (created using cmetindex)\n\
7498                                    (default is location of genome index files specified using -D, -V, and -d)\n\
7499   --atoidir=STRING               Directory for A-to-I RNA editing index files (created using atoiindex)\n\
7500                                    (default is location of genome index files specified using -D, -V, and -d)\n\
7501   --mode=STRING                  Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
7502                                     atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
7503                                     Non-standard modes requires you to have previously run the cmetindex\n\
7504                                     or atoiindex programs (which also cover the ttoc modes) on the genome\n\
7505 ");
7506 #endif
7507 
7508 #if 0
7509     /* Causes seg faults, so do not advertise */
7510     fprintf(stdout,"\
7511   -s, --splicing=STRING          Look for splicing involving known sites\n\
7512                                    (in <STRING>.iit)\n\
7513 ");
7514 #endif
7515 
7516 #ifndef PMAP
7517     fprintf(stdout,"\
7518   -p, --prunelevel               Pruning level: 0=no pruning (default), 1=poor seqs,\n\
7519                                    2=repetitive seqs, 3=poor and repetitive\n\
7520 ");
7521 #endif
7522     fprintf(stdout,"\n");
7523 
7524     fprintf(stdout,"\
7525 Output types\n\
7526   -S, --summary                  Show summary of alignments only\n\
7527   -A, --align                    Show alignments\n\
7528   -3, --continuous               Show alignment in three continuous lines\n\
7529   -4, --continuous-by-exon       Show alignment in three lines per exon\n\
7530   -Z, --compress                 Print output in compressed format\n\
7531   -E, --exons=STRING             Print exons (\"cdna\" or \"genomic\")\n\
7532                                    Will also print introns with \"cdna+introns\" or\n\
7533                                    \"genomic+introns\"\n\
7534 ");
7535 
7536 #ifdef PMAP
7537     fprintf(stdout,"\
7538   -P, --protein_gen              Print protein sequence (genomic)\n\
7539   -Q, --nucleotide               Print inferred nucleotide sequence from protein\n\
7540 ");
7541 #else
7542     fprintf(stdout,"\
7543   -P, --protein_dna              Print protein sequence (cDNA)\n\
7544   -Q, --protein_gen              Print protein sequence (genomic)\n\
7545 ");
7546 #endif
7547 
7548 #ifdef PMAP
7549     fprintf(stdout,"\
7550   -f, --format=INT               Other format for output (also note the -A and -S options\n\
7551                                    and other options listed under Output types):\n\
7552                                    mask_introns,\n\
7553                                    mask_utr_introns,\n\
7554                                    psl_pro (or 0) = PSL format in protein coords,\n\
7555                                    psl_nt (or 1) = PSL format in nucleotide coords,\n\
7556                                    gff3_gene (or 2) = GFF3 gene format,\n\
7557                                    gff3_match_cdna (or 3) = GFF3 cDNA_match format,\n\
7558                                    gff3_match_est (or 4) = GFF3 EST_match format,\n\
7559                                    map_exons (or 7) = IIT FASTA exon map format,\n\
7560                                    map_ranges (or 8) = IIT FASTA range map format,\n\
7561                                    coords (or 9) = coords in table format\n\
7562 ");
7563 #else
7564     fprintf(stdout,"\
7565   -f, --format=INT               Other format for output (also note the -A and -S options\n\
7566                                    and other options listed under Output types):\n\
7567                                    mask_introns,\n\
7568                                    mask_utr_introns,\n\
7569                                    psl (or 1) = PSL (BLAT) format,\n\
7570                                    gff3_gene (or 2) = GFF3 gene format,\n\
7571                                    gff3_match_cdna (or 3) = GFF3 cDNA_match format,\n\
7572                                    gff3_match_est (or 4) = GFF3 EST_match format,\n\
7573                                    splicesites (or 6) = splicesites output (for GSNAP splicing file),\n\
7574                                    introns = introns output (for GSNAP splicing file),\n\
7575                                    map_exons (or 7) = IIT FASTA exon map format,\n\
7576                                    map_ranges (or 8) = IIT FASTA range map format,\n\
7577                                    coords (or 9) = coords in table format,\n\
7578                                    sampe = SAM format (setting paired_read bit in flag),\n\
7579                                    samse = SAM format (without setting paired_read bit),\n\
7580                                    bedpe = indels and gaps in BEDPE format\n\
7581 ");
7582 #endif
7583     fprintf(stdout,"\n");
7584 
7585     fprintf(stdout,"\
7586 Output options\n\
7587   -n, --npaths=INT               Maximum number of paths to show (default %d).  If set to 1, GMAP\n\
7588                                    will not report chimeric alignments, since those imply\n\
7589                                    two paths.  If you want a single alignment plus chimeric\n\
7590                                    alignments, then set this to be 0.\n\
7591 ",maxpaths_report);
7592     fprintf(stdout,"\
7593   --suboptimal-score=FLOAT       Report only paths whose score is within this value of the\n\
7594                                    best path.\n\
7595                                  If specified between 0.0 and 1.0, then treated as a fraction\n\
7596                                    of the score of the best alignment (matches minus penalties for\n\
7597                                    mismatches and indels).  Otherwise, treated as an integer\n\
7598                                    number to be subtracted from the score of the best alignment.\n\
7599                                    Default value is 0.50.\n\
7600   -O, --ordered                  Print output in same order as input (relevant\n\
7601                                    only if there is more than one worker thread)\n\
7602   -5, --md5                      Print MD5 checksum for each query sequence\n\
7603   -o, --chimera-overlap          Overlap to show, if any, at chimera breakpoint\n\
7604   --failsonly                    Print only failed alignments, those with no results\n\
7605   --nofails                      Exclude printing of failed alignments\n\
7606 \n\
7607   -V, --snpsdir=STRING           Directory for SNPs index files (created using snpindex) (default is\n\
7608                                    location of genome index files specified using -D and -d)\n \
7609   -v, --use-snps=STRING          Use database containing known SNPs (in <STRING>.iit, built\n\
7610                                    previously using snpindex) for tolerance to SNPs\n\
7611 ");
7612 
7613   fprintf(stdout,"\
7614   --split-output=STRING          Basename for multiple-file output, separately for nomapping,\n\
7615                                    uniq, mult, (and chimera, if --chimera-margin is selected)\n\
7616   --failed-input=STRING          Print completely failed alignments as input FASTA or FASTQ format\n\
7617                                    to the given file.  If the --split-output flag is also given, this file\n\
7618                                    is generated in addition to the output in the .nomapping file.\n\
7619   --append-output                When --split-output or --failedinput is given, this flag will append output\n\
7620                                    to the existing files.  Otherwise, the default is to create new files.\n\
7621 ");
7622   fprintf(stdout,"\
7623   --output-buffer-size=INT       Buffer size, in queries, for output thread (default %d).  When the number\n\
7624                                    of results to be printed exceeds this size, the worker threads are halted\n\
7625                                    until the backlog is cleared\n\
7626 ",output_buffer_size);
7627 
7628 
7629   fprintf(stdout,"\
7630   --translation-code=INT         Genetic code used for translating codons to amino acids and computing CDS\n\
7631                                    Integer value (default=1) corresponds to an available code at\n\
7632                                    http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi\n\
7633   --alt-start-codons             Also, use the alternate initiation codons shown in the above Web site\n\
7634                                    By default, without this option, only ATG is considered an initiation codon\n\
7635 ");
7636 
7637 #ifdef PMAP
7638     fprintf(stdout,"\
7639   -Y, --tolerant                 Translates genome with corrections for frameshifts\n\
7640 ");
7641 #else
7642     fprintf(stdout,"\
7643   -F, --fulllength               Assume full-length protein, starting with Met\n\
7644   -a, --cdsstart=INT             Translate codons from given nucleotide (1-based)\n\
7645   -T, --truncate                 Truncate alignment around full-length protein, Met to Stop\n\
7646                                  Implies -F flag.\n\
7647   -Y, --tolerant                 Translates cDNA with corrections for frameshifts\n\
7648 ");
7649 #endif
7650 
7651     fprintf(stdout,"\n");
7652 
7653 #ifndef PMAP
7654   fprintf(stdout,"Options for GFF3 output\n");
7655   fprintf(stdout,"\
7656   --gff3-add-separators=INT      Whether to add a ### separator after each query sequence\n\
7657                                    Values: 0 (no), 1 (yes, default)\n\
7658   --gff3-swap-phase=INT          Whether to swap phase (0 => 0, 1 => 2, 2 => 1) in gff3_gene format\n\
7659                                    Needed by some analysis programs, but deviates from GFF3 specification\n\
7660                                    Values: 0 (no, default), 1 (yes)\n\
7661   --gff3-fasta-annotation=INT    Whether to include annotation from the FASTA header into the GFF3 output\n\
7662                                    Values: 0 (default): Do not include\n\
7663                                            1: Wrap all annotation as Annot=\"<header>\"\n\
7664                                            2: Include key=value pairs, replacing brackets with quotation marks\n\
7665                                               and replacing spaces between key=value pairs with semicolons\n\
7666   --gff3-cds=STRING              Whether to use cDNA or genomic translation for the CDS coordinates\n\
7667                                    Values: cdna (default), genomic\n\
7668 ");
7669   fprintf(stdout,"\n");
7670 
7671   fprintf(stdout,"Options for SAM output\n");
7672   fprintf(stdout,"\
7673   --no-sam-headers               Do not print headers beginning with '@'\n\
7674   --sam-use-0M                   Insert 0M in CIGAR between adjacent insertions and deletions\n\
7675                                    Required by Picard, but can cause errors in other tools\n\
7676   --sam-extended-cigar           Use extended CIGAR format (using X and = symbols instead of M,\n\
7677                                    to indicate matches and mismatches, respectively\n\
7678   --force-xs-dir                 For RNA-Seq alignments, disallows XS:A:? when the sense direction\n\
7679                                    is unclear, and replaces this value arbitrarily with XS:A:+.\n\
7680                                    May be useful for some programs, such as Cufflinks, that cannot\n\
7681                                    handle XS:A:?.  However, if you use this flag, the reported value\n\
7682                                    of XS:A:+ in these cases will not be meaningful.\n\
7683   --md-lowercase-snp             In MD string, when known SNPs are given by the -v flag,\n\
7684                                    prints difference nucleotides as lower-case when they,\n\
7685                                    differ from reference but match a known alternate allele\n\
7686   --action-if-cigar-error        Action to take if there is a disagreement between CIGAR length and sequence length\n\
7687                                    Allowed values: ignore, warning (default), noprint, abort\n\
7688                                    Note that the noprint option does not print the CIGAR string at all if there\n\
7689                                    is an error, so it may break a SAM parser\n\
7690   --read-group-id=STRING         Value to put into read-group id (RG-ID) field\n\
7691   --read-group-name=STRING       Value to put into read-group name (RG-SM) field\n\
7692   --read-group-library=STRING    Value to put into read-group library (RG-LB) field\n\
7693   --read-group-platform=STRING   Value to put into read-group library (RG-PL) field\n\
7694 ");
7695   fprintf(stdout,"\n");
7696 
7697   /* Quality score options */
7698   fprintf(stdout,"Options for quality scores\n");
7699   fprintf(stdout,"\
7700   --quality-protocol=STRING      Protocol for input quality scores.  Allowed values:\n\
7701                                    illumina (ASCII 64-126) (equivalent to -J 64 -j -31)\n\
7702                                    sanger   (ASCII 33-126) (equivalent to -J 33 -j 0)\n\
7703                                  Default is sanger (no quality print shift)\n\
7704                                  SAM output files should have quality scores in sanger protocol\n\
7705 \n\
7706                                  Or you can specify the print shift with this flag:\n\
7707   -j, --quality-print-shift=INT  Shift FASTQ quality scores by this amount in output\n\
7708                                    (default is 0 for sanger protocol; to change Illumina input\n\
7709                                    to Sanger output, select -31)\n\
7710 ");
7711 #endif
7712 
7713     fprintf(stdout,"\
7714 External map file options\n\
7715   -M, --mapdir=directory         Map directory\n\
7716   -m, --map=iitfile              Map file.  If argument is '?' (with the quotes),\n\
7717                                    this lists available map files.\n\
7718   -e, --mapexons                 Map each exon separately\n\
7719   -b, --mapboth                  Report hits from both strands of genome\n\
7720   -u, --flanking=INT             Show flanking hits (default 0)\n\
7721   --print-comment                Show comment line for each hit\n\
7722 ");
7723     fprintf(stdout,"\n");
7724 
7725     fprintf(stdout,"\
7726 Alignment output options\n\
7727   -N, --nolengths                No intron lengths in alignment\n\
7728   -I, --invertmode=INT           Mode for alignments to genomic (-) strand:\n\
7729                                    0=Don't invert the cDNA (default)\n\
7730                                    1=Invert cDNA and print genomic (-) strand\n\
7731                                    2=Invert cDNA and print genomic (+) strand\n\
7732 ");
7733     fprintf(stdout,"\
7734   -i, --introngap=INT            Nucleotides to show on each end of intron (default %d)\n\
7735 ",ngap);
7736     fprintf(stdout,"\
7737   -l, --wraplength=INT           Wrap length for alignment (default %d)\n\
7738 ",wraplength);
7739     fprintf(stdout,"\n");
7740 
7741     fprintf(stdout,"\
7742 Filtering output options\n\
7743   --min-trimmed-coverage=FLOAT   Do not print alignments with trimmed coverage less\n\
7744                                    this value (default=0.0, which means no filtering)\n\
7745                                    Note that chimeric alignments will be output regardless\n\
7746                                    of this filter\n\
7747   --min-identity=FLOAT           Do not print alignments with identity less\n\
7748                                    this value (default=0.0, which means no filtering)\n\
7749                                    Note that chimeric alignments will be output regardless\n\
7750                                    of this filter\n\
7751 \n\
7752 Help options\n\
7753   --check                        Check compiler assumptions\n\
7754   --version                      Show version\n\
7755   --help                         Show this help message\n\
7756 ");
7757 
7758     return;
7759 }
7760