1 static char rcsid[] = "$Id: stage1hr.c 222932 2020-06-29 16:11:37Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 #define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 #define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "stage1hr.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>		/* For memset() */
16 #include <math.h>
17 #include <ctype.h>		/* for tolower() */
18 #include "assert.h"
19 #include "mem.h"
20 #include "types.h"		/* Needed for HAVE_64_BIT */
21 #include "univcoord.h"
22 
23 #include "reader.h"
24 #include "oligo.h"
25 #include "indexdb.h"
26 
27 #include "list.h"
28 #include "intlist.h"
29 #include "splice.h"
30 #include "indel.h"
31 #include "stage3hr.h"
32 #include "concordance.h"
33 #include "substring.h"
34 #include "complement.h"
35 #include "compress.h"
36 #include "genome128_hr.h"
37 #include "genome_sites.h"
38 #include "maxent.h"
39 #include "maxent_hr.h"
40 #include "iitdef.h"
41 #include "univinterval.h"
42 #ifdef LARGE_GENOMES
43 #include "uint8list.h"
44 #else
45 #include "uintlist.h"
46 #endif
47 #include "record.h"
48 
49 #ifdef HAVE_64_BIT
50 /* Oligospace_T is 64 bits */
51 #include "uint8table_rh.h"
52 #else
53 /* Oligospace_T is 32 bits */
54 #include "uinttable_rh.h"
55 #endif
56 
57 
58 #include "orderstat.h"
59 #include "path-solve.h"
60 #include "kmer-search.h"
61 #include "extension-search.h"
62 #include "segment-search.h"
63 #include "terminal.h"
64 #include "distant-rna.h"
65 #include "distant-dna.h"
66 
67 #include "univdiag.h"
68 #include "univdiagdef.h"
69 
70 #include "comp.h"
71 
72 
73 #ifdef WORDS_BIGENDIAN
74 #include "bigendian.h"
75 #endif
76 
77 
78 #ifdef HAVE_64_BIT
79 #ifdef LARGE_GENOMES
80 #else
81 #define DIAGONAL_ADD_QUERYPOS 1
82 #endif
83 #endif
84 
85 /* Three methods for performing a multiway merge.  Need to define one below. */
86 
87 #define SPEED 1
88 
89 
90 /* Note FORMULA: formulas for querypos <-> diagonal (diagterm in call to Indexdb_read) are:
91 
92 plus: diagonal = position + querylength - querypos
93 minus: diagonal = position + querypos + index1part
94 
95 For minus, the index1part is needed in call to Indexdb_read because
96 position is stored at beginning of plus oligomer, which corresponds to
97 end of minus oligomer.  As a result, we have the following formulas:
98 
99 high genomic position = diagonal (corresponds to querypos =
100 querylength for plus, and querypos = 0 for minus)
101 
102 low genomic position = diagonal - querylength (corresponds to querypos
103 = 0 for plus, and querypos = querylength for minus)
104 
105 Holds when we use Reader_T to read from 5' end of forward query and 3'
106 end of revcomp query simultaneously.  If we create a queryrc sequence,
107 then we can use just the plus formula, and convert the query
108 coordinates later.
109 
110 */
111 
112 
113 #define MAX_ITER 3		/* For looping through Segment_search */
114 
115 #define NO_EXTENSIONS_BEFORE_ZERO 1
116 
117 #define ALLOW_MIDDLE_ALIGNMENTS 1
118 
119 /* #define EXTRACT_GENOMICSEG 1 */
120 #ifdef EXTRACT_GENOMICSEG
121 #define MAX_INDEXSIZE 8
122 #endif
123 
124 
125 /* MAX_NALIGNMENTS of 2 vs 1 gets 1600 improvements in 275,000 reads */
126 /* MAX_NALIGNMENTS of 3 vs 2 gets 96 improvements in 275,000 reads */
127 #define MAX_NALIGNMENTS 3
128 
129 #define MAX_ALLOCATION 200
130 
131 #define PAIRMAX_ADDITIONAL 10000 /* Allows for finding of unpaired GMAP alignments beyond pairmax */
132 
133 #define MIN_SIZELIMIT 100
134 
135 /* static int kmer_search_sizelimit = 100; */
136 /* static int stage1hr_sizelimit = 3000; */
137 /* static int extension_search_sizelimit = 3000; */
138 
139 #define POLY_A 0x00000000
140 #define POLY_T 0xFFFFFFFF
141 static Oligospace_T poly_a;
142 static Oligospace_T poly_t;
143 
144 static bool use_only_transcriptome_p;
145 static bool remap_transcriptome_p = false;
146 
147 static Univ_IIT_T transcript_iit;
148 static Transcriptome_T transcriptome;
149 static Genome_T transcriptomebits;
150 
151 static Indexdb_T indexdb_fwd;
152 static Indexdb_T indexdb_rev;
153 static Indexdb_T indexdb_tr;
154 
155 static int index1part;
156 static int index1part_tr;
157 static int index1interval;
158 
159 static double user_mismatches_refalt_float;
160 static double user_mismatches_ref_float;
161 static double user_mincoverage_float;
162 
163 static double max_middle_insertions_float;
164 static double max_middle_deletions_float;
165 
166 static Univ_IIT_T chromosome_iit;
167 static int circular_typeint;
168 
169 static int nchromosomes;
170 static Genome_T genomecomp;
171 static Genome_T genomebits;
172 static Genome_T genomebits_alt;
173 
174 static int leftreadshift;
175 static Oligospace_T oligobase_mask; /* same as kmer_mask */
176 
177 
178 /* Mode */
179 static Mode_T mode;
180 static bool snpp;
181 static int maxpaths_search;
182 static int maxpaths_report;
183 
184 
185 /* Other distances */
186 static int max_end_insertions;
187 static int max_end_deletions;
188 static bool splicingp;
189 static Chrpos_T shortsplicedist;
190 static Chrpos_T shortsplicedist_novelend;
191 
192 
193 /* Penalties */
194 static int subopt_levels;
195 
196 static bool find_dna_chimeras_p;
197 static bool distances_observed_p;
198 
199 static Chrpos_T min_intronlength;
200 static Chrpos_T expected_pairlength;
201 static Chrpos_T pairlength_deviation;
202 
203 static int distantsplicing_penalty;
204 static int min_distantsplicing_end_matches;
205 static int min_distantsplicing_identity;
206 
207 
208 #define A_CHAR 0x0
209 #define C_CHAR 0x1
210 #define G_CHAR 0x2
211 #define T_CHAR 0x3
212 
213 
214 /* Originally allowed only 1, to print only unique translocations.
215    But need to allow enough to avoid missing some translocations. */
216 /* For transcript splicing, need to increase MAXCHIMERAPATHS */
217 /* #define MAXCHIMERAPATHS 100 */
218 #define MAXCHIMERAPATHS 10000
219 
220 #define NREQUIRED_FAST 2	/* For candidate generation using
221 				   multimiss.  A value of 2 implies
222 				   specificity of a 24-mer, which
223 				   should be low for a human-sized
224 				   genome */
225 
226 #define MAX_INDEX1INTERVAL 3
227 
228 
229 
230 /* Overall flow */
231 #ifdef DEBUG
232 #define debug(x) x
233 #else
234 #define debug(x)
235 #endif
236 
237 /* Stage1_init */
238 #ifdef DEBUG1
239 #define debug1(x) x
240 #else
241 #define debug1(x)
242 #endif
243 
244 /* Filling oligos */
245 #ifdef DEBUG8
246 #define debug8(x) x
247 #else
248 #define debug8(x)
249 #endif
250 
251 /* consolidate_paired_results and choose_among_paired */
252 #ifdef DEBUG16
253 #define debug16(x) x
254 #else
255 #define debug16(x)
256 #endif
257 
258 
259 #define T Stage1_T
260 static T
Stage1_new(int querylength)261 Stage1_new (int querylength) {
262   T new = (T) MALLOC(sizeof(*new));
263   int overhang = index1interval-1;
264   int mod;
265 
266   new->plus_validp = (bool *) CALLOC(querylength+overhang,sizeof(bool));
267   new->minus_validp = (bool *) CALLOC(querylength+overhang,sizeof(bool));
268 
269   new->plus_oligos = (Oligospace_T *) MALLOC((querylength+overhang)*sizeof(Oligospace_T));
270   new->minus_oligos = (Oligospace_T *) MALLOC((querylength+overhang)*sizeof(Oligospace_T));
271 
272   new->retrievedp_allocated = (bool *) CALLOC(2 * (querylength+overhang),sizeof(bool));
273   new->plus_retrievedp = &(new->retrievedp_allocated[overhang]);
274   new->minus_retrievedp = &(new->retrievedp_allocated[(querylength+overhang)+overhang]);
275 
276 #ifdef LARGE_GENOMES
277   new->positions_high_allocated = (unsigned char **) CALLOC(2 * (querylength+overhang),sizeof(unsigned char *));
278   new->plus_positions_high = &(new->positions_high_allocated[overhang]);
279   new->minus_positions_high = &(new->positions_high_allocated[(querylength+overhang)+overhang]);
280 #endif
281 
282   new->positions_allocated = (UINT4 **) CALLOC(2 * (querylength+overhang),sizeof(UINT4 *));
283   new->plus_positions = &(new->positions_allocated[overhang]);
284   new->minus_positions = &(new->positions_allocated[(querylength+overhang)+overhang]);
285 
286   new->npositions_allocated = (int *) CALLOC(2 * (querylength+overhang),sizeof(int));
287   new->plus_npositions = &(new->npositions_allocated[overhang]);
288   new->minus_npositions = &(new->npositions_allocated[(querylength+overhang)+overhang]);
289 
290   /* Need to allocate (max_mismatches + 1), where max_mismatches is provided to Genome_mismatches_left or Genome_mismatches_right */
291   /* new->mismatch_positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int)); */
292   new->positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int));
293   new->spliceinfo = Spliceinfo_new(querylength);
294 
295   /* Memory allocated for Segment_identify in segment-search.c, and
296      Merge_diagonals in kmer-search.c (which needs four sets of
297      arrays) */
298 #ifdef LARGE_GENOMES
299   new->stream_high_alloc = (unsigned char **) MALLOC(4*querylength*sizeof(unsigned char *));
300   new->gplus_stream_high_array_5 = &(new->stream_high_alloc[0]);
301   new->gminus_stream_high_array_5 = &(new->stream_high_alloc[querylength]);
302   new->gplus_stream_high_array_3 = &(new->stream_high_alloc[2*querylength]);
303   new->gminus_stream_high_array_3 = &(new->stream_high_alloc[3*querylength]);
304 
305   new->stream_low_alloc = (UINT4 **) MALLOC(4*querylength*sizeof(UINT4 *));
306   new->gplus_stream_low_array_5 = &(new->stream_low_alloc[0]);
307   new->gminus_stream_low_array_5 = &(new->stream_low_alloc[querylength]);
308   new->gplus_stream_low_array_3 = &(new->stream_low_alloc[2*querylength]);
309   new->gminus_stream_low_array_3 = &(new->stream_low_alloc[3*querylength]);
310 #endif
311 
312   new->stream_alloc = (Univcoord_T **) MALLOC(4*querylength*sizeof(Univcoord_T *));
313   new->gplus_stream_array_5 = &(new->stream_alloc[0]);
314   new->gminus_stream_array_5 = &(new->stream_alloc[querylength]);
315   new->gplus_stream_array_3 = &(new->stream_alloc[2*querylength]);
316   new->gminus_stream_array_3 = &(new->stream_alloc[3*querylength]);
317 
318 #ifdef LARGE_GENOMES
319   new->tplus_stream_array = new->gplus_stream_low_array_5;
320   new->tminus_stream_array = new->gminus_stream_low_array_5;
321 #else
322   new->tplus_stream_array = new->gplus_stream_array_5;
323   new->tminus_stream_array = new->gminus_stream_array_5;
324 #endif
325 
326   new->streamsize_alloc = (int *) MALLOC(4*querylength*sizeof(int));
327   new->tplus_streamsize_array = new->gplus_streamsize_array_5 = &(new->streamsize_alloc[0]);
328   new->tminus_streamsize_array = new->gminus_streamsize_array_5 = &(new->streamsize_alloc[querylength]);
329   new->gplus_streamsize_array_3 = &(new->streamsize_alloc[2*querylength]);
330   new->gminus_streamsize_array_3 = &(new->streamsize_alloc[3*querylength]);
331 
332   new->querypos_diagterm_alloc = (int *) MALLOC(4*querylength*sizeof(int));
333   new->tplus_diagterm_array = new->gplus_diagterm_array_5 = &(new->querypos_diagterm_alloc[0]);
334   new->tminus_diagterm_array = new->gminus_diagterm_array_5 = &(new->querypos_diagterm_alloc[querylength]);
335   new->gplus_diagterm_array_3 = &(new->querypos_diagterm_alloc[2*querylength]);
336   new->gminus_diagterm_array_3 = &(new->querypos_diagterm_alloc[3*querylength]);
337 
338 
339   for (mod = 0; mod < 2*index1interval; mod++) {
340 #ifdef LARGE_GENOMES
341     new->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
342     new->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
343     new->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
344     new->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
345 #endif
346     new->plus_rawpositions_5[mod] = (UINT4 *) NULL;
347     new->minus_rawpositions_5[mod] = (UINT4 *) NULL;
348     new->plus_rawpositions_3[mod] = (UINT4 *) NULL;
349     new->minus_rawpositions_3[mod] = (UINT4 *) NULL;
350   }
351 
352   /* Uses Listpool_T procedures */
353   new->queryfwd_plus_set = (List_T) NULL;
354   new->queryfwd_minus_set = (List_T) NULL;
355   new->queryrev_plus_set = (List_T) NULL;
356   new->queryrev_minus_set = (List_T) NULL;
357 
358   return new;
359 }
360 
361 #ifdef DEBUG
362 static void
Stage1_dump(T this,int querylength)363 Stage1_dump (T this, int querylength) {
364   int query_lastpos = querylength - index1part, querypos;
365   int i;
366 
367   for (querypos = 0; querypos <= query_lastpos; querypos++) {
368     if (this->plus_retrievedp[querypos] == true) {
369       printf("plus %d (%d):",querypos,this->plus_npositions[querypos]);
370       for (i = 0; i < this->plus_npositions[querypos]; i++) {
371 	printf(" %u",this->plus_positions[querypos][i]);
372       }
373       printf("\n");
374     }
375     if (this->minus_retrievedp[querypos] == true) {
376       printf("minus %d (%d):",querypos,this->minus_npositions[querypos]);
377       for (i = 0; i < this->minus_npositions[querypos]; i++) {
378 	printf(" %u",this->minus_positions[querypos][i]);
379       }
380       printf("\n");
381     }
382   }
383   printf("\n");
384   return;
385 }
386 #endif
387 
388 
389 static void
Stage1_init(T this,char * queryuc_ptr,int querylength,int genestrand)390 Stage1_init (T this, char *queryuc_ptr, int querylength, int genestrand) {
391   Reader_T reader;
392   Oligostate_T last_state;
393   Oligospace_T forward, revcomp, forward_oligo, revcomp_oligo;
394   int querypos, query_lastpos;
395   int mod;
396 
397 
398   debug1(printf("%s\n",queryuc_ptr));
399   reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
400   last_state = INIT;
401   forward = revcomp = 0;
402   mod = 0;
403 
404   while (mod < index1interval &&
405 	 (last_state = Oligo_next_5(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
406     while (mod < index1interval && mod < querypos) {
407       debug1(printf("Skipping rawpositions_5 %d, because querypos is not the expected one\n",mod));
408 #ifdef LARGE_GENOMES
409       this->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
410       this->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
411 #endif
412       this->plus_rawpositions_5[mod] = (UINT4 *) NULL;
413       this->minus_rawpositions_5[mod] = (UINT4 *) NULL;
414       this->plus_nrawpositions_5[mod] = 0;
415       this->minus_nrawpositions_5[mod] = 0;
416       mod++;
417     }
418 
419     if (mod < index1interval) {
420       forward_oligo = forward & oligobase_mask;
421       this->plus_diagterms_5[mod] = querylength - querypos;
422 #ifdef LARGE_GENOMES
423       this->plus_nrawpositions_5[mod] =
424 	Indexdb_largeptr(&(this->plus_rawpositions_high_5[mod]),&(this->plus_rawpositions_5[mod]),
425 			 /*plus_indexdb*/indexdb_fwd,forward_oligo);
426 #else
427       this->plus_nrawpositions_5[mod] =
428 	Indexdb_ptr(&(this->plus_rawpositions_5[mod]),/*plus_indexdb*/indexdb_fwd,forward_oligo);
429 #endif
430       debug1(printf("(1) plus_nrawpositions_5[%d] = %d, oligo %016lX\n",mod,this->plus_nrawpositions_5[mod],forward_oligo));
431 
432       revcomp_oligo = (revcomp >> leftreadshift) & oligobase_mask;
433       this->minus_diagterms_5[mod] = querypos + index1part;
434 #ifdef LARGE_GENOMES
435       this->minus_nrawpositions_5[mod] =
436 	Indexdb_largeptr(&(this->minus_rawpositions_high_5[mod]),&(this->minus_rawpositions_5[mod]),
437 			 /*minus_indexdb*/indexdb_rev,revcomp_oligo);
438 #else
439       this->minus_nrawpositions_5[mod] =
440 	Indexdb_ptr(&(this->minus_rawpositions_5[mod]),/*minus_indexdb*/indexdb_rev,revcomp_oligo);
441 #endif
442       debug1(printf("(2) minus_nrawpositions_5[%d] = %d, oligo %016lX\n",mod,this->minus_nrawpositions_5[mod],revcomp_oligo));
443 
444       debug1(printf("5' end: %s %s: %d plus positions, %d minus positions, genestrand %d\n",
445 		    Oligo_one_nt(forward_oligo,index1part),Oligo_one_nt(revcomp_oligo,index1part),
446 		    this->plus_nrawpositions_5[mod],this->minus_nrawpositions_5[mod],genestrand));
447     }
448 
449     mod++;
450   }
451 
452   while (mod < index1interval) {
453     debug1(printf("Skipping rawpositions_5 %d, because last_state was DONE\n",mod));
454 #ifdef LARGE_GENOMES
455     this->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
456     this->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
457 #endif
458     this->plus_rawpositions_5[mod] = (UINT4 *) NULL;
459     this->minus_rawpositions_5[mod] = (UINT4 *) NULL;
460     this->plus_nrawpositions_5[mod] = 0;
461     this->minus_nrawpositions_5[mod] = 0;
462     mod++;
463   }
464   Reader_free(&reader);
465 
466 
467   query_lastpos = querylength - index1part;
468   reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
469   last_state = INIT;
470   forward = revcomp = 0;
471   mod = 0;
472   while (mod < index1interval &&
473 	 (last_state = Oligo_next_3(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
474     while (mod < index1interval && mod < query_lastpos - querypos) {
475       debug1(printf("Skipping rawpositions_3 %d, because querypos is not the expected one\n",mod));
476 #ifdef LARGE_GENOMES
477       this->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
478       this->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
479 #endif
480       this->plus_rawpositions_3[mod] = (UINT4 *) NULL;
481       this->minus_rawpositions_3[mod] = (UINT4 *) NULL;
482       this->plus_nrawpositions_3[mod] = 0;
483       this->minus_nrawpositions_3[mod] = 0;
484       mod++;
485     }
486 
487     if (mod < index1interval) {
488       forward_oligo = (forward >> leftreadshift) & oligobase_mask;
489       this->plus_diagterms_3[mod] = querylength - querypos;
490 #ifdef LARGE_GENOMES
491       this->plus_nrawpositions_3[mod] =
492 	Indexdb_largeptr(&(this->plus_rawpositions_high_3[mod]),&(this->plus_rawpositions_3[mod]),
493 			 /*plus_indexdb*/indexdb_fwd,forward_oligo);
494 #else
495       this->plus_nrawpositions_3[mod] =
496 	Indexdb_ptr(&(this->plus_rawpositions_3[mod]),/*plus_indexdb*/indexdb_fwd,forward_oligo);
497 #endif
498       debug1(printf("(3) plus_nrawpositions_3[%d] = %d, oligo %016lX\n",mod,this->plus_nrawpositions_3[mod],forward_oligo));
499 
500       revcomp_oligo = revcomp & oligobase_mask;
501       this->minus_diagterms_3[mod] = querypos + index1part;
502 #ifdef LARGE_GENOMES
503       this->minus_nrawpositions_3[mod] =
504 	Indexdb_largeptr(&(this->minus_rawpositions_high_3[mod]),&(this->minus_rawpositions_3[mod]),
505 			 /*minus_indexdb*/indexdb_rev,revcomp_oligo);
506 #else
507       this->minus_nrawpositions_3[mod] =
508 	Indexdb_ptr(&(this->minus_rawpositions_3[mod]),/*minus_indexdb*/indexdb_rev,revcomp_oligo);
509 #endif
510       debug1(printf("(4) minus_nrawpositions_3[%d] = %d, oligo %016lX\n",mod,this->minus_nrawpositions_3[mod],revcomp_oligo));
511 
512       debug1(printf("3' end: %s %s: %d plus positions, %d minus positions, genestrand %d\n",
513 		    Oligo_one_nt(forward_oligo,index1part),Oligo_one_nt(revcomp_oligo,index1part),
514 		    this->plus_nrawpositions_3[mod],this->minus_nrawpositions_3[mod],genestrand));
515     }
516 
517     mod++;
518   }
519 
520   while (mod < index1interval) {
521     /* printf("Skipping rawpositions_3 %d, because last_state was DONE\n",mod); */
522 #ifdef LARGE_GENOMES
523     this->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
524     this->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
525 #endif
526     this->plus_rawpositions_3[mod] = (UINT4 *) NULL;
527     this->minus_rawpositions_3[mod] = (UINT4 *) NULL;
528     this->plus_nrawpositions_3[mod] = 0;
529     this->minus_nrawpositions_3[mod] = 0;
530     mod++;
531   }
532   Reader_free(&reader);
533 
534   return;
535 }
536 
537 
538 
539 
540 /* onep indicates whether Kmer_search_genome_one_end was run, which
541    fills beyond the first index1interval */
542 static void
Stage1_rearrange(T this,int querylength,bool onep)543 Stage1_rearrange (T this, int querylength, bool onep) {
544   int query_lastpos = querylength - index1part;
545   int mod;
546 
547   for (mod = 0; mod < index1interval; mod++) {
548     this->plus_retrievedp[mod] = true;
549 #ifdef LARGE_GENOMES
550     this->plus_positions_high[mod] = this->plus_rawpositions_high_5[mod];
551 #endif
552     this->plus_positions[mod] = this->plus_rawpositions_5[mod];
553     this->plus_npositions[mod] = this->plus_nrawpositions_5[mod];
554 
555     this->plus_retrievedp[query_lastpos-mod] = true;
556 #ifdef LARGE_GENOMES
557     this->plus_positions_high[query_lastpos-mod] = this->plus_rawpositions_high_3[mod];
558 #endif
559     this->plus_positions[query_lastpos-mod] = this->plus_rawpositions_3[mod];
560     this->plus_npositions[query_lastpos-mod] = this->plus_nrawpositions_3[mod];
561 
562     /* Using new sarray and segment-based conventions */
563     this->minus_retrievedp[query_lastpos-mod] = true;
564 #ifdef LARGE_GENOMES
565     this->minus_positions_high[query_lastpos-mod] = this->minus_rawpositions_high_5[mod];
566 #endif
567     this->minus_positions[query_lastpos-mod] = this->minus_rawpositions_5[mod];
568     this->minus_npositions[query_lastpos-mod] = this->minus_nrawpositions_5[mod];
569 
570     this->minus_retrievedp[mod] = true;
571 #ifdef LARGE_GENOMES
572     this->minus_positions_high[mod] = this->minus_rawpositions_high_3[mod];
573 #endif
574     this->minus_positions[mod] = this->minus_rawpositions_3[mod];
575     this->minus_npositions[mod] = this->minus_nrawpositions_3[mod];
576 
577 #if 0
578     printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
579 	   mod,this->plus_positions[mod],this->plus_npositions[mod]);
580     printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
581 	 query_lastpos-mod,this->plus_positions[query_lastpos-mod],this->plus_npositions[query_lastpos-mod]);
582 
583     printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
584 	   mod,this->minus_positions[mod],this->minus_npositions[mod]);
585     printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
586 	 query_lastpos-mod,this->minus_positions[query_lastpos-mod],this->minus_npositions[query_lastpos-mod]);
587 #endif
588   }
589 
590   if (onep == true) {
591     /* Not currently doing one_kmer */
592     for (mod = 0; mod < index1interval; mod++) {
593       this->plus_retrievedp[index1part+mod] = true;
594 #ifdef LARGE_GENOMES
595       this->plus_positions_high[index1part+mod] = this->plus_rawpositions_high_5[index1interval+mod];
596 #endif
597       this->plus_positions[index1part+mod] = this->plus_rawpositions_5[index1interval+mod];
598       this->plus_npositions[index1part+mod] = this->plus_nrawpositions_5[index1interval+mod];
599 
600       this->plus_retrievedp[query_lastpos-index1part-mod] = true;
601 #ifdef LARGE_GENOMES
602       this->plus_positions_high[query_lastpos-index1part-mod] = this->plus_rawpositions_high_3[index1interval+mod];
603 #endif
604       this->plus_positions[query_lastpos-index1part-mod] = this->plus_rawpositions_3[index1interval+mod];
605       this->plus_npositions[query_lastpos-index1part-mod] = this->plus_nrawpositions_3[index1interval+mod];
606 
607       /* Using new sarray and segment-based conventions */
608       this->minus_retrievedp[query_lastpos-index1part-mod] = true;
609 #ifdef LARGE_GENOMES
610       this->minus_positions_high[query_lastpos-index1part-mod] = this->minus_rawpositions_high_5[index1interval+mod];
611 #endif
612       this->minus_positions[query_lastpos-index1part-mod] = this->minus_rawpositions_5[index1interval+mod];
613       this->minus_npositions[query_lastpos-index1part-mod] = this->minus_nrawpositions_5[index1interval+mod];
614 
615       this->minus_retrievedp[index1part+mod] = true;
616 #ifdef LARGE_GENOMES
617       this->minus_positions_high[index1part+mod] = this->minus_rawpositions_high_3[index1interval+mod];
618 #endif
619       this->minus_positions[index1part+mod] = this->minus_rawpositions_3[index1interval+mod];
620       this->minus_npositions[index1part+mod] = this->minus_nrawpositions_3[index1interval+mod];
621 
622 #if 0
623       printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
624 	     index1part+mod,this->plus_positions[index1part+mod],this->plus_npositions[index1part+mod]);
625       printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
626 	     query_lastpos-index1part-mod,this->plus_positions[query_lastpos-index1part-mod],
627 	     this->plus_npositions[query_lastpos-index1part-mod]);
628 
629       printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
630 	     index1part+mod,this->minus_positions[index1part+mod],this->minus_npositions[index1part+mod]);
631       printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
632 	     query_lastpos-index1part-mod,this->minus_positions[query_lastpos-index1part-mod],
633 	     this->minus_npositions[query_lastpos-index1part-mod]);
634 #endif
635     }
636   }
637 
638   /* Stage1_dump(this,querylength); */
639 
640   return;
641 }
642 
643 
644 void
Stage1_fill_all_oligos(T this,char * queryuc_ptr,int querylength,int genestrand)645 Stage1_fill_all_oligos (T this, char *queryuc_ptr, int querylength, int genestrand) {
646   Reader_T reader;
647   int query_lastpos, querypos, querypos_rc;
648   Oligostate_T last_state = INIT;
649   Oligospace_T forward = 0, revcomp = 0, oligo;
650 #ifdef HAVE_64_BIT
651   Uint8table_T plus_seenp, minus_seenp;
652 #else
653   Uinttable_T plus_seenp, minus_seenp;
654 #endif
655 
656 #ifdef HAVE_64_BIT
657   plus_seenp = Uint8table_new(/*hint*/querylength,/*save_contents_p*/false);
658   minus_seenp = Uint8table_new(/*hint*/querylength,/*save_contents_p*/false);
659 #else
660   plus_seenp = Uinttable_new(/*hint*/querylength,/*save_contents_p*/false);
661   minus_seenp = Uinttable_new(/*hint*/querylength,/*save_contents_p*/false);
662 #endif
663 
664   query_lastpos = querylength - index1part;
665   reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
666 
667   /* Note: leftshifting is done here, rather than in Oligo_lookup */
668   /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
669   /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
670   querypos = 0;
671   while ((last_state = Oligo_next_5(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
672     if (last_state != VALID) {
673       /* querypos is not defined when last_state != VALID */
674       debug8(printf("oligo at plus %d, minus %d is not valid\n",querypos,querypos_rc));
675     } else {
676       querypos_rc = query_lastpos - querypos;
677       oligo = this->plus_oligos[querypos] = forward & oligobase_mask;
678       debug8(printf("Putting oligo %016lX at plus %d\n",oligo,querypos));
679 #ifdef HAVE_64_BIT
680       if (Uint8table_get(plus_seenp,oligo) != NULL) {
681 	debug8(printf("oligo at plus %d already seen, so marking as invalid\n",querypos));
682 	this->plus_validp[querypos] = false;
683       } else {
684 	this->plus_validp[querypos] = true;
685 	Uint8table_put(plus_seenp,oligo,(void *) true);
686       }
687 #else
688       if (Uinttable_get(plus_seenp,oligo) != NULL) {
689 	debug8(printf("oligo at plus %d already seen, so marking as invalid\n",querypos));
690 	this->plus_validp[querypos] = false;
691       } else {
692 	this->plus_validp[querypos] = true;
693 	Uinttable_put(plus_seenp,oligo,(void *) true);
694       }
695 #endif
696 
697       oligo = this->minus_oligos[querypos_rc] = (revcomp >> leftreadshift) & oligobase_mask;
698       debug8(printf("Putting oligo %016lX at minus %d\n",oligo,querypos_rc));
699 #ifdef HAVE_64_BIT
700       if (Uint8table_get(minus_seenp,oligo) != NULL) {
701 	debug8(printf("oligo at minus %d already seen, so marking as invalid\n",querypos));
702 	this->minus_validp[querypos_rc] = false;
703       } else {
704 	this->minus_validp[querypos_rc] = true;
705 	Uint8table_put(minus_seenp,oligo,(void *) true);
706       }
707 #else
708       if (Uinttable_get(minus_seenp,oligo) != NULL) {
709 	debug8(printf("oligo at minus %d already seen, so marking as invalid\n",querypos));
710 	this->minus_validp[querypos_rc] = false;
711       } else {
712 	this->minus_validp[querypos_rc] = true;
713 	Uinttable_put(minus_seenp,oligo,(void *) true);
714       }
715 #endif
716     }
717   }
718 
719   Reader_free(&reader);
720 
721 #ifdef HAVE_64_BIT
722   Uint8table_free(&minus_seenp);
723   Uint8table_free(&plus_seenp);
724 #else
725   Uinttable_free(&minus_seenp);
726   Uinttable_free(&plus_seenp);
727 #endif
728 
729   return;
730 }
731 
732 
733 void
Stage1_fill_position_ptrs_plus(T this,int querystart,int queryend,int genestrand)734 Stage1_fill_position_ptrs_plus (T this, int querystart, int queryend, int genestrand) {
735   int query_lastpos, querypos;
736   Indexdb_T plus_indexdb;
737 
738   if (genestrand == +2) {
739     plus_indexdb = indexdb_rev;
740   } else {
741     plus_indexdb = indexdb_fwd;
742   }
743 
744   query_lastpos = queryend - index1part;
745 
746   /* Assumes that plus_oligos have been filled in (by Stage1_fill_all_oligos */
747   /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
748   /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
749   for (querypos = querystart; querypos <= query_lastpos; querypos++) {
750     if (this->plus_retrievedp[querypos] == true) {
751       /* No need to do anything */
752     } else if (this->plus_validp[querypos] == false) {
753       /* Oligo not valid */
754     } else {
755 #ifdef LARGE_GENOMES
756       this->plus_npositions[querypos] =
757 	Indexdb_largeptr(&this->plus_positions_high[querypos],&this->plus_positions[querypos],
758 			 plus_indexdb,this->plus_oligos[querypos]);
759 #else
760       this->plus_npositions[querypos] =
761 	Indexdb_ptr(&this->plus_positions[querypos],plus_indexdb,this->plus_oligos[querypos]);
762 #endif
763       this->plus_retrievedp[querypos] = true;
764     }
765   }
766 
767   return;
768 }
769 
770 void
Stage1_fill_position_ptrs_minus(T this,int querystart,int queryend,int genestrand)771 Stage1_fill_position_ptrs_minus (T this, int querystart, int queryend, int genestrand) {
772   int query_lastpos, querypos;
773   Indexdb_T minus_indexdb;
774 
775   if (genestrand == +2) {
776     minus_indexdb = indexdb_fwd;
777   } else {
778     minus_indexdb = indexdb_rev;
779   }
780 
781   query_lastpos = queryend - index1part;
782 
783   /* Assumes that minus_oligos have been filled in (by Stage1_fill_all_oligos */
784   /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
785   /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
786   for (querypos = querystart; querypos <= query_lastpos; querypos++) {
787     if (this->minus_retrievedp[querypos] == true) {
788       /* No need to do anything */
789     } else if (this->minus_validp[querypos] == false) {
790       /* Oligo not valid */
791     } else {
792 #ifdef LARGE_GENOMES
793       this->minus_npositions[querypos] =
794 	Indexdb_largeptr(&this->minus_positions_high[querypos],&this->minus_positions[querypos],
795 			 minus_indexdb,this->minus_oligos[querypos]);
796 #else
797       this->minus_npositions[querypos] =
798 	Indexdb_ptr(&this->minus_positions[querypos],minus_indexdb,this->minus_oligos[querypos]);
799 #endif
800       this->minus_retrievedp[querypos] = true;
801     }
802   }
803 
804   return;
805 }
806 
807 
808 #if 0
809 static void
810 Stage1_ptr_all_positions (T this, int querylength, int genestrand) {
811   int query_lastpos, querypos, querypos_rc;
812   Indexdb_T plus_indexdb, minus_indexdb;
813 
814   if (genestrand == +2) {
815     plus_indexdb = indexdb_rev;
816     minus_indexdb = indexdb_fwd;
817   } else {
818     plus_indexdb = indexdb_fwd;
819     minus_indexdb = indexdb_rev;
820   }
821 
822   query_lastpos = querylength - index1part;
823 
824   /* Note: leftshifting is done here, rather than in Oligo_lookup */
825   /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
826   /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
827   for (querypos = 0, querypos_rc = query_lastpos; querypos <= query_lastpos; querypos++, --querypos_rc) {
828     if (this->plus_retrievedp[querypos] == true) {
829       /* No need to do anything */
830     } else if (this->plus_validp[querypos] == false) {
831       /* Oligo not valid */
832     } else {
833 #ifdef LARGE_GENOMES
834       this->plus_npositions[querypos] =
835 	Indexdb_largeptr(&this->plus_positions_high[querypos],&this->plus_positions[querypos],
836 			 plus_indexdb,this->plus_oligos[querypos]);
837 #else
838       this->plus_npositions[querypos] =
839 	Indexdb_ptr(&this->plus_positions[querypos],plus_indexdb,this->plus_oligos[querypos]);
840 #endif
841       this->plus_retrievedp[querypos] = true;
842     }
843 
844     if (this->minus_retrievedp[querypos_rc] == true) {
845       /* No need to do anything */
846     } else if (this->minus_validp[querypos_rc] == false) {
847       /* Oligo not valid */
848     } else {
849 #ifdef LARGE_GENOMES
850       this->minus_npositions[querypos_rc] =
851 	Indexdb_largeptr(&this->minus_positions_high[querypos_rc],&this->minus_positions[querypos_rc],
852 			 minus_indexdb,this->minus_oligos[querypos_rc]);
853 #else
854       this->minus_npositions[querypos_rc] =
855 	Indexdb_ptr(&this->minus_positions[querypos_rc],minus_indexdb,this->minus_oligos[querypos_rc]);
856 #endif
857       this->minus_retrievedp[querypos_rc] = true;
858     }
859   }
860 
861   return;
862 }
863 #endif
864 
865 
866 static int
determine_sizelimit(T this,int querylength)867 determine_sizelimit (T this, int querylength) {
868   int cutoff, *set, count;
869   int n;
870   int query_lastpos, querypos, querypos_rc;
871 
872   assert(querylength >= index1part);
873 
874   query_lastpos = querylength - index1part;
875   set = (int *) MALLOC(2*(query_lastpos+1)*sizeof(int));
876   n = 0;
877   for (querypos = 0, querypos_rc = query_lastpos; querypos <= query_lastpos; querypos++, --querypos_rc) {
878     if (this->plus_validp[querypos] == true) {
879       set[n++] = count = this->plus_npositions[querypos];
880     }
881 
882     if (this->minus_validp[querypos_rc] == true) {
883       set[n++] = count = this->minus_npositions[querypos_rc];
884     }
885   }
886 
887   if (n < 5) {
888     cutoff = MIN_SIZELIMIT;
889   } else if ((cutoff = Orderstat_int_pct_inplace(set,n,/*pct*/0.60)) < MIN_SIZELIMIT) {
890     cutoff = MIN_SIZELIMIT;
891   }
892   FREE(set);
893 
894   return cutoff;
895 }
896 
897 
898 
899 
900 
901 void
Stage1_free(T * old)902 Stage1_free (T *old) {
903 
904   /* Stage1hr_check(*old); */
905 
906   if (*old) {
907 #if 0
908     /* Now pointing to data structure, and not copying values */
909     for (i = 0; i <= query_lastpos; i++) {
910       if ((*old)->plus_retrievedp[i] == true) {
911 #ifdef LARGE_GENOMES
912 	FREE((*old)->plus_positions_high[i]);
913 #endif
914 	FREE((*old)->plus_positions[i]);
915       }
916 
917       if ((*old)->minus_retrievedp[i] == true) {
918 #ifdef LARGE_GENOMES
919 	FREE((*old)->minus_positions_high[i]);
920 #endif
921 	FREE((*old)->minus_positions[i]);
922       }
923     }
924 #endif
925 
926     /* FREE((*old)->mismatch_positions_alloc); */
927     FREE((*old)->positions_alloc);
928     Spliceinfo_free(&(*old)->spliceinfo);
929 
930 #ifdef LARGE_GENOMES
931     FREE((*old)->stream_high_alloc);
932     FREE((*old)->stream_low_alloc);
933 #endif
934     FREE((*old)->stream_alloc);
935     FREE((*old)->streamsize_alloc);
936     FREE((*old)->querypos_diagterm_alloc);
937 
938 
939 #ifdef LARGE_GENOMES
940     FREE((*old)->positions_high_allocated);
941 #endif
942     FREE((*old)->positions_allocated);
943     FREE((*old)->npositions_allocated);
944 
945     FREE((*old)->retrievedp_allocated);
946 
947     FREE((*old)->minus_oligos);
948     FREE((*old)->plus_oligos);
949 
950     FREE((*old)->minus_validp);
951     FREE((*old)->plus_validp);
952 
953     Elt_gc(&(*old)->queryfwd_plus_set);
954     Elt_gc(&(*old)->queryfwd_minus_set);
955     Elt_gc(&(*old)->queryrev_plus_set);
956     Elt_gc(&(*old)->queryrev_minus_set);
957 
958     FREE(*old);
959   }
960 
961   return;
962 }
963 
964 
965 /************************************************************************/
966 
967 static char complCode[128] = COMPLEMENT_LC;
968 
969 static void
make_complement_buffered(char * complement,char * sequence,unsigned int length)970 make_complement_buffered (char *complement, char *sequence, unsigned int length) {
971   int i, j;
972 
973   /* complement = (char *) CALLOC(length+1,sizeof(char)); */
974   for (i = length-1, j = 0; i >= 0; i--, j++) {
975     complement[j] = complCode[(int) sequence[i]];
976   }
977   complement[length] = '\0';
978   return;
979 }
980 
981 
982 #if 0
983 static void
984 make_complement_inplace (char *sequence, unsigned int length) {
985   char temp;
986   unsigned int i, j;
987 
988   for (i = 0, j = length-1; i < length/2; i++, j--) {
989     temp = complCode[(int) sequence[i]];
990     sequence[i] = complCode[(int) sequence[j]];
991     sequence[j] = temp;
992   }
993   if (i == j) {
994     sequence[i] = complCode[(int) sequence[i]];
995   }
996 
997   return;
998 }
999 #endif
1000 
1001 /************************************************************************/
1002 
1003 
1004 
1005 static void
stage3list_gc(List_T * old)1006 stage3list_gc (List_T *old) {
1007   List_T p;
1008   Stage3end_T hit;
1009 
1010   for (p = *old; p != NULL; p = p->rest) {
1011     hit = (Stage3end_T) p->first;
1012     Stage3end_free(&hit);
1013   }
1014   Hitlist_free(&(*old));
1015 
1016   return;
1017 }
1018 
1019 
1020 static void
remap_to_transcriptome(List_T hits)1021 remap_to_transcriptome (List_T hits) {
1022   List_T p;
1023   Stage3end_T hit;
1024   List_T transcripts;
1025 
1026   char *remap_sequence;
1027   int remap_seqlength;
1028 
1029   for (p = hits; p != NULL; p = List_next(p)) {
1030     hit = (Stage3end_T) List_head(p);
1031     if (Stage3end_transcripts(hit) != NULL) {
1032       /* Already mapped to transcriptome */
1033     } else {
1034       remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit,genomecomp);
1035       if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,
1036 						  Stage3end_chrnum(hit),Stage3end_chrpos_low(hit),Stage3end_chrpos_high(hit),
1037 						  transcript_iit,transcriptomebits,transcriptome)) != NULL) {
1038 	/* Following procedure frees the list hit->transcripts */
1039 	Stage3end_set_transcripts(hit,transcripts);
1040       }
1041       FREE(remap_sequence);
1042     }
1043   }
1044 
1045   return;
1046 }
1047 
1048 
1049 
1050 static int
single_read_search(int * found_score_overall,int * found_score_within_trims,List_T * sense_hits_gplus,List_T * sense_hits_gminus,List_T * antisense_hits_gplus,List_T * antisense_hits_gminus,T this,char * queryuc_ptr,char * queryrc,int querylength,int * mismatch_positions_alloc,Compress_T query_compress_fwd,Compress_T query_compress_rev,int genestrand,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool,bool paired_end_p,bool first_read_p)1051 single_read_search (int *found_score_overall, int *found_score_within_trims,
1052 		    List_T *sense_hits_gplus, List_T *sense_hits_gminus,
1053 		    List_T *antisense_hits_gplus, List_T *antisense_hits_gminus,
1054 		    T this, char *queryuc_ptr, char *queryrc, int querylength,
1055 		    int *mismatch_positions_alloc,
1056 		    Compress_T query_compress_fwd, Compress_T query_compress_rev, int genestrand,
1057 
1058 		    Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1059 		    Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool,
1060 		    bool paired_end_p, bool first_read_p) {
1061   int sizelimit;
1062 
1063   int done_level, nmismatches_search;
1064   int max_middle_insertions, max_middle_deletions, max_insertionlen, max_deletionlen;
1065   Chrpos_T overall_max_distance, overall_end_distance;
1066   /* int max_terminal_mismatches; */
1067   int max_splice_mismatches;
1068   int min_trim;
1069 
1070   bool abort_exact_p = false;
1071   /* bool elt_set_extended_p = false; */
1072   Stage3end_T hit;
1073 
1074   List_T p;
1075   bool onep = false;
1076 
1077 
1078   /* For Segment_search */
1079   struct Record_T *plus_records = NULL, *minus_records = NULL;
1080   int plus_nrecords = 0, minus_nrecords = 0;
1081 
1082   List_T startfrags_plus = NULL, endfrags_plus = NULL,
1083     startfrags_minus = NULL, endfrags_minus = NULL;
1084 
1085 
1086   debug(printf("Entered single_read_search with queryuc_ptr %s\n",queryuc_ptr));
1087   *sense_hits_gplus = *sense_hits_gminus = (List_T) NULL;
1088   *antisense_hits_gplus = *antisense_hits_gminus = (List_T) NULL;
1089 
1090   nmismatches_search = querylength/index1part;
1091 
1092   if (max_middle_insertions_float > 0.0 && max_middle_insertions_float < 1.0) {
1093     max_middle_insertions = (int) rint(max_middle_insertions_float * (double) querylength);
1094   } else {
1095     max_middle_insertions = (int) max_middle_insertions_float;
1096   }
1097   max_insertionlen = max_middle_insertions > max_end_insertions ? max_middle_insertions : max_end_insertions;
1098 
1099   if (max_middle_deletions_float > 0.0 && max_middle_deletions_float < 1.0) {
1100     max_middle_deletions = (int) rint(max_middle_deletions_float * (double) querylength);
1101   } else {
1102     max_middle_deletions = (int) max_middle_deletions_float;
1103   }
1104   max_deletionlen = max_middle_deletions > max_end_deletions ? max_middle_deletions : max_end_deletions;
1105 
1106   overall_max_distance = shortsplicedist;
1107   if ((Chrpos_T) max_middle_deletions > overall_max_distance) {
1108     overall_max_distance = (Chrpos_T) max_middle_deletions;
1109   }
1110   if ((Chrpos_T) max_middle_insertions > overall_max_distance) {
1111     overall_max_distance = (Chrpos_T) max_middle_insertions;
1112   }
1113   overall_end_distance = shortsplicedist_novelend > (Chrpos_T) max_deletionlen ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen;
1114 
1115 
1116   *found_score_overall = *found_score_within_trims = querylength;
1117 
1118   if (querylength < index1part + index1interval - 1) {
1119     return /*level*/0;
1120   }
1121 
1122   if (indexdb_tr != NULL) {
1123     debug(printf("Running Kmer_search_transcriptome_single\n"));
1124     Kmer_search_transcriptome_single(&(*found_score_overall),&(*found_score_within_trims),
1125 				     &(*sense_hits_gplus),&(*sense_hits_gminus),
1126 				     &(*antisense_hits_gplus),&(*antisense_hits_gminus),queryuc_ptr,querylength,
1127 				     this->tplus_stream_array,this->tplus_streamsize_array,this->tplus_diagterm_array,
1128 				     this->tminus_stream_array,this->tminus_streamsize_array,this->tminus_diagterm_array,
1129 				     query_compress_fwd,query_compress_rev,
1130 				     transcript_iit,transcriptome,transcriptomebits,
1131 				     nmismatches_search,listpool,hitlistpool,/*level*/0);
1132   }
1133 
1134   if (use_only_transcriptome_p == true || *found_score_within_trims <= nmismatches_search) {
1135     return TR;
1136   } else {
1137     debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS\n",1));
1138     Stage1_init(this,queryuc_ptr,querylength,genestrand);
1139 
1140     Kmer_search_genome_ends_exact(&abort_exact_p,&(*found_score_overall),&(*found_score_within_trims),
1141 				  &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1142 				  this,querylength,mismatch_positions_alloc,
1143 				  query_compress_fwd,query_compress_rev,genestrand,nmismatches_search,
1144 				  listpool,hitlistpool,/*level*/1);
1145 
1146     debug(printf("After Kmer_search_genome_ends, we have found_score %d (vs allowed %d), sense %d plus and %d minus hits, antisense %d plus and %d minus hits\n",
1147 		 *found_score_within_trims,nmismatches_search,
1148 		 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1149 		 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1150   }
1151 
1152   if (*found_score_within_trims <= nmismatches_search) {
1153     if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1154       remap_to_transcriptome(*sense_hits_gplus);
1155       remap_to_transcriptome(*sense_hits_gminus);
1156       remap_to_transcriptome(*antisense_hits_gplus);
1157       remap_to_transcriptome(*antisense_hits_gminus);
1158     }
1159     return KMER_EXACT;
1160 
1161   } else {
1162     Stage1_rearrange(this,querylength,onep);
1163     Stage1_fill_all_oligos(this,queryuc_ptr,querylength,genestrand);
1164     debug(printf("LEVEL %d: RUNNING EXTENSION_SEARCH\n",2));
1165     Extension_search(&(*found_score_overall),&(*found_score_within_trims),
1166 		     &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1167 		     this,queryuc_ptr,queryrc,querylength,mismatch_positions_alloc,query_compress_fwd,query_compress_rev,
1168 		     intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1169 		     nmismatches_search,max_insertionlen,max_deletionlen,overall_end_distance,
1170 		     genestrand,paired_end_p,first_read_p,/*level*/2);
1171     debug(printf("After Extension search, we have found_score_within_trims %d (vs allowed %d), sense %d plus and %d minus hits, antisense %d plus and %d minus hits\n",
1172 		 *found_score_within_trims,nmismatches_search,
1173 		 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1174 		 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1175 
1176   }
1177 
1178   /* Need to use found_score_overall here and afterwards in order to find fusions */
1179   debug(printf("Comparing found_score_overall %d with nmismatches_search %d\n",
1180 	       *found_score_overall,nmismatches_search));
1181   if (abort_exact_p == true || *found_score_overall <= nmismatches_search) {
1182     if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1183       remap_to_transcriptome(*sense_hits_gplus);
1184       remap_to_transcriptome(*sense_hits_gminus);
1185       remap_to_transcriptome(*antisense_hits_gplus);
1186       remap_to_transcriptome(*antisense_hits_gminus);
1187     }
1188     return EXT;
1189 
1190   } else {
1191     debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS_APPROX\n",3));
1192     Kmer_search_genome_ends_approx(&(*found_score_overall),&(*found_score_within_trims),
1193 				   &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1194 				   this,mismatch_positions_alloc,query_compress_fwd,query_compress_rev,querylength,
1195 				   genestrand,nmismatches_search,max_deletionlen,overall_max_distance,
1196 				   /*sizelimit*/3000,listpool,hitlistpool,/*level*/3);
1197   }
1198 
1199   debug(printf("found_score_overall %d\n",*found_score_overall));
1200   if (*found_score_overall <= nmismatches_search) {
1201     if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1202       debug(printf("Remapping to transcriptome\n"));
1203       remap_to_transcriptome(*sense_hits_gplus);
1204       remap_to_transcriptome(*sense_hits_gminus);
1205       remap_to_transcriptome(*antisense_hits_gplus);
1206       remap_to_transcriptome(*antisense_hits_gminus);
1207       debug(printf("Done with remapping to transcriptome\n"));
1208     }
1209     return KMER_APPROX;
1210 
1211   } else {
1212     if (querylength >= index1part) {
1213       Stage1_fill_position_ptrs_plus(this,/*querystart*/0,/*queryend*/querylength,genestrand);
1214       Stage1_fill_position_ptrs_minus(this,/*querystart*/0,/*queryend*/querylength,genestrand);
1215       /* Need sizelimit to constrain segment search */
1216       sizelimit = determine_sizelimit(this,querylength);
1217 
1218       debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH\n",4));
1219       debug(printf("Starting Segment_identify on plus strand\n"));
1220       plus_records = Segment_identify(&plus_nrecords,
1221 #ifdef LARGE_GENOMES
1222 				      this->plus_positions_high,
1223 #endif
1224 				      this->plus_positions,this->plus_npositions,this->plus_validp,
1225 #ifdef LARGE_GENOMES
1226 				      this->stream_high_alloc,this->stream_low_alloc,
1227 #else
1228 				      this->stream_alloc,
1229 #endif
1230 				      this->streamsize_alloc,this->querypos_diagterm_alloc,
1231 				      overall_max_distance,querylength,sizelimit);
1232       debug(printf("Done\n"));
1233 
1234       debug(printf("Starting Segment_identify on minus strand\n"));
1235       minus_records = Segment_identify(&minus_nrecords,
1236 #ifdef LARGE_GENOMES
1237 				       this->minus_positions_high,
1238 #endif
1239 				       this->minus_positions,this->minus_npositions,this->minus_validp,
1240 #ifdef LARGE_GENOMES
1241 				       this->stream_high_alloc,this->stream_low_alloc,
1242 #else
1243 				       this->stream_alloc,
1244 #endif
1245 				       this->streamsize_alloc,this->querypos_diagterm_alloc,
1246 				       overall_max_distance,querylength,sizelimit);
1247       debug(printf("Done\n"));
1248 
1249       debug(printf("Starting Segment_search_all\n"));
1250       Segment_search_all(&(*found_score_overall),&(*found_score_within_trims),
1251 			 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1252 			 plus_records,plus_nrecords,minus_records,minus_nrecords,
1253 
1254 			 queryuc_ptr,queryrc,querylength,mismatch_positions_alloc,
1255 			 this->spliceinfo,this->stream_alloc,this->streamsize_alloc,
1256 			 query_compress_fwd,query_compress_rev,
1257 			 max_insertionlen,max_deletionlen,overall_max_distance,overall_end_distance,
1258 			 genestrand,paired_end_p,first_read_p,
1259 			 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1260 			 /*method*/SEGMENT1,/*level*/4);
1261       debug(printf("Done\n"));
1262 
1263       FREE(minus_records);
1264       FREE(plus_records);
1265     }
1266   }
1267 
1268 
1269   debug(printf("Comparing found_score_overall %d with nmismatches_search %d\n",
1270 	       *found_score_overall,nmismatches_search));
1271   if (*found_score_overall <= nmismatches_search) {
1272     if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1273       remap_to_transcriptome(*sense_hits_gplus);
1274       remap_to_transcriptome(*sense_hits_gminus);
1275       remap_to_transcriptome(*antisense_hits_gplus);
1276       remap_to_transcriptome(*antisense_hits_gminus);
1277     }
1278     return SEGMENT1;
1279   }
1280 
1281   /* Distant RNA splicing.  No longer remap to transcriptome */
1282   if ((done_level = (*found_score_overall) + subopt_levels) > nmismatches_search) {
1283       done_level = nmismatches_search;
1284     }
1285   debug(printf("done_level %d = found_score %d + subopt_levels %d\n",done_level,*found_score_within_trims,subopt_levels));
1286 
1287   min_trim = querylength;	/* min of max(trim5,trim3) over each hit */
1288   for (p = *sense_hits_gplus; p != NULL; p = List_next(p)) {
1289     hit = (Stage3end_T) List_head(p);
1290     if (Stage3end_max_trim(hit) < min_trim) {
1291       min_trim = Stage3end_max_trim(hit);
1292     }
1293   }
1294   for (p = *sense_hits_gminus; p != NULL; p = List_next(p)) {
1295     hit = (Stage3end_T) List_head(p);
1296     if (Stage3end_max_trim(hit) < min_trim) {
1297       min_trim = Stage3end_max_trim(hit);
1298     }
1299   }
1300 
1301   for (p = *antisense_hits_gplus; p != NULL; p = List_next(p)) {
1302     hit = (Stage3end_T) List_head(p);
1303     if (Stage3end_max_trim(hit) < min_trim) {
1304       min_trim = Stage3end_max_trim(hit);
1305     }
1306   }
1307   for (p = *antisense_hits_gminus; p != NULL; p = List_next(p)) {
1308     hit = (Stage3end_T) List_head(p);
1309     if (Stage3end_max_trim(hit) < min_trim) {
1310       min_trim = Stage3end_max_trim(hit);
1311     }
1312   }
1313 
1314   debug(printf("Comparing min_trim %d with min_distantsplicing_end_matches %d\n",
1315 	       min_trim,min_distantsplicing_end_matches));
1316   if (min_trim < min_distantsplicing_end_matches) {
1317     /* Don't find distant splicing */
1318     debug(printf("Skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
1319 		 min_trim,min_distantsplicing_end_matches));
1320 
1321   } else if ((max_splice_mismatches = done_level - distantsplicing_penalty) < 0) {
1322     debug(printf("Skipping distant splicing because done_level %d - distantsplicing_penalty %d < 0\n",
1323 		 done_level,distantsplicing_penalty));
1324 
1325   } else if (splicingp == false) {
1326     /* TODO: Implement distant DNA fusions */
1327     debug(printf("Skipping distant splicing because splicingp is false\n"));
1328 
1329   } else {
1330     debug(printf("Candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
1331 		 min_trim,min_distantsplicing_end_matches));
1332     debug(printf("Have sets: %d %d %d %d\n",
1333 		 List_length(this->queryfwd_plus_set),List_length(this->queryfwd_minus_set),
1334 		 List_length(this->queryrev_plus_set),List_length(this->queryrev_minus_set)));
1335     debug(printf("LEVEL %d: RUNNING DISTANT_RNA_SOLVE\n",5));
1336 
1337     Distant_rna_solve(&(*found_score_overall),&(*found_score_within_trims),
1338 		      &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1339 		      &startfrags_plus,&endfrags_plus,&startfrags_minus,&endfrags_minus,
1340 
1341 		      this->queryfwd_plus_set,this->queryfwd_minus_set,
1342 		      this->queryrev_plus_set,this->queryrev_minus_set,
1343 
1344 		      mismatch_positions_alloc,this->positions_alloc,
1345 		      query_compress_fwd,query_compress_rev,querylength,
1346 		      max_splice_mismatches,genestrand,/*first_read_p*/true,listpool,hitlistpool,
1347 		      /*level*/5);
1348   }
1349 
1350   if (*found_score_overall <= nmismatches_search) {
1351     Substring_list_gc(&startfrags_plus);
1352     Substring_list_gc(&endfrags_plus);
1353     Substring_list_gc(&startfrags_minus);
1354     Substring_list_gc(&endfrags_minus);
1355     return DISTANT_RNA;
1356 
1357   } else if (find_dna_chimeras_p == true) {
1358     debug(printf("LEVEL %d: RUNNING DISTANT_DNA_SOLVE\n",6));
1359     startfrags_plus = Substring_sort_siteN_halves(startfrags_plus,listpool,/*ascendingp*/true);
1360     endfrags_plus = Substring_sort_siteN_halves(endfrags_plus,listpool,/*ascendingp*/true);
1361     startfrags_minus = Substring_sort_siteN_halves(startfrags_minus,listpool,/*ascendingp*/true);
1362     endfrags_minus = Substring_sort_siteN_halves(endfrags_minus,listpool,/*ascendingp*/true);
1363 
1364     Distant_dna_solve(&(*found_score_overall),&(*found_score_within_trims),
1365 		      &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1366 		      startfrags_plus,endfrags_plus,startfrags_minus,endfrags_minus,
1367 		      mismatch_positions_alloc,query_compress_fwd,query_compress_rev,querylength,
1368 		      first_read_p,listpool,hitlistpool,/*level*/6);
1369     debug(printf("Returning from distant DNA solve\n"));
1370 
1371     if (*found_score_overall <= nmismatches_search) {
1372       Substring_list_gc(&startfrags_plus);
1373       Substring_list_gc(&endfrags_plus);
1374       Substring_list_gc(&startfrags_minus);
1375       Substring_list_gc(&endfrags_minus);
1376       return DISTANT_DNA;
1377 
1378     } else {
1379       debug(printf("Consider terminals.  Have sense %d hits plus and %d hits minus, antisense %d hits plus and %d hits minus\n",
1380 		   List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1381 		   List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1382 
1383       /* Terminals.  Use hits == NULL as criterion, rather than
1384 	 found_score, since Terminal_solve cannot improve found_score */
1385 
1386       if (*sense_hits_gplus == NULL && *sense_hits_gminus == NULL &&
1387 	  *antisense_hits_gplus == NULL && *antisense_hits_gminus == NULL) {
1388 
1389 	debug(printf("LEVEL %d: RUNNING TERMINAL_SOLVE_PLUS\n",7));
1390 	Terminal_solve_plus(&(*found_score_overall),&(*found_score_within_trims),
1391 			    &(*sense_hits_gplus),&(*antisense_hits_gplus),
1392 			    this->queryfwd_plus_set,this->queryrev_plus_set,
1393 			    mismatch_positions_alloc,/*queryuc_ptr,*/query_compress_fwd,querylength,
1394 			    genestrand,listpool,hitlistpool,/*level*/7);
1395 
1396 	debug(printf("LEVEL %d: RUNNING TERMINAL_SOLVE_MINUS\n",7));
1397 	Terminal_solve_minus(&(*found_score_overall),&(*found_score_within_trims),
1398 			     &(*sense_hits_gminus),&(*antisense_hits_gminus),
1399 			     this->queryfwd_minus_set,this->queryrev_minus_set,
1400 			     mismatch_positions_alloc,/*queryrc,*/query_compress_rev,querylength,
1401 			     genestrand,listpool,hitlistpool,/*level*/7);
1402 	debug(printf("Got sense %d plus and %d minus terminals, antisense %d plus and %d minus terminals\n",
1403 		     List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1404 		     List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1405       }
1406     }
1407 
1408     debug(printf("Returning from terminals\n"));
1409   }
1410 
1411   Substring_list_gc(&startfrags_plus);
1412   Substring_list_gc(&endfrags_plus);
1413   Substring_list_gc(&startfrags_minus);
1414   Substring_list_gc(&endfrags_minus);
1415   return TERMINAL;
1416 }
1417 
1418 
1419 Stage3end_T *
Stage1_single_read(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Shortread_T queryseq,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)1420 Stage1_single_read (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
1421 		    Shortread_T queryseq,
1422 #if 0
1423 		    Oligoindex_array_T oligoindices_minor,
1424 		    Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1425 		    Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1426 #endif
1427 		    Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1428 		    Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
1429   Stage3end_T *stage3array;
1430   T this;
1431   List_T hits, geneplus_hits, geneminus_hits;
1432   List_T sense_hits_gplus, sense_hits_gminus, antisense_hits_gplus, antisense_hits_gminus;
1433   int *mismatch_positions_alloc;
1434 
1435   int found_score_overall, found_score_within_trims;
1436   int querylength, max_mismatches_refalt, max_mismatches_ref, min_coverage;
1437   char *queryuc_ptr, *queryrc, *quality_string;
1438   Compress_T query_compress_fwd, query_compress_rev;
1439 
1440 
1441   querylength = Shortread_fulllength(queryseq);
1442   queryuc_ptr = Shortread_fullpointer_uc(queryseq);
1443 
1444   queryrc = (char *) MALLOC((querylength+1)*sizeof(char));
1445   make_complement_buffered(queryrc,queryuc_ptr,querylength);
1446 
1447   mismatch_positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int));
1448   query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
1449   query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
1450 
1451   /* Previously used user_mismatches_refalt_float in searching, now just for filtering */
1452   if (user_mismatches_refalt_float < 0.0) {
1453     max_mismatches_refalt = querylength;
1454   } else if (user_mismatches_refalt_float > 0.0 && user_mismatches_refalt_float < 1.0) {
1455     max_mismatches_refalt = (int) rint(user_mismatches_refalt_float * (double) querylength);
1456   } else {
1457     /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
1458     max_mismatches_refalt = (int) user_mismatches_refalt_float;
1459   }
1460 
1461   /* Previously used user_mismatches_ref_float in searching, now just for filtering */
1462   if (user_mismatches_ref_float < 0.0) {
1463     max_mismatches_ref = querylength;
1464   } else if (user_mismatches_ref_float > 0.0 && user_mismatches_ref_float < 1.0) {
1465     max_mismatches_ref = (int) rint(user_mismatches_ref_float * (double) querylength);
1466   } else {
1467     /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
1468     max_mismatches_ref = (int) user_mismatches_ref_float;
1469   }
1470 
1471   if (user_mincoverage_float <= 0.0) {
1472     min_coverage = 0;
1473   } else if (user_mincoverage_float <= 1.0) {
1474     /* Assuming that --min-coverage=1 must mean 1.0 and not a coverage of 1 bp */
1475     min_coverage = (int) rint(user_mincoverage_float * (double) querylength);
1476   } else {
1477     min_coverage = (int) user_mincoverage_float;
1478   }
1479 
1480   if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED || mode == TTOC_STRANDED) {
1481     this = Stage1_new(querylength);
1482     single_read_search(&found_score_overall,&found_score_within_trims,
1483 		       &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1484 		       this,queryuc_ptr,queryrc,querylength,
1485 		       mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/0,
1486 		       intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1487 		       /*paired_end_p*/false,/*first_read_p*/true);
1488     Stage1_free(&this);
1489 
1490     sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1491     sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1492     antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1493     antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1494     hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1495 
1496   } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
1497     this = Stage1_new(querylength);
1498     single_read_search(&found_score_overall,&found_score_within_trims,
1499 		       &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1500 		       this,queryuc_ptr,queryrc,querylength,
1501 		       mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/+1,
1502 		       intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1503 		       /*paired_end_p*/false,/*first_read_p*/true);
1504     Stage1_free(&this);
1505 
1506     sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1507     sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1508     antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1509     antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1510     geneplus_hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1511 
1512 
1513     this = Stage1_new(querylength);
1514     single_read_search(&found_score_overall,&found_score_within_trims,
1515 		       &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1516 		       this,queryuc_ptr,queryrc,querylength,
1517 		       mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/+2,
1518 		       intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1519 		       /*paired_end_p*/false,/*first_read_p*/true);
1520     Stage1_free(&this);
1521 
1522     sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1523     sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1524     antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1525     antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1526     geneminus_hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1527 
1528     hits = List_append(geneplus_hits,geneminus_hits);
1529 
1530   } else {
1531     fprintf(stderr,"Do not recognize mode %d\n",mode);
1532     abort();
1533   }
1534 
1535 
1536   if (hits == NULL) {
1537     *npaths_primary = *npaths_altloc = 0;
1538     stage3array = (Stage3end_T *) NULL;
1539 
1540   } else {
1541     hits = Stage3end_remove_circular_alias(hits,hitlistpool); /* Contains a call to unalias_circular */
1542     hits = Stage3end_remove_duplicates(hits,hitlistpool); /* Aliases can cause duplicates */
1543     debug(printf("Have %d hits after remove_duplicates\n",List_length(hits)));
1544 
1545     hits = Stage3end_optimal_score(hits,hitlistpool,querylength,/*finalp*/false);
1546     debug(printf("Have %d hits after optimal_score\n",List_length(hits)));
1547     hits = Stage3end_remove_overlaps(hits,hitlistpool,querylength,/*finalp*/true);
1548     debug(printf("Have %d hits after remove_overlaps\n",List_length(hits)));
1549     hits = Stage3end_optimal_score(hits,hitlistpool,querylength,/*finalp*/true);
1550     debug(printf("Have %d hits after final optimal_score\n",List_length(hits)));
1551 
1552     quality_string = Shortread_quality_string(queryseq);
1553     Stage3end_count_hits(&(*npaths_primary),&(*npaths_altloc),hits);
1554     stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); Hitlist_free(&hits); /* Return value */
1555     stage3array = Stage3end_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
1556 					  &(*first_absmq),&(*second_absmq),
1557 					  stage3array,queryuc_ptr,quality_string,/*displayp*/true);
1558   }
1559 
1560   FREE(queryrc);
1561 
1562   Compress_free(&query_compress_fwd);
1563   Compress_free(&query_compress_rev);
1564   FREE(mismatch_positions_alloc);
1565 
1566   return stage3array;
1567 }
1568 
1569 
1570 
1571 
1572 static Pairtype_T
choose_among_paired(int * best_nmatches_paired,int * best_nmatches_5,int * best_nmatches_3,List_T hitpairs,List_T samechr,List_T conc_transloc)1573 choose_among_paired (int *best_nmatches_paired, int *best_nmatches_5, int *best_nmatches_3,
1574 		     List_T hitpairs, List_T samechr, List_T conc_transloc) {
1575   Pairtype_T final_pairtype = UNPAIRED;
1576   List_T p;
1577   Stage3pair_T hitpair;
1578   int nmatches, nmatches5, nmatches3;
1579 
1580   debug16(printf("choose: %d hitpairs, %d conc_transloc, %d samechr\n",
1581 		 List_length(hitpairs),List_length(conc_transloc),List_length(samechr)));
1582 
1583   *best_nmatches_paired = 0;
1584   *best_nmatches_5 = 0;
1585   *best_nmatches_3 = 0;
1586   for (p = hitpairs; p != NULL; p = p->rest) {
1587     hitpair = (Stage3pair_T) p->first;
1588     if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1589       final_pairtype = CONCORDANT;
1590       *best_nmatches_paired = nmatches;
1591       *best_nmatches_5 = nmatches5;
1592       *best_nmatches_3 = nmatches3;
1593     }
1594   }
1595 
1596   *best_nmatches_paired += 1; /* penalty for choosing translocation over others */
1597 
1598   for (p = conc_transloc; p != NULL; p = p->rest) {
1599     hitpair = (Stage3pair_T) p->first;
1600     if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1601       final_pairtype = CONCORDANT_TRANSLOCATIONS;
1602       *best_nmatches_paired = nmatches;
1603       *best_nmatches_5 = nmatches5;
1604       *best_nmatches_3 = nmatches3;
1605     }
1606   }
1607 
1608   for (p = samechr; p != NULL; p = p->rest) {
1609     hitpair = (Stage3pair_T) p->first;
1610     if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1611       final_pairtype = PAIRED_UNSPECIFIED;
1612       *best_nmatches_paired = nmatches;
1613       *best_nmatches_5 = nmatches5;
1614       *best_nmatches_3 = nmatches3;
1615     }
1616   }
1617 
1618   debug16(printf("best_nmatches_paired among paired = %d = %d + %d\n",
1619 		 *best_nmatches_paired,*best_nmatches_5,*best_nmatches_3));
1620   debug16(printf("final pairtype: %s\n",Pairtype_string(final_pairtype)));
1621 
1622   return final_pairtype;
1623 }
1624 
1625 
1626 
1627 static List_T
paired_read_segment_search(bool * abort_pairing_p,List_T * samechr,List_T * conc_transloc,List_T hitpairs,int last_level_5,int last_level_3,int found_score_overall_5,int found_score_within_trims_5,int found_score_overall_3,int found_score_within_trims_3,T this5,T this3,Ladder_T sense_ladder5_plus,Ladder_T sense_ladder5_minus,Ladder_T sense_ladder3_plus,Ladder_T sense_ladder3_minus,Ladder_T antisense_ladder5_plus,Ladder_T antisense_ladder5_minus,Ladder_T antisense_ladder3_plus,Ladder_T antisense_ladder3_minus,char * queryuc_ptr_5,char * queryrc5,int querylength5,char * queryuc_ptr_3,char * queryrc3,int querylength3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_insertionlen_5,int max_insertionlen_3,int max_deletionlen_5,int max_deletionlen_3,int genestrand,Chrpos_T pairmax_linear,Chrpos_T overall_max_distance_5,Chrpos_T overall_max_distance_3,Chrpos_T overall_end_distance_5,Chrpos_T overall_end_distance_3,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)1628 paired_read_segment_search (bool *abort_pairing_p, List_T *samechr, List_T *conc_transloc,
1629 			    List_T hitpairs, int last_level_5, int last_level_3,
1630 			    int found_score_overall_5, int found_score_within_trims_5,
1631 			    int found_score_overall_3, int found_score_within_trims_3, T this5, T this3,
1632 			    Ladder_T sense_ladder5_plus, Ladder_T sense_ladder5_minus,
1633 			    Ladder_T sense_ladder3_plus, Ladder_T sense_ladder3_minus,
1634 			    Ladder_T antisense_ladder5_plus, Ladder_T antisense_ladder5_minus,
1635 			    Ladder_T antisense_ladder3_plus, Ladder_T antisense_ladder3_minus,
1636 			    char *queryuc_ptr_5, char *queryrc5, int querylength5, char *queryuc_ptr_3, char *queryrc3, int querylength3,
1637 			    int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
1638 			    Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
1639 			    Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
1640 			    int max_insertionlen_5, int max_insertionlen_3, int max_deletionlen_5, int max_deletionlen_3,
1641 			    int genestrand, Chrpos_T pairmax_linear, Chrpos_T overall_max_distance_5, Chrpos_T overall_max_distance_3,
1642 			    Chrpos_T overall_end_distance_5, Chrpos_T overall_end_distance_3,
1643 
1644 			    int max_mismatches_refalt_5, int max_mismatches_refalt_3,
1645 			    int max_mismatches_ref_5, int max_mismatches_ref_3,
1646 			    int min_coverage_5, int min_coverage_3,
1647 
1648 			    Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1649 			    Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
1650   int last_level;
1651 
1652   /* int plus_specifici_5[3], minus_specifici_5[3], plus_specifici_3[3], minus_specifici_3[3]; */
1653   Univcoord_T *gplus5_diagonals, *gminus5_diagonals, *gplus3_diagonals, *gminus3_diagonals;
1654   int gplus5_ndiagonals, gminus5_ndiagonals, gplus3_ndiagonals, gminus3_ndiagonals;
1655 
1656   /* concordant_score is for all hitpairs; adjacent score is for those hitpairs within adjacent_pairlength */
1657   int adjacent_score, concordant_score;
1658   int done_level_5, done_level_3, nmismatches_search, nmismatches_search_5, nmismatches_search_3;
1659   /* int max_terminal_mismatches; */
1660   int max_splice_mismatches;
1661   int min_trim_5, min_trim_3;
1662 
1663   int maxpairedpaths;
1664 
1665   List_T sense_hits5_gplus = NULL, sense_hits5_gminus = NULL, sense_hits3_gplus = NULL, sense_hits3_gminus = NULL,
1666     antisense_hits5_gplus = NULL, antisense_hits5_gminus = NULL, antisense_hits3_gplus = NULL, antisense_hits3_gminus = NULL;
1667 
1668 
1669 #if 0
1670   /* If we prioritize transcriptome hits over genomic hits */
1671   List_T transcriptome_hits5 = NULL, transcriptome_hits3 = NULL;
1672   List_T genome_hits5 = NULL, genome_hits5 = NULL;
1673 #endif
1674 
1675 
1676   /* For Segment_search */
1677   struct Record_T *plus_records5 = NULL, *minus_records5 = NULL, *plus_records3 = NULL, *minus_records3 = NULL;
1678   int plus_nrecords5 = 0, minus_nrecords5 = 0, plus_nrecords3 = 0, minus_nrecords3 = 0;
1679 
1680   List_T startfrags5_plus = NULL, endfrags5_plus = NULL, startfrags5_minus = NULL, endfrags5_minus = NULL,
1681     startfrags3_plus = NULL, endfrags3_plus = NULL, startfrags3_minus = NULL, endfrags3_minus = NULL;
1682 
1683   int level;
1684 
1685 
1686   debug(printf("Entered paired_read_segment_search with queryuc_ptr_5 %s\n",queryuc_ptr_5));
1687   debug(printf("Entered paired_read_segment_search with queryuc_ptr_3 %s\n",queryuc_ptr_3));
1688 
1689   *abort_pairing_p = false;
1690   *samechr = (List_T) NULL;
1691   /* *conc_transloc = (List_T) NULL; -- May contain results from concordant alignment of single ends */
1692 
1693   if (last_level_5 < last_level_3) {
1694     level = last_level = last_level_5;
1695   } else {
1696     level = last_level = last_level_3;
1697   }
1698   debug(printf("last_level is %s <- %s and %s\n",
1699 	       Method_string(last_level),Method_string(last_level_5),Method_string(last_level_3)));
1700 
1701 
1702   /* Take the larger of maxpaths_search and 10*maxpaths_report */
1703   maxpairedpaths = maxpaths_search;
1704   if (maxpairedpaths < 10*maxpaths_report) {
1705     maxpairedpaths = 10*maxpaths_report;
1706   }
1707 
1708   adjacent_score = concordant_score = querylength5 + querylength3;
1709 
1710   nmismatches_search_5 = querylength5/index1part_tr;
1711   nmismatches_search_3 = querylength3/index1part_tr;
1712   nmismatches_search = nmismatches_search_5 + nmismatches_search_3;
1713 
1714 
1715   /* found_score_5 = querylength5; */
1716   /* found_score_3 = querylength3; */
1717   /* found_score = querylength5 + querylength3; */
1718 
1719 
1720   if (last_level < KMER_EXACT) {
1721     if (last_level_5 < KMER_EXACT) {
1722       debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS ON 5' READ\n",level));
1723       Stage1_init(this5,queryuc_ptr_5,querylength5,genestrand);
1724     }
1725 
1726     if (last_level_3 < KMER_EXACT) {
1727       debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS ON 3' READ\n",level));
1728       Stage1_init(this3,queryuc_ptr_3,querylength3,genestrand);
1729     }
1730   }
1731 
1732   if (last_level < EXT) {
1733     if (last_level_5 < EXT && querylength5 >= index1part) {
1734       Stage1_rearrange(this5,querylength5,/*one5p*/false);
1735       Stage1_fill_all_oligos(this5,queryuc_ptr_5,querylength5,genestrand);
1736     }
1737 
1738     if (last_level_3 < EXT && querylength3 >= index1part) {
1739       Stage1_rearrange(this3,querylength3,/*one3p*/false);
1740       Stage1_fill_all_oligos(this3,queryuc_ptr_3,querylength3,genestrand);
1741     }
1742   }
1743 
1744   /* SEGMENT2 algorithm is different from SEGMENT1, so we should do this in all cases */
1745   if (querylength5 >= index1part) {
1746     debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH_GENOME (ANCHORED) ON 5' READ\n",level));
1747     if (last_level_5 < SEGMENT1) {
1748       Stage1_fill_position_ptrs_plus(this5,/*querystart*/0,/*queryend*/querylength5,genestrand);
1749       Stage1_fill_position_ptrs_minus(this5,/*querystart*/0,/*queryend*/querylength5,genestrand);
1750     }
1751 
1752     /* Sense */
1753     gplus3_diagonals = Ladder_genomicstarts(&gplus3_ndiagonals,sense_ladder3_plus);
1754     gminus3_diagonals = Ladder_genomicends(&gminus3_ndiagonals,sense_ladder3_minus);
1755 
1756     plus_records5 = Segment_identify_lower(&plus_nrecords5,
1757 #ifdef LARGE_GENOMES
1758 					   this5->plus_positions_high,
1759 #endif
1760 					   this5->plus_positions,this5->plus_npositions,this5->plus_validp,
1761 					   this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1762 					   /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1763 					   gplus3_diagonals,gplus3_ndiagonals);
1764 
1765     minus_records5 = Segment_identify_higher(&minus_nrecords5,
1766 #ifdef LARGE_GENOMES
1767 					     this5->minus_positions_high,
1768 #endif
1769 					     this5->minus_positions,this5->minus_npositions,this5->minus_validp,
1770 					     this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1771 					     /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1772 					     gminus3_diagonals,gminus3_ndiagonals);
1773     FREE(gminus3_diagonals);
1774     FREE(gplus3_diagonals);
1775 
1776     Segment_search_all(&found_score_overall_5,&found_score_within_trims_5,
1777 		       &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
1778 		       plus_records5,plus_nrecords5,minus_records5,minus_nrecords5,
1779 
1780 		       queryuc_ptr_5,queryrc5,querylength5,mismatch_positions_alloc_5,
1781 		       this5->spliceinfo,this5->stream_alloc,this5->streamsize_alloc,
1782 		       query5_compress_fwd,query5_compress_rev,
1783 		       max_insertionlen_5,max_deletionlen_5,
1784 		       overall_max_distance_5,overall_end_distance_5,
1785 		       genestrand,/*paired_end_p*/true,/*first_read_p*/true,
1786 		       intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1787 		       /*method*/SEGMENT2,level);
1788     FREE(plus_records5); FREE(minus_records5);
1789 
1790 
1791     if (splicingp == true) {
1792       /* Antisense */
1793       gplus3_diagonals = Ladder_genomicstarts(&gplus3_ndiagonals,antisense_ladder3_plus);
1794       gminus3_diagonals = Ladder_genomicends(&gminus3_ndiagonals,antisense_ladder3_minus);
1795 
1796       plus_records5 = Segment_identify_lower(&plus_nrecords5,
1797 #ifdef LARGE_GENOMES
1798 					     this5->plus_positions_high,
1799 #endif
1800 					     this5->plus_positions,this5->plus_npositions,this5->plus_validp,
1801 					     this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1802 					     /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1803 					     gplus3_diagonals,gplus3_ndiagonals);
1804 
1805       minus_records5 = Segment_identify_higher(&minus_nrecords5,
1806 #ifdef LARGE_GENOMES
1807 					       this5->minus_positions_high,
1808 #endif
1809 					       this5->minus_positions,this5->minus_npositions,this5->minus_validp,
1810 					       this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1811 					       /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1812 					       gminus3_diagonals,gminus3_ndiagonals);
1813       FREE(gminus3_diagonals);
1814       FREE(gplus3_diagonals);
1815 
1816       Segment_search_all(&found_score_overall_5,&found_score_within_trims_5,
1817 			 &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
1818 			 plus_records5,plus_nrecords5,minus_records5,minus_nrecords5,
1819 
1820 			 queryuc_ptr_5,queryrc5,querylength5,mismatch_positions_alloc_5,
1821 			 this5->spliceinfo,this5->stream_alloc,this5->streamsize_alloc,
1822 			 query5_compress_fwd,query5_compress_rev,
1823 			 max_insertionlen_5,max_deletionlen_5,
1824 			 overall_max_distance_5,overall_end_distance_5,
1825 			 genestrand,/*paired_end_p*/true,/*first_read_p*/true,
1826 			 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1827 			 /*method*/SEGMENT2,level);
1828       FREE(plus_records5); FREE(minus_records5);
1829     }
1830   }
1831 
1832 
1833   if (querylength3 >= index1part) {
1834     debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH_GENOME (ANCHORED) ON 3' READ\n",level));
1835     if (last_level_3 < SEGMENT1) {
1836       Stage1_fill_position_ptrs_plus(this3,/*querystart*/0,/*queryend*/querylength3,genestrand);
1837       Stage1_fill_position_ptrs_minus(this3,/*querystart*/0,/*queryend*/querylength3,genestrand);
1838     }
1839 
1840     /* Sense */
1841     gplus5_diagonals = Ladder_genomicstarts(&gplus5_ndiagonals,sense_ladder5_plus);
1842     gminus5_diagonals = Ladder_genomicends(&gminus5_ndiagonals,sense_ladder5_minus);
1843 
1844     plus_records3 = Segment_identify_higher(&plus_nrecords3,
1845 #ifdef LARGE_GENOMES
1846 					    this3->plus_positions_high,
1847 #endif
1848 					    this3->plus_positions,this3->plus_npositions,this3->plus_validp,
1849 					    this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1850 					    /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1851 					    gplus5_diagonals,gplus5_ndiagonals);
1852 
1853     minus_records3 = Segment_identify_lower(&minus_nrecords3,
1854 #ifdef LARGE_GENOMES
1855 					    this3->minus_positions_high,
1856 #endif
1857 					    this3->minus_positions,this3->minus_npositions,this3->minus_validp,
1858 					    this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1859 					    /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1860 					    gminus5_diagonals,gminus5_ndiagonals);
1861     FREE(gminus5_diagonals);
1862     FREE(gplus5_diagonals);
1863 
1864     Segment_search_all(&found_score_overall_3,&found_score_within_trims_3,
1865 		       &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
1866 		       plus_records3,plus_nrecords3,minus_records3,minus_nrecords3,
1867 
1868 		       queryuc_ptr_3,queryrc3,querylength3,mismatch_positions_alloc_3,
1869 		       this3->spliceinfo,this3->stream_alloc,this3->streamsize_alloc,
1870 		       query3_compress_fwd,query3_compress_rev,
1871 		       max_insertionlen_3,max_deletionlen_3,
1872 		       overall_max_distance_3,overall_end_distance_3,
1873 		       genestrand,/*paired_end_p*/true,/*first_read_p*/false,
1874 		       intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1875 		       /*method*/SEGMENT2,level);
1876     FREE(plus_records3); FREE(minus_records3);
1877 
1878     if (splicingp == true) {
1879       /* Antisense */
1880       gplus5_diagonals = Ladder_genomicstarts(&gplus5_ndiagonals,antisense_ladder5_plus);
1881       gminus5_diagonals = Ladder_genomicends(&gminus5_ndiagonals,antisense_ladder5_minus);
1882 
1883       plus_records3 = Segment_identify_higher(&plus_nrecords3,
1884 #ifdef LARGE_GENOMES
1885 					      this3->plus_positions_high,
1886 #endif
1887 					      this3->plus_positions,this3->plus_npositions,this3->plus_validp,
1888 					      this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1889 					      /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1890 					      gplus5_diagonals,gplus5_ndiagonals);
1891 
1892       minus_records3 = Segment_identify_lower(&minus_nrecords3,
1893 #ifdef LARGE_GENOMES
1894 					      this3->minus_positions_high,
1895 #endif
1896 					      this3->minus_positions,this3->minus_npositions,this3->minus_validp,
1897 					      this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1898 					      /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1899 					      gminus5_diagonals,gminus5_ndiagonals);
1900       FREE(gminus5_diagonals);
1901       FREE(gplus5_diagonals);
1902 
1903       Segment_search_all(&found_score_overall_3,&found_score_within_trims_3,
1904 			 &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
1905 			 plus_records3,plus_nrecords3,minus_records3,minus_nrecords3,
1906 
1907 			 queryuc_ptr_3,queryrc3,querylength3,mismatch_positions_alloc_3,
1908 			 this3->spliceinfo,this3->stream_alloc,this3->streamsize_alloc,
1909 			 query3_compress_fwd,query3_compress_rev,
1910 			 max_insertionlen_3,max_deletionlen_3,
1911 			 overall_max_distance_3,overall_end_distance_3,
1912 			 genestrand,/*paired_end_p*/true,/*first_read_p*/false,
1913 			 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1914 			 /*method*/SEGMENT2,level);
1915       FREE(plus_records3); FREE(minus_records3);
1916     }
1917   }
1918 
1919 
1920   debug(printf("found scores: %d and %d (vs %d and %d allowed)\n",
1921 	       found_score_within_trims_5,found_score_within_trims_3,nmismatches_search_5,nmismatches_search_3));
1922 
1923   debug(printf("sense hits5 plus: %d new.  hits5 minus: %d new.\n",
1924 	       List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
1925   debug(printf("sense hits3 plus: %d new.  hits3 minus: %d new.\n",
1926 	       List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
1927 
1928   sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1929   sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1930   sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1931   sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1932 
1933   hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
1934 					hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
1935 					sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
1936 					querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
1937 					query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
1938 					listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
1939   Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
1940   Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
1941 
1942   if (splicingp == true) {
1943     debug(printf("antisense hits5 plus: %d new.  hits5 minus: %d new.\n",
1944 		 List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
1945     debug(printf("antisense hits3 plus: %d new.  hits3 minus: %d new.\n",
1946 		 List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
1947 
1948     antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1949     antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1950     antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1951     antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1952 
1953     hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
1954 					  hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
1955 					  antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
1956 					  querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
1957 					  query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
1958 					  listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
1959     Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
1960     Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
1961   }
1962 
1963   debug(printf("(4) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
1964 	       level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
1965 
1966   if (*abort_pairing_p == true || concordant_score <= nmismatches_search) {
1967     return hitpairs;
1968 
1969   } else if (last_level < DISTANT_RNA) {
1970     /* Find distant splicing.  Use concordant_score, rather than
1971        adjacent_score as criterion, since distant splicing cannot find
1972        adjacent pairs */
1973 
1974     /* distant splicing on 5' read */
1975     if (last_level_5 < DISTANT_RNA && found_score_within_trims_5 >= found_score_within_trims_3) {
1976       if ((done_level_5 = found_score_within_trims_5 + subopt_levels) > nmismatches_search_5) {
1977 	done_level_5 = nmismatches_search_5;
1978       }
1979       debug(printf("done_level_5 %d = found_score_within_trims_5 %d + subopt_levels %d\n",
1980 		   done_level_5,found_score_within_trims_5,subopt_levels));
1981 
1982       /* min of max(trim5,trim3) over each hit */
1983       min_trim_5 = Ladder_minimax_trim(sense_ladder5_plus,sense_ladder5_minus,
1984 				       antisense_ladder5_plus,antisense_ladder5_minus,querylength5);
1985 
1986       debug(printf("For 5' end, comparing min_trim %d with min_distantsplicing_end_matches %d\n",
1987 		   min_trim_5,min_distantsplicing_end_matches));
1988       if (min_trim_5 < min_distantsplicing_end_matches) {
1989 	/* Don't find distant splicing */
1990 	debug(printf("For 5' end, skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
1991 		     min_trim_5,min_distantsplicing_end_matches));
1992 
1993       } else if ((max_splice_mismatches = done_level_5 - distantsplicing_penalty) >= 0) {
1994 	if (splicingp == true) {
1995 	  debug(printf("For 5' end, candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
1996 		       min_trim_5,min_distantsplicing_end_matches));
1997 	  debug(printf("Have sets: %d %d %d %d\n",
1998 		       List_length(this5->queryfwd_plus_set),List_length(this5->queryfwd_minus_set),
1999 		       List_length(this5->queryrev_plus_set),List_length(this5->queryrev_minus_set)));
2000 	  Distant_rna_solve(&found_score_overall_5,&found_score_within_trims_5,
2001 			    &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
2002 			    &startfrags5_plus,&endfrags5_plus,&startfrags5_minus,&endfrags5_minus,
2003 
2004 			    this5->queryfwd_plus_set,this5->queryfwd_minus_set,
2005 			    this5->queryrev_plus_set,this5->queryrev_minus_set,
2006 
2007 			    mismatch_positions_alloc_5,this5->positions_alloc,
2008 			    query5_compress_fwd,query5_compress_rev,querylength5,
2009 			    max_splice_mismatches,genestrand,/*first_read_p*/true,listpool,hitlistpool,level);
2010 	} else {
2011 	  /* TODO: Implement distant DNA fusions */
2012 	}
2013       }
2014     }
2015 
2016     /* Distant splicing on 3' read */
2017     if (last_level_3 < DISTANT_RNA && found_score_within_trims_3 >= found_score_within_trims_5) {
2018       if ((done_level_3 = found_score_within_trims_3 + subopt_levels) > nmismatches_search_3) {
2019 	done_level_3 = nmismatches_search_3;
2020       }
2021       debug(printf("done_level_3 %d = found_score_within_trims_3 %d + subopt_levels %d\n",
2022 		   done_level_3,found_score_within_trims_3,subopt_levels));
2023 
2024       /* min of max(trim5,trim3) over each hit */
2025       min_trim_3 = Ladder_minimax_trim(sense_ladder3_plus,sense_ladder3_minus,
2026 				       antisense_ladder3_plus,antisense_ladder3_minus,querylength3);
2027 
2028       debug(printf("For 3' end, comparing min_trim %d with min_distantsplicing_end_matches %d\n",
2029 		   min_trim_3,min_distantsplicing_end_matches));
2030       if (min_trim_3 < min_distantsplicing_end_matches) {
2031 	/* Don't find distant splicing */
2032 	debug(printf("For 3'end, skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
2033 		     min_trim_3,min_distantsplicing_end_matches));
2034 
2035       } else if ((max_splice_mismatches = done_level_3 - distantsplicing_penalty) >= 0) {
2036 	if (splicingp == true) {
2037 	  debug(printf("For 3' end, candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
2038 		       min_trim_3,min_distantsplicing_end_matches));
2039 	  debug(printf("Have sets: %d %d %d %d\n",
2040 		       List_length(this3->queryfwd_plus_set),List_length(this3->queryfwd_minus_set),
2041 		       List_length(this3->queryrev_plus_set),List_length(this3->queryrev_minus_set)));
2042 	  Distant_rna_solve(&found_score_overall_3,&found_score_within_trims_3,
2043 			    &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
2044 			    &startfrags3_plus,&endfrags3_plus,&startfrags3_minus,&endfrags3_minus,
2045 
2046 			    this3->queryfwd_plus_set,this3->queryfwd_minus_set,
2047 			    this3->queryrev_plus_set,this3->queryrev_minus_set,
2048 
2049 			    mismatch_positions_alloc_3,this3->positions_alloc,
2050 			    query3_compress_fwd,query3_compress_rev,querylength3,
2051 			    max_splice_mismatches,genestrand,/*first_read_p*/false,listpool,hitlistpool,level);
2052 	} else {
2053 	  /* TODO: Implement distant DNA */
2054 	}
2055       }
2056     }
2057 
2058     debug(printf("found scores: %d and %d (vs %d and %d allowed)\n",
2059 		 found_score_within_trims_5,found_score_within_trims_3,nmismatches_search_5,nmismatches_search_3));
2060 
2061     /* Sense */
2062     debug(printf("sense hits5 plus: %d new.  hits5 minus: %d new.\n",
2063 		 List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
2064     debug(printf("sense hits3 plus: %d new.  hits3 minus: %d new.\n",
2065 		 List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
2066 
2067     sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2068     sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2069     sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2070     sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2071 
2072     hitpairs = Concordance_pair_up_distant(&(*abort_pairing_p),&concordant_score,&(*samechr),&(*conc_transloc),hitpairs,
2073 					   sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2074 					   sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2075 					   querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2076 					   query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2077 					   listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
2078     Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2079     Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2080 
2081 
2082     if (splicingp == true) {
2083       /* Antisense */
2084       debug(printf("antisense hits5 plus: %d new.  hits5 minus: %d new.\n",
2085 		   List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
2086       debug(printf("antisense hits3 plus: %d new.  hits3 minus: %d new.\n",
2087 		   List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
2088 
2089       antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2090       antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2091       antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2092       antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2093 
2094       hitpairs = Concordance_pair_up_distant(&(*abort_pairing_p),&concordant_score,&(*samechr),&(*conc_transloc),hitpairs,
2095 					     antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2096 					     antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2097 					     querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2098 					     query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2099 					     listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
2100       Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2101       Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2102     }
2103 
2104     debug(printf("(6) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
2105 		 level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
2106   }
2107 
2108 
2109   if (last_level < DISTANT_RNA && (*abort_pairing_p == true || concordant_score <= nmismatches_search)) {
2110     Substring_list_gc(&startfrags3_plus);
2111     Substring_list_gc(&endfrags3_plus);
2112     Substring_list_gc(&startfrags3_minus);
2113     Substring_list_gc(&endfrags3_minus);
2114     Substring_list_gc(&startfrags5_plus);
2115     Substring_list_gc(&endfrags5_plus);
2116     Substring_list_gc(&startfrags5_minus);
2117     Substring_list_gc(&endfrags5_minus);
2118     return hitpairs;
2119 
2120   } else if (last_level < TERMINAL) {
2121     /* Terminals */
2122     if (last_level_5 < TERMINAL) {
2123       /* Need to recompute, because of new distant hits or because distant splicing was not run */
2124       min_trim_5 = Ladder_minimax_trim(sense_ladder5_plus,sense_ladder5_minus,
2125 				       antisense_ladder5_plus,antisense_ladder5_minus,querylength5);
2126 
2127       if (min_trim_5 < min_distantsplicing_end_matches) {
2128 	debug(printf("For 5' end, skipping terminals because min_trim %d < min_distantsplicing_end_matches %d\n",
2129 		     min_trim_5,min_distantsplicing_end_matches));
2130       } else {
2131 	/* max_terminal_mismatches = done_level_5; */
2132 	Terminal_solve_plus(&found_score_overall_5,&found_score_within_trims_5,
2133 			    &sense_hits5_gplus,&antisense_hits5_gplus,
2134 			    this5->queryfwd_plus_set,this5->queryrev_plus_set,
2135 			    mismatch_positions_alloc_5,/*queryuc_ptr_5,*/query5_compress_fwd,querylength5,
2136 			    genestrand,listpool,hitlistpool,level);
2137 
2138 	Terminal_solve_minus(&found_score_overall_5,&found_score_within_trims_5,
2139 			     &sense_hits5_gminus,&antisense_hits5_gminus,
2140 			     this5->queryfwd_minus_set,this5->queryrev_minus_set,
2141 			     mismatch_positions_alloc_5,/*queryrc5,*/query5_compress_rev,querylength5,
2142 			     genestrand,listpool,hitlistpool,level);
2143       }
2144     }
2145 
2146     if (last_level_3 < DISTANT_RNA) {
2147       /* Need to recompute, because of new distant hits or because distant splicing was not run */
2148       min_trim_3 = Ladder_minimax_trim(sense_ladder3_plus,sense_ladder3_minus,
2149 				       antisense_ladder3_plus,antisense_ladder3_minus,querylength3);
2150 
2151       if (min_trim_3 < min_distantsplicing_end_matches) {
2152 	debug(printf("For 3'end, skipping terminals because min_trim %d < min_distantsplicing_end_matches %d\n",
2153 		     min_trim_3,min_distantsplicing_end_matches));
2154       } else {
2155 	/* max_terminal_mismatches = done_level_3; */
2156 	Terminal_solve_plus(&found_score_overall_3,&found_score_within_trims_3,
2157 			    &sense_hits3_gplus,&antisense_hits3_gplus,
2158 			    this3->queryfwd_plus_set,this3->queryrev_plus_set,
2159 			    mismatch_positions_alloc_3,/*queryuc_ptr_3,*/query3_compress_fwd,querylength3,
2160 			    genestrand,listpool,hitlistpool,level);
2161 
2162 	Terminal_solve_minus(&found_score_overall_3,&found_score_within_trims_3,
2163 			     &sense_hits3_gminus,&antisense_hits3_gminus,
2164 			     this3->queryfwd_minus_set,this3->queryrev_minus_set,
2165 			     mismatch_positions_alloc_3,/*queryrc3,*/query3_compress_rev,querylength3,
2166 			     genestrand,listpool,hitlistpool,level);
2167       }
2168     }
2169 
2170     /* Sense */
2171     sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2172     sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2173     sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2174     sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2175 
2176     hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2177 					  hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2178 					  sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2179 					  querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2180 					  query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2181 					  listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
2182     Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2183     Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2184 
2185     if (splicingp == true) {
2186       /* Antisense */
2187       antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2188       antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2189       antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2190       antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2191 
2192       hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2193 					    hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2194 					    antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2195 					    querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2196 					    query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2197 					    listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
2198       Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2199       Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2200     }
2201 
2202     debug(printf("(7) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
2203 		 level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
2204   }
2205 
2206   Substring_list_gc(&startfrags3_plus);
2207   Substring_list_gc(&endfrags3_plus);
2208   Substring_list_gc(&startfrags3_minus);
2209   Substring_list_gc(&endfrags3_minus);
2210   Substring_list_gc(&startfrags5_plus);
2211   Substring_list_gc(&endfrags5_plus);
2212   Substring_list_gc(&startfrags5_minus);
2213   Substring_list_gc(&endfrags5_minus);
2214 
2215   debug(printf("Exiting paired_read_segment_search\n"));
2216   return hitpairs;
2217 }
2218 
2219 
2220 
2221 static List_T
paired_read(bool * abort_pairing_p,List_T * hits5,List_T * hits3,List_T * samechr,List_T * conc_transloc,char * queryuc_ptr_5,char * queryrc5,int querylength5,char * queryuc_ptr_3,char * queryrc3,int querylength3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_insertionlen_5,int max_insertionlen_3,int max_deletionlen_5,int max_deletionlen_3,int genestrand,Chrpos_T pairmax_linear,Chrpos_T overall_max_distance_5,Chrpos_T overall_max_distance_3,Chrpos_T overall_end_distance_5,Chrpos_T overall_end_distance_3,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)2222 paired_read (bool *abort_pairing_p, List_T *hits5, List_T *hits3, List_T *samechr, List_T *conc_transloc,
2223 	     char *queryuc_ptr_5, char *queryrc5, int querylength5, char *queryuc_ptr_3, char *queryrc3, int querylength3,
2224 	     int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
2225 	     Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
2226 	     Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
2227 	     int max_insertionlen_5, int max_insertionlen_3, int max_deletionlen_5, int max_deletionlen_3,
2228 	     int genestrand, Chrpos_T pairmax_linear, Chrpos_T overall_max_distance_5, Chrpos_T overall_max_distance_3,
2229 	     Chrpos_T overall_end_distance_5, Chrpos_T overall_end_distance_3,
2230 
2231 	     int max_mismatches_refalt_5, int max_mismatches_refalt_3,
2232 	     int max_mismatches_ref_5, int max_mismatches_ref_3,
2233 	     int min_coverage_5, int min_coverage_3,
2234 
2235 	     Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
2236 	     Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
2237 
2238   List_T sense_hits5_gplus, sense_hits5_gminus, sense_hits3_gplus, sense_hits3_gminus,
2239     antisense_hits5_gplus, antisense_hits5_gminus, antisense_hits3_gplus, antisense_hits3_gminus;
2240   int adjacent_score, concordant_score;
2241   List_T hitpairs;
2242   int maxpairedpaths;
2243   int level5, level3;
2244   int found_score_overall_5, found_score_within_trims_5, found_score_overall_3, found_score_within_trims_3;
2245 
2246   T this5, this3;
2247   Ladder_T sense_ladder5_plus, sense_ladder5_minus, sense_ladder3_plus, sense_ladder3_minus,
2248     antisense_ladder5_plus = NULL, antisense_ladder5_minus = NULL,
2249     antisense_ladder3_plus = NULL, antisense_ladder3_minus = NULL;
2250 
2251 
2252   *abort_pairing_p = false;
2253   hitpairs = (List_T) NULL;
2254   *samechr = (List_T) NULL;
2255   *conc_transloc = (List_T) NULL;
2256 
2257   /* Take the larger of maxpaths_search and 10*maxpaths_report */
2258   maxpairedpaths = maxpaths_search;
2259   if (maxpairedpaths < 10*maxpaths_report) {
2260     maxpairedpaths = 10*maxpaths_report;
2261   }
2262 
2263   adjacent_score = concordant_score = querylength5 + querylength3;
2264 
2265 
2266   /* TODO: Return level from single_read.  Then if we don't have concordant hitpairs, take the min(level5,level3) and start paired_readfrom t
2267 here */
2268 
2269   this5 = Stage1_new(querylength5);
2270   level5 = single_read_search(&found_score_overall_5,&found_score_within_trims_5,
2271 			      &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
2272 			      this5,queryuc_ptr_5,queryrc5,querylength5,
2273 			      mismatch_positions_alloc_5,query5_compress_fwd,query5_compress_rev,genestrand,
2274 			      intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
2275 			      /*paired_end_p*/true,/*first_read_p*/true);
2276   debug(printf("sense: got %d plus and %d minus hits for 5' end\n",
2277 	       List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
2278   debug(printf("antisense: got %d plus and %d minus hits for 5' end\n",
2279 	       List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
2280 
2281   this3 = Stage1_new(querylength3);
2282   level3 = single_read_search(&found_score_overall_3,&found_score_within_trims_3,
2283 			      &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
2284 			      this3,queryuc_ptr_3,queryrc3,querylength3,
2285 			      mismatch_positions_alloc_3,query3_compress_fwd,query3_compress_rev,genestrand,
2286 			      intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
2287 			      /*paired_end_p*/true,/*first_read_p*/false);
2288   debug(printf("sense: got %d plus and %d minus hits for 3' end\n",
2289 	       List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
2290   debug(printf("antisense: got %d plus and %d minus hits for 3' end\n",
2291 	       List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
2292 
2293   /* Sense */
2294   sense_ladder5_plus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2295   sense_ladder5_minus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2296   sense_ladder3_plus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2297   sense_ladder3_minus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2298 
2299   sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2300   sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2301   sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2302   sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2303 
2304   debug(printf("STARTING CONCORDANCE_PAIR_UP_GENOME, SENSE\n"));
2305   hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2306 					hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2307 					sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2308 					querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2309 					query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2310 					listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD);
2311   Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2312   Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2313   debug(printf("RETURNING FROM CONCORDANCE_PAIR_UP_GENOME, SENSE\n"));
2314 
2315   if (splicingp == true) {
2316     /* Antisense */
2317     antisense_ladder5_plus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2318     antisense_ladder5_minus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2319     antisense_ladder3_plus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2320     antisense_ladder3_minus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2321 
2322     antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2323     antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2324     antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2325     antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2326 
2327     debug(printf("STARTING CONCORDANCE_PAIR_UP_GENOME, ANTISENSE\n"));
2328     hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2329 					  hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2330 					  antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2331 					  querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2332 					  query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2333 					  listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI);
2334     Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2335     Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2336     debug(printf("RETURNING FROM CONCORDANCE_PAIR_UP_GENOME, ANTISENSE\n"));
2337   }
2338 
2339   debug(printf("After initial concordance, have %d concordant hitpairs and %d concordant transloc\n\n",
2340 	       List_length(hitpairs),List_length(*conc_transloc)));
2341 
2342 
2343   /* DEBUGGING */
2344   if (hitpairs == NULL) {
2345 #if 0
2346     /* Computationally too expensive */
2347     Ladder_missing_paired_p(ladder5_plus,/*maxscore*/0) == true
2348     Ladder_missing_paired_p(ladder3_plus,/*maxscore*/0) == true
2349     Ladder_missing_paired_p(ladder5_minus,/*maxscore*/0) == true
2350     Ladder_missing_paired_p(ladder3_minus,/*maxscore*/0) == true
2351 #endif
2352 
2353     debug(printf("CALLING PAIRED_READ_SEGMENT_SEARCH\n"));
2354     hitpairs = paired_read_segment_search(&(*abort_pairing_p),&(*samechr),&(*conc_transloc),hitpairs,
2355 					  level5,level3,found_score_overall_5,found_score_within_trims_5,
2356 					  found_score_overall_3,found_score_within_trims_3,this5,this3,
2357 					  sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2358 					  antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2359 					  queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2360 					  mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2361 					  query5_compress_fwd,query5_compress_rev,
2362 					  query3_compress_fwd,query3_compress_rev,
2363 					  max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2364 					  genestrand,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2365 					  overall_end_distance_5,overall_end_distance_3,
2366 					  max_mismatches_refalt_5,max_mismatches_refalt_3,
2367 					  max_mismatches_ref_5,max_mismatches_ref_3,min_coverage_5,min_coverage_3,
2368 					  intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2369   }
2370 
2371   Stage1_free(&this3); Stage1_free(&this5);
2372 
2373   *hits5 =  *hits3 = (List_T) NULL;
2374   Ladder_to_hits(&(*hits5),&(*hits3),
2375 		 &sense_ladder5_plus,&sense_ladder5_minus,&sense_ladder3_plus,&sense_ladder3_minus,
2376 		 hitlistpool);
2377 
2378   if (splicingp == true) {
2379     Ladder_to_hits(&(*hits5),&(*hits3),
2380 		   &antisense_ladder5_plus,&antisense_ladder5_minus,&antisense_ladder3_plus,&antisense_ladder3_minus,
2381 		   hitlistpool);
2382   }
2383 
2384   debug(printf("Exiting paired_read\n"));
2385   return hitpairs;
2386 }
2387 
2388 
2389 
2390 /* Have three lists: hitpairs, samechr, and conc_transloc => result */
2391 static Stage3pair_T *
consolidate_paired_results(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Pairtype_T * final_pairtype,Stage3end_T ** stage3array5,int * nhits5_primary,int * nhits5_altloc,int * first_absmq5,int * second_absmq5,Stage3end_T ** stage3array3,int * nhits3_primary,int * nhits3_altloc,int * first_absmq3,int * second_absmq3,List_T hitpairs,List_T samechr,List_T conc_transloc,List_T hits5,List_T hits3,char * queryuc_ptr_5,int querylength5,char * queryuc_ptr_3,int querylength3,char * quality_string_5,char * quality_string_3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Listpool_T listpool,Hitlistpool_T hitlistpool)2392 consolidate_paired_results (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq, Pairtype_T *final_pairtype,
2393 			    Stage3end_T **stage3array5, int *nhits5_primary, int *nhits5_altloc, int *first_absmq5, int *second_absmq5,
2394 			    Stage3end_T **stage3array3, int *nhits3_primary, int *nhits3_altloc, int *first_absmq3, int *second_absmq3,
2395 			    List_T hitpairs, List_T samechr, List_T conc_transloc, List_T hits5, List_T hits3,
2396 
2397 			    char *queryuc_ptr_5, int querylength5, char *queryuc_ptr_3, int querylength3,
2398 			    char *quality_string_5, char *quality_string_3,
2399 
2400 			    int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
2401 			    Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
2402 			    Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
2403 
2404 			    int max_mismatches_refalt_5, int max_mismatches_refalt_3,
2405 			    int max_mismatches_ref_5, int max_mismatches_ref_3,
2406 			    int min_coverage_5, int min_coverage_3,
2407 			    Listpool_T listpool, Hitlistpool_T hitlistpool) {
2408   Stage3pair_T *stage3pairarray, stage3pair, newpair;
2409   Stage3end_T hit5, hit3;
2410   List_T result, singlehits5, singlehits3, p;
2411   int genestrand, sensedir;
2412   Pairtype_T pairtype;
2413   int best_nmatches_paired, best_nmatches_paired_5, best_nmatches_paired_3;
2414 
2415 
2416   *final_pairtype = choose_among_paired(&best_nmatches_paired,&best_nmatches_paired_5,&best_nmatches_paired_3,
2417 					hitpairs,samechr,conc_transloc);
2418   debug16(printf("Entered consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2419 
2420   /* DEBUGGING */
2421   if (*final_pairtype == CONCORDANT) {
2422     /* Have concordant results */
2423     debug16(printf("Have %d concordant results, %d samechr, and %d conc_transloc\n",
2424 		   List_length(hitpairs),List_length(samechr),List_length(conc_transloc)));
2425     for (p = samechr; p != NULL; p = List_next(p)) {
2426       stage3pair = (Stage3pair_T) List_head(p);
2427       Stage3pair_free(&stage3pair);
2428     }
2429     Hitlist_free(&samechr);
2430 
2431     for (p = conc_transloc; p != NULL; p = List_next(p)) {
2432       stage3pair = (Stage3pair_T) List_head(p);
2433       Stage3pair_free(&stage3pair);
2434     }
2435     Hitlist_free(&conc_transloc);
2436 
2437     if (1) {
2438       debug16(printf("Before removing overlaps, %d results\n",List_length(hitpairs)));
2439       result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2440       debug16(printf("After Stage3pair_optimal_score_prefinal, %d results\n",List_length(result)));
2441       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2442       debug16(printf("After Stage3pair_remove_overlaps, %d results\n",List_length(result)));
2443       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2444       debug16(printf("After Stage3pair_optimal_score_final, %d results\n",List_length(result)));
2445 
2446       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2447       /* result = Stage3pair_sort_distance(result); */
2448       debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2449 
2450     } else if (true /*gmap_improvement_p == false*/) {
2451       debug16(printf("No GMAP improvement: Before removing overlaps, %d results\n",List_length(hitpairs)));
2452       result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2453       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2454       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2455 
2456       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2457       /* result = Stage3pair_sort_distance(result); */
2458       debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2459 
2460     } else {
2461       debug16(printf("GMAP improvement: Before removing overlaps, %d results\n",List_length(hitpairs)));
2462       result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2463       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/false);
2464       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2465       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2466       /* result = Stage3pair_sort_distance(result); */
2467       debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2468 
2469 #ifdef TODO
2470       result = align_pair_with_gmap(&(*final_pairtype),result,
2471 				    queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2472 				    oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2473 				    cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/true);
2474       if (Stage3pair_sense_consistent_p(result) == false) {
2475 	debug16(printf("sense is not consistent\n"));
2476 	result = align_pair_with_gmap(&(*final_pairtype),result,
2477 				      queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2478 				      oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2479 				      cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/true);
2480       }
2481 #endif
2482 
2483       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2484       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2485       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2486 
2487       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2488     }
2489 
2490   } else if (*final_pairtype == PAIRED_UNSPECIFIED) {
2491     /* Have paired results */
2492     debug16(printf("Have paired unspecified\n"));
2493     for (p = hitpairs; p != NULL; p = List_next(p)) {
2494       stage3pair = (Stage3pair_T) List_head(p);
2495       Stage3pair_free(&stage3pair);
2496     }
2497     Hitlist_free(&hitpairs);
2498 
2499     for (p = conc_transloc; p != NULL; p = List_next(p)) {
2500       stage3pair = (Stage3pair_T) List_head(p);
2501       Stage3pair_free(&stage3pair);
2502     }
2503     Hitlist_free(&conc_transloc);
2504 
2505     if (true /*gmap_improvement_p == false*/) {
2506       debug16(printf("No GMAP improvement: Before removing overlaps, %d results\n",List_length(samechr)));
2507       result = Stage3pair_optimal_score(samechr,hitlistpool,querylength5,querylength3,/*finalp*/false);
2508       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2509       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2510       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2511     } else {
2512       debug16(printf("GMAP improvement: Before removing overlaps, %d results\n",List_length(samechr)));
2513       result = Stage3pair_optimal_score(samechr,hitlistpool,querylength5,querylength3,/*finalp*/false);
2514       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/false);
2515       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2516       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2517 
2518 #ifdef TODO
2519       result = align_pair_with_gmap(&(*final_pairtype),result,
2520 				    queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2521 				    oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2522 				    cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/false);
2523       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2524       result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2525       result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2526 #endif
2527 
2528       if (Stage3pair_concordantp(result) == true) {
2529 	debug16(printf("Found remaining concordant solution, so removing non-concordant ones\n"));
2530 	*final_pairtype = CONCORDANT;
2531 	result = Stage3pair_filter_nonconcordant(result,hitlistpool);
2532 	debug16(printf("Concordant results: %d\n",List_length(result)));
2533       } else {
2534 	*final_pairtype = PAIRED_UNSPECIFIED;
2535       }
2536 
2537       /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2538     }
2539 
2540     *final_pairtype = CONCORDANT; /* Because paired terminals are each concordant */
2541 
2542 
2543   } else if (*final_pairtype == CONCORDANT_TRANSLOCATIONS) {
2544     debug16(printf("Have %d concordant translocation results\n",List_length(conc_transloc)));
2545     for (p = hitpairs; p != NULL; p = List_next(p)) {
2546       stage3pair = (Stage3pair_T) List_head(p);
2547       Stage3pair_free(&stage3pair);
2548     }
2549     Hitlist_free(&hitpairs);
2550 
2551     for (p = samechr; p != NULL; p = List_next(p)) {
2552       stage3pair = (Stage3pair_T) List_head(p);
2553       Stage3pair_free(&stage3pair);
2554     }
2555     Hitlist_free(&samechr);
2556 
2557     result = Stage3pair_optimal_score(conc_transloc,hitlistpool,querylength5,querylength3,/*finalp*/false);
2558     result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/true,/*finalp*/true);
2559     result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2560 
2561     /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2562     debug16(printf("Finally, have %d concordant translocation results\n",List_length(result)));
2563 
2564   } else {
2565     debug16(printf("Have unpaired results\n"));
2566     /* Need to free conc_transloc, since we can get here with multiple results */
2567     for (p = conc_transloc; p != NULL; p = List_next(p)) {
2568       stage3pair = (Stage3pair_T) List_head(p);
2569       Stage3pair_free(&stage3pair);
2570     }
2571     Hitlist_free(&conc_transloc);
2572 
2573     result = (List_T) NULL;
2574   }
2575 
2576   debug16(printf("After eval: %d in result, %d hitpairs, %d conc_transloc, %d samechr\n",
2577 		 List_length(result),List_length(hitpairs),List_length(conc_transloc),List_length(samechr)));
2578 
2579   if (result == NULL) {
2580     debug(printf("Have %d hits on 5' end before filter_coverage with min-coverage of %d\n",
2581 		 List_length(hits5),min_coverage_5));
2582     singlehits5 = Stage3end_filter(hits5,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2583     debug(printf("Have %d hits on 5' end after filter_coverage\n",List_length(singlehits5)));
2584     debug16(printf("Entering Stage3end_optimal_score for 5' end\n"));
2585     singlehits5 = Stage3end_optimal_score(singlehits5,hitlistpool,querylength5,/*finalp*/false);
2586 
2587     /* singlehits5 = Stage3end_reject_trimlengths(singlehits5); */
2588     singlehits5 = Stage3end_linearize_5(singlehits5);
2589     singlehits5 = Stage3end_remove_overlaps(singlehits5,hitlistpool,querylength5,/*finalp*/true);
2590     singlehits5 = Stage3end_optimal_score(singlehits5,hitlistpool,querylength5,/*finalp*/true);
2591     /* singlehits5 = Stage3end_resolve_multimapping(singlehits5,hitlistpool); */
2592 
2593 
2594     debug(printf("Have %d hits on 3' end before filter_coverage with min_coverage of %d\n",
2595 		 List_length(hits3),min_coverage_3));
2596     singlehits3 = Stage3end_filter(hits3,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2597     debug(printf("Have %d hits on 3' end after filter_coverage\n",List_length(singlehits3)));
2598     debug16(printf("Entering Stage3end_optimal_score for 3' end\n"));
2599     singlehits3 = Stage3end_optimal_score(singlehits3,hitlistpool,querylength3,/*finalp*/false);
2600 
2601     /* singlehits3 = Stage3end_reject_trimlengths(singlehits3); */
2602     singlehits3 = Stage3end_linearize_3(singlehits3);
2603     singlehits3 = Stage3end_remove_overlaps(singlehits3,hitlistpool,querylength3,/*finalp*/true);
2604     singlehits3 = Stage3end_optimal_score(singlehits3,hitlistpool,querylength3,/*finalp*/true);
2605     /* singlehits3 = Stage3end_resolve_multimapping(singlehits3,hitlistpool); */
2606 
2607 
2608     debug16(printf("5' end has %d hits and 3' end has %d hits\n",
2609 		   List_length(singlehits5),List_length(singlehits3)));
2610 
2611     if (List_length(singlehits5) == 1 && List_length(singlehits3) == 1) {
2612       hit5 = (Stage3end_T) List_head(singlehits5);
2613       hit3 = (Stage3end_T) List_head(singlehits3);
2614       if ((genestrand = Stage3end_genestrand(hit5)) == Stage3end_genestrand(hit3) &&
2615 	  (sensedir = Stage3end_sensedir(hit5) == Stage3end_sensedir(hit3)) &&
2616 	  (pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/NULL)) != UNPAIRED) {
2617 	/* Convert unpaired uniq to a paired uniq */
2618 	debug16(printf("Converting unpaired uniq to paired uniq, with initial pairtype %s\n",Pairtype_string(pairtype)));
2619 	if ((newpair = Stage3pair_new(hit5,hit3,genestrand,sensedir,pairtype,
2620 				      mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2621 				      query5_compress_fwd,query5_compress_rev,
2622 				      query3_compress_fwd,query3_compress_rev,
2623 #if 0
2624 				      queryuc_ptr_5,queryuc_ptr_3,
2625 				      pairpool,dynprogL,dynprogM,dynprogR,oligoindices_minor,
2626 				      diagpool,cellpool,
2627 #endif
2628 				      listpool,/*expect_concordant_p*/pairtype == CONCORDANT ? true : false,
2629 				      /*transcriptome_guided_p*/false)) != NULL) {
2630 	  result = Hitlist_push(NULL,hitlistpool,(void *) newpair);
2631 
2632 	  *nhits5_primary = *nhits5_altloc = 0;
2633 	  *nhits3_primary = *nhits3_altloc = 0;
2634 	  *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2635 
2636 	  if (Stage3pair_altlocp(newpair) == true) {
2637 	    *npaths_primary = 0;
2638 	    *npaths_altloc = 1;
2639 	  } else {
2640 	    *npaths_primary = 1;
2641 	    *npaths_altloc = 0;
2642 	  }
2643 	  if (pairtype == CONCORDANT) {
2644 	    debug16(printf("final pairtype is CONCORDANT\n"));
2645 	    *final_pairtype = CONCORDANT;
2646 	  } else {
2647 	    debug16(printf("final pairtype is PAIRED_UNSPECIFIED\n"));
2648 	    *final_pairtype = PAIRED_UNSPECIFIED;
2649 	  }
2650 
2651 	  stage3pairarray = (Stage3pair_T *) CALLOC_OUT(1,sizeof(Stage3pair_T));
2652 	  stage3pairarray[0] = (Stage3pair_T) List_head(result);
2653 	  Hitlist_free(&result);
2654 
2655 #if 0
2656 	  Stage3pair_privatize(stage3pairarray,/*npairs*/1);
2657 #endif
2658 	  Stage3pair_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
2659 				   &(*first_absmq),&(*second_absmq),stage3pairarray,
2660 				   queryuc_ptr_5,queryuc_ptr_3,quality_string_5,quality_string_3);
2661 
2662 	  stage3list_gc(&singlehits3);
2663 	  stage3list_gc(&singlehits5);
2664 	  debug16(printf("1 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2665 	  return stage3pairarray;
2666 	}
2667       }
2668     }
2669 
2670     /* Fall through: halfmapping or unpaired */
2671     *npaths_primary = *npaths_altloc = 0;
2672     *final_pairtype = UNPAIRED;
2673 
2674     /* singlehits5 = Stage3end_filter(singlehits5,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5); */
2675     if (singlehits5 == NULL) {
2676       *nhits5_primary = *nhits5_altloc = 0;
2677       *stage3array5 = (Stage3end_T *) NULL;
2678     } else {
2679 #if 0
2680       singlehits5 = Stage3end_unalias_circular(singlehits5);
2681 #else
2682       singlehits5 = Stage3end_remove_circular_alias(singlehits5,hitlistpool); /* Contains a call to unalias_circular */
2683       singlehits5 = Stage3end_remove_duplicates(singlehits5,hitlistpool); /* Aliases can cause duplicates */
2684       Stage3end_count_hits(&(*nhits5_primary),&(*nhits5_altloc),singlehits5);
2685 #endif
2686       *stage3array5 = (Stage3end_T *) List_to_array_out(singlehits5,NULL); Hitlist_free(&singlehits5); /* Return value */
2687     }
2688 
2689     /* singlehits3 = Stage3end_filter(singlehits3,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3); */
2690     if (singlehits3 == NULL) {
2691       *nhits3_primary = *nhits3_altloc = 0;
2692       *stage3array3 = (Stage3end_T *) NULL;
2693     } else {
2694 #if 0
2695       singlehits3 = Stage3end_unalias_circular(singlehits3);
2696 #else
2697       singlehits3 = Stage3end_remove_circular_alias(singlehits3,hitlistpool); /* Contains a call to unalias_circular */
2698       singlehits3 = Stage3end_remove_duplicates(singlehits3,hitlistpool); /* Aliases can cause duplicates */
2699       Stage3end_count_hits(&(*nhits3_primary),&(*nhits3_altloc),singlehits3);
2700 #endif
2701       *stage3array3 = (Stage3end_T *) List_to_array_out(singlehits3,NULL); Hitlist_free(&singlehits3); /* Return value */
2702     }
2703 
2704     if ((*nhits5_primary) + (*nhits5_altloc) > 0) {
2705       if ((*nhits3_primary) + (*nhits3_altloc) == 1) {
2706       /* Use single 3' hit to guide sorting of multiple 5' hits */
2707         *stage3array5 = Stage3end_eval_and_sort_guided((*nhits5_primary) + (*nhits5_altloc),
2708    	                                               &(*first_absmq5),&(*second_absmq5),/*guide*/(*stage3array3)[0],
2709 						       *stage3array5,queryuc_ptr_5,quality_string_5,/*displayp*/true);
2710       } else {
2711         *stage3array5 = Stage3end_eval_and_sort((*nhits5_primary) + (*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2712  						*stage3array5,queryuc_ptr_5,quality_string_5,/*displayp*/true);
2713       }
2714     }
2715 
2716     if ((*nhits3_primary) + (*nhits3_altloc) > 0) {
2717       if ((*nhits5_primary) + (*nhits5_altloc) == 1) {
2718 	/* Use single 5' hit to guide sorting of multiple 3' hits */
2719         *stage3array3 = Stage3end_eval_and_sort_guided((*nhits3_primary) + (*nhits3_altloc),
2720                                                        &(*first_absmq3),&(*second_absmq3),/*guide*/(*stage3array5)[0],
2721 						       *stage3array3,queryuc_ptr_3,quality_string_3,/*displayp*/true);
2722       } else {
2723         *stage3array3 = Stage3end_eval_and_sort((*nhits3_primary) + (*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2724 						*stage3array3,queryuc_ptr_3,quality_string_3,/*displayp*/true);
2725       }
2726     }
2727     debug16(printf("Result is NULL, and we have %d hits on 5' end and %d hits on 3' end\n",
2728 		   (*nhits5_primary) + (*nhits5_altloc),(*nhits3_primary) + (*nhits3_altloc)));
2729     debug16(printf("2 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2730     return (Stage3pair_T *) NULL;
2731 
2732   } else {
2733     debug16(printf("final pairtype is %d\n",*final_pairtype));
2734     debug16(printf("Result is not NULL (%d paths), and we fall through to concordant, paired, or transloc pairs\n",
2735 		   List_length(result)));
2736     debug16(printf("Filtering: max_mismatches %d and %d, min_coverage %d and %d\n",
2737 		   max_mismatches_refalt_5,max_mismatches_refalt_3,min_coverage_5,min_coverage_3));
2738 
2739 #if 0
2740     /* Eliminates entire pair even if only one end is bad.  Should filter each end, and not each pair */
2741     result = Stage3pair_filter(result,hitlistpool,max_mismatches_5,max_mismatches_3,min_coverage_5,min_coverage_3);
2742 #endif
2743 
2744     if (result == NULL) {
2745       *npaths_primary = *npaths_altloc = 0;
2746       stage3list_gc(&hits3);
2747       stage3list_gc(&hits5);
2748       *nhits5_primary = *nhits5_altloc = 0;
2749       *nhits3_primary = *nhits3_altloc = 0;
2750       *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2751       debug16(printf("3 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2752       return (Stage3pair_T *) NULL;
2753 
2754     } else {
2755       /* result != NULL */
2756       /* Concordant, paired, or transloc pairs found.  Remove single hits. */
2757       Stage3pair_count_hits(&(*npaths_primary),&(*npaths_altloc),result);
2758       stage3pairarray = (Stage3pair_T *) List_to_array_out(result,NULL); Hitlist_free(&result); /* Return value */
2759 #if 0
2760       Stage3pair_privatize(stage3pairarray,/*npaths*/(*npaths_primary) + (*npaths_altloc));
2761 #endif
2762       Stage3pair_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
2763 			       &(*first_absmq),&(*second_absmq),
2764 			       stage3pairarray,queryuc_ptr_5,queryuc_ptr_3,quality_string_5,quality_string_3);
2765       stage3list_gc(&hits3);
2766       stage3list_gc(&hits5);
2767 
2768       *nhits5_primary = *nhits5_altloc = 0;
2769       *nhits3_primary = *nhits3_altloc = 0;
2770       *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2771       debug16(printf("4 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2772 
2773       return stage3pairarray;
2774     }
2775   }
2776 }
2777 
2778 
2779 Stage3pair_T *
Stage1_paired_read(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Pairtype_T * final_pairtype,Stage3end_T ** stage3array5,int * nhits5_primary,int * nhits5_altloc,int * first_absmq5,int * second_absmq5,Stage3end_T ** stage3array3,int * nhits3_primary,int * nhits3_altloc,int * first_absmq3,int * second_absmq3,Shortread_T queryseq5,Shortread_T queryseq3,Chrpos_T pairmax_linear,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)2780 Stage1_paired_read (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq, Pairtype_T *final_pairtype,
2781 		    Stage3end_T **stage3array5, int *nhits5_primary, int *nhits5_altloc, int *first_absmq5, int *second_absmq5,
2782 		    Stage3end_T **stage3array3, int *nhits3_primary, int *nhits3_altloc, int *first_absmq3, int *second_absmq3,
2783 		    Shortread_T queryseq5, Shortread_T queryseq3, Chrpos_T pairmax_linear,
2784 		    Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
2785 		    Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
2786   Stage3pair_T *stage3pairarray;
2787   bool abort_pairing_p, geneplus_abort_pairing_p, geneminus_abort_pairing_p;
2788   List_T hitpairs, geneplus_hitpairs, geneminus_hitpairs;
2789   List_T hits5, hits3, geneplus_hits5, geneplus_hits3, geneminus_hits5, geneminus_hits3;
2790   List_T samechr, geneplus_samechr, geneminus_samechr;
2791   List_T conc_transloc, geneplus_conc_transloc, geneminus_conc_transloc;
2792 
2793   int querylength5, querylength3;
2794   int max_mismatches_refalt_5, max_mismatches_refalt_3;
2795   int max_mismatches_ref_5, max_mismatches_ref_3;
2796   int min_coverage_5, min_coverage_3;
2797   int max_middle_insertions_5, max_middle_insertions_3, max_middle_deletions_5, max_middle_deletions_3;
2798   int max_insertionlen_5, max_insertionlen_3, max_deletionlen_5, max_deletionlen_3;
2799   Chrpos_T overall_max_distance_5, overall_max_distance_3, overall_end_distance_5, overall_end_distance_3;
2800   char *queryuc_ptr_5, *queryuc_ptr_3, *queryrc5, *queryrc3, *quality_string_5, *quality_string_3;
2801   int *mismatch_positions_alloc_5, *mismatch_positions_alloc_3;
2802   Compress_T query5_compress_fwd, query5_compress_rev, query3_compress_fwd, query3_compress_rev;
2803 
2804 
2805   querylength5 = Shortread_fulllength(queryseq5);
2806   querylength3 = Shortread_fulllength(queryseq3);
2807 
2808   /* Previously used user_mismatches_refalt_float in searching, now just for filtering */
2809   if (user_mismatches_refalt_float < 0.0) {
2810     max_mismatches_refalt_5 = querylength5;
2811     max_mismatches_refalt_3 = querylength3;
2812   } else if (user_mismatches_refalt_float > 0.0 && user_mismatches_refalt_float < 1.0) {
2813     max_mismatches_refalt_5 = (int) rint(user_mismatches_refalt_float * (double) querylength5);
2814     max_mismatches_refalt_3 = (int) rint(user_mismatches_refalt_float * (double) querylength3);
2815   } else {
2816     /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
2817     max_mismatches_refalt_5 = max_mismatches_refalt_3 = (int) user_mismatches_refalt_float;
2818   }
2819 
2820   if (user_mismatches_ref_float < 0.0) {
2821     max_mismatches_ref_5 = querylength5;
2822     max_mismatches_ref_3 = querylength3;
2823   } else if (user_mismatches_ref_float > 0.0 && user_mismatches_ref_float < 1.0) {
2824     max_mismatches_ref_5 = (int) rint(user_mismatches_ref_float * (double) querylength5);
2825     max_mismatches_ref_3 = (int) rint(user_mismatches_ref_float * (double) querylength3);
2826   } else {
2827     /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
2828     max_mismatches_ref_5 = max_mismatches_ref_3 = (int) user_mismatches_ref_float;
2829   }
2830 
2831   if (user_mincoverage_float <= 0.0) {
2832     min_coverage_5 = min_coverage_3 = 0;
2833   } else if (user_mincoverage_float <= 1.0) {
2834     /* Assuming that --min-coverage=1 must mean 1.0 and not a coverage of 1 bp */
2835     min_coverage_5 = (int) rint(user_mincoverage_float * (double) querylength5);
2836     min_coverage_3 = (int) rint(user_mincoverage_float * (double) querylength3);
2837   } else {
2838     min_coverage_5 = min_coverage_3 = (int) user_mincoverage_float;
2839   }
2840 
2841   if (max_middle_insertions_float > 0.0 && max_middle_insertions_float < 1.0) {
2842     max_middle_insertions_5 = (int) rint(max_middle_insertions_float * (double) querylength5);
2843     max_middle_insertions_3 = (int) rint(max_middle_insertions_float * (double) querylength3);
2844   } else {
2845     max_middle_insertions_5 = max_middle_insertions_3 = (int) max_middle_insertions_float;
2846   }
2847   max_insertionlen_5 = max_middle_insertions_5 > max_end_insertions ? max_middle_insertions_5 : max_end_insertions;
2848   max_insertionlen_3 = max_middle_insertions_3 > max_end_insertions ? max_middle_insertions_3 : max_end_insertions;
2849 
2850   if (max_middle_deletions_float > 0.0 && max_middle_deletions_float < 1.0) {
2851     max_middle_deletions_5 = (int) rint(max_middle_deletions_float * (double) querylength5);
2852     max_middle_deletions_3 = (int) rint(max_middle_deletions_float * (double) querylength3);
2853   } else {
2854     max_middle_deletions_5 = max_middle_insertions_3 = (int) max_middle_deletions_float;
2855   }
2856   max_deletionlen_5 = max_middle_deletions_5 > max_end_deletions ? max_middle_deletions_5 : max_end_deletions;
2857   max_deletionlen_3 = max_middle_deletions_3 > max_end_deletions ? max_middle_deletions_3 : max_end_deletions;
2858 
2859   overall_max_distance_5 = shortsplicedist;
2860   if ((Chrpos_T) max_middle_deletions_5 > overall_max_distance_5) {
2861     overall_max_distance_5 = (Chrpos_T) max_middle_deletions_5;
2862   }
2863   if ((Chrpos_T) max_middle_insertions_5 > overall_max_distance_5) {
2864     overall_max_distance_5 = (Chrpos_T) max_middle_insertions_5;
2865   }
2866   overall_end_distance_5 = shortsplicedist_novelend > (Chrpos_T) max_deletionlen_5 ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen_5;
2867 
2868   overall_max_distance_3 = shortsplicedist;
2869   if ((Chrpos_T) max_middle_deletions_3 > overall_max_distance_3) {
2870     overall_max_distance_3 = (Chrpos_T) max_middle_deletions_3;
2871   }
2872   if ((Chrpos_T) max_middle_insertions_3 > overall_max_distance_3) {
2873     overall_max_distance_3 = (Chrpos_T) max_middle_insertions_3;
2874   }
2875   overall_end_distance_3 = shortsplicedist_novelend > (Chrpos_T) max_deletionlen_3 ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen_3;
2876 
2877 
2878   queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
2879   queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
2880   quality_string_5 = Shortread_quality_string(queryseq5);
2881   quality_string_3 = Shortread_quality_string(queryseq3);
2882 
2883   queryrc5 = (char *) MALLOC((querylength5+1)*sizeof(char));
2884   queryrc3 = (char *) MALLOC((querylength3+1)*sizeof(char));
2885   make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
2886   make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
2887 
2888   mismatch_positions_alloc_5 = (int *) MALLOC((querylength5+1)*sizeof(int));
2889   mismatch_positions_alloc_3 = (int *) MALLOC((querylength3+1)*sizeof(int));
2890 
2891   query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
2892   query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
2893   query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
2894   query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
2895 
2896 
2897   if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED || mode == TTOC_STRANDED) {
2898     hitpairs = paired_read(&abort_pairing_p,&hits5,&hits3,&samechr,&conc_transloc,
2899 			   queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2900 			   mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2901 			   query5_compress_fwd,query5_compress_rev,
2902 			   query3_compress_fwd,query3_compress_rev,
2903 			   max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2904 			   /*genestrand*/0,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2905 			   overall_end_distance_5,overall_end_distance_3,
2906 			   max_mismatches_refalt_5,max_mismatches_refalt_3,
2907 			   max_mismatches_ref_5,max_mismatches_ref_3,
2908 			   min_coverage_5,min_coverage_3,
2909 			   intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2910     if (abort_pairing_p == true) {
2911       /* Too many concordant.  Could put into a separate split output */
2912       debug(printf("abort_pairing_p is true\n"));
2913       *final_pairtype = CONCORDANT;
2914     } else {
2915       *final_pairtype = CONCORDANT;
2916     }
2917 
2918     stage3pairarray =
2919       consolidate_paired_results(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
2920 				 &(*stage3array5),&(*nhits5_primary),&(*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2921 				 &(*stage3array3),&(*nhits3_primary),&(*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2922 				 hitpairs,samechr,conc_transloc,hits5,hits3,
2923 
2924 				 queryuc_ptr_5,querylength5,queryuc_ptr_3,querylength3,
2925 				 quality_string_5,quality_string_3,
2926 
2927 				 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2928 				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2929 
2930 				 max_mismatches_refalt_5,max_mismatches_refalt_3,
2931 				 max_mismatches_ref_5,max_mismatches_ref_3,
2932 				 min_coverage_5,min_coverage_3,
2933 				 /*oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,*/
2934 				 listpool,hitlistpool);
2935 
2936   } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2937     geneplus_hitpairs = paired_read(&geneplus_abort_pairing_p,&geneplus_hits5,&geneplus_hits3,
2938 				    &geneplus_samechr,&geneplus_conc_transloc,
2939 				    queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2940 				    mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2941 				    query5_compress_fwd,query5_compress_rev,
2942 				    query3_compress_fwd,query3_compress_rev,
2943 				    max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2944 				    /*genestrand*/+1,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2945 				    overall_end_distance_5,overall_end_distance_3,
2946 				    max_mismatches_refalt_5,max_mismatches_refalt_3,
2947 				    max_mismatches_ref_5,max_mismatches_ref_3,
2948 				    min_coverage_5,min_coverage_3,
2949 				    intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2950 
2951     geneminus_hitpairs = paired_read(&geneminus_abort_pairing_p,&geneminus_hits5,&geneminus_hits3,
2952 				     &geneminus_samechr,&geneminus_conc_transloc,
2953 				     queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2954 				     mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2955 				     query5_compress_fwd,query5_compress_rev,
2956 				     query3_compress_fwd,query3_compress_rev,
2957 				     max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2958 				     /*genestrand*/+2,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2959 				     overall_end_distance_5,overall_end_distance_3,
2960 				     max_mismatches_refalt_5,max_mismatches_refalt_3,
2961 				     max_mismatches_ref_5,max_mismatches_ref_3,
2962 				     min_coverage_5,min_coverage_3,
2963 				     intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2964 
2965     if (geneplus_abort_pairing_p == true || geneminus_abort_pairing_p == true) {
2966       /* Too many concordant.  Could put into a separate split output */
2967       debug(printf("abort_pairing_p is true\n"));
2968       *final_pairtype = CONCORDANT;
2969     } else {
2970       *final_pairtype = CONCORDANT;
2971     }
2972 
2973     hits5 = List_append(geneplus_hits5,geneminus_hits5);
2974     hits3 = List_append(geneplus_hits3,geneminus_hits3);
2975     hitpairs = List_append(geneplus_hitpairs,geneminus_hitpairs);
2976     samechr = List_append(geneplus_samechr,geneminus_samechr);
2977     conc_transloc = List_append(geneplus_conc_transloc,geneminus_conc_transloc);
2978 
2979     stage3pairarray =
2980       consolidate_paired_results(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
2981 				 &(*stage3array5),&(*nhits5_primary),&(*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2982 				 &(*stage3array3),&(*nhits3_primary),&(*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2983 				 hitpairs,samechr,conc_transloc,hits5,hits3,
2984 
2985 				 queryuc_ptr_5,querylength5,queryuc_ptr_3,querylength3,
2986 				 quality_string_5,quality_string_3,
2987 
2988 				 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2989 				 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2990 
2991 				 max_mismatches_refalt_5,max_mismatches_refalt_3,
2992 				 max_mismatches_ref_5,max_mismatches_ref_3,
2993 				 min_coverage_5,min_coverage_3,
2994 				 /*oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,*/
2995 				 listpool,hitlistpool);
2996 
2997   } else {
2998     fprintf(stderr,"Do not recognize mode %d\n",mode);
2999     abort();
3000   }
3001 
3002   Compress_free(&query5_compress_fwd);
3003   Compress_free(&query5_compress_rev);
3004   Compress_free(&query3_compress_fwd);
3005   Compress_free(&query3_compress_rev);
3006 
3007   FREE(mismatch_positions_alloc_3);
3008   FREE(mismatch_positions_alloc_5);
3009 
3010   FREE(queryrc3);
3011   FREE(queryrc5);
3012 
3013   debug(printf("Returning with final_pairtype %d\n",*final_pairtype));
3014   return stage3pairarray;
3015 }
3016 
3017 
3018 void
Stage1hr_setup(Univ_IIT_T transcript_iit_in,Transcriptome_T transcriptome_in,Genome_T transcriptomebits_in,bool use_only_transcriptome_p_in,Indexdb_T indexdb_fwd_in,Indexdb_T indexdb_rev_in,Indexdb_T indexdb_tr_in,int index1part_tr_in,int index1part_in,int index1interval_in,double user_mismatches_refalt_float_in,double user_mismatches_ref_float_in,double user_mincoverage_float_in,double max_middle_insertions_float_in,double max_middle_deletions_float_in,int max_end_insertions_in,int max_end_deletions_in,Univ_IIT_T chromosome_iit_in,int nchromosomes_in,Genome_T genomecomp_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Mode_T mode_in,int maxpaths_search_in,int maxpaths_report_in,bool find_dna_chimeras_p_in,bool distances_observed_p_in,int subopt_levels_in,bool splicingp_in,Chrpos_T shortsplicedist_in,Chrpos_T shortsplicedist_novelend_in,Chrpos_T min_intronlength_in,Chrpos_T expected_pairlength_in,Chrpos_T pairlength_deviation_in,int distantsplicing_penalty_in,int min_distantsplicing_end_matches_in,int min_distantsplicing_identity_in)3019 Stage1hr_setup (Univ_IIT_T transcript_iit_in, Transcriptome_T transcriptome_in, Genome_T transcriptomebits_in,
3020 		bool use_only_transcriptome_p_in,
3021 
3022 		Indexdb_T indexdb_fwd_in, Indexdb_T indexdb_rev_in, Indexdb_T indexdb_tr_in,
3023 
3024 		int index1part_tr_in, int index1part_in, int index1interval_in,
3025 		double user_mismatches_refalt_float_in, double user_mismatches_ref_float_in, double user_mincoverage_float_in,
3026 		double max_middle_insertions_float_in, double max_middle_deletions_float_in,
3027 		int max_end_insertions_in, int max_end_deletions_in,
3028 
3029 		Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
3030 		Genome_T genomecomp_in, Genome_T genomebits_in, Genome_T genomebits_alt_in,
3031 		Mode_T mode_in, int maxpaths_search_in, int maxpaths_report_in,
3032 
3033 		bool find_dna_chimeras_p_in, bool distances_observed_p_in, int subopt_levels_in,
3034 
3035 		bool splicingp_in, Chrpos_T shortsplicedist_in, Chrpos_T shortsplicedist_novelend_in,
3036 		Chrpos_T min_intronlength_in, Chrpos_T expected_pairlength_in, Chrpos_T pairlength_deviation_in,
3037 
3038 		int distantsplicing_penalty_in,
3039 		int min_distantsplicing_end_matches_in, int min_distantsplicing_identity_in) {
3040 
3041   use_only_transcriptome_p = use_only_transcriptome_p_in;
3042 
3043   transcript_iit = transcript_iit_in;
3044   transcriptome = transcriptome_in;
3045   transcriptomebits = transcriptomebits_in;
3046 
3047   indexdb_fwd = indexdb_fwd_in;
3048   indexdb_rev = indexdb_rev_in;
3049   indexdb_tr = indexdb_tr_in;
3050 
3051   index1part_tr = index1part_tr_in;
3052   index1part = index1part_in;
3053   index1interval = index1interval_in;
3054 
3055   user_mismatches_refalt_float = user_mismatches_refalt_float_in;
3056   user_mismatches_ref_float = user_mismatches_ref_float_in;
3057   user_mincoverage_float = user_mincoverage_float_in;
3058 
3059   max_middle_insertions_float = max_middle_insertions_float_in;
3060   max_middle_deletions_float = max_middle_deletions_float_in;
3061 
3062   chromosome_iit = chromosome_iit_in;
3063   circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
3064   nchromosomes = nchromosomes_in;
3065 
3066   genomecomp = genomecomp_in;
3067   genomebits = genomebits_in;
3068   genomebits_alt = genomebits_alt_in;
3069 
3070 #ifdef HAVE_64_BIT
3071   leftreadshift = 64 - index1part - index1part;
3072   oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part);
3073 #else
3074   leftreadshift = 32 - index1part - index1part;
3075   oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part);
3076 #endif
3077 
3078   poly_a = POLY_A & oligobase_mask;
3079   poly_t = POLY_T & oligobase_mask;
3080 
3081   mode = mode_in;
3082   maxpaths_search = maxpaths_search_in;
3083   maxpaths_report = maxpaths_report_in;
3084 
3085   find_dna_chimeras_p = find_dna_chimeras_p_in;
3086   distances_observed_p = distances_observed_p_in;
3087 
3088   subopt_levels = subopt_levels_in;
3089 
3090   max_end_insertions = max_end_insertions_in;
3091   max_end_deletions = max_end_deletions_in;
3092 
3093   splicingp = splicingp_in;
3094   shortsplicedist = shortsplicedist_in;
3095   shortsplicedist_novelend = shortsplicedist_novelend_in;
3096 
3097   min_intronlength = min_intronlength_in;
3098   expected_pairlength = expected_pairlength_in;
3099   pairlength_deviation = pairlength_deviation_in;
3100 
3101   distantsplicing_penalty = distantsplicing_penalty_in;
3102   min_distantsplicing_end_matches = min_distantsplicing_end_matches_in;
3103   min_distantsplicing_identity = min_distantsplicing_identity_in;
3104 
3105   if (genomebits_alt_in != NULL) {
3106     snpp = true;
3107   } else {
3108     snpp = false;
3109   }
3110 
3111   return;
3112 }
3113