1 static char rcsid[] = "$Id: stage1hr.c 222932 2020-06-29 16:11:37Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 #define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 #define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "stage1hr.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h> /* For memset() */
16 #include <math.h>
17 #include <ctype.h> /* for tolower() */
18 #include "assert.h"
19 #include "mem.h"
20 #include "types.h" /* Needed for HAVE_64_BIT */
21 #include "univcoord.h"
22
23 #include "reader.h"
24 #include "oligo.h"
25 #include "indexdb.h"
26
27 #include "list.h"
28 #include "intlist.h"
29 #include "splice.h"
30 #include "indel.h"
31 #include "stage3hr.h"
32 #include "concordance.h"
33 #include "substring.h"
34 #include "complement.h"
35 #include "compress.h"
36 #include "genome128_hr.h"
37 #include "genome_sites.h"
38 #include "maxent.h"
39 #include "maxent_hr.h"
40 #include "iitdef.h"
41 #include "univinterval.h"
42 #ifdef LARGE_GENOMES
43 #include "uint8list.h"
44 #else
45 #include "uintlist.h"
46 #endif
47 #include "record.h"
48
49 #ifdef HAVE_64_BIT
50 /* Oligospace_T is 64 bits */
51 #include "uint8table_rh.h"
52 #else
53 /* Oligospace_T is 32 bits */
54 #include "uinttable_rh.h"
55 #endif
56
57
58 #include "orderstat.h"
59 #include "path-solve.h"
60 #include "kmer-search.h"
61 #include "extension-search.h"
62 #include "segment-search.h"
63 #include "terminal.h"
64 #include "distant-rna.h"
65 #include "distant-dna.h"
66
67 #include "univdiag.h"
68 #include "univdiagdef.h"
69
70 #include "comp.h"
71
72
73 #ifdef WORDS_BIGENDIAN
74 #include "bigendian.h"
75 #endif
76
77
78 #ifdef HAVE_64_BIT
79 #ifdef LARGE_GENOMES
80 #else
81 #define DIAGONAL_ADD_QUERYPOS 1
82 #endif
83 #endif
84
85 /* Three methods for performing a multiway merge. Need to define one below. */
86
87 #define SPEED 1
88
89
90 /* Note FORMULA: formulas for querypos <-> diagonal (diagterm in call to Indexdb_read) are:
91
92 plus: diagonal = position + querylength - querypos
93 minus: diagonal = position + querypos + index1part
94
95 For minus, the index1part is needed in call to Indexdb_read because
96 position is stored at beginning of plus oligomer, which corresponds to
97 end of minus oligomer. As a result, we have the following formulas:
98
99 high genomic position = diagonal (corresponds to querypos =
100 querylength for plus, and querypos = 0 for minus)
101
102 low genomic position = diagonal - querylength (corresponds to querypos
103 = 0 for plus, and querypos = querylength for minus)
104
105 Holds when we use Reader_T to read from 5' end of forward query and 3'
106 end of revcomp query simultaneously. If we create a queryrc sequence,
107 then we can use just the plus formula, and convert the query
108 coordinates later.
109
110 */
111
112
113 #define MAX_ITER 3 /* For looping through Segment_search */
114
115 #define NO_EXTENSIONS_BEFORE_ZERO 1
116
117 #define ALLOW_MIDDLE_ALIGNMENTS 1
118
119 /* #define EXTRACT_GENOMICSEG 1 */
120 #ifdef EXTRACT_GENOMICSEG
121 #define MAX_INDEXSIZE 8
122 #endif
123
124
125 /* MAX_NALIGNMENTS of 2 vs 1 gets 1600 improvements in 275,000 reads */
126 /* MAX_NALIGNMENTS of 3 vs 2 gets 96 improvements in 275,000 reads */
127 #define MAX_NALIGNMENTS 3
128
129 #define MAX_ALLOCATION 200
130
131 #define PAIRMAX_ADDITIONAL 10000 /* Allows for finding of unpaired GMAP alignments beyond pairmax */
132
133 #define MIN_SIZELIMIT 100
134
135 /* static int kmer_search_sizelimit = 100; */
136 /* static int stage1hr_sizelimit = 3000; */
137 /* static int extension_search_sizelimit = 3000; */
138
139 #define POLY_A 0x00000000
140 #define POLY_T 0xFFFFFFFF
141 static Oligospace_T poly_a;
142 static Oligospace_T poly_t;
143
144 static bool use_only_transcriptome_p;
145 static bool remap_transcriptome_p = false;
146
147 static Univ_IIT_T transcript_iit;
148 static Transcriptome_T transcriptome;
149 static Genome_T transcriptomebits;
150
151 static Indexdb_T indexdb_fwd;
152 static Indexdb_T indexdb_rev;
153 static Indexdb_T indexdb_tr;
154
155 static int index1part;
156 static int index1part_tr;
157 static int index1interval;
158
159 static double user_mismatches_refalt_float;
160 static double user_mismatches_ref_float;
161 static double user_mincoverage_float;
162
163 static double max_middle_insertions_float;
164 static double max_middle_deletions_float;
165
166 static Univ_IIT_T chromosome_iit;
167 static int circular_typeint;
168
169 static int nchromosomes;
170 static Genome_T genomecomp;
171 static Genome_T genomebits;
172 static Genome_T genomebits_alt;
173
174 static int leftreadshift;
175 static Oligospace_T oligobase_mask; /* same as kmer_mask */
176
177
178 /* Mode */
179 static Mode_T mode;
180 static bool snpp;
181 static int maxpaths_search;
182 static int maxpaths_report;
183
184
185 /* Other distances */
186 static int max_end_insertions;
187 static int max_end_deletions;
188 static bool splicingp;
189 static Chrpos_T shortsplicedist;
190 static Chrpos_T shortsplicedist_novelend;
191
192
193 /* Penalties */
194 static int subopt_levels;
195
196 static bool find_dna_chimeras_p;
197 static bool distances_observed_p;
198
199 static Chrpos_T min_intronlength;
200 static Chrpos_T expected_pairlength;
201 static Chrpos_T pairlength_deviation;
202
203 static int distantsplicing_penalty;
204 static int min_distantsplicing_end_matches;
205 static int min_distantsplicing_identity;
206
207
208 #define A_CHAR 0x0
209 #define C_CHAR 0x1
210 #define G_CHAR 0x2
211 #define T_CHAR 0x3
212
213
214 /* Originally allowed only 1, to print only unique translocations.
215 But need to allow enough to avoid missing some translocations. */
216 /* For transcript splicing, need to increase MAXCHIMERAPATHS */
217 /* #define MAXCHIMERAPATHS 100 */
218 #define MAXCHIMERAPATHS 10000
219
220 #define NREQUIRED_FAST 2 /* For candidate generation using
221 multimiss. A value of 2 implies
222 specificity of a 24-mer, which
223 should be low for a human-sized
224 genome */
225
226 #define MAX_INDEX1INTERVAL 3
227
228
229
230 /* Overall flow */
231 #ifdef DEBUG
232 #define debug(x) x
233 #else
234 #define debug(x)
235 #endif
236
237 /* Stage1_init */
238 #ifdef DEBUG1
239 #define debug1(x) x
240 #else
241 #define debug1(x)
242 #endif
243
244 /* Filling oligos */
245 #ifdef DEBUG8
246 #define debug8(x) x
247 #else
248 #define debug8(x)
249 #endif
250
251 /* consolidate_paired_results and choose_among_paired */
252 #ifdef DEBUG16
253 #define debug16(x) x
254 #else
255 #define debug16(x)
256 #endif
257
258
259 #define T Stage1_T
260 static T
Stage1_new(int querylength)261 Stage1_new (int querylength) {
262 T new = (T) MALLOC(sizeof(*new));
263 int overhang = index1interval-1;
264 int mod;
265
266 new->plus_validp = (bool *) CALLOC(querylength+overhang,sizeof(bool));
267 new->minus_validp = (bool *) CALLOC(querylength+overhang,sizeof(bool));
268
269 new->plus_oligos = (Oligospace_T *) MALLOC((querylength+overhang)*sizeof(Oligospace_T));
270 new->minus_oligos = (Oligospace_T *) MALLOC((querylength+overhang)*sizeof(Oligospace_T));
271
272 new->retrievedp_allocated = (bool *) CALLOC(2 * (querylength+overhang),sizeof(bool));
273 new->plus_retrievedp = &(new->retrievedp_allocated[overhang]);
274 new->minus_retrievedp = &(new->retrievedp_allocated[(querylength+overhang)+overhang]);
275
276 #ifdef LARGE_GENOMES
277 new->positions_high_allocated = (unsigned char **) CALLOC(2 * (querylength+overhang),sizeof(unsigned char *));
278 new->plus_positions_high = &(new->positions_high_allocated[overhang]);
279 new->minus_positions_high = &(new->positions_high_allocated[(querylength+overhang)+overhang]);
280 #endif
281
282 new->positions_allocated = (UINT4 **) CALLOC(2 * (querylength+overhang),sizeof(UINT4 *));
283 new->plus_positions = &(new->positions_allocated[overhang]);
284 new->minus_positions = &(new->positions_allocated[(querylength+overhang)+overhang]);
285
286 new->npositions_allocated = (int *) CALLOC(2 * (querylength+overhang),sizeof(int));
287 new->plus_npositions = &(new->npositions_allocated[overhang]);
288 new->minus_npositions = &(new->npositions_allocated[(querylength+overhang)+overhang]);
289
290 /* Need to allocate (max_mismatches + 1), where max_mismatches is provided to Genome_mismatches_left or Genome_mismatches_right */
291 /* new->mismatch_positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int)); */
292 new->positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int));
293 new->spliceinfo = Spliceinfo_new(querylength);
294
295 /* Memory allocated for Segment_identify in segment-search.c, and
296 Merge_diagonals in kmer-search.c (which needs four sets of
297 arrays) */
298 #ifdef LARGE_GENOMES
299 new->stream_high_alloc = (unsigned char **) MALLOC(4*querylength*sizeof(unsigned char *));
300 new->gplus_stream_high_array_5 = &(new->stream_high_alloc[0]);
301 new->gminus_stream_high_array_5 = &(new->stream_high_alloc[querylength]);
302 new->gplus_stream_high_array_3 = &(new->stream_high_alloc[2*querylength]);
303 new->gminus_stream_high_array_3 = &(new->stream_high_alloc[3*querylength]);
304
305 new->stream_low_alloc = (UINT4 **) MALLOC(4*querylength*sizeof(UINT4 *));
306 new->gplus_stream_low_array_5 = &(new->stream_low_alloc[0]);
307 new->gminus_stream_low_array_5 = &(new->stream_low_alloc[querylength]);
308 new->gplus_stream_low_array_3 = &(new->stream_low_alloc[2*querylength]);
309 new->gminus_stream_low_array_3 = &(new->stream_low_alloc[3*querylength]);
310 #endif
311
312 new->stream_alloc = (Univcoord_T **) MALLOC(4*querylength*sizeof(Univcoord_T *));
313 new->gplus_stream_array_5 = &(new->stream_alloc[0]);
314 new->gminus_stream_array_5 = &(new->stream_alloc[querylength]);
315 new->gplus_stream_array_3 = &(new->stream_alloc[2*querylength]);
316 new->gminus_stream_array_3 = &(new->stream_alloc[3*querylength]);
317
318 #ifdef LARGE_GENOMES
319 new->tplus_stream_array = new->gplus_stream_low_array_5;
320 new->tminus_stream_array = new->gminus_stream_low_array_5;
321 #else
322 new->tplus_stream_array = new->gplus_stream_array_5;
323 new->tminus_stream_array = new->gminus_stream_array_5;
324 #endif
325
326 new->streamsize_alloc = (int *) MALLOC(4*querylength*sizeof(int));
327 new->tplus_streamsize_array = new->gplus_streamsize_array_5 = &(new->streamsize_alloc[0]);
328 new->tminus_streamsize_array = new->gminus_streamsize_array_5 = &(new->streamsize_alloc[querylength]);
329 new->gplus_streamsize_array_3 = &(new->streamsize_alloc[2*querylength]);
330 new->gminus_streamsize_array_3 = &(new->streamsize_alloc[3*querylength]);
331
332 new->querypos_diagterm_alloc = (int *) MALLOC(4*querylength*sizeof(int));
333 new->tplus_diagterm_array = new->gplus_diagterm_array_5 = &(new->querypos_diagterm_alloc[0]);
334 new->tminus_diagterm_array = new->gminus_diagterm_array_5 = &(new->querypos_diagterm_alloc[querylength]);
335 new->gplus_diagterm_array_3 = &(new->querypos_diagterm_alloc[2*querylength]);
336 new->gminus_diagterm_array_3 = &(new->querypos_diagterm_alloc[3*querylength]);
337
338
339 for (mod = 0; mod < 2*index1interval; mod++) {
340 #ifdef LARGE_GENOMES
341 new->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
342 new->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
343 new->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
344 new->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
345 #endif
346 new->plus_rawpositions_5[mod] = (UINT4 *) NULL;
347 new->minus_rawpositions_5[mod] = (UINT4 *) NULL;
348 new->plus_rawpositions_3[mod] = (UINT4 *) NULL;
349 new->minus_rawpositions_3[mod] = (UINT4 *) NULL;
350 }
351
352 /* Uses Listpool_T procedures */
353 new->queryfwd_plus_set = (List_T) NULL;
354 new->queryfwd_minus_set = (List_T) NULL;
355 new->queryrev_plus_set = (List_T) NULL;
356 new->queryrev_minus_set = (List_T) NULL;
357
358 return new;
359 }
360
361 #ifdef DEBUG
362 static void
Stage1_dump(T this,int querylength)363 Stage1_dump (T this, int querylength) {
364 int query_lastpos = querylength - index1part, querypos;
365 int i;
366
367 for (querypos = 0; querypos <= query_lastpos; querypos++) {
368 if (this->plus_retrievedp[querypos] == true) {
369 printf("plus %d (%d):",querypos,this->plus_npositions[querypos]);
370 for (i = 0; i < this->plus_npositions[querypos]; i++) {
371 printf(" %u",this->plus_positions[querypos][i]);
372 }
373 printf("\n");
374 }
375 if (this->minus_retrievedp[querypos] == true) {
376 printf("minus %d (%d):",querypos,this->minus_npositions[querypos]);
377 for (i = 0; i < this->minus_npositions[querypos]; i++) {
378 printf(" %u",this->minus_positions[querypos][i]);
379 }
380 printf("\n");
381 }
382 }
383 printf("\n");
384 return;
385 }
386 #endif
387
388
389 static void
Stage1_init(T this,char * queryuc_ptr,int querylength,int genestrand)390 Stage1_init (T this, char *queryuc_ptr, int querylength, int genestrand) {
391 Reader_T reader;
392 Oligostate_T last_state;
393 Oligospace_T forward, revcomp, forward_oligo, revcomp_oligo;
394 int querypos, query_lastpos;
395 int mod;
396
397
398 debug1(printf("%s\n",queryuc_ptr));
399 reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
400 last_state = INIT;
401 forward = revcomp = 0;
402 mod = 0;
403
404 while (mod < index1interval &&
405 (last_state = Oligo_next_5(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
406 while (mod < index1interval && mod < querypos) {
407 debug1(printf("Skipping rawpositions_5 %d, because querypos is not the expected one\n",mod));
408 #ifdef LARGE_GENOMES
409 this->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
410 this->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
411 #endif
412 this->plus_rawpositions_5[mod] = (UINT4 *) NULL;
413 this->minus_rawpositions_5[mod] = (UINT4 *) NULL;
414 this->plus_nrawpositions_5[mod] = 0;
415 this->minus_nrawpositions_5[mod] = 0;
416 mod++;
417 }
418
419 if (mod < index1interval) {
420 forward_oligo = forward & oligobase_mask;
421 this->plus_diagterms_5[mod] = querylength - querypos;
422 #ifdef LARGE_GENOMES
423 this->plus_nrawpositions_5[mod] =
424 Indexdb_largeptr(&(this->plus_rawpositions_high_5[mod]),&(this->plus_rawpositions_5[mod]),
425 /*plus_indexdb*/indexdb_fwd,forward_oligo);
426 #else
427 this->plus_nrawpositions_5[mod] =
428 Indexdb_ptr(&(this->plus_rawpositions_5[mod]),/*plus_indexdb*/indexdb_fwd,forward_oligo);
429 #endif
430 debug1(printf("(1) plus_nrawpositions_5[%d] = %d, oligo %016lX\n",mod,this->plus_nrawpositions_5[mod],forward_oligo));
431
432 revcomp_oligo = (revcomp >> leftreadshift) & oligobase_mask;
433 this->minus_diagterms_5[mod] = querypos + index1part;
434 #ifdef LARGE_GENOMES
435 this->minus_nrawpositions_5[mod] =
436 Indexdb_largeptr(&(this->minus_rawpositions_high_5[mod]),&(this->minus_rawpositions_5[mod]),
437 /*minus_indexdb*/indexdb_rev,revcomp_oligo);
438 #else
439 this->minus_nrawpositions_5[mod] =
440 Indexdb_ptr(&(this->minus_rawpositions_5[mod]),/*minus_indexdb*/indexdb_rev,revcomp_oligo);
441 #endif
442 debug1(printf("(2) minus_nrawpositions_5[%d] = %d, oligo %016lX\n",mod,this->minus_nrawpositions_5[mod],revcomp_oligo));
443
444 debug1(printf("5' end: %s %s: %d plus positions, %d minus positions, genestrand %d\n",
445 Oligo_one_nt(forward_oligo,index1part),Oligo_one_nt(revcomp_oligo,index1part),
446 this->plus_nrawpositions_5[mod],this->minus_nrawpositions_5[mod],genestrand));
447 }
448
449 mod++;
450 }
451
452 while (mod < index1interval) {
453 debug1(printf("Skipping rawpositions_5 %d, because last_state was DONE\n",mod));
454 #ifdef LARGE_GENOMES
455 this->plus_rawpositions_high_5[mod] = (unsigned char *) NULL;
456 this->minus_rawpositions_high_5[mod] = (unsigned char *) NULL;
457 #endif
458 this->plus_rawpositions_5[mod] = (UINT4 *) NULL;
459 this->minus_rawpositions_5[mod] = (UINT4 *) NULL;
460 this->plus_nrawpositions_5[mod] = 0;
461 this->minus_nrawpositions_5[mod] = 0;
462 mod++;
463 }
464 Reader_free(&reader);
465
466
467 query_lastpos = querylength - index1part;
468 reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
469 last_state = INIT;
470 forward = revcomp = 0;
471 mod = 0;
472 while (mod < index1interval &&
473 (last_state = Oligo_next_3(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
474 while (mod < index1interval && mod < query_lastpos - querypos) {
475 debug1(printf("Skipping rawpositions_3 %d, because querypos is not the expected one\n",mod));
476 #ifdef LARGE_GENOMES
477 this->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
478 this->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
479 #endif
480 this->plus_rawpositions_3[mod] = (UINT4 *) NULL;
481 this->minus_rawpositions_3[mod] = (UINT4 *) NULL;
482 this->plus_nrawpositions_3[mod] = 0;
483 this->minus_nrawpositions_3[mod] = 0;
484 mod++;
485 }
486
487 if (mod < index1interval) {
488 forward_oligo = (forward >> leftreadshift) & oligobase_mask;
489 this->plus_diagterms_3[mod] = querylength - querypos;
490 #ifdef LARGE_GENOMES
491 this->plus_nrawpositions_3[mod] =
492 Indexdb_largeptr(&(this->plus_rawpositions_high_3[mod]),&(this->plus_rawpositions_3[mod]),
493 /*plus_indexdb*/indexdb_fwd,forward_oligo);
494 #else
495 this->plus_nrawpositions_3[mod] =
496 Indexdb_ptr(&(this->plus_rawpositions_3[mod]),/*plus_indexdb*/indexdb_fwd,forward_oligo);
497 #endif
498 debug1(printf("(3) plus_nrawpositions_3[%d] = %d, oligo %016lX\n",mod,this->plus_nrawpositions_3[mod],forward_oligo));
499
500 revcomp_oligo = revcomp & oligobase_mask;
501 this->minus_diagterms_3[mod] = querypos + index1part;
502 #ifdef LARGE_GENOMES
503 this->minus_nrawpositions_3[mod] =
504 Indexdb_largeptr(&(this->minus_rawpositions_high_3[mod]),&(this->minus_rawpositions_3[mod]),
505 /*minus_indexdb*/indexdb_rev,revcomp_oligo);
506 #else
507 this->minus_nrawpositions_3[mod] =
508 Indexdb_ptr(&(this->minus_rawpositions_3[mod]),/*minus_indexdb*/indexdb_rev,revcomp_oligo);
509 #endif
510 debug1(printf("(4) minus_nrawpositions_3[%d] = %d, oligo %016lX\n",mod,this->minus_nrawpositions_3[mod],revcomp_oligo));
511
512 debug1(printf("3' end: %s %s: %d plus positions, %d minus positions, genestrand %d\n",
513 Oligo_one_nt(forward_oligo,index1part),Oligo_one_nt(revcomp_oligo,index1part),
514 this->plus_nrawpositions_3[mod],this->minus_nrawpositions_3[mod],genestrand));
515 }
516
517 mod++;
518 }
519
520 while (mod < index1interval) {
521 /* printf("Skipping rawpositions_3 %d, because last_state was DONE\n",mod); */
522 #ifdef LARGE_GENOMES
523 this->plus_rawpositions_high_3[mod] = (unsigned char *) NULL;
524 this->minus_rawpositions_high_3[mod] = (unsigned char *) NULL;
525 #endif
526 this->plus_rawpositions_3[mod] = (UINT4 *) NULL;
527 this->minus_rawpositions_3[mod] = (UINT4 *) NULL;
528 this->plus_nrawpositions_3[mod] = 0;
529 this->minus_nrawpositions_3[mod] = 0;
530 mod++;
531 }
532 Reader_free(&reader);
533
534 return;
535 }
536
537
538
539
540 /* onep indicates whether Kmer_search_genome_one_end was run, which
541 fills beyond the first index1interval */
542 static void
Stage1_rearrange(T this,int querylength,bool onep)543 Stage1_rearrange (T this, int querylength, bool onep) {
544 int query_lastpos = querylength - index1part;
545 int mod;
546
547 for (mod = 0; mod < index1interval; mod++) {
548 this->plus_retrievedp[mod] = true;
549 #ifdef LARGE_GENOMES
550 this->plus_positions_high[mod] = this->plus_rawpositions_high_5[mod];
551 #endif
552 this->plus_positions[mod] = this->plus_rawpositions_5[mod];
553 this->plus_npositions[mod] = this->plus_nrawpositions_5[mod];
554
555 this->plus_retrievedp[query_lastpos-mod] = true;
556 #ifdef LARGE_GENOMES
557 this->plus_positions_high[query_lastpos-mod] = this->plus_rawpositions_high_3[mod];
558 #endif
559 this->plus_positions[query_lastpos-mod] = this->plus_rawpositions_3[mod];
560 this->plus_npositions[query_lastpos-mod] = this->plus_nrawpositions_3[mod];
561
562 /* Using new sarray and segment-based conventions */
563 this->minus_retrievedp[query_lastpos-mod] = true;
564 #ifdef LARGE_GENOMES
565 this->minus_positions_high[query_lastpos-mod] = this->minus_rawpositions_high_5[mod];
566 #endif
567 this->minus_positions[query_lastpos-mod] = this->minus_rawpositions_5[mod];
568 this->minus_npositions[query_lastpos-mod] = this->minus_nrawpositions_5[mod];
569
570 this->minus_retrievedp[mod] = true;
571 #ifdef LARGE_GENOMES
572 this->minus_positions_high[mod] = this->minus_rawpositions_high_3[mod];
573 #endif
574 this->minus_positions[mod] = this->minus_rawpositions_3[mod];
575 this->minus_npositions[mod] = this->minus_nrawpositions_3[mod];
576
577 #if 0
578 printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
579 mod,this->plus_positions[mod],this->plus_npositions[mod]);
580 printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
581 query_lastpos-mod,this->plus_positions[query_lastpos-mod],this->plus_npositions[query_lastpos-mod]);
582
583 printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
584 mod,this->minus_positions[mod],this->minus_npositions[mod]);
585 printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
586 query_lastpos-mod,this->minus_positions[query_lastpos-mod],this->minus_npositions[query_lastpos-mod]);
587 #endif
588 }
589
590 if (onep == true) {
591 /* Not currently doing one_kmer */
592 for (mod = 0; mod < index1interval; mod++) {
593 this->plus_retrievedp[index1part+mod] = true;
594 #ifdef LARGE_GENOMES
595 this->plus_positions_high[index1part+mod] = this->plus_rawpositions_high_5[index1interval+mod];
596 #endif
597 this->plus_positions[index1part+mod] = this->plus_rawpositions_5[index1interval+mod];
598 this->plus_npositions[index1part+mod] = this->plus_nrawpositions_5[index1interval+mod];
599
600 this->plus_retrievedp[query_lastpos-index1part-mod] = true;
601 #ifdef LARGE_GENOMES
602 this->plus_positions_high[query_lastpos-index1part-mod] = this->plus_rawpositions_high_3[index1interval+mod];
603 #endif
604 this->plus_positions[query_lastpos-index1part-mod] = this->plus_rawpositions_3[index1interval+mod];
605 this->plus_npositions[query_lastpos-index1part-mod] = this->plus_nrawpositions_3[index1interval+mod];
606
607 /* Using new sarray and segment-based conventions */
608 this->minus_retrievedp[query_lastpos-index1part-mod] = true;
609 #ifdef LARGE_GENOMES
610 this->minus_positions_high[query_lastpos-index1part-mod] = this->minus_rawpositions_high_5[index1interval+mod];
611 #endif
612 this->minus_positions[query_lastpos-index1part-mod] = this->minus_rawpositions_5[index1interval+mod];
613 this->minus_npositions[query_lastpos-index1part-mod] = this->minus_nrawpositions_5[index1interval+mod];
614
615 this->minus_retrievedp[index1part+mod] = true;
616 #ifdef LARGE_GENOMES
617 this->minus_positions_high[index1part+mod] = this->minus_rawpositions_high_3[index1interval+mod];
618 #endif
619 this->minus_positions[index1part+mod] = this->minus_rawpositions_3[index1interval+mod];
620 this->minus_npositions[index1part+mod] = this->minus_nrawpositions_3[index1interval+mod];
621
622 #if 0
623 printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
624 index1part+mod,this->plus_positions[index1part+mod],this->plus_npositions[index1part+mod]);
625 printf("Initializing plus_positions[%d] to be %p, with count of %d\n",
626 query_lastpos-index1part-mod,this->plus_positions[query_lastpos-index1part-mod],
627 this->plus_npositions[query_lastpos-index1part-mod]);
628
629 printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
630 index1part+mod,this->minus_positions[index1part+mod],this->minus_npositions[index1part+mod]);
631 printf("Initializing minus_positions[%d] to be %p, with count of %d\n",
632 query_lastpos-index1part-mod,this->minus_positions[query_lastpos-index1part-mod],
633 this->minus_npositions[query_lastpos-index1part-mod]);
634 #endif
635 }
636 }
637
638 /* Stage1_dump(this,querylength); */
639
640 return;
641 }
642
643
644 void
Stage1_fill_all_oligos(T this,char * queryuc_ptr,int querylength,int genestrand)645 Stage1_fill_all_oligos (T this, char *queryuc_ptr, int querylength, int genestrand) {
646 Reader_T reader;
647 int query_lastpos, querypos, querypos_rc;
648 Oligostate_T last_state = INIT;
649 Oligospace_T forward = 0, revcomp = 0, oligo;
650 #ifdef HAVE_64_BIT
651 Uint8table_T plus_seenp, minus_seenp;
652 #else
653 Uinttable_T plus_seenp, minus_seenp;
654 #endif
655
656 #ifdef HAVE_64_BIT
657 plus_seenp = Uint8table_new(/*hint*/querylength,/*save_contents_p*/false);
658 minus_seenp = Uint8table_new(/*hint*/querylength,/*save_contents_p*/false);
659 #else
660 plus_seenp = Uinttable_new(/*hint*/querylength,/*save_contents_p*/false);
661 minus_seenp = Uinttable_new(/*hint*/querylength,/*save_contents_p*/false);
662 #endif
663
664 query_lastpos = querylength - index1part;
665 reader = Reader_new(queryuc_ptr,/*querystart*/0,/*queryend*/querylength);
666
667 /* Note: leftshifting is done here, rather than in Oligo_lookup */
668 /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
669 /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
670 querypos = 0;
671 while ((last_state = Oligo_next_5(last_state,&querypos,&forward,&revcomp,reader,genestrand)) != DONE) {
672 if (last_state != VALID) {
673 /* querypos is not defined when last_state != VALID */
674 debug8(printf("oligo at plus %d, minus %d is not valid\n",querypos,querypos_rc));
675 } else {
676 querypos_rc = query_lastpos - querypos;
677 oligo = this->plus_oligos[querypos] = forward & oligobase_mask;
678 debug8(printf("Putting oligo %016lX at plus %d\n",oligo,querypos));
679 #ifdef HAVE_64_BIT
680 if (Uint8table_get(plus_seenp,oligo) != NULL) {
681 debug8(printf("oligo at plus %d already seen, so marking as invalid\n",querypos));
682 this->plus_validp[querypos] = false;
683 } else {
684 this->plus_validp[querypos] = true;
685 Uint8table_put(plus_seenp,oligo,(void *) true);
686 }
687 #else
688 if (Uinttable_get(plus_seenp,oligo) != NULL) {
689 debug8(printf("oligo at plus %d already seen, so marking as invalid\n",querypos));
690 this->plus_validp[querypos] = false;
691 } else {
692 this->plus_validp[querypos] = true;
693 Uinttable_put(plus_seenp,oligo,(void *) true);
694 }
695 #endif
696
697 oligo = this->minus_oligos[querypos_rc] = (revcomp >> leftreadshift) & oligobase_mask;
698 debug8(printf("Putting oligo %016lX at minus %d\n",oligo,querypos_rc));
699 #ifdef HAVE_64_BIT
700 if (Uint8table_get(minus_seenp,oligo) != NULL) {
701 debug8(printf("oligo at minus %d already seen, so marking as invalid\n",querypos));
702 this->minus_validp[querypos_rc] = false;
703 } else {
704 this->minus_validp[querypos_rc] = true;
705 Uint8table_put(minus_seenp,oligo,(void *) true);
706 }
707 #else
708 if (Uinttable_get(minus_seenp,oligo) != NULL) {
709 debug8(printf("oligo at minus %d already seen, so marking as invalid\n",querypos));
710 this->minus_validp[querypos_rc] = false;
711 } else {
712 this->minus_validp[querypos_rc] = true;
713 Uinttable_put(minus_seenp,oligo,(void *) true);
714 }
715 #endif
716 }
717 }
718
719 Reader_free(&reader);
720
721 #ifdef HAVE_64_BIT
722 Uint8table_free(&minus_seenp);
723 Uint8table_free(&plus_seenp);
724 #else
725 Uinttable_free(&minus_seenp);
726 Uinttable_free(&plus_seenp);
727 #endif
728
729 return;
730 }
731
732
733 void
Stage1_fill_position_ptrs_plus(T this,int querystart,int queryend,int genestrand)734 Stage1_fill_position_ptrs_plus (T this, int querystart, int queryend, int genestrand) {
735 int query_lastpos, querypos;
736 Indexdb_T plus_indexdb;
737
738 if (genestrand == +2) {
739 plus_indexdb = indexdb_rev;
740 } else {
741 plus_indexdb = indexdb_fwd;
742 }
743
744 query_lastpos = queryend - index1part;
745
746 /* Assumes that plus_oligos have been filled in (by Stage1_fill_all_oligos */
747 /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
748 /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
749 for (querypos = querystart; querypos <= query_lastpos; querypos++) {
750 if (this->plus_retrievedp[querypos] == true) {
751 /* No need to do anything */
752 } else if (this->plus_validp[querypos] == false) {
753 /* Oligo not valid */
754 } else {
755 #ifdef LARGE_GENOMES
756 this->plus_npositions[querypos] =
757 Indexdb_largeptr(&this->plus_positions_high[querypos],&this->plus_positions[querypos],
758 plus_indexdb,this->plus_oligos[querypos]);
759 #else
760 this->plus_npositions[querypos] =
761 Indexdb_ptr(&this->plus_positions[querypos],plus_indexdb,this->plus_oligos[querypos]);
762 #endif
763 this->plus_retrievedp[querypos] = true;
764 }
765 }
766
767 return;
768 }
769
770 void
Stage1_fill_position_ptrs_minus(T this,int querystart,int queryend,int genestrand)771 Stage1_fill_position_ptrs_minus (T this, int querystart, int queryend, int genestrand) {
772 int query_lastpos, querypos;
773 Indexdb_T minus_indexdb;
774
775 if (genestrand == +2) {
776 minus_indexdb = indexdb_fwd;
777 } else {
778 minus_indexdb = indexdb_rev;
779 }
780
781 query_lastpos = queryend - index1part;
782
783 /* Assumes that minus_oligos have been filled in (by Stage1_fill_all_oligos */
784 /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
785 /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
786 for (querypos = querystart; querypos <= query_lastpos; querypos++) {
787 if (this->minus_retrievedp[querypos] == true) {
788 /* No need to do anything */
789 } else if (this->minus_validp[querypos] == false) {
790 /* Oligo not valid */
791 } else {
792 #ifdef LARGE_GENOMES
793 this->minus_npositions[querypos] =
794 Indexdb_largeptr(&this->minus_positions_high[querypos],&this->minus_positions[querypos],
795 minus_indexdb,this->minus_oligos[querypos]);
796 #else
797 this->minus_npositions[querypos] =
798 Indexdb_ptr(&this->minus_positions[querypos],minus_indexdb,this->minus_oligos[querypos]);
799 #endif
800 this->minus_retrievedp[querypos] = true;
801 }
802 }
803
804 return;
805 }
806
807
808 #if 0
809 static void
810 Stage1_ptr_all_positions (T this, int querylength, int genestrand) {
811 int query_lastpos, querypos, querypos_rc;
812 Indexdb_T plus_indexdb, minus_indexdb;
813
814 if (genestrand == +2) {
815 plus_indexdb = indexdb_rev;
816 minus_indexdb = indexdb_fwd;
817 } else {
818 plus_indexdb = indexdb_fwd;
819 minus_indexdb = indexdb_rev;
820 }
821
822 query_lastpos = querylength - index1part;
823
824 /* Note: leftshifting is done here, rather than in Oligo_lookup */
825 /* Format is 010llX because 19-mer is maximum k-mer size, which would require 10 chars */
826 /* debug(printf("oligobase_mask: %010llX\n",oligobase_mask)); */
827 for (querypos = 0, querypos_rc = query_lastpos; querypos <= query_lastpos; querypos++, --querypos_rc) {
828 if (this->plus_retrievedp[querypos] == true) {
829 /* No need to do anything */
830 } else if (this->plus_validp[querypos] == false) {
831 /* Oligo not valid */
832 } else {
833 #ifdef LARGE_GENOMES
834 this->plus_npositions[querypos] =
835 Indexdb_largeptr(&this->plus_positions_high[querypos],&this->plus_positions[querypos],
836 plus_indexdb,this->plus_oligos[querypos]);
837 #else
838 this->plus_npositions[querypos] =
839 Indexdb_ptr(&this->plus_positions[querypos],plus_indexdb,this->plus_oligos[querypos]);
840 #endif
841 this->plus_retrievedp[querypos] = true;
842 }
843
844 if (this->minus_retrievedp[querypos_rc] == true) {
845 /* No need to do anything */
846 } else if (this->minus_validp[querypos_rc] == false) {
847 /* Oligo not valid */
848 } else {
849 #ifdef LARGE_GENOMES
850 this->minus_npositions[querypos_rc] =
851 Indexdb_largeptr(&this->minus_positions_high[querypos_rc],&this->minus_positions[querypos_rc],
852 minus_indexdb,this->minus_oligos[querypos_rc]);
853 #else
854 this->minus_npositions[querypos_rc] =
855 Indexdb_ptr(&this->minus_positions[querypos_rc],minus_indexdb,this->minus_oligos[querypos_rc]);
856 #endif
857 this->minus_retrievedp[querypos_rc] = true;
858 }
859 }
860
861 return;
862 }
863 #endif
864
865
866 static int
determine_sizelimit(T this,int querylength)867 determine_sizelimit (T this, int querylength) {
868 int cutoff, *set, count;
869 int n;
870 int query_lastpos, querypos, querypos_rc;
871
872 assert(querylength >= index1part);
873
874 query_lastpos = querylength - index1part;
875 set = (int *) MALLOC(2*(query_lastpos+1)*sizeof(int));
876 n = 0;
877 for (querypos = 0, querypos_rc = query_lastpos; querypos <= query_lastpos; querypos++, --querypos_rc) {
878 if (this->plus_validp[querypos] == true) {
879 set[n++] = count = this->plus_npositions[querypos];
880 }
881
882 if (this->minus_validp[querypos_rc] == true) {
883 set[n++] = count = this->minus_npositions[querypos_rc];
884 }
885 }
886
887 if (n < 5) {
888 cutoff = MIN_SIZELIMIT;
889 } else if ((cutoff = Orderstat_int_pct_inplace(set,n,/*pct*/0.60)) < MIN_SIZELIMIT) {
890 cutoff = MIN_SIZELIMIT;
891 }
892 FREE(set);
893
894 return cutoff;
895 }
896
897
898
899
900
901 void
Stage1_free(T * old)902 Stage1_free (T *old) {
903
904 /* Stage1hr_check(*old); */
905
906 if (*old) {
907 #if 0
908 /* Now pointing to data structure, and not copying values */
909 for (i = 0; i <= query_lastpos; i++) {
910 if ((*old)->plus_retrievedp[i] == true) {
911 #ifdef LARGE_GENOMES
912 FREE((*old)->plus_positions_high[i]);
913 #endif
914 FREE((*old)->plus_positions[i]);
915 }
916
917 if ((*old)->minus_retrievedp[i] == true) {
918 #ifdef LARGE_GENOMES
919 FREE((*old)->minus_positions_high[i]);
920 #endif
921 FREE((*old)->minus_positions[i]);
922 }
923 }
924 #endif
925
926 /* FREE((*old)->mismatch_positions_alloc); */
927 FREE((*old)->positions_alloc);
928 Spliceinfo_free(&(*old)->spliceinfo);
929
930 #ifdef LARGE_GENOMES
931 FREE((*old)->stream_high_alloc);
932 FREE((*old)->stream_low_alloc);
933 #endif
934 FREE((*old)->stream_alloc);
935 FREE((*old)->streamsize_alloc);
936 FREE((*old)->querypos_diagterm_alloc);
937
938
939 #ifdef LARGE_GENOMES
940 FREE((*old)->positions_high_allocated);
941 #endif
942 FREE((*old)->positions_allocated);
943 FREE((*old)->npositions_allocated);
944
945 FREE((*old)->retrievedp_allocated);
946
947 FREE((*old)->minus_oligos);
948 FREE((*old)->plus_oligos);
949
950 FREE((*old)->minus_validp);
951 FREE((*old)->plus_validp);
952
953 Elt_gc(&(*old)->queryfwd_plus_set);
954 Elt_gc(&(*old)->queryfwd_minus_set);
955 Elt_gc(&(*old)->queryrev_plus_set);
956 Elt_gc(&(*old)->queryrev_minus_set);
957
958 FREE(*old);
959 }
960
961 return;
962 }
963
964
965 /************************************************************************/
966
967 static char complCode[128] = COMPLEMENT_LC;
968
969 static void
make_complement_buffered(char * complement,char * sequence,unsigned int length)970 make_complement_buffered (char *complement, char *sequence, unsigned int length) {
971 int i, j;
972
973 /* complement = (char *) CALLOC(length+1,sizeof(char)); */
974 for (i = length-1, j = 0; i >= 0; i--, j++) {
975 complement[j] = complCode[(int) sequence[i]];
976 }
977 complement[length] = '\0';
978 return;
979 }
980
981
982 #if 0
983 static void
984 make_complement_inplace (char *sequence, unsigned int length) {
985 char temp;
986 unsigned int i, j;
987
988 for (i = 0, j = length-1; i < length/2; i++, j--) {
989 temp = complCode[(int) sequence[i]];
990 sequence[i] = complCode[(int) sequence[j]];
991 sequence[j] = temp;
992 }
993 if (i == j) {
994 sequence[i] = complCode[(int) sequence[i]];
995 }
996
997 return;
998 }
999 #endif
1000
1001 /************************************************************************/
1002
1003
1004
1005 static void
stage3list_gc(List_T * old)1006 stage3list_gc (List_T *old) {
1007 List_T p;
1008 Stage3end_T hit;
1009
1010 for (p = *old; p != NULL; p = p->rest) {
1011 hit = (Stage3end_T) p->first;
1012 Stage3end_free(&hit);
1013 }
1014 Hitlist_free(&(*old));
1015
1016 return;
1017 }
1018
1019
1020 static void
remap_to_transcriptome(List_T hits)1021 remap_to_transcriptome (List_T hits) {
1022 List_T p;
1023 Stage3end_T hit;
1024 List_T transcripts;
1025
1026 char *remap_sequence;
1027 int remap_seqlength;
1028
1029 for (p = hits; p != NULL; p = List_next(p)) {
1030 hit = (Stage3end_T) List_head(p);
1031 if (Stage3end_transcripts(hit) != NULL) {
1032 /* Already mapped to transcriptome */
1033 } else {
1034 remap_sequence = Stage3end_substrings_genomic_sequence(&remap_seqlength,hit,genomecomp);
1035 if ((transcripts = Kmer_remap_transcriptome(remap_sequence,remap_seqlength,
1036 Stage3end_chrnum(hit),Stage3end_chrpos_low(hit),Stage3end_chrpos_high(hit),
1037 transcript_iit,transcriptomebits,transcriptome)) != NULL) {
1038 /* Following procedure frees the list hit->transcripts */
1039 Stage3end_set_transcripts(hit,transcripts);
1040 }
1041 FREE(remap_sequence);
1042 }
1043 }
1044
1045 return;
1046 }
1047
1048
1049
1050 static int
single_read_search(int * found_score_overall,int * found_score_within_trims,List_T * sense_hits_gplus,List_T * sense_hits_gminus,List_T * antisense_hits_gplus,List_T * antisense_hits_gminus,T this,char * queryuc_ptr,char * queryrc,int querylength,int * mismatch_positions_alloc,Compress_T query_compress_fwd,Compress_T query_compress_rev,int genestrand,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool,bool paired_end_p,bool first_read_p)1051 single_read_search (int *found_score_overall, int *found_score_within_trims,
1052 List_T *sense_hits_gplus, List_T *sense_hits_gminus,
1053 List_T *antisense_hits_gplus, List_T *antisense_hits_gminus,
1054 T this, char *queryuc_ptr, char *queryrc, int querylength,
1055 int *mismatch_positions_alloc,
1056 Compress_T query_compress_fwd, Compress_T query_compress_rev, int genestrand,
1057
1058 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1059 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool,
1060 bool paired_end_p, bool first_read_p) {
1061 int sizelimit;
1062
1063 int done_level, nmismatches_search;
1064 int max_middle_insertions, max_middle_deletions, max_insertionlen, max_deletionlen;
1065 Chrpos_T overall_max_distance, overall_end_distance;
1066 /* int max_terminal_mismatches; */
1067 int max_splice_mismatches;
1068 int min_trim;
1069
1070 bool abort_exact_p = false;
1071 /* bool elt_set_extended_p = false; */
1072 Stage3end_T hit;
1073
1074 List_T p;
1075 bool onep = false;
1076
1077
1078 /* For Segment_search */
1079 struct Record_T *plus_records = NULL, *minus_records = NULL;
1080 int plus_nrecords = 0, minus_nrecords = 0;
1081
1082 List_T startfrags_plus = NULL, endfrags_plus = NULL,
1083 startfrags_minus = NULL, endfrags_minus = NULL;
1084
1085
1086 debug(printf("Entered single_read_search with queryuc_ptr %s\n",queryuc_ptr));
1087 *sense_hits_gplus = *sense_hits_gminus = (List_T) NULL;
1088 *antisense_hits_gplus = *antisense_hits_gminus = (List_T) NULL;
1089
1090 nmismatches_search = querylength/index1part;
1091
1092 if (max_middle_insertions_float > 0.0 && max_middle_insertions_float < 1.0) {
1093 max_middle_insertions = (int) rint(max_middle_insertions_float * (double) querylength);
1094 } else {
1095 max_middle_insertions = (int) max_middle_insertions_float;
1096 }
1097 max_insertionlen = max_middle_insertions > max_end_insertions ? max_middle_insertions : max_end_insertions;
1098
1099 if (max_middle_deletions_float > 0.0 && max_middle_deletions_float < 1.0) {
1100 max_middle_deletions = (int) rint(max_middle_deletions_float * (double) querylength);
1101 } else {
1102 max_middle_deletions = (int) max_middle_deletions_float;
1103 }
1104 max_deletionlen = max_middle_deletions > max_end_deletions ? max_middle_deletions : max_end_deletions;
1105
1106 overall_max_distance = shortsplicedist;
1107 if ((Chrpos_T) max_middle_deletions > overall_max_distance) {
1108 overall_max_distance = (Chrpos_T) max_middle_deletions;
1109 }
1110 if ((Chrpos_T) max_middle_insertions > overall_max_distance) {
1111 overall_max_distance = (Chrpos_T) max_middle_insertions;
1112 }
1113 overall_end_distance = shortsplicedist_novelend > (Chrpos_T) max_deletionlen ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen;
1114
1115
1116 *found_score_overall = *found_score_within_trims = querylength;
1117
1118 if (querylength < index1part + index1interval - 1) {
1119 return /*level*/0;
1120 }
1121
1122 if (indexdb_tr != NULL) {
1123 debug(printf("Running Kmer_search_transcriptome_single\n"));
1124 Kmer_search_transcriptome_single(&(*found_score_overall),&(*found_score_within_trims),
1125 &(*sense_hits_gplus),&(*sense_hits_gminus),
1126 &(*antisense_hits_gplus),&(*antisense_hits_gminus),queryuc_ptr,querylength,
1127 this->tplus_stream_array,this->tplus_streamsize_array,this->tplus_diagterm_array,
1128 this->tminus_stream_array,this->tminus_streamsize_array,this->tminus_diagterm_array,
1129 query_compress_fwd,query_compress_rev,
1130 transcript_iit,transcriptome,transcriptomebits,
1131 nmismatches_search,listpool,hitlistpool,/*level*/0);
1132 }
1133
1134 if (use_only_transcriptome_p == true || *found_score_within_trims <= nmismatches_search) {
1135 return TR;
1136 } else {
1137 debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS\n",1));
1138 Stage1_init(this,queryuc_ptr,querylength,genestrand);
1139
1140 Kmer_search_genome_ends_exact(&abort_exact_p,&(*found_score_overall),&(*found_score_within_trims),
1141 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1142 this,querylength,mismatch_positions_alloc,
1143 query_compress_fwd,query_compress_rev,genestrand,nmismatches_search,
1144 listpool,hitlistpool,/*level*/1);
1145
1146 debug(printf("After Kmer_search_genome_ends, we have found_score %d (vs allowed %d), sense %d plus and %d minus hits, antisense %d plus and %d minus hits\n",
1147 *found_score_within_trims,nmismatches_search,
1148 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1149 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1150 }
1151
1152 if (*found_score_within_trims <= nmismatches_search) {
1153 if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1154 remap_to_transcriptome(*sense_hits_gplus);
1155 remap_to_transcriptome(*sense_hits_gminus);
1156 remap_to_transcriptome(*antisense_hits_gplus);
1157 remap_to_transcriptome(*antisense_hits_gminus);
1158 }
1159 return KMER_EXACT;
1160
1161 } else {
1162 Stage1_rearrange(this,querylength,onep);
1163 Stage1_fill_all_oligos(this,queryuc_ptr,querylength,genestrand);
1164 debug(printf("LEVEL %d: RUNNING EXTENSION_SEARCH\n",2));
1165 Extension_search(&(*found_score_overall),&(*found_score_within_trims),
1166 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1167 this,queryuc_ptr,queryrc,querylength,mismatch_positions_alloc,query_compress_fwd,query_compress_rev,
1168 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1169 nmismatches_search,max_insertionlen,max_deletionlen,overall_end_distance,
1170 genestrand,paired_end_p,first_read_p,/*level*/2);
1171 debug(printf("After Extension search, we have found_score_within_trims %d (vs allowed %d), sense %d plus and %d minus hits, antisense %d plus and %d minus hits\n",
1172 *found_score_within_trims,nmismatches_search,
1173 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1174 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1175
1176 }
1177
1178 /* Need to use found_score_overall here and afterwards in order to find fusions */
1179 debug(printf("Comparing found_score_overall %d with nmismatches_search %d\n",
1180 *found_score_overall,nmismatches_search));
1181 if (abort_exact_p == true || *found_score_overall <= nmismatches_search) {
1182 if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1183 remap_to_transcriptome(*sense_hits_gplus);
1184 remap_to_transcriptome(*sense_hits_gminus);
1185 remap_to_transcriptome(*antisense_hits_gplus);
1186 remap_to_transcriptome(*antisense_hits_gminus);
1187 }
1188 return EXT;
1189
1190 } else {
1191 debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS_APPROX\n",3));
1192 Kmer_search_genome_ends_approx(&(*found_score_overall),&(*found_score_within_trims),
1193 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1194 this,mismatch_positions_alloc,query_compress_fwd,query_compress_rev,querylength,
1195 genestrand,nmismatches_search,max_deletionlen,overall_max_distance,
1196 /*sizelimit*/3000,listpool,hitlistpool,/*level*/3);
1197 }
1198
1199 debug(printf("found_score_overall %d\n",*found_score_overall));
1200 if (*found_score_overall <= nmismatches_search) {
1201 if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1202 debug(printf("Remapping to transcriptome\n"));
1203 remap_to_transcriptome(*sense_hits_gplus);
1204 remap_to_transcriptome(*sense_hits_gminus);
1205 remap_to_transcriptome(*antisense_hits_gplus);
1206 remap_to_transcriptome(*antisense_hits_gminus);
1207 debug(printf("Done with remapping to transcriptome\n"));
1208 }
1209 return KMER_APPROX;
1210
1211 } else {
1212 if (querylength >= index1part) {
1213 Stage1_fill_position_ptrs_plus(this,/*querystart*/0,/*queryend*/querylength,genestrand);
1214 Stage1_fill_position_ptrs_minus(this,/*querystart*/0,/*queryend*/querylength,genestrand);
1215 /* Need sizelimit to constrain segment search */
1216 sizelimit = determine_sizelimit(this,querylength);
1217
1218 debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH\n",4));
1219 debug(printf("Starting Segment_identify on plus strand\n"));
1220 plus_records = Segment_identify(&plus_nrecords,
1221 #ifdef LARGE_GENOMES
1222 this->plus_positions_high,
1223 #endif
1224 this->plus_positions,this->plus_npositions,this->plus_validp,
1225 #ifdef LARGE_GENOMES
1226 this->stream_high_alloc,this->stream_low_alloc,
1227 #else
1228 this->stream_alloc,
1229 #endif
1230 this->streamsize_alloc,this->querypos_diagterm_alloc,
1231 overall_max_distance,querylength,sizelimit);
1232 debug(printf("Done\n"));
1233
1234 debug(printf("Starting Segment_identify on minus strand\n"));
1235 minus_records = Segment_identify(&minus_nrecords,
1236 #ifdef LARGE_GENOMES
1237 this->minus_positions_high,
1238 #endif
1239 this->minus_positions,this->minus_npositions,this->minus_validp,
1240 #ifdef LARGE_GENOMES
1241 this->stream_high_alloc,this->stream_low_alloc,
1242 #else
1243 this->stream_alloc,
1244 #endif
1245 this->streamsize_alloc,this->querypos_diagterm_alloc,
1246 overall_max_distance,querylength,sizelimit);
1247 debug(printf("Done\n"));
1248
1249 debug(printf("Starting Segment_search_all\n"));
1250 Segment_search_all(&(*found_score_overall),&(*found_score_within_trims),
1251 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1252 plus_records,plus_nrecords,minus_records,minus_nrecords,
1253
1254 queryuc_ptr,queryrc,querylength,mismatch_positions_alloc,
1255 this->spliceinfo,this->stream_alloc,this->streamsize_alloc,
1256 query_compress_fwd,query_compress_rev,
1257 max_insertionlen,max_deletionlen,overall_max_distance,overall_end_distance,
1258 genestrand,paired_end_p,first_read_p,
1259 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1260 /*method*/SEGMENT1,/*level*/4);
1261 debug(printf("Done\n"));
1262
1263 FREE(minus_records);
1264 FREE(plus_records);
1265 }
1266 }
1267
1268
1269 debug(printf("Comparing found_score_overall %d with nmismatches_search %d\n",
1270 *found_score_overall,nmismatches_search));
1271 if (*found_score_overall <= nmismatches_search) {
1272 if (indexdb_tr != NULL && remap_transcriptome_p == true) {
1273 remap_to_transcriptome(*sense_hits_gplus);
1274 remap_to_transcriptome(*sense_hits_gminus);
1275 remap_to_transcriptome(*antisense_hits_gplus);
1276 remap_to_transcriptome(*antisense_hits_gminus);
1277 }
1278 return SEGMENT1;
1279 }
1280
1281 /* Distant RNA splicing. No longer remap to transcriptome */
1282 if ((done_level = (*found_score_overall) + subopt_levels) > nmismatches_search) {
1283 done_level = nmismatches_search;
1284 }
1285 debug(printf("done_level %d = found_score %d + subopt_levels %d\n",done_level,*found_score_within_trims,subopt_levels));
1286
1287 min_trim = querylength; /* min of max(trim5,trim3) over each hit */
1288 for (p = *sense_hits_gplus; p != NULL; p = List_next(p)) {
1289 hit = (Stage3end_T) List_head(p);
1290 if (Stage3end_max_trim(hit) < min_trim) {
1291 min_trim = Stage3end_max_trim(hit);
1292 }
1293 }
1294 for (p = *sense_hits_gminus; p != NULL; p = List_next(p)) {
1295 hit = (Stage3end_T) List_head(p);
1296 if (Stage3end_max_trim(hit) < min_trim) {
1297 min_trim = Stage3end_max_trim(hit);
1298 }
1299 }
1300
1301 for (p = *antisense_hits_gplus; p != NULL; p = List_next(p)) {
1302 hit = (Stage3end_T) List_head(p);
1303 if (Stage3end_max_trim(hit) < min_trim) {
1304 min_trim = Stage3end_max_trim(hit);
1305 }
1306 }
1307 for (p = *antisense_hits_gminus; p != NULL; p = List_next(p)) {
1308 hit = (Stage3end_T) List_head(p);
1309 if (Stage3end_max_trim(hit) < min_trim) {
1310 min_trim = Stage3end_max_trim(hit);
1311 }
1312 }
1313
1314 debug(printf("Comparing min_trim %d with min_distantsplicing_end_matches %d\n",
1315 min_trim,min_distantsplicing_end_matches));
1316 if (min_trim < min_distantsplicing_end_matches) {
1317 /* Don't find distant splicing */
1318 debug(printf("Skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
1319 min_trim,min_distantsplicing_end_matches));
1320
1321 } else if ((max_splice_mismatches = done_level - distantsplicing_penalty) < 0) {
1322 debug(printf("Skipping distant splicing because done_level %d - distantsplicing_penalty %d < 0\n",
1323 done_level,distantsplicing_penalty));
1324
1325 } else if (splicingp == false) {
1326 /* TODO: Implement distant DNA fusions */
1327 debug(printf("Skipping distant splicing because splicingp is false\n"));
1328
1329 } else {
1330 debug(printf("Candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
1331 min_trim,min_distantsplicing_end_matches));
1332 debug(printf("Have sets: %d %d %d %d\n",
1333 List_length(this->queryfwd_plus_set),List_length(this->queryfwd_minus_set),
1334 List_length(this->queryrev_plus_set),List_length(this->queryrev_minus_set)));
1335 debug(printf("LEVEL %d: RUNNING DISTANT_RNA_SOLVE\n",5));
1336
1337 Distant_rna_solve(&(*found_score_overall),&(*found_score_within_trims),
1338 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1339 &startfrags_plus,&endfrags_plus,&startfrags_minus,&endfrags_minus,
1340
1341 this->queryfwd_plus_set,this->queryfwd_minus_set,
1342 this->queryrev_plus_set,this->queryrev_minus_set,
1343
1344 mismatch_positions_alloc,this->positions_alloc,
1345 query_compress_fwd,query_compress_rev,querylength,
1346 max_splice_mismatches,genestrand,/*first_read_p*/true,listpool,hitlistpool,
1347 /*level*/5);
1348 }
1349
1350 if (*found_score_overall <= nmismatches_search) {
1351 Substring_list_gc(&startfrags_plus);
1352 Substring_list_gc(&endfrags_plus);
1353 Substring_list_gc(&startfrags_minus);
1354 Substring_list_gc(&endfrags_minus);
1355 return DISTANT_RNA;
1356
1357 } else if (find_dna_chimeras_p == true) {
1358 debug(printf("LEVEL %d: RUNNING DISTANT_DNA_SOLVE\n",6));
1359 startfrags_plus = Substring_sort_siteN_halves(startfrags_plus,listpool,/*ascendingp*/true);
1360 endfrags_plus = Substring_sort_siteN_halves(endfrags_plus,listpool,/*ascendingp*/true);
1361 startfrags_minus = Substring_sort_siteN_halves(startfrags_minus,listpool,/*ascendingp*/true);
1362 endfrags_minus = Substring_sort_siteN_halves(endfrags_minus,listpool,/*ascendingp*/true);
1363
1364 Distant_dna_solve(&(*found_score_overall),&(*found_score_within_trims),
1365 &(*sense_hits_gplus),&(*sense_hits_gminus),&(*antisense_hits_gplus),&(*antisense_hits_gminus),
1366 startfrags_plus,endfrags_plus,startfrags_minus,endfrags_minus,
1367 mismatch_positions_alloc,query_compress_fwd,query_compress_rev,querylength,
1368 first_read_p,listpool,hitlistpool,/*level*/6);
1369 debug(printf("Returning from distant DNA solve\n"));
1370
1371 if (*found_score_overall <= nmismatches_search) {
1372 Substring_list_gc(&startfrags_plus);
1373 Substring_list_gc(&endfrags_plus);
1374 Substring_list_gc(&startfrags_minus);
1375 Substring_list_gc(&endfrags_minus);
1376 return DISTANT_DNA;
1377
1378 } else {
1379 debug(printf("Consider terminals. Have sense %d hits plus and %d hits minus, antisense %d hits plus and %d hits minus\n",
1380 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1381 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1382
1383 /* Terminals. Use hits == NULL as criterion, rather than
1384 found_score, since Terminal_solve cannot improve found_score */
1385
1386 if (*sense_hits_gplus == NULL && *sense_hits_gminus == NULL &&
1387 *antisense_hits_gplus == NULL && *antisense_hits_gminus == NULL) {
1388
1389 debug(printf("LEVEL %d: RUNNING TERMINAL_SOLVE_PLUS\n",7));
1390 Terminal_solve_plus(&(*found_score_overall),&(*found_score_within_trims),
1391 &(*sense_hits_gplus),&(*antisense_hits_gplus),
1392 this->queryfwd_plus_set,this->queryrev_plus_set,
1393 mismatch_positions_alloc,/*queryuc_ptr,*/query_compress_fwd,querylength,
1394 genestrand,listpool,hitlistpool,/*level*/7);
1395
1396 debug(printf("LEVEL %d: RUNNING TERMINAL_SOLVE_MINUS\n",7));
1397 Terminal_solve_minus(&(*found_score_overall),&(*found_score_within_trims),
1398 &(*sense_hits_gminus),&(*antisense_hits_gminus),
1399 this->queryfwd_minus_set,this->queryrev_minus_set,
1400 mismatch_positions_alloc,/*queryrc,*/query_compress_rev,querylength,
1401 genestrand,listpool,hitlistpool,/*level*/7);
1402 debug(printf("Got sense %d plus and %d minus terminals, antisense %d plus and %d minus terminals\n",
1403 List_length(*sense_hits_gplus),List_length(*sense_hits_gminus),
1404 List_length(*antisense_hits_gplus),List_length(*antisense_hits_gminus)));
1405 }
1406 }
1407
1408 debug(printf("Returning from terminals\n"));
1409 }
1410
1411 Substring_list_gc(&startfrags_plus);
1412 Substring_list_gc(&endfrags_plus);
1413 Substring_list_gc(&startfrags_minus);
1414 Substring_list_gc(&endfrags_minus);
1415 return TERMINAL;
1416 }
1417
1418
1419 Stage3end_T *
Stage1_single_read(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Shortread_T queryseq,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)1420 Stage1_single_read (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq,
1421 Shortread_T queryseq,
1422 #if 0
1423 Oligoindex_array_T oligoindices_minor,
1424 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
1425 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
1426 #endif
1427 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1428 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
1429 Stage3end_T *stage3array;
1430 T this;
1431 List_T hits, geneplus_hits, geneminus_hits;
1432 List_T sense_hits_gplus, sense_hits_gminus, antisense_hits_gplus, antisense_hits_gminus;
1433 int *mismatch_positions_alloc;
1434
1435 int found_score_overall, found_score_within_trims;
1436 int querylength, max_mismatches_refalt, max_mismatches_ref, min_coverage;
1437 char *queryuc_ptr, *queryrc, *quality_string;
1438 Compress_T query_compress_fwd, query_compress_rev;
1439
1440
1441 querylength = Shortread_fulllength(queryseq);
1442 queryuc_ptr = Shortread_fullpointer_uc(queryseq);
1443
1444 queryrc = (char *) MALLOC((querylength+1)*sizeof(char));
1445 make_complement_buffered(queryrc,queryuc_ptr,querylength);
1446
1447 mismatch_positions_alloc = (int *) MALLOC((querylength+1)*sizeof(int));
1448 query_compress_fwd = Compress_new_fwd(queryuc_ptr,querylength);
1449 query_compress_rev = Compress_new_rev(queryuc_ptr,querylength);
1450
1451 /* Previously used user_mismatches_refalt_float in searching, now just for filtering */
1452 if (user_mismatches_refalt_float < 0.0) {
1453 max_mismatches_refalt = querylength;
1454 } else if (user_mismatches_refalt_float > 0.0 && user_mismatches_refalt_float < 1.0) {
1455 max_mismatches_refalt = (int) rint(user_mismatches_refalt_float * (double) querylength);
1456 } else {
1457 /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
1458 max_mismatches_refalt = (int) user_mismatches_refalt_float;
1459 }
1460
1461 /* Previously used user_mismatches_ref_float in searching, now just for filtering */
1462 if (user_mismatches_ref_float < 0.0) {
1463 max_mismatches_ref = querylength;
1464 } else if (user_mismatches_ref_float > 0.0 && user_mismatches_ref_float < 1.0) {
1465 max_mismatches_ref = (int) rint(user_mismatches_ref_float * (double) querylength);
1466 } else {
1467 /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
1468 max_mismatches_ref = (int) user_mismatches_ref_float;
1469 }
1470
1471 if (user_mincoverage_float <= 0.0) {
1472 min_coverage = 0;
1473 } else if (user_mincoverage_float <= 1.0) {
1474 /* Assuming that --min-coverage=1 must mean 1.0 and not a coverage of 1 bp */
1475 min_coverage = (int) rint(user_mincoverage_float * (double) querylength);
1476 } else {
1477 min_coverage = (int) user_mincoverage_float;
1478 }
1479
1480 if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED || mode == TTOC_STRANDED) {
1481 this = Stage1_new(querylength);
1482 single_read_search(&found_score_overall,&found_score_within_trims,
1483 &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1484 this,queryuc_ptr,queryrc,querylength,
1485 mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/0,
1486 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1487 /*paired_end_p*/false,/*first_read_p*/true);
1488 Stage1_free(&this);
1489
1490 sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1491 sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1492 antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1493 antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1494 hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1495
1496 } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
1497 this = Stage1_new(querylength);
1498 single_read_search(&found_score_overall,&found_score_within_trims,
1499 &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1500 this,queryuc_ptr,queryrc,querylength,
1501 mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/+1,
1502 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1503 /*paired_end_p*/false,/*first_read_p*/true);
1504 Stage1_free(&this);
1505
1506 sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1507 sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1508 antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1509 antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1510 geneplus_hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1511
1512
1513 this = Stage1_new(querylength);
1514 single_read_search(&found_score_overall,&found_score_within_trims,
1515 &sense_hits_gplus,&sense_hits_gminus,&antisense_hits_gplus,&antisense_hits_gminus,
1516 this,queryuc_ptr,queryrc,querylength,
1517 mismatch_positions_alloc,query_compress_fwd,query_compress_rev,/*genestrand*/+2,
1518 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1519 /*paired_end_p*/false,/*first_read_p*/true);
1520 Stage1_free(&this);
1521
1522 sense_hits_gplus = Stage3end_filter(sense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1523 sense_hits_gminus = Stage3end_filter(sense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1524 antisense_hits_gplus = Stage3end_filter(antisense_hits_gplus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1525 antisense_hits_gminus = Stage3end_filter(antisense_hits_gminus,hitlistpool,max_mismatches_refalt,max_mismatches_ref,min_coverage);
1526 geneminus_hits = List_append(sense_hits_gplus,List_append(sense_hits_gminus,List_append(antisense_hits_gplus,antisense_hits_gminus)));
1527
1528 hits = List_append(geneplus_hits,geneminus_hits);
1529
1530 } else {
1531 fprintf(stderr,"Do not recognize mode %d\n",mode);
1532 abort();
1533 }
1534
1535
1536 if (hits == NULL) {
1537 *npaths_primary = *npaths_altloc = 0;
1538 stage3array = (Stage3end_T *) NULL;
1539
1540 } else {
1541 hits = Stage3end_remove_circular_alias(hits,hitlistpool); /* Contains a call to unalias_circular */
1542 hits = Stage3end_remove_duplicates(hits,hitlistpool); /* Aliases can cause duplicates */
1543 debug(printf("Have %d hits after remove_duplicates\n",List_length(hits)));
1544
1545 hits = Stage3end_optimal_score(hits,hitlistpool,querylength,/*finalp*/false);
1546 debug(printf("Have %d hits after optimal_score\n",List_length(hits)));
1547 hits = Stage3end_remove_overlaps(hits,hitlistpool,querylength,/*finalp*/true);
1548 debug(printf("Have %d hits after remove_overlaps\n",List_length(hits)));
1549 hits = Stage3end_optimal_score(hits,hitlistpool,querylength,/*finalp*/true);
1550 debug(printf("Have %d hits after final optimal_score\n",List_length(hits)));
1551
1552 quality_string = Shortread_quality_string(queryseq);
1553 Stage3end_count_hits(&(*npaths_primary),&(*npaths_altloc),hits);
1554 stage3array = (Stage3end_T *) List_to_array_out(hits,NULL); Hitlist_free(&hits); /* Return value */
1555 stage3array = Stage3end_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
1556 &(*first_absmq),&(*second_absmq),
1557 stage3array,queryuc_ptr,quality_string,/*displayp*/true);
1558 }
1559
1560 FREE(queryrc);
1561
1562 Compress_free(&query_compress_fwd);
1563 Compress_free(&query_compress_rev);
1564 FREE(mismatch_positions_alloc);
1565
1566 return stage3array;
1567 }
1568
1569
1570
1571
1572 static Pairtype_T
choose_among_paired(int * best_nmatches_paired,int * best_nmatches_5,int * best_nmatches_3,List_T hitpairs,List_T samechr,List_T conc_transloc)1573 choose_among_paired (int *best_nmatches_paired, int *best_nmatches_5, int *best_nmatches_3,
1574 List_T hitpairs, List_T samechr, List_T conc_transloc) {
1575 Pairtype_T final_pairtype = UNPAIRED;
1576 List_T p;
1577 Stage3pair_T hitpair;
1578 int nmatches, nmatches5, nmatches3;
1579
1580 debug16(printf("choose: %d hitpairs, %d conc_transloc, %d samechr\n",
1581 List_length(hitpairs),List_length(conc_transloc),List_length(samechr)));
1582
1583 *best_nmatches_paired = 0;
1584 *best_nmatches_5 = 0;
1585 *best_nmatches_3 = 0;
1586 for (p = hitpairs; p != NULL; p = p->rest) {
1587 hitpair = (Stage3pair_T) p->first;
1588 if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1589 final_pairtype = CONCORDANT;
1590 *best_nmatches_paired = nmatches;
1591 *best_nmatches_5 = nmatches5;
1592 *best_nmatches_3 = nmatches3;
1593 }
1594 }
1595
1596 *best_nmatches_paired += 1; /* penalty for choosing translocation over others */
1597
1598 for (p = conc_transloc; p != NULL; p = p->rest) {
1599 hitpair = (Stage3pair_T) p->first;
1600 if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1601 final_pairtype = CONCORDANT_TRANSLOCATIONS;
1602 *best_nmatches_paired = nmatches;
1603 *best_nmatches_5 = nmatches5;
1604 *best_nmatches_3 = nmatches3;
1605 }
1606 }
1607
1608 for (p = samechr; p != NULL; p = p->rest) {
1609 hitpair = (Stage3pair_T) p->first;
1610 if ((nmatches = Stage3pair_nmatches_to_trims(&nmatches5,&nmatches3,hitpair)) > *best_nmatches_paired) {
1611 final_pairtype = PAIRED_UNSPECIFIED;
1612 *best_nmatches_paired = nmatches;
1613 *best_nmatches_5 = nmatches5;
1614 *best_nmatches_3 = nmatches3;
1615 }
1616 }
1617
1618 debug16(printf("best_nmatches_paired among paired = %d = %d + %d\n",
1619 *best_nmatches_paired,*best_nmatches_5,*best_nmatches_3));
1620 debug16(printf("final pairtype: %s\n",Pairtype_string(final_pairtype)));
1621
1622 return final_pairtype;
1623 }
1624
1625
1626
1627 static List_T
paired_read_segment_search(bool * abort_pairing_p,List_T * samechr,List_T * conc_transloc,List_T hitpairs,int last_level_5,int last_level_3,int found_score_overall_5,int found_score_within_trims_5,int found_score_overall_3,int found_score_within_trims_3,T this5,T this3,Ladder_T sense_ladder5_plus,Ladder_T sense_ladder5_minus,Ladder_T sense_ladder3_plus,Ladder_T sense_ladder3_minus,Ladder_T antisense_ladder5_plus,Ladder_T antisense_ladder5_minus,Ladder_T antisense_ladder3_plus,Ladder_T antisense_ladder3_minus,char * queryuc_ptr_5,char * queryrc5,int querylength5,char * queryuc_ptr_3,char * queryrc3,int querylength3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_insertionlen_5,int max_insertionlen_3,int max_deletionlen_5,int max_deletionlen_3,int genestrand,Chrpos_T pairmax_linear,Chrpos_T overall_max_distance_5,Chrpos_T overall_max_distance_3,Chrpos_T overall_end_distance_5,Chrpos_T overall_end_distance_3,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)1628 paired_read_segment_search (bool *abort_pairing_p, List_T *samechr, List_T *conc_transloc,
1629 List_T hitpairs, int last_level_5, int last_level_3,
1630 int found_score_overall_5, int found_score_within_trims_5,
1631 int found_score_overall_3, int found_score_within_trims_3, T this5, T this3,
1632 Ladder_T sense_ladder5_plus, Ladder_T sense_ladder5_minus,
1633 Ladder_T sense_ladder3_plus, Ladder_T sense_ladder3_minus,
1634 Ladder_T antisense_ladder5_plus, Ladder_T antisense_ladder5_minus,
1635 Ladder_T antisense_ladder3_plus, Ladder_T antisense_ladder3_minus,
1636 char *queryuc_ptr_5, char *queryrc5, int querylength5, char *queryuc_ptr_3, char *queryrc3, int querylength3,
1637 int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
1638 Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
1639 Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
1640 int max_insertionlen_5, int max_insertionlen_3, int max_deletionlen_5, int max_deletionlen_3,
1641 int genestrand, Chrpos_T pairmax_linear, Chrpos_T overall_max_distance_5, Chrpos_T overall_max_distance_3,
1642 Chrpos_T overall_end_distance_5, Chrpos_T overall_end_distance_3,
1643
1644 int max_mismatches_refalt_5, int max_mismatches_refalt_3,
1645 int max_mismatches_ref_5, int max_mismatches_ref_3,
1646 int min_coverage_5, int min_coverage_3,
1647
1648 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
1649 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
1650 int last_level;
1651
1652 /* int plus_specifici_5[3], minus_specifici_5[3], plus_specifici_3[3], minus_specifici_3[3]; */
1653 Univcoord_T *gplus5_diagonals, *gminus5_diagonals, *gplus3_diagonals, *gminus3_diagonals;
1654 int gplus5_ndiagonals, gminus5_ndiagonals, gplus3_ndiagonals, gminus3_ndiagonals;
1655
1656 /* concordant_score is for all hitpairs; adjacent score is for those hitpairs within adjacent_pairlength */
1657 int adjacent_score, concordant_score;
1658 int done_level_5, done_level_3, nmismatches_search, nmismatches_search_5, nmismatches_search_3;
1659 /* int max_terminal_mismatches; */
1660 int max_splice_mismatches;
1661 int min_trim_5, min_trim_3;
1662
1663 int maxpairedpaths;
1664
1665 List_T sense_hits5_gplus = NULL, sense_hits5_gminus = NULL, sense_hits3_gplus = NULL, sense_hits3_gminus = NULL,
1666 antisense_hits5_gplus = NULL, antisense_hits5_gminus = NULL, antisense_hits3_gplus = NULL, antisense_hits3_gminus = NULL;
1667
1668
1669 #if 0
1670 /* If we prioritize transcriptome hits over genomic hits */
1671 List_T transcriptome_hits5 = NULL, transcriptome_hits3 = NULL;
1672 List_T genome_hits5 = NULL, genome_hits5 = NULL;
1673 #endif
1674
1675
1676 /* For Segment_search */
1677 struct Record_T *plus_records5 = NULL, *minus_records5 = NULL, *plus_records3 = NULL, *minus_records3 = NULL;
1678 int plus_nrecords5 = 0, minus_nrecords5 = 0, plus_nrecords3 = 0, minus_nrecords3 = 0;
1679
1680 List_T startfrags5_plus = NULL, endfrags5_plus = NULL, startfrags5_minus = NULL, endfrags5_minus = NULL,
1681 startfrags3_plus = NULL, endfrags3_plus = NULL, startfrags3_minus = NULL, endfrags3_minus = NULL;
1682
1683 int level;
1684
1685
1686 debug(printf("Entered paired_read_segment_search with queryuc_ptr_5 %s\n",queryuc_ptr_5));
1687 debug(printf("Entered paired_read_segment_search with queryuc_ptr_3 %s\n",queryuc_ptr_3));
1688
1689 *abort_pairing_p = false;
1690 *samechr = (List_T) NULL;
1691 /* *conc_transloc = (List_T) NULL; -- May contain results from concordant alignment of single ends */
1692
1693 if (last_level_5 < last_level_3) {
1694 level = last_level = last_level_5;
1695 } else {
1696 level = last_level = last_level_3;
1697 }
1698 debug(printf("last_level is %s <- %s and %s\n",
1699 Method_string(last_level),Method_string(last_level_5),Method_string(last_level_3)));
1700
1701
1702 /* Take the larger of maxpaths_search and 10*maxpaths_report */
1703 maxpairedpaths = maxpaths_search;
1704 if (maxpairedpaths < 10*maxpaths_report) {
1705 maxpairedpaths = 10*maxpaths_report;
1706 }
1707
1708 adjacent_score = concordant_score = querylength5 + querylength3;
1709
1710 nmismatches_search_5 = querylength5/index1part_tr;
1711 nmismatches_search_3 = querylength3/index1part_tr;
1712 nmismatches_search = nmismatches_search_5 + nmismatches_search_3;
1713
1714
1715 /* found_score_5 = querylength5; */
1716 /* found_score_3 = querylength3; */
1717 /* found_score = querylength5 + querylength3; */
1718
1719
1720 if (last_level < KMER_EXACT) {
1721 if (last_level_5 < KMER_EXACT) {
1722 debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS ON 5' READ\n",level));
1723 Stage1_init(this5,queryuc_ptr_5,querylength5,genestrand);
1724 }
1725
1726 if (last_level_3 < KMER_EXACT) {
1727 debug(printf("LEVEL %d: RUNNING KMER_SEARCH_GENOME_ENDS ON 3' READ\n",level));
1728 Stage1_init(this3,queryuc_ptr_3,querylength3,genestrand);
1729 }
1730 }
1731
1732 if (last_level < EXT) {
1733 if (last_level_5 < EXT && querylength5 >= index1part) {
1734 Stage1_rearrange(this5,querylength5,/*one5p*/false);
1735 Stage1_fill_all_oligos(this5,queryuc_ptr_5,querylength5,genestrand);
1736 }
1737
1738 if (last_level_3 < EXT && querylength3 >= index1part) {
1739 Stage1_rearrange(this3,querylength3,/*one3p*/false);
1740 Stage1_fill_all_oligos(this3,queryuc_ptr_3,querylength3,genestrand);
1741 }
1742 }
1743
1744 /* SEGMENT2 algorithm is different from SEGMENT1, so we should do this in all cases */
1745 if (querylength5 >= index1part) {
1746 debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH_GENOME (ANCHORED) ON 5' READ\n",level));
1747 if (last_level_5 < SEGMENT1) {
1748 Stage1_fill_position_ptrs_plus(this5,/*querystart*/0,/*queryend*/querylength5,genestrand);
1749 Stage1_fill_position_ptrs_minus(this5,/*querystart*/0,/*queryend*/querylength5,genestrand);
1750 }
1751
1752 /* Sense */
1753 gplus3_diagonals = Ladder_genomicstarts(&gplus3_ndiagonals,sense_ladder3_plus);
1754 gminus3_diagonals = Ladder_genomicends(&gminus3_ndiagonals,sense_ladder3_minus);
1755
1756 plus_records5 = Segment_identify_lower(&plus_nrecords5,
1757 #ifdef LARGE_GENOMES
1758 this5->plus_positions_high,
1759 #endif
1760 this5->plus_positions,this5->plus_npositions,this5->plus_validp,
1761 this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1762 /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1763 gplus3_diagonals,gplus3_ndiagonals);
1764
1765 minus_records5 = Segment_identify_higher(&minus_nrecords5,
1766 #ifdef LARGE_GENOMES
1767 this5->minus_positions_high,
1768 #endif
1769 this5->minus_positions,this5->minus_npositions,this5->minus_validp,
1770 this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1771 /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1772 gminus3_diagonals,gminus3_ndiagonals);
1773 FREE(gminus3_diagonals);
1774 FREE(gplus3_diagonals);
1775
1776 Segment_search_all(&found_score_overall_5,&found_score_within_trims_5,
1777 &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
1778 plus_records5,plus_nrecords5,minus_records5,minus_nrecords5,
1779
1780 queryuc_ptr_5,queryrc5,querylength5,mismatch_positions_alloc_5,
1781 this5->spliceinfo,this5->stream_alloc,this5->streamsize_alloc,
1782 query5_compress_fwd,query5_compress_rev,
1783 max_insertionlen_5,max_deletionlen_5,
1784 overall_max_distance_5,overall_end_distance_5,
1785 genestrand,/*paired_end_p*/true,/*first_read_p*/true,
1786 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1787 /*method*/SEGMENT2,level);
1788 FREE(plus_records5); FREE(minus_records5);
1789
1790
1791 if (splicingp == true) {
1792 /* Antisense */
1793 gplus3_diagonals = Ladder_genomicstarts(&gplus3_ndiagonals,antisense_ladder3_plus);
1794 gminus3_diagonals = Ladder_genomicends(&gminus3_ndiagonals,antisense_ladder3_minus);
1795
1796 plus_records5 = Segment_identify_lower(&plus_nrecords5,
1797 #ifdef LARGE_GENOMES
1798 this5->plus_positions_high,
1799 #endif
1800 this5->plus_positions,this5->plus_npositions,this5->plus_validp,
1801 this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1802 /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1803 gplus3_diagonals,gplus3_ndiagonals);
1804
1805 minus_records5 = Segment_identify_higher(&minus_nrecords5,
1806 #ifdef LARGE_GENOMES
1807 this5->minus_positions_high,
1808 #endif
1809 this5->minus_positions,this5->minus_npositions,this5->minus_validp,
1810 this5->stream_alloc,this5->streamsize_alloc,this5->querypos_diagterm_alloc,
1811 /*max_pairlength*/pairmax_linear,overall_max_distance_5,querylength5,
1812 gminus3_diagonals,gminus3_ndiagonals);
1813 FREE(gminus3_diagonals);
1814 FREE(gplus3_diagonals);
1815
1816 Segment_search_all(&found_score_overall_5,&found_score_within_trims_5,
1817 &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
1818 plus_records5,plus_nrecords5,minus_records5,minus_nrecords5,
1819
1820 queryuc_ptr_5,queryrc5,querylength5,mismatch_positions_alloc_5,
1821 this5->spliceinfo,this5->stream_alloc,this5->streamsize_alloc,
1822 query5_compress_fwd,query5_compress_rev,
1823 max_insertionlen_5,max_deletionlen_5,
1824 overall_max_distance_5,overall_end_distance_5,
1825 genestrand,/*paired_end_p*/true,/*first_read_p*/true,
1826 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1827 /*method*/SEGMENT2,level);
1828 FREE(plus_records5); FREE(minus_records5);
1829 }
1830 }
1831
1832
1833 if (querylength3 >= index1part) {
1834 debug(printf("LEVEL %d: RUNNING SEGMENT_SEARCH_GENOME (ANCHORED) ON 3' READ\n",level));
1835 if (last_level_3 < SEGMENT1) {
1836 Stage1_fill_position_ptrs_plus(this3,/*querystart*/0,/*queryend*/querylength3,genestrand);
1837 Stage1_fill_position_ptrs_minus(this3,/*querystart*/0,/*queryend*/querylength3,genestrand);
1838 }
1839
1840 /* Sense */
1841 gplus5_diagonals = Ladder_genomicstarts(&gplus5_ndiagonals,sense_ladder5_plus);
1842 gminus5_diagonals = Ladder_genomicends(&gminus5_ndiagonals,sense_ladder5_minus);
1843
1844 plus_records3 = Segment_identify_higher(&plus_nrecords3,
1845 #ifdef LARGE_GENOMES
1846 this3->plus_positions_high,
1847 #endif
1848 this3->plus_positions,this3->plus_npositions,this3->plus_validp,
1849 this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1850 /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1851 gplus5_diagonals,gplus5_ndiagonals);
1852
1853 minus_records3 = Segment_identify_lower(&minus_nrecords3,
1854 #ifdef LARGE_GENOMES
1855 this3->minus_positions_high,
1856 #endif
1857 this3->minus_positions,this3->minus_npositions,this3->minus_validp,
1858 this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1859 /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1860 gminus5_diagonals,gminus5_ndiagonals);
1861 FREE(gminus5_diagonals);
1862 FREE(gplus5_diagonals);
1863
1864 Segment_search_all(&found_score_overall_3,&found_score_within_trims_3,
1865 &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
1866 plus_records3,plus_nrecords3,minus_records3,minus_nrecords3,
1867
1868 queryuc_ptr_3,queryrc3,querylength3,mismatch_positions_alloc_3,
1869 this3->spliceinfo,this3->stream_alloc,this3->streamsize_alloc,
1870 query3_compress_fwd,query3_compress_rev,
1871 max_insertionlen_3,max_deletionlen_3,
1872 overall_max_distance_3,overall_end_distance_3,
1873 genestrand,/*paired_end_p*/true,/*first_read_p*/false,
1874 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1875 /*method*/SEGMENT2,level);
1876 FREE(plus_records3); FREE(minus_records3);
1877
1878 if (splicingp == true) {
1879 /* Antisense */
1880 gplus5_diagonals = Ladder_genomicstarts(&gplus5_ndiagonals,antisense_ladder5_plus);
1881 gminus5_diagonals = Ladder_genomicends(&gminus5_ndiagonals,antisense_ladder5_minus);
1882
1883 plus_records3 = Segment_identify_higher(&plus_nrecords3,
1884 #ifdef LARGE_GENOMES
1885 this3->plus_positions_high,
1886 #endif
1887 this3->plus_positions,this3->plus_npositions,this3->plus_validp,
1888 this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1889 /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1890 gplus5_diagonals,gplus5_ndiagonals);
1891
1892 minus_records3 = Segment_identify_lower(&minus_nrecords3,
1893 #ifdef LARGE_GENOMES
1894 this3->minus_positions_high,
1895 #endif
1896 this3->minus_positions,this3->minus_npositions,this3->minus_validp,
1897 this3->stream_alloc,this3->streamsize_alloc,this3->querypos_diagterm_alloc,
1898 /*max_pairlength*/pairmax_linear,overall_max_distance_3,querylength3,
1899 gminus5_diagonals,gminus5_ndiagonals);
1900 FREE(gminus5_diagonals);
1901 FREE(gplus5_diagonals);
1902
1903 Segment_search_all(&found_score_overall_3,&found_score_within_trims_3,
1904 &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
1905 plus_records3,plus_nrecords3,minus_records3,minus_nrecords3,
1906
1907 queryuc_ptr_3,queryrc3,querylength3,mismatch_positions_alloc_3,
1908 this3->spliceinfo,this3->stream_alloc,this3->streamsize_alloc,
1909 query3_compress_fwd,query3_compress_rev,
1910 max_insertionlen_3,max_deletionlen_3,
1911 overall_max_distance_3,overall_end_distance_3,
1912 genestrand,/*paired_end_p*/true,/*first_read_p*/false,
1913 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1914 /*method*/SEGMENT2,level);
1915 FREE(plus_records3); FREE(minus_records3);
1916 }
1917 }
1918
1919
1920 debug(printf("found scores: %d and %d (vs %d and %d allowed)\n",
1921 found_score_within_trims_5,found_score_within_trims_3,nmismatches_search_5,nmismatches_search_3));
1922
1923 debug(printf("sense hits5 plus: %d new. hits5 minus: %d new.\n",
1924 List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
1925 debug(printf("sense hits3 plus: %d new. hits3 minus: %d new.\n",
1926 List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
1927
1928 sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1929 sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1930 sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1931 sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1932
1933 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
1934 hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
1935 sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
1936 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
1937 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
1938 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
1939 Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
1940 Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
1941
1942 if (splicingp == true) {
1943 debug(printf("antisense hits5 plus: %d new. hits5 minus: %d new.\n",
1944 List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
1945 debug(printf("antisense hits3 plus: %d new. hits3 minus: %d new.\n",
1946 List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
1947
1948 antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1949 antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
1950 antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1951 antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
1952
1953 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
1954 hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
1955 antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
1956 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
1957 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
1958 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
1959 Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
1960 Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
1961 }
1962
1963 debug(printf("(4) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
1964 level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
1965
1966 if (*abort_pairing_p == true || concordant_score <= nmismatches_search) {
1967 return hitpairs;
1968
1969 } else if (last_level < DISTANT_RNA) {
1970 /* Find distant splicing. Use concordant_score, rather than
1971 adjacent_score as criterion, since distant splicing cannot find
1972 adjacent pairs */
1973
1974 /* distant splicing on 5' read */
1975 if (last_level_5 < DISTANT_RNA && found_score_within_trims_5 >= found_score_within_trims_3) {
1976 if ((done_level_5 = found_score_within_trims_5 + subopt_levels) > nmismatches_search_5) {
1977 done_level_5 = nmismatches_search_5;
1978 }
1979 debug(printf("done_level_5 %d = found_score_within_trims_5 %d + subopt_levels %d\n",
1980 done_level_5,found_score_within_trims_5,subopt_levels));
1981
1982 /* min of max(trim5,trim3) over each hit */
1983 min_trim_5 = Ladder_minimax_trim(sense_ladder5_plus,sense_ladder5_minus,
1984 antisense_ladder5_plus,antisense_ladder5_minus,querylength5);
1985
1986 debug(printf("For 5' end, comparing min_trim %d with min_distantsplicing_end_matches %d\n",
1987 min_trim_5,min_distantsplicing_end_matches));
1988 if (min_trim_5 < min_distantsplicing_end_matches) {
1989 /* Don't find distant splicing */
1990 debug(printf("For 5' end, skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
1991 min_trim_5,min_distantsplicing_end_matches));
1992
1993 } else if ((max_splice_mismatches = done_level_5 - distantsplicing_penalty) >= 0) {
1994 if (splicingp == true) {
1995 debug(printf("For 5' end, candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
1996 min_trim_5,min_distantsplicing_end_matches));
1997 debug(printf("Have sets: %d %d %d %d\n",
1998 List_length(this5->queryfwd_plus_set),List_length(this5->queryfwd_minus_set),
1999 List_length(this5->queryrev_plus_set),List_length(this5->queryrev_minus_set)));
2000 Distant_rna_solve(&found_score_overall_5,&found_score_within_trims_5,
2001 &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
2002 &startfrags5_plus,&endfrags5_plus,&startfrags5_minus,&endfrags5_minus,
2003
2004 this5->queryfwd_plus_set,this5->queryfwd_minus_set,
2005 this5->queryrev_plus_set,this5->queryrev_minus_set,
2006
2007 mismatch_positions_alloc_5,this5->positions_alloc,
2008 query5_compress_fwd,query5_compress_rev,querylength5,
2009 max_splice_mismatches,genestrand,/*first_read_p*/true,listpool,hitlistpool,level);
2010 } else {
2011 /* TODO: Implement distant DNA fusions */
2012 }
2013 }
2014 }
2015
2016 /* Distant splicing on 3' read */
2017 if (last_level_3 < DISTANT_RNA && found_score_within_trims_3 >= found_score_within_trims_5) {
2018 if ((done_level_3 = found_score_within_trims_3 + subopt_levels) > nmismatches_search_3) {
2019 done_level_3 = nmismatches_search_3;
2020 }
2021 debug(printf("done_level_3 %d = found_score_within_trims_3 %d + subopt_levels %d\n",
2022 done_level_3,found_score_within_trims_3,subopt_levels));
2023
2024 /* min of max(trim5,trim3) over each hit */
2025 min_trim_3 = Ladder_minimax_trim(sense_ladder3_plus,sense_ladder3_minus,
2026 antisense_ladder3_plus,antisense_ladder3_minus,querylength3);
2027
2028 debug(printf("For 3' end, comparing min_trim %d with min_distantsplicing_end_matches %d\n",
2029 min_trim_3,min_distantsplicing_end_matches));
2030 if (min_trim_3 < min_distantsplicing_end_matches) {
2031 /* Don't find distant splicing */
2032 debug(printf("For 3'end, skipping distant splicing because min_trim %d < min_distantsplicing_end_matches %d\n",
2033 min_trim_3,min_distantsplicing_end_matches));
2034
2035 } else if ((max_splice_mismatches = done_level_3 - distantsplicing_penalty) >= 0) {
2036 if (splicingp == true) {
2037 debug(printf("For 3' end, candidate for distant splicing because min_trim %d >= min_distantsplicing_end_matches %d\n",
2038 min_trim_3,min_distantsplicing_end_matches));
2039 debug(printf("Have sets: %d %d %d %d\n",
2040 List_length(this3->queryfwd_plus_set),List_length(this3->queryfwd_minus_set),
2041 List_length(this3->queryrev_plus_set),List_length(this3->queryrev_minus_set)));
2042 Distant_rna_solve(&found_score_overall_3,&found_score_within_trims_3,
2043 &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
2044 &startfrags3_plus,&endfrags3_plus,&startfrags3_minus,&endfrags3_minus,
2045
2046 this3->queryfwd_plus_set,this3->queryfwd_minus_set,
2047 this3->queryrev_plus_set,this3->queryrev_minus_set,
2048
2049 mismatch_positions_alloc_3,this3->positions_alloc,
2050 query3_compress_fwd,query3_compress_rev,querylength3,
2051 max_splice_mismatches,genestrand,/*first_read_p*/false,listpool,hitlistpool,level);
2052 } else {
2053 /* TODO: Implement distant DNA */
2054 }
2055 }
2056 }
2057
2058 debug(printf("found scores: %d and %d (vs %d and %d allowed)\n",
2059 found_score_within_trims_5,found_score_within_trims_3,nmismatches_search_5,nmismatches_search_3));
2060
2061 /* Sense */
2062 debug(printf("sense hits5 plus: %d new. hits5 minus: %d new.\n",
2063 List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
2064 debug(printf("sense hits3 plus: %d new. hits3 minus: %d new.\n",
2065 List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
2066
2067 sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2068 sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2069 sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2070 sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2071
2072 hitpairs = Concordance_pair_up_distant(&(*abort_pairing_p),&concordant_score,&(*samechr),&(*conc_transloc),hitpairs,
2073 sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2074 sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2075 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2076 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2077 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
2078 Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2079 Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2080
2081
2082 if (splicingp == true) {
2083 /* Antisense */
2084 debug(printf("antisense hits5 plus: %d new. hits5 minus: %d new.\n",
2085 List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
2086 debug(printf("antisense hits3 plus: %d new. hits3 minus: %d new.\n",
2087 List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
2088
2089 antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2090 antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2091 antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2092 antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2093
2094 hitpairs = Concordance_pair_up_distant(&(*abort_pairing_p),&concordant_score,&(*samechr),&(*conc_transloc),hitpairs,
2095 antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2096 antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2097 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2098 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2099 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
2100 Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2101 Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2102 }
2103
2104 debug(printf("(6) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
2105 level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
2106 }
2107
2108
2109 if (last_level < DISTANT_RNA && (*abort_pairing_p == true || concordant_score <= nmismatches_search)) {
2110 Substring_list_gc(&startfrags3_plus);
2111 Substring_list_gc(&endfrags3_plus);
2112 Substring_list_gc(&startfrags3_minus);
2113 Substring_list_gc(&endfrags3_minus);
2114 Substring_list_gc(&startfrags5_plus);
2115 Substring_list_gc(&endfrags5_plus);
2116 Substring_list_gc(&startfrags5_minus);
2117 Substring_list_gc(&endfrags5_minus);
2118 return hitpairs;
2119
2120 } else if (last_level < TERMINAL) {
2121 /* Terminals */
2122 if (last_level_5 < TERMINAL) {
2123 /* Need to recompute, because of new distant hits or because distant splicing was not run */
2124 min_trim_5 = Ladder_minimax_trim(sense_ladder5_plus,sense_ladder5_minus,
2125 antisense_ladder5_plus,antisense_ladder5_minus,querylength5);
2126
2127 if (min_trim_5 < min_distantsplicing_end_matches) {
2128 debug(printf("For 5' end, skipping terminals because min_trim %d < min_distantsplicing_end_matches %d\n",
2129 min_trim_5,min_distantsplicing_end_matches));
2130 } else {
2131 /* max_terminal_mismatches = done_level_5; */
2132 Terminal_solve_plus(&found_score_overall_5,&found_score_within_trims_5,
2133 &sense_hits5_gplus,&antisense_hits5_gplus,
2134 this5->queryfwd_plus_set,this5->queryrev_plus_set,
2135 mismatch_positions_alloc_5,/*queryuc_ptr_5,*/query5_compress_fwd,querylength5,
2136 genestrand,listpool,hitlistpool,level);
2137
2138 Terminal_solve_minus(&found_score_overall_5,&found_score_within_trims_5,
2139 &sense_hits5_gminus,&antisense_hits5_gminus,
2140 this5->queryfwd_minus_set,this5->queryrev_minus_set,
2141 mismatch_positions_alloc_5,/*queryrc5,*/query5_compress_rev,querylength5,
2142 genestrand,listpool,hitlistpool,level);
2143 }
2144 }
2145
2146 if (last_level_3 < DISTANT_RNA) {
2147 /* Need to recompute, because of new distant hits or because distant splicing was not run */
2148 min_trim_3 = Ladder_minimax_trim(sense_ladder3_plus,sense_ladder3_minus,
2149 antisense_ladder3_plus,antisense_ladder3_minus,querylength3);
2150
2151 if (min_trim_3 < min_distantsplicing_end_matches) {
2152 debug(printf("For 3'end, skipping terminals because min_trim %d < min_distantsplicing_end_matches %d\n",
2153 min_trim_3,min_distantsplicing_end_matches));
2154 } else {
2155 /* max_terminal_mismatches = done_level_3; */
2156 Terminal_solve_plus(&found_score_overall_3,&found_score_within_trims_3,
2157 &sense_hits3_gplus,&antisense_hits3_gplus,
2158 this3->queryfwd_plus_set,this3->queryrev_plus_set,
2159 mismatch_positions_alloc_3,/*queryuc_ptr_3,*/query3_compress_fwd,querylength3,
2160 genestrand,listpool,hitlistpool,level);
2161
2162 Terminal_solve_minus(&found_score_overall_3,&found_score_within_trims_3,
2163 &sense_hits3_gminus,&antisense_hits3_gminus,
2164 this3->queryfwd_minus_set,this3->queryrev_minus_set,
2165 mismatch_positions_alloc_3,/*queryrc3,*/query3_compress_rev,querylength3,
2166 genestrand,listpool,hitlistpool,level);
2167 }
2168 }
2169
2170 /* Sense */
2171 sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2172 sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2173 sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2174 sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2175
2176 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2177 hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2178 sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2179 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2180 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2181 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD); level++;
2182 Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2183 Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2184
2185 if (splicingp == true) {
2186 /* Antisense */
2187 antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2188 antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2189 antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2190 antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2191
2192 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2193 hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2194 antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2195 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2196 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2197 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI); level++;
2198 Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2199 Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2200 }
2201
2202 debug(printf("(7) After level %d, we have adjacent_score %d, concordant_score %d (vs allowed %d), %d pairs\n",
2203 level,adjacent_score,concordant_score,nmismatches_search,List_length(hitpairs)));
2204 }
2205
2206 Substring_list_gc(&startfrags3_plus);
2207 Substring_list_gc(&endfrags3_plus);
2208 Substring_list_gc(&startfrags3_minus);
2209 Substring_list_gc(&endfrags3_minus);
2210 Substring_list_gc(&startfrags5_plus);
2211 Substring_list_gc(&endfrags5_plus);
2212 Substring_list_gc(&startfrags5_minus);
2213 Substring_list_gc(&endfrags5_minus);
2214
2215 debug(printf("Exiting paired_read_segment_search\n"));
2216 return hitpairs;
2217 }
2218
2219
2220
2221 static List_T
paired_read(bool * abort_pairing_p,List_T * hits5,List_T * hits3,List_T * samechr,List_T * conc_transloc,char * queryuc_ptr_5,char * queryrc5,int querylength5,char * queryuc_ptr_3,char * queryrc3,int querylength3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_insertionlen_5,int max_insertionlen_3,int max_deletionlen_5,int max_deletionlen_3,int genestrand,Chrpos_T pairmax_linear,Chrpos_T overall_max_distance_5,Chrpos_T overall_max_distance_3,Chrpos_T overall_end_distance_5,Chrpos_T overall_end_distance_3,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)2222 paired_read (bool *abort_pairing_p, List_T *hits5, List_T *hits3, List_T *samechr, List_T *conc_transloc,
2223 char *queryuc_ptr_5, char *queryrc5, int querylength5, char *queryuc_ptr_3, char *queryrc3, int querylength3,
2224 int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
2225 Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
2226 Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
2227 int max_insertionlen_5, int max_insertionlen_3, int max_deletionlen_5, int max_deletionlen_3,
2228 int genestrand, Chrpos_T pairmax_linear, Chrpos_T overall_max_distance_5, Chrpos_T overall_max_distance_3,
2229 Chrpos_T overall_end_distance_5, Chrpos_T overall_end_distance_3,
2230
2231 int max_mismatches_refalt_5, int max_mismatches_refalt_3,
2232 int max_mismatches_ref_5, int max_mismatches_ref_3,
2233 int min_coverage_5, int min_coverage_3,
2234
2235 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
2236 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
2237
2238 List_T sense_hits5_gplus, sense_hits5_gminus, sense_hits3_gplus, sense_hits3_gminus,
2239 antisense_hits5_gplus, antisense_hits5_gminus, antisense_hits3_gplus, antisense_hits3_gminus;
2240 int adjacent_score, concordant_score;
2241 List_T hitpairs;
2242 int maxpairedpaths;
2243 int level5, level3;
2244 int found_score_overall_5, found_score_within_trims_5, found_score_overall_3, found_score_within_trims_3;
2245
2246 T this5, this3;
2247 Ladder_T sense_ladder5_plus, sense_ladder5_minus, sense_ladder3_plus, sense_ladder3_minus,
2248 antisense_ladder5_plus = NULL, antisense_ladder5_minus = NULL,
2249 antisense_ladder3_plus = NULL, antisense_ladder3_minus = NULL;
2250
2251
2252 *abort_pairing_p = false;
2253 hitpairs = (List_T) NULL;
2254 *samechr = (List_T) NULL;
2255 *conc_transloc = (List_T) NULL;
2256
2257 /* Take the larger of maxpaths_search and 10*maxpaths_report */
2258 maxpairedpaths = maxpaths_search;
2259 if (maxpairedpaths < 10*maxpaths_report) {
2260 maxpairedpaths = 10*maxpaths_report;
2261 }
2262
2263 adjacent_score = concordant_score = querylength5 + querylength3;
2264
2265
2266 /* TODO: Return level from single_read. Then if we don't have concordant hitpairs, take the min(level5,level3) and start paired_readfrom t
2267 here */
2268
2269 this5 = Stage1_new(querylength5);
2270 level5 = single_read_search(&found_score_overall_5,&found_score_within_trims_5,
2271 &sense_hits5_gplus,&sense_hits5_gminus,&antisense_hits5_gplus,&antisense_hits5_gminus,
2272 this5,queryuc_ptr_5,queryrc5,querylength5,
2273 mismatch_positions_alloc_5,query5_compress_fwd,query5_compress_rev,genestrand,
2274 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
2275 /*paired_end_p*/true,/*first_read_p*/true);
2276 debug(printf("sense: got %d plus and %d minus hits for 5' end\n",
2277 List_length(sense_hits5_gplus),List_length(sense_hits5_gminus)));
2278 debug(printf("antisense: got %d plus and %d minus hits for 5' end\n",
2279 List_length(antisense_hits5_gplus),List_length(antisense_hits5_gminus)));
2280
2281 this3 = Stage1_new(querylength3);
2282 level3 = single_read_search(&found_score_overall_3,&found_score_within_trims_3,
2283 &sense_hits3_gplus,&sense_hits3_gminus,&antisense_hits3_gplus,&antisense_hits3_gminus,
2284 this3,queryuc_ptr_3,queryrc3,querylength3,
2285 mismatch_positions_alloc_3,query3_compress_fwd,query3_compress_rev,genestrand,
2286 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
2287 /*paired_end_p*/true,/*first_read_p*/false);
2288 debug(printf("sense: got %d plus and %d minus hits for 3' end\n",
2289 List_length(sense_hits3_gplus),List_length(sense_hits3_gminus)));
2290 debug(printf("antisense: got %d plus and %d minus hits for 3' end\n",
2291 List_length(antisense_hits3_gplus),List_length(antisense_hits3_gminus)));
2292
2293 /* Sense */
2294 sense_ladder5_plus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2295 sense_ladder5_minus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2296 sense_ladder3_plus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2297 sense_ladder3_minus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2298
2299 sense_hits5_gplus = Stage3end_filter(sense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2300 sense_hits5_gminus = Stage3end_filter(sense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2301 sense_hits3_gplus = Stage3end_filter(sense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2302 sense_hits3_gminus = Stage3end_filter(sense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2303
2304 debug(printf("STARTING CONCORDANCE_PAIR_UP_GENOME, SENSE\n"));
2305 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2306 hitpairs,sense_hits5_gplus,sense_hits5_gminus,sense_hits3_gplus,sense_hits3_gminus,
2307 sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2308 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2309 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2310 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_FORWARD);
2311 Hitlist_free(&sense_hits5_gplus); Hitlist_free(&sense_hits5_gminus);
2312 Hitlist_free(&sense_hits3_gplus); Hitlist_free(&sense_hits3_gminus);
2313 debug(printf("RETURNING FROM CONCORDANCE_PAIR_UP_GENOME, SENSE\n"));
2314
2315 if (splicingp == true) {
2316 /* Antisense */
2317 antisense_ladder5_plus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2318 antisense_ladder5_minus = Ladder_new(NULL,hitlistpool,/*end5p*/true);
2319 antisense_ladder3_plus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2320 antisense_ladder3_minus = Ladder_new(NULL,hitlistpool,/*end5p*/false);
2321
2322 antisense_hits5_gplus = Stage3end_filter(antisense_hits5_gplus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2323 antisense_hits5_gminus = Stage3end_filter(antisense_hits5_gminus,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2324 antisense_hits3_gplus = Stage3end_filter(antisense_hits3_gplus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2325 antisense_hits3_gminus = Stage3end_filter(antisense_hits3_gminus,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2326
2327 debug(printf("STARTING CONCORDANCE_PAIR_UP_GENOME, ANTISENSE\n"));
2328 hitpairs = Concordance_pair_up_genome(&(*abort_pairing_p),&adjacent_score,&concordant_score,&(*conc_transloc),
2329 hitpairs,antisense_hits5_gplus,antisense_hits5_gminus,antisense_hits3_gplus,antisense_hits3_gminus,
2330 antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2331 querylength5,querylength3,mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2332 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2333 listpool,hitlistpool,maxpairedpaths,genestrand,/*sensedir*/SENSE_ANTI);
2334 Hitlist_free(&antisense_hits5_gplus); Hitlist_free(&antisense_hits5_gminus);
2335 Hitlist_free(&antisense_hits3_gplus); Hitlist_free(&antisense_hits3_gminus);
2336 debug(printf("RETURNING FROM CONCORDANCE_PAIR_UP_GENOME, ANTISENSE\n"));
2337 }
2338
2339 debug(printf("After initial concordance, have %d concordant hitpairs and %d concordant transloc\n\n",
2340 List_length(hitpairs),List_length(*conc_transloc)));
2341
2342
2343 /* DEBUGGING */
2344 if (hitpairs == NULL) {
2345 #if 0
2346 /* Computationally too expensive */
2347 Ladder_missing_paired_p(ladder5_plus,/*maxscore*/0) == true
2348 Ladder_missing_paired_p(ladder3_plus,/*maxscore*/0) == true
2349 Ladder_missing_paired_p(ladder5_minus,/*maxscore*/0) == true
2350 Ladder_missing_paired_p(ladder3_minus,/*maxscore*/0) == true
2351 #endif
2352
2353 debug(printf("CALLING PAIRED_READ_SEGMENT_SEARCH\n"));
2354 hitpairs = paired_read_segment_search(&(*abort_pairing_p),&(*samechr),&(*conc_transloc),hitpairs,
2355 level5,level3,found_score_overall_5,found_score_within_trims_5,
2356 found_score_overall_3,found_score_within_trims_3,this5,this3,
2357 sense_ladder5_plus,sense_ladder5_minus,sense_ladder3_plus,sense_ladder3_minus,
2358 antisense_ladder5_plus,antisense_ladder5_minus,antisense_ladder3_plus,antisense_ladder3_minus,
2359 queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2360 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2361 query5_compress_fwd,query5_compress_rev,
2362 query3_compress_fwd,query3_compress_rev,
2363 max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2364 genestrand,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2365 overall_end_distance_5,overall_end_distance_3,
2366 max_mismatches_refalt_5,max_mismatches_refalt_3,
2367 max_mismatches_ref_5,max_mismatches_ref_3,min_coverage_5,min_coverage_3,
2368 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2369 }
2370
2371 Stage1_free(&this3); Stage1_free(&this5);
2372
2373 *hits5 = *hits3 = (List_T) NULL;
2374 Ladder_to_hits(&(*hits5),&(*hits3),
2375 &sense_ladder5_plus,&sense_ladder5_minus,&sense_ladder3_plus,&sense_ladder3_minus,
2376 hitlistpool);
2377
2378 if (splicingp == true) {
2379 Ladder_to_hits(&(*hits5),&(*hits3),
2380 &antisense_ladder5_plus,&antisense_ladder5_minus,&antisense_ladder3_plus,&antisense_ladder3_minus,
2381 hitlistpool);
2382 }
2383
2384 debug(printf("Exiting paired_read\n"));
2385 return hitpairs;
2386 }
2387
2388
2389
2390 /* Have three lists: hitpairs, samechr, and conc_transloc => result */
2391 static Stage3pair_T *
consolidate_paired_results(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Pairtype_T * final_pairtype,Stage3end_T ** stage3array5,int * nhits5_primary,int * nhits5_altloc,int * first_absmq5,int * second_absmq5,Stage3end_T ** stage3array3,int * nhits3_primary,int * nhits3_altloc,int * first_absmq3,int * second_absmq3,List_T hitpairs,List_T samechr,List_T conc_transloc,List_T hits5,List_T hits3,char * queryuc_ptr_5,int querylength5,char * queryuc_ptr_3,int querylength3,char * quality_string_5,char * quality_string_3,int * mismatch_positions_alloc_5,int * mismatch_positions_alloc_3,Compress_T query5_compress_fwd,Compress_T query5_compress_rev,Compress_T query3_compress_fwd,Compress_T query3_compress_rev,int max_mismatches_refalt_5,int max_mismatches_refalt_3,int max_mismatches_ref_5,int max_mismatches_ref_3,int min_coverage_5,int min_coverage_3,Listpool_T listpool,Hitlistpool_T hitlistpool)2392 consolidate_paired_results (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq, Pairtype_T *final_pairtype,
2393 Stage3end_T **stage3array5, int *nhits5_primary, int *nhits5_altloc, int *first_absmq5, int *second_absmq5,
2394 Stage3end_T **stage3array3, int *nhits3_primary, int *nhits3_altloc, int *first_absmq3, int *second_absmq3,
2395 List_T hitpairs, List_T samechr, List_T conc_transloc, List_T hits5, List_T hits3,
2396
2397 char *queryuc_ptr_5, int querylength5, char *queryuc_ptr_3, int querylength3,
2398 char *quality_string_5, char *quality_string_3,
2399
2400 int *mismatch_positions_alloc_5, int *mismatch_positions_alloc_3,
2401 Compress_T query5_compress_fwd, Compress_T query5_compress_rev,
2402 Compress_T query3_compress_fwd, Compress_T query3_compress_rev,
2403
2404 int max_mismatches_refalt_5, int max_mismatches_refalt_3,
2405 int max_mismatches_ref_5, int max_mismatches_ref_3,
2406 int min_coverage_5, int min_coverage_3,
2407 Listpool_T listpool, Hitlistpool_T hitlistpool) {
2408 Stage3pair_T *stage3pairarray, stage3pair, newpair;
2409 Stage3end_T hit5, hit3;
2410 List_T result, singlehits5, singlehits3, p;
2411 int genestrand, sensedir;
2412 Pairtype_T pairtype;
2413 int best_nmatches_paired, best_nmatches_paired_5, best_nmatches_paired_3;
2414
2415
2416 *final_pairtype = choose_among_paired(&best_nmatches_paired,&best_nmatches_paired_5,&best_nmatches_paired_3,
2417 hitpairs,samechr,conc_transloc);
2418 debug16(printf("Entered consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2419
2420 /* DEBUGGING */
2421 if (*final_pairtype == CONCORDANT) {
2422 /* Have concordant results */
2423 debug16(printf("Have %d concordant results, %d samechr, and %d conc_transloc\n",
2424 List_length(hitpairs),List_length(samechr),List_length(conc_transloc)));
2425 for (p = samechr; p != NULL; p = List_next(p)) {
2426 stage3pair = (Stage3pair_T) List_head(p);
2427 Stage3pair_free(&stage3pair);
2428 }
2429 Hitlist_free(&samechr);
2430
2431 for (p = conc_transloc; p != NULL; p = List_next(p)) {
2432 stage3pair = (Stage3pair_T) List_head(p);
2433 Stage3pair_free(&stage3pair);
2434 }
2435 Hitlist_free(&conc_transloc);
2436
2437 if (1) {
2438 debug16(printf("Before removing overlaps, %d results\n",List_length(hitpairs)));
2439 result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2440 debug16(printf("After Stage3pair_optimal_score_prefinal, %d results\n",List_length(result)));
2441 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2442 debug16(printf("After Stage3pair_remove_overlaps, %d results\n",List_length(result)));
2443 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2444 debug16(printf("After Stage3pair_optimal_score_final, %d results\n",List_length(result)));
2445
2446 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2447 /* result = Stage3pair_sort_distance(result); */
2448 debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2449
2450 } else if (true /*gmap_improvement_p == false*/) {
2451 debug16(printf("No GMAP improvement: Before removing overlaps, %d results\n",List_length(hitpairs)));
2452 result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2453 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2454 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2455
2456 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2457 /* result = Stage3pair_sort_distance(result); */
2458 debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2459
2460 } else {
2461 debug16(printf("GMAP improvement: Before removing overlaps, %d results\n",List_length(hitpairs)));
2462 result = Stage3pair_optimal_score(hitpairs,hitlistpool,querylength5,querylength3,/*finalp*/false);
2463 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/false);
2464 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2465 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2466 /* result = Stage3pair_sort_distance(result); */
2467 debug16(printf("After removing overlaps, %d results\n",List_length(result)));
2468
2469 #ifdef TODO
2470 result = align_pair_with_gmap(&(*final_pairtype),result,
2471 queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2472 oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2473 cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/true);
2474 if (Stage3pair_sense_consistent_p(result) == false) {
2475 debug16(printf("sense is not consistent\n"));
2476 result = align_pair_with_gmap(&(*final_pairtype),result,
2477 queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2478 oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2479 cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/true);
2480 }
2481 #endif
2482
2483 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2484 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2485 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2486
2487 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2488 }
2489
2490 } else if (*final_pairtype == PAIRED_UNSPECIFIED) {
2491 /* Have paired results */
2492 debug16(printf("Have paired unspecified\n"));
2493 for (p = hitpairs; p != NULL; p = List_next(p)) {
2494 stage3pair = (Stage3pair_T) List_head(p);
2495 Stage3pair_free(&stage3pair);
2496 }
2497 Hitlist_free(&hitpairs);
2498
2499 for (p = conc_transloc; p != NULL; p = List_next(p)) {
2500 stage3pair = (Stage3pair_T) List_head(p);
2501 Stage3pair_free(&stage3pair);
2502 }
2503 Hitlist_free(&conc_transloc);
2504
2505 if (true /*gmap_improvement_p == false*/) {
2506 debug16(printf("No GMAP improvement: Before removing overlaps, %d results\n",List_length(samechr)));
2507 result = Stage3pair_optimal_score(samechr,hitlistpool,querylength5,querylength3,/*finalp*/false);
2508 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2509 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2510 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2511 } else {
2512 debug16(printf("GMAP improvement: Before removing overlaps, %d results\n",List_length(samechr)));
2513 result = Stage3pair_optimal_score(samechr,hitlistpool,querylength5,querylength3,/*finalp*/false);
2514 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/false);
2515 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/false);
2516 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2517
2518 #ifdef TODO
2519 result = align_pair_with_gmap(&(*final_pairtype),result,
2520 queryuc_ptr_5,queryuc_ptr_3,querylength5,querylength3,
2521 oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,
2522 cutoff_level_5,cutoff_level_3,/*expect_concordant_p*/false);
2523 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2524 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/false,/*finalp*/true);
2525 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2526 #endif
2527
2528 if (Stage3pair_concordantp(result) == true) {
2529 debug16(printf("Found remaining concordant solution, so removing non-concordant ones\n"));
2530 *final_pairtype = CONCORDANT;
2531 result = Stage3pair_filter_nonconcordant(result,hitlistpool);
2532 debug16(printf("Concordant results: %d\n",List_length(result)));
2533 } else {
2534 *final_pairtype = PAIRED_UNSPECIFIED;
2535 }
2536
2537 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2538 }
2539
2540 *final_pairtype = CONCORDANT; /* Because paired terminals are each concordant */
2541
2542
2543 } else if (*final_pairtype == CONCORDANT_TRANSLOCATIONS) {
2544 debug16(printf("Have %d concordant translocation results\n",List_length(conc_transloc)));
2545 for (p = hitpairs; p != NULL; p = List_next(p)) {
2546 stage3pair = (Stage3pair_T) List_head(p);
2547 Stage3pair_free(&stage3pair);
2548 }
2549 Hitlist_free(&hitpairs);
2550
2551 for (p = samechr; p != NULL; p = List_next(p)) {
2552 stage3pair = (Stage3pair_T) List_head(p);
2553 Stage3pair_free(&stage3pair);
2554 }
2555 Hitlist_free(&samechr);
2556
2557 result = Stage3pair_optimal_score(conc_transloc,hitlistpool,querylength5,querylength3,/*finalp*/false);
2558 result = Stage3pair_remove_overlaps(result,hitlistpool,querylength5,querylength3,/*translocp*/true,/*finalp*/true);
2559 result = Stage3pair_optimal_score(result,hitlistpool,querylength5,querylength3,/*finalp*/true);
2560
2561 /* result = Stage3pair_resolve_multimapping(result,hitlistpool); */
2562 debug16(printf("Finally, have %d concordant translocation results\n",List_length(result)));
2563
2564 } else {
2565 debug16(printf("Have unpaired results\n"));
2566 /* Need to free conc_transloc, since we can get here with multiple results */
2567 for (p = conc_transloc; p != NULL; p = List_next(p)) {
2568 stage3pair = (Stage3pair_T) List_head(p);
2569 Stage3pair_free(&stage3pair);
2570 }
2571 Hitlist_free(&conc_transloc);
2572
2573 result = (List_T) NULL;
2574 }
2575
2576 debug16(printf("After eval: %d in result, %d hitpairs, %d conc_transloc, %d samechr\n",
2577 List_length(result),List_length(hitpairs),List_length(conc_transloc),List_length(samechr)));
2578
2579 if (result == NULL) {
2580 debug(printf("Have %d hits on 5' end before filter_coverage with min-coverage of %d\n",
2581 List_length(hits5),min_coverage_5));
2582 singlehits5 = Stage3end_filter(hits5,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5);
2583 debug(printf("Have %d hits on 5' end after filter_coverage\n",List_length(singlehits5)));
2584 debug16(printf("Entering Stage3end_optimal_score for 5' end\n"));
2585 singlehits5 = Stage3end_optimal_score(singlehits5,hitlistpool,querylength5,/*finalp*/false);
2586
2587 /* singlehits5 = Stage3end_reject_trimlengths(singlehits5); */
2588 singlehits5 = Stage3end_linearize_5(singlehits5);
2589 singlehits5 = Stage3end_remove_overlaps(singlehits5,hitlistpool,querylength5,/*finalp*/true);
2590 singlehits5 = Stage3end_optimal_score(singlehits5,hitlistpool,querylength5,/*finalp*/true);
2591 /* singlehits5 = Stage3end_resolve_multimapping(singlehits5,hitlistpool); */
2592
2593
2594 debug(printf("Have %d hits on 3' end before filter_coverage with min_coverage of %d\n",
2595 List_length(hits3),min_coverage_3));
2596 singlehits3 = Stage3end_filter(hits3,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3);
2597 debug(printf("Have %d hits on 3' end after filter_coverage\n",List_length(singlehits3)));
2598 debug16(printf("Entering Stage3end_optimal_score for 3' end\n"));
2599 singlehits3 = Stage3end_optimal_score(singlehits3,hitlistpool,querylength3,/*finalp*/false);
2600
2601 /* singlehits3 = Stage3end_reject_trimlengths(singlehits3); */
2602 singlehits3 = Stage3end_linearize_3(singlehits3);
2603 singlehits3 = Stage3end_remove_overlaps(singlehits3,hitlistpool,querylength3,/*finalp*/true);
2604 singlehits3 = Stage3end_optimal_score(singlehits3,hitlistpool,querylength3,/*finalp*/true);
2605 /* singlehits3 = Stage3end_resolve_multimapping(singlehits3,hitlistpool); */
2606
2607
2608 debug16(printf("5' end has %d hits and 3' end has %d hits\n",
2609 List_length(singlehits5),List_length(singlehits3)));
2610
2611 if (List_length(singlehits5) == 1 && List_length(singlehits3) == 1) {
2612 hit5 = (Stage3end_T) List_head(singlehits5);
2613 hit3 = (Stage3end_T) List_head(singlehits3);
2614 if ((genestrand = Stage3end_genestrand(hit5)) == Stage3end_genestrand(hit3) &&
2615 (sensedir = Stage3end_sensedir(hit5) == Stage3end_sensedir(hit3)) &&
2616 (pairtype = Stage3_determine_pairtype(hit5,hit3,/*stage3pair*/NULL)) != UNPAIRED) {
2617 /* Convert unpaired uniq to a paired uniq */
2618 debug16(printf("Converting unpaired uniq to paired uniq, with initial pairtype %s\n",Pairtype_string(pairtype)));
2619 if ((newpair = Stage3pair_new(hit5,hit3,genestrand,sensedir,pairtype,
2620 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2621 query5_compress_fwd,query5_compress_rev,
2622 query3_compress_fwd,query3_compress_rev,
2623 #if 0
2624 queryuc_ptr_5,queryuc_ptr_3,
2625 pairpool,dynprogL,dynprogM,dynprogR,oligoindices_minor,
2626 diagpool,cellpool,
2627 #endif
2628 listpool,/*expect_concordant_p*/pairtype == CONCORDANT ? true : false,
2629 /*transcriptome_guided_p*/false)) != NULL) {
2630 result = Hitlist_push(NULL,hitlistpool,(void *) newpair);
2631
2632 *nhits5_primary = *nhits5_altloc = 0;
2633 *nhits3_primary = *nhits3_altloc = 0;
2634 *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2635
2636 if (Stage3pair_altlocp(newpair) == true) {
2637 *npaths_primary = 0;
2638 *npaths_altloc = 1;
2639 } else {
2640 *npaths_primary = 1;
2641 *npaths_altloc = 0;
2642 }
2643 if (pairtype == CONCORDANT) {
2644 debug16(printf("final pairtype is CONCORDANT\n"));
2645 *final_pairtype = CONCORDANT;
2646 } else {
2647 debug16(printf("final pairtype is PAIRED_UNSPECIFIED\n"));
2648 *final_pairtype = PAIRED_UNSPECIFIED;
2649 }
2650
2651 stage3pairarray = (Stage3pair_T *) CALLOC_OUT(1,sizeof(Stage3pair_T));
2652 stage3pairarray[0] = (Stage3pair_T) List_head(result);
2653 Hitlist_free(&result);
2654
2655 #if 0
2656 Stage3pair_privatize(stage3pairarray,/*npairs*/1);
2657 #endif
2658 Stage3pair_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
2659 &(*first_absmq),&(*second_absmq),stage3pairarray,
2660 queryuc_ptr_5,queryuc_ptr_3,quality_string_5,quality_string_3);
2661
2662 stage3list_gc(&singlehits3);
2663 stage3list_gc(&singlehits5);
2664 debug16(printf("1 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2665 return stage3pairarray;
2666 }
2667 }
2668 }
2669
2670 /* Fall through: halfmapping or unpaired */
2671 *npaths_primary = *npaths_altloc = 0;
2672 *final_pairtype = UNPAIRED;
2673
2674 /* singlehits5 = Stage3end_filter(singlehits5,hitlistpool,max_mismatches_refalt_5,max_mismatches_ref_5,min_coverage_5); */
2675 if (singlehits5 == NULL) {
2676 *nhits5_primary = *nhits5_altloc = 0;
2677 *stage3array5 = (Stage3end_T *) NULL;
2678 } else {
2679 #if 0
2680 singlehits5 = Stage3end_unalias_circular(singlehits5);
2681 #else
2682 singlehits5 = Stage3end_remove_circular_alias(singlehits5,hitlistpool); /* Contains a call to unalias_circular */
2683 singlehits5 = Stage3end_remove_duplicates(singlehits5,hitlistpool); /* Aliases can cause duplicates */
2684 Stage3end_count_hits(&(*nhits5_primary),&(*nhits5_altloc),singlehits5);
2685 #endif
2686 *stage3array5 = (Stage3end_T *) List_to_array_out(singlehits5,NULL); Hitlist_free(&singlehits5); /* Return value */
2687 }
2688
2689 /* singlehits3 = Stage3end_filter(singlehits3,hitlistpool,max_mismatches_refalt_3,max_mismatches_ref_3,min_coverage_3); */
2690 if (singlehits3 == NULL) {
2691 *nhits3_primary = *nhits3_altloc = 0;
2692 *stage3array3 = (Stage3end_T *) NULL;
2693 } else {
2694 #if 0
2695 singlehits3 = Stage3end_unalias_circular(singlehits3);
2696 #else
2697 singlehits3 = Stage3end_remove_circular_alias(singlehits3,hitlistpool); /* Contains a call to unalias_circular */
2698 singlehits3 = Stage3end_remove_duplicates(singlehits3,hitlistpool); /* Aliases can cause duplicates */
2699 Stage3end_count_hits(&(*nhits3_primary),&(*nhits3_altloc),singlehits3);
2700 #endif
2701 *stage3array3 = (Stage3end_T *) List_to_array_out(singlehits3,NULL); Hitlist_free(&singlehits3); /* Return value */
2702 }
2703
2704 if ((*nhits5_primary) + (*nhits5_altloc) > 0) {
2705 if ((*nhits3_primary) + (*nhits3_altloc) == 1) {
2706 /* Use single 3' hit to guide sorting of multiple 5' hits */
2707 *stage3array5 = Stage3end_eval_and_sort_guided((*nhits5_primary) + (*nhits5_altloc),
2708 &(*first_absmq5),&(*second_absmq5),/*guide*/(*stage3array3)[0],
2709 *stage3array5,queryuc_ptr_5,quality_string_5,/*displayp*/true);
2710 } else {
2711 *stage3array5 = Stage3end_eval_and_sort((*nhits5_primary) + (*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2712 *stage3array5,queryuc_ptr_5,quality_string_5,/*displayp*/true);
2713 }
2714 }
2715
2716 if ((*nhits3_primary) + (*nhits3_altloc) > 0) {
2717 if ((*nhits5_primary) + (*nhits5_altloc) == 1) {
2718 /* Use single 5' hit to guide sorting of multiple 3' hits */
2719 *stage3array3 = Stage3end_eval_and_sort_guided((*nhits3_primary) + (*nhits3_altloc),
2720 &(*first_absmq3),&(*second_absmq3),/*guide*/(*stage3array5)[0],
2721 *stage3array3,queryuc_ptr_3,quality_string_3,/*displayp*/true);
2722 } else {
2723 *stage3array3 = Stage3end_eval_and_sort((*nhits3_primary) + (*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2724 *stage3array3,queryuc_ptr_3,quality_string_3,/*displayp*/true);
2725 }
2726 }
2727 debug16(printf("Result is NULL, and we have %d hits on 5' end and %d hits on 3' end\n",
2728 (*nhits5_primary) + (*nhits5_altloc),(*nhits3_primary) + (*nhits3_altloc)));
2729 debug16(printf("2 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2730 return (Stage3pair_T *) NULL;
2731
2732 } else {
2733 debug16(printf("final pairtype is %d\n",*final_pairtype));
2734 debug16(printf("Result is not NULL (%d paths), and we fall through to concordant, paired, or transloc pairs\n",
2735 List_length(result)));
2736 debug16(printf("Filtering: max_mismatches %d and %d, min_coverage %d and %d\n",
2737 max_mismatches_refalt_5,max_mismatches_refalt_3,min_coverage_5,min_coverage_3));
2738
2739 #if 0
2740 /* Eliminates entire pair even if only one end is bad. Should filter each end, and not each pair */
2741 result = Stage3pair_filter(result,hitlistpool,max_mismatches_5,max_mismatches_3,min_coverage_5,min_coverage_3);
2742 #endif
2743
2744 if (result == NULL) {
2745 *npaths_primary = *npaths_altloc = 0;
2746 stage3list_gc(&hits3);
2747 stage3list_gc(&hits5);
2748 *nhits5_primary = *nhits5_altloc = 0;
2749 *nhits3_primary = *nhits3_altloc = 0;
2750 *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2751 debug16(printf("3 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2752 return (Stage3pair_T *) NULL;
2753
2754 } else {
2755 /* result != NULL */
2756 /* Concordant, paired, or transloc pairs found. Remove single hits. */
2757 Stage3pair_count_hits(&(*npaths_primary),&(*npaths_altloc),result);
2758 stage3pairarray = (Stage3pair_T *) List_to_array_out(result,NULL); Hitlist_free(&result); /* Return value */
2759 #if 0
2760 Stage3pair_privatize(stage3pairarray,/*npaths*/(*npaths_primary) + (*npaths_altloc));
2761 #endif
2762 Stage3pair_eval_and_sort(/*npaths*/(*npaths_primary) + (*npaths_altloc),
2763 &(*first_absmq),&(*second_absmq),
2764 stage3pairarray,queryuc_ptr_5,queryuc_ptr_3,quality_string_5,quality_string_3);
2765 stage3list_gc(&hits3);
2766 stage3list_gc(&hits5);
2767
2768 *nhits5_primary = *nhits5_altloc = 0;
2769 *nhits3_primary = *nhits3_altloc = 0;
2770 *stage3array5 = *stage3array3 = (Stage3end_T *) NULL;
2771 debug16(printf("4 Exiting consolidate_paired_results with final_pairtype %d\n",*final_pairtype));
2772
2773 return stage3pairarray;
2774 }
2775 }
2776 }
2777
2778
2779 Stage3pair_T *
Stage1_paired_read(int * npaths_primary,int * npaths_altloc,int * first_absmq,int * second_absmq,Pairtype_T * final_pairtype,Stage3end_T ** stage3array5,int * nhits5_primary,int * nhits5_altloc,int * first_absmq5,int * second_absmq5,Stage3end_T ** stage3array3,int * nhits3_primary,int * nhits3_altloc,int * first_absmq3,int * second_absmq3,Shortread_T queryseq5,Shortread_T queryseq3,Chrpos_T pairmax_linear,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool)2780 Stage1_paired_read (int *npaths_primary, int *npaths_altloc, int *first_absmq, int *second_absmq, Pairtype_T *final_pairtype,
2781 Stage3end_T **stage3array5, int *nhits5_primary, int *nhits5_altloc, int *first_absmq5, int *second_absmq5,
2782 Stage3end_T **stage3array3, int *nhits3_primary, int *nhits3_altloc, int *first_absmq3, int *second_absmq3,
2783 Shortread_T queryseq5, Shortread_T queryseq3, Chrpos_T pairmax_linear,
2784 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
2785 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool) {
2786 Stage3pair_T *stage3pairarray;
2787 bool abort_pairing_p, geneplus_abort_pairing_p, geneminus_abort_pairing_p;
2788 List_T hitpairs, geneplus_hitpairs, geneminus_hitpairs;
2789 List_T hits5, hits3, geneplus_hits5, geneplus_hits3, geneminus_hits5, geneminus_hits3;
2790 List_T samechr, geneplus_samechr, geneminus_samechr;
2791 List_T conc_transloc, geneplus_conc_transloc, geneminus_conc_transloc;
2792
2793 int querylength5, querylength3;
2794 int max_mismatches_refalt_5, max_mismatches_refalt_3;
2795 int max_mismatches_ref_5, max_mismatches_ref_3;
2796 int min_coverage_5, min_coverage_3;
2797 int max_middle_insertions_5, max_middle_insertions_3, max_middle_deletions_5, max_middle_deletions_3;
2798 int max_insertionlen_5, max_insertionlen_3, max_deletionlen_5, max_deletionlen_3;
2799 Chrpos_T overall_max_distance_5, overall_max_distance_3, overall_end_distance_5, overall_end_distance_3;
2800 char *queryuc_ptr_5, *queryuc_ptr_3, *queryrc5, *queryrc3, *quality_string_5, *quality_string_3;
2801 int *mismatch_positions_alloc_5, *mismatch_positions_alloc_3;
2802 Compress_T query5_compress_fwd, query5_compress_rev, query3_compress_fwd, query3_compress_rev;
2803
2804
2805 querylength5 = Shortread_fulllength(queryseq5);
2806 querylength3 = Shortread_fulllength(queryseq3);
2807
2808 /* Previously used user_mismatches_refalt_float in searching, now just for filtering */
2809 if (user_mismatches_refalt_float < 0.0) {
2810 max_mismatches_refalt_5 = querylength5;
2811 max_mismatches_refalt_3 = querylength3;
2812 } else if (user_mismatches_refalt_float > 0.0 && user_mismatches_refalt_float < 1.0) {
2813 max_mismatches_refalt_5 = (int) rint(user_mismatches_refalt_float * (double) querylength5);
2814 max_mismatches_refalt_3 = (int) rint(user_mismatches_refalt_float * (double) querylength3);
2815 } else {
2816 /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
2817 max_mismatches_refalt_5 = max_mismatches_refalt_3 = (int) user_mismatches_refalt_float;
2818 }
2819
2820 if (user_mismatches_ref_float < 0.0) {
2821 max_mismatches_ref_5 = querylength5;
2822 max_mismatches_ref_3 = querylength3;
2823 } else if (user_mismatches_ref_float > 0.0 && user_mismatches_ref_float < 1.0) {
2824 max_mismatches_ref_5 = (int) rint(user_mismatches_ref_float * (double) querylength5);
2825 max_mismatches_ref_3 = (int) rint(user_mismatches_ref_float * (double) querylength3);
2826 } else {
2827 /* Assuming that --max-mismatches=1 must mean 1 bp and not querylength */
2828 max_mismatches_ref_5 = max_mismatches_ref_3 = (int) user_mismatches_ref_float;
2829 }
2830
2831 if (user_mincoverage_float <= 0.0) {
2832 min_coverage_5 = min_coverage_3 = 0;
2833 } else if (user_mincoverage_float <= 1.0) {
2834 /* Assuming that --min-coverage=1 must mean 1.0 and not a coverage of 1 bp */
2835 min_coverage_5 = (int) rint(user_mincoverage_float * (double) querylength5);
2836 min_coverage_3 = (int) rint(user_mincoverage_float * (double) querylength3);
2837 } else {
2838 min_coverage_5 = min_coverage_3 = (int) user_mincoverage_float;
2839 }
2840
2841 if (max_middle_insertions_float > 0.0 && max_middle_insertions_float < 1.0) {
2842 max_middle_insertions_5 = (int) rint(max_middle_insertions_float * (double) querylength5);
2843 max_middle_insertions_3 = (int) rint(max_middle_insertions_float * (double) querylength3);
2844 } else {
2845 max_middle_insertions_5 = max_middle_insertions_3 = (int) max_middle_insertions_float;
2846 }
2847 max_insertionlen_5 = max_middle_insertions_5 > max_end_insertions ? max_middle_insertions_5 : max_end_insertions;
2848 max_insertionlen_3 = max_middle_insertions_3 > max_end_insertions ? max_middle_insertions_3 : max_end_insertions;
2849
2850 if (max_middle_deletions_float > 0.0 && max_middle_deletions_float < 1.0) {
2851 max_middle_deletions_5 = (int) rint(max_middle_deletions_float * (double) querylength5);
2852 max_middle_deletions_3 = (int) rint(max_middle_deletions_float * (double) querylength3);
2853 } else {
2854 max_middle_deletions_5 = max_middle_insertions_3 = (int) max_middle_deletions_float;
2855 }
2856 max_deletionlen_5 = max_middle_deletions_5 > max_end_deletions ? max_middle_deletions_5 : max_end_deletions;
2857 max_deletionlen_3 = max_middle_deletions_3 > max_end_deletions ? max_middle_deletions_3 : max_end_deletions;
2858
2859 overall_max_distance_5 = shortsplicedist;
2860 if ((Chrpos_T) max_middle_deletions_5 > overall_max_distance_5) {
2861 overall_max_distance_5 = (Chrpos_T) max_middle_deletions_5;
2862 }
2863 if ((Chrpos_T) max_middle_insertions_5 > overall_max_distance_5) {
2864 overall_max_distance_5 = (Chrpos_T) max_middle_insertions_5;
2865 }
2866 overall_end_distance_5 = shortsplicedist_novelend > (Chrpos_T) max_deletionlen_5 ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen_5;
2867
2868 overall_max_distance_3 = shortsplicedist;
2869 if ((Chrpos_T) max_middle_deletions_3 > overall_max_distance_3) {
2870 overall_max_distance_3 = (Chrpos_T) max_middle_deletions_3;
2871 }
2872 if ((Chrpos_T) max_middle_insertions_3 > overall_max_distance_3) {
2873 overall_max_distance_3 = (Chrpos_T) max_middle_insertions_3;
2874 }
2875 overall_end_distance_3 = shortsplicedist_novelend > (Chrpos_T) max_deletionlen_3 ? shortsplicedist_novelend : (Chrpos_T) max_deletionlen_3;
2876
2877
2878 queryuc_ptr_5 = Shortread_fullpointer_uc(queryseq5);
2879 queryuc_ptr_3 = Shortread_fullpointer_uc(queryseq3);
2880 quality_string_5 = Shortread_quality_string(queryseq5);
2881 quality_string_3 = Shortread_quality_string(queryseq3);
2882
2883 queryrc5 = (char *) MALLOC((querylength5+1)*sizeof(char));
2884 queryrc3 = (char *) MALLOC((querylength3+1)*sizeof(char));
2885 make_complement_buffered(queryrc5,queryuc_ptr_5,querylength5);
2886 make_complement_buffered(queryrc3,queryuc_ptr_3,querylength3);
2887
2888 mismatch_positions_alloc_5 = (int *) MALLOC((querylength5+1)*sizeof(int));
2889 mismatch_positions_alloc_3 = (int *) MALLOC((querylength3+1)*sizeof(int));
2890
2891 query5_compress_fwd = Compress_new_fwd(queryuc_ptr_5,querylength5);
2892 query5_compress_rev = Compress_new_rev(queryuc_ptr_5,querylength5);
2893 query3_compress_fwd = Compress_new_fwd(queryuc_ptr_3,querylength3);
2894 query3_compress_rev = Compress_new_rev(queryuc_ptr_3,querylength3);
2895
2896
2897 if (mode == STANDARD || mode == CMET_STRANDED || mode == ATOI_STRANDED || mode == TTOC_STRANDED) {
2898 hitpairs = paired_read(&abort_pairing_p,&hits5,&hits3,&samechr,&conc_transloc,
2899 queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2900 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2901 query5_compress_fwd,query5_compress_rev,
2902 query3_compress_fwd,query3_compress_rev,
2903 max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2904 /*genestrand*/0,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2905 overall_end_distance_5,overall_end_distance_3,
2906 max_mismatches_refalt_5,max_mismatches_refalt_3,
2907 max_mismatches_ref_5,max_mismatches_ref_3,
2908 min_coverage_5,min_coverage_3,
2909 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2910 if (abort_pairing_p == true) {
2911 /* Too many concordant. Could put into a separate split output */
2912 debug(printf("abort_pairing_p is true\n"));
2913 *final_pairtype = CONCORDANT;
2914 } else {
2915 *final_pairtype = CONCORDANT;
2916 }
2917
2918 stage3pairarray =
2919 consolidate_paired_results(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
2920 &(*stage3array5),&(*nhits5_primary),&(*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2921 &(*stage3array3),&(*nhits3_primary),&(*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2922 hitpairs,samechr,conc_transloc,hits5,hits3,
2923
2924 queryuc_ptr_5,querylength5,queryuc_ptr_3,querylength3,
2925 quality_string_5,quality_string_3,
2926
2927 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2928 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2929
2930 max_mismatches_refalt_5,max_mismatches_refalt_3,
2931 max_mismatches_ref_5,max_mismatches_ref_3,
2932 min_coverage_5,min_coverage_3,
2933 /*oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,*/
2934 listpool,hitlistpool);
2935
2936 } else if (mode == CMET_NONSTRANDED || mode == ATOI_NONSTRANDED || mode == TTOC_NONSTRANDED) {
2937 geneplus_hitpairs = paired_read(&geneplus_abort_pairing_p,&geneplus_hits5,&geneplus_hits3,
2938 &geneplus_samechr,&geneplus_conc_transloc,
2939 queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2940 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2941 query5_compress_fwd,query5_compress_rev,
2942 query3_compress_fwd,query3_compress_rev,
2943 max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2944 /*genestrand*/+1,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2945 overall_end_distance_5,overall_end_distance_3,
2946 max_mismatches_refalt_5,max_mismatches_refalt_3,
2947 max_mismatches_ref_5,max_mismatches_ref_3,
2948 min_coverage_5,min_coverage_3,
2949 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2950
2951 geneminus_hitpairs = paired_read(&geneminus_abort_pairing_p,&geneminus_hits5,&geneminus_hits3,
2952 &geneminus_samechr,&geneminus_conc_transloc,
2953 queryuc_ptr_5,queryrc5,querylength5,queryuc_ptr_3,queryrc3,querylength3,
2954 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2955 query5_compress_fwd,query5_compress_rev,
2956 query3_compress_fwd,query3_compress_rev,
2957 max_insertionlen_5,max_insertionlen_3,max_deletionlen_5,max_deletionlen_3,
2958 /*genestrand*/+2,pairmax_linear,overall_max_distance_5,overall_max_distance_3,
2959 overall_end_distance_5,overall_end_distance_3,
2960 max_mismatches_refalt_5,max_mismatches_refalt_3,
2961 max_mismatches_ref_5,max_mismatches_ref_3,
2962 min_coverage_5,min_coverage_3,
2963 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool);
2964
2965 if (geneplus_abort_pairing_p == true || geneminus_abort_pairing_p == true) {
2966 /* Too many concordant. Could put into a separate split output */
2967 debug(printf("abort_pairing_p is true\n"));
2968 *final_pairtype = CONCORDANT;
2969 } else {
2970 *final_pairtype = CONCORDANT;
2971 }
2972
2973 hits5 = List_append(geneplus_hits5,geneminus_hits5);
2974 hits3 = List_append(geneplus_hits3,geneminus_hits3);
2975 hitpairs = List_append(geneplus_hitpairs,geneminus_hitpairs);
2976 samechr = List_append(geneplus_samechr,geneminus_samechr);
2977 conc_transloc = List_append(geneplus_conc_transloc,geneminus_conc_transloc);
2978
2979 stage3pairarray =
2980 consolidate_paired_results(&(*npaths_primary),&(*npaths_altloc),&(*first_absmq),&(*second_absmq),&(*final_pairtype),
2981 &(*stage3array5),&(*nhits5_primary),&(*nhits5_altloc),&(*first_absmq5),&(*second_absmq5),
2982 &(*stage3array3),&(*nhits3_primary),&(*nhits3_altloc),&(*first_absmq3),&(*second_absmq3),
2983 hitpairs,samechr,conc_transloc,hits5,hits3,
2984
2985 queryuc_ptr_5,querylength5,queryuc_ptr_3,querylength3,
2986 quality_string_5,quality_string_3,
2987
2988 mismatch_positions_alloc_5,mismatch_positions_alloc_3,
2989 query5_compress_fwd,query5_compress_rev,query3_compress_fwd,query3_compress_rev,
2990
2991 max_mismatches_refalt_5,max_mismatches_refalt_3,
2992 max_mismatches_ref_5,max_mismatches_ref_3,
2993 min_coverage_5,min_coverage_3,
2994 /*oligoindices_minor,pairpool,diagpool,cellpool,dynprogL,dynprogM,dynprogR,*/
2995 listpool,hitlistpool);
2996
2997 } else {
2998 fprintf(stderr,"Do not recognize mode %d\n",mode);
2999 abort();
3000 }
3001
3002 Compress_free(&query5_compress_fwd);
3003 Compress_free(&query5_compress_rev);
3004 Compress_free(&query3_compress_fwd);
3005 Compress_free(&query3_compress_rev);
3006
3007 FREE(mismatch_positions_alloc_3);
3008 FREE(mismatch_positions_alloc_5);
3009
3010 FREE(queryrc3);
3011 FREE(queryrc5);
3012
3013 debug(printf("Returning with final_pairtype %d\n",*final_pairtype));
3014 return stage3pairarray;
3015 }
3016
3017
3018 void
Stage1hr_setup(Univ_IIT_T transcript_iit_in,Transcriptome_T transcriptome_in,Genome_T transcriptomebits_in,bool use_only_transcriptome_p_in,Indexdb_T indexdb_fwd_in,Indexdb_T indexdb_rev_in,Indexdb_T indexdb_tr_in,int index1part_tr_in,int index1part_in,int index1interval_in,double user_mismatches_refalt_float_in,double user_mismatches_ref_float_in,double user_mincoverage_float_in,double max_middle_insertions_float_in,double max_middle_deletions_float_in,int max_end_insertions_in,int max_end_deletions_in,Univ_IIT_T chromosome_iit_in,int nchromosomes_in,Genome_T genomecomp_in,Genome_T genomebits_in,Genome_T genomebits_alt_in,Mode_T mode_in,int maxpaths_search_in,int maxpaths_report_in,bool find_dna_chimeras_p_in,bool distances_observed_p_in,int subopt_levels_in,bool splicingp_in,Chrpos_T shortsplicedist_in,Chrpos_T shortsplicedist_novelend_in,Chrpos_T min_intronlength_in,Chrpos_T expected_pairlength_in,Chrpos_T pairlength_deviation_in,int distantsplicing_penalty_in,int min_distantsplicing_end_matches_in,int min_distantsplicing_identity_in)3019 Stage1hr_setup (Univ_IIT_T transcript_iit_in, Transcriptome_T transcriptome_in, Genome_T transcriptomebits_in,
3020 bool use_only_transcriptome_p_in,
3021
3022 Indexdb_T indexdb_fwd_in, Indexdb_T indexdb_rev_in, Indexdb_T indexdb_tr_in,
3023
3024 int index1part_tr_in, int index1part_in, int index1interval_in,
3025 double user_mismatches_refalt_float_in, double user_mismatches_ref_float_in, double user_mincoverage_float_in,
3026 double max_middle_insertions_float_in, double max_middle_deletions_float_in,
3027 int max_end_insertions_in, int max_end_deletions_in,
3028
3029 Univ_IIT_T chromosome_iit_in, int nchromosomes_in,
3030 Genome_T genomecomp_in, Genome_T genomebits_in, Genome_T genomebits_alt_in,
3031 Mode_T mode_in, int maxpaths_search_in, int maxpaths_report_in,
3032
3033 bool find_dna_chimeras_p_in, bool distances_observed_p_in, int subopt_levels_in,
3034
3035 bool splicingp_in, Chrpos_T shortsplicedist_in, Chrpos_T shortsplicedist_novelend_in,
3036 Chrpos_T min_intronlength_in, Chrpos_T expected_pairlength_in, Chrpos_T pairlength_deviation_in,
3037
3038 int distantsplicing_penalty_in,
3039 int min_distantsplicing_end_matches_in, int min_distantsplicing_identity_in) {
3040
3041 use_only_transcriptome_p = use_only_transcriptome_p_in;
3042
3043 transcript_iit = transcript_iit_in;
3044 transcriptome = transcriptome_in;
3045 transcriptomebits = transcriptomebits_in;
3046
3047 indexdb_fwd = indexdb_fwd_in;
3048 indexdb_rev = indexdb_rev_in;
3049 indexdb_tr = indexdb_tr_in;
3050
3051 index1part_tr = index1part_tr_in;
3052 index1part = index1part_in;
3053 index1interval = index1interval_in;
3054
3055 user_mismatches_refalt_float = user_mismatches_refalt_float_in;
3056 user_mismatches_ref_float = user_mismatches_ref_float_in;
3057 user_mincoverage_float = user_mincoverage_float_in;
3058
3059 max_middle_insertions_float = max_middle_insertions_float_in;
3060 max_middle_deletions_float = max_middle_deletions_float_in;
3061
3062 chromosome_iit = chromosome_iit_in;
3063 circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
3064 nchromosomes = nchromosomes_in;
3065
3066 genomecomp = genomecomp_in;
3067 genomebits = genomebits_in;
3068 genomebits_alt = genomebits_alt_in;
3069
3070 #ifdef HAVE_64_BIT
3071 leftreadshift = 64 - index1part - index1part;
3072 oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part);
3073 #else
3074 leftreadshift = 32 - index1part - index1part;
3075 oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part);
3076 #endif
3077
3078 poly_a = POLY_A & oligobase_mask;
3079 poly_t = POLY_T & oligobase_mask;
3080
3081 mode = mode_in;
3082 maxpaths_search = maxpaths_search_in;
3083 maxpaths_report = maxpaths_report_in;
3084
3085 find_dna_chimeras_p = find_dna_chimeras_p_in;
3086 distances_observed_p = distances_observed_p_in;
3087
3088 subopt_levels = subopt_levels_in;
3089
3090 max_end_insertions = max_end_insertions_in;
3091 max_end_deletions = max_end_deletions_in;
3092
3093 splicingp = splicingp_in;
3094 shortsplicedist = shortsplicedist_in;
3095 shortsplicedist_novelend = shortsplicedist_novelend_in;
3096
3097 min_intronlength = min_intronlength_in;
3098 expected_pairlength = expected_pairlength_in;
3099 pairlength_deviation = pairlength_deviation_in;
3100
3101 distantsplicing_penalty = distantsplicing_penalty_in;
3102 min_distantsplicing_end_matches = min_distantsplicing_end_matches_in;
3103 min_distantsplicing_identity = min_distantsplicing_identity_in;
3104
3105 if (genomebits_alt_in != NULL) {
3106 snpp = true;
3107 } else {
3108 snpp = false;
3109 }
3110
3111 return;
3112 }
3113