1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19
20 #define _GNU_SOURCE
21 #include <stdio.h>
22 #include <math.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <time.h>
29 #include <sys/stat.h>
30 #include <sys/types.h>
31
32 #ifndef __FreeBSD__
33 #include <sys/timeb.h>
34 #endif
35
36 #include "subread.h"
37 #include "input-files.h"
38 #include "gene-algorithms.h"
39 #include "sorted-hashtable.h"
40
non_func(const char * fmt,...)41 void non_func(const char * fmt, ...)
42 {
43 }
44
45
subread_lock_release(subread_lock_t * lock)46 void subread_lock_release(subread_lock_t * lock)
47 {
48 #ifdef MACOS
49 pthread_mutex_unlock(lock);
50 #else
51 pthread_spin_unlock(lock);
52 #endif
53 }
subread_lock_occupy(subread_lock_t * lock)54 void subread_lock_occupy(subread_lock_t * lock)
55 {
56
57 #ifdef MACOS
58 pthread_mutex_lock(lock);
59 #else
60 pthread_spin_lock(lock);
61 #endif
62 }
63
subread_destroy_lock(subread_lock_t * lock)64 void subread_destroy_lock(subread_lock_t * lock) {
65 #ifdef MACOS
66 pthread_mutex_destroy(lock);
67 #else
68 pthread_spin_destroy(lock);
69 #endif
70 }
71
subread_init_lock(subread_lock_t * lock)72 void subread_init_lock(subread_lock_t * lock)
73 {
74 #ifdef MACOS
75 pthread_mutex_init(lock, NULL);
76 #else
77 pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE);
78 #endif
79
80 }
81
82 //int EXON_MAX_CIGAR_LEN = 26;
83
84
85 //#define indel_debug
86
87 #ifdef indel_debug
88 #define ddprintf SUBREADprintf
89 #define ddfflush fflush
90 #else
91 #define ddprintf non_func
92 #define ddfflush(a)
93 #endif
94
95 #define get_base_quality_score(quality_chr, quality_scale) ((quality_scale)==QUALITY_SCALE_NONE?-0.01: ((quality_scale) == QUALITY_SCALE_LINEAR?( (quality_chr) - '@')*0.01-0.03: correct_rate_table[(quality_chr) - '@']))
96
97
98 double begin_ftime;
99
get_next_char(FILE * fp)100 unsigned char get_next_char(FILE * fp)
101 {
102 int find_br = 0;
103 while (!feof(fp))
104 {
105 char nch;
106 nch = fgetc(fp);
107 if (find_br)
108 {
109 if (nch == '\n')
110 find_br=0;
111 }
112 else if (nch == '>')
113 find_br = 1;
114 else if (nch > 32)
115 return nch;
116 }
117 return 0;
118 }
119
is_valid_subread(const char * read_str)120 int is_valid_subread(const char * read_str)
121 {
122 return 1;
123 int i;
124 for (i=0; i<16; i++)
125 if(read_str[i] == 'N' || read_str[i] == '.')
126 return 0;
127 return 1;
128 }
129
read_quality_score(char * qualityb,int rl,int format)130 float read_quality_score(char * qualityb, int rl , int format)
131 {
132 int i;
133 int qual = 0;
134 int testlen = 0;
135
136 char base;
137
138 if(format==FASTQ_PHRED64)
139 base = 'B';
140 else base = '#';
141
142 for(i=0; i<rl; i++)
143 {
144 int testv = qualityb[i] - base;
145 if(testv > 1)
146 {
147 qual += testv;
148 testlen ++;
149 }
150 }
151 return qual*1./testlen;
152 }
153
min_matched_bases(char * qualityb,int rl,int format,float match_score)154 int min_matched_bases(char * qualityb, int rl , int format, float match_score)
155 {
156 if(!(qualityb) || !qualityb[0]) return 0;
157 int i;
158
159 char base;
160 int ret = 0;
161
162 if(format==FASTQ_PHRED64)
163 base = 'B';
164 else base = '#';
165
166 for(i=0; i<rl; i++)
167 ret += (qualityb[i] - base)<=5;
168
169 return (rl - ret*3/4)*match_score;
170 }
bad_quality_base_number(char * qualityb,int rl,int format)171 int bad_quality_base_number(char * qualityb, int rl , int format)
172 {
173 if(!(qualityb) || !qualityb[0]) return 0;
174 int ret = 0, i;
175 if(FASTQ_PHRED64==format)
176 {
177 for(i=0; i<rl; i++)
178 if(qualityb[i] <='F') ret ++;
179 }
180 else
181 for(i=0; i<rl; i++)
182 if(qualityb[i] <='#'+4) ret ++;
183
184
185 return ret;
186 }
187
188
189 double correct_rate_table [] ={ -1.58147375341 , -0.99684304401 , -0.69552447133 , -0.50767587370 , -0.38013040807 , -0.28926818720 , -0.22255151597 , -0.17255657291 , -0.13455196029 , -0.10536051566 , -0.08276530267 , -0.06517417320 , -0.05141827416 , -0.04062484422 , -0.03213357402 , -0.02543972753 , -0.02015436476 , -0.01597586925 , -0.01266917021 , -0.01005033585 , -0.00797499828 , -0.00632956293 , -0.00502447389 , -0.00398901727 , -0.00316728823 , -0.00251504651 , -0.00199725550 , -0.00158615046 , -0.00125971852 , -0.00100050033 , -0.00079464388 , -0.00063115648 , -0.00050131287 , -0.00039818644 , -0.00031627778 , -0.00025122020 , -0.00019954614 , -0.00015850188 , -0.00012590047 , -0.00010000500 , -0.00007943598 , -0.00006309773 , -0.00005011998 , -0.00003981151 , -0.00003162328 , -0.00002511918 , -0.00001995282 , -0.00001584906 , -0.00001258933 , -0.00001000005 , -0.00000794331 , -0.00000630959 , -0.00000501188 , -0.00000398108 , -0.00000316228 , -0.00000251189 , -0.00000199526 , -0.00000158489 , -0.00000125893 , -0.00000100000 , -0.00000079433 , -0.00000063096 , -0.00000050119 , -0.00000039811 , -0.00000031623 , -0.00000025119 , -0.00000019953 , -0.00000015849 , -0.00000012589 , -0.00000010000 , -0.00000007943 , -0.00000006310 , -0.00000005012 , -0.00000003981 , -0.00000003162 , -0.00000002512 , -0.00000001995 , -0.00000001585 , -0.00000001259 , -0.00000001000 , 0., 0.,0.,0.,0.,0.,0.,0.};
190
191 int reported_version_error = 0;
get_subread_quality(const char * quality_str,const char * read_str,int quality_scale,int phred_version)192 gene_quality_score_t get_subread_quality(const char * quality_str, const char * read_str, int quality_scale, int phred_version)
193 {
194 int ret =0;
195 int i;
196 /*
197 for (i=0; i<16; i++)
198 if(read_str[i] == 'N' || read_str[i] == '.')
199 return -22.;
200 */
201
202 //for(i=0;i<16;i++) ret += get_base_quality_score(quality_str[i] , quality_scale);
203 if(FASTQ_PHRED64 == phred_version)
204 for(i=0;i<16;i++) ret += (1000000 - get_base_error_prob64i(quality_str[i]));
205 else
206 for(i=0;i<16;i++) ret += (1000000 - get_base_error_prob33i(quality_str[i]));
207
208 /*
209 if (ret <0 && !reported_version_error)
210 {
211 SUBREADprintf("\nWARNING: negative Phred quality score! Please verify the version of the Phred scores.\n");
212 reported_version_error=1;
213 }*/
214
215 return ret/16000000.;
216 }
217
218 /*int get_base_phred(char quality_chr)
219 {
220 return quality_chr - '@';
221 }
222
223 void print_running_log(double finished_rate, double read_per_second, double expected_seconds, unsigned long long int total_reads, int is_pair)
224 {
225 char outbuff[99]; int i;
226 snprintf(outbuff, 98,"completed=%0.2f%%; time used=%.1fs; rate=%.1fk reads/s; time left=%.1fs; total=%lluk %s", finished_rate*100, miltime()-begin_ftime, read_per_second/1000 ,expected_seconds, total_reads/1000, is_pair?"pairs":"reads");
227 fputs(outbuff, stdout);
228 for(i=strlen(outbuff); i<105; i++)
229 SUBREADprintf(" ");
230 SUBREADprintf("%c",CORE_SOFT_BR_CHAR);
231 }
232
233 */
234
print_window_scrolling_bar(char * hint,float percentage,int width,int * internal_counter)235 void print_window_scrolling_bar(char * hint, float percentage, int width, int * internal_counter)
236 {
237 /*
238 char fan = '-';
239 int bar_width = width - 7 - strlen(hint) , i;
240 int dash_width = (int)(bar_width * percentage+0.5);
241 dash_width = min(dash_width, bar_width - 1);
242 int space_width = bar_width - 1 - dash_width;
243
244 char out_buf[100];
245
246 switch ((*internal_counter) % 4)
247 {
248 case 0:
249 fan='-';
250 break;
251 case 1:
252 fan='\\';
253 break;
254 case 2:
255 fan='|';
256 break;
257 case 3:
258 fan='/';
259 break;
260 }
261
262 (*internal_counter) ++;
263
264 sprintf (out_buf," %c %s [", fan, hint);
265 for(i = 0; i< dash_width; i++)
266 strcat(out_buf,"=");
267 strcat(out_buf,">");
268 for(i = 0; i< space_width; i++)
269 strcat(out_buf," ");
270 strcat(out_buf,"]");
271
272 print_in_box(80,0,0,"%s%c",out_buf,CORE_SOFT_BR_CHAR);
273
274 */
275 print_in_box(81,0,0," [ %.1f%%%% finished ] %c",percentage*100, CORE_SOFT_BR_CHAR);
276 }
277
278
279
280
print_text_scrolling_bar(char * hint,float percentage,int width,int * internal_counter)281 void print_text_scrolling_bar(char * hint, float percentage, int width, int * internal_counter)
282 {
283 char fan = '-';
284 int bar_width = width - 7 - strlen(hint) , i;
285 int dash_width = (int)(bar_width * percentage+0.5);
286 dash_width = min(dash_width, bar_width - 1);
287 int space_width = bar_width - 1 - dash_width;
288
289 /*
290 for (i=0; i<width; i++);
291 putchar(' ');
292 SUBREADprintf("\r");
293 */
294
295 switch ((*internal_counter) % 4)
296 {
297 case 0:
298 fan='-';
299 break;
300 case 1:
301 fan='\\';
302 break;
303 case 2:
304 fan='|';
305 break;
306 case 3:
307 fan='/';
308 break;
309 }
310
311 (*internal_counter) ++;
312
313
314 char lbuf[100];
315 sprintf (lbuf," %c %s [", fan, hint);
316 for(i = 0; i< dash_width; i++)
317 strcat(lbuf, "=");
318 strcat(lbuf, ">");
319 for(i = 0; i< space_width; i++)
320 strcat(lbuf, " ");
321 strcat(lbuf, "]");
322 SUBREADprintf("%s%c",lbuf,CORE_SOFT_BR_CHAR);
323
324 }
325
326
add_repeated_numbers(int qid,gene_vote_t * vote,unsigned char * repeated_regions)327 void add_repeated_numbers(int qid, gene_vote_t * vote, unsigned char * repeated_regions)
328 {
329
330 int i,j;
331
332 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
333 for(j=0; j< vote->items[i]; j++)
334 {
335 int v_2 ;
336 if(vote-> votes[i][j] >=2)
337 {
338 v_2 = vote-> votes[i][j] - 2;
339 if(repeated_regions[v_2 + qid * 16] < 255) repeated_regions[v_2 + qid * 16]++;
340 }
341
342 }
343 }
344
remove_repeated_reads(gehash_t * table,gehash_t * huge_table,int index_threshold)345 int remove_repeated_reads(gehash_t * table, gehash_t * huge_table, int index_threshold)
346 {
347 int i;
348 int vals[200000];
349 int val_len[200000];
350 int scroll_count = 0;
351 unsigned int all_removed = 0;
352
353
354 for (i=0; i <table->buckets_number ; i++)
355 {
356 struct gehash_bucket * cb = table->buckets + i;
357 int j;
358 int val_c = 0;
359
360 if(i % 300 == 0)
361 print_text_scrolling_bar ("Removing non-informative subreads", 1.0*i/table->buckets_number, 80, &scroll_count);
362
363 for(j=0; j<cb->current_items; j++)
364 {
365 int k, found = 0;
366 for(k=0;k<val_c;k++)
367 {
368 if (vals[k]==cb->item_keys[j])
369 {
370 val_len[k]++;
371 found = 1;
372 break;
373 }
374 }
375
376 if(!found)
377 {
378 if(val_c>199999)
379 {
380 SUBREADprintf("\nThere are too many items in a bucket; you may reduce the threshold of non-informative subreads to eliminate this problem.\n");
381 #ifdef MAKE_STANDALONE
382 exit(-1);
383 #else
384 return -1;
385 #endif
386 }
387 else
388 {
389 vals[val_c] = cb->item_keys[j];
390 val_len[val_c] = 1;
391 val_c++;
392 }
393 }
394 }
395
396 // SUBREADprintf ("F3\n");
397
398 for(j=0; j<val_c; j++)
399 {
400
401 if (gehash_exist(huge_table, vals[j]))
402 gehash_remove(table, vals[j]);
403 else if(val_len[j]> index_threshold)
404 {
405 gehash_remove(table, vals[j]);
406 gehash_insert(huge_table, vals[j], 1, NULL);
407 all_removed += val_len[j];
408 }
409 }
410 }
411
412 if(IS_DEBUG)
413 SUBREADprintf ("@LOG There are %u subreads removed from the index.\n", all_removed);
414
415 return all_removed;
416 }
417
418
linear_gene_position(const gene_offset_t * offsets,char * chro_name,unsigned int chro_pos)419 unsigned int linear_gene_position(const gene_offset_t* offsets , char *chro_name, unsigned int chro_pos)
420 {
421 unsigned int ret = 0 ;
422 int n;
423
424
425 n = HashTableGet(offsets -> read_name_to_index, chro_name)-NULL;
426 //printf("09NOV: GET '%s' = %d\n", chro_name, n);
427 if(n<1) return 0xffffffff;
428 if(n>1)
429 ret = offsets->read_offsets[n-2];
430 ret += offsets -> padding ;
431
432 return ret + chro_pos;
433 }
434
get_gene_linear(int chrono,int offset,const unsigned int offsets[])435 unsigned int get_gene_linear(int chrono, int offset, const unsigned int offsets [])
436 {
437 if (chrono>1)return offsets[chrono-1]+offset;
438 return offset;
439 }
440
locate_gene_position_max(unsigned int linear,const gene_offset_t * offsets,char ** chro_name,int * pos,int * head_cut_length,int * tail_cut_length,int rl)441 int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int * head_cut_length, int * tail_cut_length, int rl)
442 {
443 int n = 0;
444
445 (*chro_name) = NULL;
446 (*pos) = -1;
447 int low_idx=0, high_idx= offsets->total_offsets;
448 while(1){
449 if(high_idx <= low_idx+1) {
450 n = max(low_idx-2,0);
451 break;
452 }
453 int mid_idx = (low_idx+high_idx)/2;
454 unsigned int mid_val = offsets->read_offsets[mid_idx];
455 if(mid_val > linear) high_idx = mid_idx;
456 else low_idx = mid_idx+1;
457 }
458
459 for (; n<offsets->total_offsets; n++)
460 {
461 if (offsets->read_offsets[n] > linear)
462 {
463
464 //#warning "======== COMMENT THIS LINE !! ========"
465 //SUBREADprintf("max=%u <= lim=%u : ACCEPTED.\n", rl + linear , offsets->read_offsets[n] + 16);
466
467 // the end of the read should not excess the end of the chromosome
468
469 if (n==0)
470 *pos = linear;
471 else
472 *pos = linear - offsets->read_offsets[n-1];
473
474
475 if(tail_cut_length == NULL){
476 if(rl + linear > offsets->read_offsets[n] + 15 - offsets -> padding){
477 return 1;
478 }
479 } else {
480 unsigned int posn1 = 0;
481 if(n>0) posn1 = offsets->read_offsets[n-1];
482 long long int tct = ( linear + rl - posn1 - offsets -> padding );
483 if(tct < rl)tct = rl;
484 long long int chro_leng = (offsets->read_offsets[n] - posn1 - 2*offsets -> padding + 16);
485
486 //SUBREADprintf("CHRO_LEN : %lld, READ_TAIL %lld , RL=%d\n", chro_leng, tct, rl);
487 tct -= chro_leng;
488 if( tct >= rl ){
489 return 1;
490 }
491 if( tct <0 )tct=0;
492 (*tail_cut_length) = tct;
493 }
494
495 if( (*pos) < offsets -> padding ) {
496 if(head_cut_length == NULL || (*pos) + rl <= offsets -> padding){
497 return 1;
498 }else{
499 (*head_cut_length) = offsets -> padding - (*pos);
500 (*pos) = offsets -> padding;
501 }
502 }
503
504 (*pos) -= offsets -> padding;
505 *chro_name = (char *)offsets->read_names+n*MAX_CHROMOSOME_NAME_LEN;
506
507 return 0;
508 }
509 }
510 return -1;
511 }
512
513
locate_gene_position(unsigned int linear,const gene_offset_t * offsets,char ** chro_name,int * pos)514 int locate_gene_position(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos)
515 {
516 return locate_gene_position_max(linear, offsets, chro_name, pos, NULL, NULL, 0);
517 }
518
519 #define _index_vote(key) (((unsigned int)key)%GENE_VOTE_TABLE_SIZE)
520
521 #ifdef USE_SLOW_HASHTABLE_INDEX
522 int vv=0;
add_gene_vote(gene_vote_t * vote,int key,int add_new)523 void add_gene_vote(gene_vote_t* vote, int key , int add_new) {
524 add_gene_vote_weighted(vote, key, add_new, 1);
525 }
526
add_gene_vote_weighted(gene_vote_t * vote,int key,int add_new,int w)527 void add_gene_vote_weighted(gene_vote_t* vote, int key , int add_new, int w) {
528 int offset = _index_vote(key);
529 int datalen = vote -> items[offset];
530 unsigned int * dat = vote -> pos[offset];
531 int i;
532
533 for (i=0;i<datalen;i++)
534 {
535 if (dat[i] == key)
536 {
537
538 int test_max = (vote->votes[offset][i]);
539 test_max += w;
540 vote->votes[offset][i] = test_max;
541
542 if(test_max > vote->max_vote){
543 vote->max_vote = test_max;
544 vote->max_position = key;
545 }
546 return;
547 }
548 }
549
550 if(!add_new || datalen >=GENE_VOTE_SPACE)
551 return;
552
553 // w += 16;
554
555 vote -> items[offset] ++;
556 dat[i] = key;
557 vote->votes[offset][i]=w;
558 }
559
560 #endif
561
max_gene_vote(gene_vote_t * vote,int * position_result,int query)562 int max_gene_vote(gene_vote_t* vote, int * position_result, int query)
563 {
564 int n, i, max_index=0;
565 int max_votes = -1;
566 for (n=0; n<GENE_VOTE_TABLE_SIZE; n++)
567 {
568 int itms = vote->items[n];
569 gene_vote_number_t * vots = vote->votes[n];
570 for (i=0; i<itms; i++)
571 {
572
573 if (vots[i] > max_votes)
574 {
575 max_index = n<<16|i;
576 max_votes = vots[i];
577 }
578 }
579 }
580
581 if(max_votes == -1)
582 {
583 *position_result = -1;
584 return 0;
585 }
586 else
587 {
588 *position_result = vote->pos[max_index>>16][max_index&0xffff];
589 return max_votes;
590 }
591 }
592
593
clear_allvote(gene_allvote_t * allvote)594 void clear_allvote(gene_allvote_t* allvote)
595 {
596 memset(allvote -> max_votes,0, allvote -> max_len* sizeof(*allvote -> max_votes));
597 memset(allvote -> masks,0, allvote -> max_len* sizeof(*allvote -> masks));
598 memset(allvote -> span_coverage ,0, allvote -> max_len* sizeof(char));
599 }
600
destory_allvote(gene_allvote_t * allvote)601 void destory_allvote(gene_allvote_t* allvote)
602 {
603 free(allvote -> max_positions);
604 free(allvote -> max_votes);
605 free(allvote -> max_quality);
606 free(allvote -> max_final_quality);
607 free(allvote -> masks);
608 #ifdef REPORT_ALL_THE_BEST
609 free(allvote -> best_records);
610 #endif
611 free(allvote -> is_counterpart);
612 free(allvote -> span_coverage);
613 if(allvote -> max_indel_recorder)
614 free(allvote -> max_indel_recorder);
615 }
616
init_allvote(gene_allvote_t * allvote,int expected_len,int allowed_indels)617 int init_allvote(gene_allvote_t* allvote, int expected_len, int allowed_indels)
618 {
619 int is_OK = 0;
620 allvote -> max_len = expected_len;
621 allvote -> max_positions = (unsigned int *) malloc(sizeof(int)*expected_len);
622 allvote -> max_votes = (gene_vote_number_t *) calloc(sizeof(gene_vote_number_t), expected_len);
623 allvote -> max_quality = (gene_quality_score_t *) calloc(sizeof(gene_quality_score_t), expected_len);
624 allvote -> max_final_quality = (gene_quality_score_t *) calloc(sizeof(gene_quality_score_t), expected_len);
625 allvote -> masks = (short *) calloc(sizeof(short), expected_len);
626 #ifdef REPORT_ALL_THE_BEST
627 allvote -> best_records = (gene_best_record_t *) malloc(sizeof(gene_best_record_t)* expected_len);
628 #endif
629 allvote -> is_counterpart = (unsigned char *) malloc(expected_len);
630
631 allvote -> max_indel_tolerance = allowed_indels;
632 allvote -> indel_recorder_length = max(3*(allowed_indels+1)+1, 28);
633 allvote -> span_coverage = (char *)calloc(sizeof(char), expected_len);
634 allvote -> repeated_regions = (unsigned char *)calloc(sizeof(unsigned char), 16 * expected_len);
635
636 if((allvote -> max_quality && allvote -> max_positions && allvote -> max_votes && allvote -> max_final_quality && allvote -> masks && allvote -> is_counterpart && allvote -> span_coverage))
637 is_OK = 1;
638
639 if(allowed_indels && is_OK)
640 {
641 allvote -> max_indel_recorder = (char *)malloc( allvote -> indel_recorder_length *expected_len);
642 is_OK = (allvote -> max_indel_recorder!=NULL);
643 }
644 else allvote -> max_indel_recorder = NULL;
645
646 if(is_OK)
647 return 0;
648 else
649 {
650 SUBREADputs(MESSAGE_OUT_OF_MEMORY);
651 return 1;
652 }
653 }
654
655
656 // rl_adjust >0: "D" in read; rl_adjust<0: "I" in read
compress_cigar(char * cigar,int total_length,char * read,int * pos_offset,int * rl_adjust)657 void compress_cigar(char *cigar, int total_length, char * read, int * pos_offset, int *rl_adjust)
658 {
659 char tmp[200];
660 char cigar_piece [10];
661
662 int cigar_len = strlen(cigar);
663 cigar_len = min(cigar_len, EXON_MAX_CIGAR_LEN);
664 cigar[cigar_len]=0;
665
666 int i;
667 srInt_64 tmpv = 0;
668
669 char last_operation = 'X';
670 srInt_64 last_tmpv = 0;
671 int is_first_M = 1;
672 int delta_i = 0;
673 int delta_d = 0;
674 int delta_rl = 0;
675 tmp[0]=0;
676 int cigar_length=0;
677
678 for(i=0; i < cigar_len; i++)
679 {
680 char cc = cigar[i];
681 if(isdigit(cc))
682 {
683 tmpv=tmpv*10+(cc-'0');
684 }
685 else if(cc=='-')
686 {
687 last_tmpv = 0;
688 cigar_length = 0;
689 break;
690 }
691 else
692 {
693 if(is_first_M && pos_offset && cc=='S')
694 *pos_offset = tmpv;
695 else if(cc=='M') is_first_M = 0;
696
697 if((cc!=last_operation) && (last_operation!='X'))
698 {
699 if(last_operation == 'M' || last_operation == 'S' || last_operation == 'N' || last_operation == 'B' || last_operation == 'J' || last_operation == 'j' || last_operation == 'b')
700 {
701 if(delta_i)
702 {
703 sprintf(cigar_piece,"%dI", delta_i);
704 strcat(tmp, cigar_piece);
705 }
706 delta_i = 0;
707
708 if(delta_d)
709 {
710 sprintf(cigar_piece,"%dD", delta_d);
711 strcat(tmp, cigar_piece);
712 }
713 delta_d = 0;
714
715 #ifdef __MINGW32__
716 sprintf(cigar_piece,"%I64d%c", last_tmpv, last_operation);
717 #else
718 sprintf(cigar_piece,"%lld%c", last_tmpv, last_operation);
719 #endif
720 strcat(tmp, cigar_piece);
721 }
722
723 if(last_operation == 'M' || last_operation == 'S' || last_operation == 'I')
724 cigar_length += last_tmpv;
725
726 last_tmpv = 0;
727 }
728 if(cc=='I' )
729 {
730 delta_i += tmpv;
731 delta_rl -= tmpv;
732 }
733 if(cc=='D' )
734 {
735 delta_d += tmpv;
736 delta_rl += tmpv;
737 }
738
739
740 last_tmpv += tmpv;
741
742 tmpv = 0;
743 last_operation = cc;
744
745 }
746
747 }
748
749 if(last_tmpv)
750 {
751 if(delta_i)
752 {
753 sprintf(cigar_piece,"%dI", delta_i);
754 strcat(tmp, cigar_piece);
755 }
756 if(delta_d)
757 {
758 sprintf(cigar_piece,"%dD", delta_d);
759 strcat(tmp, cigar_piece);
760 }
761
762
763
764 if(last_operation =='M' || last_operation =='S')
765 {
766 #ifdef __MINGW32__
767 sprintf(cigar_piece,"%I64d%c", tmpv+last_tmpv, last_operation);
768 #else
769 sprintf(cigar_piece,"%lld%c", tmpv+last_tmpv, last_operation);
770 #endif
771 strcat(tmp, cigar_piece);
772 }
773
774 if(last_operation == 'M' || last_operation == 'S' || last_operation == 'I')
775 cigar_length += tmpv+last_tmpv;
776 }
777 if(cigar_length == total_length)
778 {
779 if(rl_adjust)(*rl_adjust)=delta_rl;
780 strcpy(cigar, tmp);
781 }
782 else
783 {
784 sprintf(cigar, "%dM", total_length);
785 }
786 }
787
788 // adjust_len <0: "I" in read; adjust_len>0: "D" in read
show_cigar(char * info,int len,int is_reversed_map,char * buf,int indel_tolerance,int total_subreads,char * read,int * pos_offset,int * adjust_len)789 void show_cigar(char * info, int len, int is_reversed_map, char * buf, int indel_tolerance, int total_subreads, char *read, int * pos_offset, int * adjust_len)
790 {
791 int i;
792 int last_offset = 0, cursor = 0;
793
794 if(info[0]==-1){
795 sprintf(buf, "%dM", len);
796 return;
797 }
798 else if(info[0]==-2){
799 if (strchr( info+1, '-'))
800 sprintf(buf, "%dM", len);
801 else
802 {
803 strncpy(buf, info+1, 98);
804 compress_cigar(buf, len, read, pos_offset, adjust_len);
805 }
806 return;
807 }
808 else if(info[0]==-3){
809 info ++;
810 }
811
812 for(i=0; i<indel_tolerance*3; i+=3)
813 {
814 if (!info[i])break;
815 int dist = info[i+2];
816 //int subread_start = info[i]-1;
817 int subread_end = info[i+1]-1;
818
819 int base_end = (i < indel_tolerance*3-3 && info[i+3])?find_subread_end(len, total_subreads, subread_end):len;
820
821 // SUBREADprintf("BE=%d ; II+3=%d ; len=%d\n", base_end, info[i+3], len);
822
823 int offset = last_offset-dist;
824 if (base_end - cursor - (offset>0?offset:0) <0)
825 {
826 buf[0]=0;
827
828 cursor = 0;
829 break;
830 }
831 if (i >0)
832 {
833 sprintf(buf+strlen(buf), "%d%c%dM", abs(offset), offset>0?'I':'D', base_end - cursor - (offset>0?offset:0));
834 if(adjust_len)
835 (*adjust_len) -= offset;
836 }
837 else
838 sprintf(buf+strlen(buf), "%dM", base_end);
839 last_offset = dist;
840 cursor = base_end;
841 }
842 compress_cigar(buf, len, read, pos_offset, adjust_len);
843 }
844
add_allvote_q(gene_allvote_t * allvote,int qid,int pos,gene_vote_number_t votes,gene_quality_score_t quality,int is_counterpart,short mask,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int space_type,int report_junction,int is_head_high_quality,char * qual_txt,int phred_version,char span_coverage,short ** dynamic_programming_short,char ** dynamic_programming_char)845 void add_allvote_q(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, short mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type, int report_junction, int is_head_high_quality, char * qual_txt, int phred_version, char span_coverage, short **dynamic_programming_short , char ** dynamic_programming_char)
846 {
847
848 // SUBREADprintf("Ospan=%d Nspan=%d\n", allvote -> span_coverage[qid], span_coverage);
849
850 }
851
852
853 float CORE_RECOVER_MATCHING_RATE = 0.9;
854 float CORE_INDEL_MATCHING_RATE_TAIL = 0.8;
855 float CORE_INDEL_MATCHING_RATE_HEAD = 0.8;
856
find_and_explain_indel(gene_allvote_t * allvote,int qid,int pos,gene_vote_number_t votes,gene_quality_score_t quality,int is_counterpart,char mask,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int space_type,int report_junction,int is_head_high_quality,char * qual_txt,int phred_version,short ** dynamic_programming_short,char ** dynamic_programming_char)857 void find_and_explain_indel(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, char mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type, int report_junction, int is_head_high_quality, char * qual_txt, int phred_version , short ** dynamic_programming_short, char **dynamic_programming_char)
858 {
859 //unsigned int pos0 = pos;
860 if(allvote -> max_indel_recorder)
861 {
862 //PNT111
863 short head_indel_pos=-1 , tail_indel_pos=-1;
864 int head_indel_movement=0, tail_indel_movement=0;
865 if (array_index)
866 {
867 int k;
868 if (max_indel_recorder[0])
869 {
870 int cover_start = find_subread_end(read_len, total_subreads, max_indel_recorder[0]-1)- 15;
871
872 for (k=0; max_indel_recorder[k]; k+=3);
873 int cover_end = find_subread_end(read_len, total_subreads, max_indel_recorder[k-2]-1) + max(0,-max_indel_recorder[k-1]);
874
875 int head_qual = is_head_high_quality?0x7fffffff:0, tail_qual = is_head_high_quality?0:0x7fffffff;
876
877
878 if(qual_txt && qual_txt[0])
879 {
880 int j;
881 head_qual = 0;
882 tail_qual = 0;
883 for (j = 0; j<cover_start; j++)
884 {
885 if(FASTQ_PHRED64 == phred_version)
886 {
887 head_qual += (1000000-get_base_error_prob64i(qual_txt[j]));
888 }
889 else
890 {
891 head_qual += (1000000-get_base_error_prob33i(qual_txt[j]));
892 }
893 }
894 for (j = 0; j<read_len - cover_end; j++)
895 {
896 if(FASTQ_PHRED64 == phred_version)
897 {
898 tail_qual += (1000000-get_base_error_prob64i(qual_txt[read_len - j -1]));
899 }
900 else
901 {
902 tail_qual += (1000000-get_base_error_prob33i(qual_txt[read_len - j -1]));
903 }
904 }
905 }
906
907 int head_must_correct = 4;
908 if(cover_start > 0)
909 {
910 if (head_qual / cover_start < 850000)
911 {
912 CORE_INDEL_MATCHING_RATE_HEAD = 0.75;
913 head_must_correct =2;
914 }
915 else if (head_qual / cover_start < 950000 )
916 {
917 CORE_INDEL_MATCHING_RATE_HEAD = 0.85;
918 head_must_correct =3;
919 }
920 else CORE_INDEL_MATCHING_RATE_HEAD = 0.92;
921 }else CORE_INDEL_MATCHING_RATE_HEAD = 9999;
922
923
924 int tail_must_correct = 4;
925 if(read_len-cover_end > 0)
926 {
927 if (tail_qual / (read_len-cover_end) < 850000)
928 {
929 CORE_INDEL_MATCHING_RATE_TAIL = 0.75;
930 tail_must_correct =2;
931 }
932 else if (tail_qual / (read_len-cover_end) < 950000)
933 {
934 CORE_INDEL_MATCHING_RATE_TAIL = 0.85;
935 tail_must_correct =3;
936 }
937 else CORE_INDEL_MATCHING_RATE_TAIL = 0.92;
938 }else CORE_INDEL_MATCHING_RATE_TAIL = 9999;
939
940
941
942
943 int is_full_covered = 0;
944 is_full_covered = extend_covered_region(array_index, pos, read_txt, read_len, cover_start, cover_end, 4, head_must_correct, tail_must_correct, max_indel, space_type, max_indel_recorder[k-1], &head_indel_pos, &head_indel_movement, &tail_indel_pos, &tail_indel_movement, is_head_high_quality, qual_txt, phred_version, CORE_INDEL_MATCHING_RATE_HEAD, CORE_INDEL_MATCHING_RATE_TAIL);
945 if(head_indel_movement)
946 {
947 pos += head_indel_movement;
948 allvote -> max_positions[qid] = pos;
949 }
950 if(is_full_covered == 3)
951 {
952 allvote -> masks[qid] &= ~IS_RECOVERED_JUNCTION_READ;
953 }
954 else allvote -> masks[qid] |= IS_RECOVERED_JUNCTION_READ;
955 }
956 }
957
958 *(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xff ;
959
960
961 if ((max_indel_recorder[3] || head_indel_pos>=0 || tail_indel_pos>0)){
962 int head_pos = 0;
963 if((head_indel_pos>=0 && report_junction) || head_indel_movement)head_pos = head_indel_pos ;//- max(0,head_indel_movement);
964 int tail_pos = read_len;
965 if((tail_indel_pos>0 && report_junction) || tail_indel_movement)tail_pos = tail_indel_pos ;//- min(0, tail_indel_movement);
966 explain_indel_in_middle(allvote, qid , pos, max_indel_recorder, array_index, read_txt, read_len, max_indel, total_subreads, head_pos, tail_pos , head_indel_movement, tail_indel_movement, report_junction, dynamic_programming_short, dynamic_programming_char);
967 //if(*(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) != 0xfd ) allvote -> max_positions[qid] = pos0;
968 }
969
970 }
971
972 }
973
explain_indel_in_middle(gene_allvote_t * allvote,int qid,int pos,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int head_start_point,int tail_end_point,int head_indel_movement,int tail_indel_movement,int report_junction,short ** dynamic_programming_short,char ** dynamic_programming_char)974 void explain_indel_in_middle(gene_allvote_t* allvote, int qid , int pos, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int head_start_point, int tail_end_point, int head_indel_movement, int tail_indel_movement, int report_junction, short ** dynamic_programming_short , char ** dynamic_programming_char)
975 {
976
977 char indel_operations[1500];
978 int i,xx;
979 char tmp_cigar [EXON_MAX_CIGAR_LEN+1];
980
981 int current_pos = head_start_point;
982 int explain_cursor = head_start_point;
983 char last_operation = 0;
984 int del_number = 0;
985 int dynamic_delta = 0;
986
987 tmp_cigar[0]=0;
988
989 if(current_pos >0)
990 {
991 if (head_indel_movement != 0)
992 sprintf(tmp_cigar, "%dM%d%c", head_start_point - max(head_indel_movement, 0), abs(head_indel_movement), head_indel_movement>0?'I':'D');
993 else if (report_junction)
994 sprintf(tmp_cigar, "%dS", head_start_point);
995 }
996
997 //ddprintf ("R=%s; REC[8]=%d [9]=%d\n", read_txt, max_indel_recorder[8], max_indel_recorder[9]);
998
999 for (i=3; max_indel_recorder[i]; i+=3)
1000 {
1001 if (i >= max_indel*3) break;
1002
1003 int last_dist = max_indel_recorder[i-1];
1004 int black_subread_start = max_indel_recorder[i-2]-1;
1005 int black_subread_end = max_indel_recorder[i]-1;
1006 if (black_subread_end < black_subread_start+1) black_subread_end = black_subread_start+1;
1007 while(max_indel_recorder[i+3])
1008 {
1009 if (max_indel_recorder[i+3]- black_subread_end>2 || i >= max_indel*3-3) break;
1010 black_subread_end = max_indel_recorder[i+3]-1;
1011 i+=3;
1012 }
1013 int black_base_start = find_subread_end(read_len, total_subreads,black_subread_start )- 3 + max(0,-last_dist);
1014 int black_base_end = find_subread_end(read_len, total_subreads, black_subread_end)-15 + 3 + max(0,-max_indel_recorder[i+2]);
1015
1016 int blackref_base_start = black_base_start - max(0,-last_dist) + max(0,last_dist);
1017 // SUBREADprintf("baseref_offset=%d\n", max(0,last_dist));
1018
1019 int gap_end_read = black_base_end + max(0,max_indel_recorder[i+2]);
1020
1021 blackref_base_start = max(0,blackref_base_start);
1022 black_base_start = max(0,black_base_start);
1023 black_base_end = min(read_len, black_base_end);
1024
1025 int vpos = strlen(tmp_cigar);
1026 if (vpos > EXON_MAX_CIGAR_LEN -6) break;
1027
1028 int exp_indel = last_dist - max_indel_recorder[i+2];
1029
1030
1031 int moves = 0;
1032 if(black_base_end > black_base_start)
1033 moves = dynamic_align (read_txt + black_base_start, black_base_end-black_base_start , array_index, pos + blackref_base_start, max_indel, indel_operations, exp_indel, -10, black_base_end - black_base_start+5, dynamic_programming_short, dynamic_programming_char);
1034
1035 last_operation = 0;
1036 if(moves)// < (black_base_end-black_base_start) + 6 + 1.5 * exp_indel)
1037 {
1038 int pre_read_len = 0;
1039 current_pos = black_base_start ;
1040 xx=0;
1041 while (pre_read_len < 0 && xx < moves)
1042 {
1043 if(indel_operations[xx]!=2)pre_read_len++;
1044 xx++;
1045 }
1046 for (; xx<moves; xx++)
1047 {
1048 int current_operation = indel_operations[xx];
1049 if(current_operation == 3) current_operation = 0;
1050
1051 if(current_operation != last_operation && (current_pos < gap_end_read-1+1 || current_operation == 0))// && current_pos!=explain_cursor)
1052 {
1053 int vpos = strlen(tmp_cigar);
1054 if (vpos>EXON_MAX_CIGAR_LEN-6) break;
1055 sprintf(tmp_cigar + vpos, "%d%c", last_operation==1?del_number:(current_pos - explain_cursor), last_operation==0?'M':(last_operation==1?'D':'I'));
1056 explain_cursor = current_pos ;
1057 del_number = 0;
1058 }
1059 if(current_pos >= gap_end_read-1 || xx == moves - 1){
1060 if(current_operation != 0 && current_pos!=explain_cursor)
1061 {
1062 int vpos = strlen(tmp_cigar);
1063 sprintf(tmp_cigar + vpos, "%d%c", current_operation==1?del_number:(current_pos - explain_cursor), current_operation==1?'D':'I');
1064 explain_cursor = current_pos;
1065 del_number=0;
1066 }
1067 break;
1068 }
1069
1070 if (indel_operations[xx]==0) current_pos ++;
1071 else if (indel_operations[xx]==1) {del_number++; dynamic_delta++;}//"D"
1072 else if (indel_operations[xx]==2) {current_pos ++; dynamic_delta--;} //"I"
1073 else if (indel_operations[xx]==3) current_pos ++;
1074
1075
1076 last_operation = current_operation;
1077 }
1078 }
1079 else
1080 {
1081 int movement = last_dist - max_indel_recorder[i+2];
1082 int vpos = strlen(tmp_cigar);
1083 if (vpos>EXON_MAX_CIGAR_LEN-6) break;
1084
1085 current_pos = find_subread_end(read_len, total_subreads,max_indel_recorder[i]-1)-15 + (max_indel_recorder[i+2] >0?max_indel_recorder[i+2]:-max_indel_recorder[i+2]);
1086 current_pos -= (black_base_end - black_base_start)/2 -3;
1087
1088 if(movement)
1089 sprintf(tmp_cigar + vpos, "%dM%d%c", current_pos - explain_cursor , abs(movement), movement<0?'D':'I' );
1090 else if (current_pos - explain_cursor>0)
1091 sprintf(tmp_cigar + vpos, "%dM", current_pos - explain_cursor );
1092
1093 explain_cursor = current_pos + (movement >0?movement:0);
1094 current_pos = find_subread_end(read_len, total_subreads,max_indel_recorder[i+1]-1);
1095 last_operation = 0;
1096 del_number = 0;
1097 moves = 0;
1098 }
1099 last_operation = 0;
1100 current_pos = black_base_end;
1101
1102
1103 }
1104
1105 int vpos = strlen(tmp_cigar);
1106 if (vpos > EXON_MAX_CIGAR_LEN-9 )
1107 {
1108 *(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xfd;
1109 memcpy(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length+1, max_indel_recorder, 3 * max_indel);
1110 }
1111 else
1112 {
1113 if (explain_cursor<read_len)
1114 {
1115 if(tail_end_point > explain_cursor)
1116 sprintf(tmp_cigar+vpos,"%dM", tail_end_point - explain_cursor);
1117 vpos = strlen(tmp_cigar);
1118 if(tail_end_point< read_len)
1119 {
1120 if (tail_indel_movement!=0)
1121 {
1122 int tail_len_m = read_len -(tail_end_point - min(tail_indel_movement, 0));
1123 if(tail_len_m)
1124 sprintf(tmp_cigar+vpos, "%d%c%dM", abs(tail_indel_movement), tail_indel_movement<0?'I':'D', tail_len_m);
1125 else
1126 sprintf(tmp_cigar+vpos, "%d%c", abs(tail_indel_movement), tail_indel_movement<0?'I':'D');
1127 }
1128 else if (report_junction)
1129 sprintf(tmp_cigar+vpos, "%dS",read_len - tail_end_point );
1130 }
1131 }
1132 // if (head_start_point >0 || tail_end_point < read_len)
1133 // strcat(tmp_cigar,"X");
1134
1135 *(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xfe ;
1136 strncpy(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length+1, tmp_cigar, allvote -> indel_recorder_length - 2);
1137 }
1138 }
1139
evaluate_piece(char * piece_str,int chron,int offset,int is_counterpart,int start_pos,int end_pos)1140 int evaluate_piece(char * piece_str, int chron, int offset, int is_counterpart, int start_pos, int end_pos)
1141 {
1142 char fname[MAX_FILE_NAME_LENGTH];
1143 int inner_pos = 0, i;
1144 FILE * fp;
1145 char next_char=0;
1146 int ret = 0;
1147
1148 if (chron == 0)
1149 sprintf(fname, "/opt/Work2001/Gene-Search/src/GENE-LIB/%02da.fa", chron);
1150 else
1151 sprintf(fname, "/opt/Work2001/Gene-Search/src/GENE-LIB/%02d.fa", chron);
1152
1153 inner_pos = offset + offset / 70;
1154
1155 fp = f_subr_open(fname,"r");
1156
1157 while(next_char!='\n')
1158 next_char=fgetc(fp);
1159 fseek(fp, inner_pos, SEEK_CUR);
1160
1161 for(i=0 ; i<end_pos; i++)
1162 {
1163 next_char = get_next_char(fp);
1164 if(i < start_pos)
1165 continue;
1166 if(next_char == 'N')
1167 ret ++;
1168 else{
1169 if(is_counterpart)
1170 {
1171 if (piece_str[99-i] == 'A' && next_char == 'T')
1172 ret ++;
1173 else if (piece_str[99-i] == 'G' && next_char == 'C')
1174 ret ++;
1175 else if (piece_str[99-i] == 'T' && next_char == 'A')
1176 ret ++;
1177 else if (piece_str[99-i] == 'C' && next_char == 'G')
1178 ret ++;
1179 }
1180 else if(piece_str[i] == next_char)
1181 ret ++;
1182 }
1183 }
1184
1185 fclose(fp);
1186
1187 return ret;
1188
1189 }
1190
match_chro_indel(char * read,gene_value_index_t * index,unsigned int pos,int test_len,int is_negative_strand,int space_type,int indel_size,gene_vote_number_t * indel_recorder,int total_subreads)1191 int match_chro_indel(char * read, gene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand, int space_type, int indel_size, gene_vote_number_t * indel_recorder, int total_subreads)
1192 {
1193
1194 int ret = 0;
1195 int tali = 0;
1196 int last_section_end = 0;
1197
1198 for(tali = 0; indel_recorder[tali*3] && tali<MAX_INDEL_TOLERANCE; tali++)
1199 {
1200 int section_last_subread = indel_recorder[tali*3+1] - 1;
1201 int section_offset = indel_recorder[tali*3+2];
1202 int this_section_end = find_subread_end(test_len , total_subreads, section_last_subread);
1203 if(!indel_recorder[tali*3+3]) this_section_end = test_len;
1204
1205 this_section_end = min(test_len, this_section_end);
1206 this_section_end = max(this_section_end, last_section_end);
1207
1208 ret += match_chro(read + last_section_end - min(0, section_offset), index, pos + last_section_end + max(0, section_offset), this_section_end - last_section_end + min(0, section_offset), is_negative_strand * 0, space_type);
1209
1210 last_section_end = this_section_end;
1211 }
1212
1213 return ret;
1214
1215 }
1216
1217
match_chro_indel_old(char * read,gene_value_index_t * index,unsigned int pos,int test_len,int is_negative_strand,int space_type,int indel_size)1218 int match_chro_indel_old(char * read, gene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand, int space_type, int indel_size)
1219 {
1220
1221 int i;
1222 int ret = 0;
1223 for (i = -indel_size; i<=indel_size ; i++)
1224 {
1225 if (pos +i + test_len < index -> start_base_offset + index -> length && pos > -i)
1226 ret += match_chro(read, index, pos + i, test_len, is_negative_strand, space_type);
1227 }
1228 return ret;
1229
1230 }
1231
1232
1233
mark_votes_array_index(char * read_str,int read_len,gene_vote_t * dest,gene_vote_t * src,gene_value_index_t * my_array_index,int color_space,int indel_tolerance,int min_minor,const char quality_str[],int quality_scale,int is_negative_strand)1234 void mark_votes_array_index(char * read_str, int read_len, gene_vote_t * dest, gene_vote_t * src, gene_value_index_t * my_array_index, int color_space, int indel_tolerance, int min_minor, const char quality_str [], int quality_scale, int is_negative_strand)
1235 {
1236 int i, j;
1237
1238 dest -> max_vote = 0;
1239 dest -> max_quality = 0;
1240
1241 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1242 {
1243 dest -> items[i] = src -> items[i];
1244 for(j=0; j< src->items[i]; j++)
1245 {
1246 unsigned int potential_position = src-> pos[i][j];
1247 float matchingness_count = 0;
1248 if (src -> votes [i][j]>= min_minor)
1249 //matchingness_count = match_read(read_str, read_len, potential_position, my_array_index, color_space, indel_tolerance, quality_str, quality_scale);
1250 matchingness_count = 1.*match_chro_indel(read_str, my_array_index, potential_position, read_len, 0, color_space, indel_tolerance, NULL, 0);
1251
1252 dest -> pos[i][j] = potential_position;
1253 dest -> quality[i][j] = matchingness_count;
1254 dest -> votes [i][j] =src -> votes [i][j];
1255 dest -> masks[i][j] = src -> masks[i][j];
1256 dest -> coverage_start[i][j] = src -> coverage_start[i][j];
1257 dest -> coverage_end[i][j] = src -> coverage_end[i][j];
1258 memcpy(dest -> indel_recorder[i][j], src -> indel_recorder[i][j], MAX_INDEL_TOLERANCE*3*sizeof(char));
1259 // find for the `best positions'; only the best positions are replaced by the base machingness scores.
1260 if((matchingness_count > dest -> max_quality && src -> votes [i][j] == dest -> max_vote) || (src -> votes [i][j] > dest -> max_vote))
1261 {
1262 memcpy(dest -> max_indel_recorder, src -> indel_recorder[i][j], MAX_INDEL_TOLERANCE*3*sizeof(char));
1263 dest -> max_vote = src -> votes [i][j];
1264 dest -> max_mask = src -> masks[i][j];
1265 dest -> max_quality = matchingness_count;
1266 dest -> max_position = potential_position;
1267 dest -> max_coverage_start = src -> coverage_start[i][j];
1268 dest -> max_coverage_end = src -> coverage_end[i][j];
1269 }
1270 }
1271 }
1272 }
1273
select_positions_array(char * read1_str,int read1_len,char * read2_str,int read2_len,gene_vote_t * vote_read1,gene_vote_t * vote_read2,gene_vote_number_t * numvote_read1,gene_vote_number_t * numvote_read2,gene_quality_score_t * sum_quality,gene_quality_score_t * qual_r1,gene_quality_score_t * qual_r2,gene_vote_number_t * r1_recorder,gene_vote_number_t * r2_recorder,gehash_data_t * pos_read1,gehash_data_t * pos_read2,unsigned int max_pair_dest,unsigned int min_pair_dest,int min_major,int min_minor,int is_negative_strand,gene_value_index_t * my_array_index,int color_space,int indel_tolerance,int number_of_anchors_quality,const char quality_str1[],const char quality_str2[],int quality_scale,int max_indel_len,int * is_break_even)1274 int select_positions_array(char * read1_str, int read1_len, char * read2_str, int read2_len, gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2, gene_quality_score_t * sum_quality, gene_quality_score_t * qual_r1, gene_quality_score_t * qual_r2, gene_vote_number_t * r1_recorder, gene_vote_number_t * r2_recorder, gehash_data_t * pos_read1, gehash_data_t * pos_read2, unsigned int max_pair_dest, unsigned int min_pair_dest, int min_major, int min_minor,int is_negative_strand, gene_value_index_t * my_array_index, int color_space, int indel_tolerance, int number_of_anchors_quality, const char quality_str1 [], const char quality_str2 [], int quality_scale, int max_indel_len, int * is_break_even)
1275 {
1276 gene_vote_t base_vote_1 , base_vote_2;
1277
1278 mark_votes_array_index(read1_str, read1_len, &base_vote_1, vote_read1, my_array_index, color_space, indel_tolerance, min_minor, quality_str1, quality_scale, is_negative_strand);
1279 mark_votes_array_index(read2_str, read2_len, &base_vote_2, vote_read2, my_array_index, color_space, indel_tolerance, min_minor, quality_str2, quality_scale, is_negative_strand);
1280
1281 // SUBREADprintf("Q=%.3f\n", *qual_r2);
1282
1283 return select_positions(&base_vote_1, &base_vote_2, numvote_read1, numvote_read2, sum_quality, qual_r1, qual_r2, pos_read1, pos_read2, r1_recorder , r2_recorder ,max_pair_dest, min_pair_dest, min_major, min_minor, is_negative_strand, number_of_anchors_quality,max_indel_len, is_break_even);
1284 }
1285
1286
select_positions(gene_vote_t * vote_read1,gene_vote_t * vote_read2,gene_vote_number_t * numvote_read1,gene_vote_number_t * numvote_read2,gene_quality_score_t * sum_quality,gene_quality_score_t * qual_r1,gene_quality_score_t * qual_r2,gehash_data_t * pos_read1,gehash_data_t * pos_read2,gene_vote_number_t * read1_indel_recorder,gene_vote_number_t * read2_indel_recorder,unsigned int max_pair_dest,unsigned int min_pair_dest,int min_major,int min_minor,int is_negative_strand,int number_of_anchors_quality,int max_indel_len,int * is_breakeven)1287 int select_positions(gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2, gene_quality_score_t * sum_quality, gene_quality_score_t * qual_r1, gene_quality_score_t * qual_r2 , gehash_data_t * pos_read1, gehash_data_t * pos_read2, gene_vote_number_t * read1_indel_recorder, gene_vote_number_t * read2_indel_recorder, unsigned int max_pair_dest, unsigned int min_pair_dest, int min_major, int min_minor,int is_negative_strand, int number_of_anchors_quality, int max_indel_len, int * is_breakeven)
1288 {
1289
1290 int k, i, j, anchors = 0;
1291 gehash_data_t anchors_position [ANCHORS_NUMBER];
1292 unsigned char anchor_read [ANCHORS_NUMBER];
1293
1294 gene_vote_number_t anchor_votes = max(vote_read1->max_vote, vote_read2->max_vote);
1295 gene_vote_number_t anchors_votes [ANCHORS_NUMBER];
1296 gene_quality_score_t anchors_quality [ANCHORS_NUMBER];
1297 int anchors_buckets [ANCHORS_NUMBER];
1298 int anchors_index [ANCHORS_NUMBER];
1299
1300 gene_vote_number_t minor_votes [ANCHORS_NUMBER];
1301 gene_quality_score_t minor_quality [ANCHORS_NUMBER];
1302 gehash_data_t minor_position [ANCHORS_NUMBER];
1303 int minor_buckets [ANCHORS_NUMBER];
1304 int minor_index [ANCHORS_NUMBER];
1305 char is_minor_breakeven [ANCHORS_NUMBER];
1306
1307 if(anchor_votes < min_major)
1308 return 0;
1309
1310 for (k=0; k<2; k++)
1311 {
1312 gene_vote_t * current_vote = k?vote_read2:vote_read1;
1313 if (current_vote->max_vote < anchor_votes)
1314 continue;
1315
1316 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1317 for(j=0; j< current_vote->items[i]; j++)
1318 {
1319 if(current_vote->votes[i][j] >= anchor_votes - 0)
1320 {
1321 anchors_position[anchors] = current_vote->pos[i][j];
1322 anchors_votes [anchors] = current_vote->votes[i][j];
1323 anchors_quality [anchors] = current_vote->quality[i][j];
1324 anchor_read [anchors] = k;
1325 anchors_buckets [anchors] = i;
1326 anchors_index [anchors] = j;
1327
1328 anchors++;
1329
1330 if(anchors >= ANCHORS_NUMBER)
1331 {
1332 // SUBREADprintf("Abnormally too many anchors [%d, %d] %f\n", i, j, anchor_votes);
1333 anchors --;
1334 }
1335 }
1336 }
1337
1338 }
1339
1340 memset(minor_votes,0, ANCHORS_NUMBER*sizeof(gene_vote_number_t));
1341 memset(minor_quality,0, ANCHORS_NUMBER*sizeof(gene_quality_score_t));
1342 memset(is_minor_breakeven,0, ANCHORS_NUMBER);
1343 for (k=0; k<2; k++)
1344 {
1345 gene_vote_t * current_vote = k?vote_read2:vote_read1;
1346 gene_vote_t * current_vote2 = k?vote_read1:vote_read2;
1347
1348 if (current_vote2->max_vote < anchor_votes)
1349 continue;
1350
1351 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1352 for(j=0; j< current_vote->items[i]; j++)
1353 {
1354 if(current_vote->votes[i][j] >= min_minor)
1355 {
1356 int l;
1357 for (l=0; l<anchors; l++)
1358 {
1359
1360 if(anchor_read[l]==k)continue;
1361
1362 long long int abdist = current_vote->pos[i][j];
1363 abdist -= anchors_position[l];
1364
1365 if((anchor_read[l] && !is_negative_strand) ||
1366 ((!anchor_read[l]) && is_negative_strand))
1367 abdist = -abdist;
1368
1369 if (abdist < 0)
1370 //#warning WGSIM generates paired-end data in a wrong way; abdist is given its absolute value in such a case.
1371 abdist = -abdist;
1372 // continue;
1373
1374 long long int abdist_old = minor_position[l];
1375 abdist_old -= anchors_position[l];
1376 if(abdist_old <0) abdist_old=-abdist_old;
1377 if ( (minor_votes[l] < current_vote->votes[i][j] ||
1378 (minor_votes[l] == current_vote->votes[i][j] && current_vote->quality [i][j] > minor_quality[l]) ||
1379 (minor_votes[l] == current_vote->votes[i][j] && abs(current_vote->quality [i][j] - minor_quality[l])<0.0001 && abdist < abdist_old))&&
1380 (abdist <= max_pair_dest) &&
1381 (abdist >= min_pair_dest)
1382 )
1383 {
1384 minor_votes[l] = current_vote->votes[i][j];
1385 minor_position[l] = current_vote->pos[i][j];
1386 minor_quality[l] = current_vote->quality [i][j];
1387 minor_buckets[l] = i;
1388 minor_index[l] = j;
1389 }
1390 }
1391 }
1392 }
1393
1394 }
1395
1396 gene_vote_number_t selection_minor_vote = 0;
1397 gene_vote_number_t selection_major_vote = 0;
1398 gehash_data_t selection_minor_pos = 0;
1399 gehash_data_t selection_major_pos = 0;
1400
1401 int selected_major_index=0, selected_major_bucket=0, selected_minor_bucket=0, selected_minor_index=0;
1402
1403 *sum_quality = 0;
1404
1405 unsigned char selection_major_read_no = 0;
1406 //int reason = 0;
1407
1408 for (k=0; k<anchors; k++)
1409 {
1410 if(minor_votes[k]==0)
1411 continue;
1412
1413 int need_replace = 0;
1414
1415
1416 if ((minor_votes[k]+anchors_votes[k]) > (selection_minor_vote+selection_major_vote) ||
1417 ((minor_votes[k]+anchors_votes[k]) == (selection_minor_vote+selection_major_vote) &&
1418 minor_quality[k] + anchors_quality[k] > *sum_quality))
1419 {
1420 need_replace = 1;
1421 }
1422 else if (minor_votes[k]+anchors_votes[k] == selection_minor_vote+selection_major_vote &&
1423 minor_quality[k] + anchors_quality[k] == *sum_quality)
1424 {
1425 if (selection_minor_pos != minor_position[k] && minor_position[k] != selection_major_pos)
1426 {
1427 * is_breakeven = 1;
1428 if (selection_major_pos > anchors_position[k]) need_replace = 1;
1429 }
1430 }
1431
1432 if(need_replace){
1433 selection_minor_pos = minor_position[k];
1434 selection_major_pos = anchors_position[k];
1435 selection_minor_vote = minor_votes[k];
1436 selection_major_vote = anchors_votes[k];
1437 selection_major_read_no = anchor_read[k];
1438 selected_major_bucket = anchors_buckets[k];
1439 selected_minor_bucket = minor_buckets[k];
1440 selected_major_index = anchors_index[k];
1441 selected_minor_index = minor_index[k];
1442 * is_breakeven = is_minor_breakeven[k];
1443 * qual_r1 = anchor_read[k] ? minor_quality[k]:anchors_quality[k];
1444 * qual_r2 = anchor_read[k] ? anchors_quality[k]: minor_quality[k];
1445 * sum_quality = minor_quality[k] + anchors_quality[k];
1446
1447 }
1448 }
1449
1450 if(selection_minor_vote > 0)
1451 {
1452 if(selection_major_read_no) // major is on read 2
1453 {
1454 *numvote_read1 = selection_minor_vote;
1455 *numvote_read2 = selection_major_vote;
1456 *pos_read1 = selection_minor_pos;
1457 *pos_read2 = selection_major_pos ;
1458 indel_recorder_copy(read1_indel_recorder, vote_read1 -> indel_recorder[selected_minor_bucket][selected_minor_index]);
1459 indel_recorder_copy(read2_indel_recorder, vote_read2 -> indel_recorder[selected_major_bucket][selected_major_index]);
1460 }
1461 else
1462 {
1463 *numvote_read1 = selection_major_vote;
1464 *numvote_read2 = selection_minor_vote;
1465 *pos_read1 = selection_major_pos;
1466 *pos_read2 = selection_minor_pos;
1467 indel_recorder_copy(read1_indel_recorder, vote_read1 -> indel_recorder[selected_major_bucket][selected_major_index]);
1468 indel_recorder_copy(read2_indel_recorder, vote_read2 -> indel_recorder[selected_minor_bucket][selected_minor_index]);
1469 }
1470 return 1;
1471 }
1472
1473 return 0;
1474 }
1475
destroy_offsets(gene_offset_t * offsets)1476 void destroy_offsets(gene_offset_t* offsets)
1477 {
1478 free(offsets->read_names);
1479 free(offsets->read_offsets);
1480 HashTableDestroy(offsets->read_name_to_index);
1481 }
1482
load_offsets(gene_offset_t * offsets,const char index_prefix[])1483 int load_offsets(gene_offset_t* offsets , const char index_prefix [])
1484 {
1485 char fn[MAX_FILE_NAME_LENGTH];
1486 FILE * fp;
1487 int n=0;
1488 int padding = 0;
1489
1490 int is_V3_index = gehash_load_option(index_prefix, SUBREAD_INDEX_OPTION_INDEX_PADDING , &padding);
1491 if(!is_V3_index) return 1;
1492
1493 memset(offsets, 0, sizeof( gene_offset_t));
1494 sprintf(fn, "%s.reads", index_prefix);
1495
1496 fp = f_subr_open(fn, "r");
1497
1498 if(!fp)
1499 {
1500 SUBREADprintf("file not found :%s\n", fn);
1501 return 1;
1502 }
1503
1504 int current_max_n = 100;
1505 offsets->read_names = malloc(current_max_n * MAX_READ_NAME_LEN);
1506 offsets->read_offsets= malloc(current_max_n * sizeof(int));
1507 offsets->read_name_to_index = HashTableCreate(5000);
1508 offsets->padding = padding;
1509
1510 HashTableSetKeyComparisonFunction(offsets->read_name_to_index, my_strcmp);
1511 HashTableSetHashFunction(offsets->read_name_to_index ,HashTableStringHashFunction);
1512 HashTableSetDeallocationFunctions(offsets->read_name_to_index ,free, NULL);
1513
1514
1515
1516 while (!feof(fp))
1517 {
1518 int i=0, step = 0, j=0;
1519
1520 read_line(MAX_FILE_NAME_LENGTH-1,fp, fn, 0);
1521 if (strlen(fn)<2)continue;
1522 while (fn[i])
1523 {
1524 if (fn[i] == '\t')
1525 {
1526 fn[i]=0;
1527 offsets->read_offsets[n] = (unsigned int)atoll(fn);
1528 step = 1;
1529 }
1530 else if (step)
1531 {
1532 if(j<MAX_CHROMOSOME_NAME_LEN-1)
1533 {
1534 *(offsets->read_names+MAX_CHROMOSOME_NAME_LEN*n + (j++)) = fn[i];
1535 *(offsets->read_names+MAX_CHROMOSOME_NAME_LEN*n + j) = 0;
1536 }
1537 }
1538 i++;
1539 }
1540
1541 char * read_name_mem = malloc(MAX_CHROMOSOME_NAME_LEN);
1542 strcpy(read_name_mem, offsets->read_names + n*MAX_CHROMOSOME_NAME_LEN);
1543
1544 //printf("09NOV: add_annotation_to_junctions: '%s',%u\n", read_name_mem , n);
1545 HashTablePut(offsets->read_name_to_index, read_name_mem , NULL + 1 + n);
1546
1547 n++;
1548 if(n >= current_max_n)
1549 {
1550 offsets->read_names = realloc(offsets->read_names, 2*current_max_n * MAX_CHROMOSOME_NAME_LEN);
1551 offsets->read_offsets = realloc(offsets->read_offsets, 2*current_max_n * sizeof(int));
1552 current_max_n*=2;
1553 }
1554
1555
1556 offsets->read_offsets[n] = 0;
1557
1558 }
1559
1560 offsets->total_offsets=n;
1561 fclose(fp);
1562 return 0;
1563 }
1564
1565 #ifndef MAKE_STANDALONE
1566 #define CLOCK_USE_GETTIME
1567 #endif
1568
miltime()1569 double miltime(){
1570 double ret;
1571 #if defined(__FreeBSD__) || defined(__DragonFly__)
1572 struct timeval tp;
1573 struct timezone tz;
1574 tz.tz_minuteswest=0;
1575 tz.tz_dsttime=0;
1576 gettimeofday(&tp,&tz);
1577 ret = tp.tv_sec+ 0.001*0.001* tp.tv_usec;
1578 #else
1579 #ifdef CLOCK_USE_GETTIME
1580 struct timespec tsc;
1581 clock_gettime(CLOCK_REALTIME, &tsc);
1582 ret = tsc.tv_sec*1. + tsc.tv_nsec*1./1000/1000/1000;
1583 #else
1584 struct timeb trp;
1585 ftime(&trp);
1586 ret = trp.time*1.0+(trp.millitm*1.0/1000.0);
1587 #endif
1588 #endif
1589
1590 return ret;
1591 }
1592
1593 #define front2(str, bias) (*((str)+(bias))+*((str)+1+(bias)))
1594 #define front4(str, bias) (front2(str, bias)+front2(str, bias+2))
1595 #define front8(str, bias) (front4(str, bias)+front4(str, bias+4))
1596 #define front16(str, bias) (front8(str, bias)+front8(str, bias+8))
1597
match_read(const char read_str[],int read_len,unsigned int potential_position,gene_value_index_t * my_array_index,int space_type,int indel_tolerance,const char quality_str[],int quality_scale)1598 float match_read(const char read_str[], int read_len, unsigned int potential_position, gene_value_index_t * my_array_index, int space_type, int indel_tolerance, const char quality_str [], int quality_scale)
1599 {
1600 float ret = 0;
1601 int i, bias;
1602 char read_matchingness [7][1250];
1603
1604 if(indel_tolerance>3) indel_tolerance = 3;
1605
1606 for(bias=-indel_tolerance; bias<=indel_tolerance;bias++)
1607 {
1608 for(i=0;i<read_len; i++)
1609 {
1610 char base_int = base2int(read_str[i]);
1611 int is_matched_base = gvindex_match_base(my_array_index, potential_position+i+bias, base_int);
1612 read_matchingness[bias+indel_tolerance][i] = is_matched_base;
1613
1614 /* if(i % 3 == 2)
1615 {
1616 int b2_match = (read_matchingness[bias+indel_tolerance][i -2] && read_matchingness[bias+indel_tolerance][i -1] && read_matchingness[bias+indel_tolerance][i]);
1617 if(b2_match)
1618 {
1619 read_matchingness[bias+indel_tolerance][i] = 2;
1620 read_matchingness[bias+indel_tolerance][i-1] = 2;
1621 read_matchingness[bias+indel_tolerance][i-2] = 2;
1622 }
1623 }*/
1624 }
1625 }
1626
1627 for(i=0; i<read_len-4; i+=4)
1628 {
1629
1630 int j;
1631 int best_movement = 0;
1632 float max_matchness = -1;
1633 for (j=-indel_tolerance; j<=indel_tolerance; j++)
1634 {
1635 int m = front4(read_matchingness[j],i);
1636 if(m > max_matchness)
1637 {
1638 max_matchness = m*1.;
1639 best_movement = j;
1640 }
1641 }
1642
1643 // SUBREADprintf("best-match=%f\n", max_matchness);
1644 //
1645
1646 if (quality_str[0])
1647 {
1648 max_matchness = 0;
1649 // float qs0 = max_matchness;
1650 for (j=0; j<4; j++)
1651 {
1652 if(read_matchingness[best_movement][j+i])
1653 {
1654 // penalty
1655 max_matchness += 1.03+get_base_quality_score(quality_str[j+i], quality_scale);
1656 }
1657 else
1658 {
1659 // relief
1660 //max_matchness += 0.2-get_base_phred(quality_str[j+i])*0.01;
1661 }
1662 }
1663 // SUBREADprintf("Delta matchingness = %f\n", qs0 - max_matchness);
1664 }
1665
1666 ret += max_matchness; //front8(read_matchingness[movement], i); // (front4(read_matchingness[movement], i)==4?4:0);
1667 }
1668
1669
1670 return ret;
1671 }
1672
1673
final_matchingness_scoring(const char read_str[],const char quality_str[],int read_len,gene_vote_t * vote,gehash_data_t * max_position,gene_vote_number_t * max_vote,short * max_mask,gene_quality_score_t * max_quality,gene_value_index_t * my_array_index,int space_type,int indel_tolerance,int quality_scale,gene_vote_number_t * max_indel_recorder,int * max_coverage_start,int * max_coverage_end)1674 void final_matchingness_scoring(const char read_str[], const char quality_str[], int read_len, gene_vote_t * vote, gehash_data_t * max_position, gene_vote_number_t * max_vote, short *max_mask, gene_quality_score_t * max_quality, gene_value_index_t * my_array_index, int space_type, int indel_tolerance, int quality_scale, gene_vote_number_t * max_indel_recorder, int * max_coverage_start, int * max_coverage_end)
1675 {
1676 int i, j;
1677 gene_quality_score_t max_matching = -1.0;
1678
1679 *max_vote = vote -> max_vote;
1680
1681 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1682 for(j=0; j< vote->items[i]; j++)
1683 {
1684 if( vote->votes[i][j] < vote -> max_vote -1 ) continue;
1685
1686 unsigned int potential_position = vote -> pos[i][j];
1687 gene_quality_score_t matchingness_count = 1.*match_chro_indel((char *)read_str, my_array_index, potential_position, read_len, 0, space_type, indel_tolerance, NULL, 0);
1688 if(matchingness_count > max_matching)
1689 {
1690 max_matching = matchingness_count;
1691 *max_position = potential_position;
1692 *max_mask = vote -> masks[i][j];
1693 *max_coverage_start = vote-> coverage_start[i][j];
1694 *max_coverage_end = vote -> coverage_end[i][j];
1695 indel_recorder_copy(max_indel_recorder, vote-> indel_recorder[i][j]);
1696
1697 * max_quality = max_matching;
1698 }
1699 else if (matchingness_count == max_matching)
1700 {
1701 // SUBREADprintf("\nBREAK EVEN DETECTED AT SORTED TABLE: %u (kept) and %u\n", vote->max_position, vote -> pos[i][j]);
1702 (*max_mask) |= IS_BREAKEVEN_READ ;
1703 }
1704 }
1705 }
1706
1707 int DPALIGN_CREATEGAP_PENALTY = -2 ;
1708 int DPALIGN_EXTENDGAP_PENALTY = 0 ;
1709 int DPALIGN_MISMATCH_PENALTY = 0;
1710 int DPALIGN_MATCH_SCORE = 2;
1711
1712
1713
search_DP_branch(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int path_i,int path_j,short ** table,char ** table_mask,int max_indel,char * movement_buffer,int expected_offset,int current_score,int out_pos,int current_offset,int init_read_offset,int shutdown_read_offset,int * all_steps)1714 int search_DP_branch(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int path_i, int path_j, short ** table,char ** table_mask, int max_indel, char * movement_buffer, int expected_offset, int current_score, int out_pos, int current_offset, int init_read_offset, int shutdown_read_offset, int * all_steps)
1715 {
1716
1717 if (1499 - out_pos > (read_len <<2) || (*all_steps) > 3000 + (read_len << 3) || (*all_steps) > 33000){
1718 /*ddprintf("\nTOO MANY STEPS: rl = %d len=%d steps=%d\n", read_len, 1499 - out_pos, *all_steps);
1719 ddfflush(stdout) ;*/
1720 return 0;
1721 }
1722 if(path_j < 0 || path_i < 0)
1723 {
1724 int i;
1725 //ddprintf("\nFINAL TEST exp %d real-offset %d\n", expected_offset, current_offset);
1726 if (expected_offset !=current_offset)return 0;
1727 for (i=0; i<=path_j; i++) movement_buffer[out_pos--] = 1;
1728 for (i=0; i<=path_i; i++) movement_buffer[out_pos--] = 2;
1729 return out_pos ;
1730 }
1731
1732 short upper_score;
1733 if (path_i>0) upper_score = table[path_i-1][path_j];
1734 else upper_score = 0;
1735
1736 short left_score;
1737 if (path_j>0) left_score = table[path_i][path_j-1];
1738 else left_score = 0;
1739
1740 short upperleft_score;
1741 if (path_j>0 && path_i>0) upperleft_score = table[path_i-1][path_j-1];
1742 else upperleft_score = 0;
1743
1744 char is_matched_ij = gvindex_get(index, begin_position + path_j) == read[path_i]?DPALIGN_MATCH_SCORE :DPALIGN_MISMATCH_PENALTY;
1745
1746 int found = 0;
1747 int table_mask_j_1 = 0, table_mask_i_1 = 0;
1748 int table_mask2_i_1 = 0;
1749 if(path_j >0 && path_i >=0)
1750 {
1751 table_mask_j_1 = left_score + (table_mask[path_i][path_j-1]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) == current_score;
1752 }
1753 if(path_i >0 && path_j >=0)
1754 {
1755 table_mask_i_1 = upper_score + (table_mask[path_i-1][path_j]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) == current_score;
1756 table_mask2_i_1 = upper_score + (table_mask[path_i-1][path_j]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) < 0 ;
1757 }
1758
1759
1760 if ((!found ) && ( table_mask_j_1 || (current_score == 0 && table_mask_i_1)))
1761 {
1762 movement_buffer[out_pos] = 1;
1763 (*all_steps) ++;
1764 //ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 1\n", path_i, path_j, current_offset);
1765 //ddfflush(stdout) ;
1766 found =search_DP_branch (read, read_len, index, begin_position, path_i , path_j -1, table , table_mask, max_indel, movement_buffer, expected_offset, left_score, out_pos -1, current_offset - ((path_i >= init_read_offset && path_i <= shutdown_read_offset)?1:0), init_read_offset, shutdown_read_offset, all_steps);
1767 }
1768 if ((!found )&& ( table_mask_i_1 || (current_score == 0 && table_mask2_i_1)))
1769 {
1770 movement_buffer[out_pos] = 2;
1771 (*all_steps) ++;
1772 //ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 2\n", path_i, path_j, current_offset);
1773 //ddfflush(stdout) ;
1774 found =search_DP_branch (read, read_len, index, begin_position, path_i -1 , path_j , table , table_mask , max_indel, movement_buffer, expected_offset, upper_score, out_pos -1, current_offset + ((path_i >= init_read_offset && path_i <= shutdown_read_offset)?1:0), init_read_offset, shutdown_read_offset, all_steps);
1775 }
1776 if((!found )&& (is_matched_ij + (upperleft_score - current_score) == 0))
1777 {
1778 movement_buffer[out_pos] = is_matched_ij==2?0:3;
1779 //ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 0\n", path_i, path_j, current_offset);
1780 (*all_steps) ++;
1781 found = search_DP_branch (read, read_len, index, begin_position, path_i -1 , path_j -1, table , table_mask, max_indel, movement_buffer, expected_offset, upperleft_score, out_pos -1, current_offset, init_read_offset, shutdown_read_offset, all_steps);
1782 }
1783
1784
1785 return found;
1786 }
1787
1788 #define INDEL_WINDOW_WIDTH 4
1789
window_indel_align(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int max_indel,char * movement_buffer,int expected_offset,int init_read_offset,int shutdown_read_offset)1790 int window_indel_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset, int init_read_offset, int shutdown_read_offset)
1791 {
1792 short indel_windows[32];
1793 int scores [32][MAX_READ_LENGTH];
1794 int i,j,windows_number;
1795 char chro_str[200];
1796
1797 memset(indel_windows,0, 32*sizeof(short));
1798
1799 windows_number = abs(expected_offset)+1;
1800 //ddprintf ("\nWindow size = %d ; ExpOffset = %d\n", windows_number , expected_offset);
1801 ddfflush(stdout) ;
1802 for (i=0; i< read_len; i++)
1803 for (j=0; j< windows_number; j++)
1804 {
1805 int matchingness ;
1806 if (j==0) chro_str[i] = gvindex_get(index, begin_position+i);
1807 if (expected_offset<0)
1808 matchingness = *(read + i) == gvindex_get(index, begin_position + j + i) ;
1809 else
1810 matchingness = *(read + i) == gvindex_get(index, begin_position - windows_number + j + 1 + i) ;
1811 indel_windows [j] += matchingness;
1812
1813 if (i >= INDEL_WINDOW_WIDTH)// The result for i - INDEL_WINDOW_WIDTH is OK
1814 {
1815 scores[j][i - INDEL_WINDOW_WIDTH] = indel_windows [j] ;
1816 }
1817 if (i >= INDEL_WINDOW_WIDTH)// The result for i - INDEL_WINDOW_WIDTH is OK
1818 {
1819
1820 if (expected_offset<0)
1821 matchingness = *(read - INDEL_WINDOW_WIDTH + i) == gvindex_get(index, begin_position + j + i - INDEL_WINDOW_WIDTH) ;
1822 else
1823 matchingness = *(read - INDEL_WINDOW_WIDTH + i) == gvindex_get(index, begin_position -windows_number + j +1 + i - INDEL_WINDOW_WIDTH) ;
1824 indel_windows [j] -= matchingness;
1825 }
1826 }
1827 chro_str[i]=0;
1828 j = read[read_len];
1829 read[read_len]=0;
1830
1831 ddprintf ("CHRO=%s\nREAD=%s\n", chro_str, read);
1832
1833 /*
1834 float contingency_list [MAX_READ_LENGTH];
1835 for (i=0; i< read_len - windows_number-INDEL_WINDOW_WIDTH; i++)
1836 {
1837 float contingency;
1838 if (expected_offset >0)
1839 contingency = ((scores[0][i]*1. + 1)-(scores[windows_number-1][i]*1.+1));
1840 else
1841 contingency = ((scores[windows_number-1][i]*1. + 1)-(scores[0][i]*1.+1));
1842 contingency_list[i]=contingency;
1843 }*/
1844
1845 int max_score=-1;
1846 int max_pos = -1;
1847 if (expected_offset>0)
1848 {
1849 //insertion
1850 for (i=read_len-INDEL_WINDOW_WIDTH-1; i>=0; i--)
1851 {
1852 if (scores[windows_number - 1][i - expected_offset]>=2 && scores[0][i] >= max_score)
1853 {
1854 max_pos = i - windows_number+1;
1855 max_score = scores[0][i];
1856 }
1857 }
1858 }
1859 else
1860 {
1861 //deletion
1862 for (i=read_len-INDEL_WINDOW_WIDTH-1; i>=0; i--)
1863 {
1864 if (scores[windows_number -1][i] >= max_score && scores[0][i + expected_offset] >=2)
1865 {
1866 max_score = scores[windows_number -1][i];
1867 max_pos = i ;
1868 }
1869 }
1870
1871 }
1872
1873 max_pos = min(read_len, max(0, max_pos));
1874
1875 int move_number = 0;
1876
1877 for (i=0; i< read_len -INDEL_WINDOW_WIDTH; i++)
1878 {
1879 if (i==max_pos)
1880 for (j = 0; j < windows_number-1; j++)
1881 movement_buffer[move_number++] = 1+(expected_offset>0); // 1=del; 2=ins
1882 if (i!=max_pos || expected_offset <0)
1883 movement_buffer [move_number++] = 0; // matched
1884
1885 #ifdef indel_debug
1886 for (j=0; j< windows_number; j++)
1887 SUBREADprintf("%d ",scores[j][i]);
1888 SUBREADprintf (" %c\n", (i==max_pos)?'*':' ') ;
1889 #endif
1890 }
1891 for(; i< read_len; i++)
1892 movement_buffer [move_number++] = 0; // matched
1893
1894
1895
1896 read[read_len] = j;
1897 return move_number;
1898 }
1899
dynamic_align(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int max_indel,char * movement_buffer,int expected_offset,int init_read_offset,int shutdown_read_offset,short ** table,char ** table_mask)1900 int dynamic_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset, int init_read_offset, int shutdown_read_offset, short **table , char ** table_mask)
1901 // read must be converted to the positive strand.
1902 // movement buffer: 0:match, 1: read-insert, 2: gene-insert, 3:mismatch
1903 // the size of the movement buffer must be equal to the length of the read plus max_indel * 3.
1904 {
1905
1906
1907 int i,j;
1908
1909 #ifdef indel_debug
1910 int ii, jj;
1911 for(ii = 0; ii<read_len - expected_offset; ii++)
1912 putchar(gvindex_get(index, begin_position + ii));
1913
1914 SUBREADprintf ("\n%s\n", read);
1915 #endif
1916
1917 for (i=0; i<read_len ; i++)
1918 {
1919 for(j=0; j<read_len - expected_offset; j++)
1920 {
1921 table_mask[i][j]=0;
1922 if (j < i - max_indel || j > max_indel + i)
1923 {
1924 table[i][j]=-9999;
1925 #ifdef indel_debug
1926 putchar('\t');
1927 #endif
1928 continue;
1929 }
1930
1931 short from_upper;
1932
1933 if (i>0) from_upper = table[i-1][j] + (table_mask[i-1][j]?DPALIGN_EXTENDGAP_PENALTY:DPALIGN_CREATEGAP_PENALTY);
1934 else from_upper = DPALIGN_CREATEGAP_PENALTY;
1935
1936 short from_left;
1937
1938 if (j>0) from_left = table[i][j-1] + (table_mask[i][j-1]?DPALIGN_EXTENDGAP_PENALTY:DPALIGN_CREATEGAP_PENALTY);
1939 else from_left = DPALIGN_CREATEGAP_PENALTY;
1940
1941 char chromo_ch = gvindex_get(index, begin_position + j);
1942 char is_matched_ij = (chromo_ch == read[i])?DPALIGN_MATCH_SCORE:DPALIGN_MISMATCH_PENALTY;
1943
1944 short from_upperleft;
1945 if (i>0 && j>0) from_upperleft = table[i-1][j-1] + is_matched_ij;
1946 else from_upperleft = is_matched_ij;
1947 if ((i ==0 || j ==0) && (i+j>0)) from_upperleft += DPALIGN_CREATEGAP_PENALTY;
1948
1949 if (from_upperleft <=from_left || from_upperleft <= from_upper)
1950 table_mask[i][j]=1;
1951
1952 table[i][j]=max(0,max(from_upper, max(from_left, from_upperleft)));
1953 #ifdef indel_debug
1954 SUBREADprintf("%c%c\t", chromo_ch, read[i]);
1955 #endif
1956 }
1957 #ifdef indel_debug
1958 SUBREADputs("");
1959 #endif
1960
1961 }
1962 #ifdef indel_debug
1963 //SUBREADputs("");
1964 //SUBREADputs("");
1965
1966 #endif
1967
1968 short current_score = table[read_len-1] [read_len - expected_offset-1];
1969 short path_i = read_len-1 ;
1970 int out_pos = 1499;
1971 char out_tmp [1500];
1972 int all_steps = 0;
1973 j = read_len-expected_offset -1;
1974
1975 #ifdef indel_debug
1976
1977 for(ii=0;ii< 20; ii++)
1978 {
1979 for(jj=0; jj<20; jj++)
1980 SUBREADprintf("%d\t",table[ii][jj]);
1981 SUBREADputs("");
1982 }
1983 SUBREADputs("");
1984 SUBREADputs("");
1985
1986 for(ii=0;ii< 20; ii++)
1987 {
1988 for(jj=0; jj<20; jj++)
1989 SUBREADprintf("%d\t",table_mask[ii][jj]);
1990 SUBREADputs("");
1991 }
1992 #endif
1993
1994 out_pos = search_DP_branch(read, read_len, index, begin_position, path_i, j, table, table_mask, max_indel, out_tmp , expected_offset, current_score , out_pos, 0, init_read_offset,shutdown_read_offset, &all_steps);
1995
1996 if (out_pos)
1997 {
1998 memcpy(movement_buffer, out_tmp + out_pos +1, 1499 - out_pos);
1999 return 1499 - out_pos;
2000 }
2001 else return 0;
2002
2003 }
2004
extend_covered_region(gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)2005 int extend_covered_region(gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate)
2006 {
2007 int ret = 0;
2008 *head_indel_pos = -1;
2009 *tail_indel_pos = -1;
2010 if (cover_start >= window_size && head_matching_rate < 1.0001)
2011 {
2012 int head_test_len = cover_start;
2013 int roughly_mapped = match_chro(read, array_index, read_start_pos, head_test_len , 0, space_type);
2014 if (roughly_mapped >= head_test_len * CORE_RECOVER_MATCHING_RATE - 0.0001)
2015 {
2016 ret |= 1;
2017 }
2018 else
2019 {
2020 int window_end_pos = cover_start + window_size-1;
2021 int not_too_bad = 1;
2022 int right_match_number=0;
2023
2024 while (1)
2025 {
2026 int matched_bases_in_window = match_chro_wronglen(read+ window_end_pos - window_size, array_index, read_start_pos + window_end_pos - window_size, window_size, space_type, NULL, &right_match_number);
2027 int best_indel_movement = -1000;
2028 int best_indel_pos = -1;
2029
2030 if (matched_bases_in_window >= req_match_5end) // this window is not bad enough so that an indel is considered
2031 window_end_pos--;
2032 else
2033 {
2034 roughly_mapped = match_chro(read, array_index, read_start_pos, window_end_pos - right_match_number , 0, space_type);
2035 if (roughly_mapped < (int)(0.5+ (window_end_pos - right_match_number) * CORE_RECOVER_MATCHING_RATE))
2036 {
2037 // the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
2038 int indel_movement_i ;
2039
2040 int best_matched_bases_after_indel = -1;
2041
2042 not_too_bad = 0;
2043 for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance-1 ; indel_movement_i ++)
2044 {
2045 int indel_movement = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
2046 int test_length = window_end_pos /*- 1*/ - max(0, indel_movement) - right_match_number;
2047
2048 if (test_length < window_size) continue;
2049 //if (test_length <= 1+ abs(indel_movement)/4) continue;
2050 //test_length = min(10, test_length);
2051 if (abs(indel_movement) > indel_tolerance) continue;
2052
2053 int test_start =0;// window_end_pos - max(0, indel_movement) - right_match_number - test_length;
2054
2055 int matched_bases_after_indel = match_chro_support(read +test_start, array_index, read_start_pos + indel_movement +test_start, test_length,0, space_type, qual_txt, qual_format);
2056 float test_rate = head_matching_rate;
2057
2058 if(test_length < 3) test_rate = 1;
2059
2060 if(best_matched_bases_after_indel <matched_bases_after_indel && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
2061 {
2062 not_too_bad = 1;
2063
2064 best_matched_bases_after_indel = matched_bases_after_indel;
2065 best_indel_movement = indel_movement;
2066 best_indel_pos = window_end_pos - right_match_number;
2067
2068
2069
2070 *head_indel_pos = best_indel_pos;
2071 *head_indel_movement = best_indel_movement;
2072 }
2073 }
2074 if(best_indel_pos<0) *head_indel_pos = window_end_pos - right_match_number;
2075 break;
2076 }else window_end_pos--;
2077 }
2078 if (window_end_pos - window_size <= 0) break;
2079 }
2080 if(not_too_bad)
2081 ret |=1;
2082 //else
2083 //*head_indel_pos = -1;
2084 }
2085 }
2086 else ret |= 1;
2087
2088
2089
2090 if (cover_end <= read_len - window_size && tail_matching_rate < 1.0001)
2091 {
2092 int tail_test_len = read_len - cover_end;
2093 int roughly_mapped = match_chro(read + cover_end, array_index, read_start_pos + cover_end + tail_indel, tail_test_len , 0, space_type);
2094 if (roughly_mapped >= tail_test_len * CORE_RECOVER_MATCHING_RATE - 0.0001)
2095 {
2096 ret |= 2;
2097 }
2098 else
2099 {
2100 int window_start_pos = cover_end - window_size +1;
2101 int not_too_bad = 1;
2102
2103 while (1)
2104 {
2105 int left_match_number = 0;
2106 int matched_bases_in_window = match_chro_wronglen(read+ window_start_pos, array_index, read_start_pos + window_start_pos + tail_indel, window_size, space_type, &left_match_number, NULL);
2107 int best_indel_movement = -1000;
2108 int best_indel_pos = -1;
2109
2110 if (matched_bases_in_window >= req_match_3end) // this window is not bad enough so that an indel is considered
2111 window_start_pos++;
2112 else
2113 {
2114 roughly_mapped = match_chro(read+window_start_pos + left_match_number, array_index, read_start_pos + window_start_pos + tail_indel + left_match_number, read_len - window_start_pos - left_match_number , 0, space_type);
2115
2116 if (roughly_mapped < (int)(0.5+(read_len - window_start_pos -left_match_number ) * CORE_RECOVER_MATCHING_RATE))
2117 {
2118 // the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
2119 int indel_movement_i ;
2120 int best_matched_bases_after_indel = -1;
2121 not_too_bad = 0;
2122 for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance; indel_movement_i ++)
2123 {
2124
2125 int indel_adjustment = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
2126 int indel_movement = tail_indel + indel_adjustment;
2127 int test_length = read_len - window_start_pos - left_match_number + min(0, indel_adjustment);
2128 //test_length = min(10, test_length);
2129
2130
2131 if (test_length < window_size) continue;
2132 //if (test_length <= 1 + abs(indel_movement)/4) continue;
2133 if (abs(indel_movement) > indel_tolerance) continue;
2134
2135 int matched_bases_after_indel = match_chro_support(read + window_start_pos - min(0, indel_adjustment) + left_match_number, array_index, read_start_pos + window_start_pos + max(0,indel_movement) +left_match_number , test_length,0, space_type, qual_txt + window_start_pos - min(0, indel_adjustment) + left_match_number , qual_format);
2136
2137 float test_rate = tail_matching_rate;
2138
2139 if(test_length < 3) test_rate = 1;
2140
2141 if(best_matched_bases_after_indel <matched_bases_after_indel && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
2142 {
2143 not_too_bad = 1;
2144 best_matched_bases_after_indel = matched_bases_after_indel;
2145 best_indel_movement = indel_movement;
2146 best_indel_pos = window_start_pos + left_match_number ;//-1;
2147 *tail_indel_movement = best_indel_movement ;
2148 }
2149 }
2150
2151 if(best_indel_pos<0)
2152 *tail_indel_pos = window_start_pos + left_match_number ;
2153 else
2154 *tail_indel_pos = best_indel_pos;
2155 break;
2156 }else window_start_pos++;
2157 }
2158 if (window_start_pos + window_size >= read_len) break;
2159 }
2160 if(not_too_bad)
2161 ret |=2;
2162 //else
2163 // *tail_indel_pos =-1;
2164 }
2165 }
2166 else ret |= 2;
2167
2168
2169 return ret;
2170 }
2171
2172
match_base_quality_cs(gene_value_index_t * array_index,char * read_txt,unsigned int pos,char * qual_txt,int read_len,int phred_version,int * high_qual_unmatch,int * all_mismatched,int ql_kill,int head_clipped,int tail_clipped)2173 float match_base_quality_cs(gene_value_index_t *array_index, char * read_txt, unsigned int pos, char * qual_txt, int read_len, int phred_version, int * high_qual_unmatch, int * all_mismatched, int ql_kill, int head_clipped, int tail_clipped)
2174 {
2175 int i;
2176 int ret =0;
2177 char lastch;
2178 if(pos < array_index -> start_base_offset || pos + read_len >= array_index -> start_base_offset + array_index -> length){
2179 //SUBREADprintf("WARNING: BASE INDEX OUT OF LIMIT: %u < %u < %u\n%s\n", array_index -> start_base_offset , pos, array_index -> start_base_offset + array_index -> length, read_txt);
2180 // exit(-1);
2181 return (read_len - tail_clipped - head_clipped);
2182 }
2183 lastch = gvindex_get(array_index, pos);
2184 for(i=head_clipped; i<read_len - tail_clipped; i++)
2185 {
2186 char nch = gvindex_get(array_index, pos+i+1);
2187 int is_matched = read_txt[i] == '0'+chars2color(lastch, nch);
2188 if(is_matched){
2189 ret ++;
2190 }else
2191 {
2192 (*all_mismatched)++;
2193 (*high_qual_unmatch)++;
2194 ret --;
2195 }
2196
2197 lastch = nch;
2198 }
2199 return ret*1.;
2200 }
2201
match_base_quality(gene_value_index_t * array_index,char * read_txt,unsigned int pos,char * qual_txt,int read_len,int is_negative,int phred_version,int * high_qual_unmatch,int * all_mismatched,int ql_kill,int head_clipped,int tail_clipped)2202 float match_base_quality(gene_value_index_t *array_index, char * read_txt, unsigned int pos, char * qual_txt, int read_len, int is_negative, int phred_version, int * high_qual_unmatch, int * all_mismatched, int ql_kill, int head_clipped, int tail_clipped)
2203 {
2204 int i;
2205 int ret =0;
2206 if(pos < array_index -> start_base_offset || pos + read_len >= array_index -> start_base_offset + array_index -> length){
2207 //SUBREADprintf("WARNING: BASE INDEX OUT OF LIMIT: %u < %u < %u\n%s\n", array_index -> start_base_offset , pos, array_index -> start_base_offset + array_index -> length, read_txt);
2208 // exit(-1);
2209 return (read_len - tail_clipped - head_clipped);
2210 }
2211 for(i=head_clipped; i<read_len - tail_clipped; i++)
2212 {
2213 char true_chr;
2214 if(is_negative)
2215 {
2216 true_chr = gvindex_get(array_index, pos + read_len - i - 1);
2217 if(true_chr == 'A') true_chr = 'T';
2218 else if(true_chr == 'G') true_chr = 'C';
2219 else if(true_chr == 'C') true_chr = 'G';
2220 else true_chr = 'A';
2221 }
2222 else
2223 true_chr = gvindex_get(array_index, pos + i);
2224
2225 //SUBREADprintf("%c vs %c\n", true_chr , read_txt[i]);
2226
2227 if (true_chr == read_txt[i])
2228 {
2229 if(!qual_txt)
2230 ret += 1000000;
2231 else if(FASTQ_PHRED64 == phred_version)
2232 ret += (1000000-get_base_error_prob64i(qual_txt[i]));
2233 else
2234 ret += (1000000-get_base_error_prob33i(qual_txt[i]));
2235 }
2236 else
2237 {
2238 (*all_mismatched)++;
2239 if(!qual_txt)
2240 {
2241 ret -= 1000000;
2242 (*high_qual_unmatch)++;
2243 }
2244 else
2245 {
2246 int ql ;
2247 if(FASTQ_PHRED64 == phred_version)
2248 ql = get_base_error_prob64i(qual_txt[i]);
2249 else
2250 ql = get_base_error_prob33i(qual_txt[i]);
2251 /*
2252 #ifdef QUALITY_KILL
2253 #if QUALITY_KILL > 196
2254 #define QL_MIN 999000
2255 #endif
2256 #endif
2257
2258 #ifndef QL_MIN
2259 #define QL_MIN 200000
2260 #endif*/
2261
2262 if( ql < ql_kill) (*high_qual_unmatch)++;
2263
2264 ret += ql-1000000;
2265 }
2266 }
2267 }
2268 //printf ("SECTION QUAL = %d LEN = %d\n", ret, read_len);
2269 return ret/1000000.;
2270 }
2271
final_mapping_quality(gene_value_index_t * array_index,unsigned int pos,char * read_txt,char * qual_txt,char * cigar_txt,int phred_version,int * mismatch,int rl,char * refined_cigar,unsigned int * new_pos)2272 float final_mapping_quality(gene_value_index_t *array_index, unsigned int pos, char * read_txt, char * qual_txt, char * cigar_txt, int phred_version, int * mismatch, int rl, char * refined_cigar, unsigned int * new_pos)
2273 {
2274 int cigar_cursor = 0;
2275 int read_cursor = 0;
2276 long long int chromosome_cursor = pos, x;
2277 int cigar_length = strlen(cigar_txt);
2278 int i;
2279 int ret = 0;
2280
2281 x= 0;
2282 while(cigar_cursor < cigar_length)
2283 {
2284 if(cigar_txt[cigar_cursor] =='X')
2285 {
2286 cigar_cursor++;
2287 continue;
2288 }
2289 if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2290 x = x*10+ (cigar_txt[cigar_cursor]-'0');
2291 else
2292 {
2293 if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2294 {
2295 int all_MM=0;
2296 float nret = match_base_quality(array_index, read_txt + read_cursor, chromosome_cursor , (qual_txt && qual_txt[0])?qual_txt + read_cursor:NULL, x, 0, phred_version, mismatch, &all_MM, 200000,0,0);
2297 //printf ("%s: Q=%.6f; L=%d ; POS=%u\n", read_txt + read_cursor, nret, x, chromosome_cursor);
2298
2299 ret += (int)(nret*1000000);
2300 chromosome_cursor +=x;
2301 read_cursor +=x;
2302 }
2303 else if(cigar_txt[cigar_cursor] == 'I')
2304 {
2305 if(!qual_txt)
2306 {
2307 ret += x*1000000;
2308 read_cursor +=x;
2309 }
2310 else if(FASTQ_PHRED64 == phred_version)
2311 {
2312 for(i = 0; i<x; i++)
2313 {
2314 char qchar = qual_txt[read_cursor++];
2315 ret += (1000000-get_base_error_prob64i(qchar));
2316 }
2317 }
2318 else
2319 {
2320 for(i = 0; i<x; i++)
2321 {
2322 char qchar = qual_txt[read_cursor++];
2323 ret += (1000000-get_base_error_prob33i(qchar));
2324 }
2325 }
2326 }
2327 else if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2328 chromosome_cursor +=x;
2329 else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2330 chromosome_cursor -=x;
2331
2332 x= 0;
2333 }
2334 cigar_cursor++;
2335 }
2336 if(read_cursor != rl)
2337 {
2338 *mismatch = 9999;
2339 return 0.;
2340 }
2341 //printf("S=%.5f, LEN=%d\n", ret , read_cursor);
2342
2343 float match_rate = (ret/10000.) / read_cursor+100.;
2344
2345 if(refined_cigar && match_rate < 190)
2346 {
2347 char read_data[1250];
2348 chromosome_cursor = pos;
2349 read_cursor = 0;
2350 cigar_cursor = 0;
2351 x=0;
2352
2353 while(cigar_cursor < cigar_length)
2354 {
2355 if(cigar_txt[cigar_cursor] =='X')
2356 {
2357 cigar_cursor++;
2358 continue;
2359 }
2360 if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2361 x = x*10+ (cigar_txt[cigar_cursor]-'0');
2362 else
2363 {
2364 if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2365 {
2366 gvindex_get_string(read_data + read_cursor , array_index , chromosome_cursor , x , 0);
2367 chromosome_cursor +=x;
2368 read_cursor +=x;
2369 }
2370 else if(cigar_txt[cigar_cursor] == 'I')
2371 {
2372 int xk;
2373 for(xk=read_cursor;xk<read_cursor+x; xk++)
2374 read_data[cigar_cursor] = 'I';
2375 read_cursor +=x;
2376 }
2377 else if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2378 chromosome_cursor +=x;
2379 else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2380 chromosome_cursor -=x;
2381
2382 x= 0;
2383 }
2384 cigar_cursor++;
2385 }
2386
2387 int window_matched = 0;
2388 int last_confirmed_read_pos = 99999;
2389 int REFINE_WINDOW_SIZE = 10;
2390 for(read_cursor = - REFINE_WINDOW_SIZE; read_cursor < rl - REFINE_WINDOW_SIZE; read_cursor++)
2391 {
2392
2393 // printf("MMM %c =?= %c\n", read_txt[read_cursor+REFINE_WINDOW_SIZE] , read_data[read_cursor+REFINE_WINDOW_SIZE]);
2394 window_matched += (read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE] || read_txt[read_cursor+REFINE_WINDOW_SIZE] =='I');
2395 if(read_cursor>=0)
2396 {
2397 if(window_matched >= REFINE_WINDOW_SIZE -1)
2398 if(read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE])
2399 last_confirmed_read_pos = read_cursor+REFINE_WINDOW_SIZE+1;
2400 window_matched -= (read_txt[read_cursor] == read_data[read_cursor] || read_txt[read_cursor] == 'I');
2401 }
2402 }
2403
2404
2405 window_matched = 0;
2406 int first_confirmed_read_pos = 99999;
2407 for(read_cursor = rl -1 ; read_cursor >=0 ; read_cursor--)
2408 {
2409
2410 window_matched += (read_txt[read_cursor] == read_data[read_cursor]|| read_txt[read_cursor] == 'I');
2411 if(read_cursor <= rl- REFINE_WINDOW_SIZE - 1)
2412 {
2413 if(window_matched >= REFINE_WINDOW_SIZE -1)
2414 if(read_txt[read_cursor] == read_data[read_cursor])
2415 first_confirmed_read_pos = read_cursor;
2416 window_matched -= (read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE] || read_txt[read_cursor + REFINE_WINDOW_SIZE] == 'I');
2417 }
2418 }
2419
2420
2421 cigar_cursor = 0;
2422 read_cursor = 0;
2423 refined_cigar[0]=0;
2424 x=0;
2425 int begin_copy =0;
2426 chromosome_cursor = pos;
2427
2428 if(first_confirmed_read_pos<99990 && last_confirmed_read_pos<99990)
2429 {
2430 if(first_confirmed_read_pos>0)
2431 sprintf(refined_cigar,"%dS",first_confirmed_read_pos);
2432 while(cigar_cursor < cigar_length)
2433 {
2434 if(cigar_txt[cigar_cursor] =='X')
2435 {
2436 cigar_cursor++;
2437 continue;
2438 }
2439 if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2440 x = x*10+ (cigar_txt[cigar_cursor]-'0');
2441 else
2442 {
2443 if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S' || cigar_txt[cigar_cursor] == 'I')
2444 {
2445 int out_len = 0;
2446 if(read_cursor < first_confirmed_read_pos && read_cursor + x >= first_confirmed_read_pos)
2447 out_len = read_cursor + x - first_confirmed_read_pos;
2448 else if(first_confirmed_read_pos <= read_cursor)
2449 out_len = x;
2450
2451 if(out_len)
2452 {
2453 if(!begin_copy)
2454 *new_pos = chromosome_cursor - read_cursor;
2455 if(read_cursor >= last_confirmed_read_pos) out_len=0;
2456 else if(read_cursor < last_confirmed_read_pos && read_cursor +x>last_confirmed_read_pos)
2457 out_len -= (read_cursor +x - last_confirmed_read_pos);
2458 }
2459
2460 if(out_len)
2461 {
2462 sprintf(refined_cigar+strlen(refined_cigar), "%d%c", out_len, cigar_txt[cigar_cursor]);
2463 begin_copy=1;
2464 }
2465 if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2466 chromosome_cursor +=x;
2467
2468 read_cursor +=x;
2469 if(read_cursor >= last_confirmed_read_pos) break;
2470 }
2471 else
2472 {
2473 if(begin_copy)
2474 #ifdef __MINGW32__
2475 sprintf(refined_cigar+strlen(refined_cigar), "%I64d%c", x, cigar_txt[cigar_cursor]);
2476 #else
2477 sprintf(refined_cigar+strlen(refined_cigar), "%lld%c", x, cigar_txt[cigar_cursor]);
2478 #endif
2479
2480 if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2481 chromosome_cursor +=x;
2482 else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2483 chromosome_cursor -=x;
2484
2485
2486 }
2487
2488 x= 0;
2489 }
2490 cigar_cursor++;
2491 }
2492 if(last_confirmed_read_pos<=rl-1)
2493 sprintf(refined_cigar+strlen(refined_cigar),"%dS",rl-last_confirmed_read_pos);
2494 }
2495 else if(new_pos)*new_pos=pos;
2496 }
2497 else if(new_pos)*new_pos=pos;
2498
2499 return match_rate;
2500 }
2501
2502
print_votes(gene_vote_t * vote,char * index_prefix)2503 void print_votes(gene_vote_t * vote, char *index_prefix)
2504 {
2505
2506 gene_offset_t offsets;
2507 int i,j;
2508 char * chrname = NULL;
2509 int chrpos = 0;
2510 load_offsets (&offsets, index_prefix);
2511 //locate_gene_position(vote -> max_position, &offsets, &chrname, &chrpos);
2512
2513 SUBREADprintf(" ========== Max votes = %d ==========\n", vote->max_vote);// , Position is %s,%u\n", vote->max_vote, chrname, chrpos );
2514 for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2515 for(j=0; j< vote->items[i]; j++)
2516 {
2517 locate_gene_position(vote -> pos[i][j], &offsets, &chrname, &chrpos);
2518 int toli = vote->toli [i][j];
2519 int last_offset = vote -> indel_recorder[i][j][toli+2];
2520 SUBREADprintf(" %s\tVote = %d , Position is %s,%d (+%u) Coverage is (%d, %d) Indel %d %s (%d)\n", vote->votes[i][j] == vote->max_vote?"***":" ", vote->votes[i][j] , chrname, chrpos, vote -> pos[i][j], vote -> coverage_start[i][j], vote -> coverage_end[i][j],last_offset, (vote -> masks[i][j])?"NEG":"POS", vote -> masks[i][j]);
2521
2522 if(1){
2523 int k;
2524 for(k=0; k<=toli; k+=3){
2525 SUBREADprintf(" %d - %d : D=%d ", vote -> indel_recorder[i][j][k], vote -> indel_recorder[i][j][k+1], vote -> indel_recorder[i][j][k+2]);
2526 }
2527 SUBREADputs("");
2528 }
2529 }
2530
2531
2532 }
2533
2534
2535 static float PROB_QUAL_TABLE[] = {1.000000000 , 0.794328235 , 0.630957344 , 0.501187234 , 0.398107171 , 0.316227766 , 0.251188643 , 0.199526231 , 0.158489319 , 0.125892541 , 0.100000000 , 0.079432823 , 0.063095734 , 0.050118723 , 0.039810717 , 0.031622777 , 0.025118864 , 0.019952623 , 0.015848932 , 0.012589254 , 0.010000000 , 0.007943282 , 0.006309573 , 0.005011872 , 0.003981072 , 0.003162278 , 0.002511886 , 0.001995262 , 0.001584893 , 0.001258925 , 0.001000000 , 0.000794328 , 0.000630957 , 0.000501187 , 0.000398107 , 0.000316228 , 0.000251189 , 0.000199526 , 0.000158489 , 0.000125893 , 0.000100000 , 0.000079433 , 0.000063096 , 0.000050119 , 0.000039811 , 0.000031623 , 0.000025119 , 0.000019953 , 0.000015849 , 0.000012589 , 0.000010000 , 0.000007943 , 0.000006310 , 0.000005012 , 0.000003981 , 0.000003162 , 0.000002512 , 0.000001995 , 0.000001585 , 0.000001259 , 0.000001000 , 0.000000794 , 0.000000631 , 0.000000501};
2536
2537
2538
2539
get_base_error_prob33(char v)2540 float get_base_error_prob33(char v)
2541 {
2542 return PROB_QUAL_TABLE[v-'!'];
2543 }
get_base_error_prob64(char v)2544 float get_base_error_prob64(char v)
2545 {
2546 return PROB_QUAL_TABLE[v-'@'];
2547 }
2548
2549 static int PROB_QUAL_INT_TABLE[] = { 1000000 , 794328 , 630957 , 501187 , 398107 , 316227 , 251188 , 199526 , 158489 , 125892 , 100000 , 79432 , 63095 , 50118 , 39810 , 31622 , 25118 , 19952 , 15848 , 12589 , 10000 , 7943 , 6309 , 5011 , 3981 , 3162 , 2511 , 1995 , 1584 , 1258 , 1000 , 794 , 630 , 501 , 398 , 316 , 251 , 199 , 158 , 125 , 100 , 79 , 63 , 50 , 39 , 31 , 25 , 19 , 15 , 12 , 10 , 7 , 6 , 5 , 3 , 3 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , };
2550
2551
get_base_error_prob33i(char v)2552 int get_base_error_prob33i(char v)
2553 {
2554 return PROB_QUAL_INT_TABLE[v-'!'];
2555 }
get_base_error_prob64i(char v)2556 int get_base_error_prob64i(char v)
2557 {
2558 return PROB_QUAL_INT_TABLE[v-'@'];
2559 }
2560
2561 #ifdef SKIP_THIS_PART
bad_reverse_cigar(char * cigar)2562 void bad_reverse_cigar(char * cigar)
2563 {
2564 int cigar_cursor = 0;
2565 srInt_64 tmpv=0;
2566 char ncg[100];
2567 ncg[0]=0;
2568 while(cigar[cigar_cursor])
2569 {
2570 char cc = cigar[cigar_cursor];
2571 if(isdigit(cc))
2572 {
2573 tmpv=tmpv*10+(cc-'0');
2574 }
2575 else if((cc>='A'&&cc<='Z')||(cc>='a'&&cc<='z'))
2576 {
2577 char ncg2[103];
2578 #ifdef __MINGW32__
2579 sprintf(ncg2, "%I64d%c", tmpv, cc);
2580 #else
2581 sprintf(ncg2, "%lld%c", tmpv, cc);
2582 #endif
2583 strncat(ncg2, ncg, 99);
2584 strncpy(ncg, ncg2, 99);
2585 tmpv=0;
2586 }
2587 else
2588 {
2589 char ncg2[103];
2590 sprintf(ncg2, "%c%s",cc,ncg);
2591 strncpy(ncg, ncg2, 99);
2592 tmpv=0;
2593 }
2594 cigar_cursor++;
2595 }
2596 strcpy(cigar, ncg);
2597 }
2598
2599 #ifdef TEST_WHATCIGARREVERSE
main()2600 int main()
2601 #else
2602 int debug_main()
2603 #endif
2604 {
2605 char cg[100];
2606 sprintf(cg,"10M8H20M9I100N30M");
2607 bad_reverse_cigar(cg);
2608 SUBREADprintf("%s\n",cg);
2609 return 0;
2610 }
2611 #endif
2612
remove_indel_neighbours(HashTable * indel_table)2613 void remove_indel_neighbours(HashTable * indel_table)
2614 {
2615
2616 unsigned int * to_delete;
2617 char * to_delete_len;
2618 int xi;
2619 KeyValuePair * cursor;
2620 return;
2621
2622 to_delete=(unsigned int* )malloc(sizeof(int)*400000);
2623 to_delete_len=(char* )malloc(sizeof(char)*400000);
2624 int num_delete=0, bucket;
2625
2626 for(bucket=0; bucket<indel_table -> numOfBuckets; bucket++)
2627 {
2628 cursor = indel_table -> bucketArray[bucket];
2629 while (1)
2630 {
2631 if(!cursor) break;
2632 indel_record_t * p = (indel_record_t *) cursor -> key;
2633 indel_result_t * pv = (indel_result_t *)cursor -> value;
2634
2635 indel_record_t p_nb;
2636 p_nb.len = p->len;
2637
2638 for(xi=-5;xi<=5;xi++)
2639 {
2640 if(!xi)continue;
2641
2642 p_nb.pos = p->pos+xi;
2643 indel_result_t * p_nbv = (indel_result_t *)HashTableGet(indel_table, &p_nb);
2644 if(p_nbv && (p_nbv->support > pv->support || (p_nbv->support == pv->support && xi>0)))
2645 {
2646 if(num_delete<399999)
2647 {
2648 to_delete_len[num_delete]=p->len;
2649 to_delete[num_delete++]= p->pos+xi;
2650 }
2651 break;
2652 }
2653 }
2654
2655 cursor = cursor->next;
2656 }
2657 }
2658
2659
2660 for(xi=0;xi<num_delete;xi++)
2661 {
2662 indel_record_t p;
2663 p.len = to_delete_len[xi];
2664 p.pos = to_delete[xi];
2665 HashTableRemove(indel_table, &p);
2666 }
2667
2668
2669 SUBREADprintf("\n %d low-confidence indels have been removed.", num_delete);
2670 free(to_delete);
2671 free(to_delete_len);
2672
2673 }
2674
print_version_info()2675 void print_version_info()
2676 {
2677 SUBREADprintf("\nSubread %s\n", SUBREAD_VERSION);
2678 SUBREADprintf("\n");
2679 }
2680
fc_strcmp_chro(const void * s1,const void * s2)2681 int fc_strcmp_chro(const void * s1, const void * s2)
2682 {
2683
2684 // // we can compare the 3-th and 4-th bytes because we know that the buffers have enough lengths.
2685 // if(((char *)s1)[4] != ((char *)s2)[4] || ((char *)s1)[3] != ((char *)s2)[3] )
2686 // return 1;
2687 return strcmp((char*)s1, (char*)s2);
2688 }
2689
2690
2691
fc_chro_hash(const void * key)2692 srUInt_64 fc_chro_hash(const void *key) {
2693 const unsigned char *str = (const unsigned char *) key;
2694
2695 int xk1;
2696 unsigned long hashValue = 0;
2697
2698 for(xk1=0; str[xk1]; xk1++)
2699 {
2700 unsigned long ch = str[xk1];
2701 hashValue += (ch + xk1) << (ch & 0xf);
2702 }
2703
2704
2705 return hashValue;
2706 }
2707
2708
2709