1 /***************************************************************
2 
3    The Subread software package is free software package:
4    you can redistribute it and/or modify it under the terms
5    of the GNU General Public License as published by the
6    Free Software Foundation, either version 3 of the License,
7    or (at your option) any later version.
8 
9    Subread is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty
11    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13    See the GNU General Public License for more details.
14 
15    Authors: Drs Yang Liao and Wei Shi
16 
17   ***************************************************************/
18 
19 
20 #define _GNU_SOURCE
21 #include <stdio.h>
22 #include <math.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <time.h>
29 #include <sys/stat.h>
30 #include <sys/types.h>
31 
32 #ifndef __FreeBSD__
33 #include <sys/timeb.h>
34 #endif
35 
36 #include "subread.h"
37 #include "input-files.h"
38 #include "gene-algorithms.h"
39 #include "sorted-hashtable.h"
40 
non_func(const char * fmt,...)41 void non_func(const char * fmt, ...)
42 {
43 }
44 
45 
subread_lock_release(subread_lock_t * lock)46 void subread_lock_release(subread_lock_t * lock)
47 {
48 	#ifdef MACOS
49 	pthread_mutex_unlock(lock);
50 	#else
51 	pthread_spin_unlock(lock);
52 	#endif
53 }
subread_lock_occupy(subread_lock_t * lock)54 void subread_lock_occupy(subread_lock_t * lock)
55 {
56 
57 	#ifdef MACOS
58 	pthread_mutex_lock(lock);
59 	#else
60 	pthread_spin_lock(lock);
61 	#endif
62 }
63 
subread_destroy_lock(subread_lock_t * lock)64 void subread_destroy_lock(subread_lock_t * lock) {
65 	#ifdef MACOS
66 	pthread_mutex_destroy(lock);
67 	#else
68 	pthread_spin_destroy(lock);
69 	#endif
70 }
71 
subread_init_lock(subread_lock_t * lock)72 void subread_init_lock(subread_lock_t * lock)
73 {
74 	#ifdef MACOS
75 	pthread_mutex_init(lock, NULL);
76 	#else
77 	pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE);
78 	#endif
79 
80 }
81 
82 //int EXON_MAX_CIGAR_LEN = 26;
83 
84 
85 //#define indel_debug
86 
87 #ifdef indel_debug
88 #define ddprintf SUBREADprintf
89 #define ddfflush fflush
90 #else
91 #define ddprintf non_func
92 #define ddfflush(a)
93 #endif
94 
95 #define get_base_quality_score(quality_chr, quality_scale)  ((quality_scale)==QUALITY_SCALE_NONE?-0.01: ((quality_scale) == QUALITY_SCALE_LINEAR?(	(quality_chr) - '@')*0.01-0.03: correct_rate_table[(quality_chr) - '@']))
96 
97 
98 double begin_ftime;
99 
get_next_char(FILE * fp)100 unsigned char get_next_char(FILE * fp)
101 {
102 	int find_br = 0;
103 	while (!feof(fp))
104 	{
105 		char nch;
106 		nch = fgetc(fp);
107 		if (find_br)
108 		{
109 			if (nch == '\n')
110 				find_br=0;
111 		}
112 		else if (nch == '>')
113 			find_br = 1;
114 		else if (nch > 32)
115 			return nch;
116 	}
117 	return 0;
118 }
119 
is_valid_subread(const char * read_str)120 int is_valid_subread(const char * read_str)
121 {
122 	return 1;
123 	int i;
124 	for (i=0; i<16; i++)
125 		if(read_str[i] == 'N' || read_str[i] == '.')
126 			return 0;
127 	return 1;
128 }
129 
read_quality_score(char * qualityb,int rl,int format)130 float read_quality_score(char * qualityb, int rl , int format)
131 {
132 	int i;
133 	int qual = 0;
134 	int testlen = 0;
135 
136 	char base;
137 
138 	if(format==FASTQ_PHRED64)
139 		base = 'B';
140 	else	base = '#';
141 
142 	for(i=0; i<rl; i++)
143 	{
144 		int testv = qualityb[i] - base;
145 		if(testv > 1)
146 		{
147 			qual += testv;
148 			testlen ++;
149 		}
150 	}
151 	return qual*1./testlen;
152 }
153 
min_matched_bases(char * qualityb,int rl,int format,float match_score)154 int min_matched_bases(char * qualityb, int rl , int format, float match_score)
155 {
156 	if(!(qualityb) || !qualityb[0]) return 0;
157 	int i;
158 
159 	char base;
160 	int ret = 0;
161 
162 	if(format==FASTQ_PHRED64)
163 		base = 'B';
164 	else	base = '#';
165 
166 	for(i=0; i<rl; i++)
167 		ret += (qualityb[i] - base)<=5;
168 
169 	return (rl - ret*3/4)*match_score;
170 }
bad_quality_base_number(char * qualityb,int rl,int format)171 int bad_quality_base_number(char * qualityb, int rl , int format)
172 {
173 	if(!(qualityb) || !qualityb[0]) return 0;
174 	int ret = 0, i;
175 	if(FASTQ_PHRED64==format)
176 	{
177 		for(i=0; i<rl; i++)
178 			if(qualityb[i] <='F') ret ++;
179 	}
180 	else
181 		for(i=0; i<rl; i++)
182 			if(qualityb[i] <='#'+4) ret ++;
183 
184 
185 	return ret;
186 }
187 
188 
189 double correct_rate_table [] ={ -1.58147375341 , -0.99684304401 , -0.69552447133 , -0.50767587370 , -0.38013040807 , -0.28926818720 , -0.22255151597 , -0.17255657291 , -0.13455196029 , -0.10536051566 , -0.08276530267 , -0.06517417320 , -0.05141827416 , -0.04062484422 , -0.03213357402 , -0.02543972753 , -0.02015436476 , -0.01597586925 , -0.01266917021 , -0.01005033585 , -0.00797499828 , -0.00632956293 , -0.00502447389 , -0.00398901727 , -0.00316728823 , -0.00251504651 , -0.00199725550 , -0.00158615046 , -0.00125971852 , -0.00100050033 , -0.00079464388 , -0.00063115648 , -0.00050131287 , -0.00039818644 , -0.00031627778 , -0.00025122020 , -0.00019954614 , -0.00015850188 , -0.00012590047 , -0.00010000500 , -0.00007943598 , -0.00006309773 , -0.00005011998 , -0.00003981151 , -0.00003162328 , -0.00002511918 , -0.00001995282 , -0.00001584906 , -0.00001258933 , -0.00001000005 , -0.00000794331 , -0.00000630959 , -0.00000501188 , -0.00000398108 , -0.00000316228 , -0.00000251189 , -0.00000199526 , -0.00000158489 , -0.00000125893 , -0.00000100000 , -0.00000079433 , -0.00000063096 , -0.00000050119 , -0.00000039811 , -0.00000031623 , -0.00000025119 , -0.00000019953 , -0.00000015849 , -0.00000012589 , -0.00000010000 , -0.00000007943 , -0.00000006310 , -0.00000005012 , -0.00000003981 , -0.00000003162 , -0.00000002512 , -0.00000001995 , -0.00000001585 , -0.00000001259 , -0.00000001000 , 0., 0.,0.,0.,0.,0.,0.,0.};
190 
191 int reported_version_error = 0;
get_subread_quality(const char * quality_str,const char * read_str,int quality_scale,int phred_version)192 gene_quality_score_t get_subread_quality(const char * quality_str, const char * read_str, int quality_scale, int phred_version)
193 {
194 	int ret =0;
195 	int i;
196 /*
197 	for (i=0; i<16; i++)
198 		if(read_str[i] == 'N' || read_str[i] == '.')
199 			return -22.;
200 */
201 
202 	//for(i=0;i<16;i++) ret += get_base_quality_score(quality_str[i] , quality_scale);
203 	if(FASTQ_PHRED64 == phred_version)
204 		for(i=0;i<16;i++) ret += (1000000 - get_base_error_prob64i(quality_str[i]));
205 	else
206 		for(i=0;i<16;i++) ret += (1000000 - get_base_error_prob33i(quality_str[i]));
207 
208 	/*
209 	if (ret <0 && !reported_version_error)
210 	{
211 		SUBREADprintf("\nWARNING: negative Phred quality score! Please verify the version of the Phred scores.\n");
212 		reported_version_error=1;
213 	}*/
214 
215 	return ret/16000000.;
216 }
217 
218 /*int get_base_phred(char quality_chr)
219 {
220 	return quality_chr - '@';
221 }
222 
223 void print_running_log(double finished_rate, double read_per_second, double expected_seconds, unsigned long long int total_reads, int is_pair)
224 {
225 	char outbuff[99]; int i;
226 	snprintf(outbuff, 98,"completed=%0.2f%%; time used=%.1fs; rate=%.1fk reads/s; time left=%.1fs; total=%lluk %s", finished_rate*100, miltime()-begin_ftime, read_per_second/1000 ,expected_seconds, total_reads/1000, is_pair?"pairs":"reads");
227 	fputs(outbuff, stdout);
228 	for(i=strlen(outbuff); i<105; i++)
229 		SUBREADprintf(" ");
230 	SUBREADprintf("%c",CORE_SOFT_BR_CHAR);
231 }
232 
233 */
234 
print_window_scrolling_bar(char * hint,float percentage,int width,int * internal_counter)235 void print_window_scrolling_bar(char * hint, float percentage, int width, int * internal_counter)
236 {
237 	/*
238 	char fan = '-';
239 	int bar_width = width - 7 - strlen(hint) , i;
240 	int dash_width = (int)(bar_width * percentage+0.5);
241 	dash_width = min(dash_width, bar_width - 1);
242 	int space_width = bar_width - 1 - dash_width;
243 
244 	char out_buf[100];
245 
246 	switch ((*internal_counter) % 4)
247 	{
248 		case 0:
249 			fan='-';
250 			break;
251 		case 1:
252 			fan='\\';
253 			break;
254 		case 2:
255 			fan='|';
256 			break;
257 		case 3:
258 			fan='/';
259 			break;
260 	}
261 
262 	(*internal_counter) ++;
263 
264 	sprintf (out_buf," %c %s [", fan, hint);
265 	for(i = 0; i< dash_width; i++)
266 		strcat(out_buf,"=");
267 	strcat(out_buf,">");
268 	for(i = 0; i< space_width; i++)
269 		strcat(out_buf," ");
270 	strcat(out_buf,"]");
271 
272 	print_in_box(80,0,0,"%s%c",out_buf,CORE_SOFT_BR_CHAR);
273 
274 	*/
275 	print_in_box(81,0,0," [ %.1f%%%% finished ]    %c",percentage*100, CORE_SOFT_BR_CHAR);
276 }
277 
278 
279 
280 
print_text_scrolling_bar(char * hint,float percentage,int width,int * internal_counter)281 void print_text_scrolling_bar(char * hint, float percentage, int width, int * internal_counter)
282 {
283 	char fan = '-';
284 	int bar_width = width - 7 - strlen(hint) , i;
285 	int dash_width = (int)(bar_width * percentage+0.5);
286 	dash_width = min(dash_width, bar_width - 1);
287 	int space_width = bar_width - 1 - dash_width;
288 
289 	/*
290 	for (i=0; i<width; i++);
291 		putchar(' ');
292 	SUBREADprintf("\r");
293 	*/
294 
295 	switch ((*internal_counter) % 4)
296 	{
297 		case 0:
298 			fan='-';
299 			break;
300 		case 1:
301 			fan='\\';
302 			break;
303 		case 2:
304 			fan='|';
305 			break;
306 		case 3:
307 			fan='/';
308 			break;
309 	}
310 
311 	(*internal_counter) ++;
312 
313 
314 	char lbuf[100];
315 	sprintf (lbuf," %c %s [", fan, hint);
316 	for(i = 0; i< dash_width; i++)
317 		strcat(lbuf, "=");
318 	strcat(lbuf, ">");
319 	for(i = 0; i< space_width; i++)
320 		strcat(lbuf, " ");
321 	strcat(lbuf, "]");
322 	SUBREADprintf("%s%c",lbuf,CORE_SOFT_BR_CHAR);
323 
324 }
325 
326 
add_repeated_numbers(int qid,gene_vote_t * vote,unsigned char * repeated_regions)327 void add_repeated_numbers(int qid, gene_vote_t * vote, unsigned char * repeated_regions)
328 {
329 
330 	int i,j;
331 
332 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
333 		for(j=0; j< vote->items[i]; j++)
334 		{
335 			int v_2 ;
336 			if(vote-> votes[i][j] >=2)
337 			{
338 				v_2 = vote-> votes[i][j] - 2;
339 				if(repeated_regions[v_2 + qid  * 16] < 255) repeated_regions[v_2 + qid  * 16]++;
340 			}
341 
342 		}
343 }
344 
remove_repeated_reads(gehash_t * table,gehash_t * huge_table,int index_threshold)345 int remove_repeated_reads(gehash_t * table, gehash_t * huge_table, int index_threshold)
346 {
347 	int i;
348 	int vals[200000];
349 	int val_len[200000];
350 	int scroll_count = 0;
351 	unsigned int all_removed = 0;
352 
353 
354 	for (i=0; i <table->buckets_number ; i++)
355 	{
356 		struct gehash_bucket * cb = table->buckets + i;
357 		int j;
358 		int val_c = 0;
359 
360 		if(i % 300 == 0)
361 			print_text_scrolling_bar ("Removing non-informative subreads", 1.0*i/table->buckets_number, 80, &scroll_count);
362 
363 		for(j=0; j<cb->current_items; j++)
364 		{
365 			int k, found = 0;
366 			for(k=0;k<val_c;k++)
367 			{
368 				if (vals[k]==cb->item_keys[j])
369 				{
370 					val_len[k]++;
371 					found = 1;
372 					break;
373 				}
374 			}
375 
376 			if(!found)
377 			{
378 				if(val_c>199999)
379 				{
380 					SUBREADprintf("\nThere are too many items in a bucket; you may reduce the threshold of non-informative subreads to eliminate this problem.\n");
381 					#ifdef MAKE_STANDALONE
382 					exit(-1);
383 					#else
384 					return -1;
385 					#endif
386 				}
387 				else
388 				{
389 					vals[val_c] = cb->item_keys[j];
390 					val_len[val_c] = 1;
391 					val_c++;
392 				}
393 			}
394 		}
395 
396 //			SUBREADprintf ("F3\n");
397 
398 		for(j=0; j<val_c; j++)
399 		{
400 
401 			if (gehash_exist(huge_table, vals[j]))
402 				 gehash_remove(table, vals[j]);
403 			else if(val_len[j]> index_threshold)
404 			{
405 				gehash_remove(table, vals[j]);
406 				gehash_insert(huge_table, vals[j], 1, NULL);
407 				all_removed += val_len[j];
408 			}
409 		}
410 	}
411 
412 	if(IS_DEBUG)
413 		SUBREADprintf ("@LOG There are %u subreads removed from the index.\n", all_removed);
414 
415 	return all_removed;
416 }
417 
418 
linear_gene_position(const gene_offset_t * offsets,char * chro_name,unsigned int chro_pos)419 unsigned int linear_gene_position(const gene_offset_t* offsets , char *chro_name, unsigned int chro_pos)
420 {
421 	unsigned int ret = 0 ;
422 	int n;
423 
424 
425 	n = HashTableGet(offsets -> read_name_to_index, chro_name)-NULL;
426 	//printf("09NOV: GET '%s' = %d\n", chro_name, n);
427 	if(n<1) return 0xffffffff;
428 	if(n>1)
429 		ret = offsets->read_offsets[n-2];
430 	ret += offsets -> padding ;
431 
432 	return ret + chro_pos;
433 }
434 
get_gene_linear(int chrono,int offset,const unsigned int offsets[])435 unsigned int get_gene_linear(int chrono, int offset, const unsigned int offsets [])
436 {
437 	if (chrono>1)return offsets[chrono-1]+offset;
438 	return offset;
439 }
440 
locate_gene_position_max(unsigned int linear,const gene_offset_t * offsets,char ** chro_name,int * pos,int * head_cut_length,int * tail_cut_length,int rl)441 int locate_gene_position_max(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos, int * head_cut_length, int * tail_cut_length, int rl)
442 {
443 	int n = 0;
444 
445 	(*chro_name) = NULL;
446 	(*pos) = -1;
447 	int low_idx=0, high_idx= offsets->total_offsets;
448 	while(1){
449 		if(high_idx <= low_idx+1) {
450 			n = max(low_idx-2,0);
451 			break;
452 		}
453 		int mid_idx = (low_idx+high_idx)/2;
454 		unsigned int mid_val = offsets->read_offsets[mid_idx];
455 		if(mid_val > linear) high_idx = mid_idx;
456 		else low_idx = mid_idx+1;
457 	}
458 
459 	for (; n<offsets->total_offsets; n++)
460 	{
461 		if (offsets->read_offsets[n] > linear)
462 		{
463 
464 			//#warning "======== COMMENT THIS LINE !! ========"
465 			//SUBREADprintf("max=%u <= lim=%u : ACCEPTED.\n", rl + linear , offsets->read_offsets[n] + 16);
466 
467 			// the end of the read should not excess the end of the chromosome
468 
469 			if (n==0)
470 				*pos = linear;
471 			else
472 				*pos = linear - offsets->read_offsets[n-1];
473 
474 
475 			if(tail_cut_length == NULL){
476 				if(rl + linear > offsets->read_offsets[n] + 15 - offsets -> padding){
477 					return 1;
478 				}
479 			} else {
480 				unsigned int posn1 = 0;
481 				if(n>0) posn1 = offsets->read_offsets[n-1];
482 				long long int tct =  ( linear + rl - posn1 - offsets -> padding );
483 				if(tct < rl)tct = rl;
484 				long long int chro_leng = (offsets->read_offsets[n] - posn1 - 2*offsets -> padding + 16);
485 
486 				//SUBREADprintf("CHRO_LEN : %lld, READ_TAIL %lld , RL=%d\n", chro_leng, tct, rl);
487 				tct -= chro_leng;
488 				if( tct >= rl ){
489 					return 1;
490 				}
491 				if( tct <0    )tct=0;
492 				(*tail_cut_length) = tct;
493 			}
494 
495 			if( (*pos) < offsets -> padding ) {
496 				if(head_cut_length == NULL || (*pos) + rl <= offsets -> padding){
497 					return 1;
498 				}else{
499 					(*head_cut_length) = offsets -> padding - (*pos);
500 					(*pos) = offsets -> padding;
501 				}
502 			}
503 
504 			(*pos) -= offsets -> padding;
505 			*chro_name = (char *)offsets->read_names+n*MAX_CHROMOSOME_NAME_LEN;
506 
507 			return 0;
508 		}
509 	}
510 	return -1;
511 }
512 
513 
locate_gene_position(unsigned int linear,const gene_offset_t * offsets,char ** chro_name,int * pos)514 int locate_gene_position(unsigned int linear, const gene_offset_t* offsets , char ** chro_name, int * pos)
515 {
516 	return locate_gene_position_max(linear, offsets, chro_name, pos, NULL, NULL, 0);
517 }
518 
519 #define _index_vote(key) (((unsigned int)key)%GENE_VOTE_TABLE_SIZE)
520 
521 #ifdef USE_SLOW_HASHTABLE_INDEX
522 int vv=0;
add_gene_vote(gene_vote_t * vote,int key,int add_new)523 void add_gene_vote(gene_vote_t* vote, int key , int add_new) {
524 	add_gene_vote_weighted(vote, key, add_new, 1);
525 }
526 
add_gene_vote_weighted(gene_vote_t * vote,int key,int add_new,int w)527 void add_gene_vote_weighted(gene_vote_t* vote, int key , int add_new, int w) {
528 	int offset = _index_vote(key);
529 	int datalen = vote -> items[offset];
530 	unsigned int * dat = vote -> pos[offset];
531 	int i;
532 
533 	for (i=0;i<datalen;i++)
534 	{
535 		if (dat[i] == key)
536 		{
537 
538 			int test_max = (vote->votes[offset][i]);
539 			test_max += w;
540 			vote->votes[offset][i] = test_max;
541 
542 			if(test_max > vote->max_vote){
543 				vote->max_vote = test_max;
544 				vote->max_position = key;
545 			}
546 			return;
547 		}
548 	}
549 
550 	if(!add_new || datalen >=GENE_VOTE_SPACE)
551 		return;
552 
553 //	w += 16;
554 
555 	vote -> items[offset] ++;
556 	dat[i] = key;
557 	vote->votes[offset][i]=w;
558 }
559 
560 #endif
561 
max_gene_vote(gene_vote_t * vote,int * position_result,int query)562 int max_gene_vote(gene_vote_t* vote, int * position_result, int query)
563 {
564 	int n, i, max_index=0;
565 	int max_votes = -1;
566 	for (n=0; n<GENE_VOTE_TABLE_SIZE; n++)
567 	{
568 		int itms = vote->items[n];
569 		gene_vote_number_t * vots = vote->votes[n];
570 		for (i=0; i<itms; i++)
571 		{
572 
573 			if (vots[i] > max_votes)
574 			{
575 				max_index = n<<16|i;
576 				max_votes = vots[i];
577 			}
578 		}
579 	}
580 
581 	if(max_votes == -1)
582 	{
583 		*position_result = -1;
584 		return 0;
585 	}
586 	else
587 	{
588 		*position_result = vote->pos[max_index>>16][max_index&0xffff];
589 		return max_votes;
590 	}
591 }
592 
593 
clear_allvote(gene_allvote_t * allvote)594 void clear_allvote(gene_allvote_t* allvote)
595 {
596 	memset(allvote -> max_votes,0,  allvote -> max_len* sizeof(*allvote -> max_votes));
597 	memset(allvote -> masks,0,  allvote -> max_len* sizeof(*allvote -> masks));
598 	memset(allvote -> span_coverage ,0,  allvote -> max_len* sizeof(char));
599 }
600 
destory_allvote(gene_allvote_t * allvote)601 void destory_allvote(gene_allvote_t* allvote)
602 {
603 	free(allvote -> max_positions);
604 	free(allvote -> max_votes);
605 	free(allvote -> max_quality);
606 	free(allvote -> max_final_quality);
607 	free(allvote -> masks);
608 	#ifdef REPORT_ALL_THE_BEST
609 	free(allvote -> best_records);
610 	#endif
611 	free(allvote -> is_counterpart);
612 	free(allvote -> span_coverage);
613 	if(allvote -> max_indel_recorder)
614 		free(allvote -> max_indel_recorder);
615 }
616 
init_allvote(gene_allvote_t * allvote,int expected_len,int allowed_indels)617 int init_allvote(gene_allvote_t* allvote, int expected_len, int allowed_indels)
618 {
619 	int is_OK = 0;
620 	allvote -> max_len = expected_len;
621 	allvote -> max_positions = (unsigned int *) malloc(sizeof(int)*expected_len);
622 	allvote -> max_votes = (gene_vote_number_t *) calloc(sizeof(gene_vote_number_t), expected_len);
623 	allvote -> max_quality = (gene_quality_score_t *) calloc(sizeof(gene_quality_score_t), expected_len);
624 	allvote -> max_final_quality = (gene_quality_score_t *) calloc(sizeof(gene_quality_score_t), expected_len);
625 	allvote -> masks = (short *) calloc(sizeof(short), expected_len);
626 #ifdef REPORT_ALL_THE_BEST
627 	allvote -> best_records = (gene_best_record_t *) malloc(sizeof(gene_best_record_t)* expected_len);
628 #endif
629 	allvote -> is_counterpart = (unsigned char *) malloc(expected_len);
630 
631 	allvote -> max_indel_tolerance = allowed_indels;
632 	allvote	-> indel_recorder_length = max(3*(allowed_indels+1)+1, 28);
633 	allvote -> span_coverage = (char *)calloc(sizeof(char), expected_len);
634 	allvote -> repeated_regions = (unsigned char *)calloc(sizeof(unsigned char), 16 * expected_len);
635 
636 	if((allvote -> max_quality &&  allvote -> max_positions  && allvote -> max_votes  && allvote -> max_final_quality && allvote -> masks && allvote -> is_counterpart && allvote -> span_coverage))
637 		is_OK = 1;
638 
639 	if(allowed_indels && is_OK)
640 	{
641 		allvote -> max_indel_recorder = (char *)malloc( allvote -> indel_recorder_length *expected_len);
642 		is_OK = (allvote -> max_indel_recorder!=NULL);
643 	}
644 	else	allvote -> max_indel_recorder =  NULL;
645 
646 	if(is_OK)
647 		return 0;
648 	else
649 	{
650 		SUBREADputs(MESSAGE_OUT_OF_MEMORY);
651 		return 1;
652 	}
653 }
654 
655 
656 // rl_adjust >0: "D" in read; rl_adjust<0: "I" in read
compress_cigar(char * cigar,int total_length,char * read,int * pos_offset,int * rl_adjust)657 void compress_cigar(char *cigar, int total_length, char * read, int * pos_offset, int *rl_adjust)
658 {
659 	char tmp[200];
660 	char cigar_piece [10];
661 
662 	int cigar_len = strlen(cigar);
663 	cigar_len = min(cigar_len, EXON_MAX_CIGAR_LEN);
664 	cigar[cigar_len]=0;
665 
666 	int i;
667 	srInt_64 tmpv = 0;
668 
669 	char last_operation = 'X';
670 	srInt_64 last_tmpv = 0;
671 	int is_first_M = 1;
672 	int delta_i = 0;
673 	int delta_d = 0;
674 	int delta_rl = 0;
675 	tmp[0]=0;
676 	int cigar_length=0;
677 
678 	for(i=0; i < cigar_len; i++)
679 	{
680 		char cc = cigar[i];
681 		if(isdigit(cc))
682 		{
683 			tmpv=tmpv*10+(cc-'0');
684 		}
685 		else if(cc=='-')
686 		{
687 			last_tmpv = 0;
688 			cigar_length = 0;
689 			break;
690 		}
691 		else
692 		{
693 			if(is_first_M && pos_offset && cc=='S')
694 				*pos_offset = tmpv;
695 			else if(cc=='M') is_first_M = 0;
696 
697 			if((cc!=last_operation) && (last_operation!='X'))
698 			{
699 				if(last_operation == 'M' || last_operation == 'S' || last_operation == 'N' ||  last_operation == 'B' || last_operation == 'J' || last_operation == 'j' || last_operation == 'b')
700 				{
701 					if(delta_i)
702 					{
703 						sprintf(cigar_piece,"%dI", delta_i);
704 						strcat(tmp, cigar_piece);
705 					}
706 					delta_i = 0;
707 
708 					if(delta_d)
709 					{
710 						sprintf(cigar_piece,"%dD", delta_d);
711 						strcat(tmp, cigar_piece);
712 					}
713 					delta_d = 0;
714 
715 					#ifdef __MINGW32__
716 					sprintf(cigar_piece,"%I64d%c", last_tmpv, last_operation);
717 					#else
718 					sprintf(cigar_piece,"%lld%c", last_tmpv, last_operation);
719 					#endif
720 					strcat(tmp, cigar_piece);
721 				}
722 
723 				if(last_operation == 'M' || last_operation == 'S' || last_operation == 'I')
724 					cigar_length += last_tmpv;
725 
726 				last_tmpv = 0;
727 			}
728 			if(cc=='I' )
729 			{
730 				delta_i += tmpv;
731 				delta_rl -= tmpv;
732 			}
733 			if(cc=='D' )
734 			{
735 				delta_d += tmpv;
736 				delta_rl += tmpv;
737 			}
738 
739 
740 			last_tmpv += tmpv;
741 
742 			tmpv = 0;
743 			last_operation = cc;
744 
745 		}
746 
747 	}
748 
749 	if(last_tmpv)
750 	{
751 		if(delta_i)
752 		{
753 			sprintf(cigar_piece,"%dI", delta_i);
754 			strcat(tmp, cigar_piece);
755 		}
756 		if(delta_d)
757 		{
758 			sprintf(cigar_piece,"%dD", delta_d);
759 			strcat(tmp, cigar_piece);
760 		}
761 
762 
763 
764 		if(last_operation =='M' || last_operation =='S')
765 		{
766 			#ifdef __MINGW32__
767 			sprintf(cigar_piece,"%I64d%c", tmpv+last_tmpv, last_operation);
768 			#else
769 			sprintf(cigar_piece,"%lld%c", tmpv+last_tmpv, last_operation);
770 			#endif
771 			strcat(tmp, cigar_piece);
772 		}
773 
774 		if(last_operation == 'M' || last_operation == 'S' || last_operation == 'I')
775 			cigar_length += tmpv+last_tmpv;
776 	}
777 	if(cigar_length == total_length)
778 	{
779 		if(rl_adjust)(*rl_adjust)=delta_rl;
780 		strcpy(cigar, tmp);
781 	}
782 	else
783 	{
784 		sprintf(cigar, "%dM", total_length);
785 	}
786 }
787 
788 // adjust_len <0: "I" in read; adjust_len>0: "D" in read
show_cigar(char * info,int len,int is_reversed_map,char * buf,int indel_tolerance,int total_subreads,char * read,int * pos_offset,int * adjust_len)789 void show_cigar(char * info, int len, int is_reversed_map, char * buf, int indel_tolerance, int total_subreads, char *read, int * pos_offset, int * adjust_len)
790 {
791 	int i;
792 	int last_offset = 0, cursor = 0;
793 
794 	if(info[0]==-1){
795 		sprintf(buf, "%dM", len);
796 		return;
797 	}
798 	else if(info[0]==-2){
799 		if (strchr( info+1, '-'))
800 			sprintf(buf, "%dM", len);
801 		else
802 		{
803 			strncpy(buf, info+1, 98);
804 			compress_cigar(buf, len, read, pos_offset, adjust_len);
805 		}
806 		return;
807 	}
808 	else if(info[0]==-3){
809 		info ++;
810 	}
811 
812 	for(i=0; i<indel_tolerance*3; i+=3)
813 	{
814 		if (!info[i])break;
815 		int dist = info[i+2];
816 		//int subread_start = info[i]-1;
817 		int subread_end = info[i+1]-1;
818 
819 		int base_end = (i < indel_tolerance*3-3 && info[i+3])?find_subread_end(len, total_subreads, subread_end):len;
820 
821 //			SUBREADprintf("BE=%d ; II+3=%d ; len=%d\n", base_end, info[i+3], len);
822 
823 		int offset = last_offset-dist;
824 		if (base_end - cursor - (offset>0?offset:0) <0)
825 		{
826 			buf[0]=0;
827 
828 			cursor = 0;
829 			break;
830 		}
831 		if (i >0)
832 		{
833 			sprintf(buf+strlen(buf), "%d%c%dM", abs(offset), offset>0?'I':'D', base_end - cursor - (offset>0?offset:0));
834 			if(adjust_len)
835 				(*adjust_len) -= offset;
836 		}
837 		else
838 			sprintf(buf+strlen(buf), "%dM", base_end);
839 		last_offset = dist;
840 		cursor = base_end;
841 	}
842 	compress_cigar(buf, len, read, pos_offset, adjust_len);
843 }
844 
add_allvote_q(gene_allvote_t * allvote,int qid,int pos,gene_vote_number_t votes,gene_quality_score_t quality,int is_counterpart,short mask,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int space_type,int report_junction,int is_head_high_quality,char * qual_txt,int phred_version,char span_coverage,short ** dynamic_programming_short,char ** dynamic_programming_char)845 void add_allvote_q(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, short mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type, int report_junction, int is_head_high_quality, char * qual_txt, int phred_version, char span_coverage, short **dynamic_programming_short ,  char ** dynamic_programming_char)
846 {
847 
848 //	SUBREADprintf("Ospan=%d Nspan=%d\n", allvote -> span_coverage[qid], span_coverage);
849 
850 }
851 
852 
853 float CORE_RECOVER_MATCHING_RATE = 0.9;
854 float CORE_INDEL_MATCHING_RATE_TAIL = 0.8;
855 float CORE_INDEL_MATCHING_RATE_HEAD = 0.8;
856 
find_and_explain_indel(gene_allvote_t * allvote,int qid,int pos,gene_vote_number_t votes,gene_quality_score_t quality,int is_counterpart,char mask,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int space_type,int report_junction,int is_head_high_quality,char * qual_txt,int phred_version,short ** dynamic_programming_short,char ** dynamic_programming_char)857 void find_and_explain_indel(gene_allvote_t* allvote,int qid , int pos, gene_vote_number_t votes, gene_quality_score_t quality, int is_counterpart, char mask, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int space_type, int report_junction,  int is_head_high_quality, char * qual_txt, int phred_version , short ** dynamic_programming_short,  char **dynamic_programming_char)
858 {
859 	//unsigned int pos0 = pos;
860 	if(allvote -> max_indel_recorder)
861 	{
862 		//PNT111
863 		short head_indel_pos=-1 , tail_indel_pos=-1;
864 		int head_indel_movement=0, tail_indel_movement=0;
865 		if (array_index)
866 		{
867 			int k;
868 			if (max_indel_recorder[0])
869 			{
870 				int cover_start = find_subread_end(read_len, total_subreads, max_indel_recorder[0]-1)- 15;
871 
872 				for (k=0; max_indel_recorder[k]; k+=3);
873 				int cover_end = find_subread_end(read_len, total_subreads, max_indel_recorder[k-2]-1) + max(0,-max_indel_recorder[k-1]);
874 
875 				int head_qual = is_head_high_quality?0x7fffffff:0, tail_qual = is_head_high_quality?0:0x7fffffff;
876 
877 
878 				if(qual_txt && qual_txt[0])
879 				{
880 					int j;
881 					head_qual = 0;
882 					tail_qual = 0;
883 					for (j = 0; j<cover_start; j++)
884 					{
885 						if(FASTQ_PHRED64 == phred_version)
886 						{
887 							head_qual += (1000000-get_base_error_prob64i(qual_txt[j]));
888 						}
889 						else
890 						{
891 							head_qual += (1000000-get_base_error_prob33i(qual_txt[j]));
892 						}
893 					}
894 					for (j = 0; j<read_len - cover_end; j++)
895 					{
896 						if(FASTQ_PHRED64 == phred_version)
897 						{
898 							tail_qual += (1000000-get_base_error_prob64i(qual_txt[read_len - j -1]));
899 						}
900 						else
901 						{
902 							tail_qual += (1000000-get_base_error_prob33i(qual_txt[read_len - j -1]));
903 						}
904 					}
905 				}
906 
907 				int head_must_correct = 4;
908 				if(cover_start > 0)
909 				{
910 					if (head_qual / cover_start < 850000)
911 					{
912 						CORE_INDEL_MATCHING_RATE_HEAD = 0.75;
913 						head_must_correct =2;
914 					}
915 					else if (head_qual / cover_start < 950000 )
916 					{
917 						CORE_INDEL_MATCHING_RATE_HEAD = 0.85;
918 						head_must_correct =3;
919 					}
920 					else CORE_INDEL_MATCHING_RATE_HEAD = 0.92;
921 				}else	CORE_INDEL_MATCHING_RATE_HEAD = 9999;
922 
923 
924 				int tail_must_correct = 4;
925 				if(read_len-cover_end > 0)
926 				{
927 					if (tail_qual / (read_len-cover_end) < 850000)
928 					{
929 						CORE_INDEL_MATCHING_RATE_TAIL = 0.75;
930 						tail_must_correct =2;
931 					}
932 					else if (tail_qual / (read_len-cover_end) < 950000)
933 					{
934 						CORE_INDEL_MATCHING_RATE_TAIL = 0.85;
935 						tail_must_correct =3;
936 					}
937 					else CORE_INDEL_MATCHING_RATE_TAIL = 0.92;
938 				}else	CORE_INDEL_MATCHING_RATE_TAIL = 9999;
939 
940 
941 
942 
943 				int is_full_covered = 0;
944 				is_full_covered = extend_covered_region(array_index, pos, read_txt, read_len, cover_start, cover_end, 4, head_must_correct, tail_must_correct,  max_indel, space_type, max_indel_recorder[k-1], &head_indel_pos, &head_indel_movement, &tail_indel_pos, &tail_indel_movement, is_head_high_quality, qual_txt, phred_version, CORE_INDEL_MATCHING_RATE_HEAD, CORE_INDEL_MATCHING_RATE_TAIL);
945 				if(head_indel_movement)
946 				{
947 					pos += head_indel_movement;
948 					allvote -> max_positions[qid] = pos;
949 				}
950 				if(is_full_covered == 3)
951 				{
952 					allvote -> masks[qid] &= ~IS_RECOVERED_JUNCTION_READ;
953 				}
954 				else	allvote -> masks[qid] |= IS_RECOVERED_JUNCTION_READ;
955 			}
956 		}
957 
958 		*(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xff ;
959 
960 
961 		if ((max_indel_recorder[3] || head_indel_pos>=0 || tail_indel_pos>0)){
962 			int head_pos = 0;
963 			if((head_indel_pos>=0 && report_junction) || head_indel_movement)head_pos = head_indel_pos ;//- max(0,head_indel_movement);
964 			int tail_pos = read_len;
965 			if((tail_indel_pos>0 && report_junction) || tail_indel_movement)tail_pos = tail_indel_pos ;//- min(0, tail_indel_movement);
966 			explain_indel_in_middle(allvote, qid , pos, max_indel_recorder,  array_index, read_txt, read_len,  max_indel, total_subreads, head_pos, tail_pos , head_indel_movement, tail_indel_movement, report_junction, dynamic_programming_short, dynamic_programming_char);
967 			//if(*(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) != 0xfd ) allvote -> max_positions[qid] = pos0;
968 		}
969 
970 	}
971 
972 }
973 
explain_indel_in_middle(gene_allvote_t * allvote,int qid,int pos,char * max_indel_recorder,gene_value_index_t * array_index,char * read_txt,int read_len,int max_indel,int total_subreads,int head_start_point,int tail_end_point,int head_indel_movement,int tail_indel_movement,int report_junction,short ** dynamic_programming_short,char ** dynamic_programming_char)974 void explain_indel_in_middle(gene_allvote_t* allvote, int qid , int pos, char * max_indel_recorder, gene_value_index_t * array_index, char * read_txt, int read_len, int max_indel, int total_subreads, int head_start_point, int tail_end_point, int head_indel_movement, int tail_indel_movement, int report_junction, short ** dynamic_programming_short ,  char ** dynamic_programming_char)
975 {
976 
977 	char indel_operations[1500];
978 	int i,xx;
979 	char tmp_cigar [EXON_MAX_CIGAR_LEN+1];
980 
981 	int current_pos = head_start_point;
982 	int explain_cursor = head_start_point;
983 	char last_operation = 0;
984 	int del_number = 0;
985 	int dynamic_delta = 0;
986 
987 	tmp_cigar[0]=0;
988 
989 	if(current_pos >0)
990 	{
991 		if (head_indel_movement != 0)
992 			sprintf(tmp_cigar, "%dM%d%c", head_start_point  - max(head_indel_movement, 0), abs(head_indel_movement), head_indel_movement>0?'I':'D');
993 		else if (report_junction)
994 			sprintf(tmp_cigar, "%dS", head_start_point);
995 	}
996 
997 	//ddprintf ("R=%s; REC[8]=%d [9]=%d\n", read_txt, max_indel_recorder[8], max_indel_recorder[9]);
998 
999 	for (i=3; max_indel_recorder[i]; i+=3)
1000 	{
1001 		if (i >= max_indel*3) break;
1002 
1003 		int last_dist = max_indel_recorder[i-1];
1004 		int black_subread_start = max_indel_recorder[i-2]-1;
1005 		int black_subread_end = max_indel_recorder[i]-1;
1006 		if (black_subread_end < black_subread_start+1) black_subread_end = black_subread_start+1;
1007 		while(max_indel_recorder[i+3])
1008 		{
1009 			if (max_indel_recorder[i+3]- black_subread_end>2 || i >= max_indel*3-3) break;
1010 			black_subread_end = max_indel_recorder[i+3]-1;
1011 			i+=3;
1012 		}
1013 		int black_base_start = find_subread_end(read_len, total_subreads,black_subread_start )- 3 + max(0,-last_dist);
1014 		int black_base_end = find_subread_end(read_len, total_subreads, black_subread_end)-15 + 3 + max(0,-max_indel_recorder[i+2]);
1015 
1016 		int blackref_base_start = black_base_start - max(0,-last_dist) + max(0,last_dist);
1017 //		SUBREADprintf("baseref_offset=%d\n", max(0,last_dist));
1018 
1019 		int gap_end_read =  black_base_end + max(0,max_indel_recorder[i+2]);
1020 
1021 		blackref_base_start = max(0,blackref_base_start);
1022 		black_base_start = max(0,black_base_start);
1023 		black_base_end = min(read_len, black_base_end);
1024 
1025 		int vpos = strlen(tmp_cigar);
1026 		if (vpos > EXON_MAX_CIGAR_LEN -6) break;
1027 
1028 		int exp_indel = last_dist - max_indel_recorder[i+2];
1029 
1030 
1031 		int moves = 0;
1032 		if(black_base_end > black_base_start)
1033 			moves = dynamic_align (read_txt + black_base_start, black_base_end-black_base_start , array_index, pos + blackref_base_start, max_indel, indel_operations, exp_indel,  -10, black_base_end - black_base_start+5, dynamic_programming_short, dynamic_programming_char);
1034 
1035 		last_operation = 0;
1036 		if(moves)// < (black_base_end-black_base_start) + 6 + 1.5 * exp_indel)
1037 		{
1038 			int pre_read_len = 0;
1039 			current_pos = black_base_start ;
1040 			xx=0;
1041 			while (pre_read_len < 0 && xx < moves)
1042 			{
1043 				if(indel_operations[xx]!=2)pre_read_len++;
1044 				xx++;
1045 			}
1046 			for (; xx<moves; xx++)
1047 			{
1048 				int current_operation = indel_operations[xx];
1049 				if(current_operation == 3) current_operation = 0;
1050 
1051 				if(current_operation != last_operation && (current_pos < gap_end_read-1+1 || current_operation == 0))// && current_pos!=explain_cursor)
1052 				{
1053 					int vpos = strlen(tmp_cigar);
1054 					if (vpos>EXON_MAX_CIGAR_LEN-6) break;
1055 					sprintf(tmp_cigar + vpos, "%d%c", last_operation==1?del_number:(current_pos - explain_cursor), last_operation==0?'M':(last_operation==1?'D':'I'));
1056 					explain_cursor = current_pos ;
1057 					del_number = 0;
1058 				}
1059 				if(current_pos >= gap_end_read-1 || xx == moves - 1){
1060 					if(current_operation != 0 && current_pos!=explain_cursor)
1061 					{
1062 						int vpos = strlen(tmp_cigar);
1063 						sprintf(tmp_cigar + vpos, "%d%c", current_operation==1?del_number:(current_pos - explain_cursor), current_operation==1?'D':'I');
1064 						explain_cursor = current_pos;
1065 						del_number=0;
1066 					}
1067 					break;
1068 				}
1069 
1070 				if (indel_operations[xx]==0) current_pos ++;
1071 				else if (indel_operations[xx]==1) {del_number++; dynamic_delta++;}//"D"
1072 				else if (indel_operations[xx]==2) {current_pos ++; dynamic_delta--;} //"I"
1073 				else if (indel_operations[xx]==3) current_pos ++;
1074 
1075 
1076 				last_operation = current_operation;
1077 			}
1078 		}
1079 		else
1080 		{
1081 			int movement = last_dist - max_indel_recorder[i+2];
1082 			int vpos = strlen(tmp_cigar);
1083 			if (vpos>EXON_MAX_CIGAR_LEN-6) break;
1084 
1085 			current_pos = find_subread_end(read_len, total_subreads,max_indel_recorder[i]-1)-15 + (max_indel_recorder[i+2] >0?max_indel_recorder[i+2]:-max_indel_recorder[i+2]);
1086 			current_pos -= (black_base_end - black_base_start)/2 -3;
1087 
1088 			if(movement)
1089 				sprintf(tmp_cigar + vpos, "%dM%d%c", current_pos - explain_cursor , abs(movement), movement<0?'D':'I' );
1090 			else if (current_pos - explain_cursor>0)
1091 				sprintf(tmp_cigar + vpos, "%dM", current_pos - explain_cursor );
1092 
1093 			explain_cursor = current_pos + (movement >0?movement:0);
1094 			current_pos = find_subread_end(read_len, total_subreads,max_indel_recorder[i+1]-1);
1095 			last_operation = 0;
1096 			del_number = 0;
1097 			moves = 0;
1098 		}
1099 		last_operation = 0;
1100 		current_pos = black_base_end;
1101 
1102 
1103 	}
1104 
1105 	int vpos = strlen(tmp_cigar);
1106 	if (vpos > EXON_MAX_CIGAR_LEN-9 )
1107 	{
1108 		*(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xfd;
1109 		memcpy(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length+1, max_indel_recorder, 3 * max_indel);
1110 	}
1111 	else
1112 	{
1113 		if (explain_cursor<read_len)
1114 		{
1115 			if(tail_end_point > explain_cursor)
1116 				sprintf(tmp_cigar+vpos,"%dM", tail_end_point - explain_cursor);
1117 			vpos = strlen(tmp_cigar);
1118 			if(tail_end_point< read_len)
1119 			{
1120 				if (tail_indel_movement!=0)
1121 				{
1122 					int tail_len_m = read_len -(tail_end_point - min(tail_indel_movement, 0));
1123 					if(tail_len_m)
1124 						sprintf(tmp_cigar+vpos, "%d%c%dM", abs(tail_indel_movement), tail_indel_movement<0?'I':'D', tail_len_m);
1125 					else
1126 						sprintf(tmp_cigar+vpos, "%d%c", abs(tail_indel_movement), tail_indel_movement<0?'I':'D');
1127 				}
1128 				else if (report_junction)
1129 					sprintf(tmp_cigar+vpos, "%dS",read_len - tail_end_point );
1130 			}
1131 		}
1132 	//	if (head_start_point >0 || tail_end_point < read_len)
1133 	//		strcat(tmp_cigar,"X");
1134 
1135 		*(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length) = 0xfe ;
1136 		strncpy(allvote -> max_indel_recorder + qid * allvote -> indel_recorder_length+1, tmp_cigar, allvote -> indel_recorder_length - 2);
1137 	}
1138 }
1139 
evaluate_piece(char * piece_str,int chron,int offset,int is_counterpart,int start_pos,int end_pos)1140 int evaluate_piece(char * piece_str, int chron, int offset, int is_counterpart, int start_pos, int end_pos)
1141 {
1142 	char fname[MAX_FILE_NAME_LENGTH];
1143 	int inner_pos = 0, i;
1144 	FILE * fp;
1145 	char next_char=0;
1146 	int ret = 0;
1147 
1148 	if (chron == 0)
1149 		sprintf(fname, "/opt/Work2001/Gene-Search/src/GENE-LIB/%02da.fa", chron);
1150 	else
1151 		sprintf(fname, "/opt/Work2001/Gene-Search/src/GENE-LIB/%02d.fa", chron);
1152 
1153 	inner_pos = offset + offset / 70;
1154 
1155 	fp = f_subr_open(fname,"r");
1156 
1157 	while(next_char!='\n')
1158 		next_char=fgetc(fp);
1159 	fseek(fp, inner_pos, SEEK_CUR);
1160 
1161 	for(i=0 ; i<end_pos; i++)
1162 	{
1163 		next_char = get_next_char(fp);
1164 		if(i < start_pos)
1165 			continue;
1166 		if(next_char == 'N')
1167 			ret ++;
1168 		else{
1169 			if(is_counterpart)
1170 			{
1171 				if (piece_str[99-i] == 'A' && next_char == 'T')
1172 					ret ++;
1173 				else if (piece_str[99-i] == 'G' && next_char == 'C')
1174 					ret ++;
1175 				else if (piece_str[99-i] == 'T' && next_char == 'A')
1176 					ret ++;
1177 				else if (piece_str[99-i] == 'C' && next_char == 'G')
1178 					ret ++;
1179 			}
1180 			else if(piece_str[i] == next_char)
1181 				ret ++;
1182 		}
1183 	}
1184 
1185 	fclose(fp);
1186 
1187 	return ret;
1188 
1189 }
1190 
match_chro_indel(char * read,gene_value_index_t * index,unsigned int pos,int test_len,int is_negative_strand,int space_type,int indel_size,gene_vote_number_t * indel_recorder,int total_subreads)1191 int match_chro_indel(char * read, gene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand, int space_type, int indel_size, gene_vote_number_t * indel_recorder, int total_subreads)
1192 {
1193 
1194 	int ret = 0;
1195 	int tali = 0;
1196 	int last_section_end = 0;
1197 
1198 	for(tali = 0; indel_recorder[tali*3] && tali<MAX_INDEL_TOLERANCE; tali++)
1199 	{
1200 		int section_last_subread = indel_recorder[tali*3+1] - 1;
1201 		int section_offset = indel_recorder[tali*3+2];
1202 		int this_section_end = find_subread_end(test_len , total_subreads, section_last_subread);
1203 		if(!indel_recorder[tali*3+3]) this_section_end = test_len;
1204 
1205 		this_section_end = min(test_len, this_section_end);
1206 		this_section_end = max(this_section_end, last_section_end);
1207 
1208 		ret += match_chro(read + last_section_end - min(0, section_offset), index, pos + last_section_end + max(0, section_offset), this_section_end - last_section_end + min(0, section_offset), is_negative_strand * 0, space_type);
1209 
1210 		last_section_end = this_section_end;
1211 	}
1212 
1213 	return ret;
1214 
1215 }
1216 
1217 
match_chro_indel_old(char * read,gene_value_index_t * index,unsigned int pos,int test_len,int is_negative_strand,int space_type,int indel_size)1218 int match_chro_indel_old(char * read, gene_value_index_t * index, unsigned int pos, int test_len, int is_negative_strand, int space_type, int indel_size)
1219 {
1220 
1221 	int i;
1222 	int ret = 0;
1223 	for (i = -indel_size; i<=indel_size ; i++)
1224 	{
1225 		if (pos +i + test_len < index -> start_base_offset + index -> length && pos > -i)
1226 			ret += match_chro(read, index, pos + i, test_len, is_negative_strand, space_type);
1227 	}
1228 	return ret;
1229 
1230 }
1231 
1232 
1233 
mark_votes_array_index(char * read_str,int read_len,gene_vote_t * dest,gene_vote_t * src,gene_value_index_t * my_array_index,int color_space,int indel_tolerance,int min_minor,const char quality_str[],int quality_scale,int is_negative_strand)1234 void mark_votes_array_index(char * read_str, int read_len, gene_vote_t * dest, gene_vote_t * src, gene_value_index_t * my_array_index, int color_space, int indel_tolerance, int min_minor, const char quality_str [], int quality_scale, int is_negative_strand)
1235 {
1236 	int i, j;
1237 
1238 	dest -> max_vote = 0;
1239 	dest -> max_quality = 0;
1240 
1241 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1242 	{
1243 		dest -> items[i] = src -> items[i];
1244 		for(j=0; j< src->items[i]; j++)
1245 		{
1246 			unsigned int potential_position = src-> pos[i][j];
1247 			float matchingness_count = 0;
1248 			if (src -> votes [i][j]>= min_minor)
1249 				//matchingness_count = match_read(read_str, read_len, potential_position, my_array_index, color_space, indel_tolerance, quality_str, quality_scale);
1250 				matchingness_count = 1.*match_chro_indel(read_str, my_array_index, potential_position, read_len, 0, color_space, indel_tolerance, NULL, 0);
1251 
1252 			dest -> pos[i][j] = potential_position;
1253 			dest -> quality[i][j] = matchingness_count;
1254 			dest -> votes [i][j] =src -> votes [i][j];
1255 			dest -> masks[i][j] = src -> masks[i][j];
1256 			dest -> coverage_start[i][j] = src -> coverage_start[i][j];
1257 			dest -> coverage_end[i][j] = src -> coverage_end[i][j];
1258 			memcpy(dest -> indel_recorder[i][j], src -> indel_recorder[i][j], MAX_INDEL_TOLERANCE*3*sizeof(char));
1259 			// find for the `best positions'; only the best positions are replaced by the base machingness scores.
1260 			if((matchingness_count >  dest -> max_quality && src -> votes [i][j] == dest -> max_vote) || (src -> votes [i][j] > dest -> max_vote))
1261 			{
1262 				memcpy(dest -> max_indel_recorder, src -> indel_recorder[i][j], MAX_INDEL_TOLERANCE*3*sizeof(char));
1263 				dest -> max_vote  = src -> votes [i][j];
1264 				dest -> max_mask  = src -> masks[i][j];
1265 				dest -> max_quality = matchingness_count;
1266 				dest -> max_position = potential_position;
1267 				dest -> max_coverage_start = src -> coverage_start[i][j];
1268 				dest -> max_coverage_end = src -> coverage_end[i][j];
1269 			}
1270 		}
1271 	}
1272 }
1273 
select_positions_array(char * read1_str,int read1_len,char * read2_str,int read2_len,gene_vote_t * vote_read1,gene_vote_t * vote_read2,gene_vote_number_t * numvote_read1,gene_vote_number_t * numvote_read2,gene_quality_score_t * sum_quality,gene_quality_score_t * qual_r1,gene_quality_score_t * qual_r2,gene_vote_number_t * r1_recorder,gene_vote_number_t * r2_recorder,gehash_data_t * pos_read1,gehash_data_t * pos_read2,unsigned int max_pair_dest,unsigned int min_pair_dest,int min_major,int min_minor,int is_negative_strand,gene_value_index_t * my_array_index,int color_space,int indel_tolerance,int number_of_anchors_quality,const char quality_str1[],const char quality_str2[],int quality_scale,int max_indel_len,int * is_break_even)1274 int select_positions_array(char * read1_str, int read1_len, char * read2_str, int read2_len,  gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2,  gene_quality_score_t * sum_quality, gene_quality_score_t * qual_r1, gene_quality_score_t * qual_r2, gene_vote_number_t * r1_recorder, gene_vote_number_t * r2_recorder, gehash_data_t * pos_read1, gehash_data_t * pos_read2, unsigned int max_pair_dest, unsigned int min_pair_dest, int min_major, int min_minor,int is_negative_strand, gene_value_index_t * my_array_index, int color_space, int indel_tolerance, int number_of_anchors_quality, const char quality_str1 [], const char quality_str2 [], int quality_scale, int max_indel_len, int * is_break_even)
1275 {
1276 	gene_vote_t base_vote_1 , base_vote_2;
1277 
1278 	mark_votes_array_index(read1_str, read1_len, &base_vote_1, vote_read1, my_array_index, color_space, indel_tolerance, min_minor, quality_str1, quality_scale, is_negative_strand);
1279 	mark_votes_array_index(read2_str, read2_len, &base_vote_2, vote_read2, my_array_index, color_space, indel_tolerance, min_minor, quality_str2, quality_scale, is_negative_strand);
1280 
1281 //	SUBREADprintf("Q=%.3f\n", *qual_r2);
1282 
1283 	return select_positions(&base_vote_1, &base_vote_2, numvote_read1, numvote_read2, sum_quality, qual_r1, qual_r2,  pos_read1, pos_read2, r1_recorder , r2_recorder ,max_pair_dest,  min_pair_dest,  min_major,  min_minor, is_negative_strand, number_of_anchors_quality,max_indel_len, is_break_even);
1284 }
1285 
1286 
select_positions(gene_vote_t * vote_read1,gene_vote_t * vote_read2,gene_vote_number_t * numvote_read1,gene_vote_number_t * numvote_read2,gene_quality_score_t * sum_quality,gene_quality_score_t * qual_r1,gene_quality_score_t * qual_r2,gehash_data_t * pos_read1,gehash_data_t * pos_read2,gene_vote_number_t * read1_indel_recorder,gene_vote_number_t * read2_indel_recorder,unsigned int max_pair_dest,unsigned int min_pair_dest,int min_major,int min_minor,int is_negative_strand,int number_of_anchors_quality,int max_indel_len,int * is_breakeven)1287 int select_positions(gene_vote_t * vote_read1, gene_vote_t * vote_read2, gene_vote_number_t * numvote_read1, gene_vote_number_t * numvote_read2, gene_quality_score_t * sum_quality, gene_quality_score_t * qual_r1, gene_quality_score_t * qual_r2 , gehash_data_t * pos_read1, gehash_data_t * pos_read2, gene_vote_number_t * read1_indel_recorder, gene_vote_number_t * read2_indel_recorder, unsigned int max_pair_dest, unsigned int min_pair_dest, int min_major, int min_minor,int is_negative_strand, int number_of_anchors_quality, int max_indel_len, int * is_breakeven)
1288 {
1289 
1290 	int k, i, j, anchors = 0;
1291 	gehash_data_t anchors_position [ANCHORS_NUMBER];
1292 	unsigned char anchor_read [ANCHORS_NUMBER];
1293 
1294 	gene_vote_number_t anchor_votes = max(vote_read1->max_vote, vote_read2->max_vote);
1295 	gene_vote_number_t anchors_votes [ANCHORS_NUMBER];
1296 	gene_quality_score_t anchors_quality [ANCHORS_NUMBER];
1297 	int anchors_buckets [ANCHORS_NUMBER];
1298 	int anchors_index [ANCHORS_NUMBER];
1299 
1300 	gene_vote_number_t minor_votes [ANCHORS_NUMBER];
1301 	gene_quality_score_t minor_quality [ANCHORS_NUMBER];
1302 	gehash_data_t minor_position [ANCHORS_NUMBER];
1303 	int minor_buckets [ANCHORS_NUMBER];
1304 	int minor_index [ANCHORS_NUMBER];
1305 	char is_minor_breakeven [ANCHORS_NUMBER];
1306 
1307 	if(anchor_votes < min_major)
1308 		return 0;
1309 
1310 	for (k=0; k<2; k++)
1311 	{
1312 		gene_vote_t * current_vote = k?vote_read2:vote_read1;
1313 		if (current_vote->max_vote < anchor_votes)
1314 			continue;
1315 
1316 		for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1317 			for(j=0; j< current_vote->items[i]; j++)
1318 			{
1319 				if(current_vote->votes[i][j] >= anchor_votes -	    0)
1320 				{
1321 					anchors_position[anchors] = current_vote->pos[i][j];
1322 					anchors_votes   [anchors] = current_vote->votes[i][j];
1323 					anchors_quality [anchors] = current_vote->quality[i][j];
1324 					anchor_read	[anchors] = k;
1325 					anchors_buckets [anchors] = i;
1326 					anchors_index   [anchors] = j;
1327 
1328 					anchors++;
1329 
1330 					if(anchors >= ANCHORS_NUMBER)
1331 					{
1332 	//					SUBREADprintf("Abnormally too many anchors [%d, %d] %f\n", i, j, anchor_votes);
1333 						anchors --;
1334 					}
1335 				}
1336 			}
1337 
1338 	}
1339 
1340 	memset(minor_votes,0, ANCHORS_NUMBER*sizeof(gene_vote_number_t));
1341 	memset(minor_quality,0, ANCHORS_NUMBER*sizeof(gene_quality_score_t));
1342 	memset(is_minor_breakeven,0, ANCHORS_NUMBER);
1343 	for (k=0; k<2; k++)
1344 	{
1345 		gene_vote_t * current_vote = k?vote_read2:vote_read1;
1346 		gene_vote_t * current_vote2 = k?vote_read1:vote_read2;
1347 
1348 		if (current_vote2->max_vote < anchor_votes)
1349 			continue;
1350 
1351 		for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1352 			for(j=0; j< current_vote->items[i]; j++)
1353 			{
1354 				if(current_vote->votes[i][j] >= min_minor)
1355 				{
1356 					int l;
1357 					for (l=0; l<anchors; l++)
1358 					{
1359 
1360 						if(anchor_read[l]==k)continue;
1361 
1362 						long long int abdist = current_vote->pos[i][j];
1363 						abdist -= anchors_position[l];
1364 
1365 						if((anchor_read[l] && !is_negative_strand) ||
1366 						    ((!anchor_read[l]) && is_negative_strand))
1367 							abdist = -abdist;
1368 
1369 						if (abdist < 0)
1370 						//#warning WGSIM generates paired-end data in a wrong way; abdist is given its absolute value in such a case.
1371 							abdist = -abdist;
1372 						//	continue;
1373 
1374 						long long int abdist_old = minor_position[l];
1375 						abdist_old -=  anchors_position[l];
1376 						if(abdist_old <0) abdist_old=-abdist_old;
1377 						if (	(minor_votes[l] < current_vote->votes[i][j] ||
1378 							(minor_votes[l] == current_vote->votes[i][j] && current_vote->quality [i][j] > minor_quality[l]) ||
1379 							(minor_votes[l] == current_vote->votes[i][j] && abs(current_vote->quality [i][j] - minor_quality[l])<0.0001 && abdist < abdist_old))&&
1380 							(abdist <= max_pair_dest) &&
1381 							(abdist >= min_pair_dest)
1382 						   )
1383 						{
1384 							minor_votes[l] = current_vote->votes[i][j];
1385 							minor_position[l] = current_vote->pos[i][j];
1386 							minor_quality[l] = current_vote->quality [i][j];
1387 							minor_buckets[l] = i;
1388 							minor_index[l] = j;
1389 						}
1390 					}
1391 				}
1392 			}
1393 
1394 	}
1395 
1396 	gene_vote_number_t selection_minor_vote = 0;
1397 	gene_vote_number_t selection_major_vote = 0;
1398 	gehash_data_t selection_minor_pos = 0;
1399 	gehash_data_t selection_major_pos = 0;
1400 
1401 	int selected_major_index=0, selected_major_bucket=0, selected_minor_bucket=0, selected_minor_index=0;
1402 
1403 	*sum_quality = 0;
1404 
1405 	unsigned char selection_major_read_no = 0;
1406 	//int reason = 0;
1407 
1408 	for (k=0; k<anchors; k++)
1409 	{
1410 		if(minor_votes[k]==0)
1411 			continue;
1412 
1413 		int need_replace = 0;
1414 
1415 
1416 		if ((minor_votes[k]+anchors_votes[k]) > (selection_minor_vote+selection_major_vote) ||
1417 	   	       ((minor_votes[k]+anchors_votes[k]) == (selection_minor_vote+selection_major_vote) &&
1418 		 	 minor_quality[k] + anchors_quality[k] > *sum_quality))
1419 		{
1420 			need_replace = 1;
1421 		}
1422 		else if (minor_votes[k]+anchors_votes[k] == selection_minor_vote+selection_major_vote &&
1423 		 	 minor_quality[k] + anchors_quality[k] == *sum_quality)
1424 		{
1425 			if (selection_minor_pos != minor_position[k] && minor_position[k] != selection_major_pos)
1426 			{
1427 				* is_breakeven = 1;
1428 				if (selection_major_pos >  anchors_position[k]) need_replace = 1;
1429 			}
1430 		}
1431 
1432 		if(need_replace){
1433 			selection_minor_pos = minor_position[k];
1434 			selection_major_pos = anchors_position[k];
1435 			selection_minor_vote = minor_votes[k];
1436 			selection_major_vote = anchors_votes[k];
1437 			selection_major_read_no = anchor_read[k];
1438 			selected_major_bucket = anchors_buckets[k];
1439 			selected_minor_bucket = minor_buckets[k];
1440 			selected_major_index = anchors_index[k];
1441 			selected_minor_index = minor_index[k];
1442 			* is_breakeven = is_minor_breakeven[k];
1443 			* qual_r1 = anchor_read[k] ? minor_quality[k]:anchors_quality[k];
1444 			* qual_r2 = anchor_read[k] ? anchors_quality[k]: minor_quality[k];
1445 			* sum_quality = minor_quality[k] + anchors_quality[k];
1446 
1447 		}
1448 	}
1449 
1450 	if(selection_minor_vote > 0)
1451 	{
1452 		if(selection_major_read_no)	// major is on read 2
1453 		{
1454 			*numvote_read1 = selection_minor_vote;
1455 			*numvote_read2 = selection_major_vote;
1456 			*pos_read1 = selection_minor_pos;
1457 			*pos_read2 = selection_major_pos ;
1458 			indel_recorder_copy(read1_indel_recorder, vote_read1 -> indel_recorder[selected_minor_bucket][selected_minor_index]);
1459 			indel_recorder_copy(read2_indel_recorder, vote_read2 -> indel_recorder[selected_major_bucket][selected_major_index]);
1460 		}
1461 		else
1462 		{
1463 			*numvote_read1 = selection_major_vote;
1464 			*numvote_read2 = selection_minor_vote;
1465 			*pos_read1 = selection_major_pos;
1466 			*pos_read2 = selection_minor_pos;
1467 			indel_recorder_copy(read1_indel_recorder, vote_read1 -> indel_recorder[selected_major_bucket][selected_major_index]);
1468 			indel_recorder_copy(read2_indel_recorder, vote_read2 -> indel_recorder[selected_minor_bucket][selected_minor_index]);
1469 		}
1470 		return 1;
1471 	}
1472 
1473 	return 0;
1474 }
1475 
destroy_offsets(gene_offset_t * offsets)1476 void destroy_offsets(gene_offset_t* offsets)
1477 {
1478 	free(offsets->read_names);
1479 	free(offsets->read_offsets);
1480 	HashTableDestroy(offsets->read_name_to_index);
1481 }
1482 
load_offsets(gene_offset_t * offsets,const char index_prefix[])1483 int load_offsets(gene_offset_t* offsets , const char index_prefix [])
1484 {
1485 	char fn[MAX_FILE_NAME_LENGTH];
1486 	FILE * fp;
1487 	int n=0;
1488 	int padding = 0;
1489 
1490 	int is_V3_index = gehash_load_option(index_prefix, SUBREAD_INDEX_OPTION_INDEX_PADDING , &padding);
1491 	if(!is_V3_index) return 1;
1492 
1493 	memset(offsets, 0, sizeof( gene_offset_t));
1494 	sprintf(fn, "%s.reads", index_prefix);
1495 
1496 	fp = f_subr_open(fn, "r");
1497 
1498 	if(!fp)
1499 	{
1500 		SUBREADprintf("file not found :%s\n", fn);
1501 		return 1;
1502 	}
1503 
1504 	int current_max_n = 100;
1505 	offsets->read_names = malloc(current_max_n * MAX_READ_NAME_LEN);
1506 	offsets->read_offsets= malloc(current_max_n * sizeof(int));
1507 	offsets->read_name_to_index = HashTableCreate(5000);
1508 	offsets->padding = padding;
1509 
1510 	HashTableSetKeyComparisonFunction(offsets->read_name_to_index, my_strcmp);
1511 	HashTableSetHashFunction(offsets->read_name_to_index ,HashTableStringHashFunction);
1512 	HashTableSetDeallocationFunctions(offsets->read_name_to_index ,free, NULL);
1513 
1514 
1515 
1516 	while (!feof(fp))
1517 	{
1518 		int i=0, step = 0, j=0;
1519 
1520 		read_line(MAX_FILE_NAME_LENGTH-1,fp, fn, 0);
1521 		if (strlen(fn)<2)continue;
1522 		while (fn[i])
1523 		{
1524 			if (fn[i] == '\t')
1525 			{
1526 				fn[i]=0;
1527 				offsets->read_offsets[n] = (unsigned int)atoll(fn);
1528 				step = 1;
1529 			}
1530 			else if (step)
1531 			{
1532 				if(j<MAX_CHROMOSOME_NAME_LEN-1)
1533 				{
1534 					*(offsets->read_names+MAX_CHROMOSOME_NAME_LEN*n + (j++)) = fn[i];
1535 					*(offsets->read_names+MAX_CHROMOSOME_NAME_LEN*n + j) = 0;
1536 				}
1537 			}
1538 			i++;
1539 		}
1540 
1541 		char * read_name_mem = malloc(MAX_CHROMOSOME_NAME_LEN);
1542 		strcpy(read_name_mem, offsets->read_names + n*MAX_CHROMOSOME_NAME_LEN);
1543 
1544 		//printf("09NOV: add_annotation_to_junctions: '%s',%u\n", read_name_mem , n);
1545 		HashTablePut(offsets->read_name_to_index, read_name_mem , NULL + 1 + n);
1546 
1547 		n++;
1548 		if(n >= current_max_n)
1549 		{
1550 			offsets->read_names = realloc(offsets->read_names, 2*current_max_n * MAX_CHROMOSOME_NAME_LEN);
1551 			offsets->read_offsets = realloc(offsets->read_offsets, 2*current_max_n * sizeof(int));
1552 			current_max_n*=2;
1553 		}
1554 
1555 
1556 		offsets->read_offsets[n] = 0;
1557 
1558 	}
1559 
1560 	offsets->total_offsets=n;
1561 	fclose(fp);
1562 	return 0;
1563 }
1564 
1565 #ifndef MAKE_STANDALONE
1566 #define CLOCK_USE_GETTIME
1567 #endif
1568 
miltime()1569 double miltime(){
1570 	double ret;
1571 	#if defined(__FreeBSD__) || defined(__DragonFly__)
1572 		struct timeval tp;
1573 		struct timezone tz;
1574 		tz.tz_minuteswest=0;
1575 		tz.tz_dsttime=0;
1576 		gettimeofday(&tp,&tz);
1577 		ret = tp.tv_sec+ 0.001*0.001* tp.tv_usec;
1578 	#else
1579     	#ifdef CLOCK_USE_GETTIME
1580 	 	struct timespec tsc;
1581 		clock_gettime(CLOCK_REALTIME, &tsc);
1582 	 	ret = tsc.tv_sec*1. + tsc.tv_nsec*1./1000/1000/1000;
1583     	#else
1584 		struct timeb trp;
1585 		ftime(&trp);
1586 		ret = trp.time*1.0+(trp.millitm*1.0/1000.0);
1587 	#endif
1588 	#endif
1589 
1590 	return ret;
1591 }
1592 
1593 #define front2(str, bias)	(*((str)+(bias))+*((str)+1+(bias)))
1594 #define front4(str, bias)	(front2(str, bias)+front2(str, bias+2))
1595 #define front8(str, bias)	(front4(str, bias)+front4(str, bias+4))
1596 #define front16(str, bias)	(front8(str, bias)+front8(str, bias+8))
1597 
match_read(const char read_str[],int read_len,unsigned int potential_position,gene_value_index_t * my_array_index,int space_type,int indel_tolerance,const char quality_str[],int quality_scale)1598 float match_read(const char read_str[], int read_len, unsigned int potential_position,  gene_value_index_t * my_array_index, int space_type, int indel_tolerance, const char quality_str [], int quality_scale)
1599 {
1600 	float ret = 0;
1601 	int i, bias;
1602 	char read_matchingness [7][1250];
1603 
1604 	if(indel_tolerance>3) indel_tolerance = 3;
1605 
1606 	for(bias=-indel_tolerance; bias<=indel_tolerance;bias++)
1607 	{
1608 		for(i=0;i<read_len; i++)
1609 		{
1610 			char base_int = base2int(read_str[i]);
1611 			int is_matched_base =  gvindex_match_base(my_array_index, potential_position+i+bias, base_int);
1612 			read_matchingness[bias+indel_tolerance][i] = is_matched_base;
1613 
1614 /*			if(i % 3 == 2)
1615 			{
1616 				int b2_match = (read_matchingness[bias+indel_tolerance][i -2]  && read_matchingness[bias+indel_tolerance][i -1] && read_matchingness[bias+indel_tolerance][i]);
1617 				if(b2_match)
1618 				{
1619 					read_matchingness[bias+indel_tolerance][i]  = 2;
1620 					read_matchingness[bias+indel_tolerance][i-1]  = 2;
1621 					read_matchingness[bias+indel_tolerance][i-2]  = 2;
1622 				}
1623 			}*/
1624 		}
1625 	}
1626 
1627 	for(i=0; i<read_len-4; i+=4)
1628 	{
1629 
1630 		int j;
1631 		int best_movement = 0;
1632 		float max_matchness = -1;
1633 		for (j=-indel_tolerance; j<=indel_tolerance; j++)
1634 		{
1635 			int m = front4(read_matchingness[j],i);
1636 			if(m > max_matchness)
1637 			{
1638 				max_matchness = m*1.;
1639 				best_movement = j;
1640 			}
1641 		}
1642 
1643 //		SUBREADprintf("best-match=%f\n", max_matchness);
1644 //
1645 
1646 		if (quality_str[0])
1647 		{
1648 			max_matchness = 0;
1649 		//	float qs0 = max_matchness;
1650 			for (j=0; j<4; j++)
1651 			{
1652 				if(read_matchingness[best_movement][j+i])
1653 				{
1654 					// penalty
1655 					max_matchness += 1.03+get_base_quality_score(quality_str[j+i], quality_scale);
1656 				}
1657 				else
1658 				{
1659 					// relief
1660 					//max_matchness += 0.2-get_base_phred(quality_str[j+i])*0.01;
1661 				}
1662 			}
1663 		//	SUBREADprintf("Delta matchingness = %f\n", qs0 - max_matchness);
1664 		}
1665 
1666 		ret += max_matchness; //front8(read_matchingness[movement], i);  // (front4(read_matchingness[movement], i)==4?4:0);
1667 	}
1668 
1669 
1670 	return  ret;
1671 }
1672 
1673 
final_matchingness_scoring(const char read_str[],const char quality_str[],int read_len,gene_vote_t * vote,gehash_data_t * max_position,gene_vote_number_t * max_vote,short * max_mask,gene_quality_score_t * max_quality,gene_value_index_t * my_array_index,int space_type,int indel_tolerance,int quality_scale,gene_vote_number_t * max_indel_recorder,int * max_coverage_start,int * max_coverage_end)1674 void final_matchingness_scoring(const char read_str[], const char quality_str[], int read_len, gene_vote_t * vote, gehash_data_t * max_position, gene_vote_number_t * max_vote, short *max_mask, gene_quality_score_t * max_quality, gene_value_index_t * my_array_index, int space_type, int indel_tolerance, int quality_scale, gene_vote_number_t * max_indel_recorder, int * max_coverage_start, int * max_coverage_end)
1675 {
1676 	int i, j;
1677 	gene_quality_score_t max_matching = -1.0;
1678 
1679 	*max_vote = vote -> max_vote;
1680 
1681 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
1682 		for(j=0; j< vote->items[i]; j++)
1683 		{
1684 			if( vote->votes[i][j]  < vote -> max_vote -1 ) continue;
1685 
1686 			unsigned int potential_position = vote -> pos[i][j];
1687 			gene_quality_score_t matchingness_count = 1.*match_chro_indel((char *)read_str, my_array_index, potential_position, read_len, 0, space_type, indel_tolerance, NULL, 0);
1688 			if(matchingness_count > max_matching)
1689 			{
1690 				max_matching = matchingness_count;
1691 				*max_position = potential_position;
1692 				*max_mask = vote -> masks[i][j];
1693 				*max_coverage_start = vote-> coverage_start[i][j];
1694 				*max_coverage_end = vote -> coverage_end[i][j];
1695 				indel_recorder_copy(max_indel_recorder, vote-> indel_recorder[i][j]);
1696 
1697 				* max_quality = max_matching;
1698 			}
1699 			else if (matchingness_count == max_matching)
1700 			{
1701 //				SUBREADprintf("\nBREAK EVEN DETECTED AT SORTED TABLE: %u (kept) and %u\n", vote->max_position, vote -> pos[i][j]);
1702 				(*max_mask) |= IS_BREAKEVEN_READ ;
1703 			}
1704 		}
1705 }
1706 
1707 int DPALIGN_CREATEGAP_PENALTY = -2 ;
1708 int DPALIGN_EXTENDGAP_PENALTY = 0 ;
1709 int DPALIGN_MISMATCH_PENALTY = 0;
1710 int DPALIGN_MATCH_SCORE = 2;
1711 
1712 
1713 
search_DP_branch(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int path_i,int path_j,short ** table,char ** table_mask,int max_indel,char * movement_buffer,int expected_offset,int current_score,int out_pos,int current_offset,int init_read_offset,int shutdown_read_offset,int * all_steps)1714 int search_DP_branch(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int path_i, int path_j, short ** table,char ** table_mask, int max_indel, char * movement_buffer, int expected_offset, int current_score, int out_pos, int current_offset, int init_read_offset, int shutdown_read_offset, int * all_steps)
1715 {
1716 
1717 	if (1499 - out_pos > (read_len <<2) || (*all_steps) > 3000 + (read_len << 3) || (*all_steps) > 33000){
1718 		/*ddprintf("\nTOO MANY STEPS: rl = %d    len=%d    steps=%d\n", read_len, 1499 - out_pos, *all_steps);
1719 		ddfflush(stdout) ;*/
1720 		return 0;
1721 	}
1722 	if(path_j < 0 || path_i < 0)
1723 	{
1724 		int i;
1725 		//ddprintf("\nFINAL TEST exp %d  real-offset %d\n", expected_offset, current_offset);
1726 		if (expected_offset !=current_offset)return 0;
1727 		for (i=0; i<=path_j; i++) movement_buffer[out_pos--] = 1;
1728 		for (i=0; i<=path_i; i++) movement_buffer[out_pos--] = 2;
1729 		return  out_pos ;
1730 	}
1731 
1732 	short upper_score;
1733 	if (path_i>0) upper_score = table[path_i-1][path_j];
1734 	else	      upper_score = 0;
1735 
1736 	short left_score;
1737 	if (path_j>0)      left_score = table[path_i][path_j-1];
1738 	else	      left_score = 0;
1739 
1740 	short upperleft_score;
1741 	if (path_j>0 && path_i>0) upperleft_score = table[path_i-1][path_j-1];
1742 	else	      upperleft_score = 0;
1743 
1744 	char is_matched_ij =  gvindex_get(index, begin_position + path_j) == read[path_i]?DPALIGN_MATCH_SCORE :DPALIGN_MISMATCH_PENALTY;
1745 
1746 	int found = 0;
1747 	int table_mask_j_1 = 0, table_mask_i_1 = 0;
1748 	int table_mask2_i_1 = 0;
1749 	if(path_j >0 && path_i >=0)
1750 	{
1751 		table_mask_j_1 = left_score + (table_mask[path_i][path_j-1]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) == current_score;
1752 	}
1753 	if(path_i >0 && path_j >=0)
1754 	{
1755 		table_mask_i_1 = upper_score + (table_mask[path_i-1][path_j]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) == current_score;
1756 		table_mask2_i_1 = upper_score + (table_mask[path_i-1][path_j]? DPALIGN_EXTENDGAP_PENALTY: DPALIGN_CREATEGAP_PENALTY) < 0 ;
1757 	}
1758 
1759 
1760 	if ((!found ) && ( table_mask_j_1 || (current_score == 0 && table_mask_i_1)))
1761 	{
1762 		movement_buffer[out_pos] = 1;
1763 		(*all_steps) ++;
1764 		//ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 1\n", path_i, path_j, current_offset);
1765 		//ddfflush(stdout) ;
1766 		found =search_DP_branch (read, read_len, index, begin_position, path_i , path_j -1, table  , table_mask, max_indel, movement_buffer, expected_offset, left_score, out_pos -1, current_offset - ((path_i >= init_read_offset && path_i <= shutdown_read_offset)?1:0),  init_read_offset, shutdown_read_offset, all_steps);
1767 	}
1768 	if ((!found )&& ( table_mask_i_1 || (current_score == 0 && table_mask2_i_1)))
1769 	{
1770 		movement_buffer[out_pos] = 2;
1771 		(*all_steps) ++;
1772 		//ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 2\n", path_i, path_j, current_offset);
1773 		//ddfflush(stdout) ;
1774 		found =search_DP_branch (read, read_len, index, begin_position, path_i -1 , path_j , table , table_mask , max_indel, movement_buffer, expected_offset, upper_score, out_pos -1, current_offset + ((path_i >= init_read_offset && path_i <= shutdown_read_offset)?1:0),  init_read_offset, shutdown_read_offset, all_steps);
1775 	}
1776 	if((!found )&& (is_matched_ij + (upperleft_score - current_score) == 0))
1777 	{
1778 		movement_buffer[out_pos] = is_matched_ij==2?0:3;
1779 		//ddprintf ("Path_i = %d ; Path_j = %d ; Offset = %d => 0\n", path_i, path_j, current_offset);
1780 		(*all_steps) ++;
1781 		found = search_DP_branch (read, read_len, index, begin_position, path_i -1 , path_j -1, table , table_mask, max_indel, movement_buffer, expected_offset, upperleft_score, out_pos -1, current_offset, init_read_offset, shutdown_read_offset, all_steps);
1782 	}
1783 
1784 
1785 	return found;
1786 }
1787 
1788 #define INDEL_WINDOW_WIDTH 4
1789 
window_indel_align(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int max_indel,char * movement_buffer,int expected_offset,int init_read_offset,int shutdown_read_offset)1790 int window_indel_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset, int init_read_offset, int shutdown_read_offset)
1791 {
1792 	short indel_windows[32];
1793 	int scores [32][MAX_READ_LENGTH];
1794 	int i,j,windows_number;
1795 	char chro_str[200];
1796 
1797 	memset(indel_windows,0, 32*sizeof(short));
1798 
1799 	windows_number = abs(expected_offset)+1;
1800 	//ddprintf ("\nWindow size = %d ; ExpOffset = %d\n", windows_number , expected_offset);
1801 	ddfflush(stdout) ;
1802 	for (i=0; i< read_len; i++)
1803 		for (j=0; j< windows_number; j++)
1804 		{
1805 			int matchingness ;
1806 			if (j==0) chro_str[i] = gvindex_get(index, begin_position+i);
1807 			if (expected_offset<0)
1808 				matchingness = *(read + i) == gvindex_get(index, begin_position + j + i) ;
1809 			else
1810 				matchingness = *(read + i) == gvindex_get(index, begin_position - windows_number + j + 1 + i) ;
1811 			indel_windows [j] += matchingness;
1812 
1813 			if (i >= INDEL_WINDOW_WIDTH)// The result for i - INDEL_WINDOW_WIDTH is OK
1814 			{
1815 				scores[j][i - INDEL_WINDOW_WIDTH] = indel_windows [j] ;
1816 			}
1817 			if (i >= INDEL_WINDOW_WIDTH)// The result for i - INDEL_WINDOW_WIDTH is OK
1818 			{
1819 
1820 				if (expected_offset<0)
1821 					matchingness = *(read - INDEL_WINDOW_WIDTH + i) == gvindex_get(index, begin_position + j + i - INDEL_WINDOW_WIDTH) ;
1822 				else
1823 					matchingness = *(read - INDEL_WINDOW_WIDTH + i) == gvindex_get(index, begin_position -windows_number + j +1 + i - INDEL_WINDOW_WIDTH) ;
1824 				indel_windows [j] -= matchingness;
1825 			}
1826 		}
1827 	chro_str[i]=0;
1828 	j = read[read_len];
1829 	read[read_len]=0;
1830 
1831 	ddprintf ("CHRO=%s\nREAD=%s\n", chro_str, read);
1832 
1833 	/*
1834 	float contingency_list [MAX_READ_LENGTH];
1835 	for (i=0; i< read_len - windows_number-INDEL_WINDOW_WIDTH; i++)
1836 	{
1837 		float contingency;
1838 		if (expected_offset >0)
1839 			contingency  = ((scores[0][i]*1. + 1)-(scores[windows_number-1][i]*1.+1));
1840 		else
1841 			contingency  = ((scores[windows_number-1][i]*1. + 1)-(scores[0][i]*1.+1));
1842 		contingency_list[i]=contingency;
1843 	}*/
1844 
1845 	int max_score=-1;
1846 	int max_pos = -1;
1847 	if (expected_offset>0)
1848 	{
1849 		//insertion
1850 		for (i=read_len-INDEL_WINDOW_WIDTH-1; i>=0; i--)
1851 		{
1852 			if (scores[windows_number - 1][i - expected_offset]>=2 && scores[0][i] >= max_score)
1853 			{
1854 				max_pos = i - windows_number+1;
1855 				max_score = scores[0][i];
1856 			}
1857 		}
1858 	}
1859 	else
1860 	{
1861 		//deletion
1862 		for (i=read_len-INDEL_WINDOW_WIDTH-1; i>=0; i--)
1863 		{
1864 			if (scores[windows_number -1][i] >= max_score && scores[0][i + expected_offset] >=2)
1865 			{
1866 				max_score = scores[windows_number -1][i];
1867 				max_pos = i ;
1868 			}
1869 		}
1870 
1871 	}
1872 
1873 	max_pos = min(read_len, max(0, max_pos));
1874 
1875 	int move_number = 0;
1876 
1877 	for (i=0; i< read_len -INDEL_WINDOW_WIDTH; i++)
1878 	{
1879 		if (i==max_pos)
1880 			for (j = 0; j < windows_number-1; j++)
1881 				movement_buffer[move_number++] = 1+(expected_offset>0);	// 1=del; 2=ins
1882 		if (i!=max_pos || expected_offset <0)
1883 			movement_buffer [move_number++] = 0; // matched
1884 
1885 		#ifdef indel_debug
1886 		for (j=0; j< windows_number; j++)
1887 			SUBREADprintf("%d ",scores[j][i]);
1888 		SUBREADprintf ("     %c\n", (i==max_pos)?'*':' ') ;
1889 		#endif
1890 	}
1891 	for(; i< read_len; i++)
1892 		movement_buffer [move_number++] = 0; // matched
1893 
1894 
1895 
1896 	read[read_len] = j;
1897 	return move_number;
1898 }
1899 
dynamic_align(char * read,int read_len,gene_value_index_t * index,unsigned int begin_position,int max_indel,char * movement_buffer,int expected_offset,int init_read_offset,int shutdown_read_offset,short ** table,char ** table_mask)1900 int dynamic_align(char * read, int read_len, gene_value_index_t * index, unsigned int begin_position, int max_indel, char * movement_buffer, int expected_offset, int init_read_offset, int shutdown_read_offset, short **table  ,  char ** table_mask)
1901 // read must be converted to the positive strand.
1902 // movement buffer: 0:match, 1: read-insert, 2: gene-insert, 3:mismatch
1903 // the size of the movement buffer must be equal to the length of the read plus max_indel * 3.
1904 {
1905 
1906 
1907 	int i,j;
1908 
1909 	#ifdef indel_debug
1910 	int ii, jj;
1911 	for(ii = 0; ii<read_len - expected_offset; ii++)
1912 		putchar(gvindex_get(index, begin_position + ii));
1913 
1914 	SUBREADprintf ("\n%s\n", read);
1915 	#endif
1916 
1917 	for (i=0; i<read_len ; i++)
1918 	{
1919 		for(j=0; j<read_len - expected_offset; j++)
1920 		{
1921 			table_mask[i][j]=0;
1922 			if (j < i - max_indel || j > max_indel + i)
1923 			{
1924 				table[i][j]=-9999;
1925 			#ifdef indel_debug
1926 				putchar('\t');
1927 			#endif
1928 				continue;
1929 			}
1930 
1931 			short from_upper;
1932 
1933 			if (i>0) from_upper = table[i-1][j] + (table_mask[i-1][j]?DPALIGN_EXTENDGAP_PENALTY:DPALIGN_CREATEGAP_PENALTY);
1934 			else     from_upper = DPALIGN_CREATEGAP_PENALTY;
1935 
1936 			short from_left;
1937 
1938 			if (j>0) from_left = table[i][j-1] + (table_mask[i][j-1]?DPALIGN_EXTENDGAP_PENALTY:DPALIGN_CREATEGAP_PENALTY);
1939 			else     from_left = DPALIGN_CREATEGAP_PENALTY;
1940 
1941 			char chromo_ch = gvindex_get(index, begin_position + j);
1942 			char is_matched_ij = (chromo_ch == read[i])?DPALIGN_MATCH_SCORE:DPALIGN_MISMATCH_PENALTY;
1943 
1944 			short from_upperleft;
1945 			if (i>0 && j>0) from_upperleft = table[i-1][j-1] + is_matched_ij;
1946 			else	    from_upperleft = is_matched_ij;
1947 			if ((i ==0 || j ==0) && (i+j>0)) from_upperleft += DPALIGN_CREATEGAP_PENALTY;
1948 
1949 			if (from_upperleft <=from_left || from_upperleft <= from_upper)
1950 				table_mask[i][j]=1;
1951 
1952 			table[i][j]=max(0,max(from_upper, max(from_left, from_upperleft)));
1953 			#ifdef indel_debug
1954 			SUBREADprintf("%c%c\t", chromo_ch, read[i]);
1955 			#endif
1956 		}
1957 		#ifdef indel_debug
1958 		SUBREADputs("");
1959 		#endif
1960 
1961 	}
1962 	#ifdef indel_debug
1963 	//SUBREADputs("");
1964 	//SUBREADputs("");
1965 
1966 	#endif
1967 
1968 	short current_score = table[read_len-1] [read_len - expected_offset-1];
1969 	short path_i = read_len-1 ;
1970 	int out_pos = 1499;
1971 	char out_tmp [1500];
1972 	int all_steps = 0;
1973 	j = read_len-expected_offset -1;
1974 
1975 	#ifdef indel_debug
1976 
1977 	for(ii=0;ii< 20; ii++)
1978 	{
1979 		for(jj=0; jj<20; jj++)
1980 			SUBREADprintf("%d\t",table[ii][jj]);
1981 		SUBREADputs("");
1982 	}
1983 		SUBREADputs("");
1984 		SUBREADputs("");
1985 
1986 	for(ii=0;ii< 20; ii++)
1987 	{
1988 		for(jj=0; jj<20; jj++)
1989 			SUBREADprintf("%d\t",table_mask[ii][jj]);
1990 		SUBREADputs("");
1991 	}
1992 	#endif
1993 
1994 	out_pos = search_DP_branch(read, read_len, index, begin_position, path_i, j, table, table_mask, max_indel,  out_tmp , expected_offset,  current_score , out_pos, 0, init_read_offset,shutdown_read_offset, &all_steps);
1995 
1996 	if (out_pos)
1997 	{
1998 		memcpy(movement_buffer, out_tmp  + out_pos +1, 1499 - out_pos);
1999 		return 1499 - out_pos;
2000 	}
2001 	else return 0;
2002 
2003 }
2004 
extend_covered_region(gene_value_index_t * array_index,unsigned int read_start_pos,char * read,int read_len,int cover_start,int cover_end,int window_size,int req_match_5end,int req_match_3end,int indel_tolerance,int space_type,int tail_indel,short * head_indel_pos,int * head_indel_movement,short * tail_indel_pos,int * tail_indel_movement,int is_head_high_quality,char * qual_txt,int qual_format,float head_matching_rate,float tail_matching_rate)2005 int extend_covered_region(gene_value_index_t *array_index, unsigned int read_start_pos, char * read, int read_len, int cover_start, int cover_end, int window_size, int req_match_5end , int req_match_3end, int indel_tolerance, int space_type, int tail_indel, short * head_indel_pos, int * head_indel_movement, short * tail_indel_pos, int * tail_indel_movement, int is_head_high_quality, char * qual_txt, int qual_format, float head_matching_rate, float tail_matching_rate)
2006 {
2007 	int ret = 0;
2008 	*head_indel_pos = -1;
2009 	*tail_indel_pos = -1;
2010 	if (cover_start >= window_size && head_matching_rate < 1.0001)
2011 	{
2012 		int head_test_len =  cover_start;
2013 		int roughly_mapped = match_chro(read, array_index, read_start_pos, head_test_len , 0, space_type);
2014 		if (roughly_mapped >= head_test_len  * CORE_RECOVER_MATCHING_RATE - 0.0001)
2015 		{
2016 			ret |= 1;
2017 		}
2018 		else
2019 		{
2020 			int window_end_pos = cover_start + window_size-1;
2021 			int not_too_bad = 1;
2022 			int right_match_number=0;
2023 
2024 			while (1)
2025 			{
2026 				int matched_bases_in_window = match_chro_wronglen(read+ window_end_pos - window_size, array_index, read_start_pos + window_end_pos - window_size, window_size, space_type, NULL, &right_match_number);
2027 				int best_indel_movement = -1000;
2028 				int best_indel_pos = -1;
2029 
2030 				if (matched_bases_in_window >= req_match_5end) // this window is not bad enough so that an indel is considered
2031 					window_end_pos--;
2032 				else
2033 				{
2034 					roughly_mapped = match_chro(read, array_index, read_start_pos, window_end_pos - right_match_number , 0, space_type);
2035 					if (roughly_mapped < (int)(0.5+ (window_end_pos - right_match_number)  * CORE_RECOVER_MATCHING_RATE))
2036 					{
2037 						// the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
2038 						int indel_movement_i ;
2039 
2040 						int best_matched_bases_after_indel = -1;
2041 
2042 						not_too_bad = 0;
2043 						for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance-1 ; indel_movement_i ++)
2044 						{
2045 							int indel_movement = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
2046 							int test_length = window_end_pos /*- 1*/ - max(0, indel_movement) -  right_match_number;
2047 
2048 							if (test_length < window_size) continue;
2049 							//if (test_length <= 1+ abs(indel_movement)/4) continue;
2050 							//test_length = min(10, test_length);
2051 							if (abs(indel_movement) > indel_tolerance) continue;
2052 
2053 							int test_start =0;// window_end_pos - max(0, indel_movement) -  right_match_number - test_length;
2054 
2055 							int matched_bases_after_indel = match_chro_support(read +test_start, array_index, read_start_pos + indel_movement +test_start, test_length,0, space_type, qual_txt, qual_format);
2056 							float test_rate = head_matching_rate;
2057 
2058 							if(test_length < 3) test_rate = 1;
2059 
2060 							if(best_matched_bases_after_indel <matched_bases_after_indel  && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
2061 							{
2062 								not_too_bad = 1;
2063 
2064 								best_matched_bases_after_indel = matched_bases_after_indel;
2065 								best_indel_movement = indel_movement;
2066 								best_indel_pos = window_end_pos - right_match_number;
2067 
2068 
2069 
2070 								*head_indel_pos = best_indel_pos;
2071 								*head_indel_movement = best_indel_movement;
2072 							}
2073 						}
2074 						if(best_indel_pos<0) *head_indel_pos =  window_end_pos - right_match_number;
2075 						break;
2076 					}else window_end_pos--;
2077 				}
2078 				if (window_end_pos - window_size <= 0) break;
2079 			}
2080 			if(not_too_bad)
2081 				ret |=1;
2082 			//else
2083 				//*head_indel_pos = -1;
2084 		}
2085 	}
2086 	else ret |= 1;
2087 
2088 
2089 
2090 	if (cover_end <= read_len - window_size && tail_matching_rate < 1.0001)
2091 	{
2092 		int tail_test_len = read_len - cover_end;
2093 		int roughly_mapped = match_chro(read + cover_end, array_index, read_start_pos + cover_end + tail_indel, tail_test_len , 0, space_type);
2094 		if (roughly_mapped >= tail_test_len  * CORE_RECOVER_MATCHING_RATE - 0.0001)
2095 		{
2096 			ret |= 2;
2097 		}
2098 		else
2099 		{
2100 			int window_start_pos = cover_end - window_size +1;
2101 			int not_too_bad = 1;
2102 
2103 			while (1)
2104 			{
2105 				int left_match_number = 0;
2106 				int matched_bases_in_window = match_chro_wronglen(read+ window_start_pos, array_index, read_start_pos + window_start_pos + tail_indel, window_size, space_type, &left_match_number, NULL);
2107 				int best_indel_movement = -1000;
2108 				int best_indel_pos = -1;
2109 
2110 				if (matched_bases_in_window >= req_match_3end) // this window is not bad enough so that an indel is considered
2111 					window_start_pos++;
2112 				else
2113 				{
2114 					roughly_mapped = match_chro(read+window_start_pos + left_match_number, array_index, read_start_pos + window_start_pos + tail_indel + left_match_number, read_len - window_start_pos - left_match_number , 0, space_type);
2115 
2116 					if (roughly_mapped < (int)(0.5+(read_len - window_start_pos -left_match_number )  * CORE_RECOVER_MATCHING_RATE))
2117 					{
2118 						// the quality of this window is too low (at most 1 base is matched). I must consider if it is an indel.
2119 						int indel_movement_i ;
2120 						int best_matched_bases_after_indel = -1;
2121 						not_too_bad = 0;
2122 						for (indel_movement_i = 0; indel_movement_i < 2* indel_tolerance; indel_movement_i ++)
2123 						{
2124 
2125 							int indel_adjustment = (indel_movement_i+1)/2 * (indel_movement_i %2?1:-1);
2126 							int indel_movement = tail_indel + indel_adjustment;
2127 							int test_length = read_len - window_start_pos  - left_match_number + min(0, indel_adjustment);
2128 							//test_length = min(10, test_length);
2129 
2130 
2131 							if (test_length < window_size) continue;
2132 							//if (test_length <= 1 + abs(indel_movement)/4) continue;
2133 							if (abs(indel_movement) > indel_tolerance) continue;
2134 
2135 							int matched_bases_after_indel = match_chro_support(read + window_start_pos - min(0, indel_adjustment) + left_match_number, array_index, read_start_pos + window_start_pos  + max(0,indel_movement) +left_match_number , test_length,0, space_type, qual_txt +  window_start_pos - min(0, indel_adjustment) + left_match_number , qual_format);
2136 
2137 							float test_rate = tail_matching_rate;
2138 
2139 							if(test_length < 3) test_rate = 1;
2140 
2141 							if(best_matched_bases_after_indel <matched_bases_after_indel  && matched_bases_after_indel >= (int)(0.5+test_length * test_rate))
2142 							{
2143 								not_too_bad = 1;
2144 								best_matched_bases_after_indel = matched_bases_after_indel;
2145 								best_indel_movement = indel_movement;
2146 								best_indel_pos = window_start_pos + left_match_number ;//-1;
2147 								*tail_indel_movement = best_indel_movement ;
2148 							}
2149 						}
2150 
2151 						if(best_indel_pos<0)
2152 							*tail_indel_pos =  window_start_pos + left_match_number ;
2153 						else
2154 							*tail_indel_pos = best_indel_pos;
2155 						break;
2156 					}else window_start_pos++;
2157 				}
2158 				if (window_start_pos + window_size >= read_len) break;
2159 			}
2160 			if(not_too_bad)
2161 				ret |=2;
2162 			//else
2163 			//	*tail_indel_pos =-1;
2164 		}
2165 	}
2166 	else ret |= 2;
2167 
2168 
2169 	return ret;
2170 }
2171 
2172 
match_base_quality_cs(gene_value_index_t * array_index,char * read_txt,unsigned int pos,char * qual_txt,int read_len,int phred_version,int * high_qual_unmatch,int * all_mismatched,int ql_kill,int head_clipped,int tail_clipped)2173 float match_base_quality_cs(gene_value_index_t *array_index, char * read_txt,  unsigned int pos, char * qual_txt, int read_len, int phred_version, int * high_qual_unmatch, int * all_mismatched, int ql_kill, int head_clipped, int tail_clipped)
2174 {
2175 	int i;
2176 	int ret =0;
2177 	char lastch;
2178 	if(pos < array_index -> start_base_offset || pos + read_len >= array_index -> start_base_offset + array_index -> length){
2179 		//SUBREADprintf("WARNING: BASE INDEX OUT OF LIMIT: %u < %u < %u\n%s\n", array_index -> start_base_offset , pos, array_index -> start_base_offset + array_index -> length, read_txt);
2180 	//	exit(-1);
2181 		return (read_len - tail_clipped - head_clipped);
2182 	}
2183 	lastch = gvindex_get(array_index, pos);
2184 	for(i=head_clipped; i<read_len - tail_clipped; i++)
2185 	{
2186 		char nch = gvindex_get(array_index, pos+i+1);
2187 		int is_matched = read_txt[i] == '0'+chars2color(lastch, nch);
2188 		if(is_matched){
2189 			ret ++;
2190 		}else
2191 		{
2192 			(*all_mismatched)++;
2193 			(*high_qual_unmatch)++;
2194 			ret --;
2195 		}
2196 
2197 		lastch = nch;
2198 	}
2199 	return ret*1.;
2200 }
2201 
match_base_quality(gene_value_index_t * array_index,char * read_txt,unsigned int pos,char * qual_txt,int read_len,int is_negative,int phred_version,int * high_qual_unmatch,int * all_mismatched,int ql_kill,int head_clipped,int tail_clipped)2202 float match_base_quality(gene_value_index_t *array_index, char * read_txt,  unsigned int pos, char * qual_txt, int read_len, int is_negative, int phred_version, int * high_qual_unmatch, int * all_mismatched, int ql_kill, int head_clipped, int tail_clipped)
2203 {
2204 	int i;
2205 	int ret =0;
2206 	if(pos < array_index -> start_base_offset || pos + read_len >= array_index -> start_base_offset + array_index -> length){
2207 		//SUBREADprintf("WARNING: BASE INDEX OUT OF LIMIT: %u < %u < %u\n%s\n", array_index -> start_base_offset , pos, array_index -> start_base_offset + array_index -> length, read_txt);
2208 	//	exit(-1);
2209 		return (read_len - tail_clipped - head_clipped);
2210 	}
2211 	for(i=head_clipped; i<read_len - tail_clipped; i++)
2212 	{
2213 		char true_chr;
2214 		if(is_negative)
2215 		{
2216 			true_chr = gvindex_get(array_index, pos + read_len - i - 1);
2217 			if(true_chr == 'A') true_chr = 'T';
2218 			else if(true_chr == 'G') true_chr = 'C';
2219 			else if(true_chr == 'C') true_chr = 'G';
2220 			else  true_chr = 'A';
2221 		}
2222 		else
2223 			true_chr = gvindex_get(array_index, pos + i);
2224 
2225 		//SUBREADprintf("%c vs %c\n", true_chr , read_txt[i]);
2226 
2227 		if (true_chr == read_txt[i])
2228 		{
2229 			if(!qual_txt)
2230 				ret += 1000000;
2231 			else if(FASTQ_PHRED64 == phred_version)
2232 				ret += (1000000-get_base_error_prob64i(qual_txt[i]));
2233 			else
2234 				ret += (1000000-get_base_error_prob33i(qual_txt[i]));
2235 		}
2236 		else
2237 		{
2238 			(*all_mismatched)++;
2239 			if(!qual_txt)
2240 			{
2241 				ret -= 1000000;
2242 				(*high_qual_unmatch)++;
2243 			}
2244 			else
2245 			{
2246 				int ql ;
2247 				if(FASTQ_PHRED64 == phred_version)
2248 					ql = get_base_error_prob64i(qual_txt[i]);
2249 				else
2250 					ql = get_base_error_prob33i(qual_txt[i]);
2251 				/*
2252 				#ifdef QUALITY_KILL
2253 					#if QUALITY_KILL > 196
2254 						#define QL_MIN 999000
2255 					#endif
2256 				#endif
2257 
2258 				#ifndef QL_MIN
2259 					#define QL_MIN 200000
2260 				#endif*/
2261 
2262 				if( ql < ql_kill) (*high_qual_unmatch)++;
2263 
2264 				ret += ql-1000000;
2265 			}
2266 		}
2267 	}
2268 	//printf ("SECTION QUAL = %d LEN = %d\n", ret, read_len);
2269 	return ret/1000000.;
2270 }
2271 
final_mapping_quality(gene_value_index_t * array_index,unsigned int pos,char * read_txt,char * qual_txt,char * cigar_txt,int phred_version,int * mismatch,int rl,char * refined_cigar,unsigned int * new_pos)2272 float final_mapping_quality(gene_value_index_t *array_index, unsigned int pos, char * read_txt, char * qual_txt, char * cigar_txt, int phred_version, int * mismatch, int rl, char * refined_cigar, unsigned int * new_pos)
2273 {
2274 	int cigar_cursor = 0;
2275 	int read_cursor = 0;
2276 	long long int chromosome_cursor = pos, x;
2277 	int cigar_length = strlen(cigar_txt);
2278 	int i;
2279 	int ret = 0;
2280 
2281 	x= 0;
2282 	while(cigar_cursor < cigar_length)
2283 	{
2284 		if(cigar_txt[cigar_cursor] =='X')
2285 		{
2286 			cigar_cursor++;
2287 			continue;
2288 		}
2289 		if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2290 			x = x*10+ (cigar_txt[cigar_cursor]-'0');
2291 		else
2292 		{
2293 			if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2294 			{
2295 				int all_MM=0;
2296 				float nret = match_base_quality(array_index, read_txt + read_cursor, chromosome_cursor , (qual_txt && qual_txt[0])?qual_txt + read_cursor:NULL, x, 0, phred_version, mismatch, &all_MM, 200000,0,0);
2297 				//printf ("%s: Q=%.6f; L=%d ; POS=%u\n",  read_txt + read_cursor, nret, x, chromosome_cursor);
2298 
2299 				ret += (int)(nret*1000000);
2300 				chromosome_cursor +=x;
2301 				read_cursor +=x;
2302 			}
2303 			else if(cigar_txt[cigar_cursor] == 'I')
2304 			{
2305 				if(!qual_txt)
2306 				{
2307 					ret += x*1000000;
2308 					read_cursor +=x;
2309 				}
2310 				else if(FASTQ_PHRED64 == phred_version)
2311 				{
2312 					for(i = 0; i<x; i++)
2313 					{
2314 						char qchar = qual_txt[read_cursor++];
2315 						ret += (1000000-get_base_error_prob64i(qchar));
2316 					}
2317 				}
2318 				else
2319 				{
2320 					for(i = 0; i<x; i++)
2321 					{
2322 						char qchar = qual_txt[read_cursor++];
2323 						ret += (1000000-get_base_error_prob33i(qchar));
2324 					}
2325 				}
2326 			}
2327 			else if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2328 				chromosome_cursor +=x;
2329 			else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2330 				chromosome_cursor -=x;
2331 
2332 			x= 0;
2333 		}
2334 		cigar_cursor++;
2335 	}
2336 	if(read_cursor != rl)
2337 	{
2338 		*mismatch = 9999;
2339 		return 0.;
2340 	}
2341 	//printf("S=%.5f, LEN=%d\n", ret , read_cursor);
2342 
2343 	float match_rate = (ret/10000.) / read_cursor+100.;
2344 
2345 	if(refined_cigar && match_rate < 190)
2346 	{
2347 		char read_data[1250];
2348 		chromosome_cursor = pos;
2349 		read_cursor = 0;
2350 		cigar_cursor = 0;
2351 		x=0;
2352 
2353 		while(cigar_cursor < cigar_length)
2354 		{
2355 			if(cigar_txt[cigar_cursor] =='X')
2356 			{
2357 				cigar_cursor++;
2358 				continue;
2359 			}
2360 			if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2361 				x = x*10+ (cigar_txt[cigar_cursor]-'0');
2362 			else
2363 			{
2364 				if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2365 				{
2366 					gvindex_get_string(read_data + read_cursor  ,  array_index ,  chromosome_cursor , x , 0);
2367 					chromosome_cursor +=x;
2368 					read_cursor +=x;
2369 				}
2370 				else if(cigar_txt[cigar_cursor] == 'I')
2371 				{
2372 					int xk;
2373 					for(xk=read_cursor;xk<read_cursor+x; xk++)
2374 						read_data[cigar_cursor] = 'I';
2375 					read_cursor +=x;
2376 				}
2377 				else if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2378 					chromosome_cursor +=x;
2379 				else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2380 					chromosome_cursor -=x;
2381 
2382 				x= 0;
2383 			}
2384 			cigar_cursor++;
2385 		}
2386 
2387 		int window_matched = 0;
2388 		int last_confirmed_read_pos = 99999;
2389 		int REFINE_WINDOW_SIZE = 10;
2390 		for(read_cursor = - REFINE_WINDOW_SIZE; read_cursor < rl - REFINE_WINDOW_SIZE; read_cursor++)
2391 		{
2392 
2393 //			printf("MMM %c =?= %c\n", read_txt[read_cursor+REFINE_WINDOW_SIZE] , read_data[read_cursor+REFINE_WINDOW_SIZE]);
2394 			window_matched += (read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE] || read_txt[read_cursor+REFINE_WINDOW_SIZE] =='I');
2395 			if(read_cursor>=0)
2396 			{
2397 				if(window_matched >= REFINE_WINDOW_SIZE -1)
2398 					if(read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE])
2399 						last_confirmed_read_pos = read_cursor+REFINE_WINDOW_SIZE+1;
2400 				window_matched -= (read_txt[read_cursor] == read_data[read_cursor] || read_txt[read_cursor] == 'I');
2401 			}
2402 		}
2403 
2404 
2405 		window_matched = 0;
2406 		int first_confirmed_read_pos = 99999;
2407 		for(read_cursor = rl -1 ; read_cursor >=0 ; read_cursor--)
2408 		{
2409 
2410 			window_matched += (read_txt[read_cursor] == read_data[read_cursor]|| read_txt[read_cursor] == 'I');
2411 			if(read_cursor <= rl- REFINE_WINDOW_SIZE - 1)
2412 			{
2413 				if(window_matched >= REFINE_WINDOW_SIZE -1)
2414 					if(read_txt[read_cursor] == read_data[read_cursor])
2415 						first_confirmed_read_pos = read_cursor;
2416 				window_matched -= (read_txt[read_cursor+REFINE_WINDOW_SIZE] == read_data[read_cursor+REFINE_WINDOW_SIZE] || read_txt[read_cursor + REFINE_WINDOW_SIZE] == 'I');
2417 			}
2418 		}
2419 
2420 
2421 		cigar_cursor = 0;
2422 		read_cursor = 0;
2423 		refined_cigar[0]=0;
2424 		x=0;
2425 		int begin_copy  =0;
2426 		chromosome_cursor = pos;
2427 
2428 		if(first_confirmed_read_pos<99990 && last_confirmed_read_pos<99990)
2429 		{
2430 			if(first_confirmed_read_pos>0)
2431 				sprintf(refined_cigar,"%dS",first_confirmed_read_pos);
2432 			while(cigar_cursor < cigar_length)
2433 			{
2434 				if(cigar_txt[cigar_cursor] =='X')
2435 				{
2436 					cigar_cursor++;
2437 					continue;
2438 				}
2439 				if(cigar_txt[cigar_cursor]>='0' && cigar_txt[cigar_cursor]<='9')
2440 					x = x*10+ (cigar_txt[cigar_cursor]-'0');
2441 				else
2442 				{
2443 					if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S' || cigar_txt[cigar_cursor] == 'I')
2444 					{
2445 						int out_len = 0;
2446 						if(read_cursor < first_confirmed_read_pos && read_cursor + x >= first_confirmed_read_pos)
2447 							out_len = read_cursor + x - first_confirmed_read_pos;
2448 						else if(first_confirmed_read_pos <= read_cursor)
2449 							out_len = x;
2450 
2451 						if(out_len)
2452 						{
2453 							if(!begin_copy)
2454 								*new_pos = chromosome_cursor - read_cursor;
2455 							if(read_cursor >= last_confirmed_read_pos) out_len=0;
2456 							else if(read_cursor < last_confirmed_read_pos && read_cursor +x>last_confirmed_read_pos)
2457 								out_len -= (read_cursor +x - last_confirmed_read_pos);
2458 						}
2459 
2460 						if(out_len)
2461 						{
2462 							sprintf(refined_cigar+strlen(refined_cigar), "%d%c", out_len, cigar_txt[cigar_cursor]);
2463 							begin_copy=1;
2464 						}
2465 						if(cigar_txt[cigar_cursor] == 'M' || cigar_txt[cigar_cursor] == 'S')
2466 							chromosome_cursor +=x;
2467 
2468 						read_cursor +=x;
2469 						if(read_cursor >= last_confirmed_read_pos) break;
2470 					}
2471 					else
2472 					{
2473 						if(begin_copy)
2474 							#ifdef __MINGW32__
2475 							sprintf(refined_cigar+strlen(refined_cigar), "%I64d%c", x, cigar_txt[cigar_cursor]);
2476 							#else
2477 							sprintf(refined_cigar+strlen(refined_cigar), "%lld%c", x, cigar_txt[cigar_cursor]);
2478 							#endif
2479 
2480 						if(cigar_txt[cigar_cursor] == 'D' || cigar_txt[cigar_cursor] == 'N'|| cigar_txt[cigar_cursor] == 'j' || cigar_txt[cigar_cursor] == 'J')
2481 							chromosome_cursor +=x;
2482 						else if(cigar_txt[cigar_cursor] == 'B' || cigar_txt[cigar_cursor] == 'b')
2483 							chromosome_cursor -=x;
2484 
2485 
2486 					}
2487 
2488 					x= 0;
2489 				}
2490 				cigar_cursor++;
2491 			}
2492 			if(last_confirmed_read_pos<=rl-1)
2493 				sprintf(refined_cigar+strlen(refined_cigar),"%dS",rl-last_confirmed_read_pos);
2494 		}
2495 		else if(new_pos)*new_pos=pos;
2496 	}
2497 	else if(new_pos)*new_pos=pos;
2498 
2499 	return match_rate;
2500 }
2501 
2502 
print_votes(gene_vote_t * vote,char * index_prefix)2503 void print_votes(gene_vote_t * vote, char *index_prefix)
2504 {
2505 
2506 	gene_offset_t offsets;
2507 	int i,j;
2508 	char * chrname = NULL;
2509 	int chrpos = 0;
2510 	load_offsets (&offsets, index_prefix);
2511 	//locate_gene_position(vote -> max_position, &offsets, &chrname, &chrpos);
2512 
2513 	SUBREADprintf(" ==========   Max votes = %d   ==========\n", vote->max_vote);// , Position is %s,%u\n", vote->max_vote, chrname, chrpos );
2514 	for (i=0; i<GENE_VOTE_TABLE_SIZE; i++)
2515 		for(j=0; j< vote->items[i]; j++)
2516 		{
2517 			locate_gene_position(vote -> pos[i][j], &offsets, &chrname, &chrpos);
2518 			int toli = vote->toli [i][j];
2519 			int last_offset = vote -> indel_recorder[i][j][toli+2];
2520 			SUBREADprintf("  %s\tVote = %d , Position is %s,%d (+%u) Coverage is (%d, %d) Indel %d %s (%d)\n", vote->votes[i][j] == vote->max_vote?"***":"   ", vote->votes[i][j] , chrname, chrpos, vote -> pos[i][j], vote -> coverage_start[i][j], vote -> coverage_end[i][j],last_offset, (vote -> masks[i][j])?"NEG":"POS", vote -> masks[i][j]);
2521 
2522 			if(1){
2523 				int k;
2524 				for(k=0; k<=toli; k+=3){
2525 					SUBREADprintf("    %d - %d : D=%d    ",  vote -> indel_recorder[i][j][k], vote -> indel_recorder[i][j][k+1], vote -> indel_recorder[i][j][k+2]);
2526 				}
2527 				SUBREADputs("");
2528 			}
2529 		}
2530 
2531 
2532 }
2533 
2534 
2535 static float PROB_QUAL_TABLE[] = {1.000000000 , 0.794328235 , 0.630957344 , 0.501187234 , 0.398107171 , 0.316227766 , 0.251188643 , 0.199526231 , 0.158489319 , 0.125892541 , 0.100000000 , 0.079432823 , 0.063095734 , 0.050118723 , 0.039810717 , 0.031622777 , 0.025118864 , 0.019952623 , 0.015848932 , 0.012589254 , 0.010000000 , 0.007943282 , 0.006309573 , 0.005011872 , 0.003981072 , 0.003162278 , 0.002511886 , 0.001995262 , 0.001584893 , 0.001258925 , 0.001000000 , 0.000794328 , 0.000630957 , 0.000501187 , 0.000398107 , 0.000316228 , 0.000251189 , 0.000199526 , 0.000158489 , 0.000125893 , 0.000100000 , 0.000079433 , 0.000063096 , 0.000050119 , 0.000039811 , 0.000031623 , 0.000025119 , 0.000019953 , 0.000015849 , 0.000012589 , 0.000010000 , 0.000007943 , 0.000006310 , 0.000005012 , 0.000003981 , 0.000003162 , 0.000002512 , 0.000001995 , 0.000001585 , 0.000001259 , 0.000001000 , 0.000000794 , 0.000000631 , 0.000000501};
2536 
2537 
2538 
2539 
get_base_error_prob33(char v)2540 float get_base_error_prob33(char v)
2541 {
2542 	return PROB_QUAL_TABLE[v-'!'];
2543 }
get_base_error_prob64(char v)2544 float get_base_error_prob64(char v)
2545 {
2546 	return PROB_QUAL_TABLE[v-'@'];
2547 }
2548 
2549 static int PROB_QUAL_INT_TABLE[] = { 1000000 , 794328 , 630957 , 501187 , 398107 , 316227 , 251188 , 199526 , 158489 , 125892 , 100000 , 79432 , 63095 , 50118 , 39810 , 31622 , 25118 , 19952 , 15848 , 12589 , 10000 , 7943 , 6309 , 5011 , 3981 , 3162 , 2511 , 1995 , 1584 , 1258 , 1000 , 794 , 630 , 501 , 398 , 316 , 251 , 199 , 158 , 125 , 100 , 79 , 63 , 50 , 39 , 31 , 25 , 19 , 15 , 12 , 10 , 7 , 6 , 5 , 3 , 3 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , };
2550 
2551 
get_base_error_prob33i(char v)2552 int get_base_error_prob33i(char v)
2553 {
2554 	return PROB_QUAL_INT_TABLE[v-'!'];
2555 }
get_base_error_prob64i(char v)2556 int get_base_error_prob64i(char v)
2557 {
2558 	return PROB_QUAL_INT_TABLE[v-'@'];
2559 }
2560 
2561 #ifdef SKIP_THIS_PART
bad_reverse_cigar(char * cigar)2562 void bad_reverse_cigar(char * cigar)
2563 {
2564 	int cigar_cursor = 0;
2565 	srInt_64 tmpv=0;
2566 	char ncg[100];
2567 	ncg[0]=0;
2568 	while(cigar[cigar_cursor])
2569 	{
2570 		char cc = cigar[cigar_cursor];
2571 		if(isdigit(cc))
2572 		{
2573 			tmpv=tmpv*10+(cc-'0');
2574 		}
2575 		else if((cc>='A'&&cc<='Z')||(cc>='a'&&cc<='z'))
2576 		{
2577 			char ncg2[103];
2578 			#ifdef __MINGW32__
2579 			sprintf(ncg2, "%I64d%c", tmpv, cc);
2580 			#else
2581 			sprintf(ncg2, "%lld%c", tmpv, cc);
2582 			#endif
2583 			strncat(ncg2, ncg, 99);
2584 			strncpy(ncg, ncg2, 99);
2585 			tmpv=0;
2586 		}
2587 		else
2588 		{
2589 			char ncg2[103];
2590 			sprintf(ncg2, "%c%s",cc,ncg);
2591 			strncpy(ncg, ncg2, 99);
2592 			tmpv=0;
2593 		}
2594 		cigar_cursor++;
2595 	}
2596 	strcpy(cigar, ncg);
2597 }
2598 
2599 #ifdef TEST_WHATCIGARREVERSE
main()2600 int main()
2601 #else
2602 int debug_main()
2603 #endif
2604 {
2605 	char cg[100];
2606 	sprintf(cg,"10M8H20M9I100N30M");
2607 	bad_reverse_cigar(cg);
2608 	SUBREADprintf("%s\n",cg);
2609 	return 0;
2610 }
2611 #endif
2612 
remove_indel_neighbours(HashTable * indel_table)2613 void remove_indel_neighbours(HashTable * indel_table)
2614 {
2615 
2616 	unsigned int * to_delete;
2617 	char * to_delete_len;
2618 	int xi;
2619 	KeyValuePair * cursor;
2620 	return;
2621 
2622 	to_delete=(unsigned int* )malloc(sizeof(int)*400000);
2623 	to_delete_len=(char* )malloc(sizeof(char)*400000);
2624 	int num_delete=0, bucket;
2625 
2626 	for(bucket=0; bucket<indel_table -> numOfBuckets; bucket++)
2627 	{
2628 		cursor = indel_table -> bucketArray[bucket];
2629 		while (1)
2630 		{
2631 			if(!cursor) break;
2632 			indel_record_t * p = (indel_record_t *) cursor -> key;
2633 			indel_result_t * pv = (indel_result_t *)cursor -> value;
2634 
2635 			indel_record_t p_nb;
2636 			p_nb.len = p->len;
2637 
2638 			for(xi=-5;xi<=5;xi++)
2639 			{
2640 				if(!xi)continue;
2641 
2642 				p_nb.pos = p->pos+xi;
2643 				indel_result_t * p_nbv = (indel_result_t *)HashTableGet(indel_table, &p_nb);
2644 				if(p_nbv && (p_nbv->support > pv->support || (p_nbv->support == pv->support && xi>0)))
2645 				{
2646 					if(num_delete<399999)
2647 					{
2648 						to_delete_len[num_delete]=p->len;
2649 						to_delete[num_delete++]= p->pos+xi;
2650 					}
2651 					break;
2652 				}
2653 			}
2654 
2655 			cursor = cursor->next;
2656 		}
2657 	}
2658 
2659 
2660 	for(xi=0;xi<num_delete;xi++)
2661 	{
2662 		indel_record_t p;
2663 		p.len = to_delete_len[xi];
2664 		p.pos = to_delete[xi];
2665 		HashTableRemove(indel_table, &p);
2666 	}
2667 
2668 
2669 	SUBREADprintf("\n %d low-confidence indels have been removed.", num_delete);
2670 	free(to_delete);
2671 	free(to_delete_len);
2672 
2673 }
2674 
print_version_info()2675 void print_version_info()
2676 {
2677 	SUBREADprintf("\nSubread %s\n", SUBREAD_VERSION);
2678 	SUBREADprintf("\n");
2679 }
2680 
fc_strcmp_chro(const void * s1,const void * s2)2681 int fc_strcmp_chro(const void * s1, const void * s2)
2682 {
2683 
2684 //	// we can compare the 3-th and 4-th bytes because we know that the buffers have enough lengths.
2685 //	if(((char *)s1)[4] != ((char *)s2)[4] || ((char *)s1)[3] != ((char *)s2)[3] )
2686 //		return 1;
2687 	return strcmp((char*)s1, (char*)s2);
2688 }
2689 
2690 
2691 
fc_chro_hash(const void * key)2692 srUInt_64 fc_chro_hash(const void *key) {
2693 	const unsigned char *str = (const unsigned char *) key;
2694 
2695 	int xk1;
2696 	unsigned long hashValue = 0;
2697 
2698 	for(xk1=0; str[xk1]; xk1++)
2699 	{
2700 		unsigned long ch = str[xk1];
2701 		hashValue += (ch + xk1) << (ch & 0xf);
2702 	}
2703 
2704 
2705 	return hashValue;
2706 }
2707 
2708 
2709