1 /***************************************************************
2
3 The Subread software package is free software package:
4 you can redistribute it and/or modify it under the terms
5 of the GNU General Public License as published by the
6 Free Software Foundation, either version 3 of the License,
7 or (at your option) any later version.
8
9 Subread is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty
11 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 Authors: Drs Yang Liao and Wei Shi
16
17 ***************************************************************/
18
19 /***************************************************************
20
21 The ASCII Art used in this file was generated using FIGlet and
22 the big font, contributed by Glenn Chappell to FIGlet.
23
24 ***************************************************************/
25
26 #include <stdio.h>
27 #include <assert.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <stdarg.h>
31 #include <time.h>
32 #include <sys/time.h>
33 #include <getopt.h>
34 #include <sys/types.h>
35 #ifndef __MINGW32__
36 #include <sys/resource.h>
37 #endif
38 #include <unistd.h>
39 #include <sys/stat.h>
40 #include <locale.h>
41 #include <ctype.h>
42
43
44
45 #include "subread.h"
46 #include "sublog.h"
47 #include "core.h"
48 #include "input-files.h"
49 #include "input-blc.h"
50 #include "sorted-hashtable.h"
51 #include "HelperFunctions.h"
52
53 #include "core-bigtable.h"
54 #include "core-indel.h"
55 #include "core-junction.h"
56
57 static struct option long_options[] =
58 {
59 {"memory-optimisation", required_argument, 0, 0},
60 {0, 0, 0, 0}
61 };
62
63 int (*progress_report_callback)(int, int, int);
64
core_version_number(char * program)65 void core_version_number(char * program)
66 {
67 SUBREADprintf("\n%s v%s\n\n" , program, SUBREAD_VERSION);
68 }
69
is_valid_float(char * optarg,char * optname)70 int is_valid_float(char * optarg, char * optname){
71 int xk1=0;
72 while(1){
73 int nch = optarg[xk1++];
74 if(!nch){
75 if(xk1 == 1){
76 SUBREADprintf("Value for argumant %s-%s is missing.\n", optname[1]?"-":"", optname);
77 return 0;
78 }
79 break;
80 }
81 if(( nch!='-' || xk1 > 1 ) && nch != '.' && !isdigit(nch)){
82 SUBREADprintf("Value for argumant %s-%s is not a valid number: '%s'\n", optname[1]?"-":"", optname, optarg);
83 return 0;
84 }
85 }
86 return 1;
87 }
88
is_valid_digit(char * optarg,char * optname)89 int is_valid_digit(char * optarg, char * optname){
90 int xk1=0;
91 while(1){
92 int nch = optarg[xk1++];
93 if(!nch){
94 if(xk1 == 1){
95 SUBREADprintf("Value for argumant %s-%s is missing.\n", optname[1]?"-":"", optname);
96 return 0;
97 }
98 break;
99 }
100 if(( nch!='-' || xk1 > 1 ) && !isdigit(nch)){
101 SUBREADprintf("Value for argumant %s-%s is not a valid number: '%s'\n", optname[1]?"-":"", optname, optarg);
102 return 0;
103 }
104 }
105 return 1;
106 }
107
is_valid_digit_range(char * optarg,char * optname,int min,int max_inc)108 int is_valid_digit_range(char * optarg, char * optname, int min, int max_inc){
109 if(!is_valid_digit( optarg, optname )) return 0;
110
111 int tv = atoi(optarg);
112 if(tv < min || tv > max_inc){
113 SUBREADprintf("Value for argumant %s-%s is out of range: %d to %d\n", optname[1]?"-":"", optname, min, max_inc);
114 return 0;
115 }
116 return 1;
117 }
118
warning_file_limit()119 void warning_file_limit()
120 {
121 #ifndef __MINGW32__
122 struct rlimit limit_st;
123 getrlimit(RLIMIT_NOFILE, & limit_st);
124
125 {
126 if(min(limit_st.rlim_cur , limit_st.rlim_max) < 400)
127 {
128 print_in_box(80,0,0,"WARNING This operation needs to open many files at same time,");
129 print_in_box(80,0,0," but your OS only allows to open %d files.", min(limit_st.rlim_cur , limit_st.rlim_max));
130 print_in_box(80,0,0," You can use command 'ulimit -n 500' to raise this limit");
131 print_in_box(80,0,0," to 500, or the program may become very slow.");
132 print_in_box(80,0,0,"");
133 }
134 }
135 #endif
136 }
137
exec_cmd(char * cmd,char * outstr,int out_limit)138 int exec_cmd(char * cmd, char * outstr, int out_limit){
139 FILE* pipe = popen(cmd, "r");
140 if (!pipe) return -1;
141
142 outstr[0]=0;
143 char * linebuf = malloc(3000);
144 int out_ptr = 0;
145
146 while (!feof(pipe)) {
147 if (fgets(linebuf, 128, pipe) != NULL){
148 if(out_ptr + strlen(linebuf) < out_limit){
149 strcat(outstr+out_ptr, linebuf);
150 out_ptr += strlen(linebuf);
151 }
152 }
153 }
154 pclose(pipe);
155 free(linebuf);
156 return out_ptr;
157 }
158
print_in_box(int line_width,int is_boundary,int options,char * pattern,...)159 void print_in_box(int line_width, int is_boundary, int options, char * pattern,...)
160 {
161 int put_color_for_colon, is_center, is_wrapped;
162 va_list args;
163 va_start(args , pattern);
164 char * content, *out_line_buff;
165
166 put_color_for_colon = (options & PRINT_BOX_NOCOLOR_FOR_COLON)?0:1;
167 is_center = (options & PRINT_BOX_CENTER)?1:0;
168 is_wrapped = (options & PRINT_BOX_WRAPPED)?1:0;
169
170 content= malloc(1200);
171 int content_len = vsprintf(content, pattern, args);
172 out_line_buff= malloc(1200);
173 out_line_buff[0]=0;;
174
175 if(is_wrapped){
176 int seg_i = 0;
177 for(seg_i=0; seg_i < content_len; seg_i += line_width-7){
178 strcpy(out_line_buff, content + seg_i);
179 out_line_buff[line_width-7] = 0;
180
181 print_in_box(line_width, is_boundary, options & (~PRINT_BOX_WRAPPED), out_line_buff);
182 }
183 }else{
184 int is_R_code,x1,content_len = strlen(content), state, txt_len, is_cut = 0, real_lenwidth;
185
186 is_R_code = 1;
187 #ifdef MAKE_STANDALONE
188 is_R_code = 0;
189 #endif
190
191 if(content_len>0&&content[content_len-1]=='\r'){
192 content_len--;
193 content[content_len] = 0;
194 }
195
196 if(content_len>0&&content[content_len-1]=='\n'){
197 content_len--;
198 content[content_len] = 0;
199 }
200
201 state = 0;
202 txt_len = 0;
203 real_lenwidth = line_width;
204 for(x1 = 0; content [x1]; x1++)
205 {
206 char nch = content [x1];
207 if(nch == CHAR_ESC)
208 state = 1;
209 if(state){
210 real_lenwidth --;
211 }else{
212 txt_len++;
213
214 if(txt_len == 80 - 6)
215 {
216 is_cut = 1;
217 }
218 }
219
220 if(nch == 'm' && state)
221 state = 0;
222 }
223
224 if(is_cut)
225 {
226 state = 0;
227 txt_len = 0;
228 for(x1 = 0; content [x1]; x1++)
229 {
230 char nch = content [x1];
231 if(nch == CHAR_ESC)
232 state = 1;
233 if(!state){
234 txt_len++;
235 if(txt_len == 80 - 9)
236 {
237 strcpy(content+x1, "\x1b[0m ...");
238 content_len = line_width - 4;
239 content_len = 80 - 4;
240 line_width = 80;
241 break;
242 }
243 }
244 if(nch == 'm' && state)
245 state = 0;
246 }
247 }
248
249 if(content_len==0 && is_boundary)
250 {
251 strcat(out_line_buff,is_boundary==1?"//":"\\\\");
252 for(x1=0;x1<line_width-4;x1++)
253 strcat(out_line_buff,"=");
254 strcat(out_line_buff,is_boundary==1?"\\\\":"//");
255 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, "%s", out_line_buff);
256
257 free(content);
258 free(out_line_buff);
259 return;
260 }
261 else if(is_boundary)
262 {
263 int left_stars = (line_width - content_len)/2 - 1;
264 int right_stars = line_width - content_len - 2 - left_stars;
265 strcat(out_line_buff,is_boundary==1?"//":"\\\\");
266 for(x1=0;x1<left_stars-2;x1++) strcat(out_line_buff,"=");
267 sprintf(out_line_buff+strlen(out_line_buff),"%c[36m", CHAR_ESC);
268 sprintf(out_line_buff+strlen(out_line_buff)," %s ", content);
269 sprintf(out_line_buff+strlen(out_line_buff),"%c[0m", CHAR_ESC);
270 for(x1=0;x1<right_stars-2;x1++) strcat(out_line_buff,"=");
271 strcat(out_line_buff,is_boundary==1?"\\\\":"//");
272 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, "%s", out_line_buff);
273
274 free(content);
275 free(out_line_buff);
276 return;
277 }
278
279 int right_spaces, left_spaces;
280 if(is_center)
281 left_spaces = (line_width - content_len)/2-2;
282 else
283 left_spaces = 1;
284
285 right_spaces = line_width - 4 - content_len- left_spaces;
286
287 char spaces[81];
288 memset(spaces , ' ', 80);
289 spaces[0]='|';
290 spaces[1]='|';
291 spaces[80]=0;
292
293 //sublog_fwrite(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO,"||");
294
295 //for(x1=0;x1<left_spaces;x1++) sublog_fwrite(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO," ");
296
297 spaces[left_spaces+2] = 0;
298 strcat(out_line_buff,spaces);
299
300 if(is_R_code)
301 {
302 strcat(out_line_buff,content);
303 }
304 else
305 {
306 int col1w=-1;
307 for(x1=0; content[x1]; x1++)
308 {
309 if(content[x1]==':')
310 {
311 col1w=x1;
312 break;
313 }
314 }
315 if(col1w>0 && col1w < content_len-1 && put_color_for_colon)
316 {
317 content[col1w+1]=0;
318 strcat(out_line_buff,content);
319 strcat(out_line_buff," ");
320 sprintf(out_line_buff+strlen(out_line_buff),"%c[36m", CHAR_ESC);
321 strcat(out_line_buff,content+col1w+2);
322 sprintf(out_line_buff+strlen(out_line_buff),"%c[0m", CHAR_ESC);
323 }
324 else
325 strcat(out_line_buff,content);
326 }
327 // for(x1=0;x1<right_spaces - 1;x1++) sublog_fwrite(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO," ");
328
329 memset(spaces , ' ', 80);
330 spaces[79]='|';
331 spaces[78]='|';
332
333 right_spaces = max(1,right_spaces);
334 sprintf(out_line_buff+strlen(out_line_buff)," %c[0m%s", CHAR_ESC , spaces + (78 - right_spaces + 1));
335 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, out_line_buff);
336 }
337 free(out_line_buff);
338 free(content);
339 }
340
341
342
343
show_summary(global_context_t * global_context)344 int show_summary(global_context_t * global_context)
345 {
346
347 #ifndef MAKE_STANDALONE
348 // SAVE summary into files. This is only for Rsubread currently.
349 char sumname[MAX_FILE_NAME_LENGTH+30];
350 sprintf(sumname, "%s.summary", global_context->config.output_prefix);
351 FILE * sumfp = fopen(sumname,"w");
352 #ifdef __MINGW32__
353 fprintf(sumfp, "Total_%s\t%I64u\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_processed_reads);
354 #else
355 fprintf(sumfp, "Total_%s\t%llu\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_processed_reads);
356 #endif
357 fprintf(sumfp, "Mapped_%s\t%u\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_mapped_reads);
358 fprintf(sumfp, "Uniquely_mapped_%s\t%u\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_uniquely_mapped_reads);
359 fprintf(sumfp, "Multi_mapping_%s\t%u\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_multimapping_reads);
360 fprintf(sumfp, "Unmapped_%s\t%u\n", global_context->input_reads.is_paired_end_reads?"fragments":"reads" , global_context -> all_unmapped_reads);
361
362 if(global_context->input_reads.is_paired_end_reads){
363 #ifdef __MINGW32__
364 fprintf(sumfp, "Properly_paired_fragments\t%I64u\n",global_context -> all_correct_PE_reads);
365 #else
366 fprintf(sumfp, "Properly_paired_fragments\t%llu\n",global_context -> all_correct_PE_reads);
367 #endif
368 fprintf(sumfp, "Singleton_fragments\t%u\n", global_context -> not_properly_pairs_only_one_end_mapped);
369 fprintf(sumfp, "More_than_one_chr_fragments\t%u\n", global_context -> not_properly_pairs_different_chro);
370 fprintf(sumfp, "Unexpected_strandness_fragments\t%u\n", global_context -> not_properly_different_strands);
371 fprintf(sumfp, "Unexpected_template_length\t%u\n", global_context -> not_properly_pairs_TLEN_wrong);
372 fprintf(sumfp, "Inversed_mapping\t%u\n", global_context -> not_properly_pairs_wrong_arrangement);
373 }
374
375 if(global_context->config.entry_program_name == CORE_PROGRAM_SUBJUNC && ( global_context -> config.prefer_donor_receptor_junctions || !(global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection)))
376 fprintf(sumfp, "Junctions\t%u\n", global_context -> all_junctions);
377 fprintf(sumfp, "Indels\t%u\n", global_context -> all_indels);
378
379
380 fclose(sumfp);
381 #endif
382
383 if(progress_report_callback)
384 {
385 long long int all_reads_K = global_context -> all_processed_reads / 1000;
386 float mapped_reads_percentage = global_context -> all_mapped_reads * 1./global_context -> all_processed_reads;
387 if(global_context->input_reads.is_paired_end_reads) mapped_reads_percentage/=2;
388 progress_report_callback(10, 900000, (int) (miltime()-global_context->start_time));
389 progress_report_callback(10, 900010, (int) all_reads_K);
390 progress_report_callback(10, 900011, (int) (10000.*mapped_reads_percentage));
391 }
392
393 print_in_box(80,0,1," ");
394 print_in_box(89,0,1," %c[36mCompleted successfully.%c[0m", CHAR_ESC, CHAR_ESC);
395 print_in_box(80,0,1," ");
396 print_in_box(80,2,1," ");
397 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, "");
398 print_in_box(80, 1,1," Summary");
399 print_in_box(80, 0,1," ");
400
401 #ifdef __MINGW32__
402 if(global_context->input_reads.is_paired_end_reads)
403 print_in_box(80, 0,0," Total fragments : %I64d" , global_context -> all_processed_reads);
404 else
405 print_in_box(80, 0,0," Total reads : %I64d" , global_context -> all_processed_reads);
406
407 print_in_box(81, 0,0," Mapped : %u (%.1f%%%%)", global_context -> all_mapped_reads, global_context -> all_mapped_reads*100.0 / global_context -> all_processed_reads);
408 print_in_box(80, 0,0," Uniquely mapped : %u", global_context -> all_uniquely_mapped_reads);
409 print_in_box(80, 0,0," Multi-mapping : %u", global_context -> all_multimapping_reads);
410 print_in_box(80, 0,1," ");
411 print_in_box(80, 0,0," Unmapped : %u", global_context -> all_unmapped_reads);
412 if(global_context->input_reads.is_paired_end_reads){
413 print_in_box(80, 0,1," ");
414 print_in_box(80, 0,0," Properly paired : %I64d", global_context -> all_correct_PE_reads);
415 print_in_box(80, 0,0," Not properly paired : %I64d", global_context -> all_mapped_reads - global_context -> all_correct_PE_reads);
416 print_in_box(80, 0,0," Singleton : %u", global_context -> not_properly_pairs_only_one_end_mapped);
417 print_in_box(80, 0,0," Chimeric : %u", global_context -> not_properly_pairs_different_chro);
418 print_in_box(80, 0,0," Unexpected strandness : %u", global_context -> not_properly_different_strands);
419 print_in_box(80, 0,0," Unexpected fragment length : %u", global_context -> not_properly_pairs_TLEN_wrong);
420 print_in_box(80, 0,0," Unexpected read order : %u", global_context -> not_properly_pairs_wrong_arrangement);
421 }
422
423 print_in_box(80, 0,1," ");
424
425 if(global_context->config.output_prefix[0])
426 {
427 if(global_context->config.entry_program_name == CORE_PROGRAM_SUBJUNC && ( global_context -> config.prefer_donor_receptor_junctions || !(global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection)))
428 print_in_box(80, 0,0," Junctions : %u", global_context -> all_junctions);
429 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection))
430 print_in_box(80, 0,0," Fusions : %u", global_context -> all_fusions);
431 print_in_box(80, 0,0," Indels : %u", global_context -> all_indels);
432 }
433 #else
434 if(global_context->input_reads.is_paired_end_reads)
435 print_in_box(80, 0,0," Total fragments : %'llu" , global_context -> all_processed_reads);
436 else
437 print_in_box(80, 0,0," Total reads : %'llu" , global_context -> all_processed_reads);
438
439 print_in_box(81, 0,0," Mapped : %'u (%.1f%%%%)", global_context -> all_mapped_reads, global_context -> all_mapped_reads*100.0 / global_context -> all_processed_reads);
440 print_in_box(80, 0,0," Uniquely mapped : %'u", global_context -> all_uniquely_mapped_reads);
441 print_in_box(80, 0,0," Multi-mapping : %'u", global_context -> all_multimapping_reads);
442 print_in_box(80, 0,1," ");
443 print_in_box(80, 0,0," Unmapped : %'u", global_context -> all_unmapped_reads);
444 if(global_context->input_reads.is_paired_end_reads){
445 print_in_box(80, 0,1," ");
446 print_in_box(80, 0,0," Properly paired : %'llu", global_context -> all_correct_PE_reads);
447 print_in_box(80, 0,0," Not properly paired : %'llu", global_context -> all_mapped_reads - global_context -> all_correct_PE_reads);
448 print_in_box(80, 0,0," Singleton : %'u", global_context -> not_properly_pairs_only_one_end_mapped);
449 print_in_box(80, 0,0," Chimeric : %'u", global_context -> not_properly_pairs_different_chro);
450 print_in_box(80, 0,0," Unexpected strandness : %'u", global_context -> not_properly_different_strands);
451 print_in_box(80, 0,0," Unexpected fragment length : %'u", global_context -> not_properly_pairs_TLEN_wrong);
452 print_in_box(80, 0,0," Unexpected read order : %'u", global_context -> not_properly_pairs_wrong_arrangement);
453 }
454
455 print_in_box(80, 0,1," ");
456
457 if(global_context->config.output_prefix[0])
458 {
459 if(global_context->config.entry_program_name == CORE_PROGRAM_SUBJUNC && ( global_context -> config.prefer_donor_receptor_junctions || !(global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection)))
460 print_in_box(80, 0,0," Junctions : %'u", global_context -> all_junctions);
461 if((global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection))
462 print_in_box(80, 0,0," Fusions : %'u", global_context -> all_fusions);
463 print_in_box(80, 0,0," Indels : %'u", global_context -> all_indels);
464 }
465
466 #endif
467
468 if(global_context -> is_phred_warning)
469 {
470 print_in_box(80, 0,1," ");
471 print_in_box(80,0,0," WARNING : Phred offset (%d) incorrect?", global_context->config.phred_score_format == FASTQ_PHRED33?33:64);
472 }
473 print_in_box(80, 0,1," ");
474 print_in_box(80, 0,0," Running time : %.1f minutes", (miltime()-global_context->start_time)*1./60);
475
476 if( global_context->input_reads.is_paired_end_reads && global_context -> config.reported_multi_best_reads<2 && global_context -> expected_TLEN_read_numbers < READPAIRS_FOR_CALC_EXPT_TLEN){
477 print_in_box(80, 0,1," ");
478 print_in_box(80,0,0," NOTE : No enough read-pairs to derive expected fragment length.");
479 }
480 /*
481 print_in_box(80, 0,0," Running time 0 : %.2f minutes", global_context->timecost_load_index/60);
482 print_in_box(80, 0,0," Running time 1 : %.2f minutes", global_context->timecost_voting/60);
483 print_in_box(80, 0,0," Running time 2 : %.2f minutes", global_context->timecost_before_realign/60);
484 print_in_box(80, 0,0," Running time 3 : %.2f minutes", global_context->timecost_for_realign/60);
485 */
486 print_in_box(80, 0,1,"");
487 print_in_box(80, 2,1,"");
488 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, "");
489
490 return 0;
491 }
492
493
494
show_progress(global_context_t * global_context,thread_context_t * thread_context,unsigned int current_read_no,int task)495 void show_progress(global_context_t * global_context, thread_context_t * thread_context, unsigned int current_read_no, int task)
496 {
497
498 // Read_chunk_start is the file_offset of the very first read in the entire file.
499 // current_circle_start_position_file1 is the file_offset of the first read in this 5-million read chunk (or whatever the chunk size is)
500
501
502 if(global_context->config.scRNA_input_mode){
503 if(task == 10){
504 char minchr[10];
505 float min_value = (miltime() - global_context -> start_time)*1./60;
506 if(min_value < 9.91)
507 sprintf(minchr, "%.01f", min_value);
508 else sprintf(minchr, "% 3d", (int)min_value);
509 print_in_box(80,0,0," processed % 3d million input reads in %s minutes", current_read_no/1000000, minchr);
510 }
511 //SUBREADprintf("TASK=%d, RNO=%u\n", task, current_read_no);
512 return;
513 }
514
515 if(thread_context&&thread_context->thread_id)
516 {
517 SUBREADputs("show_progress can only be called by thread#0\n");
518 return;
519 }
520
521 gene_input_t * ginp1 = &global_context->input_reads.first_read_file;
522
523 unsigned long long ginp1_file_pos = geinput_file_offset(ginp1);
524
525 if(task == STEP_VOTING)
526 {
527 unsigned long long real_read_number = global_context -> all_processed_reads + current_read_no;
528 if(real_read_number>1000)
529 global_context -> input_reads . avg_read_length = (ginp1_file_pos - ginp1 -> read_chunk_start) * 1./real_read_number ;
530 }
531
532 unsigned long long total_file_size = global_context -> input_reads.first_read_file_size;
533 unsigned long long guessed_all_reads = total_file_size / global_context -> input_reads . avg_read_length;
534
535 unsigned long long current_block_start_file_offset = global_context -> current_circle_start_abs_offset_file1;
536
537 unsigned long long guessed_this_chunk_reads = (total_file_size - current_block_start_file_offset) / global_context -> input_reads . avg_read_length ;
538 if(guessed_this_chunk_reads > global_context ->config.reads_per_chunk) guessed_this_chunk_reads = global_context ->config.reads_per_chunk;
539
540 unsigned long long guessed_all_reads_before_this_chunk = current_block_start_file_offset / global_context -> input_reads . avg_read_length ;
541 unsigned long long reads_finished_in_this_chunk = (ginp1_file_pos - current_block_start_file_offset) / global_context -> input_reads . avg_read_length;
542
543 int is_thred_step_running = global_context->config.is_third_iteration_running ? 1:0;
544
545 if(task != STEP_VOTING)
546 reads_finished_in_this_chunk = (ginp1_file_pos - current_block_start_file_offset) / global_context -> input_reads . avg_read_length;
547
548 unsigned long long finished_steps = guessed_all_reads_before_this_chunk * (global_context -> index_block_number * 2 + 1 + is_thred_step_running);
549
550
551 // add steps for voting
552 if(task == STEP_VOTING)
553 finished_steps += guessed_this_chunk_reads * global_context -> current_index_block_number * 2;
554 else if(task == STEP_ITERATION_TWO)
555 finished_steps += guessed_this_chunk_reads * global_context -> index_block_number * 2;
556 else if(task > STEP_ITERATION_TWO)
557 finished_steps += guessed_this_chunk_reads *(global_context -> index_block_number * 2+1);
558
559 if(task == STEP_VOTING)
560 finished_steps += reads_finished_in_this_chunk*2;
561 else finished_steps += reads_finished_in_this_chunk;
562
563 unsigned long long guessed_all_steps = guessed_all_reads * (global_context -> index_block_number *2 + 1 + is_thred_step_running);
564
565 float finished_rate = finished_steps*1./guessed_all_steps;
566 float reads_per_second = 0;
567
568 if(task == STEP_VOTING)
569 reads_per_second = finished_rate *1.*guessed_all_reads / (miltime() - global_context -> align_start_time);
570 else
571 reads_per_second = finished_rate *1.*guessed_all_reads / (miltime() - global_context -> start_time);
572
573 if(current_read_no>1000 && !progress_report_callback)
574 {
575 char minchr[10];
576 float min_value = (miltime() - global_context -> start_time)*1./60;
577 if(min_value < 9.91)
578 sprintf(minchr, "%.01f", min_value);
579 else sprintf(minchr, "% 3d", (int)min_value);
580
581 // print_in_box(81,0,0, "%4d%%%% completed, %s mins elapsed, total=%dk %s, rate=%2.1fk/s\r", (int)(finished_rate*100), minchr,(int)(guessed_all_reads*1./1000), global_context -> input_reads.is_paired_end_reads?"frags":"reads", reads_per_second/1000);
582 print_in_box(81,0,0, "%4d%%%% completed, %s mins elapsed, rate=%2.1fk %s per second\r", (int)(finished_rate*100), minchr, reads_per_second/1000, global_context -> input_reads.is_paired_end_reads?"fragments":"reads");
583 }
584
585 if(progress_report_callback)
586 {
587 progress_report_callback(10 ,task,(int)(10000*finished_rate));
588 progress_report_callback(20 ,task,(int)(guessed_all_reads/1000));
589 }
590 }
591
592
593 /*
594 int Xmain(int argc , char ** argv);
595
596 int main(int argc, char ** argv)
597 {
598 int xk1=0;
599 for(xk1=0; xk1<4; xk1++)
600 Xmain(argc , argv);
601 return 0;
602 }
603 */
604
605
parse_opts_core(int argc,char ** argv,global_context_t * global_context)606 int parse_opts_core(int argc , char ** argv, global_context_t * global_context)
607 {
608 int c;
609 int option_index = 0;
610
611 optind = 1;
612 opterr = 1;
613 optopt = 63;
614
615 while ((c = getopt_long (argc, argv, "ExsS:L:AHd:D:n:m:p:P:R:r:i:l:o:T:Q:I:t:B:b:Q:FcuUfM?", long_options, &option_index)) != -1)
616 {
617 switch(c)
618 {
619 case 'Q':
620 global_context->config.multi_best_reads = atoi(optarg);
621 break;
622 case 'H':
623 global_context->config.use_hamming_distance_break_ties = 1;
624 break;
625 case 's':
626 global_context->config.downscale_mapping_quality = 1;
627 break;
628 case 'M':
629 global_context->config.do_big_margin_filtering_for_reads = 1;
630 global_context->config.report_multi_mapping_reads = 0;
631 break;
632
633 case 'A':
634 global_context->config.report_sam_file = 0;
635 break;
636 case 'E':
637 global_context->config.max_mismatch_exonic_reads = 200;
638 global_context->config.max_mismatch_junction_reads = 200;
639
640 break;
641 case 'f':
642 global_context->config.max_mismatch_exonic_reads = 200;
643 global_context->config.max_mismatch_junction_reads = 200;
644 global_context->config.do_fusion_detection = 1;
645 global_context->config.minimum_subread_for_first_read = 1;
646 global_context->config.minimum_subread_for_second_read = 1;
647 global_context->config.total_subreads = 28;
648 global_context->config.report_no_unpaired_reads = 0;
649 global_context->config.limited_tree_scan = 0;
650 global_context->config.use_hamming_distance_in_exon = 1;
651 break;
652 case 'x':
653 global_context->config.max_mismatch_exonic_reads = 10;
654 global_context->config.max_mismatch_junction_reads = 1;
655 global_context->config.ambiguous_mapping_tolerance = 39;
656 global_context->config.extending_search_indels = 0;
657
658 global_context->config.do_breakpoint_detection = 1;
659 global_context->config.total_subreads = 14;
660 global_context->config.minimum_subread_for_first_read = 3;
661 global_context->config.minimum_subread_for_second_read = 1;
662 global_context->config.high_quality_base_threshold = 990000;
663 global_context->config.do_big_margin_filtering_for_junctions = 1;
664 global_context->config.report_no_unpaired_reads = 0;
665 global_context->config.limited_tree_scan = 1;
666 global_context->config.use_hamming_distance_in_exon = 0;
667 break;
668 case 'S':
669 global_context->config.is_first_read_reversed = optarg[0]=='r'?1:0;
670 global_context->config.is_second_read_reversed = optarg[0]=='f'?0:1;
671 break;
672 case 'U':
673 global_context->config.report_no_unpaired_reads = 1;
674 break;
675 case 'u':
676 global_context->config.report_multi_mapping_reads = 0;
677 break;
678 case 'b':
679 global_context->config.is_methylation_reads = 1;
680 break;
681 case 'D':
682 global_context->config.maximum_pair_distance = atoi(optarg);
683 break;
684 case 'd':
685 global_context->config.minimum_pair_distance = atoi(optarg);
686 break;
687 case 'n':
688 global_context->config.total_subreads = atoi(optarg);
689 break;
690 case 'm':
691 global_context->config.minimum_subread_for_first_read = atoi(optarg);
692 break;
693 case 'T':
694 global_context->config.all_threads = atoi(optarg);
695 if(global_context->config.all_threads <1) global_context->config.all_threads = 1;
696 if(global_context->config.all_threads >32) global_context->config.all_threads = 32;
697
698 break;
699 case 'r':
700 strncpy(global_context->config.first_read_file, optarg, MAX_FILE_NAME_LENGTH-1);
701 break;
702 case 'R':
703 global_context->input_reads.is_paired_end_reads = 1;
704 strncpy(global_context->config.second_read_file, optarg, MAX_FILE_NAME_LENGTH-1);
705 break;
706 case 'i':
707 strncpy(global_context->config.index_prefix, optarg, MAX_FILE_NAME_LENGTH-1);
708 break;
709 case 'o':
710 strncpy(global_context->config.output_prefix, optarg, MAX_FILE_NAME_LENGTH-1);
711 break;
712 case 'I':
713 global_context->config.max_indel_length = atoi(optarg);
714
715 if(global_context->config.max_indel_length <0)global_context->config.max_indel_length =0;
716 if(global_context->config.max_indel_length > MAX_INSERTION_LENGTH)global_context->config.max_indel_length = MAX_INSERTION_LENGTH;
717 if(global_context->config.max_indel_length > 16)
718 {
719 global_context->config.reassembly_subread_length = 12;
720 global_context->config.reassembly_window_multiplex = 3;
721 global_context->config.reassembly_start_read_number = 5;
722 global_context->config.reassembly_tolerable_voting = 0;
723 global_context->config.reassembly_window_alleles = 2;
724 global_context->config.reassembly_key_length = 28;
725
726 global_context->config.is_third_iteration_running = 1;
727 global_context->config.max_mismatch_exonic_reads = 2;
728 global_context->config.max_mismatch_junction_reads = 2;
729 global_context->config.total_subreads = 28;
730 global_context->config.do_big_margin_filtering_for_reads = 1;
731
732 global_context->config.do_superlong_indel_detection = 0;
733 }
734 break;
735 case 'P':
736 if (optarg[0]=='3')
737 global_context->config.phred_score_format = FASTQ_PHRED33;
738 else
739 global_context->config.phred_score_format = FASTQ_PHRED64;
740 break;
741 case 'p':
742 global_context->config.minimum_subread_for_second_read = atoi(optarg);
743 break;
744 case 't':
745 sprintf(global_context->config.temp_file_prefix, "%s/core-temp-sum-%06u-%05u", optarg, getpid(), myrand_rand()
746 );
747 break;
748 case 'F':
749 global_context->config.is_second_iteration_running = 0;
750 global_context->config.report_sam_file = 0;
751 break;
752 case 'B':
753 strcpy(global_context->config.exon_annotation_file, optarg);
754 break;
755 case 'c':
756 global_context->config.space_type = GENE_SPACE_COLOR;
757 break;
758
759 case 0:
760 //if(strcmp("memory-optimisation",long_options[option_index].name)==0)
761 // memory_optimisation = atoi(optarg);
762 break;
763 case '?':
764 default:
765 return -1 ;
766 }
767 }
768
769
770 return 0;
771 }
772
773 #ifdef MAKE_STANDALONE
main_back(int argc,char ** argv)774 int main_back(int argc , char ** argv)
775 {
776 progress_report_callback = NULL;
777 #else
778 int subread_core_main(int argc , char ** argv)
779 {
780 if(progress_report_callback) progress_report_callback(0,0,1);
781 #endif
782
783 return core_main(argc, argv, parse_opts_core);
784 }
785
786
787 int check_configuration(global_context_t * global_context)
788 {
789 int expected_type = FILE_TYPE_FAST_;
790 if(global_context -> config.is_SAM_file_input && global_context -> config.is_BAM_input)
791 expected_type = FILE_TYPE_BAM;
792 else if(global_context -> config.is_SAM_file_input && !global_context -> config.is_BAM_input)
793 expected_type = FILE_TYPE_SAM;
794 else if(global_context -> config.is_gzip_fastq)
795 expected_type = FILE_TYPE_GZIP_FAST_;
796
797 if(global_context -> config.max_indel_length > 16)
798 warning_file_limit();
799
800 int wret = 0, wret2 = 0;
801 if( global_context ->config.scRNA_input_mode == 0 ) warning_file_type(global_context -> config.first_read_file, expected_type);
802 if(global_context -> config.second_read_file[0])
803 {
804 if(expected_type==FILE_TYPE_FAST_ || expected_type==FILE_TYPE_GZIP_FAST_)
805 wret2 = warning_file_type(global_context -> config.second_read_file, expected_type);
806 else
807 print_in_box(80,0,0,"Only one input SAM or BAM file is needed. The second input file is ignored.");
808 }
809
810 if(wret == -1 || wret2 == -1){
811 return -1;
812 }
813
814 if(4 == sizeof(void *) && global_context->config.sort_reads_by_coordinates){
815 SUBREADputs("ERROR: the sort by coordinate function only works on 64-bit computers.");
816 return -1;
817 }
818
819 if(global_context->config.is_input_read_order_required && global_context->config.sort_reads_by_coordinates){
820 SUBREADputs("ERROR: you shouldn't specify keep input order and sort by coordinate at same time.");
821 return -1;
822 }
823
824 if((global_context -> config.is_BAM_output == 0 || global_context->config.output_prefix[0]==0) && global_context->config.sort_reads_by_coordinates){
825 if(global_context -> config.is_BAM_output==0)SUBREADputs("ERROR: SAM output doesn't support read sorting by coordinates.");
826 else SUBREADputs("ERROR: STDOUT output doesn't support read sorting by coordinates.");
827 return -1;
828 }
829
830 return 0;
831
832 }
833
834 unsigned long long myrand_seed = 0;
835 unsigned long long myrand_seed2 = 0;
836
837 void myrand_srand(unsigned long long seed){
838
839 // when it is in R, the seed is NOT used but the R random numbers are used as the seed.
840 myrand_seed = 0;
841 myrand_seed2 = 0;
842
843 #ifndef MAKE_STANDALONE
844 GetRNGstate();
845 seed = 0;
846 seed = (seed << 16)+ (int)(unif_rand()*65521);
847 seed = (seed << 16)+ (int)(unif_rand()*65521);
848 seed = (seed << 16)+ (int)(unif_rand()*65521);
849 seed = (seed << 16)+ (int)(unif_rand()*65521);
850
851 PutRNGstate();
852 #endif
853 myrand_seed = seed;
854 myrand_rand();
855 myrand_rand();
856 myrand_rand();
857 myrand_rand();
858 myrand_rand();
859 myrand_rand();
860 myrand_rand();
861 myrand_rand();
862 myrand_rand();
863 }
864
865 #define THE_966666773323RD_PRIME 28962406029617llu
866 int myrand_rand(){
867 //if(myrand_seed % 3133LLU == 0) myrand_srand(0);
868
869 myrand_seed ^= (myrand_seed % 858173 + myrand_seed % 104729);
870 myrand_seed2 += myrand_seed;
871 myrand_seed ^= myrand_seed2 << 13;
872 return (int)(((myrand_seed^myrand_seed2) %THE_966666773323RD_PRIME) % (1LLU+RAND_MAX));
873 }
874
875
876 int core_main(int argc , char ** argv, int (parse_opts (int , char **, global_context_t * )))
877 {
878 struct timeval xtime;
879 gettimeofday(&xtime,NULL);
880 myrand_srand(time(NULL)^xtime.tv_usec);
881
882 global_context_t * global_context;
883 global_context = (global_context_t*)malloc(sizeof(global_context_t));
884 memset(global_context, 0, sizeof(global_context_t));
885 init_global_context(global_context);
886
887
888 int ret = parse_opts(argc , argv, global_context);
889 init_core_temp_path(global_context);
890 //global_context->config.reads_per_chunk = 200*1024;
891
892 if(global_context->config.max_indel_length > 20 && !global_context->input_reads.is_paired_end_reads)
893 {
894 global_context->config.total_subreads = 28;
895 global_context->config.reassembly_start_read_number = 3;
896 global_context->config.do_superlong_indel_detection = 1;
897 }
898
899 if(global_context->config.fast_run){
900 global_context -> config.top_scores = 1;
901 global_context -> config.max_vote_combinations = 1;
902 global_context -> config.max_vote_simples = 1;
903 global_context -> config.multi_best_reads = 1;
904 }
905
906 ret = ret || print_configuration(global_context);
907 ret = ret || check_configuration(global_context);
908 ret = ret || load_global_context(global_context);
909 ret = ret || init_modules(global_context);
910 ret = ret || read_chunk_circles(global_context);
911 ret = ret || write_final_results(global_context);
912 ret = ret || destroy_modules(global_context);
913 ret = ret || destroy_global_context(global_context);
914 ret = ret || show_summary(global_context);
915
916 free(global_context);
917
918 return ret;
919 }
920
921 // the new file name is written into fname then.
922
923 int convert_BAM_to_SAM(global_context_t * global_context, char * fname, int is_bam)
924 {
925 char temp_file_name[MAX_FILE_NAME_LENGTH+80], *fline=malloc(3000), tmp_readname[MAX_READ_NAME_LEN];
926 short tmp_flags;
927 SamBam_FILE * sambam_reader;
928
929 char * env_no_sort = getenv("SUBREAD_DO_NOT_CHECK_INPUT");
930 if(env_no_sort){
931 global_context->input_reads.is_paired_end_reads = 1;
932 SUBREADprintf("\nWARNING: The SAM input file was assumed to be sorted by name.\nENV: SUBREAD_DO_NOT_CHECK_INPUT\n");
933 return 0;
934 }
935
936 int is_file_sorted = 1;
937 unsigned long long int read_no = 0;
938 sambam_reader = SamBam_fopen(fname, is_bam?SAMBAM_FILE_BAM:SAMBAM_FILE_SAM);
939 if(!sambam_reader)
940 {
941 SUBREADprintf("ERROR: Unable to open file '%s'. The file is not in the specified format.\n", fname);
942 free(fline);
943 return -1;
944 }
945 while(1)
946 {
947 char * is_ret = SamBam_fgets(sambam_reader, fline, 2999, 1);
948 if(!is_ret) break;
949 if(sambam_reader ->is_bam_broken ) {
950 SUBREADputs("ERROR: the BAM format is broken.");
951 return -1;
952 }
953 if(fline[0]=='@')continue;
954 if(is_SAM_unsorted(fline, tmp_readname, &tmp_flags , read_no)){
955 if(tmp_flags & 1) global_context->input_reads.is_paired_end_reads = 1;
956 is_file_sorted = 0;
957 read_no++;
958 break;
959 }
960 read_no++;
961 if(tmp_flags & 1) global_context->input_reads.is_paired_end_reads = 1;
962 else global_context->input_reads.is_paired_end_reads = 0;
963
964 if(! global_context->input_reads.is_paired_end_reads) break;
965 }
966
967 if(read_no<1)
968 {
969 SUBREADprintf("ERROR: unable to find any reads from file '%s'. The file is not in the specified format or is empty.\n", fname);
970 return -1;
971 }
972
973 SamBam_fclose(sambam_reader);
974 print_in_box(80,0,0,"The input %s file contains %s-end reads.", is_bam?"BAM":"SAM", global_context->input_reads.is_paired_end_reads?"paired":"single");
975 if(!is_file_sorted)
976 print_in_box(80,0,0,"The input %s file is unsorted. Reorder it...", is_bam?"BAM":"SAM");
977 else if(is_bam)
978 print_in_box(80,0,0,"Convert the input BAM file...");
979
980 int disk_is_full = 0;
981 if(is_bam || (global_context->input_reads.is_paired_end_reads && !is_file_sorted))
982 {
983 sprintf(temp_file_name, "%s.sam", global_context->config.temp_file_prefix);
984 sambam_reader = SamBam_fopen(fname, is_bam?SAMBAM_FILE_BAM: SAMBAM_FILE_SAM);
985 if(!sambam_reader){
986 SUBREADprintf("Unable to open %s.\n", fname);
987 return -1;
988 }
989 FILE * sam_fp = NULL;
990 SAM_sort_writer writer;
991 int writer_opened = 0;
992
993 if(is_file_sorted) sam_fp = f_subr_open(temp_file_name,"w");
994 else writer_opened = sort_SAM_create(&writer, temp_file_name, NULL);
995
996 if((is_file_sorted && !sam_fp) || (writer_opened && !is_file_sorted)){
997 SUBREADprintf("Failed to write to the directory. You may not have permission to write to this directory or the disk is full.\n");
998 return -1;
999 }
1000
1001 while(1)
1002 {
1003 char * is_ret = SamBam_fgets(sambam_reader, fline, 2999, 1);
1004 if(!is_ret) break;
1005 if(is_file_sorted){
1006 int wlen = fputs(fline, sam_fp);
1007 if(wlen <0){
1008 SUBREADprintf("ERROR: unable to write into the temporary SAM file. Please check the disk space in the output directory.\n");
1009 disk_is_full = 1;
1010 break;
1011 }
1012 }else{
1013 int ret = sort_SAM_add_line(&writer, fline, strlen(fline));
1014 if(ret== -1) {
1015 print_in_box(80,0,0,"ERROR: read name is too long; check input format.");
1016 break;
1017 }
1018 if(ret== -2) {
1019 SUBREADprintf("ERROR: unable to write into the temporary SAM file. Please check the disk space in the output directory.\n");
1020 disk_is_full = 1;
1021 break;
1022 }
1023 }
1024 }
1025
1026 if(is_file_sorted)
1027 fclose(sam_fp);
1028 else{
1029 int ret = sort_SAM_finalise(&writer);
1030 if(writer.unpaired_reads)
1031 #ifdef __MINGW32__
1032 print_in_box(80,0,0,"%I64d single-end mapped reads in reordering.", writer.unpaired_reads);
1033 #else
1034 print_in_box(80,0,0,"%lld single-end mapped reads in reordering.", writer.unpaired_reads);
1035 #endif
1036 if(ret) {
1037 disk_is_full = 1;
1038 SUBREADprintf("ERROR: unable to create the temporary file. Please check the disk space in the output directory.\n");
1039 }
1040 }
1041
1042 SamBam_fclose(sambam_reader);
1043 strcpy(fname, temp_file_name);
1044 global_context -> will_remove_input_file = 1;
1045 }
1046
1047 free(fline);
1048 return disk_is_full;
1049 }
1050
1051 int convert_GZ_to_FQ(global_context_t * global_context, char * fname, int half_n)
1052 {
1053 int is_OK = 0;
1054 char temp_file_name[MAX_FILE_NAME_LENGTH+30];
1055 char * linebuff=malloc(3001);
1056 gzFile rawfp = gzopen(fname, "r");
1057
1058 if(rawfp)
1059 {
1060 print_in_box(80,0,0,"Decompress %s...", fname);
1061 sprintf(temp_file_name, "%s-%d.fq", global_context->config.temp_file_prefix, half_n);
1062 FILE * outfp = fopen(temp_file_name, "w");
1063 if(outfp)
1064 {
1065 while(1)
1066 {
1067 char * bufr =gzgets(rawfp, linebuff, 3000);
1068 if(!bufr) break;
1069 fputs(bufr, outfp);
1070 }
1071 is_OK = 1;
1072 fclose(outfp);
1073 }
1074 else{
1075 SUBREADprintf("Unable to create temporary file '%s'\nPlease run the program in a directory where you have the privilege to create files.\n", temp_file_name);
1076 }
1077
1078 gzclose(rawfp);
1079 }
1080
1081 strcpy(fname, temp_file_name);
1082 global_context -> will_remove_input_file |= (1<< (half_n-1));
1083
1084 return !is_OK;
1085 }
1086
1087 int core_geinput_open(global_context_t * global_context, gene_input_t * fp, int half_number)
1088 {
1089 char *fname;
1090 if(global_context->config.is_SAM_file_input) {
1091 fname = global_context ->config.first_read_file;
1092 if(half_number == 1)
1093 if(convert_BAM_to_SAM(global_context, global_context ->config.first_read_file, global_context ->config.is_BAM_input)) return -1;
1094 if(!global_context->input_reads.is_paired_end_reads) half_number=0;
1095 return geinput_open_sam(fname, fp, half_number);
1096 } else {
1097 int rv = -1;
1098 //SUBREADprintf("SCRNA_MODE=%d\n", global_context->config.scRNA_input_mode );
1099 if(global_context -> config.is_gzip_fastq)
1100 if(convert_GZ_to_FQ(global_context, (half_number==2)? global_context ->config.second_read_file : global_context ->config.first_read_file, half_number)) return -1;
1101 fname = (half_number == 2)?global_context -> config.second_read_file:global_context -> config.first_read_file;
1102 if(global_context->config.scRNA_input_mode == GENE_INPUT_BCL)
1103 rv = geinput_open_bcl(fname , fp, global_context -> config.reads_per_chunk, global_context -> config.all_threads );
1104 else if(global_context->config.scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ)
1105 rv = geinput_open_scRNA_fqs(fname , fp, global_context -> config.reads_per_chunk, global_context -> config.all_threads );
1106 else if(global_context->config.scRNA_input_mode == GENE_INPUT_SCRNA_BAM)
1107 rv = geinput_open_scRNA_BAM(fname , fp, global_context -> config.reads_per_chunk, global_context -> config.all_threads );
1108 else
1109 rv = geinput_open(fname, fp);
1110
1111 if(global_context->input_reads.is_paired_end_reads && global_context->config. scRNA_input_mode){
1112 SUBREADprintf("ERROR: No paired-end input is allowed on scRNA mode.\n");
1113 return -1;
1114 }
1115 return rv;
1116 }
1117 }
1118
1119
1120 int fetch_next_read_pair(global_context_t * global_context, thread_context_t * thread_context, gene_input_t* ginp1, gene_input_t* ginp2, int *read_len_1, int *read_len_2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, char * qual_text_1, char *qual_text_2, int remove_color_head, subread_read_number_t * read_no_in_chunk)
1121 {
1122 int rl1=0, rl2=0;
1123 int is_second_R1, is_second_R2;
1124 subread_read_number_t this_number = -1;
1125
1126 geinput_preload_buffer(ginp1, &global_context -> input_reads.input_lock);
1127 if(ginp2) geinput_preload_buffer(ginp2, &global_context -> input_reads.input_lock);
1128
1129 subread_lock_occupy(&global_context -> input_reads.input_lock);
1130 if(global_context -> running_processed_reads_in_chunk < global_context -> config.reads_per_chunk)
1131 {
1132 do
1133 {
1134 is_second_R1 = 0; is_second_R2 = 0;
1135 rl1 = geinput_next_read_trim(ginp1, read_name_1, read_text_1 , qual_text_1, global_context->config.read_trim_5, global_context->config.read_trim_3, &is_second_R1);
1136 //if(global_context -> running_processed_reads_in_chunk < 100)SUBREADprintf("GETRNAMES %s LEN=%d\n", read_name_1, rl1);
1137 if(global_context->config.space_type == GENE_SPACE_COLOR && remove_color_head)
1138 {
1139 if(isalpha(read_text_1[0]))
1140 {
1141 int xk1;
1142 for(xk1=2; read_text_1[xk1]; xk1++)
1143 read_text_1[xk1-2]=read_text_1[xk1];
1144 read_text_1[xk1-2]=0;
1145 }
1146 }
1147
1148 if(ginp2)
1149 {
1150 rl2 = geinput_next_read_trim(ginp2, read_name_2, read_text_2 , qual_text_2, global_context->config.read_trim_5, global_context->config.read_trim_3, &is_second_R2);
1151 if(global_context->config.space_type == GENE_SPACE_COLOR && remove_color_head)
1152 {
1153 if(isalpha(read_text_2[0]))
1154 {
1155 int xk1;
1156 for(xk1=2; read_text_2[xk1]; xk1++)
1157 read_text_2[xk1-2]=read_text_2[xk1];
1158 read_text_2[xk1-2]=0;
1159 }
1160 }
1161 }
1162 if(rl1 <= 0 || (rl2 <= 0 && ginp2)) break;
1163 } while(is_second_R1||is_second_R2) ;
1164
1165
1166 //SUBREADprintf("IMBRL1=%d\tRL2=%d\n", rl1, rl2);
1167 //
1168 if(rl1 >0 || (rl2 > 0 && ginp2)){
1169 this_number = global_context -> running_processed_reads_in_chunk;
1170 global_context -> running_processed_reads_in_chunk ++;
1171 }
1172 }
1173 subread_lock_release(&global_context -> input_reads.input_lock);
1174
1175 if( global_context->config.space_type == GENE_SPACE_COLOR) {
1176 rl1-=1;rl2-=1;
1177 }
1178
1179 if(ginp2 && rl1 * rl2 <=0 && (rl1>0 || rl2>0)){
1180 if(!global_context-> input_reads.is_internal_error)
1181 SUBREADprintf("\nERROR: two input files have different amounts of reads.\n\n");
1182 global_context-> input_reads.is_internal_error = 1;
1183 *read_no_in_chunk = -1;
1184 return 1;
1185 } else if(rl1>0 && (rl2>0 || !ginp2) && this_number>=0) {
1186 if(global_context->config.is_first_read_reversed)
1187 {
1188 reverse_read(read_text_1, rl1, global_context->config.space_type);
1189 if(qual_text_1)
1190 reverse_quality(qual_text_1, rl1);
1191 }
1192
1193 if(ginp2 && global_context->config.is_second_read_reversed)
1194 {
1195 reverse_read(read_text_2, rl2, global_context->config.space_type);
1196 if(qual_text_2)
1197 reverse_quality(qual_text_2, rl2);
1198 }
1199
1200 *read_no_in_chunk = this_number;
1201 *read_len_1 = rl1;
1202 if(ginp2)
1203 *read_len_2 = rl2;
1204 return 0;
1205 } else {
1206 // normally finished
1207 *read_no_in_chunk = -1;
1208 return 1;
1209 }
1210 }
1211
1212 int write_final_results(global_context_t * context)
1213 {
1214
1215 if((context -> config.do_fusion_detection || context -> config.do_long_del_detection) && context -> config.do_structural_variance_detection)
1216 finalise_structural_variances(context);
1217
1218 if(context -> config.output_prefix[0] && 0==context -> input_reads.is_internal_error && !(context -> config.is_BAM_output && context -> output_bam_writer -> is_internal_error) ) {
1219 write_indel_final_results(context);
1220
1221 if(context -> config.entry_program_name == CORE_PROGRAM_SUBJUNC && (context -> config.prefer_donor_receptor_junctions||!(context -> config.do_fusion_detection || context -> config.do_long_del_detection)))
1222 write_junction_final_results(context);
1223
1224 if((context -> config.do_fusion_detection || context -> config.do_long_del_detection))
1225 write_fusion_final_results(context);
1226 }
1227
1228 return 0;
1229 }
1230
1231 int get_soft_clipping_length(char* CIGAR)
1232 {
1233 int nch;
1234 int cigar_cursor, tmp_int = 0;
1235
1236 for(cigar_cursor = 0; (nch = CIGAR[cigar_cursor]) > 0;cigar_cursor++)
1237 {
1238 if(isdigit(nch)) tmp_int = tmp_int*10+(nch-'0');
1239 else
1240 {
1241 if(nch=='S') return tmp_int;
1242 return 0;
1243 }
1244 }
1245 return 0;
1246 }
1247
1248
1249 #define CIGAR_PERFECT_SECTIONS 12
1250
1251 typedef struct{
1252 char current_cigar_decompress[CORE_MAX_CIGAR_STR_LEN + 1];
1253 char cigar [CORE_MAX_CIGAR_STR_LEN];
1254
1255 unsigned short chimeric_sections;
1256 unsigned int out_poses[CIGAR_PERFECT_SECTIONS];
1257 short out_lens[CIGAR_PERFECT_SECTIONS];
1258 char out_cigars[CIGAR_PERFECT_SECTIONS][60];
1259 char out_strands[CIGAR_PERFECT_SECTIONS];
1260
1261 char additional_information[CORE_ADDITIONAL_INFO_LENGTH + 1];
1262 mapping_result_t * raw_result;
1263
1264 unsigned int linear_position;
1265 short soft_clipping_movements;
1266 char * chro;
1267 int offset;
1268 int strand;
1269 int is_first_section_jumpped;
1270
1271 int mapping_quality;
1272 int is_NM_appied;
1273 }subread_output_tmp_t;
1274
1275 typedef struct{
1276 long long int *PE_distance;
1277 char * out_cigar_buffer[CIGAR_PERFECT_SECTIONS];
1278 subread_output_tmp_t *r1;
1279 subread_output_tmp_t *r2;
1280 subread_output_tmp_t ** out_pairs;
1281 mapping_result_t ** out_raws;
1282
1283 } subread_output_context_t;
1284
1285 void init_output_context(global_context_t * global_context, subread_output_context_t * out_context)
1286 {
1287 int xk1;
1288 memset(out_context, 0, sizeof(subread_output_context_t));
1289
1290 out_context -> r1 = malloc(sizeof(subread_output_tmp_t));
1291 for(xk1=0;xk1<CIGAR_PERFECT_SECTIONS;xk1++)
1292 out_context -> out_cigar_buffer[xk1] = malloc(60);
1293
1294 out_context -> out_pairs = malloc(sizeof( subread_output_context_t *) * global_context->config.multi_best_reads * 2);
1295 out_context -> out_raws = malloc(sizeof( mapping_result_t * *) * global_context->config.multi_best_reads * 2);
1296
1297 if(global_context -> input_reads.is_paired_end_reads)
1298 {
1299 out_context -> PE_distance = malloc(sizeof(long long) * global_context->config.multi_best_reads);
1300 out_context -> r2 = malloc(sizeof(subread_output_tmp_t));
1301 } else {
1302 out_context -> PE_distance = NULL;
1303 out_context -> r2 = NULL;
1304 }
1305 }
1306
1307 void destroy_output_context(global_context_t * global_context , subread_output_context_t * out_context)
1308 {
1309 int xk1;
1310 for(xk1=0;xk1<CIGAR_PERFECT_SECTIONS;xk1++)
1311 free(out_context -> out_cigar_buffer[xk1]);
1312
1313 free(out_context -> out_raws);
1314 free(out_context -> out_pairs);
1315 free(out_context -> r1 );
1316 if(global_context -> input_reads.is_paired_end_reads)
1317 {
1318 free(out_context -> r2);
1319 free(out_context -> PE_distance);
1320 }
1321 }
1322
1323 int locate_current_value_index(global_context_t * global_context, thread_context_t * thread_context, mapping_result_t * result, int rlen);
1324 int calc_edit_dist(global_context_t * global_context, mapping_result_t * current_result, char * cigar , unsigned int pos , char * read_text, int all_mm)
1325 {
1326 unsigned int cigar_cursor = 0, tmpi=0;
1327
1328 while(1)
1329 {
1330 char nch = cigar[cigar_cursor++];
1331 if(!nch)break;
1332
1333 if('0'<=nch && '9' >=nch)
1334 {
1335 tmpi = tmpi*10+nch-'0';
1336 }
1337 else{
1338 if(nch == 'I' || nch == 'D')
1339 {
1340 all_mm+=tmpi;
1341 }
1342
1343 tmpi = 0;
1344 }
1345 }
1346 return all_mm;
1347 }
1348
1349 unsigned int move_to_read_head(unsigned int tailpos, char * cigar){
1350 int cigar_i = 0, nch;
1351 unsigned int tmpi = 0;
1352 while(0<(nch = cigar[cigar_i++])){
1353 if(isdigit(nch)){
1354 tmpi = tmpi * 10 + nch - '0';
1355 }else{
1356 if(nch == 'N' || nch == 'M' || nch == 'D') tailpos -= tmpi;
1357 tmpi = 0;
1358 }
1359 }
1360 return tailpos;
1361 }
1362
1363
1364 // This function returns 1 if the cut was added.
1365 // It returns 0 if the head or tail cut is not able to be added (e.g., the cigar ends up like "50M4I10S" or "10S30N90M")
1366 int add_head_tail_cut_softclipping(global_context_t * global_context, unsigned int linear, char * cigar, int rlen, int head_cut, int tail_cut){
1367 //SUBREADprintf("ADD_SOFT: %s , %d, %d\n", cigar,head_cut,tail_cut);
1368
1369 char cigar_added [CORE_MAX_CIGAR_STR_LEN];
1370 int cigar_cursor = 0, read_cursor = 0, next_read_cursor = 0;
1371 int tmpi = 0, nch, has_M = 0;
1372 unsigned int linear_cursor = linear;
1373
1374 cigar_added[0]=0;
1375
1376 while(1){
1377 nch = cigar[cigar_cursor++];
1378 if(nch == 0) break;
1379
1380 if(isdigit(nch)){
1381 tmpi = 10 * tmpi + (nch - '0');
1382 }else{
1383 if('M' == nch || 'S' == nch || 'I' == nch)
1384 next_read_cursor = read_cursor + tmpi;
1385
1386 if('M' == nch || 'D' == nch || 'S' == nch || 'N' == nch){
1387 linear_cursor += tmpi;
1388 }
1389
1390 int head_S = 0, tail_S =0, remainder_tmpi = tmpi, is_skip = 0;
1391
1392 if(next_read_cursor <= head_cut) is_skip = 1;
1393 if(read_cursor >= rlen - tail_cut) is_skip = 1;
1394 if(!is_skip){
1395 if(nch != 'S'){
1396 if(read_cursor <= head_cut)head_S = head_cut;
1397 if(next_read_cursor >= rlen - tail_cut) tail_S = tail_cut;
1398 remainder_tmpi = tmpi;
1399 if(head_S) remainder_tmpi-= (head_S - read_cursor);
1400 if(tail_S) remainder_tmpi-= next_read_cursor - (rlen - tail_S);
1401 }
1402 //SUBREADprintf("OPTR=%c, THREE-LENS=%d,%d,%d\n", nch, head_S, remainder_tmpi, tail_S);
1403 if(head_S >0 || tail_S > 0) if(nch !='M') return 0;
1404
1405 if(head_S > 0) sprintf(cigar_added + strlen(cigar_added), "%dS", head_S );
1406 if(remainder_tmpi > 0){
1407 sprintf(cigar_added + strlen(cigar_added), "%d%c", remainder_tmpi , nch );
1408 if( nch == 'M' ) has_M = 1;
1409 }
1410 if(tail_S > 0) sprintf(cigar_added + strlen(cigar_added), "%dS", tail_S );
1411 }
1412
1413 read_cursor = next_read_cursor;
1414 tmpi = 0;
1415 }
1416 }
1417 //SUBREADprintf("RPCIGAR: %s => %s\n", cigar, cigar_added);
1418 strcpy(cigar, cigar_added);
1419 return has_M;
1420 }
1421
1422
1423 int convert_read_to_tmp(global_context_t * global_context , subread_output_context_t * output_context, int read_number, int is_second_read, int read_len, char * read_text, char * qual_text, realignment_result_t * current_result, subread_output_tmp_t * r, char * read_name)
1424 {
1425
1426 int is_r_OK;
1427 r -> raw_result = current_result -> mapping_result;
1428 r -> additional_information[0]=0;
1429
1430 is_r_OK = (current_result -> mapping_result -> result_flags & CORE_IS_FULLY_EXPLAINED) > 0;
1431
1432 if(0 && FIXLENstrcmp("V0112_0155:7:1101:5279:29143#ATCACG", read_name) == 0)
1433 SUBREADprintf("%s R_%d CPOINT1 : is_OK=%d\n", read_name , is_second_read + 1 , is_r_OK);
1434
1435 if(is_r_OK) {
1436
1437 int current_strand = (current_result -> mapping_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
1438 int is_first_section_jumped = current_result -> first_base_is_jumpped;
1439 if(is_first_section_jumped) assert((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection));
1440 strcpy(r-> current_cigar_decompress, current_result -> cigar_string);
1441
1442 int chimeric_sections = 0;
1443
1444 r->is_first_section_jumpped = is_first_section_jumped;
1445 r->linear_position = current_result -> first_base_position;
1446 //r->mapping_quality = current_result -> final_quality;
1447 r->mapping_quality = 40; // changed on 10-JUN-2016
1448 if(current_result -> realign_flags & CORE_IS_BREAKEVEN)
1449 r -> mapping_quality = 0;
1450 else
1451 r -> mapping_quality /= current_result->MAPQ_adjustment;
1452 //SUBREADprintf("REP=%d\n", current_result->MAPQ_adjustment);
1453
1454 strcpy(r->cigar, r -> current_cigar_decompress);
1455 r->strand = (current_result -> mapping_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
1456
1457 //sprintf(r->additional_information, "\tSM:i:%d",current_result -> final_mismatched_bases);
1458
1459
1460 //printf("SM='%s'\n", r->additional_information);
1461
1462 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection))
1463 {
1464 chimeric_sections = chimeric_cigar_parts(global_context, r->linear_position, is_first_section_jumped ^ current_strand, is_first_section_jumped, r->current_cigar_decompress, r->out_poses, output_context->out_cigar_buffer, r->out_strands, read_len, r->out_lens, read_name);
1465
1466 if(chimeric_sections > 0){
1467 int xk1;
1468 r->chimeric_sections = chimeric_sections;
1469 strcpy(r->out_cigars[0], output_context->out_cigar_buffer[0]);
1470 for(xk1=1; xk1<chimeric_sections; xk1++)
1471 {
1472 int chimeric_pos;
1473 char * chimaric_chr;
1474 strcpy(r->out_cigars[xk1], output_context->out_cigar_buffer[xk1]);
1475 unsigned int vitual_head_pos = r->out_poses[xk1];
1476 char strand_xor = (r->out_strands[xk1] == '-') != is_second_read;//!= (r->out_strands[0]=='-') ;
1477
1478 //if(( r->out_strands[xk1] == '-' ) != (r->out_strands[0]=='-' )) vitual_head_pos = move_to_read_head(vitual_head_pos, r->out_cigars[xk1]);
1479
1480 if(0==locate_gene_position_max(vitual_head_pos ,& global_context -> chromosome_table, & chimaric_chr, & chimeric_pos, NULL, NULL, 0+r->out_lens[xk1]))
1481 {
1482 int soft_clipping_movement = 0;
1483 soft_clipping_movement = get_soft_clipping_length(r->out_cigars[xk1]);
1484 assert(chimaric_chr);
1485 sprintf(r->additional_information + strlen(r->additional_information), "\tCG:Z:%s\tCP:i:%u\tCT:Z:%c\tCC:Z:%s", r->out_cigars[xk1] , max(1,chimeric_pos + soft_clipping_movement + 1), strand_xor?'-':'+' , chimaric_chr );
1486 }
1487 else is_r_OK = 0;
1488 }
1489 r->linear_position = r->out_poses[0];
1490 r->strand = r->out_strands[0]=='-';
1491
1492 strcpy(r->cigar , r->out_cigars[0]);
1493 }else is_r_OK = 0;
1494 }
1495 if(is_r_OK)
1496 r->soft_clipping_movements = get_soft_clipping_length(r->cigar);
1497 }
1498
1499 if(is_r_OK)
1500 {
1501 int head_cut = 0 , tail_cut = 0;
1502
1503 if(locate_gene_position_max(r->linear_position + r->soft_clipping_movements,& global_context -> chromosome_table, &r-> chro , &r -> offset, &head_cut, &tail_cut,(global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)?read_len:(current_result->chromosomal_length - r->soft_clipping_movements))){
1504 is_r_OK = 0;
1505 } else {
1506 int is_added_OK = 1;
1507 if(head_cut!=0 || tail_cut!=0)
1508 is_added_OK = add_head_tail_cut_softclipping(global_context, r->linear_position, r->cigar , read_len, head_cut, tail_cut);
1509
1510 if(is_added_OK){
1511 r -> offset++;
1512 assert(r-> chro);
1513 }else{
1514 is_r_OK = 0;
1515 }
1516 }
1517
1518 if(global_context -> config.do_breakpoint_detection && !(current_result -> realign_flags & CORE_NOTFOUND_DONORS))
1519 {
1520 sprintf(r->additional_information + strlen(r->additional_information), "\tXS:A:%c", (current_result -> realign_flags & CORE_IS_GT_AG_DONORS)?'+':'-');
1521 }
1522
1523 /*
1524 if(global_context -> config.more_accurate_fusions)
1525 {
1526
1527 sprintf(r->additional_information + strlen(r->additional_information), "\tDF:i:%d", current_result -> best_second_diff_bases >0?current_result -> best_second_diff_bases:9999 );
1528 }*/
1529
1530
1531 }
1532
1533 return is_r_OK;
1534 }
1535
1536
1537 int add_event_detected_from_cigar(global_context_t * global_context, unsigned int left_last_base, unsigned int right_first_base, int is_indel, int indel_len, int left_extend, int right_extend, int read_length, char * read_name)
1538 {
1539 HashTable * event_table = NULL;
1540 chromosome_event_t * event_space = NULL;
1541 event_table = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_entry_table;
1542 event_space = ((indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID]) -> event_space_dynamic;
1543
1544 chromosome_event_t *site_events[MAX_EVENT_ENTRIES_PER_SITE];
1545 int search_types ;
1546 if(is_indel) search_types = CHRO_EVENT_TYPE_INDEL;
1547 else search_types = CHRO_EVENT_TYPE_JUNCTION | CHRO_EVENT_TYPE_FUSION;
1548
1549 if(left_last_base > right_first_base) {
1550 unsigned int ttt;
1551 ttt = right_first_base;
1552 right_first_base = left_last_base;
1553 left_last_base = ttt;
1554 }
1555
1556 int xk1, site_events_no = search_event(global_context, event_table , event_space , left_last_base, EVENT_SEARCH_BY_BOTH_SIDES , search_types , site_events);
1557
1558 int is_found = 0;
1559 for(xk1 = 0; xk1 < site_events_no; xk1++){
1560 chromosome_event_t * this_event = site_events[xk1];
1561 if(is_indel && this_event -> event_type != CHRO_EVENT_TYPE_INDEL) continue;
1562 if((!is_indel) && this_event -> event_type == CHRO_EVENT_TYPE_INDEL) continue;
1563
1564 #ifndef __MINGW32__
1565 if(0 && ( memcmp(read_name, "V0112_0155:7:1104:17648:117432",28)==0 || ( this_event -> event_small_side > 2613701363 - 200 && this_event -> event_small_side < 2613701363 + 100) ))
1566 SUBREADprintf("EVENT: L=%u, R=%u (TABLE L=%u , R=%u), LEXT=%d, REXT=%d -- %s\n", left_last_base, right_first_base, this_event -> event_small_side, this_event -> event_large_side, left_extend, right_extend, read_name);
1567 #endif
1568
1569 if(this_event -> event_small_side == left_last_base && this_event -> event_large_side == right_first_base){
1570 if(is_indel && this_event -> indel_length == indel_len){
1571 this_event -> final_counted_reads ++;
1572 is_found = 1;
1573 break;
1574 }
1575 if(!is_indel){
1576 this_event -> final_counted_reads ++;
1577
1578 //
1579 if(read_length >= 80)
1580 {
1581 if(left_extend > 30 && right_extend > 30) this_event -> critical_supporting_reads ++;
1582 }else{
1583 if(left_extend > 18 && right_extend > 18) this_event -> critical_supporting_reads ++;
1584 }
1585 this_event -> junction_flanking_left = max(this_event -> junction_flanking_left, left_extend);
1586 this_event -> junction_flanking_right = max(this_event -> junction_flanking_right, right_extend);
1587 is_found = 1;
1588 break;
1589 }
1590 }
1591 }
1592
1593 if(!is_found){
1594 //SUBREADprintf("\nEVENT NOT FOUND.\n\n");
1595 return 1;
1596 }
1597 return 0;
1598 }
1599
1600 int get_junction_right_extension(char * remainder_cigar){
1601 int ret = 0;
1602 unsigned int tmpi = 0;
1603 int cigar_cursor, nch;
1604
1605 for(cigar_cursor = 0;;cigar_cursor++){
1606 nch = remainder_cigar[cigar_cursor];
1607 if(0 == nch)break;
1608 if(isdigit(nch))tmpi = tmpi*10 + nch - '0';
1609 else{
1610 if(nch == 'M' || nch == 'D')
1611 ret += tmpi;
1612 if(nch == 'N' || nch == 'n' || nch == 'B' || nch == 'b')
1613 break;
1614
1615 tmpi = 0;
1616 }
1617 }
1618 return ret;
1619 }
1620
1621 void remove_soft_clipping(char * dst, char * src){
1622 unsigned int tmpi = 0, head_clip = 0, tail_clip = 0, last_M = 0;
1623 int cigar_cursor, is_first_s = 1, nch, nch2;
1624
1625 dst[0] = 0;
1626
1627 for(cigar_cursor = 0;;cigar_cursor++){
1628 nch = src[cigar_cursor];
1629 nch2 = src[cigar_cursor+1];
1630
1631 if(0 == nch)break;
1632 if(isdigit(nch))tmpi = tmpi*10 + nch - '0';
1633 else{
1634 if('S' == nch){
1635 if(is_first_s) head_clip = tmpi;
1636 if(nch2 == 0) tail_clip = tmpi;
1637 }
1638 else if('M' == nch){
1639 last_M = tmpi;
1640 }
1641 else{
1642 if(last_M){
1643 sprintf(dst + strlen(dst), "%uM", last_M + head_clip);
1644 last_M = 0;
1645 head_clip = 0;
1646 }
1647 sprintf(dst + strlen(dst), "%u%c", tmpi, nch);
1648 }
1649 tmpi=0;
1650 is_first_s = 0;
1651 }
1652 }
1653 if(last_M){
1654 sprintf(dst + strlen(dst), "%uM" , last_M + tail_clip + head_clip);
1655 }
1656
1657 }
1658
1659 int getFirstM(char * cig){
1660 int tmpi = 0, nch, x1;
1661
1662 for(x1=0; 0!=(nch = cig[x1]) ; x1++){
1663 if(isdigit(nch)){
1664 tmpi=tmpi*10 + nch - '0';
1665 }else break;
1666 }
1667 return tmpi;
1668 }
1669
1670 int calc_tlen(global_context_t * global_context, subread_output_tmp_t * rec1 , subread_output_tmp_t * rec2, int read_len_1, int read_len_2);
1671
1672 int calc_flags(global_context_t * global_context, thread_context_t * thread_context, subread_output_tmp_t * rec1 , subread_output_tmp_t * rec2, int is_second_read, int read_len_1, int read_len_2, int current_location_no , int tlen, int this_OK, int mate_OK)
1673 {
1674 int ret, is_TLEN_wrong = 0;
1675
1676 if(global_context->input_reads.is_paired_end_reads)
1677 {
1678 ret = SAM_FLAG_PAIRED_TASK;
1679 ret |= is_second_read?SAM_FLAG_SECOND_READ_IN_PAIR:SAM_FLAG_FIRST_READ_IN_PAIR;
1680
1681 subread_output_tmp_t * this_rec = is_second_read?rec2:rec1;
1682 subread_output_tmp_t * mate_rec = is_second_read?rec1:rec2;
1683
1684 if(this_OK == 0) ret |= SAM_FLAG_UNMAPPED;
1685 else
1686 if(this_rec->strand + is_second_read == 1) ret |= SAM_FLAG_REVERSE_STRAND_MATCHED;
1687
1688 if(mate_OK == 0) ret |= SAM_FLAG_MATE_UNMATCHED;
1689 else
1690 if(mate_rec->strand + is_second_read != 1) ret |= SAM_FLAG_MATE_REVERSE_STRAND_MATCHED;
1691
1692 if(rec1 && rec2)
1693 {
1694 int TLEN = tlen;//calc_tlen(global_context , rec1, rec2, read_len_1, read_len_2);
1695 int is_PEM = 0;
1696 if( rec1 -> chro == rec2 -> chro /* two pointers can be directly compared. */ && TLEN >= global_context->config.minimum_pair_distance &&
1697 TLEN <= global_context-> config.maximum_pair_distance && this_rec->strand == mate_rec->strand )
1698 {
1699 if(global_context -> config.is_first_read_reversed && !(global_context -> config.is_second_read_reversed))
1700 {
1701 if(this_rec -> strand == 0) {
1702 if((is_second_read + (mate_rec-> offset > this_rec -> offset) == 1) || mate_rec-> offset == this_rec -> offset)
1703 is_PEM = 1;
1704 else is_TLEN_wrong = 1;
1705 }
1706 }
1707 else
1708 {
1709 if(this_rec -> strand) {
1710 if((is_second_read + (mate_rec-> offset < this_rec -> offset) == 1) || mate_rec-> offset == this_rec -> offset) is_PEM = 1;
1711 else is_TLEN_wrong = 1;
1712 }else {
1713 if((is_second_read + (mate_rec-> offset > this_rec -> offset) == 1) || mate_rec-> offset == this_rec -> offset) is_PEM = 1;
1714 else is_TLEN_wrong = 1;
1715 }
1716 }
1717 }
1718 if(is_PEM) ret |= SAM_FLAG_MATCHED_IN_PAIR;
1719 else if(is_second_read){
1720 if(rec1 -> chro != rec2 -> chro){
1721 if(thread_context) thread_context -> not_properly_pairs_different_chro ++;
1722 else global_context -> not_properly_pairs_different_chro++;
1723 }else if(this_rec->strand != mate_rec->strand){
1724 if(thread_context) thread_context -> not_properly_different_strands ++;
1725 else global_context -> not_properly_different_strands ++;
1726 }else if(TLEN < global_context->config.minimum_pair_distance || TLEN > global_context-> config.maximum_pair_distance){
1727 if(thread_context) thread_context -> not_properly_pairs_TLEN_wrong ++;
1728 else global_context -> not_properly_pairs_TLEN_wrong ++;
1729 }else if(is_TLEN_wrong){
1730 if(thread_context) thread_context -> not_properly_pairs_wrong_arrangement ++;
1731 else global_context -> not_properly_pairs_wrong_arrangement ++;
1732 }
1733
1734 }
1735 }
1736 }
1737 else
1738 {
1739 ret = 0;
1740
1741 if(this_OK == 0) ret |= SAM_FLAG_UNMAPPED;
1742 else
1743 if(rec1->strand) ret |= SAM_FLAG_REVERSE_STRAND_MATCHED;
1744 }
1745
1746 if(current_location_no>0){
1747 if((rec1 && !is_second_read)||(rec2 && is_second_read))
1748 ret |= SAM_FLAG_SECONDARY_MAPPING;
1749 }
1750
1751 return ret;
1752 }
1753
1754
1755 int calc_tlen(global_context_t * global_context, subread_output_tmp_t * rec1 , subread_output_tmp_t * rec2, int read_len_1, int read_len_2)
1756 {
1757 int ret = -1;
1758
1759 unsigned int r1_head_pos = rec1 -> offset;// - rec1 -> soft_clipping_movements;
1760 unsigned int r2_head_pos = rec2 -> offset;// - rec2 -> soft_clipping_movements;
1761
1762 if(r1_head_pos == r2_head_pos)
1763 ret = max(read_len_1, read_len_2); // the two reads are fully overlapping
1764 else{
1765 int is_r2_smaller = r2_head_pos < r1_head_pos;
1766 subread_output_tmp_t * smaller_rec = is_r2_smaller?rec2:rec1;
1767 unsigned int smaller_head_pos = is_r2_smaller?r2_head_pos:r1_head_pos;
1768 unsigned int larger_head_pos = is_r2_smaller?r1_head_pos:r2_head_pos;
1769
1770 int len_larger_rec = is_r2_smaller?read_len_1:read_len_2;
1771 int len_smaller_rec = is_r2_smaller?read_len_2:read_len_1;
1772
1773 unsigned int tmpi = 0;
1774 unsigned int chro_cursor = smaller_head_pos;
1775 unsigned int section_end = 0;
1776 unsigned int read_cursor = 0;
1777 int cigar_cursor, nch, nch2;
1778 for(cigar_cursor = 0; ; cigar_cursor++){
1779 nch = smaller_rec -> cigar[cigar_cursor];
1780 nch2 = smaller_rec -> cigar[cigar_cursor+1];
1781
1782 if(nch > 0){
1783 if(isdigit(nch)) tmpi = tmpi * 10 + nch - '0';
1784 else{
1785 if(nch == 'M' || nch == 'S'){
1786 chro_cursor += tmpi;
1787 read_cursor += tmpi;
1788 section_end = chro_cursor;
1789 }
1790 if(nch == 'N' || nch == 'B' || nch == 'b' || nch == 'n' || nch == 'I' || nch == 'D' || nch2 == 0){
1791 if(nch == 'N' || nch == 'D')
1792 chro_cursor += tmpi;
1793
1794 //SUBREADprintf("NCH=%c, NCH2=%d, SEC_END=%u, LARGE_HEAD=%u\n", nch, nch2, section_end, larger_head_pos);
1795 if(section_end >= larger_head_pos)
1796 {
1797 ret = read_cursor + larger_head_pos - section_end + len_larger_rec;
1798 break;
1799 }
1800 }
1801
1802 if(nch == 'I'){
1803 read_cursor += tmpi;
1804 }
1805
1806 if(nch == 'B' || nch == 'b' || nch == 'n') break; // fusion block, unable to determine the TLEN after the fusion point.
1807
1808 tmpi = 0;
1809 }
1810 }else{
1811 break;
1812 }
1813
1814 }
1815
1816 if(ret < 0)
1817 ret = larger_head_pos - section_end + len_larger_rec + len_smaller_rec;
1818 }
1819
1820
1821 //SUBREADprintf("TLEN = %d :: \tREC1 : %s:%u (%s - %d) ; REC2: %s:%u (%s - %d )\n", ret, rec1 -> chro, rec1 -> offset, rec1 -> cigar, rec1 -> soft_clipping_movements,
1822 // rec2 -> chro, rec2 -> offset, rec2 -> cigar, rec2 -> soft_clipping_movements);
1823
1824 return ret;
1825
1826 }
1827
1828 int calc_should_reverse(global_context_t * global_context, subread_output_tmp_t * rec1 , subread_output_tmp_t * rec2, int is_second_read)
1829 {
1830 subread_output_tmp_t * cur_rec = is_second_read?rec2:rec1;
1831
1832 if(cur_rec)
1833 return cur_rec->strand;
1834 else
1835 {
1836 return is_second_read;
1837 //subread_output_tmp_t * mate_rec = is_second_read?rec2:rec1;
1838 //return mate_rec?mate_rec->strand:is_second_read;
1839 }
1840 }
1841
1842 void remove_nm_i(char * st)
1843 {
1844 char * st_nm = strstr(st, "\tNM:i:");
1845 if(st_nm)
1846 {
1847 char * write_cursor = st_nm;
1848 int started = 0;
1849 st_nm++;
1850 while(1)
1851 {
1852 char nch = *st_nm;
1853 if(!nch)
1854 {
1855 *write_cursor=0;
1856 break;
1857 }
1858 if(nch == '\t') started = 1;
1859 if(started)
1860 {
1861 *write_cursor=*st_nm;
1862 write_cursor++;
1863 }
1864 st_nm++;
1865 }
1866 }
1867 }
1868
1869 double last_t1 = 0;
1870 int for_other_thread = 0;
1871 int for_one_threads = 0;
1872
1873
1874 void add_buffered_fragment(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t pair_number ,
1875 char * read_name1, unsigned int flags1, char * chro_name1, unsigned int chro_position1, int mapping_quality1, char * cigar1,
1876 char * next_chro_name1, unsigned int next_chro_pos1, int temp_len1, int read_len1,
1877 char * read_text1, char * qual_text1, char * additional_columns1,
1878 char * read_name2, unsigned int flags2, char * chro_name2, unsigned int chro_position2, int mapping_quality2, char * cigar2,
1879 char * next_chro_name2, unsigned int next_chro_pos2, int temp_len2, int read_len2,
1880 char * read_text2, char * qual_text2, char * additional_columns2,
1881 int all_locations, int this_location
1882 ){
1883
1884 assert(global_context -> config.all_threads > 1);
1885
1886 if( global_context -> config.is_BAM_output && !global_context -> config.is_input_read_order_required){
1887 //SUBREADprintf("MTBAM : THR_%d CORE: %s\n", thread_context -> thread_id, read_name1);
1888 SamBam_writer_add_read(global_context -> output_bam_writer, thread_context -> thread_id, read_name1, flags1, chro_name1, chro_position1, mapping_quality1, cigar1, next_chro_name1, next_chro_pos1, temp_len1, read_len1, read_text1, qual_text1, additional_columns1, !global_context->input_reads.is_paired_end_reads);
1889 if(global_context->input_reads.is_paired_end_reads)
1890 SamBam_writer_add_read(global_context -> output_bam_writer, thread_context -> thread_id, read_name2, flags2, chro_name2, chro_position2, mapping_quality2, cigar2, next_chro_name2, next_chro_pos2, temp_len2, read_len2, read_text2, qual_text2, additional_columns2, 1);
1891 }else{
1892 while(1){
1893 int is_fin = 0;
1894 subread_lock_occupy(&global_context -> output_lock);
1895 //SUBREADprintf("THWT : TH_%d IS AFTER %lld ; LAST=%lld (%d/%d)\n", thread_context -> thread_id, pair_number -1, global_context -> last_written_fragment_number, this_location, all_locations);
1896 if(global_context -> last_written_fragment_number == pair_number-1){
1897 if(global_context -> config.is_BAM_output){
1898 SamBam_writer_add_read(global_context -> output_bam_writer, -1, read_name1, flags1, chro_name1 , chro_position1, mapping_quality1, cigar1, next_chro_name1 , next_chro_pos1, temp_len1, read_len1, read_text1, qual_text1, additional_columns1, !global_context->input_reads.is_paired_end_reads);
1899
1900 if(global_context->input_reads.is_paired_end_reads)
1901 SamBam_writer_add_read(global_context -> output_bam_writer, -2, read_name2, flags2, chro_name2 , chro_position2, mapping_quality2, cigar2, next_chro_name2 , next_chro_pos2, temp_len2, read_len2, read_text2, qual_text2, additional_columns2, 1);
1902
1903 }else{
1904 int write_len_2 = 100, write_len = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%d\t%s\t%s%s%s\n", read_name1, flags1, chro_name1, chro_position1, mapping_quality1, cigar1, next_chro_name1, next_chro_pos1, temp_len1, read_text1, qual_text1, additional_columns1[0]?"\t":"", additional_columns1);
1905 if(global_context->input_reads.is_paired_end_reads)
1906 write_len_2 = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%d\t%s\t%s%s%s\n", read_name2, flags2, chro_name2, chro_position2, mapping_quality2, cigar2, next_chro_name2, next_chro_pos2, temp_len2, read_text2, qual_text2, additional_columns2[0]?"\t":"", additional_columns2);
1907
1908 if( write_len < 10 || write_len_2 < 10 ){
1909 global_context -> output_sam_is_full = 1;
1910 }
1911 }
1912 if(all_locations <= this_location +1)global_context -> last_written_fragment_number = pair_number;
1913 is_fin = 1;
1914 }
1915 subread_lock_release(&global_context -> output_lock);
1916 if(is_fin)break;
1917 usleep(2);
1918 }
1919 }
1920 }
1921
1922 #define write_chunk_results_145 write_chunk_results
1923
1924 // rec1 or rec2 is OK if they are not NULL.
1925 void write_single_fragment(global_context_t * global_context, thread_context_t * thread_context, subread_output_tmp_t * rec1, realignment_result_t * raw_r1, subread_output_tmp_t * rec2, realignment_result_t * raw_r2, int all_locations , int current_location , char * read_name_1, char * read_name_2, int read_len_1, int read_len_2, char * read_text_1, char * read_text_2, char * qual_text_1, char * qual_text_2, subread_read_number_t pair_number, int non_informative_subread_r1, int non_informative_subread_r2, int is_R1_OK, int is_R2_OK)
1926 {
1927
1928 //assert(all_locations <= global_context -> config.reported_multi_best_reads);
1929 int tlen = 0;
1930
1931 if(is_R1_OK && is_R2_OK && rec1->chro == rec2->chro)tlen = calc_tlen(global_context , rec1, rec2, read_len_1, read_len_2);
1932
1933 //#warning "SUBREAD_151 ============= REMOVE '+1' when not doing structure variance detection for the proposal =========="
1934 if(0)
1935 if((global_context -> config.do_fusion_detection || global_context -> config.do_long_del_detection) && rec1 && rec2 && current_location == 0 && rec1 -> chimeric_sections == 1 && rec2 -> chimeric_sections == 1){
1936 // when current_location == 0, both r1 and r2 were on the same strand.
1937 int is_funky = is_funky_fragment(global_context, read_name_1, rec1 -> chro, rec1 -> offset, read_len_1, rec1 -> strand, rec1 -> out_cigars[0], read_text_1, read_name_2, rec2 -> chro, rec2 -> offset, read_len_2, rec2 -> strand, rec2 -> out_cigars[0] , read_text_2, tlen);
1938 if (0 &&is_funky)
1939 SUBREADprintf("RNAME %s is %d, CHRO = '%s' %p '%s' %p, POS=%u, %u\n", read_name_1, is_funky, rec1 -> chro,rec1 -> chro,rec2 -> chro,rec2 -> chro, rec1 -> offset, rec2 -> offset);
1940
1941 if(is_funky & FUNKY_FRAGMENT_A){
1942 fraglist_append(&global_context -> funky_list_A, pair_number);
1943 }
1944 if(1)if(is_funky & FUNKY_FRAGMENT_BC){
1945 //#warning "LOGIC WRONG: R1 AND R2 SHOULD BE DECIDED BY THEIR MAPPING POSITIONS"
1946 bktable_append(&global_context -> funky_table_BC, rec1 -> chro, rec1 -> offset , NULL + (2*pair_number));
1947 bktable_append(&global_context -> funky_table_BC, rec2 -> chro, rec2 -> offset , NULL + (2*pair_number+1));
1948 }
1949 if(1)if(is_funky & FUNKY_FRAGMENT_DE){
1950 fraglist_append(&global_context -> funky_list_DE, pair_number);
1951 bktable_append(&global_context -> funky_table_DE, rec1 -> chro, rec1 -> offset , NULL + (2*pair_number + (rec1 -> offset > rec2 -> offset ? 1:0)));
1952 bktable_append(&global_context -> funky_table_DE, rec2 -> chro, rec2 -> offset , NULL + (2*pair_number + (rec1 -> offset < rec2 -> offset ? 1:0)));
1953 }
1954 }
1955
1956 int flag1 = calc_flags( global_context, thread_context , rec1, rec2, 0, read_len_1, read_len_2, current_location, tlen, is_R1_OK, is_R2_OK);
1957
1958 int flag2 = -1;
1959
1960 if(global_context->input_reads.is_paired_end_reads)
1961 {
1962 flag2 = calc_flags( global_context , thread_context, rec1, rec2, 1, read_len_1, read_len_2, current_location, tlen, is_R2_OK, is_R1_OK);
1963 if((0 == current_location) && (flag2 & SAM_FLAG_MATCHED_IN_PAIR)){
1964 if(thread_context)thread_context->all_correct_PE_reads ++;
1965 else global_context->all_correct_PE_reads ++;
1966 }
1967 }
1968
1969 int applied_reverse_space;
1970 applied_reverse_space = global_context->config.space_type;
1971 if(global_context -> config.convert_color_to_base)
1972 {
1973
1974 //SUBREADprintf("BEST_READ_NO = %d / %d\n", current_location, all_locations);
1975 //SUBREADprintf("ORGI: %s\n", read_text_1);
1976 colorread2base(read_text_1, read_len_1+1);
1977 //SUBREADprintf("CONV: %s\n\n", read_text_1);
1978
1979 if(global_context->input_reads.is_paired_end_reads)
1980 colorread2base(read_text_2, read_len_2+1);
1981
1982 applied_reverse_space = GENE_SPACE_BASE;
1983 }
1984
1985 int should_1_reverse = calc_should_reverse( global_context , rec1, rec2, 0);
1986
1987 if(should_1_reverse)
1988 {
1989 reverse_read(read_text_1, read_len_1 + global_context->config.convert_color_to_base, applied_reverse_space);
1990 reverse_quality(qual_text_1, read_len_1);
1991 }
1992
1993 if(global_context->input_reads.is_paired_end_reads)
1994 {
1995 int should_2_reverse = calc_should_reverse( global_context , rec1, rec2, 1);
1996
1997 if(should_2_reverse)
1998 {
1999 reverse_read(read_text_2, read_len_2 + global_context->config.convert_color_to_base, applied_reverse_space);
2000 reverse_quality(qual_text_2, read_len_2);
2001 }
2002 }
2003 remove_backslash(read_name_1);
2004 if(global_context->input_reads.is_paired_end_reads) remove_backslash(read_name_2);
2005
2006 int display_offset1 = 0, display_tailgate1 = 0;
2007 int display_offset2 = 0, display_tailgate2 = 0;
2008
2009 if(global_context -> config.space_type == GENE_SPACE_COLOR)
2010 {
2011
2012 if(rec1 && rec1 -> strand)
2013 {
2014 display_offset1 = 0;
2015 display_tailgate1 = 1;
2016 }
2017 else
2018 {
2019 if(!global_context -> config.convert_color_to_base)
2020 {
2021 // the first base was a fake prime base; the second base is the first meaningful base.
2022 char second_char = read_text_1[1];
2023 read_text_1[1] = color2char(second_char, read_text_1[0]);
2024 }
2025 display_offset1 = 1;
2026 display_tailgate1 = 0;
2027 }
2028
2029 if(rec2 && rec2 -> strand)
2030 {
2031 if(!global_context -> config.convert_color_to_base)
2032 {
2033 // the first base was a fake prime base; the second base is the first meaningful base.
2034 char second_char = read_text_2[1];
2035 read_text_2[1] = color2char(second_char, read_text_2[0]);
2036 }
2037 display_offset2 = 1;
2038 display_tailgate2 = 0;
2039 }
2040 else
2041 {
2042 display_offset2 = 0;
2043 display_tailgate2 = 1;
2044 }
2045 assert(display_offset1 + display_tailgate1 == 1);
2046 assert(display_offset2 + display_tailgate2 == 1);
2047 }
2048 if(!qual_text_1[0])
2049 {
2050 int xi2;
2051 for(xi2=display_offset1;read_text_1[xi2 + display_tailgate1];xi2++) qual_text_1[xi2 - display_offset1] = 'I';
2052 qual_text_1[xi2 - display_offset1]=0;
2053 for(xi2=display_offset2;read_text_2[xi2 + display_tailgate2];xi2++) qual_text_2[xi2 - display_offset2] = 'I';
2054 qual_text_2[xi2 - display_offset2]=0;
2055
2056 }
2057
2058
2059
2060 if(display_tailgate1)
2061 read_text_1[strlen(read_text_1)-1]=0;
2062 if(display_tailgate2)
2063 read_text_2[strlen(read_text_2)-1]=0;
2064
2065
2066 if(global_context -> config.space_type == GENE_SPACE_BASE){
2067 if(is_R1_OK) {// && !strstr(rec1->additional_information, "\tNM:i:")){
2068 short rec1_edit = calc_edit_dist(global_context, rec1->raw_result, rec1->cigar , rec1->linear_position, read_text_1, raw_r1 -> final_mismatched_bases);
2069 sprintf(rec1->additional_information + strlen( rec1->additional_information), "\tNM:i:%d", rec1_edit );
2070 }
2071 if(global_context->input_reads.is_paired_end_reads && is_R2_OK) //&& !strstr(rec2->additional_information, "\tNM:i:"))
2072 {
2073 short rec2_edit = calc_edit_dist(global_context, rec2->raw_result, rec2->cigar , rec2->linear_position, read_text_2, raw_r2 -> final_mismatched_bases);
2074 sprintf(rec2->additional_information + strlen( rec2->additional_information), "\tNM:i:%d", rec2_edit );
2075 }
2076 }
2077
2078 char extra_additional_1 [1000+CORE_ADDITIONAL_INFO_LENGTH], extra_additional_2[1000+CORE_ADDITIONAL_INFO_LENGTH];
2079 int extra_additional_1_ptr=0, extra_additional_2_ptr=0;
2080
2081 extra_additional_1[0] = extra_additional_2[0]=0;
2082
2083 if(is_R1_OK || is_R2_OK){ // no NH and HI only if both ends are unmapped.
2084 extra_additional_1_ptr +=snprintf(extra_additional_1, 310, "HI:i:%d\tNH:i:%d", current_location+1, all_locations);
2085 extra_additional_2_ptr +=snprintf(extra_additional_2, 310, "HI:i:%d\tNH:i:%d", current_location+1, all_locations);
2086 }
2087
2088 if(global_context->config.read_group_id[0])
2089 {
2090 extra_additional_1_ptr += snprintf(extra_additional_1+ extra_additional_1_ptr, 310, "\tRG:Z:%s", global_context->config.read_group_id);
2091 extra_additional_2_ptr += snprintf(extra_additional_2+ extra_additional_2_ptr, 310, "\tRG:Z:%s", global_context->config.read_group_id);
2092 }
2093
2094 char * out_chro1, * out_chro2, *out_cigar1, *out_cigar2;
2095 if(is_R1_OK)
2096 {
2097 strcpy(extra_additional_1 + extra_additional_1_ptr, rec1->additional_information);
2098 out_chro1 = rec1->chro;
2099 out_cigar1 = rec1->cigar;
2100 }
2101 else
2102 {
2103 out_chro1 = "*";
2104 out_cigar1 = "*";
2105 }
2106
2107 if(is_R2_OK)
2108 {
2109 strcpy(extra_additional_2 + extra_additional_2_ptr, rec2->additional_information);
2110 out_chro2 = rec2->chro;
2111 out_cigar2 = rec2->cigar;
2112 }
2113 else
2114 {
2115 out_chro2 = "*";
2116 out_cigar2 = "*";
2117 }
2118
2119 int out_offset1=0, out_offset2=0;
2120 long long int out_tlen1, out_tlen2;
2121 int out_mapping_quality1 = 0, out_mapping_quality2 = 0;
2122
2123 out_tlen1 = tlen;
2124 out_tlen2 = tlen;
2125 if(is_R1_OK && is_R2_OK)
2126 {
2127 if( rec1->offset > rec2->offset) out_tlen1 = - out_tlen1;
2128 else if(rec2->offset > rec1->offset) out_tlen2 = -out_tlen2;
2129 else{
2130 if( rec1 -> strand ) out_tlen1 = - out_tlen1;
2131 else out_tlen2 = -out_tlen2;
2132 }
2133 }
2134
2135 if(0==current_location)
2136 {
2137 if(global_context -> input_reads.is_paired_end_reads)
2138 {
2139 if(is_R1_OK || is_R2_OK){
2140 //SUBREADprintf("MAPTOLEN\n");
2141 if(thread_context)
2142 thread_context -> all_mapped_reads ++;
2143 else
2144 global_context -> all_mapped_reads ++;
2145 }
2146 }
2147 else if(is_R1_OK){
2148 if(thread_context)
2149 thread_context -> all_mapped_reads ++;
2150 else
2151 global_context -> all_mapped_reads ++;
2152 }
2153 }
2154
2155 if(is_R1_OK){
2156 out_offset1 = max(1, rec1->offset);
2157 out_mapping_quality1 = rec1->mapping_quality;
2158 }
2159 if(is_R2_OK){
2160 out_offset2 = max(1, rec2->offset);
2161 out_mapping_quality2 = rec2->mapping_quality;
2162 }
2163
2164
2165 char * mate_chro_for_1 = out_chro2;
2166 char * mate_chro_for_2 = out_chro1;
2167
2168 if(out_chro1 == out_chro2 && out_chro1 && out_chro1[0]!='*'){
2169 mate_chro_for_1="=";
2170 mate_chro_for_2="=";
2171 }
2172
2173 //if(pair_number<10)SUBREADprintf("FIN_ADD #%d: %s in %p \n", pair_number, read_name_1, thread_context);
2174
2175 if(thread_context)
2176 add_buffered_fragment(global_context, thread_context, pair_number,
2177 read_name_1, flag1, out_chro1 , out_offset1, out_mapping_quality1, out_cigar1, mate_chro_for_1 , out_offset2, out_tlen1, read_len_1,
2178 read_text_1 + display_offset1, qual_text_1, extra_additional_1,
2179 read_name_2, flag2, out_chro2 , out_offset2, out_mapping_quality2, out_cigar2, mate_chro_for_2 , out_offset1, out_tlen2, read_len_2,
2180 read_text_2 + display_offset2, qual_text_2, extra_additional_2, all_locations, current_location);
2181 else{
2182 //subread_lock_occupy(&global_context -> output_lock);
2183 if(global_context -> config.is_BAM_output){
2184 SamBam_writer_add_read(global_context -> output_bam_writer, -1, read_name_1, flag1, out_chro1 , out_offset1, out_mapping_quality1, out_cigar1, mate_chro_for_1 , out_offset2, out_tlen1, read_len_1, read_text_1 + display_offset1, qual_text_1, extra_additional_1, !global_context->input_reads.is_paired_end_reads);
2185
2186 if(global_context->input_reads.is_paired_end_reads)
2187 SamBam_writer_add_read(global_context -> output_bam_writer, -1, read_name_2, flag2, out_chro2 , out_offset2, out_mapping_quality2, out_cigar2, mate_chro_for_2 , out_offset1, out_tlen2, read_len_2, read_text_2 + display_offset2, qual_text_2, extra_additional_2, 1);
2188 }
2189 else
2190 {
2191 int write_len_2 = 100,
2192 #ifdef __MINGW32__
2193 write_len = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%I64d\t%s\t%s%s%s\n", read_name_1, flag1, out_chro1, out_offset1, out_mapping_quality1, out_cigar1, mate_chro_for_1, out_offset2, out_tlen1, read_text_1 + display_offset1, qual_text_1, extra_additional_1[0]?"\t":"", extra_additional_1);
2194 #else
2195 write_len = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%s\t%s%s%s\n", read_name_1, flag1, out_chro1, out_offset1, out_mapping_quality1, out_cigar1, mate_chro_for_1, out_offset2, out_tlen1, read_text_1 + display_offset1, qual_text_1, extra_additional_1[0]?"\t":"", extra_additional_1);
2196 #endif
2197 if(global_context->input_reads.is_paired_end_reads)
2198 #ifdef __MINGW32__
2199 write_len_2 = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%I64d\t%s\t%s%s%s\n", read_name_2, flag2, out_chro2, out_offset2, out_mapping_quality2, out_cigar2, mate_chro_for_2, out_offset1, out_tlen2, read_text_2 + display_offset2, qual_text_2, extra_additional_2[0]?"\t":"", extra_additional_2);
2200 #else
2201 write_len_2 = sambamout_fprintf(global_context -> output_sam_fp , "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%s\t%s%s%s\n", read_name_2, flag2, out_chro2, out_offset2, out_mapping_quality2, out_cigar2, mate_chro_for_2, out_offset1, out_tlen2, read_text_2 + display_offset2, qual_text_2, extra_additional_2[0]?"\t":"", extra_additional_2);
2202 #endif
2203
2204 if( write_len < 10 || write_len_2 < 10 ){
2205 global_context -> output_sam_is_full = 1;
2206 }
2207 }
2208 //subread_lock_release(&global_context -> output_lock);
2209 }
2210
2211
2212 if(is_R1_OK) rec1->raw_result->selected_position += rec1->soft_clipping_movements;
2213 if(is_R2_OK) rec2->raw_result->selected_position += rec2->soft_clipping_movements;
2214
2215 }
2216
2217
2218 void init_chunk_scanning_parameters(global_context_t * global_context, thread_context_t * thread_context, gene_input_t ** ginp1, gene_input_t ** ginp2)
2219 {
2220 *ginp2 = NULL;
2221 *ginp1 = &global_context->input_reads.first_read_file;
2222 if(global_context->input_reads.is_paired_end_reads)
2223 *ginp2 = &global_context->input_reads.second_read_file;
2224 }
2225
2226 gene_value_index_t * find_current_value_index(global_context_t * global_context, unsigned int pos, int len)
2227 {
2228 int block_no;
2229 if(global_context->index_block_number<2)
2230 {
2231 unsigned index_begin = global_context -> all_value_indexes [0] . start_base_offset;
2232 unsigned index_end = global_context -> all_value_indexes [0] . start_base_offset + global_context -> all_value_indexes [0] . length;
2233
2234 if(pos>=index_begin && pos + len<index_end)
2235 return & global_context->all_value_indexes [0];
2236 else return NULL;
2237 }
2238 else
2239 for(block_no=0; block_no<global_context->index_block_number; block_no++)
2240 {
2241 unsigned index_begin = global_context -> all_value_indexes [block_no] . start_base_offset;
2242 unsigned index_end = global_context -> all_value_indexes [block_no] . start_base_offset + global_context -> all_value_indexes [block_no] . length;
2243 if((block_no == 0 && pos >= index_begin && pos < index_end - 1000000) ||
2244 (block_no>0 && block_no<global_context->index_block_number-1 && pos >= index_begin+ 1000000 && pos < index_end - 1000000) ||
2245 (block_no == global_context->index_block_number-1 && pos >= index_begin + 1000000 && pos < index_end ))
2246 {
2247 return & global_context -> all_value_indexes [block_no];
2248 }
2249 }
2250 return NULL;
2251 }
2252 //this function selects the correct all_value_indexes from global_context and put it to global_context or thread_context if thread_context is not NULL.
2253 int locate_current_value_index(global_context_t * global_context, thread_context_t * thread_context, mapping_result_t * result, int rlen)
2254 {
2255 int block_no;
2256
2257 if(global_context->index_block_number<2)
2258 {
2259 unsigned index_begin = global_context -> all_value_indexes [0] . start_base_offset;
2260 unsigned index_end = global_context -> all_value_indexes [0] . start_base_offset + global_context -> all_value_indexes [0] . length;
2261
2262 //SUBREADprintf("RESET2 : %u should <= %u\n", result->selected_position + rlen, index_end);
2263
2264 if(result->selected_position>=index_begin && result->selected_position + rlen<=index_end)
2265 {
2266 if(thread_context)thread_context->current_value_index = & global_context->all_value_indexes [0];
2267 else global_context->current_value_index =& global_context->all_value_indexes [0];
2268 return 0;
2269 }
2270 else return 1;
2271 }
2272 for(block_no=0; block_no<global_context->index_block_number; block_no++)
2273 {
2274 unsigned index_begin = global_context -> all_value_indexes [block_no] . start_base_offset;
2275 unsigned index_end = global_context -> all_value_indexes [block_no] . start_base_offset + global_context -> all_value_indexes [block_no] . length;
2276 if((block_no == 0 && result->selected_position >= index_begin && result->selected_position < index_end - 1000000) ||
2277 (block_no>0 && block_no<global_context->index_block_number-1 && result->selected_position >= index_begin+ 1000000 && result->selected_position < index_end - 1000000) ||
2278 (block_no == global_context->index_block_number-1 && result->selected_position >= index_begin + 1000000 && result->selected_position < index_end ))
2279 {
2280 if(thread_context)thread_context->current_value_index =& global_context -> all_value_indexes [block_no];
2281 else global_context->current_value_index =& global_context -> all_value_indexes [block_no];
2282 return 0;
2283 }
2284 }
2285 return 1;
2286 }
2287
2288
2289
2290
2291 int do_iteration_one(global_context_t * global_context, thread_context_t * thread_context)
2292 {
2293 assert(0); // not needed anymore
2294 return 0;
2295 }
2296
2297
2298
2299 int finish_iteration_three(global_context_t * global_context, thread_context_t * thread_context)
2300 {
2301 return 0;
2302 }
2303 int do_iteration_three(global_context_t * global_context, thread_context_t * thread_context)
2304 {
2305 int ret;
2306 gene_input_t * ginp1 = NULL , * ginp2 = NULL;
2307 subread_read_number_t current_read_number=0;
2308 char read_text_1[MAX_READ_LENGTH+1], read_text_2[MAX_READ_LENGTH+1];
2309 char qual_text_1[MAX_READ_LENGTH+1], qual_text_2[MAX_READ_LENGTH+1];
2310 char read_name_1[MAX_READ_NAME_LEN+1], read_name_2[MAX_READ_NAME_LEN+1];
2311 int read_len_1, read_len_2=0, sqr_interval, sqr_read_number = 0;
2312
2313 //unsigned int low_index_border = global_context -> current_value_index -> start_base_offset;
2314 //unsigned int high_index_border = global_context -> current_value_index -> start_base_offset + global_context -> current_value_index -> length;
2315
2316 print_in_box(80,0,0,"Prepare for long indel deleteion...");
2317 init_chunk_scanning_parameters(global_context,thread_context, & ginp1, & ginp2);
2318 sqr_interval = max(5000,global_context -> processed_reads_in_chunk/10/ global_context -> config.all_threads);
2319
2320 while(1)
2321 {
2322 int is_second_read;
2323
2324 sqr_read_number++;
2325 ret = fetch_next_read_pair(global_context, thread_context, ginp1, ginp2, &read_len_1, &read_len_2, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2, 1, ¤t_read_number);
2326 if(ret) break;
2327
2328 int best_read_id = 0;
2329 for (is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
2330 {
2331 int is_reversed_already = 0;
2332 for(best_read_id = 0; best_read_id < global_context -> config.multi_best_reads; best_read_id++)
2333 {
2334 mapping_result_t current_result_body;
2335 mapping_result_t mate_result_body;
2336
2337 mapping_result_t * current_result = ¤t_result_body, *mate_result = NULL;
2338 bigtable_readonly_result(global_context, NULL, current_read_number, best_read_id, is_second_read, current_result, NULL);
2339
2340 if(global_context -> input_reads.is_paired_end_reads){
2341 mate_result = &mate_result_body;
2342 bigtable_readonly_result(global_context, NULL, current_read_number, best_read_id,!is_second_read, mate_result, NULL);
2343 }
2344
2345 //if(best_read_id && (current_result->selected_votes <1)) break;
2346
2347 char * current_read_name = is_second_read?read_name_2 : read_name_1;
2348 char * current_read = is_second_read?read_text_2 : read_text_1;
2349 char * current_qual = is_second_read?qual_text_2 : qual_text_1;
2350 int current_rlen = is_second_read?read_len_2:read_len_1;
2351 int mate_rlen = is_second_read?read_len_1:read_len_2;
2352
2353 if((current_result->result_flags & CORE_IS_FULLY_EXPLAINED) || (global_context -> input_reads.is_paired_end_reads &&(mate_result -> result_flags & CORE_IS_FULLY_EXPLAINED)))
2354 {
2355 //SUBREADprintf("BUILD BLOCK FOR READ_%d '%s' BEST=%d\n", is_second_read + 1, current_read_name , best_read_id);
2356 if(current_result->result_flags & CORE_IS_FULLY_EXPLAINED)
2357 {
2358 int is_negative_strand = ((current_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0);
2359
2360 if(is_negative_strand + is_reversed_already ==1)
2361 {
2362 reverse_read(current_read, current_rlen, global_context->config.space_type);
2363 reverse_quality(current_qual, current_rlen);
2364 is_reversed_already=!is_reversed_already;
2365 }
2366
2367 build_local_reassembly(global_context , thread_context , current_read_number, current_read_name , current_read , current_qual , current_rlen, 0 , is_second_read, best_read_id, 0, current_result, mate_result);
2368
2369 }
2370 else if(global_context -> input_reads.is_paired_end_reads && (mate_result -> result_flags & CORE_IS_FULLY_EXPLAINED))
2371 {
2372 int is_negative_strand = ((mate_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0);
2373 if(is_negative_strand+is_reversed_already==1)
2374 {
2375 reverse_read(current_read, current_rlen, global_context->config.space_type);
2376 reverse_quality(current_qual, current_rlen);
2377 is_reversed_already=!is_reversed_already;
2378 }
2379
2380 build_local_reassembly(global_context , thread_context , current_read_number , current_read_name , current_read , current_qual , current_rlen, mate_rlen , is_second_read, best_read_id, 1, current_result, mate_result);
2381 }
2382 }
2383 }
2384 }
2385
2386 if(!thread_context || thread_context->thread_id == 0)
2387 {
2388 if(sqr_read_number>sqr_interval)
2389 {
2390 show_progress(global_context, thread_context, current_read_number, STEP_ITERATION_THREE);
2391 sqr_read_number = 0;
2392 }
2393 }
2394 }
2395
2396 return 0;
2397
2398 }
2399
2400
2401 void add_realignment_event_support(global_context_t * global_context , realignment_result_t * res){
2402 int xk1;
2403 indel_context_t * indel_context = (indel_context_t *)global_context -> module_contexts[MODULE_INDEL_ID];
2404
2405 for(xk1 = 0; xk1 < MAX_EVENTS_IN_READ ; xk1++){
2406 chromosome_event_t *sup = res -> supporting_chromosome_events[xk1];
2407 if(!sup)break;
2408
2409 int lock_hash = sup -> global_event_id % EVENT_BODY_LOCK_BUCKETS;
2410 subread_lock_occupy(indel_context -> event_body_locks+lock_hash);
2411 sup -> final_counted_reads ++;
2412 sup -> junction_flanking_left = max(sup -> junction_flanking_left, res -> flanking_size_left[xk1]);
2413 sup -> junction_flanking_right = max(sup -> junction_flanking_right, res -> flanking_size_right[xk1]);
2414 subread_lock_release(indel_context -> event_body_locks+lock_hash);
2415 }
2416 }
2417
2418 unsigned int calc_end_pos(unsigned int p, char * cigar, unsigned int * all_skipped_len, int * is_exonic_regions, global_context_t * global_context);
2419 void test_PE_and_same_chro_align(global_context_t * global_context , realignment_result_t * res1, realignment_result_t * res2, int * is_exonic_regions, int * is_PE_distance, int * is_same_chromosome, int read_len_1, int read_len_2, char * rname, int * res_tlen);
2420 void write_realignments_for_fragment(global_context_t * global_context, thread_context_t * thread_context, subread_output_context_t * out_context, unsigned int read_number, realignment_result_t * res1, realignment_result_t * res2, char * read_name_1, char * read_name_2, char * read_text_1, char * read_text_2, char * qual_text_1, char * qual_text_2 , int rlen1 , int rlen2, int multi_mapping_number, int this_multi_mapping_i, int non_informative_subreads_r1, int non_informative_subreads_r2){
2421
2422 int is_2_OK = 0, is_1_OK = 0;
2423
2424 if(res1){
2425 is_1_OK = convert_read_to_tmp(global_context , out_context, read_number, 0, rlen1, read_text_1, qual_text_1, res1, out_context -> r1, read_name_1);
2426 if(is_1_OK) add_realignment_event_support(global_context, res1);
2427 }
2428 if(res2){
2429 is_2_OK = convert_read_to_tmp(global_context , out_context, read_number, 1, rlen2, read_text_2, qual_text_2, res2, out_context -> r2, read_name_2);
2430 if(is_2_OK) add_realignment_event_support(global_context, res2);
2431 }
2432
2433 subread_output_tmp_t * r1_output = NULL;
2434 subread_output_tmp_t * r2_output = NULL;
2435
2436 if(res1){
2437 r1_output = out_context -> r1;
2438 }
2439
2440 if(res2){
2441 r2_output = out_context -> r2;
2442 }
2443
2444 if(this_multi_mapping_i < 1){
2445 if( is_1_OK == 0 && is_2_OK == 0)
2446 if(thread_context) thread_context -> all_unmapped_reads ++;
2447 else global_context -> all_unmapped_reads ++;
2448 else if( is_1_OK == 0 || is_2_OK == 0){
2449 if(thread_context) thread_context -> not_properly_pairs_only_one_end_mapped ++;
2450 else global_context -> not_properly_pairs_only_one_end_mapped ++;
2451
2452 if((is_1_OK && (res1 -> realign_flags & CORE_IS_BREAKEVEN )) ||
2453 (is_2_OK && (res2 -> realign_flags & CORE_IS_BREAKEVEN))) {
2454 if(thread_context) thread_context -> all_multimapping_reads ++;
2455 else global_context -> all_multimapping_reads ++;
2456 }else{
2457 if(thread_context) thread_context -> all_uniquely_mapped_reads ++;
2458 else global_context -> all_uniquely_mapped_reads ++;
2459 }
2460 }else{
2461 if(res1 -> realign_flags & CORE_IS_BREAKEVEN){
2462 if(thread_context) thread_context -> all_multimapping_reads ++;
2463 else global_context -> all_multimapping_reads ++;
2464 }else{
2465 if(thread_context) thread_context -> all_uniquely_mapped_reads ++;
2466 else global_context -> all_uniquely_mapped_reads ++;
2467 }
2468 }
2469 }
2470
2471
2472 if((!global_context->config.ignore_unmapped_reads) || (is_2_OK || is_1_OK))
2473 write_single_fragment(global_context, thread_context, r1_output, res1, r2_output, res2, multi_mapping_number , this_multi_mapping_i , read_name_1, read_name_2, rlen1, rlen2, read_text_1, read_text_2, qual_text_1, qual_text_2, read_number, non_informative_subreads_r1, non_informative_subreads_r2, is_1_OK, is_2_OK);
2474 }
2475
2476
2477 void clear_repeated_buffer(global_context_t * global_context, unsigned int * repeated_buffer_position, char ** repeated_buffer_cigar, int * repeated_count){
2478 (*repeated_count) = 0;
2479 }
2480 int add_repeated_buffer(global_context_t * global_context, unsigned int * repeated_buffer_position, char ** repeated_buffer_cigar, int * repeated_count, realignment_result_t * res1, realignment_result_t * res2){
2481 int x1, is_repeated = 0;
2482 char * r1_cigar = "*";
2483 unsigned int r1_pos = 0;
2484
2485 if(res1){
2486 r1_cigar = res1 -> cigar_string;
2487 r1_pos = res1 -> first_base_position;
2488 }
2489
2490 char * r2_cigar = "*";
2491 unsigned int r2_pos = 0;
2492
2493 if(res2){
2494 r2_cigar = res2 -> cigar_string;
2495 r2_pos = res2 -> first_base_position;
2496 }
2497
2498 for(x1 = 0; x1 < (*repeated_count); x1 += 2){
2499 //if(strcmp(r1_cigar, "48M2769N53M")==0)
2500 // SUBREADprintf("KYKY %u=%u %s=%s\n" , r1_pos, repeated_buffer_position[x1], r1_cigar , repeated_buffer_cigar[x1]);
2501 if(repeated_buffer_position[x1] == r1_pos && repeated_buffer_position[x1+1] == r2_pos)
2502 if(strcmp(repeated_buffer_cigar[x1], r1_cigar) == 0 && strcmp(repeated_buffer_cigar[x1+1], r2_cigar) == 0)
2503 {
2504 is_repeated = 1;
2505 break;
2506 }
2507 }
2508
2509 if( (!is_repeated) && (*repeated_count) < MAX_ALIGNMENT_PER_ANCHOR * 2 * global_context -> config.reported_multi_best_reads){
2510
2511
2512 //if(strcmp(r1_cigar, "48M2769N53M")==0)
2513 // SUBREADprintf("CPCP %u=%u %s=%s\n" , r1_pos, repeated_buffer_position[x1], r1_cigar , repeated_buffer_cigar[x1]);
2514 repeated_buffer_position[*repeated_count] = r1_pos;
2515 repeated_buffer_position[1 + *repeated_count] = r2_pos;
2516 strcpy(repeated_buffer_cigar[*repeated_count], r1_cigar);
2517 strcpy(repeated_buffer_cigar[1 + *repeated_count], r2_cigar);
2518 (*repeated_count) +=2;
2519 }
2520
2521 return is_repeated;
2522 }
2523 int do_iteration_two(global_context_t * global_context, thread_context_t * thread_context)
2524 {
2525 gene_input_t * ginp1 = NULL , * ginp2 = NULL;
2526 subread_read_number_t current_read_number=0;
2527 char read_text_1[MAX_READ_LENGTH+1], read_text_2[MAX_READ_LENGTH+1];
2528 char qual_text_1[MAX_READ_LENGTH+1], qual_text_2[MAX_READ_LENGTH+1];
2529
2530 char raw_read_text_1[MAX_READ_LENGTH+1], raw_read_text_2[MAX_READ_LENGTH+1];
2531 char raw_qual_text_1[MAX_READ_LENGTH+1], raw_qual_text_2[MAX_READ_LENGTH+1];
2532 char read_name_1[MAX_READ_NAME_LEN+1], read_name_2[MAX_READ_NAME_LEN+1];
2533 char * repeated_buffer_cigars[MAX_ALIGNMENT_PER_ANCHOR * 2 * global_context -> config.reported_multi_best_reads];
2534 unsigned int repeated_buffer_pos[MAX_ALIGNMENT_PER_ANCHOR * 2 * global_context -> config.reported_multi_best_reads];
2535 int repeated_count;
2536 int read_len_1=0, read_len_2=0;
2537 int sqr_interval, sqr_read_number=0;
2538
2539 int * final_MATCH_buffer1, *final_MISMATCH_buffer1, *final_PENALTY_buffer1;
2540 int * final_MATCH_buffer2, *final_MISMATCH_buffer2, *final_PENALTY_buffer2, non_informative_subreads_r1=0, non_informative_subreads_r2=0;
2541 int * final_realignment_index1, *final_realignment_index2;
2542 unsigned int *final_realignment_number;
2543 unsigned long long * final_SCORE_buffer;
2544 unsigned int * final_TLEN_buffer;
2545
2546 realignment_result_t * final_realignments;
2547 subread_output_context_t out_context;
2548
2549 //SUBREADprintf("THREAD_OPT2_start\n");
2550 init_output_context(global_context, &out_context);
2551
2552 for(repeated_count = 0;repeated_count < MAX_ALIGNMENT_PER_ANCHOR * 2 * global_context -> config.reported_multi_best_reads ; repeated_count ++ ){
2553 repeated_buffer_cigars[repeated_count] = malloc(2*CORE_MAX_CIGAR_STR_LEN);
2554 }
2555
2556 final_MATCH_buffer1 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2557 final_MISMATCH_buffer1 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2558 final_PENALTY_buffer1 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2559 final_realignment_index1 = malloc(sizeof(int) * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2560
2561 final_MATCH_buffer2 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2562 final_MISMATCH_buffer2 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2563 final_PENALTY_buffer2 = malloc(sizeof(int) * 2 * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2564 final_realignment_index2 = malloc(sizeof(int) * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR);
2565
2566 final_SCORE_buffer = malloc(sizeof(long long) * global_context -> config.multi_best_reads * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR*MAX_ALIGNMENT_PER_ANCHOR);
2567 final_TLEN_buffer = malloc(sizeof(int) * global_context -> config.multi_best_reads * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR*MAX_ALIGNMENT_PER_ANCHOR);
2568
2569 final_realignments = malloc(sizeof(realignment_result_t) * global_context -> config.multi_best_reads * 2 * MAX_ALIGNMENT_PER_ANCHOR);
2570 final_realignment_number = malloc(sizeof(int) * global_context -> config.multi_best_reads * 2);
2571
2572 mapping_result_t * r1_align_result_buffer, * r2_align_result_buffer;
2573 subjunc_result_t * r1_subjunc_result_buffer, * r2_subjunc_result_buffer;
2574
2575 r1_align_result_buffer = malloc(sizeof(mapping_result_t) * global_context -> config.multi_best_reads);
2576 r2_align_result_buffer = malloc(sizeof(mapping_result_t) * global_context -> config.multi_best_reads);
2577
2578 r1_subjunc_result_buffer = malloc(sizeof(subjunc_result_t) * global_context -> config.multi_best_reads);
2579 r2_subjunc_result_buffer = malloc(sizeof(subjunc_result_t) * global_context -> config.multi_best_reads);
2580
2581 //unsigned int low_index_border = global_context -> current_value_index -> start_base_offset;
2582 //unsigned int high_index_border = global_context -> current_value_index -> start_base_offset + global_context -> current_value_index -> length;
2583
2584 init_chunk_scanning_parameters(global_context,thread_context, & ginp1, & ginp2);
2585 sqr_interval = max(5000,global_context -> processed_reads_in_chunk/10/ global_context -> config.all_threads);
2586
2587
2588 while(1)
2589 {
2590 int is_second_read;
2591 int max_votes;
2592
2593 sqr_read_number++;
2594 fetch_next_read_pair(global_context, thread_context, ginp1, ginp2, &read_len_1, &read_len_2, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2, 0, ¤t_read_number);
2595 //if(current_read_number%500000==0)SUBREADprintf("THREAD_OPT2_fetch %s %s ; RNO=%lld\n", read_name_1, read_text_1, current_read_number);
2596 strcpy(raw_read_text_1, read_text_1);
2597 strcpy(raw_qual_text_1, qual_text_1);
2598 //printf("OCT27-STEPB - %s\n", read_name_1);
2599
2600 if(global_context -> input_reads.is_paired_end_reads){
2601 strcpy(raw_read_text_2, read_text_2);
2602 strcpy(raw_qual_text_2, qual_text_2);
2603 }
2604
2605 if(global_context->config.space_type == GENE_SPACE_COLOR){
2606 if(isalpha(read_text_1[0])){
2607 //SUBREADprintf("FIRST : %s\n\n", read_text_1);
2608 int xk1;
2609 for(xk1=2; read_text_1[xk1]; xk1++){
2610 read_text_1[xk1-2] = read_text_1[xk1];
2611 }
2612 read_text_1[xk1-2] = 0;
2613 }
2614
2615 if(global_context -> input_reads.is_paired_end_reads){
2616 //SUBREADprintf("SECOND: %s\n\n", read_text_2);
2617 if(isalpha(read_text_2[0])){
2618 int xk1;
2619 for(xk1=1; read_text_2[xk1]; xk1++){
2620 read_text_2[xk1-1] = read_text_2[xk1];
2621 }
2622 read_text_2[xk1-1] = 0;
2623 }
2624 }
2625 }
2626
2627 if(raw_qual_text_1[0] && global_context -> config.phred_score_format == FASTQ_PHRED64){
2628 fastq_64_to_33(raw_qual_text_1);
2629 if(global_context->input_reads.is_paired_end_reads)
2630 fastq_64_to_33(raw_qual_text_2);
2631 }
2632
2633 if((global_context -> config.is_BAM_output && global_context -> output_bam_writer -> is_internal_error) || global_context -> input_reads.is_internal_error ||
2634 (global_context -> output_sam_is_full))break;
2635 if(current_read_number < 0) break;
2636
2637 // if no more reads
2638 if( global_context -> input_reads.is_paired_end_reads)
2639 max_votes = max(_global_retrieve_alignment_ptr(global_context, current_read_number, 0, 0)->selected_votes, _global_retrieve_alignment_ptr(global_context, current_read_number, 1, 0)->selected_votes);
2640 else max_votes = _global_retrieve_alignment_ptr(global_context, current_read_number, 0, 0)->selected_votes;
2641
2642 //if( current_read_number < 10 ) SUBREADprintf("BSSS2 : %s : %d\n", read_name_1, max_votes);
2643
2644 int best_read_id=0;
2645
2646 //memset(final_MATCH_buffer, 0, sizeof(int) * 2 * global_context -> config.multi_best_reads);
2647 //memset(final_MISMATCH_buffer, 0, sizeof(int) * 2 * global_context -> config.multi_best_reads);
2648 int r1_candidate_locations = 0, r2_candidate_locations = 0;
2649 int r1_step2_locations = 0, r2_step2_locations = 0;
2650
2651 clear_repeated_buffer(global_context, repeated_buffer_pos, repeated_buffer_cigars, &repeated_count);
2652 for (is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
2653 {
2654 char * current_read_name = is_second_read?read_name_2 : read_name_1;
2655 int is_reversed_already = 0, realignment_i;
2656 int * current_candidate_locations = is_second_read?&r2_candidate_locations:&r1_candidate_locations;
2657
2658 int * current_MATCH_buffer = is_second_read?final_MATCH_buffer2:final_MATCH_buffer1;
2659 int * current_MISMATCH_buffer = is_second_read?final_MISMATCH_buffer2:final_MISMATCH_buffer1;
2660 int * current_PENALTY_buffer = is_second_read?final_PENALTY_buffer2:final_PENALTY_buffer1;
2661 int * current_realignment_index = is_second_read?final_realignment_index2:final_realignment_index1;
2662
2663
2664 for(best_read_id = 0; best_read_id < global_context -> config.multi_best_reads; best_read_id++)
2665 {
2666 mapping_result_t *current_result = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read, best_read_id);
2667
2668 if(best_read_id == 0){
2669 if(is_second_read)
2670 non_informative_subreads_r2 = current_result -> noninformative_subreads_in_vote;
2671 else
2672 non_informative_subreads_r1 = current_result -> noninformative_subreads_in_vote;
2673 }
2674
2675 char * current_read = is_second_read?read_text_2 : read_text_1;
2676 char * current_qual = is_second_read?qual_text_2 : qual_text_1;
2677 int current_rlen = is_second_read?read_len_2:read_len_1;
2678
2679 if(current_result->selected_votes < global_context->config.minimum_subread_for_second_read || max_votes < global_context->config.minimum_subread_for_first_read)
2680 {
2681 // if(current_read_number == 111 || current_read_number == 112)
2682 //SUBREADprintf("RESET0 [%lld] R_%d SEL=%d MAX=%d\n", current_read_number, is_second_read + 1, current_result->selected_votes, max_votes);
2683 current_result -> selected_votes = 0;
2684 continue;
2685 }
2686 if(locate_current_value_index(global_context, thread_context, current_result, current_rlen))
2687 {
2688 current_result -> selected_votes = 0;
2689 //SUBREADprintf("RESET1 Read position excesses index boundary.\n");
2690 continue;
2691 }
2692
2693 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
2694 //printf("OCT27-STEP21-%s:%d-ALN%02d-THRE %d in MM=%d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id, global_context -> config.multi_best_reads);
2695
2696 int is_negative_strand = (current_result -> result_flags & CORE_IS_NEGATIVE_STRAND)?1:0;
2697
2698 if(is_negative_strand + is_reversed_already == 1)
2699 {
2700 reverse_read(current_read, current_rlen, global_context->config.space_type);
2701 reverse_quality(current_qual, current_rlen);
2702 is_reversed_already = !is_reversed_already;
2703 }
2704
2705 current_result -> result_flags &= ~CORE_IS_FULLY_EXPLAINED;
2706
2707
2708 if(is_second_read) r2_step2_locations = best_read_id + 1;
2709 else r1_step2_locations = best_read_id + 1;
2710
2711 unsigned int final_alignments = explain_read(global_context, thread_context , final_realignments + (is_second_read + 2 * best_read_id) * MAX_ALIGNMENT_PER_ANCHOR,
2712 current_read_number, current_rlen, current_read_name, current_read, current_qual, is_second_read, best_read_id, is_negative_strand);
2713
2714 final_realignment_number[ best_read_id * 2 + is_second_read ] = final_alignments;
2715
2716 for(realignment_i = 0 ; realignment_i < final_alignments ; realignment_i ++){
2717 if((* current_candidate_locations) >= global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR) break;
2718
2719 int realign_index = (is_second_read + 2 * best_read_id)* MAX_ALIGNMENT_PER_ANCHOR + realignment_i;
2720 int final_MATCH = final_realignments[realign_index].final_matched_bases;
2721 int final_PENALTY = final_realignments[realign_index].final_penalty;
2722
2723 if((current_result -> result_flags & CORE_IS_FULLY_EXPLAINED) && final_MATCH>0) {
2724 int final_MISMATCH = final_realignments[realign_index].final_mismatched_bases;
2725 //#warning ">>>>>>>>>>>>>>>>>>>>>>>>>>>>> REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
2726 //printf("OCT27-STEPMSM-MMM %s M=%d MM=%d\n", read_name_1, final_MATCH ,final_MISMATCH);
2727
2728 current_PENALTY_buffer[*current_candidate_locations] = final_PENALTY;
2729 current_MATCH_buffer[*current_candidate_locations] = final_MATCH;
2730 current_MISMATCH_buffer[*current_candidate_locations] = final_MISMATCH;
2731 current_realignment_index[*current_candidate_locations] = realign_index;
2732 (*current_candidate_locations) ++;
2733 }
2734 }
2735 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
2736 //printf("OCT27-FI-%s:%d-ALN%02d-THRE %d\n", current_read_name, is_second_read + 1, best_read_id, thread_context -> thread_id);
2737 }
2738 }
2739
2740 //if(161430 <= current_read_number) SUBREADprintf("LOC1=%d, LOC2=%d\n", r1_candidate_locations, r2_candidate_locations);
2741
2742 int need_expect_TLEN = r2_candidate_locations && r1_candidate_locations && global_context -> config.reported_multi_best_reads<2 && global_context -> expected_TLEN_read_numbers < READPAIRS_FOR_CALC_EXPT_TLEN;
2743 int output_cursor = 0;
2744 if(r2_candidate_locations == 0 || r1_candidate_locations == 0) {
2745 int is_second_read, highest_score_occurence =0;
2746 for(is_second_read = 0; is_second_read < 1+ global_context -> input_reads.is_paired_end_reads; is_second_read++)
2747 {
2748 int current_candidate_locations = is_second_read?r2_candidate_locations:r1_candidate_locations;
2749 if(current_candidate_locations > 0){
2750
2751 int * current_MATCH_buffer = is_second_read?final_MATCH_buffer2:final_MATCH_buffer1;
2752 int * current_PENALTY_buffer = is_second_read?final_PENALTY_buffer2:final_PENALTY_buffer1;
2753 int * current_MISMATCH_buffer = is_second_read?final_MISMATCH_buffer2:final_MISMATCH_buffer1;
2754 int * current_realignment_index = is_second_read?final_realignment_index2:final_realignment_index1;
2755
2756 int read_record_i;
2757 unsigned long long int best_score_highest = 0;
2758 unsigned long long int scores_array [global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR];
2759
2760 for(read_record_i = 0; read_record_i < current_candidate_locations; read_record_i++){
2761 realignment_result_t * current_realignment_result = final_realignments + current_realignment_index[read_record_i];
2762
2763 unsigned int this_MATCH = current_MATCH_buffer[read_record_i];
2764 unsigned int this_MISMATCH = current_MISMATCH_buffer[read_record_i];
2765 unsigned int this_PENALTY = current_PENALTY_buffer[read_record_i];
2766 unsigned long long this_SCORE;
2767
2768 if(global_context -> config.experiment_type == CORE_EXPERIMENT_DNASEQ){
2769 this_SCORE = this_MATCH * 100000llu + (10000 - this_MISMATCH);
2770 }else{
2771 unsigned int skip = 0; int is_exonic_regions = 1;
2772 if(global_context -> exonic_region_bitmap)calc_end_pos(current_realignment_result -> first_base_position, current_realignment_result -> cigar_string, &skip, &is_exonic_regions, global_context );
2773
2774 if(global_context -> config.scRNA_input_mode)
2775 this_SCORE =((100000llu * (10000 - this_MISMATCH + 2*is_exonic_regions) + this_MATCH)*50llu - this_PENALTY)*20llu+ current_realignment_result -> known_junction_supp;
2776 else this_SCORE =((100000llu * (10000 - this_MISMATCH) + this_MATCH)*50llu - this_PENALTY)*20llu+ current_realignment_result -> known_junction_supp;
2777 }
2778
2779 best_score_highest = max(best_score_highest, this_SCORE);
2780 scores_array[read_record_i] = this_SCORE;
2781 }
2782
2783 for(read_record_i = 0; read_record_i < current_candidate_locations ; read_record_i++){
2784 realignment_result_t * current_realignment_result = final_realignments + current_realignment_index[read_record_i];
2785 if( scores_array[read_record_i] >= best_score_highest && (current_realignment_result -> realign_flags & CORE_TOO_MANY_MISMATCHES)==0) {
2786 int is_repeated = 0;
2787
2788 is_repeated = add_repeated_buffer(global_context, repeated_buffer_pos, repeated_buffer_cigars, &repeated_count, is_second_read?NULL:current_realignment_result , is_second_read?current_realignment_result:NULL);
2789 if(is_repeated)
2790 scores_array[read_record_i] = 0;
2791 else
2792 highest_score_occurence ++;
2793 }
2794 }
2795
2796 if(highest_score_occurence<2 || global_context -> config.report_multi_mapping_reads){
2797 int is_break_even = 0;
2798 if(highest_score_occurence>1) is_break_even = 1;
2799
2800 highest_score_occurence = min(highest_score_occurence, global_context -> config.reported_multi_best_reads);
2801 for(read_record_i = 0; read_record_i < current_candidate_locations ; read_record_i++){
2802 realignment_result_t * current_realignment_result = final_realignments + current_realignment_index[read_record_i];
2803
2804 if( scores_array[read_record_i] >= best_score_highest && (current_realignment_result -> realign_flags & CORE_TOO_MANY_MISMATCHES)==0
2805 && output_cursor < global_context -> config.reported_multi_best_reads){
2806 strcpy(read_text_1, raw_read_text_1);
2807 strcpy(read_text_2, raw_read_text_2);
2808 strcpy(qual_text_1, raw_qual_text_1);
2809 strcpy(qual_text_2, raw_qual_text_2);
2810
2811 if(is_break_even) current_realignment_result -> realign_flags |= CORE_IS_BREAKEVEN;
2812 current_realignment_result -> MAPQ_adjustment = current_MISMATCH_buffer [read_record_i] + ( is_second_read?(r2_step2_locations): (r1_step2_locations));
2813
2814 // SUBREADprintf("THREAD_OPT1_write %s\n", read_name_1);
2815 if(is_second_read)
2816 write_realignments_for_fragment(global_context, thread_context, &out_context, current_read_number, NULL, current_realignment_result, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2, read_len_1, read_len_2, highest_score_occurence, output_cursor, non_informative_subreads_r1, non_informative_subreads_r2);
2817 else write_realignments_for_fragment(global_context, thread_context, &out_context , current_read_number, current_realignment_result, NULL, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2, read_len_1, read_len_2, highest_score_occurence, output_cursor, non_informative_subreads_r1, non_informative_subreads_r2);
2818 output_cursor ++;
2819 }
2820 }
2821
2822 assert(output_cursor == highest_score_occurence);
2823 }
2824 }
2825 }
2826 } else {
2827 int r1_best_id, r2_best_id, highest_score_occurence = 0;
2828 int expected_TLEN;
2829 if(global_context -> expected_TLEN_read_numbers >= READPAIRS_FOR_CALC_EXPT_TLEN)
2830 expected_TLEN = global_context -> expected_TLEN_sum / global_context -> expected_TLEN_read_numbers;
2831 else expected_TLEN = (global_context -> config.minimum_pair_distance + global_context -> config.maximum_pair_distance)/2;
2832
2833 unsigned long long highest_score = 0;
2834 if(need_expect_TLEN) memset(final_TLEN_buffer, 0 , sizeof(int) * global_context -> config.multi_best_reads * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR*MAX_ALIGNMENT_PER_ANCHOR );
2835 memset(final_SCORE_buffer, 0 , sizeof(long long) * global_context -> config.multi_best_reads * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR*MAX_ALIGNMENT_PER_ANCHOR );
2836 for(r1_best_id = 0; r1_best_id < r1_candidate_locations; r1_best_id ++) {
2837 int r1_matched = final_MATCH_buffer1[r1_best_id];
2838 if(r1_matched < 1) continue;
2839 realignment_result_t * realignment_result_R1 = final_realignments + final_realignment_index1[r1_best_id];
2840
2841 for(r2_best_id = 0; r2_best_id < r2_candidate_locations; r2_best_id ++) {
2842 int r2_matched = final_MATCH_buffer2[r2_best_id];
2843 if(r2_matched < 1) continue;
2844
2845 realignment_result_t * realignment_result_R2 = final_realignments + final_realignment_index2[r2_best_id];
2846 int is_PE = 0, tlen=0;
2847 int is_same_chro = 0, is_exonic_regions = 0;
2848 unsigned long long final_SCORE = 0;
2849
2850 test_PE_and_same_chro_align(global_context , realignment_result_R1 , realignment_result_R2, &is_exonic_regions, &is_PE, &is_same_chro , read_len_1, read_len_2, read_name_1, &tlen);
2851
2852 unsigned long long TLEN_exp_score = 0;
2853 if(is_PE && global_context -> config.no_TLEN_preference == 0 && global_context -> config.reported_multi_best_reads<2){
2854 TLEN_exp_score = (tlen > expected_TLEN)?(tlen - expected_TLEN):(expected_TLEN - tlen);
2855 if(TLEN_exp_score>999) TLEN_exp_score=0;
2856 else TLEN_exp_score = 999-TLEN_exp_score;
2857 }
2858 if(global_context -> config.experiment_type == CORE_EXPERIMENT_DNASEQ){
2859 int weight;
2860
2861 //#warning " ============ USE THE FIRST THREE WEIGHTS! ======== "
2862
2863 if(is_PE)
2864 weight = 120;
2865 //weight = 300;
2866 else if(is_same_chro)
2867 weight = 100;
2868 else weight = 80;
2869 //weight = 30;
2870 final_SCORE = weight * (final_MATCH_buffer1[r1_best_id] + final_MATCH_buffer2[r2_best_id]);
2871 //#warning "=========== ADD BY YANG LIAO FOR MORE MAPPED READS WITH '-u' OPTION ================"
2872 final_SCORE = final_SCORE * 1000llu - final_MISMATCH_buffer1[r1_best_id] - final_MISMATCH_buffer2[r2_best_id];
2873 final_SCORE = final_SCORE * 20llu - final_PENALTY_buffer1[r1_best_id] - final_PENALTY_buffer2[r2_best_id];
2874 final_SCORE = final_SCORE * 1000llu + TLEN_exp_score;
2875 if(is_PE && need_expect_TLEN) final_TLEN_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id] = tlen;
2876 //SUBREADprintf("%s FINAL_SCORE=%llu TLEN_SCORE=%llu DIST=%d EXP_TLEN=%d\n", read_name_1, final_SCORE, TLEN_exp_score, tlen, expected_TLEN);
2877 } else if (global_context -> config.experiment_type == CORE_EXPERIMENT_RNASEQ) {
2878 int weight;
2879
2880
2881 if(1){
2882
2883 if(is_exonic_regions && is_PE)
2884 weight = 5000;
2885 else if(is_PE || is_exonic_regions)
2886 weight = 3000;
2887 else if(is_same_chro)
2888 weight = 1000;
2889 else weight = 300;
2890 }else{
2891 if(is_PE)
2892 weight = 3000;
2893 else if(is_same_chro)
2894 weight = 1000;
2895 else weight = 300;
2896
2897 }
2898
2899 //#warning "=========== ADD BY YANG LIAO ' + 2' ===================="
2900 final_SCORE = weight / (final_MISMATCH_buffer1[r1_best_id] + final_MISMATCH_buffer2[r2_best_id] + 1 + 2);
2901
2902 //#warning "=========== ADD BY YANG LIAO FOR MORE MAPPED READS WITH '-u' OPTION ================"
2903 final_SCORE = final_SCORE * 3000llu + (final_MATCH_buffer1[r1_best_id] + final_MATCH_buffer2[r2_best_id]);
2904 final_SCORE = final_SCORE * 20 + realignment_result_R2 -> known_junction_supp + realignment_result_R1 -> known_junction_supp;
2905 final_SCORE = final_SCORE * 20 - final_PENALTY_buffer1[r1_best_id] - final_PENALTY_buffer2[r2_best_id];
2906 final_SCORE = final_SCORE * 1000 + TLEN_exp_score;
2907
2908 //#warning ">>>>>>>>>>>>>>>>>>>>>>>>>>>>> REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
2909 //printf("OCT27-STEPMSM-FNSC %s M=%d,%d MM=%d,%d CHROSAME=%d FSCR=%llu\n", read_name_1, final_MATCH_buffer1[r1_best_id], final_MATCH_buffer2[r2_best_id], final_MISMATCH_buffer1[r1_best_id] , final_MISMATCH_buffer2[r2_best_id], is_same_chro, final_SCORE );
2910 if(is_PE && need_expect_TLEN) final_TLEN_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id] = tlen;
2911 } else assert(0);
2912
2913 assert(final_SCORE > 0);
2914 final_SCORE_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id] = final_SCORE;
2915
2916 if(final_SCORE > highest_score) {
2917 //#warning ">>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<"
2918 //printf("OCT27-STEPMSM-REPL %s : SCORE %llu -> %llu, pos=%u,%u\n", read_name_1, final_SCORE, highest_score , realignment_result_R1->mapping_result->selected_position, realignment_result_R2->mapping_result->selected_position );
2919 highest_score_occurence = 1;
2920 highest_score = final_SCORE;
2921 //SUBREADprintf("29NOV2016-FIRSTBEST %s [%d : score %llu] LOC = %d,%d\n", read_name_1, highest_score_occurence, final_SCORE, r1_best_id, r2_best_id);
2922 clear_repeated_buffer(global_context, repeated_buffer_pos, repeated_buffer_cigars, &repeated_count);
2923 add_repeated_buffer(global_context, repeated_buffer_pos, repeated_buffer_cigars, &repeated_count, realignment_result_R1, realignment_result_R2);
2924 } else if(final_SCORE == highest_score) {
2925 int is_repeat = 0;
2926
2927 if(global_context -> config.reported_multi_best_reads)
2928 is_repeat = add_repeated_buffer(global_context, repeated_buffer_pos, repeated_buffer_cigars, &repeated_count, realignment_result_R1, realignment_result_R2);
2929
2930 if(is_repeat)
2931 final_SCORE_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id] = 0;
2932 else{
2933 highest_score_occurence ++;
2934 // SUBREADprintf("29NOV2016-BEST %s [%d : score %llu] LOC = %d,%d\n", read_name_1, highest_score_occurence, final_SCORE, r1_best_id, r2_best_id);
2935 }
2936 }
2937 }
2938 }
2939
2940 //SUBREADprintf("Highest score = %llu, Occurance = %d\n", highest_score , highest_score_occurence);
2941 // Then, copy the (R1, R2) that have the highest score into the align_res buffer.
2942
2943 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
2944 //printf("OCT27-WRITE-BOTHMAOOED-%s-THRE %d ; occu=%d\n", read_name_1, thread_context -> thread_id, highest_score_occurence);
2945
2946 if(highest_score_occurence <= 1 || global_context -> config.report_multi_mapping_reads){
2947
2948 int is_break_even = 0;
2949 if(highest_score_occurence>1)is_break_even = 1;
2950
2951 highest_score_occurence = min(highest_score_occurence, global_context -> config.reported_multi_best_reads);
2952 for(r1_best_id = 0; r1_best_id < r1_candidate_locations; r1_best_id ++)
2953 {
2954 int r1_matched = final_MATCH_buffer1[r1_best_id];
2955 if(r1_matched < 1) continue;
2956
2957 for(r2_best_id = 0; r2_best_id < r2_candidate_locations; r2_best_id ++)
2958 {
2959 int r2_matched = final_MATCH_buffer2[r2_best_id];
2960 if(r2_matched < 1) continue;
2961
2962 if(final_SCORE_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id] == highest_score &&
2963 output_cursor < global_context -> config.reported_multi_best_reads){
2964 if(need_expect_TLEN){
2965 int this_tlen = final_TLEN_buffer[r1_best_id * global_context -> config.multi_best_reads * MAX_ALIGNMENT_PER_ANCHOR + r2_best_id];
2966 if(this_tlen>0){
2967 subread_lock_occupy(&global_context -> output_lock);
2968 global_context -> expected_TLEN_read_numbers++;
2969 global_context -> expected_TLEN_sum += this_tlen;
2970 if(global_context -> expected_TLEN_read_numbers == READPAIRS_FOR_CALC_EXPT_TLEN)
2971 print_in_box(80,0,0," Estimated fragment length : %d bp\n", (int)(global_context -> expected_TLEN_sum / global_context -> expected_TLEN_read_numbers ));
2972 subread_lock_release(&global_context -> output_lock);
2973 }
2974 }
2975 realignment_result_t * r1_realign = final_realignments + final_realignment_index1[r1_best_id];
2976 realignment_result_t * r2_realign = final_realignments + final_realignment_index2[r2_best_id];
2977
2978 strcpy(read_text_1, raw_read_text_1);
2979 strcpy(read_text_2, raw_read_text_2);
2980 strcpy(qual_text_1, raw_qual_text_1);
2981 strcpy(qual_text_2, raw_qual_text_2);
2982
2983 if(is_break_even){
2984 r1_realign -> realign_flags |= CORE_IS_BREAKEVEN;
2985 r2_realign -> realign_flags |= CORE_IS_BREAKEVEN;
2986 }
2987
2988 r1_realign -> MAPQ_adjustment = r1_step2_locations + final_MISMATCH_buffer1[r1_best_id];
2989 r2_realign -> MAPQ_adjustment = r2_step2_locations + final_MISMATCH_buffer2[r2_best_id];
2990
2991 // SUBREADprintf("THREAD_OPT2_write %s\n", read_name_1);
2992 write_realignments_for_fragment(global_context, thread_context, &out_context, current_read_number, r1_realign, r2_realign, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2, read_len_1, read_len_2, highest_score_occurence, output_cursor, non_informative_subreads_r1, non_informative_subreads_r2);
2993 output_cursor ++;
2994 }
2995 }
2996 }
2997 assert(output_cursor == highest_score_occurence );
2998 }
2999 }
3000
3001 //if( current_read_number < 10 ) SUBREADprintf("BEEE2 : %s : %d\n", read_name_1, max_votes);
3002 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
3003 //printf("OCT27-WRITE-UNMAP?-%s-THRE %d\n", read_name_1, thread_context -> thread_id);
3004 if(output_cursor<1) {
3005 strcpy(read_text_1, raw_read_text_1);
3006 strcpy(read_text_2, raw_read_text_2);
3007 strcpy(qual_text_1, raw_qual_text_1);
3008 strcpy(qual_text_2, raw_qual_text_2);
3009 // SUBREADprintf("THREAD_OPT3_write %s\n", read_name_1);
3010 write_realignments_for_fragment(global_context, thread_context, &out_context, current_read_number, NULL, NULL, read_name_1, read_name_2, read_text_1, read_text_2, raw_qual_text_1, raw_qual_text_2, read_len_1, read_len_2, 0, 0, non_informative_subreads_r1, non_informative_subreads_r2);
3011 }
3012
3013 if(!thread_context || thread_context->thread_id == 0)
3014 {
3015 if(sqr_read_number>sqr_interval)
3016 {
3017 show_progress(global_context, thread_context, current_read_number, STEP_ITERATION_TWO);
3018 sqr_read_number=0;
3019 }
3020 }
3021 //#warning ">>>>>>> COMMENT THIS <<<<<<<"
3022 //printf("OCT27-FIN-%s-THRE %d\n", read_name_1, thread_context -> thread_id);
3023 //bigtable_release_result(global_context, thread_context, current_read_number, 1);
3024 }
3025
3026 free(final_realignments);
3027 free(final_realignment_number);
3028 free(final_MATCH_buffer1);
3029 free(final_MISMATCH_buffer1);
3030 free(final_PENALTY_buffer1);
3031 free(final_realignment_index1);
3032
3033 free(final_MATCH_buffer2);
3034 free(final_MISMATCH_buffer2);
3035 free(final_PENALTY_buffer2);
3036 free(final_realignment_index2);
3037
3038 free(final_SCORE_buffer);
3039 free(final_TLEN_buffer);
3040
3041 free(r1_align_result_buffer);
3042 free(r1_subjunc_result_buffer);
3043 free(r2_align_result_buffer);
3044 free(r2_subjunc_result_buffer);
3045
3046 for(repeated_count = 0;repeated_count < MAX_ALIGNMENT_PER_ANCHOR * 2 * global_context -> config.reported_multi_best_reads ; repeated_count ++ ){
3047 free(repeated_buffer_cigars[repeated_count]);
3048 }
3049
3050 destroy_output_context(global_context, &out_context);
3051 //SUBREADprintf("OPT2-FINISHED\n");
3052 if(global_context ->config.all_threads >1 && global_context ->config.is_BAM_output && !global_context ->config.is_input_read_order_required)SamBam_writer_finalise_thread(global_context -> output_bam_writer, thread_context -> thread_id);
3053 if(global_context ->config.all_threads <=1 && global_context ->config.is_BAM_output && global_context ->config.sort_reads_by_coordinates)SamBam_writer_finalise_one_thread(global_context -> output_bam_writer);
3054 return 0;
3055 }
3056
3057 int core_get_subread_quality(global_context_t * global_context, thread_context_t * thread_context, char * qual, int qual_len)
3058 {
3059 int x1, ret=1;
3060
3061 if(!qual)return 1;
3062 if(!qual[0])return 1;
3063
3064 int offset = (global_context->config.phred_score_format == FASTQ_PHRED33)?33:64;
3065
3066 for(x1=0; (x1 < qual_len) && qual[x1]; x1++)
3067 ret += (qual[x1] - offset);
3068
3069 return ret;
3070 }
3071
3072 int has_better_mapping(global_context_t * global_context, thread_context_t * thread_context, subread_read_number_t current_read_number, int is_second_read, int this_aln_id){
3073 int better_read_id;
3074
3075 mapping_result_t * this_r = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read, this_aln_id);
3076 for(better_read_id = 0; better_read_id < this_aln_id; better_read_id++){
3077 mapping_result_t * better_r = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read, better_read_id);
3078 if( this_r -> selected_position >= better_r -> selected_position - global_context -> config.max_indel_length - 1
3079 && this_r -> selected_position <= better_r -> selected_position + global_context -> config.max_indel_length +1 )
3080 if( this_r -> confident_coverage_start >= better_r -> confident_coverage_start
3081 && this_r -> confident_coverage_end <= better_r -> confident_coverage_end) return 1;
3082 }
3083 return 0;
3084 }
3085
3086 int do_voting(global_context_t * global_context, thread_context_t * thread_context)
3087 {
3088 int xk1;
3089 gene_input_t * ginp1 = NULL , * ginp2 = NULL;
3090 subread_read_number_t current_read_number=0, last_shown_curr = 0;
3091 char * read_text_1, * read_text_2;
3092 char * qual_text_1, * qual_text_2;
3093 char read_name_1[MAX_READ_NAME_LEN+1], read_name_2[MAX_READ_NAME_LEN+1];
3094 int read_len_1, read_len_2=0;
3095 int min_first_read_votes = global_context -> config.minimum_subread_for_first_read;
3096 int voting_max_indel_length = min(16, global_context->config.max_indel_length);
3097 int sqr_interval=10000, sqr_read_number = 0;
3098
3099 read_text_1 = malloc(MAX_READ_LENGTH+1);
3100 read_text_2 = malloc(MAX_READ_LENGTH+1);
3101 qual_text_1 = malloc(MAX_READ_LENGTH+1);
3102 qual_text_2 = malloc(MAX_READ_LENGTH+1);
3103
3104 gene_vote_t * vote_1, * vote_2, * vote_fg;
3105
3106 vote_1 = (gene_vote_t *) malloc(sizeof(gene_vote_t));
3107 vote_2 = (gene_vote_t *) malloc(sizeof(gene_vote_t));
3108 vote_fg = (gene_vote_t *) malloc(sizeof(gene_vote_t));
3109
3110 if(!(vote_1&&vote_2))
3111 {
3112 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_FATAL,"Cannot allocate voting memory.");
3113 return 1;
3114 }
3115
3116 init_chunk_scanning_parameters(global_context,thread_context, & ginp1, & ginp2);
3117
3118 unsigned int low_index_border = global_context -> current_value_index -> start_base_offset;
3119 unsigned int high_index_border = global_context -> current_value_index -> start_base_offset + global_context -> current_value_index -> length;
3120 int has_second_read = 1 + global_context -> input_reads.is_paired_end_reads;
3121
3122 if(thread_context)
3123 thread_context -> current_value_index = global_context -> current_value_index;
3124
3125 int GENE_SLIDING_STEP = global_context -> current_index -> index_gap;
3126 int need_junction_step = global_context -> config.do_breakpoint_detection || global_context -> config.do_fusion_detection ||global_context -> config.do_long_del_detection;
3127
3128 while(1)
3129 {
3130 int is_second_read;
3131 int subread_no;
3132 int is_reversed, applied_subreads = 0, v1_all_subreads=0, v2_all_subreads=0;
3133
3134 fetch_next_read_pair(global_context, thread_context, ginp1, ginp2, &read_len_1, &read_len_2, read_name_1, read_name_2, read_text_1, read_text_2, qual_text_1, qual_text_2,1, ¤t_read_number);
3135 //SUBREADprintf("VOTING READ # %lld: %s %s %d\n", current_read_number, read_name_1, read_text_1, read_len_1);
3136
3137 if(current_read_number < 0){
3138 // #warning ">>>>>>>>>>>>> COMMENT THIS <<<<<<<<<<<<<<<<<<<"
3139 // printf("OCT27-STEPB-QUIT :T%d\n", thread_context -> thread_id);
3140 break;
3141 }
3142
3143 for(is_reversed = 0; is_reversed<2; is_reversed++)
3144 {
3145 //printf("MAP_REA = %s / %s\n", read_text_1, read_text_2);
3146
3147 for (is_second_read = 0; is_second_read < has_second_read; is_second_read ++)
3148 {
3149 gene_vote_t * current_vote = is_second_read?vote_2: vote_1;
3150 char * current_read = is_second_read?read_text_2 : read_text_1;
3151 int current_rlen = is_second_read?read_len_2:read_len_1;
3152 int subread_step;
3153 if(current_rlen< 16) continue;
3154 int CR15GLS = (current_rlen - 15 - GENE_SLIDING_STEP)<<16;
3155 if(current_rlen<= EXON_LONG_READ_LENGTH)
3156 {
3157 subread_step = CR15GLS /(global_context -> config.total_subreads -1);
3158 if(subread_step<(GENE_SLIDING_STEP<<16))subread_step = GENE_SLIDING_STEP<<16;
3159 }else{
3160 subread_step = 6<<16;
3161 if( CR15GLS / subread_step > 62)
3162 subread_step = CR15GLS/62;
3163 }
3164
3165 int noninformative_subreads_for_each_gap[GENE_SLIDING_STEP];
3166
3167 applied_subreads = 1 + CR15GLS / subread_step;
3168 if(is_second_read) v2_all_subreads = applied_subreads;
3169 else v1_all_subreads = applied_subreads;
3170
3171 //SUBREADprintf("NSUBR=%d\tAPPLIED_SUBR=%d\tSTEP=%d\n", global_context -> config.total_subreads, applied_subreads, subread_step);
3172
3173 unsigned int current_high_border = high_index_border - current_rlen;
3174
3175 if(global_context->config.do_breakpoint_detection && current_rlen > EXON_LONG_READ_LENGTH)//&& global_context->config.all_threads<2)
3176 {
3177 char * current_qual = is_second_read?qual_text_2 : qual_text_1;
3178 core_fragile_junction_voting(global_context, thread_context, read_name_1, current_read, current_qual, current_rlen, is_reversed, global_context->config.space_type, low_index_border, current_high_border, vote_fg);
3179
3180 }
3181 if(global_context->config.SAM_extra_columns)
3182 {
3183 for(xk1=0;xk1<GENE_SLIDING_STEP;xk1++)
3184 noninformative_subreads_for_each_gap[xk1]=0;
3185 }
3186
3187 int allow_indel_i;
3188 unsigned int shift_indel_locs[ GENE_VOTE_TABLE_SIZE * GENE_VOTE_SPACE ];
3189 unsigned int shift_indel_NO = 0;
3190 for(allow_indel_i = 0; allow_indel_i<2; allow_indel_i++){
3191 if(is_reversed==0 || !(global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)){
3192 if(is_second_read == 0)init_gene_vote(vote_1);
3193 if(is_second_read == 1)init_gene_vote(vote_2);
3194 }
3195
3196 for(subread_no=0; subread_no < applied_subreads ; subread_no++)
3197 {
3198 for(xk1=0; xk1<GENE_SLIDING_STEP ; xk1++)
3199 {
3200
3201 if(global_context->config.SAM_extra_columns)
3202 {
3203 current_vote -> noninformative_subreads = noninformative_subreads_for_each_gap[xk1];
3204 }
3205
3206 int subread_offset = ((subread_step * subread_no) >> 16);
3207 if(GENE_SLIDING_STEP > 1)
3208 subread_offset -= subread_offset%(GENE_SLIDING_STEP) - xk1;
3209
3210 char * subread_string = current_read + subread_offset;
3211
3212 gehash_key_t subread_integer = genekey2int(subread_string, global_context->config.space_type);
3213 if(global_context->config.is_methylation_reads)
3214 gehash_go_q_CtoT(global_context->current_index, subread_integer , subread_offset, current_rlen, is_reversed, current_vote, 1, 0xffffff, voting_max_indel_length, subread_no, 1, low_index_border, high_index_border - current_rlen);
3215 else
3216 gehash_go_X(global_context->current_index, subread_integer , subread_offset, current_rlen, is_reversed, current_vote, voting_max_indel_length, subread_no, low_index_border, current_high_border, allow_indel_i, shift_indel_locs, &shift_indel_NO);
3217 if(global_context->config.SAM_extra_columns)
3218 noninformative_subreads_for_each_gap[xk1] = current_vote -> noninformative_subreads;
3219 }
3220
3221 }
3222 if(shift_indel_NO == 0 || global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection)break;
3223 }
3224
3225 if(global_context->config.SAM_extra_columns)
3226 {
3227 short max_noninformative_subreads = -1;
3228
3229 for(xk1=0;xk1<GENE_SLIDING_STEP;xk1++)
3230 if(noninformative_subreads_for_each_gap[xk1] > max_noninformative_subreads)
3231 max_noninformative_subreads = noninformative_subreads_for_each_gap[xk1];
3232
3233 current_vote -> noninformative_subreads = max_noninformative_subreads;
3234 }
3235 }
3236
3237
3238
3239 if(is_reversed==1 || !(global_context-> config.do_fusion_detection || global_context-> config.do_long_del_detection))
3240 {
3241 //#warning "====== CHECK PRINTING !!! =========="
3242 if(0&¤t_read_number == 4000){
3243 SUBREADprintf(">>>%llu<<<\n%s [%d] %s\n%s [%d] %s VOTE1_MAX=%d >= %d\n", current_read_number, read_name_1, read_len_1, read_text_1, read_name_2, read_len_2, read_text_2, vote_1->max_vote, min_first_read_votes);
3244 SUBREADprintf(" ======= PAIR %s = %llu ; NON_INFORMATIVE = %d, %d =======\n", read_name_1, current_read_number, vote_1 -> noninformative_subreads, vote_2 -> noninformative_subreads);
3245 print_votes(vote_1, global_context -> config.index_prefix);
3246 print_votes(vote_2, global_context -> config.index_prefix);
3247 }
3248
3249 if(global_context -> input_reads.is_paired_end_reads)
3250 process_voting_junction(global_context, thread_context, current_read_number, vote_1, vote_2, read_name_1, read_name_2, read_text_1, read_text_2, read_len_1, read_len_2, is_reversed, v1_all_subreads, v2_all_subreads);
3251 else{
3252 if(vote_1->max_vote >= min_first_read_votes)
3253 process_voting_junction(global_context, thread_context, current_read_number, vote_1, vote_2, read_name_1, NULL , read_text_1, NULL, read_len_1, 0, is_reversed, v1_all_subreads, 0);
3254 else if(_global_retrieve_alignment_ptr(global_context, current_read_number, 0,0) -> selected_votes < 1)
3255 {
3256 mapping_result_t * allll = _global_retrieve_alignment_ptr(global_context, current_read_number, 0,0);
3257 allll -> noninformative_subreads_in_vote = 0;
3258
3259
3260 _global_retrieve_alignment_ptr(global_context, current_read_number, 0,0)->used_subreads_in_vote = max(_global_retrieve_alignment_ptr(global_context, current_read_number, 0,0)->used_subreads_in_vote, applied_subreads);
3261 _global_retrieve_alignment_ptr(global_context, current_read_number, 0,0)->noninformative_subreads_in_vote = max(_global_retrieve_alignment_ptr(global_context, current_read_number, 0,0)->noninformative_subreads_in_vote, vote_1 -> noninformative_subreads);
3262 }
3263 }
3264 }
3265
3266 if(is_reversed == 0)
3267 {
3268 reverse_read(read_text_1, read_len_1, global_context->config.space_type);
3269 if(global_context -> input_reads.is_paired_end_reads)
3270 reverse_read(read_text_2, read_len_2, global_context->config.space_type);
3271 }
3272 }
3273
3274 int read_1_reversed = 1;
3275 int read_2_reversed = 1;
3276 int best_read_id;
3277
3278 if(global_context -> is_final_voting_run){
3279 for(is_second_read = 0; is_second_read < 1 + global_context -> input_reads.is_paired_end_reads; is_second_read ++)
3280 {
3281 int * this_read_has_reversed = is_second_read ? &read_2_reversed:&read_1_reversed;
3282 char * current_read_name = is_second_read?read_name_2 : read_name_1;
3283 char * current_read = is_second_read?read_text_2 : read_text_1;
3284 char * current_qual = is_second_read?qual_text_2 : qual_text_1;
3285 int current_rlen = is_second_read?read_len_2:read_len_1;
3286
3287 for(best_read_id = 0; best_read_id < global_context -> config.multi_best_reads; best_read_id++)
3288 {
3289 mapping_result_t * current_r = _global_retrieve_alignment_ptr(global_context, current_read_number, is_second_read,best_read_id);
3290 if(current_r -> selected_votes < 1) continue;
3291
3292 int this_read_should_be_reversed = (current_r -> result_flags & CORE_IS_NEGATIVE_STRAND) ? 1:0;
3293
3294 //SUBREADprintf("DETECT INDEL: should_reverse = %d, this_has_reversed = %d\n", this_read_should_be_reversed, *this_read_has_reversed);
3295
3296 if(this_read_should_be_reversed != (*this_read_has_reversed))
3297 {
3298 (*this_read_has_reversed) = !(*this_read_has_reversed);
3299 reverse_read(current_read, current_rlen, global_context->config.space_type);
3300 if(current_qual)
3301 reverse_quality(current_qual , current_rlen);
3302 }
3303
3304 gene_value_index_t * curr_val_index = thread_context? thread_context -> current_value_index: global_context -> current_value_index;
3305 locate_current_value_index(global_context, thread_context, current_r, current_rlen);
3306
3307 // #warning "==== UNCOMMENT THE NEXT LINE ===="
3308
3309 if(0 && FIXLENstrcmp("R00000003493", current_read_name)==0)
3310 SUBREADprintf("TEST_BETTER: better_id = %d, votes = %d, has=%d\n", best_read_id, current_r -> selected_votes, has_better_mapping(global_context, thread_context, current_read_number, is_second_read,best_read_id));
3311
3312 if(!has_better_mapping(global_context, thread_context, current_read_number, is_second_read,best_read_id))
3313 find_new_indels(global_context, thread_context, current_read_number, current_read_name, current_read, current_qual, current_rlen, is_second_read, best_read_id);
3314 if(need_junction_step)
3315 find_new_junctions(global_context, thread_context, current_read_number, current_read_name, current_read, current_qual, current_rlen, is_second_read, best_read_id);
3316
3317 // #warning "==== UNCOMMENT THE NEXT LINE ===="
3318 if(0 && FIXLENstrcmp("R00000003493", current_read_name)==0){
3319 SUBREADprintf("TEST_BETTER_END: better_id = %d, votes = %d, has=%d\n", best_read_id, current_r -> selected_votes, has_better_mapping(global_context, thread_context, current_read_number, is_second_read,best_read_id));
3320 exit(1);
3321 }
3322
3323 if(thread_context) thread_context -> current_value_index = curr_val_index;
3324 else global_context -> current_value_index = curr_val_index;
3325 }
3326 }
3327 }
3328
3329 if(!thread_context || thread_context->thread_id == 0)
3330 {
3331 if(0 == global_context -> config.scRNA_input_mode && sqr_read_number > sqr_interval)
3332 {
3333 show_progress(global_context, thread_context, current_read_number, STEP_VOTING);
3334 sqr_read_number = 0;
3335 unsigned long long total_file_size = global_context -> input_reads.first_read_file_size;
3336 unsigned long long guessed_all_reads = total_file_size / global_context -> input_reads . avg_read_length;// / (1+global_context -> config.is_SAM_file_input);
3337 sqr_interval = max(10000,guessed_all_reads / global_context -> config.all_threads/10);
3338 }
3339 if(global_context -> config.scRNA_input_mode && current_read_number - last_shown_curr >= 1000000){
3340 show_progress(global_context, thread_context, current_read_number + global_context -> all_processed_reads, STEP_VOTING);
3341 last_shown_curr = current_read_number;
3342 }
3343 }
3344
3345
3346 //bigtable_release_result(global_context, thread_context, current_read_number, 1);
3347 sqr_read_number++;
3348
3349 }
3350
3351 free(vote_1);
3352 free(vote_2);
3353 free(vote_fg);
3354 free(read_text_1);
3355 free(read_text_2);
3356 free(qual_text_1);
3357 free(qual_text_2);
3358
3359 return 0;
3360 }
3361
3362 void subread_init_topKbuff(global_context_t * global_context, topK_buffer_t * topKbuff){
3363 topKbuff -> vote_simple_1_buffer = malloc(global_context -> config.max_vote_simples * sizeof(simple_mapping_t));
3364 topKbuff -> vote_simple_2_buffer = malloc(global_context -> config.max_vote_simples * sizeof(simple_mapping_t));
3365
3366 topKbuff -> junction_tmp_r1 = malloc(sizeof(subjunc_result_t) * global_context->config.multi_best_reads);
3367 topKbuff -> junction_tmp_r2 = malloc(sizeof(subjunc_result_t) * global_context->config.multi_best_reads);
3368
3369 topKbuff -> alignment_tmp_r1 = malloc(sizeof(mapping_result_t) * global_context->config.multi_best_reads);
3370 topKbuff -> alignment_tmp_r2 = malloc(sizeof(mapping_result_t) * global_context->config.multi_best_reads);
3371
3372 topKbuff -> comb_buffer = malloc(global_context -> config.max_vote_combinations * sizeof(vote_combination_t));
3373 }
3374
3375 void subread_free_topKbuff(global_context_t * global_context, topK_buffer_t * topKbuff){
3376
3377 free(topKbuff ->junction_tmp_r1);
3378 free(topKbuff ->junction_tmp_r2);
3379 free(topKbuff ->alignment_tmp_r1);
3380 free(topKbuff ->alignment_tmp_r2);
3381 free(topKbuff ->comb_buffer);
3382 free(topKbuff ->vote_simple_1_buffer);
3383 free(topKbuff ->vote_simple_2_buffer);
3384 }
3385
3386
3387
3388 void * run_in_thread(void * pthread_param)
3389 {
3390 void ** parameters = (void **)pthread_param;
3391 global_context_t * global_context = (global_context_t * ) parameters[0];
3392 thread_context_t * thread_context = (thread_context_t *) parameters[1];
3393 int task = *((int *)(parameters[2]));
3394 subread_lock_t * thread_init_lock = (subread_lock_t * ) parameters[3];
3395 int * ret_value_pointer = (int *)parameters[4];
3396
3397 if(thread_init_lock)
3398 subread_lock_release(thread_init_lock);
3399
3400
3401 switch(task)
3402 {
3403 case STEP_VOTING:
3404 *ret_value_pointer = do_voting(global_context, thread_context);
3405 break;
3406 case STEP_ITERATION_TWO:
3407 *ret_value_pointer = do_iteration_two(global_context, thread_context);
3408 break;
3409
3410 }
3411
3412 //sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_DETAILS, "finished running %d", task);
3413 return NULL;
3414 }
3415
3416 int run_maybe_threads(global_context_t *global_context, int task)
3417 {
3418 void * thr_parameters [5];
3419 int ret_value =0;
3420
3421 if(task == STEP_ITERATION_TWO){
3422 global_context -> last_written_fragment_number = -1;
3423 }
3424
3425
3426 if(global_context->config.all_threads<2) {
3427 thr_parameters[0] = global_context;
3428 thr_parameters[1] = NULL;
3429 thr_parameters[2] = &task;
3430 thr_parameters[3] = NULL;
3431 thr_parameters[4] = &ret_value;
3432
3433 run_in_thread(thr_parameters);
3434 if(STEP_VOTING == task){
3435 // sort and merge events from all threads and the global event space.
3436 sort_global_event_table(global_context);
3437 // sort the event entry table at each location.
3438 //#warning "==== UNCOMMENT NEXT LINE ======"
3439 sort_junction_entry_table(global_context);
3440 }
3441 } else {
3442 int current_thread_no ;
3443 thread_context_t thread_contexts[64];
3444 int ret_values[64];
3445
3446 memset(thread_contexts, 0, sizeof(thread_context_t)*64);
3447 global_context -> all_thread_contexts = thread_contexts;
3448
3449 for(current_thread_no = 0 ; current_thread_no < global_context->config.all_threads ; current_thread_no ++) {
3450 thread_contexts[current_thread_no].thread_id = current_thread_no;
3451 init_indel_thread_contexts(global_context, thread_contexts+current_thread_no, task);
3452 if((global_context->config.do_breakpoint_detection || global_context-> config.do_fusion_detection || global_context->config.do_breakpoint_detection || global_context-> config.do_long_del_detection))
3453 init_junction_thread_contexts(global_context, thread_contexts+current_thread_no, task);
3454
3455 if(STEP_VOTING == task) subread_init_topKbuff(global_context,&thread_contexts[current_thread_no].topKbuff);
3456
3457 subread_lock_occupy(&global_context -> thread_initial_lock);
3458 thr_parameters[0] = global_context;
3459 thr_parameters[1] = thread_contexts+current_thread_no;
3460 thr_parameters[2] = &task;
3461 thr_parameters[3] = (void *)&global_context -> thread_initial_lock;
3462 thr_parameters[4] = ret_values + current_thread_no;
3463
3464 pthread_create(&thread_contexts[current_thread_no].thread, NULL, run_in_thread, &thr_parameters);
3465 }
3466
3467 for(current_thread_no = 0 ; current_thread_no < global_context->config.all_threads ; current_thread_no ++) {
3468 pthread_join(thread_contexts[current_thread_no].thread, NULL);
3469
3470 if(STEP_ITERATION_TWO == task){
3471 global_context -> all_mapped_reads += thread_contexts[current_thread_no].all_mapped_reads;
3472 global_context -> all_correct_PE_reads += thread_contexts[current_thread_no].all_correct_PE_reads;
3473 global_context -> not_properly_pairs_wrong_arrangement += thread_contexts[current_thread_no].not_properly_pairs_wrong_arrangement;
3474 global_context -> not_properly_pairs_different_chro += thread_contexts[current_thread_no].not_properly_pairs_different_chro;
3475 global_context -> not_properly_different_strands += thread_contexts[current_thread_no].not_properly_different_strands;
3476 global_context -> not_properly_pairs_TLEN_wrong += thread_contexts[current_thread_no].not_properly_pairs_TLEN_wrong;
3477 global_context -> all_unmapped_reads += thread_contexts[current_thread_no].all_unmapped_reads;
3478 global_context -> not_properly_pairs_only_one_end_mapped += thread_contexts[current_thread_no].not_properly_pairs_only_one_end_mapped;
3479 global_context -> all_multimapping_reads += thread_contexts[current_thread_no].all_multimapping_reads;
3480 global_context -> all_uniquely_mapped_reads += thread_contexts[current_thread_no].all_uniquely_mapped_reads;
3481
3482 }
3483 if(STEP_VOTING == task) subread_free_topKbuff(global_context,&thread_contexts[current_thread_no].topKbuff);
3484 ret_value += *(ret_values + current_thread_no);
3485 if(ret_value)break;
3486 }
3487
3488 // sort and merge events from all threads and the global event space.
3489 finalise_indel_and_junction_thread(global_context, thread_contexts, task);
3490 if(STEP_VOTING == task)
3491 // sort the event entry table at each location.
3492 sort_junction_entry_table(global_context);
3493 }
3494
3495 if(CORE_SOFT_BR_CHAR == '\r')
3496 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO, "");
3497 return ret_value;
3498 }
3499
3500 void clean_context_after_chunk(global_context_t * context)
3501 {
3502 context -> running_processed_reads_in_chunk = 0;
3503 context -> processed_reads_in_chunk = 0;
3504 init_bigtable_results(context, 1);
3505
3506 indel_context_t * indel_context = (indel_context_t *)context -> module_contexts[MODULE_INDEL_ID];
3507 chromosome_event_t * event_space = indel_context -> event_space_dynamic;
3508
3509 int event_id;
3510 //memset(context -> big_margin_record , 0 , sizeof(*context -> big_margin_record) * context ->config.reads_per_chunk * (context->input_reads.is_paired_end_reads?2:1) * context->config.big_margin_record_size);
3511 for(event_id = 0; event_id < indel_context->total_events; event_id++){
3512 chromosome_event_t * event_body = event_space + event_id;
3513 event_body -> critical_read_id = 0xffffffffffffffffllu;
3514 }
3515 }
3516
3517 #define SKIP_CORE_NOEMPTY(fp_loc, buf_loc) { while(1){char *ret_loc = fgets(buf_loc, 3000, fp_loc); if(buf_loc[0]!='\n' || !ret_loc) break; } }
3518
3519 void locate_read_files(global_context_t * global_context, int type)
3520 {
3521 // The BCL input module uses its own chunking algorithm.
3522 if(global_context -> input_reads.first_read_file.file_type == GENE_INPUT_BCL) return;
3523
3524 if(type==SEEK_SET) {
3525 geinput_tell(&global_context -> input_reads.first_read_file, &global_context -> current_circle_start_position_file1);
3526 if(global_context ->input_reads.is_paired_end_reads)
3527 geinput_tell(&global_context -> input_reads.second_read_file, &global_context -> current_circle_start_position_file2);
3528 } else {
3529 geinput_tell(&global_context -> input_reads.first_read_file, &global_context -> current_circle_end_position_file1);
3530 if(global_context ->input_reads.is_paired_end_reads)
3531 geinput_tell(&global_context -> input_reads.second_read_file, &global_context -> current_circle_end_position_file2);
3532 }
3533 if(global_context -> input_reads.first_read_file.file_type == GENE_INPUT_SCRNA_FASTQ) return;
3534 if(global_context -> input_reads.first_read_file.file_type == GENE_INPUT_SCRNA_BAM) return;
3535
3536 // This variable is only used to display the running progress.
3537 // That is why the scRNA-seq mode doesn't need to keep the start pos.
3538 if(type==SEEK_SET)
3539 global_context -> current_circle_start_abs_offset_file1 = geinput_file_offset(&(global_context -> input_reads.first_read_file));
3540
3541 }
3542
3543 void rewind_read_files(global_context_t * global_context, int type)
3544 {
3545 if(type==SEEK_SET)
3546 {
3547 geinput_seek(&global_context -> input_reads.first_read_file, & global_context -> current_circle_start_position_file1);
3548 if(global_context ->input_reads.is_paired_end_reads)
3549 geinput_seek(&global_context -> input_reads.second_read_file, & global_context -> current_circle_start_position_file2);
3550 }
3551 else
3552 {
3553 geinput_seek(&global_context -> input_reads.first_read_file, & global_context -> current_circle_end_position_file1);
3554 if(global_context ->input_reads.is_paired_end_reads)
3555 geinput_seek(&global_context -> input_reads.second_read_file, & global_context -> current_circle_end_position_file2);
3556
3557 }
3558 }
3559
3560 void go_chunk_start( global_context_t * global_context ){
3561 if(global_context -> input_reads.first_read_file.file_type == GENE_INPUT_BCL)
3562 cacheBCL_go_chunk_start(&global_context -> input_reads.first_read_file.bcl_input);
3563 else
3564 rewind_read_files(global_context, SEEK_SET);
3565 global_context -> running_processed_reads_in_chunk=0;
3566 }
3567
3568 void go_chunk_nextchunk( global_context_t * global_context ){
3569 if(global_context -> input_reads.first_read_file.file_type == GENE_INPUT_BCL)
3570 cacheBCL_go_chunk_end(&global_context -> input_reads.first_read_file.bcl_input);
3571 else
3572 rewind_read_files(global_context, SEEK_END);
3573 global_context -> running_processed_reads_in_chunk=0;
3574 }
3575
3576 int read_chunk_circles(global_context_t *global_context)
3577 {
3578 int block_no;
3579
3580 // printf("GINP1 AT %llu\n", ftello(global_context -> input_reads.first_read_file.input_fp));
3581
3582 unsigned int chunk_no = 0;
3583
3584 global_context -> current_index = (gehash_t*) malloc(sizeof(gehash_t));
3585 global_context -> current_value_index = global_context -> all_value_indexes;
3586 global_context -> running_processed_reads_in_chunk=0;
3587 global_context -> processed_reads_in_chunk=0;
3588
3589 double time_load_index = miltime();
3590 for(block_no = 0; block_no< global_context->index_block_number; block_no++)
3591 {
3592 char tmp_fname[MAX_FILE_NAME_LENGTH+ 30];
3593 sprintf(tmp_fname, "%s.%02d.%c.array", global_context->config.index_prefix, block_no, global_context->config.space_type == GENE_SPACE_COLOR?'c':'b');
3594 if(gvindex_load(&global_context -> all_value_indexes[block_no], tmp_fname)) return -1;
3595 }
3596 double period_load_index = miltime() - time_load_index;
3597 global_context -> timecost_load_index += period_load_index;
3598
3599 while(1)
3600 {
3601 int ret;
3602
3603 locate_read_files(global_context, SEEK_SET);
3604 for(global_context->current_index_block_number = 0; global_context->current_index_block_number < global_context->index_block_number; global_context->current_index_block_number++)
3605 {
3606 char tmp_fname[MAX_FILE_NAME_LENGTH+30];
3607 time_load_index = miltime();
3608 if(global_context->index_block_number > 1 || chunk_no == 0)
3609 {
3610 sprintf(tmp_fname, "%s.%02d.%c.tab", global_context->config.index_prefix, global_context->current_index_block_number, global_context->config.space_type == GENE_SPACE_COLOR?'c':'b');
3611 print_in_box(80,0,0, "Load the %d-th index block...",1+ global_context->current_index_block_number);
3612
3613 if(gehash_load(global_context -> current_index, tmp_fname)) return -1;
3614 print_in_box(80,0,0, "The index block has been loaded.");
3615 sprintf(tmp_fname, "%s.%02d.%c.array", global_context->config.index_prefix, global_context->current_index_block_number, global_context->config.space_type == GENE_SPACE_COLOR?'c':'b');
3616 }
3617 period_load_index = miltime() - time_load_index;
3618 global_context -> timecost_load_index += period_load_index;
3619 global_context -> current_value_index = global_context -> all_value_indexes + global_context->current_index_block_number;
3620
3621 if(global_context->current_index_block_number ==0 && global_context -> all_processed_reads==0)
3622 global_context->align_start_time = miltime();
3623
3624 if(global_context->index_block_number == global_context->current_index_block_number + 1)
3625 global_context -> is_final_voting_run = 1;
3626 else global_context -> is_final_voting_run = 0;
3627
3628 print_in_box(80,0,0, "Start read mapping in chunk.");
3629 double time_start_voting = miltime();
3630 ret = run_maybe_threads(global_context, STEP_VOTING);
3631 double period_voting = miltime() - time_start_voting;
3632 global_context -> timecost_voting += period_voting;
3633
3634 if(0 == global_context->current_index_block_number){
3635 locate_read_files(global_context, SEEK_END);
3636 global_context -> processed_reads_in_chunk = global_context -> running_processed_reads_in_chunk;
3637 }
3638
3639 if(global_context->current_index_block_number < global_context->index_block_number -1)
3640 go_chunk_start(global_context);
3641 // rewind_read_files(global_context, SEEK_SET);
3642
3643 int is_last_chunk = global_context -> processed_reads_in_chunk < global_context->config.reads_per_chunk;
3644
3645 if(global_context->index_block_number > 1 || is_last_chunk)
3646 gehash_destory_fast(global_context -> current_index);
3647
3648 if(ret) break;
3649 if(!global_context -> processed_reads_in_chunk) break;
3650 }
3651
3652
3653 // after the voting step, all subread index blocks are released and all base index blocks are loaded at once.
3654 double time_before_realign = miltime();
3655
3656 if(0 == chunk_no && global_context->config.exon_annotation_file[0] && global_context->config.do_breakpoint_detection){
3657 ret = ret || load_known_junctions(global_context);
3658 if(!ret){
3659 // sort and merge events from all threads and the global event space.
3660 sort_global_event_table(global_context);
3661 // sort the event entry table at each location.
3662 sort_junction_entry_table(global_context);
3663 }
3664 }
3665
3666 ret = ret || anti_supporting_read_scan(global_context);
3667 remove_neighbour(global_context);
3668
3669 go_chunk_start(global_context);
3670 // rewind_read_files(global_context, SEEK_SET);
3671 double period_before_realign = miltime() - time_before_realign;
3672 global_context -> timecost_before_realign += period_before_realign;
3673
3674 double time_realign = miltime();
3675 ret = ret || run_maybe_threads(global_context, STEP_ITERATION_TWO);
3676 double period_realign = miltime() - time_realign;
3677
3678 global_context -> timecost_for_realign += period_realign;
3679
3680 if(global_context -> config.is_third_iteration_running)
3681 {
3682 go_chunk_start(global_context);
3683 //rewind_read_files(global_context, SEEK_SET);
3684 ret = ret || do_iteration_three(global_context, NULL);
3685 }
3686
3687 go_chunk_nextchunk(global_context);
3688 //rewind_read_files(global_context, SEEK_END);
3689
3690 global_context -> all_processed_reads+= global_context ->processed_reads_in_chunk;
3691
3692 if(ret) return ret;
3693
3694 if(global_context -> processed_reads_in_chunk < global_context->config.reads_per_chunk ||
3695 (global_context -> config.is_BAM_output && global_context -> output_bam_writer -> is_internal_error) || global_context -> input_reads.is_internal_error ||
3696 (global_context -> output_sam_is_full))
3697 // base value indexes loaded in the last circle are not destroyed and are used in writting the indel VCF.
3698 // the indexes will be destroyed in destroy_global_context
3699 break;
3700
3701 clean_context_after_chunk(global_context);
3702 chunk_no++;
3703 }
3704
3705 free(global_context -> current_index);
3706
3707 if(global_context -> config.is_third_iteration_running)
3708 finalise_long_insertions(global_context);
3709
3710 return 0;
3711 }
3712
3713 void char_strftime(char * tbuf){
3714 time_t rawtime;
3715 struct tm * timeinfo;
3716
3717 time (&rawtime);
3718 timeinfo = localtime (&rawtime);
3719 strftime (tbuf,80,"%d-%b-%Y %H:%M:%S",timeinfo);
3720
3721 }
3722
3723 void print_subread_logo()
3724 {
3725 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ========== %c[0m%c[36m _____ _ _ ____ _____ ______ _____ ", CHAR_ESC, CHAR_ESC, CHAR_ESC);
3726 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ===== %c[0m%c[36m / ____| | | | _ \\| __ \\| ____| /\\ | __ \\ ", CHAR_ESC, CHAR_ESC, CHAR_ESC);
3727 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ===== %c[0m%c[36m | (___ | | | | |_) | |__) | |__ / \\ | | | |", CHAR_ESC, CHAR_ESC, CHAR_ESC);
3728 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ==== %c[0m%c[36m \\___ \\| | | | _ <| _ /| __| / /\\ \\ | | | |", CHAR_ESC, CHAR_ESC, CHAR_ESC);
3729 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ==== %c[0m%c[36m ____) | |__| | |_) | | \\ \\| |____ / ____ \\| |__| |", CHAR_ESC, CHAR_ESC, CHAR_ESC);
3730 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %c[44;37m ========== %c[0m%c[36m |_____/ \\____/|____/|_| \\_\\______/_/ \\_\\_____/%c[0m", CHAR_ESC, CHAR_ESC, CHAR_ESC, CHAR_ESC);
3731 #ifdef MAKE_STANDALONE
3732 char * spaces = "";
3733 if(strlen(SUBREAD_VERSION) == 8) spaces = "";
3734 else if(strlen(SUBREAD_VERSION) == 5) spaces = " ";
3735 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %sv%s",spaces,SUBREAD_VERSION);
3736 #else
3737 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_INFO ," %s",SUBREAD_VERSION);
3738 #endif
3739 }
3740
3741
3742
3743 int print_configuration(global_context_t * context)
3744 {
3745 setlocale(LC_NUMERIC, "");
3746 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"");
3747 print_subread_logo();
3748 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"");
3749 print_in_box(80, 1, 1, context->config.entry_program_name == CORE_PROGRAM_SUBJUNC?"setting":"setting");
3750 print_in_box(80, 0, 1, "");
3751
3752 if(context->config.do_breakpoint_detection)
3753 {
3754 if(context-> config.do_fusion_detection)
3755 print_in_box(80, 0, 0, "Function : Read alignment + Junction/Fusion detection%s", context->config.experiment_type == CORE_EXPERIMENT_DNASEQ?" (DNA-Seq)":" (RNA-Seq)");
3756 else if(context-> config.do_long_del_detection)
3757 print_in_box(80, 0, 0, "Function : Read alignment + Long Deletion detection%s", context->config.experiment_type == CORE_EXPERIMENT_DNASEQ?" (DNA-Seq)":" (RNA-Seq)");
3758 else
3759 print_in_box(80, 0, 0, "Function : Read alignment + Junction detection (%s)", context->config.experiment_type == CORE_EXPERIMENT_DNASEQ?"DNA-Seq":"RNA-Seq");
3760 }
3761 else
3762 print_in_box(80, 0, 0, "Function : Read alignment%s", context->config.experiment_type == CORE_EXPERIMENT_DNASEQ?" (DNA-Seq)":" (RNA-Seq)");
3763 if( context->config.second_read_file[0]){
3764 print_in_box(80, 0, 0, "Input file 1 : %s", get_short_fname(context->config.first_read_file));
3765 print_in_box(80, 0, 0, "Input file 2 : %s", get_short_fname(context->config.second_read_file));
3766 }else if(context->config.scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ){
3767 const char *strmm_tmp = context->config.first_read_file;
3768 int sample_no = 1;
3769 while((strmm_tmp = strstr(strmm_tmp, SCRNA_FASTA_SPLIT1))!=NULL) {
3770 sample_no++;
3771 strmm_tmp++;
3772 }
3773 print_in_box(80, 0, 0, "Input file : %d samples from scRNA-seq", sample_no);
3774 }else if(context->config.scRNA_input_mode == GENE_INPUT_BCL){
3775 print_in_box(80, 0, 0, "Input file : %s%s", get_short_fname(context->config.first_read_file), " (scRNA)");
3776 }else if(context->config.scRNA_input_mode == GENE_INPUT_SCRNA_BAM){
3777 print_in_box(80, 0, 0, "Input file : %s%s", get_short_fname(context->config.first_read_file), " (10X BAM)");
3778 }else{
3779 print_in_box(80, 0, 0, "Input file : %s%s", get_short_fname(context->config.first_read_file), context->config.is_SAM_file_input?(context->config.is_BAM_input?" (BAM)":" (SAM)"):(""));
3780 }
3781
3782 if(context->config.output_prefix [0])
3783 print_in_box(80, 0, 0, "Output file : %s (%s)%s", get_short_fname(context->config.output_prefix), context->config.is_BAM_output?"BAM":"SAM", context->config.is_input_read_order_required?", Keep Order":(context->config.sort_reads_by_coordinates?", Sorted":""));
3784 else
3785 print_in_box(80, 0, 0, "Output method : STDOUT (%s)" , context->config.is_BAM_output?"BAM":"SAM");
3786
3787 print_in_box(80, 0, 0, "Index name : %s", get_short_fname(context->config.index_prefix));
3788 print_in_box(80, 0, 0, "");
3789 print_in_box(80, 0, 1, "------------------------------------");
3790 print_in_box(80, 0, 0, "");
3791 print_in_box(80, 0, 0, " Threads : %d", context->config.all_threads);
3792 print_in_box(80, 0, 0, " Phred offset : %d", (context->config.phred_score_format == FASTQ_PHRED33)?33:64);
3793 if( context->config.second_read_file[0])
3794 {
3795 print_in_box(80, 0, 0, " # of extracted subreads : %d", context->config.total_subreads);
3796 print_in_box(80, 0, 0, " Min read1 vote : %d", context->config.minimum_subread_for_first_read);
3797 print_in_box(80, 0, 0, " Min read2 vote : %d", context->config.minimum_subread_for_second_read);
3798 print_in_box(80, 0, 0, " Max fragment size : %d", context->config.maximum_pair_distance);
3799 print_in_box(80, 0, 0, " Min fragment size : %d", context->config.minimum_pair_distance);
3800 }
3801 else
3802 print_in_box(80, 0, 0, " Min votes : %d / %d", context->config.minimum_subread_for_first_read, context->config.total_subreads);
3803
3804 print_in_box(80, 0, 0, " Max mismatches : %d", context->config.max_mismatch_exonic_reads);
3805 print_in_box(80, 0, 0, " Max indel length : %d", context->config.max_indel_length);
3806 print_in_box(80, 0, 0, " Report multi-mapping reads : %s", context->config.report_multi_mapping_reads?"yes":"no");
3807 print_in_box(80, 0, 0, "Max alignments per multi-mapping read : %d", context->config.multi_best_reads);
3808
3809 if(context->config.exon_annotation_file[0]){
3810 if(context->config.exon_annotation_file_screen_out[0])
3811 print_in_box(80, 0, 0, " Annotations : %s", context->config.exon_annotation_file_screen_out);
3812 else
3813 print_in_box(80, 0, 0, " Annotations : %s (%s)",get_short_fname( context->config.exon_annotation_file), context->config.exon_annotation_file_type==FILE_TYPE_GTF?"GTF":"SAF");
3814 }
3815
3816 if(context->config.max_insertion_at_junctions)
3817 print_in_box(80, 0, 0, " Insertions at junc : %d", context->config.max_insertion_at_junctions);
3818
3819 if(context->config.read_group_id[0])
3820 print_in_box(80, 0, 0, " Read group name : %s", context->config.read_group_id);
3821
3822 print_in_box(80, 0, 1, "");
3823 print_in_box(80, 2, 1, "");
3824 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"");
3825
3826
3827 if(!context->config.experiment_type){
3828 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"You have to specify the experiment type by using the '-t' option.\n");
3829 return -1;
3830 }
3831
3832 if(!context->config.first_read_file[0])
3833 {
3834 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"You have to specify at least one input file in the FASTQ/FASTA/PLAIN format using the '-r' option.\n");
3835 return -1;
3836 }
3837
3838 if(0 && !context->config.output_prefix[0])
3839 {
3840 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"You have to specify the path of output using the '-o' option.\n");
3841 return -1;
3842 }
3843
3844 if(!context->config.index_prefix[0])
3845 {
3846 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"You have to specify the prefix of the index files using the '-i' option.\n");
3847 return -1;
3848 }
3849 char tbuf[90];
3850 char_strftime(tbuf);
3851
3852 print_in_box(80,1,1,"Running (%s, pid=%d)", tbuf, getpid());
3853 print_in_box(80,0,1,"");
3854
3855
3856 return 0;
3857 }
3858
3859
3860
3861 int init_paired_votes(global_context_t *context)
3862 {
3863 init_bigtable_results(context, 0);
3864 return 0;
3865 }
3866
3867 #define FETCH_SEQ_LEN(x1) (context->chromosome_table.read_offsets[xk1] - last_offset+16 - ( 2 * context->chromosome_table.padding ))
3868
3869 void write_sam_headers(global_context_t * context)
3870 {
3871 char * sorting_str = context -> config.sort_reads_by_coordinates?"coordinate":"unsorted";
3872 if(context -> config.is_BAM_output)
3873 {
3874 char header_buff[100];
3875 sprintf(header_buff, "@HD\tVN:1.0\tSO:%s", sorting_str);
3876 SamBam_writer_add_header(context -> output_bam_writer, header_buff, 0);
3877 int xk1;
3878 int last_offset = 0;
3879 char * obuf = malloc(10000+5000);
3880 for(xk1=0; xk1< context->chromosome_table.total_offsets; xk1++)
3881 {
3882 int seq_len = FETCH_SEQ_LEN(x1);
3883 //context->chromosome_table.read_offsets[xk1] - last_offset+16
3884 SamBam_writer_add_chromosome(context -> output_bam_writer, context->chromosome_table.read_names+ xk1 * MAX_CHROMOSOME_NAME_LEN, seq_len, 1);
3885 last_offset = context->chromosome_table.read_offsets[xk1];
3886 }
3887
3888
3889 if(context->config.read_group_id[0])
3890 {
3891 snprintf(obuf,10000, "@RG\tID:%s%s",context->config.read_group_id, context->config.read_group_txt);
3892 SamBam_writer_add_header(context -> output_bam_writer,obuf, 0);
3893 }
3894 snprintf(obuf,9999+4900, "@PG\tID:subread\tPN:subread\tVN:%s\tCL:%s", SUBREAD_VERSION, context->rebuilt_command_line);
3895 SamBam_writer_add_header(context -> output_bam_writer,obuf, 0);
3896 SamBam_writer_finish_header(context -> output_bam_writer);
3897 free(obuf);
3898 }
3899 else
3900 {
3901 sambamout_fprintf(context -> output_sam_fp, "@HD\tVN:1.0\tSO:%s\n", sorting_str);
3902 int xk1;
3903 int last_offset = 0;
3904 for(xk1=0; xk1< context->chromosome_table.total_offsets; xk1++)
3905 {
3906 int seq_len = FETCH_SEQ_LEN(x1);
3907 sambamout_fprintf(context -> output_sam_fp, "@SQ\tSN:%s\tLN:%u\n", context->chromosome_table.read_names+ xk1 * MAX_CHROMOSOME_NAME_LEN, seq_len);
3908 last_offset = context->chromosome_table.read_offsets[xk1];
3909 }
3910
3911 if(context->config.read_group_id[0])
3912 sambamout_fprintf(context -> output_sam_fp, "@RG\tID:%s%s\n",context->config.read_group_id, context->config.read_group_txt);
3913 sambamout_fprintf(context -> output_sam_fp, "@PG\tID:subread\tPN:subread\tVN:%s\tCL:%s\n", SUBREAD_VERSION, context->rebuilt_command_line);
3914
3915 }
3916 }
3917
3918 #define EXONIC_REGION_RESOLUTION 16
3919
3920 int is_pos_in_annotated_exon_regions(global_context_t * global_context, unsigned int pos){
3921 int exonic_map_byte = pos / EXONIC_REGION_RESOLUTION / 8;
3922 int exonic_map_bit = (pos / EXONIC_REGION_RESOLUTION) % 8;
3923
3924 return (global_context ->exonic_region_bitmap [exonic_map_byte] & (1<<exonic_map_bit))?1:0;
3925
3926 }
3927
3928 char * get_sam_chro_name_from_alias(HashTable * tab, char * anno_chro){
3929 KeyValuePair * cursor = NULL;
3930 long x1;
3931 for(x1 = 0; x1 < tab -> numOfBuckets; x1 ++){
3932 cursor = tab -> bucketArray[x1];
3933 while(1){
3934 if(NULL == cursor)break;
3935 char * tab_anno_chro = cursor -> value;
3936 if(strcmp(tab_anno_chro, anno_chro) == 0) return (char *)cursor -> key;
3937 cursor = cursor -> next;
3938 }
3939 }
3940 return NULL;
3941 }
3942
3943 int do_anno_bitmap_add_feature(char * gene_name, char * tracnscript_id, char * chro_name, unsigned int feature_start, unsigned int feature_end, int is_negative_strand, void * context){
3944 global_context_t * global_context = context;
3945 if(global_context -> sam_chro_to_anno_chr_alias){
3946 char * sam_chro = get_sam_chro_name_from_alias(global_context -> sam_chro_to_anno_chr_alias, chro_name);
3947 if(sam_chro!=NULL) chro_name = sam_chro;
3948 }
3949
3950 if(!HashTableGet(global_context -> annotation_chro_table,chro_name))
3951 HashTablePut(global_context -> annotation_chro_table, memstrcpy( chro_name ), NULL+1);
3952
3953 char tmp_chro_name[MAX_CHROMOSOME_NAME_LEN];
3954 int access_n = HashTableGet( global_context -> chromosome_table.read_name_to_index, chro_name ) - NULL;
3955 if(access_n < 1){
3956 if(chro_name[0]=='c' && chro_name[1]=='h' && chro_name[2]=='r'){
3957 chro_name += 3;
3958 }else{
3959 strcpy(tmp_chro_name, "chr");
3960 strcat(tmp_chro_name, chro_name);
3961 chro_name = tmp_chro_name;
3962 }
3963 }
3964
3965 unsigned int exonic_map_start = linear_gene_position(&global_context->chromosome_table , chro_name, feature_start);
3966 unsigned int exonic_map_stop = linear_gene_position(&global_context->chromosome_table , chro_name, feature_end);
3967 if(exonic_map_start > 0xffffff00 || exonic_map_stop > 0xffffff00){
3968 return -1;
3969 }
3970
3971 exonic_map_start -= exonic_map_start%EXONIC_REGION_RESOLUTION;
3972 exonic_map_stop -= exonic_map_stop%EXONIC_REGION_RESOLUTION;
3973
3974 for(; exonic_map_start <= exonic_map_stop; exonic_map_start+=EXONIC_REGION_RESOLUTION){
3975 int exonic_map_byte = exonic_map_start / EXONIC_REGION_RESOLUTION / 8;
3976 int exonic_map_bit = (exonic_map_start / EXONIC_REGION_RESOLUTION) % 8;
3977
3978 global_context ->exonic_region_bitmap[exonic_map_byte] |= (1<<exonic_map_bit);
3979 }
3980 return 0;
3981 }
3982
3983 void convert_table_key_to_alias(void * ky ,void * va, HashTable * tab){
3984 HashTable * alias_index_to_annot = tab -> appendix2;
3985 HashTable * result_annot_chros = tab -> appendix1;
3986
3987 char * index_name = ky;
3988 char * annot_name = HashTableGet(alias_index_to_annot, index_name);
3989 if(annot_name==NULL) annot_name = index_name;
3990 HashTablePut( result_annot_chros, annot_name, va );
3991 }
3992
3993 void warning_anno_vs_index(HashTable * anno_chros_tab, gene_offset_t * index_chros_offset, HashTable * alias_index_to_annot){
3994
3995 HashTable * index_chros_tab = index_chros_offset -> read_name_to_index;
3996 HashTable * chros_in_annot_using_index_names = anno_chros_tab ;
3997 HashTable * chros_in_index_using_annot_names = index_chros_tab ;
3998 HashTable * alias_annot_to_index = NULL;
3999
4000 if(alias_index_to_annot){
4001 chros_in_annot_using_index_names = StringTableCreate(1000);
4002 chros_in_index_using_annot_names = StringTableCreate(1000);
4003
4004 alias_annot_to_index = StringTableReverse(alias_index_to_annot);
4005 anno_chros_tab -> appendix1 = chros_in_annot_using_index_names;
4006 anno_chros_tab -> appendix2 = alias_annot_to_index;
4007 HashTableIteration(anno_chros_tab, convert_table_key_to_alias);
4008
4009 index_chros_tab -> appendix1 = chros_in_index_using_annot_names;
4010 index_chros_tab -> appendix2 = alias_index_to_annot;
4011 HashTableIteration(index_chros_tab, convert_table_key_to_alias);
4012 }
4013
4014 warning_hash_hash(anno_chros_tab, chros_in_index_using_annot_names, "Chromosomes/contigs in annotation but not in index :");
4015 warning_hash_hash(index_chros_tab, chros_in_annot_using_index_names, "Chromosomes/contigs in index but not in annotation :");
4016
4017 if(alias_index_to_annot){
4018 HashTableDestroy(alias_annot_to_index);
4019 HashTableDestroy(chros_in_annot_using_index_names);
4020 HashTableDestroy(chros_in_index_using_annot_names);
4021 }
4022 }
4023
4024 int load_annotated_exon_regions(global_context_t * global_context){
4025 int bitmap_size = (4096 / EXONIC_REGION_RESOLUTION / 8)*1024*1024;
4026 global_context ->exonic_region_bitmap = malloc(bitmap_size);
4027 memset( global_context ->exonic_region_bitmap , 0, bitmap_size );
4028 global_context -> annotation_chro_table = HashTableCreate(1003);
4029 HashTableSetDeallocationFunctions( global_context -> annotation_chro_table, free, NULL);
4030 HashTableSetKeyComparisonFunction( global_context -> annotation_chro_table, my_strcmp);
4031 HashTableSetHashFunction( global_context -> annotation_chro_table, fc_chro_hash);
4032
4033 int loaded_features = load_features_annotation(global_context->config.exon_annotation_file, global_context->config.exon_annotation_file_type, global_context->config.exon_annotation_gene_id_column, NULL, global_context->config.exon_annotation_feature_name_column, global_context, do_anno_bitmap_add_feature);
4034 if(loaded_features < 0)return -1;
4035 else print_in_box(80,0,0,"%d annotation records were loaded.\n", loaded_features);
4036 return 0;
4037 }
4038
4039 int load_global_context(global_context_t * context)
4040 {
4041 char tmp_fname [MAX_FILE_NAME_LENGTH + 50];
4042 int min_phred_score = -1 , max_phred_score = -1;
4043 context -> is_phred_warning = 0;
4044
4045 if(context->config.multi_best_reads>1 && ! context->config.report_multi_mapping_reads){
4046 print_in_box(80,0,0,"WARNING: Multi-mapping reads are reported.");
4047 context->config.report_multi_mapping_reads = 1;
4048 }
4049
4050 if(context->config.scRNA_input_mode){
4051 context -> config.multi_best_reads = 3;
4052 context -> config.do_remove_neighbour_for_scRNA = 1;
4053 context -> config.reads_per_chunk = 30000000*context -> config.multi_best_reads;
4054 context -> config.multi_best_reads = max(context -> config.multi_best_reads , context -> config.reported_multi_best_reads);
4055 // opening a BCL input needs the exact chunk size.
4056 if(context->config.multi_best_reads>1) context -> config.reads_per_chunk /= context->config.multi_best_reads;
4057 char * reads_per_chunk = getenv("CC_READS_PER_CHUNK");
4058 char * remove_beighbour = getenv("CC_REMOVE_NEIGHBOUR");
4059 if(remove_beighbour) context -> config.do_remove_neighbour_for_scRNA = remove_beighbour[0]-'0';
4060 if(reads_per_chunk) context -> config.reads_per_chunk = atoi(reads_per_chunk);
4061 }
4062 print_in_box(80,0,0,"Check the input reads.");
4063 subread_init_lock(&context->input_reads.input_lock);
4064 if(core_geinput_open(context, &context->input_reads.first_read_file, 1)) {
4065 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open the input file '%s'\n", context->config.first_read_file);
4066 return -1;
4067 }
4068
4069 context->config.space_type = context->input_reads.first_read_file.space_type;
4070 print_in_box(89,0,0,"The input file contains %c[36m%s%c[0m space reads.", CHAR_ESC, context->config.space_type == GENE_SPACE_COLOR?"color":"base", CHAR_ESC);
4071 if(context->config.space_type == GENE_SPACE_COLOR && context->config.is_BAM_output && !context->config.convert_color_to_base)
4072 {
4073 print_in_box(80,0,0,"The color-space bases will be converted into base space in the BAM output.");
4074 context->config.convert_color_to_base=1;
4075 }
4076 else if(context->config.space_type == GENE_SPACE_BASE && context->config.convert_color_to_base)
4077 {
4078 print_in_box(80,0,0,"The reads will not be converted into base space.");
4079 context->config.convert_color_to_base=0;
4080 }
4081
4082
4083
4084 if(context->input_reads.is_paired_end_reads)
4085 {
4086 if(core_geinput_open(context, &context->input_reads.second_read_file, 2))
4087 {
4088 //sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' as input. Please check if it exists, you have the permission to read it, and it is in the correct format.\n", context->config.second_read_file);
4089 return -1;
4090 }
4091
4092 context -> config.max_vote_combinations = 3;
4093 context -> config.multi_best_reads = 3;
4094 context -> config.max_vote_simples = 64;
4095 context -> config.max_vote_number_cutoff = 2;
4096 }else{
4097 context -> config.max_vote_combinations = 3;
4098 context -> config.multi_best_reads = 3;
4099 context -> config.max_vote_simples = 3;
4100 context -> config.max_vote_number_cutoff = 2;
4101 }
4102
4103 print_in_box(80,0,0,"Initialise the memory objects.");
4104 context -> config.max_vote_simples = max(context -> config.max_vote_simples , context -> config.reported_multi_best_reads);
4105 context -> config.max_vote_combinations = max(context -> config.max_vote_combinations , context -> config.reported_multi_best_reads);
4106
4107 if(!context->config.scRNA_input_mode){
4108 // if it is BCL input then these two parameters have been decided .
4109 if(context->config.multi_best_reads>1) context->config.reads_per_chunk /= context->config.multi_best_reads;
4110 if(context->input_reads.is_paired_end_reads) context->config.reads_per_chunk /= 2;
4111 }
4112
4113 struct stat ginp1_stat;
4114 int guess_tested_reads = 0;
4115 stat(context->config.first_read_file , &ginp1_stat);
4116 context->input_reads.first_read_file_size = ginp1_stat.st_size;
4117
4118 print_in_box(80,0,0,"Estimate the mean read length.");
4119 context -> input_reads.avg_read_length = 100;
4120 if(context->config.scRNA_input_mode != GENE_INPUT_SCRNA_BAM && context->config.scRNA_input_mode != GENE_INPUT_SCRNA_FASTQ) context -> input_reads.avg_read_length = guess_reads_density_format(context->config.first_read_file , context->config.is_SAM_file_input?1:0, &min_phred_score, &max_phred_score , &guess_tested_reads);
4121 if(context -> input_reads.avg_read_length<0 )context -> input_reads.avg_read_length = 250;
4122 // SUBREADprintf("QR=[%d,%d]; ALEN=%f\n", min_phred_score, max_phred_score, context -> input_reads.avg_read_length);
4123 if(max_phred_score>=0)
4124 {
4125 int inferred_offset;
4126
4127 if(abs(min_phred_score - 33) < abs(min_phred_score - 64)) inferred_offset = 33;
4128 else inferred_offset = 64;
4129
4130 if(context -> input_reads.first_read_file.file_type != GENE_INPUT_GZIP_FASTA){
4131 if((context->config.phred_score_format == FASTQ_PHRED64 && inferred_offset == 33) ||
4132 (context->config.phred_score_format == FASTQ_PHRED33 && inferred_offset == 64))
4133 {
4134 print_in_box(80,0,0, "WARNING - The specified Phred score offset (%d) seems incorrect.", context->config.phred_score_format == FASTQ_PHRED33?33:64);
4135 print_in_box(80,0,0, " ASCII values of the quality scores of read bases included in");
4136 print_in_box(80,0,0, " the first %d reads were found to be within the range of",guess_tested_reads);
4137 print_in_box(80,0,0, " [%d,%d].", min_phred_score, max_phred_score);
4138 print_in_box(80,0,0, "");
4139 context -> is_phred_warning = 1;
4140 }
4141 else{
4142 print_in_box(80,0,0, "The range of Phred scores observed in the data is [%d,%d]", min_phred_score - inferred_offset, max_phred_score - inferred_offset);
4143 }
4144 }
4145 }
4146
4147 subread_init_lock(&context -> output_lock);
4148
4149 if(context->config.report_sam_file && context -> config.output_prefix[0])
4150 {
4151 // ====== open output files ======
4152 // Only the sam file is opened here; other files like bed, indel and etc are opened in init_modules()
4153 sprintf(tmp_fname,"%s", context->config.output_prefix);
4154
4155 print_in_box(80,0,0,"Create the output %s file.", context -> config.is_BAM_output?"BAM":"SAM");
4156 if(context -> config.is_BAM_output)
4157 {
4158 context -> output_bam_writer = malloc(sizeof(SamBam_Writer));
4159 SamBam_writer_create(context -> output_bam_writer , tmp_fname, context -> config.is_input_read_order_required?1:context -> config.all_threads, context -> config.sort_reads_by_coordinates, context->config.scRNA_input_mode, context -> config.temp_file_prefix);
4160 context -> output_sam_fp = NULL;
4161 }
4162 else
4163 {
4164 context -> output_sam_fp = f_subr_open(tmp_fname,"wb");
4165 //context -> output_sam_inner_buffer = malloc(OUTPUT_BUFFER_SIZE);
4166 //setvbuf (context -> output_sam_fp, context -> output_sam_inner_buffer, _IOFBF, OUTPUT_BUFFER_SIZE);
4167 context -> output_bam_writer = NULL;
4168 }
4169 if((!context -> output_bam_writer) && (!context->output_sam_fp))
4170 {
4171 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable to open '%s' to write. Please check if the path exists and you have the permission to create/write this file", tmp_fname);
4172 return -1;
4173 }
4174 }
4175 else
4176 {
4177 if(context -> config.is_BAM_output)
4178 {
4179 context -> output_bam_writer = malloc(sizeof(SamBam_Writer));
4180 SamBam_writer_create(context -> output_bam_writer ,NULL, context -> config.is_input_read_order_required?1:context -> config.all_threads, context -> config.sort_reads_by_coordinates, 0, context -> config.temp_file_prefix);
4181 }
4182 context->output_sam_fp = NULL;
4183 }
4184
4185 // ====== check index files, count blocks and load chro table ======
4186 print_in_box(80,0,0,"Check the index.");
4187 sprintf(tmp_fname, "%s.reads", context->config.index_prefix);
4188 if(!does_file_exist(tmp_fname))
4189 {
4190 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Unable top open index '%s'. Please make sure that the correct prefix is specified and you have the permission to read these files. For example, if there are files '/opt/my_index.reads', '/opt/my_index.files' and etc, the index prefix should be specified as '/opt/my_index' without any suffix. \n", context->config.index_prefix);
4191 return -1;
4192 }
4193
4194 context->current_index_block_number = 0;
4195 if(load_offsets(&context->chromosome_table, context->config.index_prefix)){
4196 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"\nThe index was built by using an old version of Subread; its format is no longer supported. Please use the current version of the index builder to rebuild it.\n");
4197 return 1;
4198 }
4199
4200 if(context->config.space_type == GENE_SPACE_COLOR)
4201 sprintf(tmp_fname, "%s.00.c.tab", context->config.index_prefix);
4202 else
4203 sprintf(tmp_fname, "%s.00.b.tab", context->config.index_prefix);
4204 if(!does_file_exist(tmp_fname))
4205 {
4206 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Your reads are in the %s space but the index was not built in the same space. Unable to precess the reads.\n", context->config.space_type == GENE_SPACE_COLOR?"color":"base");
4207 return -1;
4208 }
4209
4210 context->index_block_number = 0;
4211 while(1)
4212 {
4213 sprintf(tmp_fname, "%s.%02d.%c.tab", context->config.index_prefix, context->index_block_number, context->config.space_type == GENE_SPACE_COLOR?'c':'b');
4214 if(!does_file_exist(tmp_fname))break;
4215 context->index_block_number ++;
4216 if(context->index_block_number>=2 && context->config.max_indel_length > 16)
4217 {
4218 print_in_box(80,0,0,"ERROR You cannot use multi-block index for very-long indel detection.");
4219 print_in_box(80,0,0,"Please set the maximum indel length <= 16.");
4220 return -1;
4221 }
4222 }
4223
4224 if(context->config.report_sam_file)
4225 write_sam_headers(context);
4226
4227 print_in_box(80,0,0,"Init the voting space.");
4228 // ====== init other variables ======
4229 if(context -> config.all_threads>1)
4230 subread_init_lock(&context -> thread_initial_lock);
4231 else subread_init_topKbuff(context, &context -> topKbuff);
4232
4233 if(init_paired_votes(context))
4234 {
4235 sublog_printf(SUBLOG_STAGE_RELEASED, SUBLOG_LEVEL_ERROR,"Cannot initialise the voting space. You need at least 2GB of empty physical memory to run this program.\n");
4236 return 1;
4237 }
4238 sublog_printf(SUBLOG_STAGE_DEV1, SUBLOG_LEVEL_DEBUG, "load_global_context: finished");
4239
4240 memset( context->all_value_indexes , 0 , 100 * sizeof(gene_value_index_t));
4241
4242 context -> sam_chro_to_anno_chr_alias = NULL;
4243 if(context->config.exon_annotation_file[0]){
4244 print_in_box(80,0,0,"Load the annotation file.");
4245 if( load_annotated_exon_regions( context ) ) return -1;
4246 if(context->config.exon_annotation_alias_file[0])
4247 context -> sam_chro_to_anno_chr_alias = load_alias_table(context->config.exon_annotation_alias_file);
4248 warning_anno_vs_index(context -> annotation_chro_table, &context -> chromosome_table, context -> sam_chro_to_anno_chr_alias);
4249 HashTableDestroy(context -> annotation_chro_table);
4250 } else context -> exonic_region_bitmap = NULL;
4251 print_in_box(80,0,0,"Global environment is initialised.");
4252 return 0;
4253 }
4254
4255
4256 int init_modules(global_context_t * context)
4257 {
4258 sublog_printf(SUBLOG_STAGE_DEV1, SUBLOG_LEVEL_DEBUG, "init_modules: begin");
4259 int ret = init_indel_tables(context);
4260 if((context->config.do_breakpoint_detection || context-> config.do_fusion_detection || context->config.do_breakpoint_detection || context-> config.do_long_del_detection))
4261 ret = ret || init_junction_tables(context);
4262
4263 sublog_printf(SUBLOG_STAGE_DEV1, SUBLOG_LEVEL_DEBUG, "init_modules: finished: %d",ret);
4264 return ret;
4265 }
4266
4267 int destroy_modules(global_context_t * context)
4268 {
4269 destroy_indel_module(context);
4270 if((context->config.do_breakpoint_detection || context-> config.do_fusion_detection || context->config.do_breakpoint_detection || context-> config.do_long_del_detection))
4271 destroy_junction_tables(context);
4272 return 0;
4273 }
4274
4275 int destroy_global_context(global_context_t * context)
4276 {
4277 int xk1, block_no, ret = 0;
4278
4279 if(context -> exonic_region_bitmap) free(context -> exonic_region_bitmap);
4280
4281 for(block_no = 0; block_no< context->index_block_number; block_no++)
4282 gvindex_destory(&context -> all_value_indexes[block_no]);
4283
4284 if(context->config.all_threads<2){
4285 subread_free_topKbuff( context,&context -> topKbuff );
4286 }
4287 if(context->output_sam_fp) {
4288 if(context -> output_sam_is_full){
4289 unlink(context->config.output_prefix);
4290 SUBREADprintf("\nERROR: cannot finish the SAM file. Please check the disk space in the output directory.\nNo output file was generated.\n");
4291 ret = 1;
4292 }
4293 fclose(context -> output_sam_fp);
4294 }
4295
4296 if(context->input_reads.is_internal_error){
4297 unlink(context->config.output_prefix);
4298 return 1;
4299 }
4300
4301 if(context->output_bam_writer) {
4302 SamBam_writer_close(context->output_bam_writer);
4303 if(context->output_bam_writer -> is_internal_error){
4304 unlink(context->config.output_prefix);
4305 SUBREADprintf("\nERROR: cannot finish the BAM file. Please check the disk space in the output directory.\nNo output file was generated.\n");
4306 ret = 1;
4307 }
4308 free(context->output_bam_writer);
4309 context->output_bam_writer=NULL;
4310 }
4311 //free(context->big_margin_record);
4312
4313 for(xk1=0; xk1<5; xk1++)
4314 if(context->module_contexts[xk1])free(context->module_contexts[xk1]);
4315 geinput_close(&context -> input_reads.first_read_file);
4316 if(context->input_reads.is_paired_end_reads) geinput_close(&context -> input_reads.second_read_file);
4317 destroy_offsets(&context->chromosome_table);
4318 finalise_bigtable_results(context);
4319
4320 if((context -> will_remove_input_file & 1) && strstr(context ->config.first_read_file, "/core-temp")) unlink(context ->config.first_read_file);
4321 if((context -> will_remove_input_file & 2) && strstr(context ->config.second_read_file, "/core-temp")) unlink(context ->config.second_read_file);
4322 free(context -> rebuilt_command_line);
4323
4324 return ret;
4325 }
4326
4327 void subread_rebuild_cmd(int argc, char ** argv, global_context_t * global_context){
4328 global_context -> rebuilt_command_line_size = rebuild_command_line(&global_context -> rebuilt_command_line , argc, argv);
4329 }
4330
4331
4332 int write_bincigar_part(char * bincigar, int chropt, unsigned int optlen, int bincigar_len)
4333 {
4334 int binopt, binbytes, x1;
4335
4336 if(optlen<1) return -1;
4337
4338 if(optlen < 4) binbytes=1;
4339 else if(optlen < 1024) binbytes=2;
4340 else if(optlen < 262144) binbytes=3;
4341 else if(optlen < 67108864) binbytes=4;
4342 else binbytes=5;
4343
4344 if(bincigar_len<binbytes) return -1;
4345
4346 switch(chropt)
4347 {
4348 case 'S':
4349 binopt = CORE_CIGAR_OPT_S;
4350 break;
4351 case 'M':
4352 binopt = CORE_CIGAR_OPT_M;
4353 break;
4354 case 'I':
4355 binopt = CORE_CIGAR_OPT_I;
4356 break;
4357 case 'D':
4358 binopt = CORE_CIGAR_OPT_D;
4359 break;
4360 case 'N':
4361 binopt = CORE_CIGAR_OPT_N;
4362 break;
4363 case 'n':
4364 binopt = CORE_CIGAR_OPT_NN;
4365 break;
4366 case 'B':
4367 binopt = CORE_CIGAR_OPT_B;
4368 break;
4369 case 'b':
4370 binopt = CORE_CIGAR_OPT_BB;
4371 break;
4372 default:
4373 return -1;
4374 }
4375
4376 bincigar[0]=binopt | (binbytes << 3) | ((optlen & 3)<< 6);
4377 optlen >>= 2;
4378 for(x1=1;x1<binbytes; x1++)
4379 {
4380 bincigar[x1] = optlen&0xff;
4381 optlen>>=8;
4382 }
4383
4384 return binbytes;
4385 }
4386
4387 // function returns the actual length of bincigar, or -1 if anything is wrong, e.g., bincigar_len is too short or unrecognized operations.
4388 int OLD_cigar2bincigar(char *cigar, char *bincigar, int bincigar_len)
4389 {
4390 int xk1=0;
4391 unsigned int tmpv=0, bincigar_cursor=0;
4392 while(1)
4393 {
4394 int nch = cigar[xk1];
4395 if(!nch) break;
4396 xk1++;
4397
4398 if(isdigit(nch)) tmpv=tmpv*10+(nch-'0');
4399 else
4400 {
4401 int bincigar_sec_len = write_bincigar_part(bincigar+bincigar_cursor, nch, tmpv, bincigar_len-bincigar_cursor);
4402 if(bincigar_sec_len<0){
4403 bincigar[0]=0;
4404 return -1;
4405 }
4406 bincigar_cursor += bincigar_sec_len;
4407 tmpv=0;
4408 }
4409 }
4410
4411 if(bincigar_cursor<bincigar_len) bincigar[bincigar_cursor] = 0;
4412
4413 //printf("%s : BL=%d\n", cigar, bincigar_cursor);
4414
4415 return bincigar_cursor;
4416 }
4417
4418
4419 int write_cigar_part(char *bincigar, char *cigar, int cigar_len , int * bincigar_move)
4420 {
4421 int binbytes, x1, binopt, charopt;
4422 unsigned int tmpv = 0;
4423 char sec_buf[13];
4424
4425 binbytes = 7& (bincigar[0] >> 3);
4426 binopt = 7 & bincigar[0];
4427
4428 switch(binopt)
4429 {
4430 case CORE_CIGAR_OPT_D:
4431 charopt='D';
4432 break;
4433 case CORE_CIGAR_OPT_I:
4434 charopt='I';
4435 break;
4436 case CORE_CIGAR_OPT_M:
4437 charopt='M';
4438 break;
4439 case CORE_CIGAR_OPT_S:
4440 charopt='S';
4441 break;
4442 case CORE_CIGAR_OPT_B:
4443 charopt='B';
4444 break;
4445 case CORE_CIGAR_OPT_BB:
4446 charopt='b';
4447 break;
4448 case CORE_CIGAR_OPT_N:
4449 charopt='N';
4450 break;
4451 default:
4452 charopt='n';
4453 break;
4454 }
4455
4456 tmpv = (bincigar[0]>>6) & 3;
4457 for(x1 = 1; x1 < binbytes; x1++)
4458 {
4459 unsigned int dtmpv = 0xff & bincigar[x1];
4460 dtmpv <<= (x1*8 - 6);
4461 tmpv += dtmpv;
4462 }
4463
4464 int added_len = sprintf(sec_buf, "%u%c", tmpv, charopt);
4465 if(added_len > cigar_len)
4466 return -1;
4467 memcpy(cigar, sec_buf, added_len);
4468 (*bincigar_move) = binbytes;
4469
4470 return added_len;
4471 }
4472
4473 int OLD_bincigar2cigar(char * cigar, int cigar_len, char * bincigar, int bincigar_max_len, int read_len)
4474 {
4475
4476 int cigar_cursor = 0, bincigar_cursor = 0;
4477
4478 while(1)
4479 {
4480 int bincigar_move = 0;
4481 int cigar_sec_len = write_cigar_part(bincigar + bincigar_cursor, cigar+cigar_cursor, cigar_len-cigar_cursor-1, &bincigar_move);
4482 if(cigar_sec_len<0){
4483 sprintf(cigar,"%dM", read_len);
4484 return -1;
4485 }
4486 //printf("NPC=%s\n", cigar);
4487 cigar_cursor += cigar_sec_len;
4488 bincigar_cursor += bincigar_move;
4489 if(bincigar_cursor>=bincigar_max_len) break;
4490 if(bincigar[bincigar_cursor] == 0) break;
4491 }
4492 cigar[cigar_cursor] = 0;
4493
4494 return cigar_cursor;
4495 }
4496
4497 int term_strncpy(char * dst, char * src, int max_dst_mem)
4498 {
4499 int i;
4500
4501 for(i=0; i<max_dst_mem; i++)
4502 {
4503 if(!src[i]) break;
4504 dst[i]=src[i];
4505 if(i == max_dst_mem-1)
4506 SUBREADprintf("String out of memory limit: '%s'\n", src);
4507 }
4508 if(i >= max_dst_mem) i = max_dst_mem-1;
4509 dst[i] = 0;
4510
4511 return 0;
4512 }
4513 void absoffset_to_posstr(global_context_t * global_context, unsigned int pos, char * res){
4514 char * ch;
4515 int off;
4516
4517 locate_gene_position(pos, &global_context -> chromosome_table, & ch, &off);
4518
4519 sprintf(res, "%s:%u", ch, off);
4520 }
4521
4522 int chimeric_cigar_parts(global_context_t * global_context, unsigned int sel_pos, int is_first_section_negative_strand, int is_first_section_reversed, char * in_cigar, unsigned int * out_poses, char ** out_cigars, char * out_strands, int read_len, short * perfect_lens, char * read_name)
4523 {
4524 unsigned int current_perfect_map_start = sel_pos;
4525 int current_perfect_section_no = 0;
4526 unsigned int current_perfect_cursor = sel_pos;
4527 int is_reversed = is_first_section_reversed;
4528 int is_negative = is_first_section_negative_strand;
4529 int read_cursor = 0;
4530 int out_cigar_writer_ptr = 0;
4531 unsigned int tmpi = 0;
4532
4533 short perfect_len = 0;
4534
4535 int cigar_cursor;
4536
4537 out_poses[0] = current_perfect_map_start - (is_reversed?1:0);
4538 out_strands[0] = is_negative?'-':'+';
4539 char main_piece_strand = (is_first_section_negative_strand == is_first_section_reversed)?'+':'-';
4540
4541 for(cigar_cursor=0;;cigar_cursor++)
4542 {
4543 char ncch = in_cigar[cigar_cursor];
4544 int is_chimeric_section_end = 0;
4545
4546 if(!ncch){
4547 perfect_lens [current_perfect_section_no] = perfect_len ;
4548 current_perfect_section_no++;
4549 break;
4550 }
4551
4552 if(toupper(ncch)=='N'||toupper(ncch)=='B')
4553 {
4554
4555 unsigned int jummped_location;
4556 int is_chro_jump = 0, is_long_jump = 0;
4557
4558 if(is_reversed)
4559 {
4560 if(toupper(ncch)=='N')
4561 jummped_location = current_perfect_map_start - 1 + tmpi;
4562 else
4563 jummped_location = current_perfect_map_start - 1 - tmpi;
4564 }
4565 else
4566 {
4567 if(toupper(ncch)=='N')
4568 jummped_location = current_perfect_cursor + tmpi;
4569 else
4570 jummped_location = current_perfect_cursor - tmpi;
4571
4572 }
4573
4574 if(ncch == 'N')
4575 {
4576 char * curr_chr, * new_chr;
4577 int curr_offset, new_offset;
4578 locate_gene_position_max(current_perfect_cursor, &global_context -> chromosome_table, & curr_chr, & curr_offset, NULL, NULL, 1);
4579 locate_gene_position_max(jummped_location , &global_context -> chromosome_table, & new_chr, & new_offset, NULL, NULL, 1);
4580 if( curr_chr == NULL || new_chr == NULL ){
4581 /*
4582 char outpos[100];
4583 absoffset_to_posstr(global_context, sel_pos + 1, outpos);
4584 SUBREADprintf("Wrong CIGAR: mapped to %s, CIGAR=%s\n", outpos , in_cigar);
4585 */
4586 return -1;
4587 }
4588 assert(curr_chr);
4589 assert(new_chr);
4590 is_chro_jump = (curr_chr != new_chr);
4591
4592 long long int dist = (long long int)current_perfect_cursor;
4593 dist -= (long long int)jummped_location;
4594 if(abs(dist) >= global_context -> config.maximum_intron_length)
4595 is_long_jump = 1;
4596
4597 //#warning ">>>>>> COMMENT WHEN RELEASE <<<<<<"
4598 if(0 && FIXLENstrcmp("R000000007", read_name)==0)
4599 SUBREADprintf("dist=%lld, abs=%lld, %u, %u\n",dist,abs(dist), current_perfect_cursor, jummped_location);
4600 // A long jump is the jump longer than 2^27.
4601 // Picard does not like it!!
4602 }
4603
4604 // is_long_jump is true only if the two sections are on different chromosomes.
4605 //#warning ">>>>>> COMMENT WHEN RELEASE <<<<<<"
4606 if(0 && FIXLENstrcmp("R000000007", read_name)==0)
4607 SUBREADprintf("CHR_JMP=%d, NCCH=%c, LONG_JMP=%d\n", is_chro_jump, ncch, is_long_jump);
4608 if(is_chro_jump || islower(ncch) || ncch == 'B' || is_long_jump)
4609 {
4610 current_perfect_cursor = jummped_location;
4611
4612 if(islower(ncch)){
4613 is_reversed = !is_reversed;
4614 is_negative = !is_negative;
4615
4616 if(is_reversed)
4617 current_perfect_cursor --;
4618 }
4619
4620 current_perfect_map_start = current_perfect_cursor;
4621 tmpi = 0;
4622 if(read_cursor<read_len)
4623 sprintf(out_cigars[current_perfect_section_no] + out_cigar_writer_ptr,"%dS", read_len - read_cursor);
4624
4625 perfect_lens [current_perfect_section_no] = perfect_len ;
4626 perfect_len = 0;
4627
4628 current_perfect_section_no++;
4629 if(current_perfect_section_no>=CIGAR_PERFECT_SECTIONS)break;
4630
4631 out_poses[current_perfect_section_no] = current_perfect_map_start - read_cursor;
4632 out_strands[current_perfect_section_no] = is_negative?'-':'+';
4633 out_cigar_writer_ptr = sprintf(out_cigars[current_perfect_section_no],"%dS", read_cursor);
4634 is_chimeric_section_end = 1;
4635 }
4636 }
4637
4638 if(!is_chimeric_section_end)
4639 {
4640 if(isalpha(ncch))
4641 {
4642 out_cigar_writer_ptr+=sprintf(out_cigars[current_perfect_section_no]+out_cigar_writer_ptr, "%u%c", tmpi, ncch);
4643 }
4644 if(ncch == 'M'|| ncch == 'S')
4645 {
4646 read_cursor += tmpi;
4647 if(ncch == 'M')
4648 perfect_len += tmpi;
4649 if(is_reversed)
4650 out_poses[current_perfect_section_no] += tmpi;
4651 else
4652 current_perfect_cursor += tmpi;
4653 tmpi = 0;
4654 }
4655 else if(ncch == 'D' || ncch == 'N')
4656 {
4657 if(is_reversed)
4658 out_poses[current_perfect_section_no] += tmpi;
4659 else
4660 current_perfect_cursor += tmpi;
4661 tmpi = 0;
4662 }
4663 else if(ncch == 'I')
4664 {
4665 read_cursor += tmpi;
4666 tmpi = 0;
4667 }
4668 else if(isdigit(ncch))
4669 tmpi = tmpi*10+(ncch-'0');
4670 }
4671 }
4672
4673 int xk1 = 0, best_match = -9999;
4674
4675 for(xk1=0; xk1<current_perfect_section_no;xk1++)
4676 {
4677 if(best_match < 0 || (main_piece_strand == out_strands[xk1] && perfect_lens[xk1]>perfect_lens[best_match]))
4678 best_match = xk1;
4679 }
4680
4681 if(best_match>0)
4682 {
4683 unsigned int tmpv;
4684 char cigar_sec[100];
4685 tmpv = out_poses[0];
4686 out_poses[0]=out_poses[best_match];
4687 out_poses[best_match] = tmpv;
4688
4689 tmpv = out_strands[0];
4690 out_strands[0] = out_strands[best_match];
4691 out_strands[best_match] = tmpv;
4692
4693 strcpy(cigar_sec, out_cigars[0]);
4694 strcpy(out_cigars[0], out_cigars[best_match]);
4695 strcpy(out_cigars[best_match] , cigar_sec);
4696
4697 tmpv = perfect_lens[0];
4698 perfect_lens[0] = perfect_lens[best_match];
4699 perfect_lens[best_match] = tmpv;
4700 }
4701
4702 return current_perfect_section_no;
4703 }
4704
4705 void quick_sort_run(void * arr, int spot_low,int spot_high, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r));
4706
4707 void quick_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r))
4708 {
4709 quick_sort_run(arr, 0, arr_size-1, compare, exchange);
4710 }
4711
4712
4713 void quick_sort_run(void * arr, int spot_low,int spot_high, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r))
4714 {
4715 // https://en.wikipedia.org/wiki/Quicksort
4716 // Lomuto partition scheme
4717
4718 int pivot,j,i;
4719
4720 if(spot_high <= spot_low) return;
4721 pivot = spot_high;
4722 i = spot_low;
4723
4724 for(j = spot_low+1; j < spot_high; j++)
4725 if(compare(arr, j, pivot)<=0) {
4726 exchange(arr,i,j);
4727 i++;
4728 }
4729
4730 exchange(arr, i, spot_high);
4731
4732 quick_sort_run(arr, spot_low, i-1, compare, exchange);
4733 quick_sort_run(arr, i+1, spot_high, compare, exchange);
4734
4735 }
4736 void basic_sort_run(void * arr, int start, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r)){
4737 int i, j;
4738 for(i=start; i< start + items - 1; i++)
4739 {
4740 int min_j = i;
4741 for(j=i + 1; j< start + items; j++)
4742 {
4743 if(compare(arr, min_j, j) > 0)
4744 min_j = j;
4745 }
4746 if(i!=min_j)
4747 exchange(arr, i, min_j);
4748 }
4749 }
4750
4751 void basic_sort(void * arr, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r)){
4752 basic_sort_run(arr, 0, items, compare, exchange);
4753 }
4754
4755
4756 void merge_sort_run(void * arr, int start, int items, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2))
4757 {
4758 if(items > 11)
4759 {
4760 int half_point = items/2;
4761 merge_sort_run(arr, start, half_point, compare, exchange, merge);
4762 merge_sort_run(arr, start + half_point, items - half_point, compare, exchange, merge);
4763 merge(arr, start, half_point, items - half_point);
4764 }
4765 else
4766 {
4767 basic_sort_run(arr, start, items, compare, exchange);
4768 }
4769 }
4770 void merge_sort(void * arr, int arr_size, int compare (void * arr, int l, int r), void exchange(void * arr, int l, int r), void merge(void * arr, int start, int items, int items2))
4771 {
4772 merge_sort_run(arr, 0, arr_size, compare, exchange, merge);
4773 }
4774
4775 unsigned int calc_end_pos(unsigned int p, char * cigar, unsigned int * all_skipped_len, int * is_exonic_regions, global_context_t * global_context){
4776 unsigned int cursor = p, tmpi=0;
4777 int nch, cigar_cursor;
4778 for(cigar_cursor = 0; 0!=(nch = cigar[cigar_cursor]); cigar_cursor++){
4779 if(isdigit(nch)){
4780 tmpi = tmpi * 10 + (nch - '0');
4781 }else{
4782 if((nch == 'S' && cursor == p) || nch == 'M' || nch == 'N' || nch == 'D'){
4783 if(nch == 'M' && global_context -> exonic_region_bitmap){
4784 if(global_context -> config.do_breakpoint_detection){
4785 if(is_pos_in_annotated_exon_regions(global_context, cursor) == 0 || is_pos_in_annotated_exon_regions(global_context, cursor + tmpi - 1) == 0) ( * is_exonic_regions) = 0;
4786 } else {
4787 if(is_pos_in_annotated_exon_regions(global_context, cursor + tmpi/2) == 0) ( * is_exonic_regions) = 0;
4788 }
4789 }
4790
4791 cursor += tmpi;
4792 if(nch == 'N' || nch == 'D')(*all_skipped_len) += tmpi;
4793 }
4794 tmpi = 0;
4795 }
4796 }
4797 return cursor;
4798
4799 }
4800
4801 void test_PE_and_same_chro_cigars(global_context_t * global_context , unsigned int pos1, unsigned int pos2, int * is_exonic_regions, int * is_PE_distance, int * is_same_chromosome, int read_len_1, int read_len_2, char * cigar1, char * cigar2, char *read_name, int * res_tlen){
4802 char * r1_chr = NULL, * r2_chr = NULL;
4803 int r1_pos, r2_pos;
4804
4805 (*is_same_chromosome) = 0;
4806 (*is_PE_distance) = 0;
4807 (*is_exonic_regions) = 1;
4808
4809 locate_gene_position(pos1, &global_context -> chromosome_table, & r1_chr, & r1_pos);
4810 locate_gene_position(pos2, &global_context -> chromosome_table, & r2_chr, & r2_pos);
4811
4812 if(r1_chr == r2_chr){
4813 unsigned int skip_1 = 0;
4814 unsigned int skip_2 = 0;
4815 unsigned int r1_end_pos = calc_end_pos(pos1, cigar1, &skip_1, is_exonic_regions, global_context );
4816 unsigned int r2_end_pos = calc_end_pos(pos2, cigar2, &skip_2, is_exonic_regions, global_context );
4817
4818 unsigned int tlen = max(r1_end_pos, r2_end_pos) - min(pos1, pos2);
4819 if(tlen > skip_1) tlen -= skip_1;
4820 if(tlen > skip_2) tlen -= skip_2;
4821
4822 (*is_same_chromosome) = 1;
4823
4824 if(tlen >= global_context -> config.minimum_pair_distance && tlen <= global_context -> config.maximum_pair_distance)
4825 (* is_PE_distance) = 1;
4826 *res_tlen = tlen;
4827 }else *res_tlen = 0x7fffffff;
4828 }
4829
4830 void test_PE_and_same_chro_align(global_context_t * global_context , realignment_result_t * res1, realignment_result_t * res2, int * is_exonic_regions, int * is_PE_distance, int * is_same_chromosome, int read_len_1, int read_len_2, char * read_name, int *res_tlen){
4831 return test_PE_and_same_chro_cigars(global_context, res1 -> first_base_position, res2 -> first_base_position, is_exonic_regions, is_PE_distance, is_same_chromosome , read_len_1 , read_len_2, res1 -> cigar_string, res2 -> cigar_string, read_name, res_tlen);
4832 }
4833
4834
4835
4836 void test_PE_and_same_chro(global_context_t * global_context , unsigned int pos1, unsigned int pos2, int * is_PE_distance, int * is_same_chromosome, int read_len_1, int read_len_2)
4837 {
4838 char * r1_chr, * r2_chr;
4839 int r1_pos, r2_pos;
4840
4841 (*is_same_chromosome) = 0;
4842 (*is_PE_distance) = 0;
4843
4844 int re1 = locate_gene_position(pos1, &global_context -> chromosome_table, & r1_chr, & r1_pos);
4845 int re2 = locate_gene_position(pos2, &global_context -> chromosome_table, & r2_chr, & r2_pos);
4846
4847 if(re1 ==0 && 0 ==re2) {
4848 long long tlen = r1_pos;
4849 tlen -= r2_pos;
4850 tlen = abs(tlen);
4851 tlen += (r1_pos > r2_pos)?read_len_1:read_len_2;
4852 unsigned int tlenI = (unsigned int) tlen;
4853
4854 //SUBREADprintf("TEST PE: %p == %p , TLEN=%u\n", r1_chr, r2_chr, tlenI);
4855
4856 if(r1_chr == r2_chr){
4857 (*is_same_chromosome) = 1;
4858 if(tlenI >= global_context -> config.minimum_pair_distance && tlenI <= global_context -> config.maximum_pair_distance)
4859 (* is_PE_distance) = 1;
4860 }
4861 }
4862 }
4863
4864 int FIXLENstrcmp(char * flen, char * rname){
4865 char * tt=NULL;
4866 char * fixed_len = malloc(strlen(flen)+1);
4867 strcpy(fixed_len, flen);
4868 char * s1 = strtok_r(fixed_len, "\n", &tt);
4869
4870 int found = 0;
4871 while(1){
4872 int this_ok = 1,x=0;
4873 for(; s1[x]; x++){
4874 if(rname[x]!=s1[x]) {
4875 this_ok = 0;
4876 break;
4877 }
4878 }
4879 if(this_ok){
4880 found = 1;
4881 break;
4882 }
4883 s1 = strtok_r(NULL, "\n", &tt);
4884 if(!s1)break;
4885 }
4886 free(fixed_len);
4887 return !found;
4888 }
4889