/***************************************************************

   The Subread and Rsubread software packages are free
   software packages:

   you can redistribute it and/or modify it under the terms
   of the GNU General Public License as published by the
   Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   Subread is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

   See the GNU General Public License for more details.

   Authors: Drs Yang Liao and Wei Shi

  ***************************************************************/
20
21
22 #define _GNU_SOURCE
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdarg.h>
26 #include <assert.h>
27 #include <string.h>
28 #include <unistd.h>
29 #include <ctype.h>
30 #include <time.h>
31
32
33 #ifndef MAKE_STANDALONE
34 #include <R.h>
35 #endif
36
37 #include <zlib.h>
38 #include <math.h>
39 #include <pthread.h>
40 #include <getopt.h>
41 #include "subread.h"
42 #include "interval_merge.h"
43 #include "core.h"
44 #include "gene-algorithms.h"
45 #include "sambam-file.h"
46 #include "input-files.h"
47 #include "input-blc.h"
48 #include "hashtable.h"
49 #include "seek-zlib.h"
50 #include "HelperFunctions.h"
51
/********************************************************************/
/********************************************************************/
/********************************************************************/
//  NEW FUNCTION FOR MULTI-THREADING
/********************************************************************/
/********************************************************************/
/********************************************************************/
/* Size limits and separators used by the featureCounts code in this file. */

/* Longest chromosome name (excluding the terminating NUL) that fits in the
 * name buffers of fc_junction_info_t. */
#define CHROMOSOME_NAME_LENGTH 256
#define MAX_FC_READ_LENGTH 10001
#define MAX_HIT_NUMBER (1000*1000*1000)
#define MAX_EXTRA_COLS 15
#define MAX_UMI_LEN 14
/* One-character separator (octal 026) between the file names that are joined
 * into a single list string; print_FC_configuration() splits the list on it. */
#define FC_FLIST_SPLITOR "\026"
65
/* A named gene together with the first and last base it covers.
 * Instances are released with junc_gene_free(), which frees gene_name and
 * then the structure itself. */
typedef struct{
	char * gene_name;		/* heap-allocated; owned by this structure */
	unsigned int pos_first_base;
	unsigned int pos_last_base;
} fc_junction_gene_t;
71
72
/* Number of insertion slots stored per CIGAR_interval_t. */
#define MAXIMUM_INSERTION_IN_SECTION 8

/* One chromosomal interval of an alignment plus the insertions recorded for
 * it. `insertions` counts the used slots of the two parallel arrays.
 * NOTE(review): the exact coordinate convention (0- or 1-based) of the
 * position fields is not visible here -- confirm against the code that fills
 * this structure. */
typedef struct {
	char * chro;
	unsigned int start_pos;
	unsigned int chromosomal_length;
	short insertions;
	unsigned int insertion_start_pos[ MAXIMUM_INSERTION_IN_SECTION ];
	unsigned short insertion_lengths[ MAXIMUM_INSERTION_IN_SECTION ];
} CIGAR_interval_t;
83
84
85
86 typedef struct {
87 int space;
88 int used;
89 fc_junction_gene_t ** genes;
90 } gene_info_list_t;
91
92 typedef struct {
93 char chromosome_name_left[CHROMOSOME_NAME_LENGTH + 1];
94 char chromosome_name_right[CHROMOSOME_NAME_LENGTH + 1];
95 unsigned int last_exon_base_left;
96 unsigned int first_exon_base_right;
97 } fc_junction_info_t;
98
99 typedef struct {
100 srInt_64 feature_name_pos;
101 unsigned int start;
102 unsigned int end;
103 unsigned int sorted_order;
104
105 unsigned short chro_name_pos_delta;
106 char is_negative_strand;
107 char * extra_columns;
108 } fc_feature_info_t;
109
110 typedef struct {
111 srInt_64 assigned_reads;
112
113 srInt_64 unassigned_unmapped;
114 srInt_64 unassigned_read_type;
115 srInt_64 unassigned_singleton;
116 srInt_64 unassigned_mappingquality;
117 srInt_64 unassigned_chimericreads;
118 srInt_64 unassigned_fragmentlength;
119 srInt_64 unassigned_duplicate;
120 srInt_64 unassigned_multimapping;
121 srInt_64 unassigned_secondary;
122 srInt_64 unassigned_junction_condition;
123 srInt_64 unassigned_nofeatures;
124 srInt_64 unassigned_overlapping_length;
125 srInt_64 unassigned_ambiguous;
126 } fc_read_counters;
127
128 typedef srInt_64 read_count_type_t;
129
130 typedef struct {
131 unsigned short thread_id;
132 srInt_64 nreads_mapped_to_exon;
133 srInt_64 all_reads;
134 //unsigned short current_read_length1;
135 //unsigned short current_read_length2;
136 unsigned int count_table_size;
137 read_count_type_t * count_table;
138 unsigned int chunk_read_ptr;
139 pthread_t thread_object;
140
141 int hits_number_capacity;
142 unsigned int * hits_start_pos1;
143 unsigned int * hits_start_pos2;
144
145 unsigned short * hits_length1;
146 unsigned short * hits_length2;
147
148 char ** hits_chro1;
149 char ** hits_chro2;
150
151 srInt_64 * hits_indices1;
152 srInt_64 * hits_indices2;
153
154 unsigned int proc_Starting_Chro_Points_1BASE[65536];
155 unsigned short proc_Starting_Read_Points[65536];
156 unsigned short proc_Section_Read_Lengths[65536];
157 char * proc_ChroNames[65536];
158 char proc_Event_After_Section[65536];
159 CIGAR_interval_t proc_CIGAR_intervals_R1[65536], proc_CIGAR_intervals_R2[65536];
160
161 char ** scoring_buff_gap_chros;
162 unsigned int * scoring_buff_gap_starts;
163 unsigned short * scoring_buff_gap_lengths;
164 char * read_details_buff;
165 char * bam_compressed_buff;
166 int read_details_buff_used;
167
168 unsigned int * scoring_buff_numbers;
169 unsigned int * scoring_buff_flags;
170 unsigned int * scoring_buff_overlappings;
171 srInt_64 * scoring_buff_exon_ids;
172 srInt_64 del4_added_reads;
173
174 char * chro_name_buff;
175 z_stream bam_file_output_stream;
176
177 HashTable ** scRNA_sample_bc_tables; // sample_ID ==> int64s: cell barcode id <<32 | umi barcode id
178 HashTable * scRNA_registered_UMI_table; // UMI bases ==> UMI_no +1 in this thread
179 HashTable * junction_counting_table; // key: string chro_name \t last_base_previous_exont \t first_base_next_exon
180 HashTable * splicing_point_table;
181 HashTable * RG_table; // rg_name -> [ count_table, sum_fc_read_counters, junction_counting_table, splicing_point_table]
182 // NOTE: some reads have no RG tag. These reads are put into the tables in this object but not in the RG_table -> tables.
183 srInt_64 scRNA_pooled_reads;
184 srInt_64 *scRNA_reads_per_sample;
185 srInt_64 *scRNA_mapped_reads_per_sample;
186 srInt_64 *scRNA_assigned_reads_per_sample;
187 srInt_64 scRNA_has_valid_sample_index;
188 srInt_64 scRNA_has_valid_cell_barcode;
189 fc_read_counters read_counters;
190 } fc_thread_thread_context_t;
191
/* Values of fc_thread_global_context_t.read_shift_type. */
#define READ_SHIFT_UPSTREAM 10
#define READ_SHIFT_DOWNSTREAM 20
#define READ_SHIFT_LEFT 30
#define READ_SHIFT_RIGHT 40
#define REVERSE_TABLE_BUCKET_LENGTH 131072
/* Values of fc_thread_global_context_t.reduce_5_3_ends_to_one
 * (0 there means that no reduction is applied). */
#define REDUCE_TO_5_PRIME_END 5
#define REDUCE_TO_3_PRIME_END 3
199
/* Per-chromosome index record: where the chromosome's features and blocks
 * sit in the global tables, plus a reverse lookup table.
 * NOTE(review): the reverse table is presumably bucketed by
 * REVERSE_TABLE_BUCKET_LENGTH bases -- confirm against the code that
 * builds it. */
typedef struct {
	unsigned int chro_number;
	unsigned int chro_features;
	unsigned int chro_feature_table_start;
	unsigned int chro_block_table_start;
	unsigned int chro_block_table_end;
	unsigned int chro_possible_length;

	unsigned short chro_reverse_table_current_size;
	unsigned int * reverse_table_start_index;
	int reverse_table_start_index_size;
} fc_chromosome_index_info;
213
214 typedef struct {
215 int is_gene_level;
216 int is_paired_end_mode_assign;
217 int is_paired_end_reads_expected;
218 int is_multi_overlap_allowed;
219 int restricted_no_multi_overlap;
220 char * strand_check_mode;
221 int is_strand_checked;
222 int is_both_end_required;
223 int is_chimertc_disallowed;
224 int is_PE_distance_checked;
225 int is_multi_mapping_allowed;
226 int is_primary_alignment_only;
227 int is_SAM_file;
228 int is_read_details_out;
229 int is_junction_no_chro_shown;
230 int is_unpaired_warning_shown;
231 int is_stake_warning_shown;
232 int is_read_too_long_to_SAM_BAM_shown;
233 int is_split_or_exonic_only;
234 int is_duplicate_ignored;
235 int is_first_read_reversed;
236 int is_second_read_straight;
237 int is_verbose;
238 int long_read_minimum_length;
239 int assign_reads_to_RG;
240 int use_stdin_file;
241 int is_mixed_PE_SE;
242 int disk_is_full;
243 int do_not_sort;
244 int reduce_5_3_ends_to_one;
245 int isCVersion;
246 int use_fraction_multi_mapping;
247 int do_junction_counting;
248 int do_detection_call;
249 int this_input_number;
250
251 int need_calculate_overlap_len;
252 int need_calculate_fragment_len;
253
254 int min_mapping_quality_score;
255 int min_paired_end_distance;
256 int max_paired_end_distance;
257 int max_M;
258 int feature_block_size;
259 int read_length;
260 int line_length;
261 int longest_chro_name;
262 int five_end_extension;
263 int three_end_extension;
264 int read_shift_type;
265 int read_shift_size;
266 int fragment_minimum_overlapping;
267 int do_scRNA_table;
268 float fractional_minimum_overlapping;
269 float fractional_minimum_feature_overlapping;
270 int use_overlapping_break_tie;
271 int max_missing_bases_in_read, max_missing_bases_in_feature;
272
273 srInt_64 all_reads;
274
275 unsigned short thread_number;
276 fc_thread_thread_context_t * thread_contexts;
277 int sambam_chro_table_items;
278 int is_input_bad_format;
279 int any_reads_are_PE;
280 SamBam_Reference_Info * sambam_chro_table;
281 pthread_spinlock_t read_details_out_lock;
282
283 SAM_pairer_context_t read_pairer;
284 SAM_pairer_context_t scRNA_read_pairer;
285
286 char * debug_command;
287 char * unistr_buffer_space;
288 srInt_64 max_BAM_header_size;
289 srInt_64 unistr_buffer_size;
290 srInt_64 unistr_buffer_used;
291 int is_scRNA_BAM_FQ_out_generated;
292 int scRNA_input_mode;
293 float scRNA_umi_cutoff;
294 int scRNA_rerun_on_persample_BAM;
295 int * scRNA_applied_umi_cut;
296 HashTable * scRNA_sample_sheet_table;
297 ArrayList * scRNA_sample_barcode_list;
298 ArrayList * scRNA_cell_barcodes_array;
299 HashTable * scRNA_cell_barcode_head_tail_table;
300 HashTable * scRNA_lineno1B_to_sampleno1B_tab;
301 ArrayList * scRNA_sample_id_to_name;
302 HashTable * scRNA_sample_BAM_writers; // sample_ID(1-base) ==> SamBam_writer
303 HashTable * lineno_2_sortedno_tab;
304 int known_cell_barcode_length;
305 HashTable * junction_features_table;
306 HashTable * junction_bucket_table;
307 fasta_contigs_t * fasta_contigs;
308 HashTable * gene_name_table; // gene_name -> gene_number
309 HashTable * BAM_chros_to_anno_table; // name in annotation file -> alias name
310 HashTable * GCcontent_table; // gene_name -> "qc_content_frac"
311 int scRNA_do_one_batch_runner_current;
312 pthread_spinlock_t scRNA_do_one_batch_runner_lock;
313 FILE ** scRNA_barcode_batched_bins;
314 pthread_spinlock_t * scRNA_barcode_batched_locks;
315 int scRNA_barcode_batched_bin_no;
316 int scRNA_barcode_batched_max_Rbin_len;
317 int scRNA_barcode_batched_max_genes;
318 int scRNA_UMI_length;
319
320
321 char * RGnames_set;
322 int RGnames_capacity;
323 int RGnames_ptr;
324
325 char alias_file_name[MAX_FILE_NAME_LENGTH];
326 char input_file_name[MAX_FILE_NAME_LENGTH];
327 char * input_file_short_name;
328 char raw_input_file_name[MAX_FILE_NAME_LENGTH];
329 char output_file_name[MAX_FILE_NAME_LENGTH];
330 char output_file_path[MAX_FILE_NAME_LENGTH];
331 char temp_file_dir[MAX_FILE_NAME_LENGTH];
332 char read_details_path[MAX_FILE_NAME_LENGTH];
333 char annotation_file_screen_output[MAX_FILE_NAME_LENGTH];
334 char scRNA_sample_sheet[MAX_FILE_NAME_LENGTH];
335 char scRNA_cell_barcode_list[MAX_FILE_NAME_LENGTH];
336 unsigned char ** gene_name_array; // gene_internal_number -> gene_name
337 int input_file_unique;
338
339 char * reported_extra_columns;
340 HashTable * exontable_chro_table; // gene_name -> fc_chromosome_index_info structure (contains chro_number, feature_number, block_start, block_end, etc)
341 int exontable_nchrs;
342 int exontable_exons;
343 int * exontable_geneid;
344 char * exontable_strand;
345 char ** exontable_chr;
346 srInt_64 * exontable_start;
347 srInt_64 * exontable_stop;
348 char feature_name_column[2000];
349 char gene_id_column[100];
350
351 srInt_64 * exontable_block_end_index;
352 srInt_64 * exontable_block_max_end;
353 srInt_64 * exontable_block_min_start;
354
355 char ** exontable_anno_chrs;
356 char * exontable_anno_chr_2ch;
357 srInt_64 * exontable_anno_chr_heads;
358
359 FILE * read_details_out_FP;
360 double start_time;
361
362 char * cmd_rebuilt;
363 char redo;
364
365 fc_read_counters read_counters;
366
367 } fc_thread_global_context_t;
368
/* File-level tick interval, initialised to 1000.
 * NOTE(review): its unit and its users are outside this section. */
unsigned int tick_time = 1000;
370
/*
 * Scan one CIGAR string that starts at chromosomal position `pos`.
 *
 * Every 'N' operation is reported as a junction in result_junctions[] (both
 * sides on `chroname`); at most `junction_space` junctions are stored and the
 * number stored is returned. Soft clips ('S') are reported as boundaries:
 *   - a clip seen before any 'M'/'D'/'N' has advanced the cursor is the left
 *     clip: *has_left = 1, *left_on_read = clip length, *left_pos = pos;
 *   - any later clip is the right clip: *has_right = 1,
 *     *right_on_read = read_len - clip_length - 1 and *right_pos = last base
 *     covered by an 'M' or 'D' operation.
 * *has_left and *has_right are always written; the other output parameters
 * are written only when the matching flag is set.
 *
 * `strand` is accepted for interface compatibility and is not used.
 * Chromosome names longer than the destination buffers are truncated
 * instead of overflowing them.
 */
int fetch_boundaries(char * chroname,char * cigar, unsigned int pos, char strand, int *has_left, unsigned short *left_on_read, unsigned int *left_pos, int *has_right, unsigned short *right_on_read, unsigned int *right_pos, fc_junction_info_t * result_junctions, int junction_space){
	int cigar_cursor, read_len = 0, ret = 0;
	unsigned int chro_cursor = pos, tmpi = 0;
	unsigned int right_boundary = 0;
	unsigned short left_clipped = 0;
	unsigned short right_clipped = 0;

	(void) strand;
	*has_right = 0;
	*has_left = 0;

	for(cigar_cursor = 0; cigar[cigar_cursor] != 0; cigar_cursor++){
		/* <ctype.h> functions require a value representable as unsigned char. */
		unsigned char nch = (unsigned char) cigar[cigar_cursor];

		if(isdigit(nch)){
			tmpi = tmpi*10 + (nch - '0');
			continue;
		}

		switch(nch){
			case 'S':
				/* The cursor has not moved yet => this is the leading clip. */
				if(chro_cursor == pos) left_clipped = tmpi;
				else right_clipped = tmpi;
				read_len += tmpi;
				break;

			case 'M':
				read_len += tmpi;
				/* fall through: 'M' also advances on the chromosome */
			case 'D':
				chro_cursor += tmpi;
				right_boundary = chro_cursor - 1;
				break;

			case 'N':
				if(ret < junction_space){
					fc_junction_info_t * junc = result_junctions + ret;
					junc -> last_exon_base_left = chro_cursor - 1;
					junc -> first_exon_base_right = chro_cursor + tmpi;
					snprintf(junc -> chromosome_name_left, sizeof junc -> chromosome_name_left, "%s", chroname);
					snprintf(junc -> chromosome_name_right, sizeof junc -> chromosome_name_right, "%s", chroname);
					ret ++;
				}
				chro_cursor += tmpi;
				break;

			case 'I':
				read_len += tmpi;
				break;

			default:
				/* other operations neither move the cursor nor add read bases */
				break;
		}
		tmpi = 0;
	}

	if(left_clipped){
		*has_left = 1;
		*left_on_read = left_clipped;
		*left_pos = pos;
	}
	if(right_clipped){
		*has_right = 1;
		*right_on_read = read_len - right_clipped - 1;
		*right_pos = right_boundary;
	}
	return ret;
}
424
425 // This function parses the cigar string and returns the number of exon-exon junctions found in the cigar.
426 // It returns 0 if no junctions are found.
calc_junctions_from_cigar(fc_thread_global_context_t * global_context,int flag,char * chroname,unsigned int pos,char * cigar,char * extra_tags,fc_junction_info_t * result_junctions)427 int calc_junctions_from_cigar(fc_thread_global_context_t * global_context, int flag, char * chroname, unsigned int pos, char * cigar , char * extra_tags, fc_junction_info_t * result_junctions){
428 unsigned short boundaries_inclusive_base_on_read[global_context -> max_M];
429 unsigned int boundaries_inclusive_base_pos[global_context -> max_M];
430 char boundaries_chromosomes[global_context -> max_M][MAX_CHROMOSOME_NAME_LEN];
431 char boundaries_extend_to_left_on_read[global_context -> max_M];
432 int boundaries = 0;
433
434 int cigar_cursor = 0, nch, ret = 0, read_len = 0, x1, x2;
435 unsigned int chro_cursor = pos, tmpi = 0;
436 unsigned short left_clipped = 0;
437 unsigned short right_clipped = 0;
438
439 for(; (nch = cigar[cigar_cursor])!=0 ; cigar_cursor++){
440 if(isdigit(nch)){
441 tmpi = tmpi*10 + (nch - '0');
442 } else {
443 if (nch == 'S'){
444 if(chro_cursor == pos) left_clipped = tmpi;else right_clipped=tmpi;
445 read_len += tmpi;
446 } else if(nch == 'M' || nch == 'D'){
447 if(nch == 'M')read_len += tmpi;
448
449 chro_cursor += tmpi;
450 } else if(nch == 'N'){
451 unsigned int last_exon_last_base = chro_cursor - 1;
452 unsigned int next_exon_first_base = chro_cursor + tmpi;
453 if(ret <= global_context -> max_M - 1){
454 result_junctions[ret].last_exon_base_left = last_exon_last_base;
455 result_junctions[ret].first_exon_base_right = next_exon_first_base;
456 strcpy(result_junctions[ret].chromosome_name_left, chroname);
457 strcpy(result_junctions[ret].chromosome_name_right, chroname);
458
459 ret ++;
460 }
461 chro_cursor += tmpi;
462 } else if(nch == 'I') read_len += tmpi;
463 tmpi = 0;
464 }
465 }
466 if(left_clipped){
467 strcpy(boundaries_chromosomes[boundaries] , chroname);
468 boundaries_extend_to_left_on_read[boundaries] = 0;
469 boundaries_inclusive_base_pos[boundaries] = pos;
470 boundaries_inclusive_base_on_read[boundaries++] = left_clipped;
471 }
472 if(right_clipped){
473 strcpy(boundaries_chromosomes[boundaries] , chroname);
474 boundaries_extend_to_left_on_read[boundaries] = 1;
475 boundaries_inclusive_base_pos[boundaries] = chro_cursor - 1;
476 boundaries_inclusive_base_on_read[boundaries++] = read_len - right_clipped - 1;
477 }
478
479 int tag_cursor=0;
480
481 //if(strstr(extra_tags, "CG:Z")) {
482 // SUBREADprintf("CIGAR=%s, EXTRA=%s\n", cigar, extra_tags);
483 //}
484 int status = PARSE_STATUS_TAGNAME;
485 char tag_name[2], typechar=0;
486 int tag_inner_cursor=0;
487
488 char read_main_strand = (((flag & 0x10) == 0x10) == ((flag & 0x40)==0x40))?'-':'+';
489 char current_fusion_char[MAX_CHROMOSOME_NAME_LEN];
490 unsigned int current_fusion_pos = 0;
491 char current_fusion_strand = 0;
492 char current_fusion_cigar[global_context -> max_M * 15];
493 current_fusion_cigar [0] =0;
494 current_fusion_char [0]=0;
495
496 while(1){
497 int nch = extra_tags[tag_cursor];
498 if(status == PARSE_STATUS_TAGNAME){
499 tag_name[tag_inner_cursor++] = nch;
500 if(tag_inner_cursor == 2){
501 status = PARSE_STATUS_TAGTYPE;
502 tag_cursor += 1;
503 assert(extra_tags[tag_cursor] == ':');
504 }
505 }else if(status == PARSE_STATUS_TAGTYPE){
506 typechar = nch;
507 tag_cursor +=1;
508 assert(extra_tags[tag_cursor] == ':');
509 tag_inner_cursor = 0;
510 status = PARSE_STATUS_TAGVALUE;
511 }else if(status == PARSE_STATUS_TAGVALUE){
512 if(nch == '\t' || nch == 0){
513 if(current_fusion_cigar[0] && current_fusion_char[0] && current_fusion_pos && current_fusion_strand){
514
515 unsigned int left_pos = 0, right_pos = 0;
516 unsigned short left_on_read = 0, right_on_read = 0;
517 int has_left = 0, has_right = 0;
518
519 unsigned int start_pos = current_fusion_pos;
520 if(current_fusion_strand!=read_main_strand)
521 start_pos = find_left_end_cigar(current_fusion_pos, current_fusion_cigar);
522
523 ret += fetch_boundaries(current_fusion_char, current_fusion_cigar, start_pos, current_fusion_strand, &has_left, &left_on_read, &left_pos, &has_right, &right_on_read, &right_pos, result_junctions + ret, global_context -> max_M - ret );
524
525 if(has_left){
526 strcpy(boundaries_chromosomes[boundaries] , current_fusion_char);
527 boundaries_extend_to_left_on_read[boundaries] = 0;
528 boundaries_inclusive_base_pos[boundaries] = left_pos;
529 boundaries_inclusive_base_on_read[boundaries++] = left_on_read;
530 }
531 if(has_right){
532 strcpy(boundaries_chromosomes[boundaries] , current_fusion_char);
533 boundaries_extend_to_left_on_read[boundaries] = 1;
534 boundaries_inclusive_base_pos[boundaries] = right_pos;
535 boundaries_inclusive_base_on_read[boundaries++] = right_on_read;
536 }
537
538
539 // SUBREADprintf("BOUND_EXT: %s:%u (at %u) (%c) ~ %s:%u (at %u) (%c)\n", current_fusion_char, left_pos, left_on_read, has_left?'Y':'X' , current_fusion_char, right_pos, right_on_read, has_right?'Y':'X');
540
541 current_fusion_pos = 0;
542 current_fusion_strand = 0;
543 current_fusion_cigar [0] =0;
544 current_fusion_char [0]=0;
545 }
546
547 tag_inner_cursor = 0;
548 status = PARSE_STATUS_TAGNAME;
549 }else{
550 if(tag_name[0]=='C' && tag_name[1]=='C' && typechar == 'Z'){
551 current_fusion_char[tag_inner_cursor++]=nch;
552 current_fusion_char[tag_inner_cursor]=0;
553 }else if(tag_name[0]=='C' && tag_name[1]=='G' && typechar == 'Z'){
554 current_fusion_cigar[tag_inner_cursor++]=nch;
555 current_fusion_cigar[tag_inner_cursor]=0;
556 }else if(tag_name[0]=='C' && tag_name[1]=='P' && typechar == 'i'){
557 current_fusion_pos = current_fusion_pos * 10 + (nch - '0');
558 }else if(tag_name[0]=='C' && tag_name[1]=='T' && typechar == 'Z'){
559 current_fusion_strand = nch;
560 }
561 }
562 }
563
564 if(nch == 0){
565 assert(status == PARSE_STATUS_TAGNAME);
566 break;
567 }
568
569 tag_cursor++;
570 }
571
572
573 //for(x1 = 0; x1 < boundaries; x1++)
574 // SUBREADprintf("HAS: LR:%d, READ:%d\n", boundaries_extend_to_left_on_read[x1], boundaries_inclusive_base_on_read[x1]);
575
576 for(x1 = 0; x1 < boundaries; x1++)
577 for(x2 = 0; x2 < boundaries; x2++){
578 if(x1==x2) continue;
579 if(boundaries_chromosomes[x1][0]==0 || boundaries_chromosomes[x2][0]==0) continue;
580 if(boundaries_extend_to_left_on_read[x1] == 1 && boundaries_extend_to_left_on_read[x2] == 0){
581 if( boundaries_inclusive_base_on_read[x1] == boundaries_inclusive_base_on_read[x2]-1 ){
582
583 if(ret <= global_context -> max_M - 1){
584 result_junctions[ret].last_exon_base_left = boundaries_inclusive_base_pos[x1];
585 result_junctions[ret].first_exon_base_right = boundaries_inclusive_base_pos[x2];
586 strcpy(result_junctions[ret].chromosome_name_left, boundaries_chromosomes[x1]);
587 strcpy(result_junctions[ret].chromosome_name_right, boundaries_chromosomes[x2]);
588 ret++;
589 }
590
591
592 // SUBREADprintf("MATCH: %d ~ %d\n", boundaries_inclusive_base_on_read[x1], boundaries_inclusive_base_on_read[x2]);
593 boundaries_chromosomes[x1][0]=0;
594 boundaries_chromosomes[x2][0]=0;
595 }
596 }
597 }
598
599 //for(x1 = 0; x1 < boundaries; x1++)
600 // if(boundaries_chromosomes[x1][0])
601 // SUBREADprintf("LEFT: LR:%d, READ:%d\n", boundaries_extend_to_left_on_read[x1], boundaries_inclusive_base_on_read[x1]);
602 return ret;
603 }
604
605
unistr_cpy(fc_thread_global_context_t * global_context,char * str,int strl)606 srInt_64 unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
607 {
608 srInt_64 ret;
609 if(global_context->unistr_buffer_used + strl >= global_context->unistr_buffer_size-1)
610 {
611 if( global_context->unistr_buffer_size < (1000llu*1000u*1000u*32)) // 32GB
612 {
613 global_context -> unistr_buffer_size = global_context->unistr_buffer_size /2 *3;
614 global_context -> unistr_buffer_space = realloc(global_context -> unistr_buffer_space, global_context->unistr_buffer_size);
615 }
616 else
617 {
618 SUBREADprintf("Error: exceed memory limit (32GB) for storing feature names.\n");
619 return 0xffffffffu;
620 }
621 }
622
623 strcpy(global_context -> unistr_buffer_space + global_context->unistr_buffer_used, str);
624 ret = global_context->unistr_buffer_used;
625
626 global_context->unistr_buffer_used += strl +1;
627
628 return ret;
629 }
630
/* Build the text shown for one per-file flag string ('1' = yes) such as
 * PE_exp / PE_ass: a single "yes"/"no" when all flags agree, otherwise a
 * ", "-separated list with one entry per flag ("Y"/"N" when there are more
 * than five input files, "yes"/"no" otherwise).
 * Returns a malloc'ed string that the caller frees, or NULL when out of memory. */
static char * fc_config_yes_no_list(const char * flags, int nfiles){
	const char * yes_txt = nfiles>5?"Y":"yes";
	const char * no_txt = nfiles>5?"N":"no";
	size_t nflags = strlen(flags), x1, used = 0;
	/* each entry needs at most strlen("yes, ") == 5 bytes, plus the NUL */
	size_t capacity = (nflags > 0 ? nflags : 1) * 5 + 1;
	int all_same = 1;
	char * txt = malloc(capacity);

	if(txt == NULL) return NULL;

	for(x1 = 1; x1 < nflags; x1++)
		if(flags[x1] != flags[0]) all_same = 0;

	if(all_same){
		snprintf(txt, capacity, "%s", (flags[0]=='1')?"yes":"no");
		return txt;
	}

	for(x1 = 0; x1 < nflags; x1++)
		used += snprintf(txt + used, capacity - used, "%s, ", (flags[x1]=='1')?yes_txt:no_txt);
	txt[used - 2] = 0;	/* drop the trailing ", " */
	return txt;
}

/*
 * Print the "featureCounts setting" box and open the "Running" box.
 *
 * Before printing, the function
 *   - checks that the temporary directory is writable by creating and
 *     removing a test file in global_context->temp_file_dir;
 *   - probes every input file named in `sam` (names are separated by
 *     FC_FLIST_SPLITOR) and raises global_context->max_BAM_header_size to
 *     the largest BAM header size found plus 180000.
 *
 * annot / out            : annotation and output file names (short names are shown)
 * is_sam                 : not used
 * is_GTF                 : non-zero when the annotation is shown as GTF, zero for SAF
 * n_input_files          : receives the number of input files
 * isReadSummaryReport    : 0, or the FILE_TYPE_* of the per-read assignment output
 * PE_exp / PE_ass        : one '0'/'1' character per input file
 *
 * Returns 0 on success and 1 when the temporary directory is not writable,
 * an input file cannot be probed, or memory runs out. All memory allocated
 * here is released on every path.
 */
int print_FC_configuration(fc_thread_global_context_t * global_context, char * annot, char * sam, char * out, int is_sam, int is_GTF, int *n_input_files, int isReadSummaryReport, char * PE_exp, char * PE_ass)
{
	size_t sam_used_size = strlen(sam)+MAX_FILE_NAME_LENGTH;
	char * tmp_ptr1 = NULL , * next_fn, * sam_used = malloc(sam_used_size);
	char sam_ntxt[30], bam_ntxt[30], next_ntxt[50];
	int nfiles, nBAMfiles = 0, nNonExistFiles = 0;
	char MAC_or_random[13];

	(void) is_sam;
	if(sam_used == NULL){
		SUBREADprintf("\nERROR: out of memory\n\n");
		return 1;
	}
	mac_or_rand_str(MAC_or_random);

	/* Can a file be created in the temporary directory? */
	snprintf(sam_used, sam_used_size, "%s/featureCounts_test_file_writable-%06d-%s.tmp", global_context -> temp_file_dir, (int)getpid(), MAC_or_random);
	FILE * fp = fopen(sam_used,"w");
	if(fp == NULL){
		SUBREADprintf("\nERROR: temporary directory is not writable: '%s'\n\n", global_context -> temp_file_dir);
		free(sam_used);
		return 1;
	}
	fclose(fp);
	unlink(sam_used);

	/* First pass over the file list: probe each file. strtok_r() modifies
	 * its input, so it works on a private copy of `sam`. */
	strcpy(sam_used, sam);
	nfiles = 0;
	while(1)
	{
		next_fn = strtok_r(nfiles==0?sam_used:NULL, FC_FLIST_SPLITOR, &tmp_ptr1);
		if(next_fn == NULL || strlen(next_fn)<1) break;
		nfiles++;

		srInt_64 BAM_header_size = -1;
		int file_probe = is_certainly_bam_file(next_fn, NULL, &BAM_header_size);
		if(BAM_header_size>0) global_context -> max_BAM_header_size = max( global_context -> max_BAM_header_size , BAM_header_size + 180000);
		if(file_probe==-1){
			nNonExistFiles++;
			if(global_context -> use_stdin_file){
				SUBREADprintf("\nERROR: no valid SAM or BAM file is received from <STDIN>\n\n");
			}else{
				SUBREADprintf("\nERROR: invalid parameter: '%s'\n\n", next_fn);
			}
			free(sam_used);
			return 1;
		}
		if(file_probe == 1) nBAMfiles++;
	}

	/* Build the PE summaries before anything is printed, so that an
	 * allocation failure does not leave a half-drawn box. */
	char * PEexpectStr = fc_config_yes_no_list(PE_exp, nfiles);
	char * PEassignStr = fc_config_yes_no_list(PE_ass, nfiles);
	if(PEexpectStr == NULL || PEassignStr == NULL){
		SUBREADprintf("\nERROR: out of memory\n\n");
		free(PEexpectStr);
		free(PEassignStr);
		free(sam_used);
		return 1;
	}

	SUBREADputs("");
	print_subread_logo();
	SUBREADputs("");
	print_in_box(80,1,1,"featureCounts setting");
	print_in_box(80,0,0,"");

	sam_ntxt[0]=0;
	bam_ntxt[0]=0;
	next_ntxt[0]=0;

	if(nNonExistFiles)
		snprintf(next_ntxt, sizeof next_ntxt, "%d unknown file%s", nNonExistFiles, nNonExistFiles>1?"s":"");
	if(nBAMfiles)
		snprintf(bam_ntxt, sizeof bam_ntxt, "%d BAM file%s ", nBAMfiles, nBAMfiles>1?"s":"");
	if(nfiles-nNonExistFiles-nBAMfiles)
		snprintf(sam_ntxt, sizeof sam_ntxt, "%d SAM file%s ", nfiles-nNonExistFiles-nBAMfiles , (nfiles-nNonExistFiles-nBAMfiles)>1?"s":"");

	print_in_box(80,0,0," Input files : %s%s%s", sam_ntxt, bam_ntxt, next_ntxt);
	print_in_box(80,0,0,"");

	/* Second pass over the file list: print one line per input file. */
	strcpy(sam_used, sam);
	nfiles=0;
	while(1){
		next_fn = strtok_r(nfiles==0?sam_used:NULL, FC_FLIST_SPLITOR, &tmp_ptr1);
		if(next_fn == NULL || strlen(next_fn)<1) break;
		/* width 89 rather than 80: presumably to compensate for the colour escape sequences */
		print_in_box(89,0,0," %c[36m%s%c[0m",CHAR_ESC, global_context -> use_stdin_file?"<STDIN>":get_short_fname(next_fn),CHAR_ESC);
		nfiles++;
	}

	(*n_input_files) = nfiles;
	print_in_box(80,0,0,"");

	if(global_context -> annotation_file_screen_output[0]==0){
		print_in_box(80,0,0," Output file : %s", get_short_fname(out));
		print_in_box(80,0,0," Summary : %s.summary", get_short_fname(out));
	}

	print_in_box(80,0,0," Paired-end : %s",PEexpectStr);
	print_in_box(80,0,0," Count read pairs : %s",PEassignStr);
	free(PEassignStr);
	free(PEexpectStr);

	if(global_context -> annotation_file_screen_output[0])
		print_in_box(80,0,0," Annotation : %s",global_context -> annotation_file_screen_output);
	else
		print_in_box(80,0,0," Annotation : %s (%s)", get_short_fname(annot), is_GTF?"GTF":"SAF");
	print_in_box(80,0,0," Dir for temp files : %s", global_context->temp_file_dir);

	if(global_context -> do_scRNA_table){
		print_in_box(80,0,0,"");
		print_in_box(80,0,0," scRNA count table : <input_file>.scRNA.table");
		print_in_box(80,0,0," scRNA sample sheet : %s", get_short_fname(global_context->scRNA_sample_sheet));
		print_in_box(80,0,0," scRNA barcode list : %s", get_short_fname(global_context->scRNA_cell_barcode_list));
	}

	if(isReadSummaryReport){
		print_in_box(80,0,0," Assignment details : <input_file>.featureCounts%s", isReadSummaryReport == FILE_TYPE_BAM?".bam":(isReadSummaryReport == FILE_TYPE_SAM?".sam":""));
		if(global_context -> read_details_path[0])
			print_in_box(80,0,0," Details output path : %s", global_context ->read_details_path);
		else
			print_in_box(80,0,0," (Note that files are saved to the output directory)");
		print_in_box(80,0,0,"");
	}

	if(global_context -> do_junction_counting)
		print_in_box(80,0,0," Junction Counting : <output_file>.jcounts");

	if(global_context -> alias_file_name[0])
		print_in_box(80,0,0," Chromosome alias file : %s", get_short_fname(global_context -> alias_file_name));

	#ifdef MAKE_STANDALONE
	print_in_box(80,0,0,"");
	#endif
	print_in_box(80,0,0," Threads : %d", global_context->thread_number);
	print_in_box(80,0,0," Level : %s level", global_context->is_gene_level?"meta-feature":"feature");
	if(global_context -> do_not_sort && global_context->is_paired_end_mode_assign) {
		print_in_box(80,0,0," Sorting PE Reads : never");
		print_in_box(80,0,0,"");
	}

	char * multi_mapping_allow_mode = "not counted";
	if(global_context->is_multi_mapping_allowed)
		multi_mapping_allow_mode = global_context -> use_fraction_multi_mapping?"counted (fractional)": "counted";

	print_in_box(80,0,0," Multimapping reads : %s", multi_mapping_allow_mode);

	if(global_context-> is_primary_alignment_only)
		print_in_box(80,0,0," Multiple alignments : primary alignment only");

	print_in_box(80,0,0,"Multi-overlapping reads : %s", global_context->is_multi_overlap_allowed?"counted":"not counted");
	if(global_context -> is_split_or_exonic_only)
		print_in_box(80,0,0," Split alignments : %s", (1 == global_context -> is_split_or_exonic_only)?"only split alignments":"only exonic alignments");
	print_in_box(80,0,0," Min overlapping bases : %d", global_context -> fragment_minimum_overlapping);
	if(global_context -> max_missing_bases_in_read >= 0)
		print_in_box(80,0,0," Max missing bases : %d in reads", global_context -> max_missing_bases_in_read);
	if(global_context -> max_missing_bases_in_feature >= 0)
		print_in_box(80,0,0," Max missing bases : %d in features", global_context -> max_missing_bases_in_feature);
	/* NOTE(review): the quadruple '%' is kept as is; print_in_box presumably formats its text twice -- confirm. */
	if(global_context -> fractional_minimum_overlapping > 0.000001)
		print_in_box(81,0,0," Min overlapping frac. : %0.1f%%%% to reads", global_context -> fractional_minimum_overlapping*100);
	if(global_context -> fractional_minimum_feature_overlapping > 0.000001)
		print_in_box(81,0,0," Min overlapping frac. : %0.1f%%%% to features", global_context -> fractional_minimum_feature_overlapping*100);
	if(global_context -> read_shift_size >0)
		print_in_box(80,0,0," Read shift : %d to %s", global_context -> read_shift_size, global_context -> read_shift_type==READ_SHIFT_UPSTREAM?"upstream":( global_context -> read_shift_type==READ_SHIFT_DOWNSTREAM?"downstream":( global_context -> read_shift_type==READ_SHIFT_LEFT?"left":"right")));
	if(global_context -> five_end_extension || global_context -> three_end_extension)
		print_in_box(80,0,0," Read extension : %d on 5' and %d on 3' ends", global_context -> five_end_extension , global_context -> three_end_extension);
	if(global_context -> reduce_5_3_ends_to_one)
		print_in_box(80,0,0," Read reduction : to %d' end" , global_context -> reduce_5_3_ends_to_one == REDUCE_TO_5_PRIME_END ?5:3);
	if(global_context -> is_duplicate_ignored)
		print_in_box(80,0,0," Duplicated Reads : ignored");
	if(global_context -> long_read_minimum_length < 5000)
		print_in_box(80,0,0," Long read mode : yes");

	if(global_context->is_paired_end_mode_assign)
	{
		print_in_box(80,0,0,"");
		print_in_box(80,0,0," Chimeric reads : %s", global_context->is_chimertc_disallowed?"not counted":"counted");
		print_in_box(80,0,0," Both ends mapped : %s", global_context->is_both_end_required?"required":"not required");

		if(global_context->is_PE_distance_checked)
			print_in_box(80,0,0," Fragment length : %d - %d", global_context -> min_paired_end_distance, global_context -> max_paired_end_distance);
	}

	print_in_box(80,0,0,"");
	print_in_box(80,2,1,"");
	SUBREADputs("");
	print_in_box(80,1,1,"Running");
	print_in_box(80,0,0,"");
	if(global_context->BAM_chros_to_anno_table)
		print_in_box(80,0,0,"%ld chromosome name aliases are loaded.", global_context -> BAM_chros_to_anno_table ->numOfElements);

	free(sam_used);
	return 0;
}
857
// Prints the closing part of the on-screen report box.
// In standalone builds (MAKE_STANDALONE) it also tells the user where the
// summary file is, using `out` as the prefix of "<out>.summary".
// `global_context` is not read by this function.
void print_FC_results(fc_thread_global_context_t * global_context, char * out)
{
	print_in_box(80,0,0,"");
	#ifdef MAKE_STANDALONE
	print_in_box(80,0,PRINT_BOX_WRAPPED,"Summary of counting results can be found in file \"%s.summary\"", out);
	print_in_box(80,0,0,"");
	#endif
	print_in_box(80,2,1,"");
	SUBREADputs("");
}
872
// Key comparison callback for hash tables whose keys are C strings.
// Both arguments are treated as NUL-terminated strings; the result is
// whatever strcmp() returns for them (0 when equal).
int fc_strcmp(const void * s1, const void * s2)
{
	const char * left = s1;
	const char * right = s2;
	return strcmp(left, right);
}
877
junc_gene_free(void * vv)878 void junc_gene_free(void *vv){
879 fc_junction_gene_t *v = vv;
880 free(v -> gene_name);
881 free(v);
882 }
883
// Records that feature `feature_name` on chromosome `chro` covers [start, stop].
// junction_features_table maps a chromosome name to a per-chromosome table,
// which maps a gene name to an fc_junction_gene_t holding the smallest start
// and the largest stop seen so far for that gene.
void register_junc_feature(fc_thread_global_context_t *global_context, char * feature_name, char * chro, unsigned int start, unsigned int stop){
	HashTable * chro_genes = HashTableGet(global_context -> junction_features_table, chro);

	if(!chro_genes){
		// First feature seen on this chromosome: create its gene table.
		// Keys are not freed by the table (NULL key deallocator) because each key
		// is the gene_name string that junc_gene_free releases with the value.
		chro_genes = HashTableCreate(48367);
		HashTableSetDeallocationFunctions(chro_genes, NULL, junc_gene_free);
		HashTableSetKeyComparisonFunction(chro_genes, fc_strcmp);
		HashTableSetHashFunction(chro_genes, fc_chro_hash);

		// The outer table gets its own copy of the chromosome name as the key.
		char * chro_key = malloc(strlen(chro)+1);
		strcpy(chro_key, chro);
		HashTablePut(global_context -> junction_features_table, chro_key, chro_genes);
	}

	fc_junction_gene_t * known_gene = HashTableGet(chro_genes, feature_name);
	if(known_gene){
		// Gene already registered: widen its span to include this feature.
		known_gene -> pos_first_base = min(start, known_gene -> pos_first_base);
		known_gene -> pos_last_base = max(stop, known_gene -> pos_last_base);
		return;
	}

	fc_junction_gene_t * fresh_gene = malloc(sizeof(fc_junction_gene_t));
	fresh_gene -> gene_name = strdup(feature_name);
	fresh_gene -> pos_first_base = start;
	fresh_gene -> pos_last_base = stop;
	HashTablePut(chro_genes, fresh_gene -> gene_name, fresh_gene);
}
910
free_bucket_table_list(void * pv)911 void free_bucket_table_list(void * pv){
912 gene_info_list_t * list = (gene_info_list_t*) pv;
913 free(list -> genes);
914 free(list);
915 }
916
// Tests whether `infile` equals one of the comma-separated names in `needed`.
// Returns 1 on a match and 0 otherwise.
// Empty items in `needed` (e.g. "a,,b") are ignored, so an empty `infile`
// never matches. A NULL `infile` or `needed` yields 0; callers pass tokens
// obtained from strtok_r, which is NULL on a truncated annotation line.
// Neither string is modified and no copy of `needed` is made.
int match_feature_name_column(char * infile, char * needed){
	if(NULL == infile || NULL == needed) return 0;

	size_t wanted_len = strlen(infile);
	if(0 == wanted_len) return 0;

	const char * item = needed;
	while(*item){
		// length of the current item, i.e. up to the next comma or the end
		size_t item_len = strcspn(item, ",");
		if(item_len == wanted_len && memcmp(item, infile, wanted_len) == 0) return 1;
		item += item_len;
		if(*item == ',') item++;
	}
	return 0;
}
928
929 #define JUNCTION_BUCKET_STEP (128*1024)
930
locate_junc_features(fc_thread_global_context_t * global_context,char * chro,unsigned int pos,fc_junction_gene_t ** ret_info,int max_ret_info_size)931 int locate_junc_features(fc_thread_global_context_t *global_context, char * chro, unsigned int pos, fc_junction_gene_t ** ret_info, int max_ret_info_size){
932 gene_info_list_t * list = NULL;
933 char bucket_key[CHROMOSOME_NAME_LENGTH + 20];
934
935 if(global_context -> BAM_chros_to_anno_table) {
936 char * anno_chro_name = HashTableGet( global_context -> BAM_chros_to_anno_table , chro);
937 if(anno_chro_name){
938 sprintf(bucket_key, "%s:%u", anno_chro_name, pos - pos % JUNCTION_BUCKET_STEP);
939 list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
940 }
941 }
942
943 if(list == NULL){
944 sprintf(bucket_key, "%s:%u", chro, pos - pos % JUNCTION_BUCKET_STEP);
945 list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
946 }
947
948 if(list == NULL && strlen(chro)>3 && memcmp(chro, "chr", 3)==0){
949 sprintf(bucket_key, "%s:%u", chro+3, pos - pos % JUNCTION_BUCKET_STEP);
950 list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
951 }
952
953 if(list == NULL){
954 sprintf(bucket_key, "chr%s:%u", chro, pos - pos % JUNCTION_BUCKET_STEP);
955 list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
956 }
957
958 int ret = 0;
959
960 if(list){
961 int x1;
962 for(x1 = 0; x1 < list -> used; x1++){
963 fc_junction_gene_t * gene_info = list -> genes[x1];
964 if(gene_info -> pos_first_base <= pos && gene_info -> pos_last_base >= pos){
965 if(ret < max_ret_info_size)
966 ret_info [ret ++] = gene_info;
967 }
968 }
969 }
970
971 return ret;
972 }
973
// This function loads annotations from the file.
// It returns the number of features loaded, or a negative value (-1 or -2) if something is wrong.
// Memory is allocated in this function. The pointer is saved in *loaded_features.
// The caller must release the memory itself.
978
979 #define MAX_ANNOT_LINE_LENGTH 1000000
load_feature_info(fc_thread_global_context_t * global_context,const char * annotation_file,int file_type,fc_feature_info_t ** loaded_features)980 int load_feature_info(fc_thread_global_context_t *global_context, const char * annotation_file, int file_type, fc_feature_info_t ** loaded_features)
981 {
982 unsigned int features = 0, xk1 = 0, lineno=0;
983 char * file_line = malloc(MAX_ANNOT_LINE_LENGTH+1);
984 autozip_fp anno_fp;
985 int apret = autozip_open(annotation_file, &anno_fp);
986 int is_GFF_warned = 0;
987 if(apret < 0) return -1;
988
989 HashTable * chro_name_table = HashTableCreate(1603);
990 HashTableSetHashFunction(chro_name_table, fc_chro_hash);
991 HashTableSetKeyComparisonFunction(chro_name_table, fc_strcmp_chro);
992 global_context -> longest_chro_name = 0;
993
994 if(global_context -> do_junction_counting){
995 global_context -> junction_bucket_table = HashTableCreate(76037);
996 HashTableSetDeallocationFunctions(global_context -> junction_bucket_table, free, free_bucket_table_list);
997 HashTableSetKeyComparisonFunction(global_context -> junction_bucket_table, fc_strcmp);
998 HashTableSetHashFunction(global_context -> junction_bucket_table, fc_chro_hash);
999
1000 global_context -> junction_features_table = HashTableCreate(1603);
1001 HashTableSetDeallocationFunctions(global_context -> junction_features_table, free, (void (*)(void *))HashTableDestroy);
1002 HashTableSetKeyComparisonFunction(global_context -> junction_features_table, fc_strcmp);
1003 HashTableSetHashFunction(global_context -> junction_features_table, fc_chro_hash);
1004 }
1005
1006
1007 // first scan: get the chromosome size (that have exons), total number of features
1008 // also create chro_name_table : chro_name => fc_chromosome_index_info
1009 while(0)
1010 {
1011 int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
1012 char * token_temp = NULL, *chro_name;
1013 fc_chromosome_index_info * chro_stab;
1014 unsigned int feature_pos = 0;
1015 if(rchars < 1) break;
1016
1017 lineno++;
1018 if(is_comment_line(file_line, file_type, lineno-1))continue;
1019 if(file_type == FILE_TYPE_GTF)
1020 {
1021 chro_name = strtok_r(file_line,"\t",&token_temp);
1022 strtok_r(NULL,"\t", &token_temp); // lib_name (not needed)
1023 char * feature_type = strtok_r(NULL,"\t", &token_temp);
1024 if(match_feature_name_column(feature_type, global_context -> feature_name_column))
1025 {
1026 strtok_r(NULL,"\t", &token_temp); // feature_start
1027 feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
1028 features++;
1029 }
1030 else chro_name = NULL;
1031 }
1032 else
1033 {
1034 strtok_r(file_line,"\t", &token_temp);
1035 chro_name = strtok_r(NULL,"\t",&token_temp);
1036 strtok_r(NULL,"\t",&token_temp); // feature_start
1037 feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
1038
1039 features++;
1040 }
1041
1042 if(chro_name)
1043 {
1044 if(strlen(chro_name)>=CHROMOSOME_NAME_LENGTH)
1045 chro_name[CHROMOSOME_NAME_LENGTH-1]=0;
1046 chro_stab = HashTableGet(chro_name_table, chro_name);
1047
1048 if(chro_stab)
1049 {
1050 chro_stab -> chro_possible_length = max(chro_stab -> chro_possible_length , feature_pos+1);
1051 }else
1052 {
1053 char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1054 term_strncpy(tmp_chro_name, chro_name, CHROMOSOME_NAME_LENGTH);
1055 chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1056 chro_stab -> chro_number = chro_name_table->numOfElements;
1057 chro_stab -> chro_possible_length = feature_pos+1;
1058 chro_stab -> reverse_table_start_index_size = 5000000;
1059 chro_stab -> reverse_table_start_index = NULL;
1060 HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1061 }
1062
1063 chro_stab -> chro_features ++;
1064 }
1065 }
1066
1067 //autozip_rewind(&anno_fp);
1068
1069 unsigned int ret_features_size = 400000;
1070 fc_feature_info_t * ret_features = malloc(sizeof(fc_feature_info_t) * ret_features_size);
1071 char * tmpnameex = malloc(50001);
1072
1073 lineno = 0;
1074 while(1)
1075 {
1076 int is_gene_id_found = 0;
1077 int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
1078 if(rchars < 1) break;
1079 if(rchars >= MAX_ANNOT_LINE_LENGTH - 1){
1080 SUBREADprintf("\nERROR: the %u-th line in your GTF file is extremely long (longer than %d bytes).\nThe program cannot parse this line.\n", lineno+1, MAX_ANNOT_LINE_LENGTH-1);
1081 return -2;
1082 }
1083
1084 lineno++;
1085 char * token_temp = NULL;
1086 if(is_comment_line(file_line, file_type, lineno-1))continue;
1087
1088 if(file_type == FILE_TYPE_RSUBREAD){
1089 if(xk1 >= ret_features_size) {
1090 ret_features_size *=2;
1091 ret_features = realloc(ret_features, sizeof(fc_feature_info_t) * ret_features_size);
1092 }
1093 char * feature_name = strtok_r(file_line,"\t",&token_temp);
1094 int feature_name_len = strlen(feature_name);
1095 if(feature_name_len > FEATURE_NAME_LENGTH-2){
1096 SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH -2);
1097 feature_name[FEATURE_NAME_LENGTH -2 ] = 0;
1098 }
1099
1100 srInt_64 genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
1101
1102 // SUBREADprintf("REALL: '%s'=%d [%d] %p POS=%d\t\tOLD_NAME_POS=%d\n" , feature_name, feature_name_len , xk1, ret_features+xk1, genename_pos, xk1>0?ret_features[xk1-1].feature_name_pos:-1);
1103 ret_features[xk1].feature_name_pos = genename_pos;
1104
1105 char * seq_name = strtok_r(NULL,"\t", &token_temp);
1106 int chro_name_len = strlen(seq_name);
1107 if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
1108 srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
1109 global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
1110
1111
1112 char * start_ptr = strtok_r(NULL,"\t", &token_temp);
1113 char * end_ptr = strtok_r(NULL,"\t", &token_temp);
1114
1115 if(start_ptr == NULL || end_ptr == NULL){
1116 SUBREADprintf("\nWarning: the format on the %d-th line is wrong.\n", lineno);
1117 }
1118 srInt_64 tv1 = atoll(start_ptr);
1119 srInt_64 tv2 = atoll(end_ptr);
1120
1121 if( isdigit(start_ptr[0]) && isdigit(end_ptr[0]) ){
1122 if(strlen(start_ptr) > 10 || strlen(end_ptr) > 10 || tv1 > 0x7fffffff || tv2> 0x7fffffff){
1123 SUBREADprintf("\nError: Line %d contains a coordinate greater than 2^31.\n", lineno);
1124 return -2;
1125 }
1126
1127 if(tv1 >tv2){
1128 SUBREADprintf("\nError: Line %d contains a feature that do not have a positive length.\n", lineno);
1129 return -2;
1130 }
1131 }else{
1132 SUBREADprintf("\nError: Line %d contains a format error. The expected annotation format is SAF.\n", lineno);
1133 return -2;
1134 }
1135
1136 ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
1137 ret_features[xk1].start = atoi( start_ptr );// start
1138 if(ret_features[xk1].start>0x7fffffff)
1139 {
1140 ret_features[xk1].start = 0;
1141 print_in_box(80,0,0,"WARNING the %d-th line has a negative chro coordinate.", lineno);
1142 }
1143
1144 ret_features[xk1].end = atoi( end_ptr );//end
1145 if(ret_features[xk1].end>0x7fffffff)
1146 {
1147 ret_features[xk1].end = 0;
1148 print_in_box(80,0,0,"WARNING the %d-th line has a negative chro coordinate.", lineno);
1149 }
1150
1151 char * strand_str = strtok_r(NULL,"\t", &token_temp);
1152 if(strand_str == NULL)
1153 ret_features[xk1].is_negative_strand = 0;
1154 else
1155 ret_features[xk1].is_negative_strand = ('+' ==strand_str[0])?0:(('-' ==strand_str[0])?1:-1);
1156
1157 if(global_context -> do_detection_call){
1158 char * GCcontent = strtok_r(NULL,"\t", &token_temp);
1159 if(GCcontent){
1160 int gclen = strlen(GCcontent);
1161 if(gclen>0)GCcontent[gclen-1]=0;
1162 HashTablePut(global_context -> GCcontent_table, strdup(feature_name) , strdup(GCcontent));
1163 }
1164 }
1165
1166 ret_features[xk1].sorted_order = xk1;
1167
1168 int bin_location = ret_features[xk1].start / REVERSE_TABLE_BUCKET_LENGTH;
1169
1170 fc_chromosome_index_info * chro_stab = HashTableGet(chro_name_table, seq_name);
1171 int feature_pos = ret_features[xk1].end;
1172 if(NULL == chro_stab){
1173 char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1174 term_strncpy(tmp_chro_name, seq_name, CHROMOSOME_NAME_LENGTH);
1175 chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1176 chro_stab -> chro_number = chro_name_table->numOfElements;
1177 chro_stab -> chro_possible_length = feature_pos+1;
1178 chro_stab -> reverse_table_start_index_size = 5000000;
1179 chro_stab -> reverse_table_start_index = calloc( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2, sizeof(int));
1180 HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1181 }else chro_stab -> chro_possible_length = max(feature_pos+1, chro_stab -> chro_possible_length);
1182 chro_stab -> chro_features ++;
1183
1184 if( chro_stab -> chro_possible_length >= chro_stab -> reverse_table_start_index_size ) {
1185 int old_end = sizeof(int) *( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2);
1186 chro_stab -> reverse_table_start_index_size = max(chro_stab -> reverse_table_start_index_size * 2, (int)(chro_stab -> chro_possible_length * 1.3));
1187 int new_size = sizeof(int) *( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2);
1188 chro_stab -> reverse_table_start_index = realloc( chro_stab -> reverse_table_start_index , new_size);
1189 memset(chro_stab -> reverse_table_start_index + old_end / sizeof(int), 0, new_size - old_end);
1190 }
1191 chro_stab -> reverse_table_start_index[bin_location]++;
1192 is_gene_id_found = 1;
1193
1194 assert(feature_name);
1195 if(global_context -> do_junction_counting)
1196 register_junc_feature(global_context , feature_name, seq_name, ret_features[xk1].start, ret_features[xk1].end);
1197
1198 xk1++;
1199 } else if(file_type == FILE_TYPE_GTF) {
1200 char feature_name_tmp[FEATURE_NAME_LENGTH];
1201 sprintf(feature_name_tmp, "LINE_%07u", xk1 + 1);
1202 char * seq_name = strtok_r(file_line,"\t",&token_temp);
1203 strtok_r(NULL,"\t", &token_temp);// source
1204 char * feature_type = strtok_r(NULL,"\t", &token_temp);// feature_type
1205 if(match_feature_name_column(feature_type, global_context -> feature_name_column)) {
1206
1207 if(xk1 >= ret_features_size) {
1208 ret_features_size *=2;
1209 ret_features = realloc(ret_features, sizeof(fc_feature_info_t) * ret_features_size);
1210 }
1211
1212 char * start_ptr = strtok_r(NULL,"\t", &token_temp);
1213 char * end_ptr = strtok_r(NULL,"\t", &token_temp);
1214
1215 if(start_ptr == NULL || end_ptr == NULL){
1216 SUBREADprintf("\nWarning: the format on the %d-th line is wrong.\n", lineno);
1217 }
1218 srInt_64 tv1 = atoll(start_ptr);
1219 srInt_64 tv2 = atoll(end_ptr);
1220
1221 if( isdigit(start_ptr[0]) && isdigit(end_ptr[0]) ){
1222 if(strlen(start_ptr) > 10 || strlen(end_ptr) > 10 || tv1 > 0x7fffffff || tv2> 0x7fffffff){
1223 SUBREADprintf("\nError: Line %d contains a coordinate greater than 2^31.\n", lineno);
1224 return -2;
1225 }
1226 }else{
1227 SUBREADprintf("\nError: Line %d contains a format error. The expected annotation format is GTF/GFF.\n", lineno);
1228 return -2;
1229 }
1230 ret_features[xk1].start = atoi(start_ptr);// start
1231 ret_features[xk1].end = atoi(end_ptr);//end
1232
1233 if(ret_features[xk1].start < 1 || ret_features[xk1].end<1 || ret_features[xk1].start > 0x7fffffff || ret_features[xk1].end > 0x7fffffff || ret_features[xk1].start > ret_features[xk1].end){
1234 SUBREADprintf("\Error: the feature on the %d-th line has zero coordinate or zero lengths\n\n", lineno);
1235 return -2;
1236 }
1237
1238
1239 strtok_r(NULL,"\t", &token_temp);// score
1240 char * strand_str = strtok_r(NULL,"\t", &token_temp);
1241 ret_features[xk1].is_negative_strand = ('-' == strand_str[0])?1:( ('+' == strand_str[0])?0:-1 );//strand
1242 ret_features[xk1].sorted_order = xk1;
1243 strtok_r(NULL,"\t",&token_temp); // "frame"
1244 char * extra_attrs = strtok_r(NULL,"\t",&token_temp); // name_1 "val1"; name_2 "val2"; ...
1245 ret_features[xk1].extra_columns = NULL;
1246 if(extra_attrs && (strlen(extra_attrs)>2))
1247 {
1248 int attr_val_len = GTF_extra_column_value(extra_attrs , global_context -> gene_id_column , feature_name_tmp, FEATURE_NAME_LENGTH);
1249 if(attr_val_len>0) is_gene_id_found=1;
1250 // printf("V=%s\tR=%d\n", extra_attrs , attr_val_len);
1251
1252 if(global_context -> reported_extra_columns){
1253 char * extcols = malloc(30);
1254 int extcols_size = 30, extcols_len = 0;
1255
1256 char * this_exname_ptr=global_context -> reported_extra_columns;
1257 while(1){
1258 int padd0, is_last=1;
1259 for(padd0=0; this_exname_ptr[padd0]; padd0++)
1260 if(this_exname_ptr[padd0]=='\t'){
1261 this_exname_ptr[padd0]=0;
1262 is_last=0;
1263 break;
1264 }
1265
1266 attr_val_len = GTF_extra_column_value(extra_attrs , this_exname_ptr , tmpnameex, 50000);
1267
1268 if(attr_val_len<0){
1269 attr_val_len=2;
1270 strcpy(tmpnameex,"NA");
1271 }
1272 if(attr_val_len + extcols_len + 2 > extcols_size){
1273 extcols_size = max(extcols_size*2, attr_val_len + extcols_len+2);
1274 extcols = realloc(extcols, extcols_size);
1275 }
1276 memcpy(extcols+extcols_len, tmpnameex, attr_val_len);
1277 extcols_len += attr_val_len;
1278 extcols[extcols_len]='\t';
1279 extcols_len += 1;
1280
1281 if(is_last)break;
1282 this_exname_ptr[padd0]='\t';
1283 this_exname_ptr += padd0+1;
1284 }
1285 extcols[extcols_len-1]=0;
1286 ret_features[xk1].extra_columns = extcols;
1287 }
1288 }
1289
1290 if(!is_gene_id_found) {
1291 if(!is_GFF_warned)
1292 {
1293 int ext_att_len = strlen(extra_attrs);
1294 if(extra_attrs[ext_att_len-1] == '\n') extra_attrs[ext_att_len-1] =0;
1295 SUBREADprintf("\nERROR: failed to find the gene identifier attribute in the 9th column of the provided GTF file.\nThe specified gene identifier attribute is '%s' \nAn example of attributes included in your GTF annotation is '%s'.\n\n", global_context -> gene_id_column, extra_attrs);
1296 }
1297 is_GFF_warned++;
1298 }
1299
1300 int feature_name_len = strlen(feature_name_tmp);
1301 if(feature_name_len > FEATURE_NAME_LENGTH-2){
1302 SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH-2);
1303 feature_name_tmp[FEATURE_NAME_LENGTH -2 ] = 0;
1304 }
1305 ret_features[xk1].feature_name_pos = unistr_cpy(global_context, (char *)feature_name_tmp, feature_name_len);
1306
1307 int chro_name_len = strlen(seq_name);
1308 if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
1309 srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
1310 global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
1311
1312 ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
1313
1314 int bin_location = ret_features[xk1].start / REVERSE_TABLE_BUCKET_LENGTH;
1315 fc_chromosome_index_info * chro_stab = HashTableGet(chro_name_table, seq_name);
1316 int feature_pos = ret_features[xk1].end;
1317
1318 if(NULL == chro_stab){
1319 char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1320 term_strncpy(tmp_chro_name, seq_name, CHROMOSOME_NAME_LENGTH);
1321 chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1322 chro_stab -> chro_number = chro_name_table->numOfElements;
1323 chro_stab -> chro_possible_length = feature_pos+1;
1324 chro_stab -> reverse_table_start_index_size = 5000000;
1325 chro_stab -> reverse_table_start_index = calloc( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2 , sizeof(int));
1326 HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1327 }else chro_stab -> chro_possible_length = max(feature_pos+1, chro_stab -> chro_possible_length);
1328 chro_stab -> chro_features ++;
1329
1330 if( chro_stab -> chro_possible_length >= chro_stab -> reverse_table_start_index_size ) {
1331 int old_end = sizeof(int) *( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2);
1332 chro_stab -> reverse_table_start_index_size = max(chro_stab -> reverse_table_start_index_size * 2, (int)(chro_stab -> chro_possible_length * 1.3));
1333 int new_size = sizeof(int) *( chro_stab -> reverse_table_start_index_size / REVERSE_TABLE_BUCKET_LENGTH +2);
1334 chro_stab -> reverse_table_start_index = realloc(chro_stab -> reverse_table_start_index, new_size);
1335 memset(chro_stab -> reverse_table_start_index + old_end / sizeof(int), 0, new_size - old_end);
1336 }
1337
1338 chro_stab -> reverse_table_start_index[bin_location]++;
1339
1340 if(global_context -> do_junction_counting)
1341 register_junc_feature(global_context , feature_name_tmp, seq_name, ret_features[xk1].start, ret_features[xk1].end);
1342
1343 xk1++;
1344 }
1345 }
1346 }
1347 features = xk1;
1348 autozip_close(&anno_fp);
1349 free(file_line);
1350 free(tmpnameex);
1351
1352 (*loaded_features) = ret_features;
1353 global_context -> exontable_nchrs = (int)chro_name_table-> numOfElements;
1354 global_context -> exontable_chro_table = chro_name_table;
1355
1356
1357 if(is_GFF_warned) return -2;
1358 if(features < 1){
1359 if(global_context -> annotation_file_screen_output[0]){
1360 SUBREADprintf("ERROR: no features were loaded in format %s. The annotation format can be specified by the 'isGTFAnnotationFile' option%s.\n", file_type == FILE_TYPE_GTF?"GTF":"SAF", file_type == FILE_TYPE_GTF?", and the required feature type can be specified by the 'GTF.featureType' option":"");
1361 }else{
1362 SUBREADprintf("ERROR: no features were loaded in format %s. The annotation format can be specified by the '-F' option%s.\n", file_type == FILE_TYPE_GTF?"GTF":"SAF", file_type == FILE_TYPE_GTF?", and the required feature type can be specified by the '-t' option.":"");
1363 }
1364 SUBREADprintf("\n\n");
1365 return -2;
1366 }
1367
1368 print_in_box(80,0,0," Features : %d\n", features);
1369 return features;
1370 }
1371
// Returns the 0-based index of `feature_name` in gene_name_array, adding the
// name when it has not been seen before.
// gene_name_table stores (index + 1) encoded directly in the value pointer, so
// a missing key (NULL value) reads as 0. The encoding uses integer<->pointer
// casts instead of arithmetic on void* / NULL, which standard C does not define.
// Only the pointer `feature_name` is stored, both as the table key and in
// gene_name_array; the string itself is not copied.
int find_or_insert_gene_name(fc_thread_global_context_t * global_context, unsigned char * feature_name)
{
	HashTable * genetable = global_context -> gene_name_table;

	srInt_64 gene_number = (srInt_64)(size_t) HashTableGet(genetable, feature_name);
	if(gene_number>0)
		return gene_number-1;

	// Unknown name: its index is the number of names registered so far.
	gene_number = genetable -> numOfElements;
	HashTablePut(genetable, feature_name, (void *)(size_t)(gene_number+1));
	global_context -> gene_name_array[gene_number] = feature_name;

	return gene_number;
}
1390
// For every bucket of width REVERSE_TABLE_BUCKET_LENGTH touched by the range
// [this_block_min_start, this_block_max_end], lowers the bucket's entry in
// chro_inf->reverse_table_start_index to `block_no` when `block_no` is smaller.
void register_reverse_table(int block_no, srInt_64 this_block_min_start, srInt_64 this_block_max_end, fc_chromosome_index_info * chro_inf)
{
	unsigned int first_bucket = this_block_min_start / REVERSE_TABLE_BUCKET_LENGTH;
	unsigned int last_bucket = this_block_max_end / REVERSE_TABLE_BUCKET_LENGTH;

	assert(this_block_min_start <= this_block_max_end);
	assert(last_bucket < chro_inf -> chro_possible_length);

	int bucket = first_bucket;
	while(bucket <= last_bucket)
	{
		chro_inf->reverse_table_start_index[bucket] = min(chro_inf->reverse_table_start_index[bucket], block_no);
		bucket++;
	}
}
1406
feature_merge(void * arrv,int start,int items,int items2)1407 void feature_merge(void * arrv, int start, int items, int items2)
1408 {
1409
1410 void ** arr = (void **) arrv;
1411
1412 srInt_64 * ret_start = (srInt_64 *) arr[0];
1413 srInt_64 * ret_end = (srInt_64 *) arr[1];
1414 unsigned char * ret_strand = (unsigned char *) arr[2];
1415 int * ret_entyrez = (int *) arr[3];
1416 fc_feature_info_t ** old_info_ptr = (fc_feature_info_t **) arr[4];
1417
1418 int total_items = items+items2;
1419 srInt_64 * tmp_start = malloc(sizeof(srInt_64) * total_items);
1420 srInt_64 * tmp_end = malloc(sizeof(srInt_64) * total_items);
1421 unsigned char * tmp_strand = malloc(sizeof(char) * total_items);
1422 int * tmp_entyrez = malloc(sizeof(int) * total_items);
1423 fc_feature_info_t ** tmp_info_ptr = malloc(sizeof(fc_feature_info_t*) * total_items);
1424
1425 int read_1_ptr = start;
1426 int read_2_ptr = start+items;
1427 int write_ptr;
1428
1429 for(write_ptr=0; write_ptr<total_items; write_ptr++)
1430 {
1431 if((read_1_ptr >= start+items)||(read_2_ptr < start+total_items && ret_start[read_1_ptr] >= ret_start[read_2_ptr]))
1432 {
1433 tmp_start[write_ptr] = ret_start[read_2_ptr];
1434 tmp_end[write_ptr] = ret_end[read_2_ptr];
1435 tmp_strand[write_ptr] = ret_strand[read_2_ptr];
1436 tmp_entyrez[write_ptr] = ret_entyrez[read_2_ptr];
1437 tmp_info_ptr[write_ptr] = old_info_ptr[read_2_ptr];
1438 read_2_ptr++;
1439 }
1440 else
1441 {
1442 tmp_start[write_ptr] = ret_start[read_1_ptr];
1443 tmp_end[write_ptr] = ret_end[read_1_ptr];
1444 tmp_strand[write_ptr] = ret_strand[read_1_ptr];
1445 tmp_entyrez[write_ptr] = ret_entyrez[read_1_ptr];
1446 tmp_info_ptr[write_ptr] = old_info_ptr[read_1_ptr];
1447 read_1_ptr++;
1448 }
1449 }
1450
1451 memcpy(ret_start+ start, tmp_start, sizeof(srInt_64) * total_items);
1452 memcpy(ret_end+ start, tmp_end, sizeof(srInt_64) * total_items);
1453 memcpy(ret_strand+ start, tmp_strand, sizeof(char) * total_items);
1454 memcpy(ret_entyrez+ start, tmp_entyrez, sizeof(int) * total_items);
1455 memcpy(old_info_ptr+ start, tmp_info_ptr, sizeof(fc_feature_info_t*) * total_items);
1456
1457 free(tmp_start);
1458 free(tmp_end);
1459 free(tmp_strand);
1460 free(tmp_entyrez);
1461 free(tmp_info_ptr);
1462 }
1463
1464
feature_sort_compare(void * arrv,int l,int r)1465 int feature_sort_compare(void * arrv, int l, int r)
1466 {
1467 void ** arr = (void **) arrv;
1468 srInt_64 * ret_start = (srInt_64 *)arr[0];
1469 srInt_64 ll = ret_start[l];
1470 srInt_64 rl = ret_start[r];
1471
1472 if(ll==rl) return 0;
1473 else if(ll>rl) return 1;
1474 else return -1;
1475 }
1476
feature_sort_exchange(void * arrv,int l,int r)1477 void feature_sort_exchange(void * arrv, int l, int r)
1478 {
1479 void ** arr = (void **) arrv;
1480 srInt_64 tmp;
1481 fc_feature_info_t * tmpptr;
1482
1483 srInt_64 * ret_start = (srInt_64 *) arr[0];
1484 srInt_64 * ret_end = (srInt_64 *) arr[1];
1485 unsigned char * ret_strand = (unsigned char *) arr[2];
1486 int * ret_entyrez = (int *) arr[3];
1487 fc_feature_info_t ** old_info_ptr = (fc_feature_info_t **) arr[4];
1488
1489
1490 tmp = ret_start[r];
1491 ret_start[r]=ret_start[l];
1492 ret_start[l]=tmp;
1493
1494 tmp = ret_end[r];
1495 ret_end[r]=ret_end[l];
1496 ret_end[l]=tmp;
1497
1498 tmp = ret_strand[r];
1499 ret_strand[r]=ret_strand[l];
1500 ret_strand[l]=tmp;
1501
1502 tmp = ret_entyrez[r];
1503 ret_entyrez[r]=ret_entyrez[l];
1504 ret_entyrez[l]=tmp;
1505
1506 tmpptr = old_info_ptr[r];
1507 old_info_ptr[r]=old_info_ptr[l];
1508 old_info_ptr[l]=tmpptr;
1509
1510 }
1511
1512
1513
sort_feature_info(fc_thread_global_context_t * global_context,unsigned int features,fc_feature_info_t * loaded_features,char *** sorted_chr_names,int ** sorted_entrezid,srInt_64 ** sorted_start,srInt_64 ** sorted_end,unsigned char ** sorted_strand,char ** anno_chr_2ch,char *** anno_chrs,srInt_64 ** anno_chr_head,srInt_64 ** block_end_index,srInt_64 ** block_min_start_pos,srInt_64 ** block_max_end_pos)1514 void sort_feature_info(fc_thread_global_context_t * global_context, unsigned int features, fc_feature_info_t * loaded_features, char *** sorted_chr_names, int ** sorted_entrezid, srInt_64 ** sorted_start, srInt_64 ** sorted_end, unsigned char ** sorted_strand, char ** anno_chr_2ch, char *** anno_chrs, srInt_64 ** anno_chr_head, srInt_64 ** block_end_index, srInt_64 ** block_min_start_pos, srInt_64 ** block_max_end_pos)
1515 {
1516 unsigned int chro_pnt;
1517 unsigned int xk1,xk2;
1518 int * ret_entrez = malloc(sizeof(int) * features);
1519 srInt_64 * ret_start = malloc(sizeof(srInt_64) * features);
1520 srInt_64 * ret_end = malloc(sizeof(srInt_64) * features);
1521 int current_block_buffer_size = 2000;
1522
1523 srInt_64 * ret_block_end_index = malloc(sizeof(srInt_64) * current_block_buffer_size);
1524 srInt_64 * ret_block_min_start = malloc(sizeof(srInt_64) * current_block_buffer_size);
1525 srInt_64 * ret_block_max_end = malloc(sizeof(srInt_64) * current_block_buffer_size);
1526 unsigned char * ret_strand = malloc(features);
1527 char ** ret_char_name = malloc(sizeof(void *) * features);
1528 fc_feature_info_t ** old_info_ptr = malloc(sizeof(void *) * features);
1529 (*anno_chrs) = malloc(sizeof(void *) * global_context -> exontable_nchrs);
1530 (*anno_chr_head) = malloc(sizeof(srInt_64) * global_context -> exontable_nchrs);
1531 (*anno_chr_2ch) = malloc(sizeof(char) * global_context -> exontable_nchrs*2);
1532 unsigned int * chro_feature_ptr = calloc(sizeof(int) * global_context -> exontable_nchrs,1);
1533 fc_chromosome_index_info ** tmp_chro_info_ptrs = malloc(global_context -> exontable_nchrs * sizeof(fc_chromosome_index_info *));
1534
1535 global_context -> gene_name_array = malloc(sizeof(char *) * features); // there should be much less identical names.
1536 global_context -> gene_name_table = HashTableCreate(5000);
1537 HashTableSetHashFunction(global_context -> gene_name_table, HashTableStringHashFunction);
1538 HashTableSetKeyComparisonFunction(global_context -> gene_name_table, fc_strcmp);
1539
1540 // init start positions of each chromosome block.
1541 if(1)
1542 {
1543 KeyValuePair * cursor;
1544 int bucket;
1545 unsigned int sum_ptr = 0;
1546 for(bucket=0; bucket < global_context -> exontable_chro_table -> numOfBuckets; bucket++)
1547 {
1548 cursor = global_context -> exontable_chro_table -> bucketArray[bucket];
1549 while (1)
1550 {
1551 if (!cursor) break;
1552 fc_chromosome_index_info * tmp_chro_inf = cursor -> value;
1553 cursor = cursor->next;
1554 //tmp_chro_inf -> reverse_table_end_index = calloc(sizeof(int), tmp_chro_inf->chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH +2);
1555 chro_feature_ptr [tmp_chro_inf -> chro_number] = tmp_chro_inf -> chro_features;
1556 tmp_chro_info_ptrs[tmp_chro_inf -> chro_number] = tmp_chro_inf;
1557 }
1558 }
1559
1560 for(xk1 = 0; xk1 < global_context -> exontable_nchrs; xk1++)
1561 {
1562 unsigned int tmpv = chro_feature_ptr[xk1];
1563 chro_feature_ptr[xk1] = sum_ptr;
1564 tmp_chro_info_ptrs[xk1] -> chro_feature_table_start = sum_ptr;
1565 // printf("SII=%u + %u\n", sum_ptr, tmpv);
1566 sum_ptr += tmpv;
1567 }
1568
1569 }
1570 int current_block_id = 0, sort_i = 0;
1571
1572 (*sorted_chr_names) = ret_char_name;
1573 (*sorted_entrezid) = ret_entrez;
1574 (*sorted_start) = ret_start;
1575 (*sorted_end) = ret_end;
1576 (*sorted_strand) = ret_strand;
1577 int curr_chro_number = 0;
1578
1579 for(chro_pnt=0; chro_pnt < features; chro_pnt++)
1580 {
1581 char * this_chro_name = global_context -> unistr_buffer_space + loaded_features[chro_pnt].feature_name_pos + loaded_features[chro_pnt].chro_name_pos_delta;
1582 fc_chromosome_index_info * this_chro_info = HashTableGet(global_context -> exontable_chro_table , this_chro_name);
1583 assert(this_chro_info);
1584 unsigned int this_chro_number = this_chro_info -> chro_number;
1585 unsigned int this_chro_table_ptr = chro_feature_ptr[this_chro_number];
1586
1587 ret_char_name[this_chro_table_ptr] = this_chro_name;// (char *)loaded_features[chro_pnt].chro;
1588 ret_entrez[this_chro_table_ptr] = find_or_insert_gene_name(global_context, (unsigned char *)(global_context -> unistr_buffer_space + loaded_features[chro_pnt].feature_name_pos));
1589 ret_start[this_chro_table_ptr] = loaded_features[chro_pnt].start;
1590 ret_end[this_chro_table_ptr] = loaded_features[chro_pnt].end;
1591 ret_strand[this_chro_table_ptr] = loaded_features[chro_pnt].is_negative_strand;
1592 old_info_ptr[this_chro_table_ptr] = &loaded_features[chro_pnt];
1593
1594 chro_feature_ptr[this_chro_number]++;
1595 }
1596
1597 for(xk1 = 0; xk1 < global_context -> exontable_nchrs; xk1++)
1598 {
1599 fc_chromosome_index_info * tmp_chro_inf = tmp_chro_info_ptrs[xk1];
1600 int bins_in_chr = ( tmp_chro_inf->chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH +2);
1601 short * features_per_block_bins = malloc(sizeof(short)*bins_in_chr);
1602 for(xk2=0; xk2<bins_in_chr; xk2++)
1603 {
1604 features_per_block_bins[xk2] = max(1,min(1000,(int)(0.9999999+sqrt(tmp_chro_inf -> reverse_table_start_index[xk2]))));
1605 //printf("CHR%d : SQR[%d]=%d (%d)\n", tmp_chro_inf -> chro_number,xk2, features_per_block_bins[xk2], tmp_chro_inf -> reverse_table_start_index[xk2] );
1606 }
1607
1608 memset(tmp_chro_inf -> reverse_table_start_index, 0xff, sizeof(int) *bins_in_chr);
1609
1610 tmp_chro_inf -> chro_block_table_start = current_block_id;
1611 unsigned int this_block_items = 0;
1612 srInt_64 this_block_min_start = 0x7fffffff, this_block_max_end = 0;
1613 unsigned int this_chro_tab_end = tmp_chro_inf -> chro_features + tmp_chro_inf -> chro_feature_table_start;
1614
1615 void * in_array[5];
1616 in_array[0] = ret_start + tmp_chro_inf -> chro_feature_table_start;
1617 in_array[1] = ret_end + tmp_chro_inf -> chro_feature_table_start;
1618 in_array[2] = ret_strand + tmp_chro_inf -> chro_feature_table_start;
1619 in_array[3] = ret_entrez + tmp_chro_inf -> chro_feature_table_start;
1620 in_array[4] = old_info_ptr + tmp_chro_inf -> chro_feature_table_start;
1621
1622 merge_sort(in_array, this_chro_tab_end - tmp_chro_inf -> chro_feature_table_start, feature_sort_compare, feature_sort_exchange, feature_merge);
1623
1624 for(sort_i = tmp_chro_inf -> chro_feature_table_start; sort_i< this_chro_tab_end ; sort_i++)
1625 {
1626 // NOW THE FEATURES (ret_start, ret_end, ret_strand, ret_entrez, old_info_ptr) ARE ALL SORTED!
1627 //printf("NT=%lu\tCHRO=%d\n", ret_start[sort_i], tmp_chro_inf->chro_number);
1628 old_info_ptr[sort_i]->sorted_order = sort_i;
1629
1630 int feature_bin_location = ret_start[sort_i] / REVERSE_TABLE_BUCKET_LENGTH;
1631 int block_bin_location = this_block_min_start / REVERSE_TABLE_BUCKET_LENGTH;
1632
1633 if(this_block_items && (this_block_items > features_per_block_bins[block_bin_location] || feature_bin_location != block_bin_location))//global_context -> feature_block_size)
1634 {
1635
1636 if(current_block_id >= current_block_buffer_size - 1)
1637 {
1638 current_block_buffer_size *= 1.3;
1639 ret_block_min_start = realloc(ret_block_min_start, sizeof(srInt_64)*current_block_buffer_size);
1640 ret_block_max_end = realloc(ret_block_max_end, sizeof(srInt_64)*current_block_buffer_size);
1641 ret_block_end_index = realloc(ret_block_end_index, sizeof(srInt_64)*current_block_buffer_size);
1642 }
1643
1644
1645 ret_block_end_index[current_block_id] = sort_i; // FIRST UNWANTED ID
1646 ret_block_min_start[current_block_id] = this_block_min_start;
1647 ret_block_max_end[current_block_id] = this_block_max_end;
1648 register_reverse_table(current_block_id, this_block_min_start, this_block_max_end, tmp_chro_inf);
1649 //printf("B=%d; ST=%ld, END=%ld, ITM=%d\n", current_block_id, this_block_min_start, this_block_max_end, this_block_items);
1650 current_block_id++;
1651 this_block_max_end = 0;
1652 this_block_items = 0;
1653 this_block_min_start = 0x7fffffff;
1654 }
1655
1656 this_block_max_end = max(this_block_max_end, ret_end[sort_i]);
1657 this_block_min_start = min(this_block_min_start, ret_start[sort_i]);
1658 this_block_items ++;
1659
1660 }
1661 if(this_block_items)
1662 {
1663 if(current_block_id >= current_block_buffer_size)
1664 {
1665 current_block_buffer_size *= 1.3;
1666 ret_block_min_start = realloc(ret_block_min_start, sizeof(srInt_64)*current_block_buffer_size);
1667 ret_block_max_end = realloc(ret_block_max_end, sizeof(srInt_64)*current_block_buffer_size);
1668 ret_block_end_index = realloc(ret_block_end_index, sizeof(srInt_64)*current_block_buffer_size);
1669 }
1670
1671 ret_block_end_index[current_block_id] = this_chro_tab_end; // FIRST UNWANTED ID
1672 ret_block_min_start[current_block_id] = this_block_min_start;
1673 ret_block_max_end[current_block_id] = this_block_max_end;
1674 register_reverse_table(current_block_id, this_block_min_start, this_block_max_end, tmp_chro_inf);
1675 current_block_id++;
1676 }
1677
1678 (*anno_chr_head) [curr_chro_number] = current_block_id;
1679 tmp_chro_inf -> chro_block_table_end = current_block_id;
1680 free(features_per_block_bins);
1681 }
1682
1683 (*block_end_index) = ret_block_end_index;
1684 (*block_min_start_pos) = ret_block_min_start;
1685 (*block_max_end_pos) = ret_block_max_end;
1686
1687 //print_in_box(80, 0,0,"The %u features are sorted.\n", sort_i);
1688 free(old_info_ptr);
1689 free(tmp_chro_info_ptrs);
1690 free(chro_feature_ptr);
1691 }
1692
/*
 * Compare two names, examining s1 only up to its first '/' (or its end).
 *
 * Returns 0 when every character of s1 before that stop position equals the
 * character at the same index in s2 AND s2 holds, at the stop position, the
 * same character that stopped s1 ('/' or NUL).  Returns 1 otherwise.
 * E.g. "r/1" vs "r/2" -> 0, "r" vs "r" -> 0, "r/1" vs "r" -> 1.
 */
int strcmp_slash(char * s1, char * s2)
{
	int idx;

	for(idx = 0; s1[idx] != 0 && s1[idx] != '/'; idx++){
		if(s1[idx] != s2[idx]) return 1;
	}

	// s1[idx] is now NUL or '/': s2 must carry the same character here.
	return s1[idx] != s2[idx];
}
1703
// Fixed-point scale for fractional read counts: one whole read is stored as
// NH_FRACTION_INT units (see calc_fixed_fraction and calc_float_fraction).
#define NH_FRACTION_INT 65536
1705
/*
 * Return the share of a fixed-point count given to one of several overlapping targets.
 *
 * When global_context->use_fraction_multi_mapping is set, fixed_fractional_count
 * is divided by maximum_total_count (integer division; maximum_total_count must
 * not be zero).  Otherwise fixed_fractional_count is returned unchanged.
 */
unsigned int calculate_multi_overlap_fraction(fc_thread_global_context_t * global_context, unsigned int fixed_fractional_count, int maximum_total_count){
	if(!global_context -> use_fraction_multi_mapping)
		return fixed_fractional_count;

	return fixed_fractional_count / maximum_total_count;
}
1711
calc_fixed_fraction(int nh)1712 unsigned int calc_fixed_fraction(int nh){
1713 if(nh==1) return NH_FRACTION_INT;
1714 else if(nh == 2) return NH_FRACTION_INT>>1;
1715 else return NH_FRACTION_INT / nh;
1716 }
1717
1718
/*
 * Convert a fixed-point count (scaled by NH_FRACTION_INT) for output.
 *
 * If `score` is an exact multiple of NH_FRACTION_INT, the whole number is
 * stored in *integer_count and 0 is returned (*float_count is untouched).
 * Otherwise the fractional value is stored in *float_count and 1 is returned
 * (*integer_count is untouched).
 */
int calc_float_fraction(read_count_type_t score, read_count_type_t * integer_count, double * float_count){
	int has_fraction = (score % NH_FRACTION_INT != 0);

	if(has_fraction)
		(*float_count) = score * 1./NH_FRACTION_INT;
	else
		(*integer_count) = score / NH_FRACTION_INT;

	return has_fraction;
}
1728
1729
print_read_wrapping(char * rl,int is_second)1730 void print_read_wrapping(char * rl, int is_second){
1731 int refill_spaces = 3;
1732
1733 int read_length = 0, x1 = 0, spaces=0;
1734
1735 for(x1 = 0; x1 < 3100; x1++){
1736 if(rl[x1]==0 && rl[x1+1]==0)break;
1737 if(rl[x1]=='0' || rl[x1]=='\t') spaces++;
1738 read_length ++;
1739 }
1740
1741 char *out_buf1 = malloc(read_length + spaces * refill_spaces + 1), out_buf2[100];
1742 int ox=0;
1743
1744 for(x1 = 0; x1 < 3000; x1++){
1745 if(rl[x1]=='\n' || (rl[x1]==0 && rl[x1+1]==0)){
1746 out_buf1[ox]=0;
1747 break;
1748 } else if((rl[x1]==0 && rl[x1+1]!=0) || rl[x1] == '\t'){
1749 int x2;
1750 for(x2 = 0; x2 < refill_spaces ; x2++){
1751 out_buf1[ox]=' ';
1752 ox++;
1753 }
1754 } else {
1755 out_buf1[ox]=rl[x1];
1756 ox++;
1757 }
1758 }
1759 out_buf1[ox] = 0;
1760
1761 x1=0;
1762
1763 while(1){
1764 int x2;
1765 for(x2 = 0; x2 < 67 ; x2 ++){
1766 char nch = out_buf1[x1];
1767 if(nch == 0) break;
1768 out_buf2[x2] = nch;
1769 x1++;
1770 }
1771 out_buf2[x2] = 0;
1772
1773 print_in_box(80,0,PRINT_BOX_NOCOLOR_FOR_COLON," %s", out_buf2);
1774 if(out_buf1[x1] == 0)break;
1775 }
1776
1777 free(out_buf1);
1778
1779 }
1780
1781
/*
 * Value deallocator for the per-thread RG table (see process_pairer_reset).
 *
 * pt is treated as an array of four pointers: entries 0 and 1 are released
 * with free(); entries 2 and 3 are hash tables that are destroyed together,
 * but only when entry 2 is non-NULL.  The array itself is freed last.
 */
void disallocate_RG_tables(void * pt){
	void ** tables = (void **) pt;
	int idx;

	for(idx = 0; idx < 2; idx++)
		free(tables[idx]);

	if(tables[2] != NULL){
		HashTableDestroy(tables[2]);
		HashTableDestroy(tables[3]);
	}

	free(pt);
}
1792
1793
process_pairer_reset(void * pairer_vp)1794 void process_pairer_reset(void * pairer_vp){
1795 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
1796 fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
1797 if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
1798 global_context -> sambam_chro_table = NULL;
1799 global_context -> sambam_chro_table_items = 0;
1800 if(global_context -> assign_reads_to_RG) free(global_context -> RGnames_set);
1801 if(global_context -> do_scRNA_table){
1802 SUBREADprintf("ERROR: the BAM input is incompatible with scRNA\n");
1803 assert(0);
1804 }
1805
1806 int xk1, xk2;
1807 for(xk1=0; xk1<global_context-> thread_number; xk1++)
1808 {
1809 for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
1810 {
1811 global_context -> thread_contexts[xk1].count_table[xk2] = 0;
1812 }
1813
1814 global_context -> thread_contexts[xk1].del4_added_reads = 0;
1815
1816 global_context -> thread_contexts[xk1].all_reads = 0;
1817 global_context -> thread_contexts[xk1].nreads_mapped_to_exon = 0;
1818
1819 global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous = 0;
1820 global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures = 0;
1821 global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped = 0;
1822 global_context -> thread_contexts[xk1].read_counters.unassigned_singleton = 0;
1823 global_context -> thread_contexts[xk1].read_counters.unassigned_read_type = 0;
1824 global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality = 0;
1825 global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength = 0;
1826 global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads = 0;
1827 global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping = 0;
1828 global_context -> thread_contexts[xk1].read_counters.unassigned_secondary = 0;
1829 global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition = 0;
1830 global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate = 0;
1831 global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length = 0;
1832 global_context -> thread_contexts[xk1].read_counters.assigned_reads = 0;
1833 global_context -> thread_contexts[xk1].read_details_buff_used = 0;
1834
1835 if(global_context -> do_junction_counting)
1836 {
1837 HashTableDestroy(global_context -> thread_contexts[xk1].junction_counting_table);
1838 global_context -> thread_contexts[xk1].junction_counting_table = HashTableCreate(131317);
1839 HashTableSetHashFunction(global_context -> thread_contexts[xk1].junction_counting_table,HashTableStringHashFunction);
1840 HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].junction_counting_table, free, NULL);
1841 HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].junction_counting_table, fc_strcmp_chro);
1842
1843 HashTableDestroy(global_context -> thread_contexts[xk1].splicing_point_table);
1844 global_context -> thread_contexts[xk1].splicing_point_table = HashTableCreate(131317);
1845 HashTableSetHashFunction(global_context -> thread_contexts[xk1].splicing_point_table,HashTableStringHashFunction);
1846 HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].splicing_point_table, free, NULL);
1847 HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].splicing_point_table, fc_strcmp_chro);
1848 }
1849
1850 if(global_context -> assign_reads_to_RG){
1851 HashTableDestroy(global_context -> thread_contexts[xk1].RG_table);
1852 global_context -> thread_contexts[xk1].RG_table = HashTableCreate(97);
1853 HashTableSetHashFunction(global_context -> thread_contexts[xk1].RG_table,HashTableStringHashFunction);
1854 HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].RG_table, free, disallocate_RG_tables);
1855 HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].RG_table, fc_strcmp_chro);
1856 }
1857
1858
1859 }
1860
1861 if(global_context -> read_details_out_FP){
1862 int tranc_ret = ftruncate(fileno(global_context -> read_details_out_FP), 0);
1863 if(0 != tranc_ret) SUBREADprintf("ERROR: Unable to truncate assignment detail file\n");
1864 fseek(global_context -> read_details_out_FP, 0 , SEEK_SET);
1865 }
1866 }
1867
/*
 * Check the first `l` bytes of a contig name.
 *
 * Returns 1 when every byte is either NUL or a printable, non-space ASCII
 * character ('!' .. '~'); returns 0 as soon as any other byte is found.
 * A length of zero (or less) yields 1.
 */
int is_value_contig_name(char * n, int l){
	int pos = 0;

	while(pos < l){
		char ch = n[pos++];
		if(ch != 0 && (ch > '~' || ch < '!')) return 0;
	}
	return 1;
}
1876
/*
 * CRC-32 of `len` bytes starting at `dat`, computed with zlib's crc32()
 * starting from zlib's initial CRC value.
 */
unsigned int FC_CRC32(char * dat, int len){
	unsigned int initial_crc = crc32(0, NULL, 0);

	return crc32(initial_crc, (unsigned char *)dat, len);
}
1882
// Index data collected for one chromosome while a BAM file is written
// (filled by simple_bam_writer_update_index, written out by simple_bam_close).
struct simple_bam_writer_index_per_chro{
	HashTable * index_binP1_table;	// key: bin number + 1 ; value: ArrayList of chunk offsets stored as (start, end) pairs
	ArrayList * index_binP0_list;	// the bin numbers (not shifted by one) in the order they were first seen
	ArrayList * win16k_list;	// one offset per 16384-base window (window = coordinate >> 14)
};
1888
simple_bam_writer_deallocate_index_per_chro(void * p)1889 void simple_bam_writer_deallocate_index_per_chro(void * p){
1890 struct simple_bam_writer_index_per_chro * ch = p;
1891 HashTableDestroy(ch->index_binP1_table);
1892 ArrayListDestroy(ch->index_binP0_list);
1893 ArrayListDestroy(ch->win16k_list);
1894 free(ch);
1895 }
1896
1897
simple_bam_writer_new_index_per_chro()1898 struct simple_bam_writer_index_per_chro * simple_bam_writer_new_index_per_chro(){
1899 struct simple_bam_writer_index_per_chro * ret = malloc(sizeof(struct simple_bam_writer_index_per_chro ));
1900 ret -> index_binP1_table = HashTableCreate(4000);
1901 HashTableSetDeallocationFunctions(ret -> index_binP1_table, NULL, (void (*) (void*)) ArrayListDestroy);
1902 ret -> index_binP0_list = ArrayListCreate(20000);
1903 ret -> win16k_list = ArrayListCreate(20000);
1904 return ret;
1905 }
1906
// Size in bytes of the pending-data buffer below and of the compression
// output buffer used by simple_bam_write.
#define MERGER_WORKER_BINSIZE 66000
// State of one BAM output file together with its ".bai" index file.
typedef struct {
	FILE * bam_FP;		// the BAM file being written
	FILE * bai_FP;		// the index file (same name plus ".bai")
	z_stream strm;		// zlib stream, (re)initialised for every block in simple_bam_write
	char inbin[MERGER_WORKER_BINSIZE];	// uncompressed bytes waiting for the next block (flushed at 63000 bytes)
	int inbin_len;		// number of valid bytes in inbin
	int total_chromosomes;	// number of reference sequences; set when the header is written
	HashTable * bam_blockP1_to_offset0B_table;	// key: block number + 1 ; value: file offset at which that block starts
	HashTable * index_per_chro;	// key: chromosome number + 1 ; value: struct simple_bam_writer_index_per_chro *
} simple_bam_writer;
1918
// A read extends the last chunk of its bin only when its block number is fewer
// than this many blocks after the block in which that chunk currently ends;
// otherwise a new chunk is started (see simple_bam_writer_update_index).
#define MAX_ALLOWED_GAP_IN_BAI_CHUNK 10 // 10 blocks
1920
/*
 * Record one alignment in the in-memory index of `writer`.
 *
 * rbin         : the alignment record; 32-bit fields are read at byte offsets
 *                4 (chromosome number), 8 (position) and 12 (bin / MAPQ /
 *                name length).  Presumably a BAM record that starts with its
 *                4-byte block_size -- confirm against the caller.
 * binlen       : length added (plus 4) to the start offset to obtain the chunk end.
 * block_number : number of the block that will hold this record.
 * inbin_pos    : offset of the record inside that block.
 *
 * Records with a negative chromosome number are not indexed.
 */
void simple_bam_writer_update_index(simple_bam_writer * writer, char * rbin, int binlen, srInt_64 block_number, int inbin_pos){
	int chro_no=0;
	memcpy(&chro_no, rbin + 4, 4);
	if(chro_no<0)return;

	unsigned int pos=0, bin_mq_nl=0;
	memcpy(&pos, rbin + 8, 4);
	memcpy(&bin_mq_nl, rbin + 12, 4);

	// Hash keys are stored as "NULL + number + 1" so that number 0 does not become a NULL key.
	struct simple_bam_writer_index_per_chro * index_chro = HashTableGet(writer -> index_per_chro, NULL+chro_no+1);
	if(NULL==index_chro){
		index_chro = simple_bam_writer_new_index_per_chro();
		HashTablePut(writer -> index_per_chro, NULL+chro_no+1, index_chro);
	}

	unsigned int binno = bin_mq_nl>>16;	// the bin number is the upper 16 bits of the field
	int cigar_span = SamBam_writer_calc_cigar_span(rbin +4);
	int this_w16_no = (pos + cigar_span) >>14; // The window number is calculated from the 0-based position.
	// Offset of this record: block number in the upper bits, in-block position in the lower 16 bits.
	unsigned long long this_Vpos = block_number<<16 | inbin_pos;
	ArrayList * win16k_list = index_chro -> win16k_list;
	// If this read lies beyond the last window recorded so far, every window from the current end of the list up to this read's window is set to start at this read.
	// NOTE(review): with '>' nothing is pushed when this_w16_no equals the current number of elements -- confirm that this is intended.
	if(this_w16_no > win16k_list->numOfElements){
		int bbi;
		for(bbi = win16k_list->numOfElements; bbi <=this_w16_no; bbi++)
			ArrayListPush(win16k_list, NULL+ this_Vpos);
	}

	ArrayList * this_bin_chunks = HashTableGet(index_chro -> index_binP1_table, NULL+binno+1);
	if(NULL == this_bin_chunks){
		// First read in this bin: create its chunk list and remember the bin number.
		this_bin_chunks = ArrayListCreate(4);
		HashTablePut(index_chro -> index_binP1_table, NULL+binno+1, this_bin_chunks);
		ArrayListPush(index_chro -> index_binP0_list, NULL+binno);
	}
	int found = 0;
	// A bin is not necessarily contiguous. For example, a top-level bin may contain only a few reads (most reads are in low-level bins), but their locations can be anywhere in the file.

	if(this_bin_chunks -> numOfElements > 0){
		// Distance, in blocks, between this read and the end of the last chunk of the bin.
		long long diff = this_Vpos >>16;
		diff -=(this_bin_chunks -> elementList [ this_bin_chunks -> numOfElements - 1] - NULL)>>16;
		if(diff < MAX_ALLOWED_GAP_IN_BAI_CHUNK){
			// Close enough: move the end of the last chunk to the end of this read.
			this_bin_chunks -> elementList [ this_bin_chunks -> numOfElements - 1] = NULL+this_Vpos + binlen + 4;
			found = 1;
		}
	}
	// If the last chunk of this bin cannot be extended (it ends too far from the file location of the new read), a new chunk (start, end) is created.
	if(!found){
		ArrayListPush(this_bin_chunks, NULL + this_Vpos);
		ArrayListPush(this_bin_chunks, NULL + this_Vpos + binlen+4);
	}
}
1971
/*
 * Write one compressed block to the BAM file: a 12-byte gzip header, the
 * 4-byte "BC" extra sub-field header, the 2-byte block size, the deflated
 * payload, then the CRC32 and the uncompressed length (4 bytes each).
 *
 * obuf / olen  : deflated payload and its length.
 * ilen         : length of the data before compression.
 * crcval       : CRC32 of the data before compression.
 * block_number : when >= 0, the current file offset is remembered for this
 *                block (key = block number + 1); negative values skip that.
 */
void simple_bam_write_compressed_block(simple_bam_writer * writer,char *obuf, int olen, int ilen, unsigned int crcval, srInt_64 block_number){
	FILE * out = writer -> bam_FP;

	if(block_number >= 0){
		HashTablePut(writer -> bam_blockP1_to_offset0B_table, NULL+1+block_number, NULL+ftello(out));
	}

	fwrite("\x1f\x8b\x8\x4\0\0\0\0\0\0\x6\0", 1, 12, out);
	fwrite("\x42\x43\x2\0", 1, 4, out);

	// 12 + 4 + 2 header bytes, olen payload bytes and 8 trailer bytes, minus one.
	int BSIZE = olen+19+6;
	fwrite(&BSIZE, 1, 2, out);

	fwrite(obuf, 1, olen, out);
	fwrite(&crcval, 1, 4, out);
	fwrite(&ilen, 1, 4, out);
}
1983
/*
 * Append `binlen` bytes from `bin` to the writer's pending buffer, emitting a
 * compressed block whenever 63000 bytes are pending.
 *
 * force_flush : when non-zero, whatever is pending after each copy is
 *               compressed and written at once, so the buffer is empty on return.
 *
 * Blocks are written with block number -1, i.e. their file offsets are not recorded.
 */
void simple_bam_write(void * bin, int binlen, simple_bam_writer * writer, int force_flush){
	int mem_level = 8;
	char * src = bin;
	int remaining = binlen;

	for(;;){
		if(remaining <= 0 && !(force_flush && writer->inbin_len)) break;

		// Move as much input as fits below the 63000-byte limit.
		int take = min(remaining, 63000 - writer->inbin_len);
		memcpy(writer->inbin+writer->inbin_len, src, take);
		writer->inbin_len += take;
		src += take;
		remaining -= take;

		if(writer->inbin_len < 63000 && !force_flush) continue;

		// Compress the pending bytes as raw deflate data and write the block.
		deflateInit2(&writer -> strm, Z_BEST_SPEED, Z_DEFLATED, -15, mem_level, Z_DEFAULT_STRATEGY);
		char obuf[MERGER_WORKER_BINSIZE];
		writer->strm.next_in = (unsigned char *)writer->inbin;
		writer->strm.avail_in = writer->inbin_len;
		writer->strm.next_out = (unsigned char *)obuf;
		writer->strm.avail_out = MERGER_WORKER_BINSIZE;
		deflate(&writer->strm, Z_FINISH);

		int compressed_len = MERGER_WORKER_BINSIZE-writer->strm.avail_out;
		unsigned int crc = FC_CRC32(writer->inbin, writer->inbin_len);
		simple_bam_write_compressed_block(writer, obuf, compressed_len, writer->inbin_len, crc, -1);
		writer->inbin_len=0;
		deflateEnd(&writer -> strm);
	}
}
2008
simple_bam_create(char * fname)2009 simple_bam_writer * simple_bam_create(char * fname){
2010 simple_bam_writer * ret = calloc(sizeof(simple_bam_writer), 1);
2011 ret -> bam_FP = fopen(fname, "wb");
2012 ret -> bam_blockP1_to_offset0B_table = HashTableCreate(100000);
2013 simple_bam_write("BAM\1", 4, ret, 0);
2014
2015 char bainame [strlen(fname)+10];
2016 strcpy(bainame , fname);
2017 strcat(bainame, ".bai");
2018 ret -> bai_FP = fopen(bainame, "wb");
2019 fwrite("BAI\1", 1, 4, ret -> bai_FP);
2020 ret -> index_per_chro = HashTableCreate(1000);
2021 HashTableSetDeallocationFunctions(ret -> index_per_chro , NULL , simple_bam_writer_deallocate_index_per_chro);
2022 return ret;
2023 }
2024
// Statement macro; needs `writer`, `vposone` and `rposone` in scope.
// vposone holds a block number in its upper bits and an in-block position in
// its lower 16 bits; rposone receives the same in-block position combined with
// the recorded file offset of that block (looked up by block number + 1).
#define vpos_to_rpos rposone = (vposone & 0xffff ) + ( (HashTableGet(writer -> bam_blockP1_to_offset0B_table, NULL+1+(vposone >>16)) - NULL) << 16 )

// The 28 bytes appended to the BAM file by simple_bam_close: an empty BGZF block, which marks the end of the file.
#define BAM_EOF_MARKER "\x1f\x8b\x08\x04\0\0\0\0\0\xff\x06\0\x42\x43\x02\0\x1b\0\x03\0\0\0\0\0\0\0\0\0"
/*
 * Finish a BAM file and its index, then release the writer.
 *
 * Appends the end-of-file marker to the BAM file and closes it; writes the
 * index file (chromosome count, then for every chromosome its bins with their
 * chunks followed by its 16k-window offsets; chromosomes without any indexed
 * read get two zero counts) and closes it; finally destroys the writer's
 * tables and frees the writer.  Pending data is not flushed here.
 */
void simple_bam_close(simple_bam_writer * writer){
	fwrite(BAM_EOF_MARKER, 1, 28, writer -> bam_FP);
	fclose(writer -> bam_FP);

	fwrite(&writer -> total_chromosomes, 1, 4, writer -> bai_FP);
	int chri;
	for(chri=0; chri<writer -> total_chromosomes; chri++){
		struct simple_bam_writer_index_per_chro *this_idx = HashTableGet(writer -> index_per_chro , NULL+1+chri);
		if(NULL == this_idx ){
			fwrite("\0\0\0\0\0\0\0\0", 1, 8, writer -> bai_FP);//0 intervals and 0 bins
		}else{
			HashTable * new_tab=NULL;
			ArrayList * new_arr=NULL;
			// NOTE(review): the previous table/list are replaced by the optimised ones;
			// presumably SamBam_writer_optimize_bins releases the old ones -- confirm.
			SamBam_writer_optimize_bins(this_idx -> index_binP1_table, this_idx -> index_binP0_list ,& new_tab, & new_arr);
			this_idx -> index_binP1_table = new_tab;
			this_idx -> index_binP0_list = new_arr;
			fwrite(&this_idx -> index_binP0_list->numOfElements ,1, 4, writer -> bai_FP);
			int bini;
			for(bini = 0; bini < this_idx -> index_binP0_list -> numOfElements; bini ++){
				int binno = ArrayListGet(this_idx -> index_binP0_list, bini)-NULL;
				ArrayList * bingaps = HashTableGet(this_idx -> index_binP1_table, NULL+1+binno);
				// The list stores (start, end) pairs, hence half as many chunks as elements.
				srInt_64 gapi = bingaps -> numOfElements/2;
				fwrite(&binno, 1, 4, writer -> bai_FP);
				fwrite(&gapi ,1, 4, writer -> bai_FP);
				for(gapi = 0; gapi < bingaps -> numOfElements; gapi++){
					srInt_64 rposone, vposone = ArrayListGet(bingaps , gapi)-NULL;
					vpos_to_rpos;
					fwrite(&rposone, 1, 8, writer -> bai_FP);
				}
			}

			fwrite(&this_idx -> win16k_list -> numOfElements ,1, 4, writer -> bai_FP);
			for(bini = 0; bini < this_idx -> win16k_list -> numOfElements; bini ++){
				srInt_64 rposone, vposone = ArrayListGet(this_idx -> win16k_list , bini )-NULL;
				vpos_to_rpos;
				fwrite(&rposone, 1, 8, writer -> bai_FP);
			}
		}
	}
	HashTableDestroy(writer -> index_per_chro);
	// The block-offset table was created in simple_bam_create and is only
	// reachable through the writer: release it too, or it leaks with the free below.
	HashTableDestroy(writer -> bam_blockP1_to_offset0B_table);
	fclose(writer -> bai_FP);
	free(writer);
}
2071
2072
// this_memmem is the byte-buffer substring search used in this file:
// windows_memmem on MinGW builds (presumably a project-provided replacement -- confirm),
// memmem everywhere else.
#ifdef __MINGW32__
#define this_memmem windows_memmem
#else
#define this_memmem memmem
#endif
2078
2079
// Forward declarations: both functions are called by process_pairer_header below.
void ** get_RG_tables(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * rg_name);
int compress_read_detail_BAM(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, int write_start, int write_end, char * bam_buf);
2082
/*
 * Hash-table iteration callback: write the BAM header of one per-sample BAM file.
 *
 * k   : key of the entry (not used).
 * v   : array of pointers whose first element is the simple_bam_writer.
 * tab : the iterated table; tab->appendix1 is the SAM header text and
 *       tab->counter1 its length in bytes.
 *
 * The header text is scanned line by line: the SN / LN values of every "@SQ"
 * line are collected, and a line containing the cellCounts marker is noted.
 * Then the text length, the text (a final newline is added if missing, and
 * the "@CO" marker line if it was not found), the number of sequences and one
 * (name length, name, sequence length) record per sequence are written, the
 * writer is flushed and writer->total_chromosomes is set.
 */
void scRNA_sample_SamBam_writers_add_header(void * k, void * v, HashTable * tab){
	char *bin = tab -> appendix1;
	int bin_len = tab -> counter1;
	int txt_ptr = 0, old_ptr=0, xk1, bam_is_from_cellCounts = 0;
	void ** vv = v;
	simple_bam_writer * writer = vv[0];

	ArrayList * chroname_size_list = ArrayListCreate(2000);
	for(txt_ptr=0; txt_ptr <= bin_len; txt_ptr++){
		// Test the end of the text first so that bin[bin_len] is never read.
		if(txt_ptr == bin_len || bin[txt_ptr] == '\n'){
			int line_len = txt_ptr - old_ptr;
			// NOTE(review): the marker has 32 characters but only the first 31 are compared.
			if(this_memmem(bin+old_ptr, line_len,"Per-sample-BAM-output:cellCounts", 31))bam_is_from_cellCounts = 1;
			else if(line_len >= 4 && memcmp("@SQ\t",bin+old_ptr, 4)==0){ // lines shorter than the tag cannot match
				unsigned int seqlen = 0;
				char seqname[MAX_CHROMOSOME_NAME_LEN];
				int seqname_len = 0, state = 0;
				// Small state machine over the fields of the line:
				//   0 = inside an uninteresting field, 1 = at the start of a field,
				//   2,3 = after 'S','SN' ; 4 = reading the SN value,
				//   12,13 = after 'L','LN' ; 14 = reading the LN value.
				for(xk1=3; xk1 + old_ptr < txt_ptr; xk1++){
					int nch = bin[xk1 + old_ptr];
					switch(state){
						case 4:
							if(nch == '\t' ) state = 1;
							// Names that do not fit are truncated instead of overrunning
							// seqname[]; the sequence is still listed so that the number
							// and order of the references is unchanged.
							else if(seqname_len < MAX_CHROMOSOME_NAME_LEN - 1) seqname[seqname_len++]=nch;
							break;

						case 14:
							if(isdigit(nch)) seqlen = seqlen*10+nch-'0';
							else state = nch == '\t'?1:0;
							break;

						case 13:
							if(nch == ':') state = 14;
							else state = nch == '\t'?1:0;
							break;

						case 12:
							if(nch == 'N') state = 13;
							else state = nch == '\t'?1:0;
							break;

						case 3:
							if(nch == ':') state = 4;
							else state = nch == '\t'?1:0;
							break;

						case 2:
							if(nch == 'N') state = 3;
							else state = nch == '\t'?1:0;
							break;

						case 1:
							if(nch == 'S' && seqname_len<1) state = 2;
							else if(nch == 'L' && seqlen<1) state = 12;
							else state = nch == '\t'?1:0;
							break;

						default:
							if(nch == '\t') state = 1;
					}
				}
				seqname[seqname_len]=0;
				if(seqlen && seqname_len){
					// The list holds (name copy, length) pairs.
					ArrayListPush(chroname_size_list,strdup(seqname));
					ArrayListPush(chroname_size_list,NULL+seqlen);
				}
			}
			old_ptr = txt_ptr+1;
		}
	}

	char * ncoline = "@CO\tPer-sample-BAM-output:cellCounts\n";

	// An empty header text has no last character to inspect and gets no extra newline.
	int needs_newline = bin_len > 0 && bin[bin_len-1]!='\n';

	int binlen_wtr = bin_len;
	if(needs_newline) binlen_wtr++;
	if(!bam_is_from_cellCounts) binlen_wtr += strlen(ncoline);

	simple_bam_write(&binlen_wtr,4,writer, 0);
	simple_bam_write(bin,bin_len, writer, 0);
	if(needs_newline) simple_bam_write("\n",1, writer, 0);
	if(!bam_is_from_cellCounts) simple_bam_write(ncoline,strlen(ncoline), writer, 0);

	int seq_count = chroname_size_list -> numOfElements/2;
	simple_bam_write(&seq_count, 4, writer, 0);
	for(xk1 =0; xk1 < chroname_size_list -> numOfElements; xk1+=2){
		char * seqname = ArrayListGet(chroname_size_list, xk1);
		unsigned int seqlen = ArrayListGet(chroname_size_list, xk1+1)-NULL;
		int seqname_len = 1+strlen(seqname); // the terminating NUL is written as part of the name
		simple_bam_write(&seqname_len,4,writer,0);
		simple_bam_write(seqname,seqname_len,writer,0);
		simple_bam_write(&seqlen,4,writer,0);
		free(seqname);
	}
	ArrayListDestroy(chroname_size_list);
	simple_bam_write("",0,writer,1); // force the header out as its own block(s)
	writer -> total_chromosomes = seq_count;
}
2177
process_pairer_header(void * pairer_vp,int thread_no,int is_text,unsigned int items,char * bin,unsigned int bin_len)2178 int process_pairer_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len){
2179 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2180 fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2181 fc_thread_thread_context_t * thread_context = global_context -> thread_contexts;
2182
2183 //SUBREADprintf("ENTER PROCESS (THRD %d): IS_TXT=%d, ITEMS = %d, CURRENT_ITEMS=%d\n", thread_no, is_text, items, global_context -> sambam_chro_table_items);
2184
2185 if(global_context -> is_scRNA_BAM_FQ_out_generated && global_context -> do_scRNA_table && is_text) {
2186 global_context -> scRNA_sample_BAM_writers -> appendix1 = bin;
2187 global_context -> scRNA_sample_BAM_writers -> counter1 = bin_len;
2188 global_context -> scRNA_sample_BAM_writers -> appendix2 = global_context;
2189 HashTableIteration( global_context -> scRNA_sample_BAM_writers, scRNA_sample_SamBam_writers_add_header);
2190 }
2191 if(global_context -> is_read_details_out == FILE_TYPE_BAM){
2192 int write_cursor;
2193 int first_block = 1;
2194 for(write_cursor = 0; write_cursor < bin_len; write_cursor += 55000){
2195 int wlen = min(55000, bin_len - write_cursor);
2196
2197 if( first_block ){
2198 if(is_text)memcpy(thread_context -> read_details_buff, "BAM\1", 4);
2199 memcpy(thread_context -> read_details_buff + (is_text?4:0), is_text?(&bin_len):(&items), 4);
2200 }
2201
2202 memcpy(thread_context -> read_details_buff + (first_block?4*(1+is_text):0), bin + write_cursor, wlen);
2203 int blen = compress_read_detail_BAM(global_context, thread_context, 0, wlen + (first_block?4*(1+is_text):0), thread_context -> bam_compressed_buff);
2204 fwrite( thread_context -> bam_compressed_buff, 1, blen, global_context -> read_details_out_FP);
2205 first_block = 0;
2206 }
2207 }else if( global_context -> is_read_details_out == FILE_TYPE_SAM && is_text ){
2208 fwrite( bin, 1, bin_len, global_context -> read_details_out_FP);
2209 }
2210 if(is_text ){
2211 if( global_context -> assign_reads_to_RG ){
2212 global_context->RGnames_capacity = 10000;
2213 global_context->RGnames_ptr = 0;
2214 global_context->RGnames_set = malloc( global_context->RGnames_capacity );
2215
2216 int rcursor=0;
2217 for(;rcursor<bin_len; rcursor++){
2218 assert(bin[rcursor] == '@'&& bin[rcursor+3] == '\t');
2219 if(bin[rcursor+1]=='R' && bin[rcursor+2]=='G'){
2220 int id_start = -1, id_end = -1;
2221 for(; rcursor < bin_len; rcursor++){
2222 if(bin[rcursor]=='I' && bin[rcursor+1]=='D'){
2223 id_start = rcursor + 3;
2224 id_end = 0;
2225 }
2226 for(; rcursor < bin_len; rcursor++){
2227 if(bin[rcursor]=='\t' || bin[rcursor]=='\n'){
2228 if(id_end < 1)id_end = rcursor;
2229 break;
2230 }
2231 }
2232 if(bin[rcursor]=='\n') break;
2233 }
2234
2235 if(id_start > 0){
2236 int id_len = id_end - id_start;
2237 if(global_context->RGnames_capacity < global_context->RGnames_ptr + id_len + 3){
2238 global_context->RGnames_capacity = global_context->RGnames_capacity * 17 / 10;
2239 global_context->RGnames_set = realloc( global_context->RGnames_set , global_context->RGnames_capacity );
2240 }
2241 memcpy(global_context->RGnames_set + global_context->RGnames_ptr, bin + id_start, id_len);
2242 global_context->RGnames_set[global_context->RGnames_ptr+id_len]='\t';
2243 global_context->RGnames_ptr += id_len+1;
2244 }
2245 }
2246 for( ;rcursor<bin_len; rcursor++ ) if(bin[rcursor] == '\n')break;
2247 }
2248 if(global_context->RGnames_ptr>0){
2249 global_context->RGnames_set[global_context->RGnames_ptr-1]=0;
2250 global_context->RGnames_ptr--;
2251 }
2252 //SUBREADprintf("RGList: %s\n", global_context->RGnames_set);
2253
2254 int thread_no;
2255 for(thread_no = 0; thread_no < global_context -> thread_number; thread_no ++){
2256 fc_thread_thread_context_t * RGthread_context = global_context -> thread_contexts + thread_no;
2257 int RGcursor = 0;
2258 char *lastRGptr = global_context->RGnames_set;
2259 for(; RGcursor < global_context->RGnames_ptr+1; RGcursor++){
2260 if(global_context->RGnames_set[ RGcursor ] == '\t' || global_context->RGnames_set[ RGcursor ] == 0){
2261 global_context->RGnames_set[ RGcursor ] = 0;
2262 if(strlen(lastRGptr)>0){
2263 // SUBREADprintf("PUT 4Tab:'%s'\n", lastRGptr);
2264 get_RG_tables(global_context, RGthread_context, lastRGptr);
2265 lastRGptr = global_context->RGnames_set + RGcursor +1;
2266 if(RGcursor < global_context->RGnames_ptr)
2267 global_context->RGnames_set[ RGcursor ] = '\t';
2268 }
2269 }
2270 }
2271 }
2272 }
2273 }else{
2274 if(global_context -> sambam_chro_table)
2275 global_context -> sambam_chro_table = delay_realloc(global_context -> sambam_chro_table, global_context -> sambam_chro_table_items * sizeof(SamBam_Reference_Info), (items + global_context -> sambam_chro_table_items) * sizeof(SamBam_Reference_Info));
2276 else global_context -> sambam_chro_table = malloc(items * sizeof(SamBam_Reference_Info));
2277
2278 int x1, bin_ptr = 0;
2279 for(x1 = global_context -> sambam_chro_table_items; x1 < global_context -> sambam_chro_table_items+items; x1++){
2280 int l_name;
2281 memcpy(&l_name, bin + bin_ptr, 4);
2282 bin_ptr += 4;
2283
2284 if( !is_value_contig_name(bin + bin_ptr, l_name)){
2285 SUBREADprintf("The chromosome name contains unexpected characters: \"%s\" (%d chars)\nfeatureCounts has to stop running\n", bin + bin_ptr, l_name);
2286 return -1;
2287 }
2288 if(l_name >= MAX_CHROMOSOME_NAME_LEN){
2289 SUBREADprintf("The chromosome name of \"%s\" contains %d characters, longer than the upper limit of %d\nfeatureCounts has to stop running\n", bin + bin_ptr , l_name, MAX_CHROMOSOME_NAME_LEN - 1);
2290 return -1;
2291 }
2292 memcpy(global_context -> sambam_chro_table[x1].chro_name , bin + bin_ptr, l_name);
2293 //SUBREADprintf("The %d-th is '%s'\n", x1, global_context -> sambam_chro_table[x1].chro_name);
2294 bin_ptr += l_name;
2295 memcpy(&global_context -> sambam_chro_table[x1].chro_length , bin + bin_ptr, 4);
2296 bin_ptr += 4;
2297 }
2298 global_context -> sambam_chro_table_items += items;
2299 }
2300 return 0;
2301 }
2302
2303 void process_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2);
2304
// Returns the next '\027'-separated field of the read name being tokenised as
// an int, or `fallback` when the name has no further field (strtok_r gave NULL).
static int make_dummy_next_int_field(char ** token_state, int fallback){
	char * field = strtok_r(NULL, "\027", token_state);
	return field ? atoi(field) : fallback;
}

// Writes into out_txt2 a placeholder text line for the mate of the record in bin1.
//
// rname    : '\027'-separated fields, in order: read name, R1 chromosome index,
//            R1 position, R2 chromosome index, R2 position, HI tag value.
//            The string is modified in place by strtok_r.
// bin1     : binary record of the existing mate; only the 16-bit flag stored in
//            the upper half of the 32-bit word at offset 16 is read.
// out_txt2 : receives the tab-separated line; it is written with sprintf, so the
//            caller must supply a buffer large enough for the name, both
//            chromosome names and the fixed columns.
// sambam_chro_table : table indexed by chromosome index to obtain the names.
//
// A field that is missing from rname is treated as "not available": chromosome
// index -1 (printed as "*"), position -1 (printed as 0) and no HI tag, instead of
// passing a NULL pointer to atoi.
void make_dummy(char * rname, char * bin1, char * out_txt2, SamBam_Reference_Info * sambam_chro_table){
	char * tmptr = NULL;

	char * realname = strtok_r(rname, "\027", &tmptr);
	if(!realname) realname = "";
	int r1_chro = make_dummy_next_int_field(&tmptr, -1);
	int r1_pos = make_dummy_next_int_field(&tmptr, -1);
	int r2_chro = make_dummy_next_int_field(&tmptr, -1);
	int r2_pos = make_dummy_next_int_field(&tmptr, -1);
	int HItag = make_dummy_next_int_field(&tmptr, -1);

	int mate_FLAG = 0;
	memcpy(&mate_FLAG, bin1 + 16, 4);
	mate_FLAG = 0xffff&(mate_FLAG >>16);

	// A negative chromosome index means the position is meaningless.
	if(r1_chro<0) r1_pos=-1;
	if(r2_chro<0) r2_pos=-1;

	// If the existing mate carries bit 0x40, the dummy takes the R2 coordinates
	// and the mate the R1 coordinates; otherwise the other way round.
	int mate_is_first = mate_FLAG & 0x40;
	int my_chro = mate_is_first ? r2_chro : r1_chro;
	int my_pos = mate_is_first ? r2_pos : r1_pos;
	int mate_chro = mate_is_first ? r1_chro : r2_chro;
	int mate_pos = mate_is_first ? r1_pos : r2_pos;

	// The dummy's flag mirrors the mate's: 0x40/0x80 are exchanged, bit 1 is
	// always set, and the 4/8 and 0x10/0x20 bit pairs are swapped.
	int my_flag = mate_is_first ? 0x80 : 0x40;
	my_flag |= 1;
	if(mate_FLAG & 8)my_flag |=4;
	if(mate_FLAG & 4)my_flag |=8;
	if(mate_FLAG & 0x10) my_flag |= 0x20;
	if(mate_FLAG & 0x20) my_flag |= 0x10;

	// Optional trailing "HI:i:<n>" column; only emitted for non-negative values.
	char HItagStr[20];
	if(HItag>=0){
		snprintf(HItagStr, sizeof HItagStr, "\tHI:i:%d", HItag);
	}else{
		HItagStr[0]=0;
	}

	char * my_chro_str = "*";
	if(my_chro >= 0) my_chro_str = sambam_chro_table[my_chro].chro_name;

	char * mate_chro_str = "*";
	if(mate_chro >= 0) mate_chro_str = sambam_chro_table[mate_chro].chro_name;

	sprintf(out_txt2, "%s\t%d\t%s\t%d\t0\t*\t%s\t%d\t0\tN\tI\t%s", realname, my_flag, my_chro_str, max(0, my_pos),
		mate_chro_str, max(0,mate_pos), HItagStr);
}
2354
// Derives a read's flag from its mate's flag `mf`.
// Bits 1 and 2 are kept; the bit pairs 4/8, 0x10/0x20 and 0x40/0x80 are each
// swapped; bit 4 is additionally set when bit 1 of `mf` is clear.
// All bits above 0x80 are dropped.
int reverse_flag(int mf){
	int flipped = mf & 0x3;

	flipped |= (mf & 0x4) << 1;	// 4 -> 8
	flipped |= (mf & 0x8) >> 1;	// 8 -> 4
	if(!(mf & 0x1)) flipped |= 0x4;

	flipped |= (mf & 0x10) << 1;	// 0x10 -> 0x20
	flipped |= (mf & 0x20) >> 1;	// 0x20 -> 0x10

	flipped |= (mf & 0x40) << 1;	// 0x40 -> 0x80
	flipped |= (mf & 0x80) >> 1;	// 0x80 -> 0x40

	return flipped;
}
2368
// Sums, over the first `intvn` intervals of `intvs`, each interval's
// chromosomal_length plus all of its recorded insertion lengths.
// `read_name` is only referenced by the disabled debug output.
int calc_total_frag_one_len(CIGAR_interval_t * intvs, int intvn, char * read_name){
	int total_len = 0;
	int sec_i;

	for(sec_i = 0; sec_i < intvn; sec_i++){
		CIGAR_interval_t * sec = intvs + sec_i;
		int ins_i = 0;

		// Debug hook, permanently disabled by the leading 0.
		if(0 && FIXLENstrcmp("V0112_0155:7:1101:20072:12961#ATCAC", read_name)==0){
			SUBREADprintf("READ %s SINGLE: chro_len = %d, secs = %d\n" , read_name, sec -> chromosomal_length, sec -> insertions);
		}

		while(ins_i < sec -> insertions){
			total_len += sec -> insertion_lengths[ins_i];
			ins_i++;
		}
		total_len += sec -> chromosomal_length;
	}
	return total_len;
}
2382
// Tests whether the ranges [r1_start, r1_end) and [r2_start, r2_end) overlap.
// Returns 1 and stores the larger start / smaller end into *overlap_start and
// *overlap_end when they do; returns 0 and leaves both outputs untouched otherwise.
int calc_total_has_overlap(unsigned int r1_start, unsigned int r1_end, unsigned int r2_start, unsigned int r2_end, unsigned int * overlap_start, unsigned int * overlap_end){
	int r2_begins_inside_r1 = r1_start <= r2_start && r2_start < r1_end;
	int r1_begins_inside_r2 = r2_start <= r1_start && r1_start < r2_end;

	if(!r2_begins_inside_r1 && !r1_begins_inside_r2)
		return 0;

	*overlap_start = max( r1_start, r2_start );
	*overlap_end = min( r1_end, r2_end );
	return 1;
}
2391
// Computes the total length of a fragment from the CIGAR intervals of its two reads.
//
// The result is the sum, over a list of "merged sections", of each section's
// chromosomal length plus the insertion lengths attributed to it. Where an R1
// interval and an R2 interval overlap, the overlapping stretch becomes a single
// merged section, and only insertions that both reads report at the same start
// position with the same length are counted for it.
//
// Shortcuts taken before merging:
//   - only one read has intervals: returns calc_total_frag_one_len() of that read;
//   - neither read has intervals: returns 0;
//   - the first intervals of the two reads have different chro strings: returns
//     the sum of the two per-read lengths.
//
// Side effects: the interval arrays are modified in place (start_pos,
// chromosomal_length, insertions, insertion_start_pos and insertion_lengths are
// rewritten while overlaps are consumed), so they must not be reused afterwards
// as the original intervals.
//
// global_context and thread_context are not used in this function.
//
// NOTE(review): the merged_section_* arrays hold MAXIMUM_INSERTION_IN_SECTION * 3
// entries and merged_section_count is never checked against that size here;
// merged_section_lengths is unsigned short, so each section length is stored
// modulo 65536 — confirm callers keep inputs within these limits.
// NOTE(review): the walk below advances through intervals and insertions in
// index order, which presumably relies on them being sorted by position — confirm.
int calc_total_frag_len( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, CIGAR_interval_t * CIGAR_intervals_R1, int CIGAR_intervals_R1_sections, CIGAR_interval_t * CIGAR_intervals_R2, int CIGAR_intervals_R2_sections, char * read_name){
	if ( CIGAR_intervals_R1_sections == 0 && CIGAR_intervals_R2_sections > 0) return calc_total_frag_one_len( CIGAR_intervals_R2,CIGAR_intervals_R2_sections , read_name);
	else if( CIGAR_intervals_R1_sections > 0 && CIGAR_intervals_R2_sections== 0) return calc_total_frag_one_len( CIGAR_intervals_R1,CIGAR_intervals_R1_sections , read_name);
	else if( CIGAR_intervals_R1_sections == 0 && CIGAR_intervals_R2_sections== 0) return 0;

	if(CIGAR_intervals_R1_sections > 0 && CIGAR_intervals_R2_sections > 0 && strcmp(CIGAR_intervals_R1[0].chro, CIGAR_intervals_R2[0].chro )!=0 )
		// two reads are from different chromosomes
		return calc_total_frag_one_len( CIGAR_intervals_R2,CIGAR_intervals_R2_sections , read_name) + calc_total_frag_one_len( CIGAR_intervals_R1,CIGAR_intervals_R1_sections , read_name);

	// Debug hook, permanently disabled by the leading 0.
	if(0 && FIXLENstrcmp("V0112_0155:7:1101:20072:12961#ATCAC", read_name)==0){
		int xx;
		for(xx = 0; xx < CIGAR_intervals_R1_sections; xx++)
			SUBREADprintf("R1 SEC %d: %u + %d\n", xx, CIGAR_intervals_R1[xx].start_pos, CIGAR_intervals_R1[xx].chromosomal_length );
		for(xx = 0; xx < CIGAR_intervals_R2_sections; xx++)
			SUBREADprintf("R2 SEC %d: %u + %d\n", xx, CIGAR_intervals_R2[xx].start_pos, CIGAR_intervals_R2[xx].chromosomal_length );
	}

	// Per merged section: its chromosomal length, the number of insertions
	// counted for it, and the lengths of those insertions.
	unsigned int merged_section_count = 0;
	unsigned short merged_section_lengths[ MAXIMUM_INSERTION_IN_SECTION * 3 ];
	unsigned int merged_section_indel_counts[ MAXIMUM_INSERTION_IN_SECTION * 3 ];
	unsigned short merged_section_indel_lengths[ MAXIMUM_INSERTION_IN_SECTION * 3 ][ MAXIMUM_INSERTION_IN_SECTION ];

	// R1_i / R2_i: index of the interval currently being consumed in each read.
	int R1_i = 0 , R2_i = 0;
	while (1){
		//SUBREADprintf("FRAGDEBUG %s : %d < %d & %d < %d; MC=%d; INS1=%d INS2=%d\n", read_name, R1_i,CIGAR_intervals_R1_sections,R2_i,CIGAR_intervals_R2_sections, merged_section_count, CIGAR_intervals_R1[R1_i].insertions, CIGAR_intervals_R2[R2_i].insertions);
		if( R1_i >= CIGAR_intervals_R1_sections && R2_i >= CIGAR_intervals_R2_sections ) break;

		if( R1_i < CIGAR_intervals_R1_sections && R2_i < CIGAR_intervals_R2_sections){
			// see if R1 and R2 overlap
			// if not: add R2 to specific section; R2_i ++
			// elif overlap: add the R1 first_half and/or R2 first_half or zero to specific section, and add overlapping part to overlapping section; DO NOT add the second specific half!
			// 	if R1_end > R2_end: R1_section_start = overlapping_end; R2_i ++
			// 	elif R2_end > R1_end: R2_section_start = overlapping_end; R1_i ++
			// 	elif R2_end == R1_end: R1_i++; R2_i++

			unsigned int overlapping_start= 0 , overlapping_end = 0;

			int is_r1r2_overlap = 0;

			is_r1r2_overlap = calc_total_has_overlap( CIGAR_intervals_R1[R1_i].start_pos, CIGAR_intervals_R1[R1_i].start_pos + CIGAR_intervals_R1[R1_i].chromosomal_length , CIGAR_intervals_R2[R2_i].start_pos, CIGAR_intervals_R2[R2_i].start_pos + CIGAR_intervals_R2[R2_i].chromosomal_length , & overlapping_start , & overlapping_end);

			if( is_r1r2_overlap ){
				if (CIGAR_intervals_R1[R1_i].start_pos > CIGAR_intervals_R2[R2_i].start_pos ){
					//first half_R2 add special
					// R2 starts first: the part of R2 before the overlap is its own merged section.
					merged_section_lengths[merged_section_count] = overlapping_start - CIGAR_intervals_R2[R2_i].start_pos;

					int indel_i;
					for(indel_i = 0; indel_i < min(MAXIMUM_INSERTION_IN_SECTION,CIGAR_intervals_R2[R2_i].insertions); indel_i++){
						if( CIGAR_intervals_R2[R2_i].insertion_start_pos[indel_i] >= overlapping_start ){
							// First insertion at/after the overlap start: if some insertions were
							// already assigned to the leading part, remove them from R2's list by
							// shifting the remaining ones to the front.
							if(indel_i>0){
								int insmov_i, ins_dist_i = 0;
								for(insmov_i = indel_i ; insmov_i < CIGAR_intervals_R2[R2_i].insertions; insmov_i++){
									assert(MAXIMUM_INSERTION_IN_SECTION > ins_dist_i);
									assert(MAXIMUM_INSERTION_IN_SECTION > insmov_i);
									CIGAR_intervals_R2[R2_i].insertion_start_pos[ins_dist_i] = CIGAR_intervals_R2[R2_i].insertion_start_pos[insmov_i];
									CIGAR_intervals_R2[R2_i].insertion_lengths[ins_dist_i] = CIGAR_intervals_R2[R2_i].insertion_lengths[insmov_i];
									ins_dist_i++;
								}
								CIGAR_intervals_R2[R2_i].insertions = ins_dist_i;
							}
							break;
						}
						// Insertion lies before the overlap: it belongs to the leading section.
						merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R2[R2_i].insertion_lengths[indel_i];
					}
					// indel_i is the number of insertions copied into the leading section.
					merged_section_indel_counts[merged_section_count] = indel_i;

					merged_section_count ++;

				}else if( CIGAR_intervals_R1[R1_i].start_pos < CIGAR_intervals_R2[R2_i].start_pos ){
					//first half_R1 add special
					// R1 starts first: same handling as above with the roles of R1 and R2 exchanged.
					merged_section_lengths[merged_section_count] = overlapping_start - CIGAR_intervals_R1[R1_i].start_pos;

					int indel_i;
					for(indel_i = 0; indel_i < min(MAXIMUM_INSERTION_IN_SECTION,CIGAR_intervals_R1[R1_i].insertions); indel_i++){
						if( CIGAR_intervals_R1[R1_i].insertion_start_pos[indel_i] >= overlapping_start ){
							if(indel_i>0){
								int insmov_i, ins_dist_i = 0;
								for(insmov_i = indel_i ; insmov_i < CIGAR_intervals_R1[R1_i].insertions; insmov_i++){
									assert(MAXIMUM_INSERTION_IN_SECTION > insmov_i);
									CIGAR_intervals_R1[R1_i].insertion_start_pos[ins_dist_i] = CIGAR_intervals_R1[R1_i].insertion_start_pos[insmov_i];
									CIGAR_intervals_R1[R1_i].insertion_lengths[ins_dist_i] = CIGAR_intervals_R1[R1_i].insertion_lengths[insmov_i];
									ins_dist_i++;
								}
								CIGAR_intervals_R1[R1_i].insertions = ins_dist_i;
							}
							break;
						}

						merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R1[R1_i].insertion_lengths[indel_i];
					}
					merged_section_indel_counts[merged_section_count] = indel_i;

					merged_section_count ++;
				}

				// The overlapping stretch itself is one merged section.
				merged_section_lengths[merged_section_count] = overlapping_end - overlapping_start;
				merged_section_indel_counts[merged_section_count] = 0;


				// Walk both insertion lists in step; an insertion is counted for the
				// overlap only when both reads have it at the same start position
				// with the same length.
				int indel_i_R1 = 0, indel_i_R2 = 0;
				while(1){
					//SUBREADprintf("FRAGDEBUG: CC[%d] = %d ; II1=%d < %d; II2=%d < %d\n", merged_section_count, merged_section_indel_counts[merged_section_count], indel_i_R1, CIGAR_intervals_R1[R1_i].insertions , indel_i_R2, CIGAR_intervals_R2[R2_i].insertions);

					if( indel_i_R1 >= CIGAR_intervals_R1[R1_i].insertions || indel_i_R2 >= CIGAR_intervals_R2[R2_i].insertions ){
						// One list is exhausted: drop the insertions already walked past
						// from each list (shifting the remainder to the front) and stop.
						if(indel_i_R1 > 0){
							int insmov_i, ins_dist_i = 0;
							for(insmov_i = indel_i_R1 ; insmov_i < min(MAXIMUM_INSERTION_IN_SECTION,CIGAR_intervals_R1[R1_i].insertions); insmov_i++){
								assert(MAXIMUM_INSERTION_IN_SECTION > insmov_i);
								CIGAR_intervals_R1[R1_i].insertion_start_pos[ins_dist_i] = CIGAR_intervals_R1[R1_i].insertion_start_pos[insmov_i];
								CIGAR_intervals_R1[R1_i].insertion_lengths[ins_dist_i] = CIGAR_intervals_R1[R1_i].insertion_lengths[insmov_i];
								ins_dist_i++;
							}
							CIGAR_intervals_R1[R1_i].insertions = ins_dist_i;
						}
						if(indel_i_R2 > 0){
							int insmov_i, ins_dist_i = 0;
							for(insmov_i = indel_i_R2 ; insmov_i < min(CIGAR_intervals_R2[R2_i].insertions,MAXIMUM_INSERTION_IN_SECTION); insmov_i++){
								assert(MAXIMUM_INSERTION_IN_SECTION > insmov_i);
								CIGAR_intervals_R2[R2_i].insertion_start_pos[ins_dist_i] = CIGAR_intervals_R2[R2_i].insertion_start_pos[insmov_i];
								CIGAR_intervals_R2[R2_i].insertion_lengths[ins_dist_i] = CIGAR_intervals_R2[R2_i].insertion_lengths[insmov_i];
								ins_dist_i++;
							}
							CIGAR_intervals_R2[R2_i].insertions = ins_dist_i;
						}
						break;
					}

					// Advance whichever list has the smaller start position; on a tie
					// count the insertion if the lengths agree, then advance both.
					if( CIGAR_intervals_R1[R1_i].insertion_start_pos[indel_i_R1] > CIGAR_intervals_R2[R2_i].insertion_start_pos[indel_i_R2] ) indel_i_R2 ++;
					else if( CIGAR_intervals_R1[R1_i].insertion_start_pos[indel_i_R1] < CIGAR_intervals_R2[R2_i].insertion_start_pos[indel_i_R2] ) indel_i_R1 ++;
					else{
						if( CIGAR_intervals_R1[R1_i].insertion_lengths[ indel_i_R1 ] == CIGAR_intervals_R2[R2_i].insertion_lengths[ indel_i_R2 ] ){
							merged_section_indel_lengths[merged_section_count][ merged_section_indel_counts[merged_section_count] ] = CIGAR_intervals_R1[R1_i].insertion_lengths[indel_i_R1];
							merged_section_indel_counts[merged_section_count] ++;
						}
						indel_i_R2++;
						indel_i_R1++;
					}
				}

				merged_section_count ++;

				// add common

				// The interval that extends beyond the overlap is trimmed so that it
				// starts at overlapping_end and is examined again; the other interval
				// (or both, when they end together) is finished.
				if(CIGAR_intervals_R1[R1_i].start_pos + CIGAR_intervals_R1[R1_i].chromosomal_length > CIGAR_intervals_R2[R2_i].start_pos + CIGAR_intervals_R2[R2_i].chromosomal_length){
					CIGAR_intervals_R1[R1_i].chromosomal_length -= ( overlapping_end - CIGAR_intervals_R1[R1_i].start_pos );
					CIGAR_intervals_R1[R1_i].start_pos = overlapping_end;
					R2_i ++;
				}else if(CIGAR_intervals_R1[R1_i].start_pos + CIGAR_intervals_R1[R1_i].chromosomal_length < CIGAR_intervals_R2[R2_i].start_pos + CIGAR_intervals_R2[R2_i].chromosomal_length){
					CIGAR_intervals_R2[R2_i].chromosomal_length -= ( overlapping_end - CIGAR_intervals_R2[R2_i].start_pos );
					CIGAR_intervals_R2[R2_i].start_pos = overlapping_end;
					R1_i ++;
				}else{
					R1_i ++;
					R2_i ++;
				}

			}else if(CIGAR_intervals_R1[R1_i].start_pos > CIGAR_intervals_R2[R2_i].start_pos){
				// No overlap and R2's interval starts first: take it whole.
				merged_section_lengths[merged_section_count] = CIGAR_intervals_R2[R2_i].chromosomal_length;

				int indel_i;
				for(indel_i = 0; indel_i < CIGAR_intervals_R2[R2_i].insertions; indel_i++){
					merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R2[R2_i].insertion_lengths[indel_i];
				}
				merged_section_indel_counts[merged_section_count] = CIGAR_intervals_R2[R2_i].insertions;

				merged_section_count ++;
				R2_i ++;
			}else{
				// No overlap and R1's interval does not start after R2's: take it whole.
				merged_section_lengths[merged_section_count] = CIGAR_intervals_R1[R1_i].chromosomal_length;

				int indel_i;
				for(indel_i = 0; indel_i < CIGAR_intervals_R1[R1_i].insertions; indel_i++){
					merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R1[R1_i].insertion_lengths[indel_i];
				}
				merged_section_indel_counts[merged_section_count] = CIGAR_intervals_R1[R1_i].insertions;

				merged_section_count ++;
				R1_i ++;
			}
		}else if(R1_i < CIGAR_intervals_R1_sections){
			// add R1 section to specific section
			// R1_i ++
			// (R2 is exhausted; the remaining R1 intervals are taken whole.)
			merged_section_lengths[merged_section_count] = CIGAR_intervals_R1[R1_i].chromosomal_length;

			int indel_i;
			for(indel_i = 0; indel_i < CIGAR_intervals_R1[R1_i].insertions; indel_i++){
				merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R1[R1_i].insertion_lengths[indel_i];
			}
			merged_section_indel_counts[merged_section_count] = CIGAR_intervals_R1[R1_i].insertions;

			merged_section_count ++;
			R1_i ++;
		}else if(R2_i < CIGAR_intervals_R2_sections){
			// R1 is exhausted; the remaining R2 intervals are taken whole.
			merged_section_lengths[merged_section_count] = CIGAR_intervals_R2[R2_i].chromosomal_length;

			int indel_i;
			for(indel_i = 0; indel_i < CIGAR_intervals_R2[R2_i].insertions; indel_i++){
				merged_section_indel_lengths[merged_section_count][indel_i] = CIGAR_intervals_R2[R2_i].insertion_lengths[indel_i];
			}
			merged_section_indel_counts[merged_section_count] = CIGAR_intervals_R2[R2_i].insertions;

			merged_section_count ++;
			R2_i ++;
		}
	}

	// Total = every merged section's length plus its counted insertion lengths.
	int ret = 0, x1, x2;
	for(x1 = 0; x1 < merged_section_count ; x1++){
		ret += merged_section_lengths[x1];
		for(x2 = 0; x2 < merged_section_indel_counts[x1]; x2++)
			ret += merged_section_indel_lengths[x1][x2];
		//	SUBREADprintf("FRAGDEBUG %s [%d] : len = %d , indels = %d\n" , read_name, x1, merged_section_lengths[x1] , merged_section_indel_counts[x1]);
	}

	return ret;
}
2608
// Points *read_name at the read name inside the binary record `bin`,
// i.e. at byte offset 36 of the record. Nothing is copied.
void get_readname_from_bin(char * bin, char ** read_name){
	*read_name = &bin[36];
}
2612
// Decodes one read of a pair from its binary alignment record.
//
// Exactly one of two sources is used:
//   - `bin` non-NULL: the read's own record is decoded (name, flag, chromosome,
//     position, mapping quality, mate chromosome/position, template length,
//     CIGAR sections, NH tag and optionally the RG tag).
//   - `bin` NULL: the read has no record of its own, so its fields are derived
//     from the mate's record `bin2` (flag via reverse_flag(), own/mate reference
//     and position swapped, template length negated). No CIGAR sections are
//     produced (*cigar_sect stays 0) and *mapq is not written.
// At least one of bin / bin2 must be non-NULL (asserted).
//
// Record layout as read here (byte offsets): 0 block length, 4 reference id,
// 8 position, 12 word holding name length (low byte) and mapping quality (next
// byte), 16 word holding CIGAR op count (low 16 bits) and flag (high 16 bits),
// 20 sequence length, 24 mate reference id, 28 mate position, 32 template
// length, 36 read name followed by the CIGAR words, sequence, qualities and tags.
//
// Outputs:
//   *pos / *mate_pos : stored position plus one.
//   *chro / *mate_chro : name from sambam_chro_table, or NULL for a negative id.
//   *ret_me_refID / *ret_mate_refID : the raw reference ids.
//   *cigar_sect and the Starting_Chro_Points_1BASE / Starting_Read_Points /
//   Section_Read_Lengths / ChroNames / Event_After_Section arrays : one entry per
//     matched section ('M', '=', 'X' runs), at most max_M entries.
//     Event_After_Section[i] is 'I', 'D' or 'N' for the operation that closed
//     section i; it is not written for the final section.
//   *is_junction_read : set to 1 when the CIGAR contains an 'N' operation.
//   *NH_value : value of the NH tag, or 1 when absent.
//   *RG_ptr : only touched when assign_reads_to_RG is non-zero; set to NULL unless
//     the RG tag has type 'Z'.
//   intervals_buffer / intervals_i : optional (may be NULL together). When given,
//     one interval per section is appended starting at index *intervals_i, with
//     leading and trailing soft clips folded into the first / last interval, and
//     insertions recorded on the interval they follow. The `insertions` counters
//     are only incremented here, so the caller is expected to have zeroed them.
//
// NOTE(review): *pos and *mate_pos are filled by copying 4 bytes into a zeroed
// srInt_64, so a stored -1 is not sign-extended and a little-endian host is
// assumed; that behaviour is left unchanged here.
void parse_bin(SamBam_Reference_Info * sambam_chro_table, char * bin, char * bin2, char ** read_name, int * flag, char ** chro, srInt_64 * pos, int * mapq, char ** mate_chro, srInt_64 * mate_pos, srInt_64 * tlen, int * is_junction_read, int * cigar_sect, unsigned int * Starting_Chro_Points_1BASE, unsigned short * Starting_Read_Points, unsigned short * Section_Read_Lengths, char ** ChroNames, char * Event_After_Section, int * NH_value, int max_M, CIGAR_interval_t * intervals_buffer, int * intervals_i, int assign_reads_to_RG, char ** RG_ptr, int * ret_me_refID, int * ret_mate_refID){
	int x1, len_of_S1 = 0;
	*cigar_sect = 0;
	*NH_value = 1;
	*flag = 0;
	*is_junction_read = 0;
	assert(bin||bin2);

	if(bin){
		(*read_name) = bin + 36;
		memcpy(flag, bin + 16, 4);
		int cigar_opts = (*flag) & 0xffff;
		(*flag) = (*flag) >> 16;
		int refID, mate_refID;
		memcpy(&refID, bin + 4, 4);
		if(refID >= 0) (*chro) = sambam_chro_table[refID].chro_name;
		else (*chro) = NULL;

		(*pos) = 0;
		memcpy(pos, bin+8, 4);
		(*pos) ++;

		memcpy(mapq, bin+12, 4);
		int l_read_name = (*mapq)& 0xff;
		(*mapq) = ((*mapq)>>8)&0xff;

		int seq_len;
		memcpy(&seq_len, bin + 20,4);
		memcpy(&mate_refID, bin+24, 4);
		if(mate_refID>=0) (*mate_chro) = sambam_chro_table[mate_refID].chro_name;
		else (*mate_chro) = NULL;

		*ret_mate_refID = mate_refID;
		*ret_me_refID = refID;

		(*mate_pos)=0;
		memcpy(mate_pos, bin+28, 4);
		(*mate_pos)++;

		// Go through an int so that a negative template length is sign-extended.
		int tlen_int;
		memcpy(&tlen_int, bin+32, 4);
		(*tlen) = tlen_int;

		// The CIGAR words start right after the read name; the name length is
		// arbitrary, so they may be unaligned and are fetched with memcpy.
		char * cigar_bytes = bin + 36 + l_read_name;
		unsigned int chro_cursor = (*pos), section_start_chro = (*pos);
		unsigned short read_cursor = 0, this_section_length = 0, section_start_read = 0;

		if(intervals_buffer){
			intervals_buffer[ *intervals_i ].start_pos = chro_cursor;
			intervals_buffer[ *intervals_i ].chro = *chro;
		}

		for(x1 = 0 ; x1 < cigar_opts; x1++){
			int cigar_word;
			memcpy(&cigar_word, cigar_bytes + 4 * x1, 4);
			int optype = cigar_word&0xf;
			int optval = (cigar_word>>4)& 0xfffffff;
			if(optype == 0 || optype == 7 || optype == 8){ // 'M' , '=', 'X'
				chro_cursor += optval;
				read_cursor += optval;
				this_section_length += optval;
			}else if(optype == 1 || optype == 2 || optype == 3){ // 'I', 'D' or 'N'
				char event_char=0;
				if(optype == 3){
					(*is_junction_read) = 1;
					event_char = 'N';
				}else if(optype == 2){
					event_char = 'D';
				}else{
					// Insertions are remembered on the interval being closed, up to
					// MAXIMUM_INSERTION_IN_SECTION of them.
					if(intervals_buffer && intervals_buffer[ *intervals_i ].insertions < MAXIMUM_INSERTION_IN_SECTION){
						intervals_buffer[ *intervals_i ].insertion_start_pos[ intervals_buffer[ *intervals_i ].insertions ] = chro_cursor;
						intervals_buffer[ *intervals_i ].insertion_lengths[ intervals_buffer[ *intervals_i ].insertions ] = optval;
						intervals_buffer[ *intervals_i ].insertions ++;
					}
					event_char = 'I';
				}

				// Close the section that ends at this operation.
				if( (*cigar_sect) < max_M){
					Event_After_Section[*cigar_sect] = event_char;
					Starting_Chro_Points_1BASE[*cigar_sect] = section_start_chro;
					Starting_Read_Points[*cigar_sect] = section_start_read;
					Section_Read_Lengths[*cigar_sect] = this_section_length;
					ChroNames[*cigar_sect] = (*chro);
					(*cigar_sect)++;

					if(intervals_buffer){
						intervals_buffer[ *intervals_i ].chromosomal_length = chro_cursor - intervals_buffer[ *intervals_i ].start_pos;
						(*intervals_i) ++;
					}
				}

				if(optype == 2 || optype == 3)// N or D
					chro_cursor += optval;
				else
					read_cursor += optval;

				// Open the next section at the position after the operation.
				if(intervals_buffer && (*cigar_sect) < max_M){
					intervals_buffer[ *intervals_i ].start_pos = chro_cursor;
					intervals_buffer[ *intervals_i ].chro = *chro;
				}

				section_start_chro = chro_cursor;
				section_start_read = read_cursor;
				this_section_length = 0;
			}else if(optype == 4){ // 'S'
				if(read_cursor==0)
				{
					// Leading soft clip: skip it in the read and extend the first
					// interval to the left by the clipped length (not below 0).
					read_cursor += optval;
					section_start_read = read_cursor;

					if(intervals_buffer){
						if(intervals_buffer[ *intervals_i ].start_pos > optval) intervals_buffer[ *intervals_i ].start_pos -= optval;
						else intervals_buffer[ *intervals_i ].start_pos = 0;
					}
				}else len_of_S1 = optval; // trailing soft clip, added to the last interval below
			} // H and P do not have effect on cigar parsing.
		}
		if(this_section_length>0){
			// add new section
			if( (*cigar_sect) < max_M){
				if(intervals_buffer){
					intervals_buffer[ *intervals_i ].chromosomal_length = chro_cursor - intervals_buffer[ *intervals_i ].start_pos + len_of_S1;
					(*intervals_i)++;
				}
				Starting_Chro_Points_1BASE[*cigar_sect] = section_start_chro;
				Starting_Read_Points[*cigar_sect] = section_start_read;
				Section_Read_Lengths[*cigar_sect] = this_section_length ;
				ChroNames[*cigar_sect] = (*chro);
				(*cigar_sect)++;
			}
		}

		// Tags follow the name, CIGAR words, packed sequence and qualities.
		int bin_ptr = 36 + l_read_name + seq_len + (seq_len+1)/2 + 4 * cigar_opts;
		int block_len;
		memcpy(&block_len, bin, 4);
		int found_NH = SAM_pairer_iterate_int_tags((unsigned char *)bin+bin_ptr, block_len + 4 - bin_ptr, "NH", NH_value);
		if(!found_NH) *(NH_value) = 1;

		if(assign_reads_to_RG){
			char RG_type = 0;
			SAM_pairer_iterate_tags((unsigned char *)bin+bin_ptr, block_len + 4 - bin_ptr, "RG", &RG_type, RG_ptr);
			if(RG_type != 'Z') (*RG_ptr) = NULL;
		}
		//SUBREADprintf("FOUND=%d, NH=%d, TAG=%.*s\n", found_NH, *(NH_value), 3 , bin+bin_ptr);
	}else{
		// No record for this read: reconstruct what can be known from the mate.
		(*read_name) = bin2 + 36;
		int mate_flag;
		memcpy(&mate_flag, bin2 + 16, 4);
		mate_flag = mate_flag >> 16;
		(*flag) = reverse_flag(mate_flag);

		// The mate's "mate reference/position" fields describe this read.
		int refID, mate_refID;
		memcpy(&refID, bin2 + 24, 4);
		memcpy(&mate_refID, bin2 + 4, 4);
		if(refID < 0) *chro = NULL;
		else (*chro) = sambam_chro_table[refID].chro_name;

		if(mate_refID < 0) *mate_chro = NULL;
		else (*mate_chro) = sambam_chro_table[mate_refID].chro_name;
		*ret_mate_refID = mate_refID;
		*ret_me_refID = refID;

		*pos=0;
		memcpy(pos, bin2+28, 4);
		(*pos)++;

		*mate_pos=0;
		memcpy(mate_pos, bin2+8, 4);
		(*mate_pos)++;

		// Read the mate's template length as a signed 32-bit value before
		// widening, exactly as the branch above does, so that a negative value is
		// sign-extended; the negation is done in the wide type.
		int mate_tlen_int = 0;
		memcpy(&mate_tlen_int, bin2+32, 4);
		(*tlen) = -(srInt_64)mate_tlen_int;

		if(assign_reads_to_RG){
			char RG_type = 0;
			int block2_len = 0;
			memcpy(&block2_len, bin2, 4);
			int rname2len = 0, cigar2len = 0, seq2len = 0;
			memcpy(&rname2len, bin2+12, 1);
			memcpy(&cigar2len, bin2+16, 2);
			memcpy(&seq2len, bin2+20, 4);

			int bin2_ptr = 36 + rname2len + 4 * cigar2len + seq2len + (seq2len+1)/2;
			SAM_pairer_iterate_tags((unsigned char *)bin2+bin2_ptr, block2_len + 4 - bin2_ptr, "RG", &RG_type, RG_ptr);
			if(RG_type != 'Z') (*RG_ptr) = NULL;
		}

	}
}
2804
2805 /*
2806 typedef struct {
2807 char chromosome_name_left[CHROMOSOME_NAME_LENGTH + 1];
2808 char chromosome_name_right[CHROMOSOME_NAME_LENGTH + 1];
2809 unsigned int last_exon_base_left;
2810 unsigned int first_exon_base_right;
2811 } fc_junction_info_t;
2812
2813 */
// Extracts the junctions of one read from its CIGAR sections.
//
// For every section x1 >= 1 whose preceding section was closed by an 'N'
// operation (Event_After_Section[x1-1] == 'N'), one entry is written to
// junctions_current: last_exon_base_left is the last base of the previously
// visited section, first_exon_base_right is the first base of section x1, and
// both chromosome names are copied from ChroNames[x1].
// Sections whose ChroNames entry is NULL are skipped entirely (they neither
// produce a junction nor update the remembered last base).
//
// Returns the number of junctions written. junctions_current must have room
// for cigar_sections - 1 entries. global_context, alignment_masks and
// Starting_Read_Points are not used.
int calc_junctions_from_cigarInts(fc_thread_global_context_t * global_context, int alignment_masks , int cigar_sections, unsigned int * Starting_Chro_Points_1BASE, unsigned short * Starting_Read_Points, unsigned short * Section_Lengths, char ** ChroNames, char * Event_After_Section, fc_junction_info_t * junctions_current){
	int x1, ret = 0;

	// With no sections the arrays may never have been filled in by the caller;
	// do not read element 0 in that case.
	if(cigar_sections < 1) return 0;

	unsigned int last_base_pos = Starting_Chro_Points_1BASE[0] + Section_Lengths[0] - 1;
	for(x1 = 1; x1 < cigar_sections; x1++){
		if(!ChroNames[x1]) continue; // NULL chro name for https://groups.google.com/forum/#!topic/subread/QDT6npjAZuE
		if(Event_After_Section[x1-1] == 'N'){
			unsigned int first_base_pos = Starting_Chro_Points_1BASE[x1];
			junctions_current[ret].last_exon_base_left = last_base_pos;
			junctions_current[ret].first_exon_base_right = first_base_pos;
			strcpy(junctions_current[ret].chromosome_name_left, ChroNames[x1]);
			strcpy(junctions_current[ret].chromosome_name_right, ChroNames[x1]);
			ret ++;
		}

		last_base_pos = Starting_Chro_Points_1BASE[x1] + Section_Lengths[x1] - 1;
	}
	return ret;
}
2832
2833 void add_fragment_supported_junction( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, fc_junction_info_t * supported_junctions1, int njunc1, fc_junction_info_t * supported_junctions2, int njunc2, char * RG_name);
2834
// Collects the junctions supported by a fragment and hands them to
// add_fragment_supported_junction().
//
// bin1 is parsed as the first read (with bin2 as its mate); when
// is_paired_end_mode_assign is set, bin2 is parsed as the second read as well.
// The junction lists of both reads, plus the last non-NULL RG pointer seen, are
// passed on only if at least one junction was found.
void process_line_junctions(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2) {
	fc_junction_info_t supported_junctions1[global_context -> max_M], supported_junctions2[global_context -> max_M];
	int njunc1 = 0, njunc2 = 0;
	char * RG_ptr = NULL;
	int mate_i;

	for(mate_i = 0; mate_i < 2; mate_i++){
		// The second read is only looked at in paired-end assignment mode.
		if(mate_i && !global_context -> is_paired_end_mode_assign) break;

		// Per-read scratch space filled in by parse_bin().
		unsigned int Starting_Chro_Points_1BASE[global_context -> max_M];
		unsigned short Starting_Read_Points[global_context -> max_M];
		unsigned short Section_Read_Lengths[global_context -> max_M];
		char * ChroNames[global_context -> max_M];
		char Event_After_Section[global_context -> max_M];

		char * read_chr, * read_name, * mate_chr;
		srInt_64 read_pos, mate_pos, fragment_length = 0;
		int alignment_masks, mapping_qual, NH_value;
		int is_junction_read, cigar_sections;
		int me_refID, mate_refID;
		char * RG_ptr_one = NULL;

		char * this_bin = mate_i ? bin2 : bin1;
		char * other_bin = mate_i ? bin1 : bin2;

		parse_bin(global_context -> sambam_chro_table, this_bin, other_bin, &read_name, &alignment_masks, &read_chr, &read_pos, &mapping_qual, &mate_chr, &mate_pos, &fragment_length, &is_junction_read, &cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, &NH_value, global_context -> max_M, NULL, NULL, global_context -> assign_reads_to_RG, &RG_ptr_one, &me_refID, &mate_refID);
		assert(cigar_sections <= global_context -> max_M);
		if(RG_ptr_one) RG_ptr = RG_ptr_one;

		if(mate_i)
			njunc2 = calc_junctions_from_cigarInts(global_context, alignment_masks, cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, supported_junctions2);
		else
			njunc1 = calc_junctions_from_cigarInts(global_context, alignment_masks, cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, supported_junctions1);
	}

	if(njunc1 <= 0 && njunc2 <= 0) return;
	add_fragment_supported_junction(global_context, thread_context, supported_junctions1, njunc1, supported_junctions2, njunc2, RG_ptr);
}
2869
// Returns the per-read-group tables of this thread for `rg_name`, creating and
// registering them in thread_context->RG_table on first use.
//
// The returned array has four slots:
//   [0] zero-filled array of count_table_size read_count_type_t counters
//   [1] zero-filled fc_read_counters
//   [2] junction counting hash table, or NULL when do_junction_counting is off
//   [3] splicing point hash table,   or NULL when do_junction_counting is off
// The hash table key is a private copy of rg_name, so the caller's string need
// not outlive the call.
//
// NOTE(review): as in the original code, allocation failures are not handled.
void ** get_RG_tables(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * rg_name){
	void ** ret = HashTableGet(thread_context->RG_table, rg_name);
	if(ret) return ret;

	ret = malloc(sizeof(void *)*4);

	// calloc gives the zero-filled counters directly.
	ret[0] = calloc(thread_context -> count_table_size, sizeof(read_count_type_t));
	ret[1] = calloc(1, sizeof(fc_read_counters));

	// Both junction slots are always defined, so no slot is ever read
	// uninitialised when junction counting is disabled.
	ret[2] = NULL;
	ret[3] = NULL;

	if(global_context -> do_junction_counting){
		HashTable * junction_counting_table = HashTableCreate(131317);
		HashTableSetHashFunction(junction_counting_table,HashTableStringHashFunction);
		HashTableSetDeallocationFunctions(junction_counting_table, free, NULL);
		HashTableSetKeyComparisonFunction(junction_counting_table, fc_strcmp_chro);

		HashTable * splicing_point_table = HashTableCreate(131317);
		HashTableSetHashFunction(splicing_point_table,HashTableStringHashFunction);
		HashTableSetDeallocationFunctions(splicing_point_table, free, NULL);
		HashTableSetKeyComparisonFunction(splicing_point_table, fc_strcmp_chro);

		ret [2] = junction_counting_table;
		ret [3] = splicing_point_table;
	}

	size_t rg_name_bytes = strlen(rg_name) + 1;
	char * rg_name_mem = malloc(rg_name_bytes);
	memcpy(rg_name_mem, rg_name, rg_name_bytes);
	HashTablePut(thread_context->RG_table, rg_name_mem, ret);
	return ret;
}
2902
2903 void add_scRNA_read_tota1_no( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * read_name, char * bin1, int step);
// Forwards one record to add_scRNA_read_tota1_no() with step 2.
// The read name is taken from byte offset 36 of bin1; bin2 is not used.
void process_scRNAr2_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2){
	add_scRNA_read_tota1_no(global_context, thread_context, &bin1[36], bin1, 2);
}
2908
process_pairer_scRNAr2_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)2909 int process_pairer_scRNAr2_output(void * pairer_vp, int thread_no, char * bin1, char * bin2){
2910 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2911 fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2912 fc_thread_thread_context_t * thread_context = global_context -> thread_contexts + thread_no;
2913 process_scRNAr2_line_buffer(global_context, thread_context, bin1, bin2);
2914 return 0;
2915 }
2916
2917
process_pairer_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)2918 int process_pairer_output(void * pairer_vp, int thread_no, char * bin1, char * bin2){
2919 SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2920 fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2921 fc_thread_thread_context_t * thread_context = global_context -> thread_contexts + thread_no;
2922
2923 if(pairer -> long_cigar_mode){
2924 if(global_context -> max_M < 65536){
2925 //SUBREADprintf("SWITCHED INTO LONG-READ MODE\n");
2926 global_context -> max_M = 65536;
2927 }
2928 if(!global_context->is_read_too_long_to_SAM_BAM_shown &&(global_context -> is_read_details_out == FILE_TYPE_SAM || global_context -> is_read_details_out == FILE_TYPE_BAM)){
2929 global_context -> is_read_details_out = 0;
2930 SUBREADprintf("ERROR: The read is too long to the SAM or BAM output.\nPlease use the 'CORE' mode for the assignment detail output.\n");
2931 global_context->is_read_too_long_to_SAM_BAM_shown = 1;
2932 }
2933 }
2934
2935 process_line_buffer(global_context, thread_context, bin1, bin2);
2936 if(0 && global_context -> do_junction_counting){
2937 process_line_junctions(global_context, thread_context, bin1, bin2);
2938 }
2939 return 0;
2940 }
2941
2942 void sort_bucket_table(fc_thread_global_context_t * global_context);
2943 void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,
2944 srInt_64 * hits_indices1, int nhits1, srInt_64 * hits_indices2, int nhits2, unsigned int total_frag_len,
2945 char ** hits_chro1, char ** hits_chro2, unsigned int * hits_start_pos1, unsigned int * hits_start_pos2, unsigned short * hits_length1, unsigned short * hits_length2, int fixed_fractional_count, char * read_name, char * RG_name, char * bin1, char * bin2);
2946
// Creates a copy of the BAM record `oldbin` with extra optional fields appended.
//   oldbin : record starting with its 4-byte block size (the size excludes those 4 bytes).
//   newbin : receives a malloc'ed record that the caller must free(); it is set to NULL
//            if the allocation fails.
//   tags   : NULL-terminated array of two-character tag names.
//   types  : one type character per tag.  'i' stores a 4-byte integer; any other type
//            stores the NUL-terminated string pointed to by vals[k].
//   vals   : one value per tag.  For 'i' the integer is carried in the pointer value itself.
// Each appended field is laid out as tag (2 bytes), type (1 byte), value; the block size
// at the start of the new record is updated to cover the appended fields.
void add_bin_new_tags(char * oldbin, char **newbin, char ** tags, char * types, void ** vals){
	int new_tags_length = 0;
	int tagi;
	for(tagi = 0; tags[tagi]; tagi++){
		if(types[tagi] == 'i') new_tags_length += 7;                 // 2 + 1 + 4-byte integer
		else new_tags_length += 4 + strlen((char *)vals[tagi]);     // 2 + 1 + string + NUL
	}

	int oldbin_len;
	memcpy(&oldbin_len, oldbin, 4);
	oldbin_len += 4;	// include the block-size field itself

	int newbin_len = oldbin_len + new_tags_length;
	char * out = malloc(newbin_len);
	(*newbin) = out;
	if(out == NULL) return;

	memcpy(out, oldbin, oldbin_len);
	int new_block_size = newbin_len - 4;
	memcpy(out, &new_block_size, 4);

	int write_ptr = oldbin_len;
	for(tagi = 0; tags[tagi]; tagi++){
		memcpy(out + write_ptr, tags[tagi], 2);
		out[write_ptr + 2] = types[tagi];
		if(types[tagi] == 'i'){
			// The integer travels inside the pointer; recover it with an integer
			// conversion instead of subtracting NULL from a void pointer.
			int intv = (int)(size_t)vals[tagi];
			memcpy(out + write_ptr + 3, &intv, 4);
			write_ptr += 7;
		}else{
			int vlen = strlen((char *)(vals[tagi])) + 1;
			memcpy(out + write_ptr + 3, vals[tagi], vlen);
			write_ptr += 3 + vlen;
		}
	}
}
2981
2982
2983
// Serialises the records held in thread_context -> read_details_buff[write_start, write_end)
// into bam_buf and returns the number of bytes produced.
//   SAM detail output : every record is converted to text by convert_BAM_binary_to_SAM()
//                       and terminated with '\n' (a NUL is stored after the newline but
//                       is not counted in the return value).
//   any other output  : the whole range is raw-deflated and wrapped into one gzip member
//                       with a "BC" extra subfield (the BGZF block layout): an 18-byte
//                       header, the compressed data, then CRC32 and the uncompressed length.
// The return codes of deflateInit2()/deflate() are not checked.
// NOTE(review): the 2-byte header fields are copied from the first two bytes of an int,
// which yields the intended values only on a little-endian host -- confirm.
int compress_read_detail_BAM(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, int write_start, int write_end, char * bam_buf){
	if(global_context -> is_read_details_out != FILE_TYPE_SAM){
		// there may be multiple reads in the buffer.
		int raw_len = write_end - write_start;
		int deflated_len;
		unsigned int crc_of_raw;
		int mem_level = 8;

		thread_context -> bam_file_output_stream.avail_out = 66600;
		thread_context -> bam_file_output_stream.avail_in = raw_len;
		crc_of_raw = FC_CRC32(thread_context -> read_details_buff + write_start , raw_len);

		thread_context -> bam_file_output_stream.zalloc = Z_NULL;
		thread_context -> bam_file_output_stream.zfree = Z_NULL;
		thread_context -> bam_file_output_stream.opaque = Z_NULL;

		// windowBits of -15 requests a raw deflate stream (no zlib header or trailer).
		deflateInit2(&thread_context -> bam_file_output_stream, raw_len?Z_BEST_SPEED:Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, mem_level, Z_DEFAULT_STRATEGY);

		thread_context -> bam_file_output_stream.next_in = (unsigned char*) thread_context -> read_details_buff + write_start;
		thread_context -> bam_file_output_stream.next_out = (unsigned char*) (bam_buf + 18);

		deflate(&thread_context -> bam_file_output_stream, Z_FINISH);
		deflateEnd(&thread_context -> bam_file_output_stream);

		deflated_len = 66600 -thread_context -> bam_file_output_stream.avail_out;

		// gzip member header
		bam_buf[0]=31;		// ID1
		bam_buf[1]=-117;	// ID2
		bam_buf[2]=8;		// CM
		bam_buf[3]=4;		// FLG
		memset(bam_buf+4, 0, 5);	// MTIME and XFL
		bam_buf[9] = 0xff;	// OS

		int field = 6;
		memcpy(bam_buf+10, &field, 2);	// length of the extra field
		bam_buf[12]=66;		// SI1
		bam_buf[13]=67;		// SI2
		field = 2;
		memcpy(bam_buf+14, &field, 2);	// length of the subfield payload
		field = deflated_len + 19 + 6;
		memcpy(bam_buf+16, &field, 2);	// BSIZE

		// trailer: CRC32 of the raw data, then its length
		memcpy(bam_buf+18+deflated_len, &crc_of_raw, 4);
		memcpy(bam_buf+18+deflated_len+4, &raw_len, 4);
		return 18+deflated_len+8;
	}

	// SAM text output: the records in the range are converted one by one.
	int sam_len = 0;
	int rec_len = 0;
	int rec_ptr;
	for(rec_ptr = write_start; rec_ptr < write_end; rec_ptr += rec_len){
		memcpy(&rec_len, thread_context -> read_details_buff + rec_ptr, 4);
		rec_len += 4;	// the stored size excludes the size field itself
		int txtlen = convert_BAM_binary_to_SAM(global_context -> sambam_chro_table, thread_context -> read_details_buff + rec_ptr, bam_buf + sam_len);
		bam_buf[sam_len + txtlen] = '\n';
		bam_buf[sam_len + txtlen + 1] = 0;
		sam_len += txtlen + 1;
	}
	return sam_len;
}
3050
// Flushes the per-thread assignment-detail buffer (read_details_buff) to
// global_context -> read_details_out_FP and empties it.
//   BAM output with less than 64000 buffered bytes: the buffer is compressed as one block.
//   Otherwise the buffer is walked record by record; a block is emitted whenever more
//   than 64000 bytes have accumulated, at the end of the buffer, or after every record
//   when the output is SAM.
// The file write is done while holding read_details_out_lock.  A short write sets
// global_context -> disk_is_full.  If a record length is outside [9, 3*MAX_FC_READ_LENGTH]
// an error is printed, nothing is written and the buffered data is discarded so that
// the same corrupt bytes are not parsed again on the next flush.
// NOTE(review): a block is only closed after its size has passed 64000 bytes, so a
// large last record can make a block bigger than that -- confirm this is acceptable
// for the capacity of bam_compressed_buff and for the block size limit of the format.
void write_read_detailed_remainder(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context){
	int write_bin_ptr = 0;
	int last_written_ptr = 0;
	int bam_compressed_buff_ptr = 0;

	if(thread_context -> read_details_buff_used <1)return;

	if(global_context -> is_read_details_out == FILE_TYPE_BAM && thread_context -> read_details_buff_used < 64000){
		bam_compressed_buff_ptr = compress_read_detail_BAM(global_context, thread_context, 0, thread_context -> read_details_buff_used, thread_context -> bam_compressed_buff);
	}else while(write_bin_ptr < thread_context -> read_details_buff_used){
		int tmplen = 0;
		memcpy(&tmplen, thread_context -> read_details_buff + write_bin_ptr, 4);
		if(tmplen < 9 || tmplen > 3*MAX_FC_READ_LENGTH){
			SUBREADprintf("ERROR: Format error : len = %d\n", tmplen);
			// Drop the unusable content; keeping it would make every later flush
			// fail on the same bytes while the buffer keeps growing.
			thread_context -> read_details_buff_used = 0;
			return ;
		}
		tmplen +=4;	// the stored length excludes the length field itself
		write_bin_ptr += tmplen;
		if(write_bin_ptr - last_written_ptr > 64000 || write_bin_ptr >= thread_context -> read_details_buff_used || global_context -> is_read_details_out == FILE_TYPE_SAM){
			bam_compressed_buff_ptr += compress_read_detail_BAM(global_context, thread_context, last_written_ptr, write_bin_ptr, thread_context -> bam_compressed_buff + bam_compressed_buff_ptr);
			last_written_ptr = write_bin_ptr;
		}
	}

	pthread_spin_lock(&global_context -> read_details_out_lock);
	size_t written = fwrite(thread_context -> bam_compressed_buff, 1, bam_compressed_buff_ptr , global_context -> read_details_out_FP);
	if(written < (size_t)bam_compressed_buff_ptr) global_context -> disk_is_full = 1;
	pthread_spin_unlock(&global_context -> read_details_out_lock);
	thread_context -> read_details_buff_used =0;
}
3081
3082
// Appends one record to the per-thread assignment-detail buffer.
//   bin      : record starting with its 4-byte length (the length excludes those 4 bytes).
//   do_write : when non-zero, the buffer is flushed through write_read_detailed_remainder()
//              if the detail output is SAM or at least 55000 bytes are buffered.
// Returns 0 on success.  A record longer than MAX_FC_READ_LENGTH * 3 bytes (length field
// included) is not stored: an error is printed once per run and -1 is returned.
int add_read_detail_bin_buff(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin, int do_write){
	int record_len = 0;

	memcpy(&record_len, bin, 4);
	record_len += 4;

	if(record_len <= MAX_FC_READ_LENGTH * 3){
		memcpy(thread_context -> read_details_buff + thread_context -> read_details_buff_used, bin, record_len);
		thread_context -> read_details_buff_used += record_len;

		if(!do_write) return 0;

		if(global_context -> is_read_details_out == FILE_TYPE_SAM || thread_context -> read_details_buff_used >= 55000)
			write_read_detailed_remainder(global_context, thread_context);
		return 0;
	}

	// Oversized record: report only the first occurrence.
	if(!global_context->is_read_too_long_to_SAM_BAM_shown){
		SUBREADprintf("ERROR: The read is too long to the SAM or BAM output.\nPlease use the 'CORE' mode for the assignment detail output.\n");
		global_context->is_read_too_long_to_SAM_BAM_shown = 1;
	}
	return -1;
}
3104
// Records the assignment result of one read or read pair in the detail output.
//   status        : assignment status text (stored as the XS tag in SAM/BAM output).
//   feature_count : number of assigned features; the XN and XT tags are added only when it is > 0.
//   features      : feature list text, may be NULL (printed as "NA" in the text format).
//   bin1, bin2    : the record(s) of the read; either may be NULL.
// FILE_TYPE_RSUBREAD output: one tab-separated text line is written directly to
// read_details_out_FP.  Any other output type: a tagged copy of each record is appended
// to the per-thread detail buffer; the buffer may only be flushed after the last record
// of the pair has been added.
// Returns 1 on success, or 0 if the text line could not be written, in which case
// global_context -> disk_is_full is also set.
int write_read_details_FP(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * status, int feature_count, char * features, char * bin1, char * bin2){
	int ret = 1;

	if(global_context -> is_read_details_out == FILE_TYPE_RSUBREAD){
		char * read_name;
		get_readname_from_bin(bin1?bin1:bin2, &read_name);
		// fprintf reports an output error with a negative return value.
		if(fprintf(global_context -> read_details_out_FP, "%s\t%s\t%d\t%s\n", read_name, status, feature_count, features?features:"NA") < 0)
			ret = 0;
	}else{
		char * out_bin1 = NULL, *out_bin2 = NULL;
		char * tags[4];
		char types[4];
		void * vals[4];

		// The tag list is NULL-terminated, so a NULL in tags[1] hides XN and XT.
		tags[0]="XS";
		tags[1]=feature_count >0?"XN":NULL;
		tags[2]=feature_count >0?"XT":NULL;
		tags[3]=NULL;
		types[0]='Z';
		types[1]='i';
		types[2]='Z';
		vals[0]=status;
		// Integer values travel inside the pointer itself.
		vals[1]=(void *)(size_t)feature_count;
		vals[2]=features;

		if(bin1){
			add_bin_new_tags(bin1, &out_bin1, tags, types, vals);
			add_read_detail_bin_buff(global_context, thread_context, out_bin1, bin2 == NULL);
			free(out_bin1);
		}

		if(bin2){
			add_bin_new_tags(bin2, &out_bin2, tags, types, vals);
			add_read_detail_bin_buff(global_context, thread_context, out_bin2, 1);
			free(out_bin2);
		}
	}
	if(ret < 1) global_context -> disk_is_full = 1;
	return ret;
}
3145
// Reports chromosome/contig names that appear in only one of the annotation and the
// input file.  Names from the input file are first translated through
// BAM_chros_to_anno_table when that table exists and has an entry for the name.
// Nothing is reported unless global_context -> is_verbose is set, so the function
// returns before building any table in that case.
void warning_anno_BAM_chromosomes(fc_thread_global_context_t * global_context){
	if(!global_context -> is_verbose) return;

	int x1;
	void * present = (void *)1;	// non-NULL marker stored as the value of every key

	HashTable * BAM_chro_tab = HashTableCreate(1117);
	HashTableSetHashFunction(BAM_chro_tab,HashTableStringHashFunction);
	HashTableSetKeyComparisonFunction(BAM_chro_tab,fc_strcmp_chro );

	for(x1 = 0; x1 < global_context -> sambam_chro_table_items; x1++){
		char * BAM_chro = global_context -> sambam_chro_table[x1].chro_name;
		if( global_context -> BAM_chros_to_anno_table){
			char * tmp_chro = HashTableGet(global_context -> BAM_chros_to_anno_table, global_context -> sambam_chro_table[x1].chro_name);
			if(tmp_chro) BAM_chro = tmp_chro;
		}
		HashTablePut(BAM_chro_tab, BAM_chro, present);
	}

	HashTable * ANNO_chro_tab = HashTableCreate(1117);
	HashTableSetHashFunction(ANNO_chro_tab,HashTableStringHashFunction);
	HashTableSetKeyComparisonFunction(ANNO_chro_tab,fc_strcmp_chro );

	for(x1 = 0 ; x1 < global_context -> exontable_exons ; x1++)
		HashTablePut(ANNO_chro_tab, global_context -> exontable_chr[x1], present);

	warning_hash_hash(ANNO_chro_tab, BAM_chro_tab, "Chromosomes/contigs in annotation but not in input file");
	warning_hash_hash(BAM_chro_tab, ANNO_chro_tab, "Chromosomes/contigs in input file but not in annotation");

	HashTableDestroy(BAM_chro_tab);
	HashTableDestroy(ANNO_chro_tab);
}
3175
3176 void add_scRNA_read_to_pool( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, srInt_64 assign_target_number, char * read_name, char * read_bin, ArrayList * target_list );
3177
process_line_buffer(fc_thread_global_context_t * global_context,fc_thread_thread_context_t * thread_context,char * bin1,char * bin2)3178 void process_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2)
3179 {
3180 if(global_context -> is_input_bad_format) return;
3181 char * read_chr, *read_name, *mate_chr;
3182 srInt_64 read_pos, fragment_length = 0, mate_pos;
3183 unsigned int search_start = 0, search_end;
3184 int nhits1 = 0, nhits2 = 0, alignment_masks, search_block_id, search_item_id, mapping_qual;
3185
3186
3187 //long * hits_indices1 = thread_context -> hits_indices1, * hits_indices2 = thread_context -> hits_indices2;
3188 //unsigned int * hits_start_pos1 = thread_context -> hits_start_pos1 , * hits_start_pos2 = thread_context -> hits_start_pos2;
3189 //unsigned short * hits_length1 = thread_context -> hits_length1 , * hits_length2 = thread_context -> hits_length2;
3190 //char ** hits_chro1 = thread_context -> hits_chro1 , **hits_chro2 = thread_context -> hits_chro2;
3191
3192 unsigned int total_frag_len =0;
3193
3194 int cigar_sections, is_junction_read;
3195 unsigned int * Starting_Chro_Points_1BASE = thread_context -> proc_Starting_Chro_Points_1BASE;
3196 unsigned short * Starting_Read_Points = thread_context -> proc_Starting_Read_Points;
3197 unsigned short * Section_Read_Lengths = thread_context -> proc_Section_Read_Lengths;
3198 char ** ChroNames = thread_context -> proc_ChroNames;
3199 char * Event_After_Section = thread_context -> proc_Event_After_Section;
3200
3201 CIGAR_interval_t * CIGAR_intervals_R1 = thread_context -> proc_CIGAR_intervals_R1;
3202 CIGAR_interval_t * CIGAR_intervals_R2 = thread_context -> proc_CIGAR_intervals_R2;
3203
3204 int is_second_read;
3205 int maximum_NH_value = 1, NH_value;
3206 int skipped_for_exonic = 0;
3207 int first_read_quality_score = 0, CIGAR_intervals_R1_sections = 0, CIGAR_intervals_R2_sections = 0;
3208
3209 if(thread_context -> thread_id == 0 && thread_context -> all_reads < 1){
3210 warning_anno_BAM_chromosomes(global_context);
3211 }
3212
3213 if(global_context -> need_calculate_overlap_len ){
3214 memset( CIGAR_intervals_R1, 0, sizeof(CIGAR_interval_t) * global_context -> max_M );
3215 memset( CIGAR_intervals_R2, 0, sizeof(CIGAR_interval_t) * global_context -> max_M );
3216 }
3217
3218 thread_context->all_reads++;
3219 //if(thread_context->all_reads>1000000) printf("TA=%llu\n%s\n",thread_context->all_reads, thread_context -> line_buffer1);
3220
3221
3222 char * RG_ptr;
3223 int me_refID =-1, mate_refID =-1, this_is_inconsistent_read_type = 0;
3224 for(is_second_read = 0 ; is_second_read < 2; is_second_read++)
3225 {
3226 if(is_second_read && !global_context -> is_paired_end_mode_assign) break;
3227
3228 RG_ptr = NULL;
3229 parse_bin(global_context -> sambam_chro_table, is_second_read?bin2:bin1, is_second_read?bin1:bin2 , &read_name, &alignment_masks , &read_chr, &read_pos, &mapping_qual, &mate_chr, &mate_pos, &fragment_length, &is_junction_read, &cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, &NH_value, global_context -> max_M , global_context -> need_calculate_overlap_len?(is_second_read?CIGAR_intervals_R2:CIGAR_intervals_R1):NULL, is_second_read?&CIGAR_intervals_R2_sections:&CIGAR_intervals_R1_sections, global_context -> assign_reads_to_RG, &RG_ptr, &me_refID, &mate_refID);
3230
3231 // this will be done in the other function.
3232 if(global_context -> is_paired_end_mode_assign && (alignment_masks&1)==0) alignment_masks|=8;
3233
3234 //#warning "========= DEBUG OUTPUT =============="
3235 if(0 && FIXLENstrcmp("SEV0112_0155:7:1303:14436:74270", read_name)==0){
3236 SUBREADprintf("RTEST:%s R_%d %p, %p FLAGS %d\n", read_name, 1+is_second_read, bin1, bin2, alignment_masks);
3237 }
3238
3239 if(global_context -> assign_reads_to_RG && NULL == RG_ptr)return;
3240
3241 if( ( alignment_masks & SAM_FLAG_PAIRED_TASK ) && !global_context -> any_reads_are_PE ) global_context -> any_reads_are_PE=1;
3242 if(((!global_context -> is_paired_end_reads_expected) && ( alignment_masks & SAM_FLAG_PAIRED_TASK )) || ((global_context -> is_paired_end_reads_expected) && 0 == ( alignment_masks & SAM_FLAG_PAIRED_TASK ))){
3243 if(global_context -> is_mixed_PE_SE == 0) global_context -> is_mixed_PE_SE =1;
3244 if(!global_context -> is_paired_end_reads_expected){
3245 SUBREADprintf("ERROR: Paired-end reads were detected in single-end read library : %s\n", global_context -> input_file_name);
3246 global_context -> is_input_bad_format = 1;
3247 return;
3248 }
3249 this_is_inconsistent_read_type = 1;
3250 }
3251
3252 if(global_context -> do_scRNA_table)add_scRNA_read_tota1_no(global_context, thread_context, read_name, bin1, 0);
3253
3254 if(is_second_read == 0)
3255 {
3256 //skip the read if unmapped (its mate will be skipped as well if paired-end)
3257 if( ((!global_context -> is_paired_end_mode_assign) && (alignment_masks & SAM_FLAG_UNMAPPED) ) ||
3258 ((alignment_masks & SAM_FLAG_UNMAPPED) && (alignment_masks & SAM_FLAG_MATE_UNMATCHED) && global_context -> is_paired_end_mode_assign)) {
3259 if(RG_ptr){
3260 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3261 fc_read_counters * sumtab = tab4s[1];
3262 sumtab -> unassigned_unmapped++;
3263 }else
3264 thread_context->read_counters.unassigned_unmapped ++;
3265
3266 if(global_context -> read_details_out_FP)
3267 write_read_details_FP(global_context , thread_context ,"Unassigned_Unmapped",0, NULL, bin1, bin2);
3268
3269 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3270 return; // do nothing if a read is unmapped, or the first read in a pair of reads is unmapped.
3271 }
3272 }
3273
3274 if(global_context -> do_scRNA_table)add_scRNA_read_tota1_no(global_context, thread_context, read_name, bin1, 1);
3275 if(((alignment_masks & SAM_FLAG_UNMAPPED) || (alignment_masks & SAM_FLAG_MATE_UNMATCHED)) && global_context -> is_paired_end_mode_assign && global_context -> is_both_end_required){
3276 if(RG_ptr){
3277 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3278 fc_read_counters * sumtab = tab4s[1];
3279 sumtab -> unassigned_singleton++;
3280 }else
3281 thread_context->read_counters.unassigned_singleton ++;
3282
3283 if(global_context -> read_details_out_FP)
3284 write_read_details_FP(global_context , thread_context ,"Unassigned_Singleton",0, NULL, bin1, bin2);
3285
3286 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3287 return;
3288 }
3289
3290 if(this_is_inconsistent_read_type){
3291 if(global_context -> is_both_end_required && 0 == ( alignment_masks & SAM_FLAG_PAIRED_TASK )){
3292 if(global_context -> read_details_out_FP)
3293 write_read_details_FP(global_context , thread_context ,"Unassigned_Singleton",0, NULL, bin1, bin2);
3294 if(RG_ptr){
3295 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3296 fc_read_counters * sumtab = tab4s[1];
3297 sumtab -> unassigned_singleton++;
3298 }else thread_context->read_counters.unassigned_singleton ++;
3299
3300 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3301 return; // when running on PE mode, SE reads are seen as "only one end mapped"
3302 }
3303 }
3304
3305 if(global_context -> min_mapping_quality_score>0)
3306 {
3307 //printf("SECOND=%d; FIRST=%d; THIS=%d; Q=%d\n", is_second_read, first_read_quality_score, mapping_qual, );
3308 if(( mapping_qual < global_context -> min_mapping_quality_score && ! global_context -> is_paired_end_mode_assign)||( is_second_read && max( first_read_quality_score, mapping_qual ) < global_context -> min_mapping_quality_score))
3309 {
3310 thread_context->read_counters.unassigned_mappingquality ++;
3311
3312 if(global_context -> read_details_out_FP)
3313 {
3314 write_read_details_FP(global_context, thread_context, "Unassigned_MappingQuality", 0, NULL, bin1, bin2);
3315 }
3316
3317 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3318 return;
3319 }
3320 if(is_second_read==0 && global_context -> is_paired_end_mode_assign)
3321 {
3322 first_read_quality_score = mapping_qual;
3323 }
3324 }
3325
3326 if(is_second_read == 0 && global_context -> is_paired_end_mode_assign &&
3327 (global_context -> is_PE_distance_checked || global_context -> is_chimertc_disallowed)
3328 )
3329 {
3330 int is_half_mapped = (alignment_masks & SAM_FLAG_UNMAPPED) || (alignment_masks & SAM_FLAG_MATE_UNMATCHED);
3331
3332 if(!is_half_mapped)
3333 {
3334 fragment_length = abs( fragment_length ); //get the fragment length
3335
3336 int is_first_read_negative_strand = (alignment_masks & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
3337 int is_second_read_negative_strand = (alignment_masks & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED)?1:0;
3338
3339 if(mate_chr == read_chr && is_first_read_negative_strand!=is_second_read_negative_strand) {
3340 //^^^^^^^^^^^^^^^^^^^^ They are directly compared because they are both pointers in the same contig name table.
3341 //
3342 if(global_context -> is_PE_distance_checked && ((fragment_length > global_context -> max_paired_end_distance) || (fragment_length < global_context -> min_paired_end_distance))) {
3343 if(RG_ptr){
3344 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3345 fc_read_counters * sumtab = tab4s[1];
3346 sumtab -> unassigned_fragmentlength++;
3347 }else
3348 thread_context->read_counters.unassigned_fragmentlength ++;
3349
3350 if(global_context -> read_details_out_FP)
3351 write_read_details_FP(global_context, thread_context, "Unassigned_FragmentLength", -1, NULL, bin1, bin2);
3352 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3353 return;
3354 }
3355 } else {
3356 if(global_context -> is_chimertc_disallowed) {
3357 if(RG_ptr){
3358 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3359 fc_read_counters * sumtab = tab4s[1];
3360 sumtab -> unassigned_chimericreads++;
3361 }else
3362 thread_context->read_counters.unassigned_chimericreads ++;
3363
3364 if(global_context -> read_details_out_FP)
3365 write_read_details_FP(global_context, thread_context, "Unassigned_Chimera", -1, NULL, bin1, bin2);
3366 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3367 return;
3368 }
3369 }
3370 }
3371 }
3372
3373 // This filter has to be put here because the 0x400 FLAG is not about mapping but about sequencing.
3374 // A unmapped read with 0x400 FLAG should be able to kill the mapped mate which may have no 0x400 FLAG.
3375 if(global_context -> is_duplicate_ignored)
3376 {
3377 if(alignment_masks & SAM_FLAG_DUPLICATE)
3378 {
3379 if(RG_ptr){
3380 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3381 fc_read_counters * sumtab = tab4s[1];
3382 sumtab -> unassigned_duplicate++;
3383 }else thread_context->read_counters.unassigned_duplicate ++;
3384 if(global_context -> read_details_out_FP)
3385 write_read_details_FP(global_context, thread_context, "Unassigned_Duplicate", -1, NULL, bin1, bin2);
3386
3387 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3388 return;
3389 }
3390
3391 }
3392
3393 if(SAM_FLAG_UNMAPPED & alignment_masks) continue;
3394
3395 if( NH_value > 1 ) {
3396 if(global_context -> is_multi_mapping_allowed == 0) {
3397 // now it is a NH>1 read!
3398 // not allow multimapping -> discard!
3399 if(RG_ptr){
3400 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3401 fc_read_counters * sumtab = tab4s[1];
3402 sumtab -> unassigned_multimapping++;
3403 }else thread_context->read_counters.unassigned_multimapping ++;
3404
3405 if(global_context -> read_details_out_FP)
3406 write_read_details_FP(global_context, thread_context, "Unassigned_MultiMapping", -1, NULL, bin1, bin2);
3407
3408 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3409 return;
3410 }
3411 }
3412
3413 maximum_NH_value = max(maximum_NH_value, NH_value);
3414
3415 // if a pair of reads have one secondary, the entire fragment is seen as secondary.
3416 if((alignment_masks & SAM_FLAG_SECONDARY_MAPPING) && (global_context -> is_primary_alignment_only)) {
3417 if(RG_ptr){
3418 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3419 fc_read_counters * sumtab = tab4s[1];
3420 sumtab -> unassigned_secondary++;
3421 }else thread_context->read_counters.unassigned_secondary ++;
3422
3423 if(global_context -> read_details_out_FP)
3424 write_read_details_FP(global_context, thread_context, "Unassigned_Secondary", -1, NULL, bin1, bin2);
3425 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3426 return;
3427 }
3428
3429 int is_this_negative_strand = (alignment_masks & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
3430 int is_fragment_negative_strand = is_this_negative_strand;
3431
3432 if(1 || global_context -> is_paired_end_mode_assign){ // On 20 JULY 2020: If strand-specific counting is on, isPairedEnd = TRUE, countReadPairs = FALSE and the BAM file contains mixed reads, then the single-end reads and the R1 reads in read-pairs will be directly compared with the strand of the gene, but the R2 reads in read-pairs will be compared with the opposite strand of the gene. A read-pair will be counted twice no matter if the strand-specific mode is on or off. If the argument to the strand-specific option is "1", then R1 must have the same strand of the gene and R2 must have the opposite strand of the gene to be counted.
3433 int is_second_read_in_pair = alignment_masks & SAM_FLAG_SECOND_READ_IN_PAIR;
3434 //is_fragment_negative_strand = is_second_read_in_pair?(!is_this_negative_strand):is_this_negative_strand;
3435 if(is_second_read_in_pair)
3436 is_fragment_negative_strand = global_context -> is_second_read_straight?is_this_negative_strand:(!is_this_negative_strand);
3437 else
3438 is_fragment_negative_strand = global_context -> is_first_read_reversed?(!is_this_negative_strand):is_this_negative_strand;
3439 }
3440
3441 int nhits = 0;
3442
3443 int cigar_section_id;
3444 srInt_64 * hits_indices = is_second_read?thread_context -> hits_indices2:thread_context -> hits_indices1;
3445 unsigned int * hits_start_pos = is_second_read?thread_context -> hits_start_pos2:thread_context -> hits_start_pos1;
3446 unsigned short * hits_length = is_second_read?thread_context -> hits_length2:thread_context -> hits_length1;
3447 char ** hits_chro = is_second_read?thread_context -> hits_chro2:thread_context -> hits_chro1;
3448
3449 if(global_context->is_split_or_exonic_only == 1 && !is_junction_read) {
3450 skipped_for_exonic ++;
3451
3452 if(skipped_for_exonic == 1 + global_context -> is_paired_end_mode_assign){
3453 if(global_context -> read_details_out_FP)
3454 write_read_details_FP(global_context, thread_context, (global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", -1, NULL, bin1, bin2);
3455
3456 if(RG_ptr){
3457 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3458 fc_read_counters * sumtab = tab4s[1];
3459 sumtab -> unassigned_junction_condition++;
3460 }else thread_context->read_counters.unassigned_junction_condition ++;
3461
3462 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3463 return;
3464 }
3465 }
3466
3467
3468 if(global_context->is_split_or_exonic_only == 2 && is_junction_read) {
3469 if(global_context -> read_details_out_FP)
3470 write_read_details_FP(global_context, thread_context,(global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", -1, NULL, bin1, bin2);
3471 if(RG_ptr){
3472 void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3473 fc_read_counters * sumtab = tab4s[1];
3474 sumtab -> unassigned_junction_condition++;
3475 }else thread_context->read_counters.unassigned_junction_condition ++;
3476
3477 if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3478 return;
3479 }
3480
3481 if(1) {
3482
3483 if(0)SUBREADprintf("MAPPED R_%d to %s : CHR_POS=%u + %u, CHR_LEN=%u\n", is_second_read+1, global_context -> sambam_chro_table[me_refID]. chro_name, Starting_Chro_Points_1BASE[0], Section_Read_Lengths[0], global_context -> sambam_chro_table[me_refID] .chro_length );
3484
3485 if(global_context -> read_shift_size>0){
3486 int shifting_applied_length = 0;
3487 int shifting_i;
3488
3489 if((global_context -> read_shift_type == READ_SHIFT_UPSTREAM && (!is_this_negative_strand))||
3490 (global_context -> read_shift_type == READ_SHIFT_DOWNSTREAM && is_this_negative_strand ))
3491 shifting_applied_length = -global_context -> read_shift_size;
3492
3493 if((global_context -> read_shift_type == READ_SHIFT_UPSTREAM && is_this_negative_strand)||
3494 (global_context -> read_shift_type == READ_SHIFT_DOWNSTREAM && (!is_this_negative_strand)))
3495 shifting_applied_length = global_context -> read_shift_size;
3496
3497 if(global_context -> read_shift_type == READ_SHIFT_LEFT) shifting_applied_length = -global_context -> read_shift_size;
3498 if(global_context -> read_shift_type == READ_SHIFT_RIGHT) shifting_applied_length = global_context -> read_shift_size;
3499
3500 if(shifting_applied_length < 0 && Starting_Chro_Points_1BASE[0] <=-shifting_applied_length) shifting_applied_length = - Starting_Chro_Points_1BASE[0]+1;
3501 if(shifting_applied_length > 0 && Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + shifting_applied_length > global_context -> sambam_chro_table[me_refID].chro_length +1 )
3502 shifting_applied_length = global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3503
3504 for(shifting_i = 0; shifting_i < cigar_sections ; shifting_i++)
3505 Starting_Chro_Points_1BASE[shifting_i] += shifting_applied_length;
3506 }
3507
3508 if(global_context -> five_end_extension)
3509 {
3510 if(is_this_negative_strand){
3511 int applied_ext = global_context -> five_end_extension;
3512
3513 if( Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + applied_ext > global_context -> sambam_chro_table[me_refID].chro_length +1 )
3514 applied_ext = global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3515
3516 Section_Read_Lengths [cigar_sections - 1] += applied_ext;
3517 }else{
3518 //SUBREADprintf("5-end extension: %d [%d]\n", Starting_Chro_Points_1BASE[0], Section_Lengths[0]);
3519 if( read_pos > global_context -> five_end_extension)
3520 {
3521 Section_Read_Lengths [0] += global_context -> five_end_extension;
3522 Starting_Chro_Points_1BASE [0] -= global_context -> five_end_extension;
3523 }
3524 else
3525 {
3526 Section_Read_Lengths [0] += read_pos-1;
3527 Starting_Chro_Points_1BASE [0] -= read_pos-1;
3528 }
3529 }
3530 }
3531
3532 if(global_context -> three_end_extension) {
3533
3534 if(is_this_negative_strand){
3535 if( read_pos > global_context -> three_end_extension)
3536 {
3537 Section_Read_Lengths [0] += global_context -> three_end_extension;
3538 Starting_Chro_Points_1BASE [0] -= global_context -> three_end_extension;
3539 }
3540 else
3541 {
3542 Section_Read_Lengths [0] += read_pos - 1;
3543 Starting_Chro_Points_1BASE [0] -= read_pos - 1;
3544 }
3545 } else{
3546 int applied_ext = global_context -> three_end_extension;
3547 if( Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + applied_ext > global_context -> sambam_chro_table[me_refID].chro_length +1 )
3548 applied_ext = global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3549 Section_Read_Lengths [cigar_sections - 1] += applied_ext;
3550 }
3551
3552 }
3553
3554 if(global_context -> reduce_5_3_ends_to_one) {
3555 if((REDUCE_TO_5_PRIME_END == global_context -> reduce_5_3_ends_to_one) + is_this_negative_strand == 1) // reduce to 5' end (small coordinate if positive strand / large coordinate if negative strand)
3556 {
3557 Section_Read_Lengths[0]=1;
3558 }
3559 else
3560 {
3561 Starting_Chro_Points_1BASE[0] = Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] - 1;
3562 Section_Read_Lengths[0]=1;
3563 }
3564 cigar_sections = 1;
3565 }
3566
3567 for(cigar_section_id = 0; cigar_section_id<cigar_sections; cigar_section_id++)
3568 {
3569
3570 if(!ChroNames[ cigar_section_id ]) continue; // NULL chro name for https://groups.google.com/forum/#!topic/subread/QDT6npjAZuE
3571 srInt_64 section_begin_pos = Starting_Chro_Points_1BASE[cigar_section_id];
3572 srInt_64 section_end_pos = Section_Read_Lengths[cigar_section_id] + section_begin_pos - 1;
3573
3574
3575 int start_reverse_table_index = section_begin_pos / REVERSE_TABLE_BUCKET_LENGTH;
3576 int end_reverse_table_index = (1+section_end_pos) / REVERSE_TABLE_BUCKET_LENGTH;
3577
3578 /*if(ChroNames[cigar_section_id] < (char *)NULL + 0xfffff){
3579 unsigned char * tbbin = is_second_read?bin2:bin1;
3580 int * refid = (int*)(tbbin);
3581
3582 SUBREADprintf("DANGEROUS! RNAME=%s, REC_LEN=%d, CNAME=[%d]%p, LEN_P=%d, SECID=%d\n", read_name, refid[0], refid[1], ChroNames[cigar_section_id], Section_Read_Lengths[cigar_section_id], cigar_section_id);
3583 }*/
3584
3585 fc_chromosome_index_info * this_chro_info = HashTableGet(global_context -> exontable_chro_table, ChroNames[cigar_section_id]);
3586 if(this_chro_info == NULL)
3587 {
3588 if(global_context -> BAM_chros_to_anno_table)
3589 {
3590 char * anno_chro_name = HashTableGet( global_context -> BAM_chros_to_anno_table , ChroNames[cigar_section_id]);
3591 if(anno_chro_name)
3592 this_chro_info = HashTableGet(global_context -> exontable_chro_table, anno_chro_name);
3593 }
3594 if(this_chro_info == NULL && memcmp(ChroNames[cigar_section_id], "chr", 3)==0)
3595 {
3596 this_chro_info = HashTableGet(global_context -> exontable_chro_table, ChroNames[cigar_section_id]+3);
3597 // SUBREADprintf("INQ: %p : '%s'\n", this_chro_info , ChroNames[cigar_section_id]+3);
3598 }
3599 if(this_chro_info == NULL && strlen(ChroNames[cigar_section_id])<=2)
3600 {
3601 strcpy(thread_context -> chro_name_buff, "chr");
3602 strcpy(thread_context -> chro_name_buff+3, ChroNames[cigar_section_id]);
3603 this_chro_info = HashTableGet(global_context -> exontable_chro_table, thread_context -> chro_name_buff);
3604 }
3605 }
3606
3607 //SUBREADprintf("INF: %p : %s\n", this_chro_info , ChroNames[cigar_section_id]);
3608
3609 if(this_chro_info)
3610 {
3611 start_reverse_table_index = min(start_reverse_table_index, this_chro_info-> chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH);
3612 end_reverse_table_index = min(end_reverse_table_index, this_chro_info-> chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH+ 1);
3613
3614 while(start_reverse_table_index<=end_reverse_table_index)
3615 {
3616 search_start = this_chro_info -> reverse_table_start_index [start_reverse_table_index];
3617 if(search_start<0xffffff00)break;
3618 start_reverse_table_index++;
3619 }
3620 if(search_start>0xffffff00) continue;
3621
3622 //search_start = this_chro_info -> chro_block_table_start;
3623
3624 search_end = this_chro_info -> chro_block_table_end;//reverse_table_end_index [end_reverse_table_index];
3625
3626 for(search_block_id=search_start;search_block_id<search_end;search_block_id++){
3627 if (global_context -> exontable_block_min_start[search_block_id] > section_end_pos) break;
3628 if (global_context -> exontable_block_max_end[search_block_id] < section_begin_pos) continue;
3629
3630 int search_item_start = 0, search_item_end = global_context -> exontable_block_end_index[search_block_id];
3631 if(search_block_id>0)search_item_start = global_context -> exontable_block_end_index[search_block_id-1];
3632
3633 // search_item_id is the inner number of the exons.
3634 // the exontables in global_index has search_item_id as the index.
3635
3636 for(search_item_id = search_item_start ; search_item_id < search_item_end; search_item_id++)
3637 {
3638 if (global_context -> exontable_stop[search_item_id] >= section_begin_pos)
3639 {
3640 if (global_context -> exontable_start[search_item_id] > section_end_pos) break;
3641 // there is an overlap >=1 between read and feature.
3642 // the overlap length is min(end_r, end_F) - max(start_r, start_F) + 1
3643
3644 int is_strand_ok =1;
3645
3646 if(global_context->is_strand_checked){
3647 if(global_context->is_strand_checked == 1)
3648 is_strand_ok = (is_fragment_negative_strand == global_context -> exontable_strand[search_item_id]);
3649 else// if(global_context->is_strand_checked == 2)
3650 is_strand_ok = (is_fragment_negative_strand != global_context -> exontable_strand[search_item_id]);
3651 //SUBREADprintf("%d = %d == %d\n", is_strand_ok, is_fragment_negative_strand, global_context -> exontable_strand[search_item_id]);
3652 }
3653
3654 if(is_strand_ok){
3655
3656 if(nhits >= thread_context -> hits_number_capacity - 1){
3657 //SUBREADprintf("RESIZE hits: %d\n", thread_context -> hits_number_capacity);
3658 thread_context -> hits_number_capacity = thread_context -> hits_number_capacity/2 * 3;
3659 thread_context -> hits_number_capacity = max(10, thread_context -> hits_number_capacity);
3660 thread_context -> hits_start_pos1 = realloc(thread_context -> hits_start_pos1 , sizeof(int) * thread_context -> hits_number_capacity);
3661 thread_context -> hits_start_pos2 = realloc(thread_context -> hits_start_pos2 , sizeof(int) * thread_context -> hits_number_capacity);
3662
3663 thread_context -> hits_length1 = realloc(thread_context -> hits_length1, sizeof(short) * thread_context -> hits_number_capacity);
3664 thread_context -> hits_length2 = realloc(thread_context -> hits_length2, sizeof(short) * thread_context -> hits_number_capacity);
3665
3666 thread_context -> hits_chro1 = realloc(thread_context -> hits_chro1, sizeof(char *) * thread_context -> hits_number_capacity);
3667 thread_context -> hits_chro2 = realloc(thread_context -> hits_chro2, sizeof(char *) * thread_context -> hits_number_capacity);
3668
3669 thread_context -> hits_indices1 = realloc(thread_context -> hits_indices1, sizeof(srInt_64) * thread_context -> hits_number_capacity);
3670 thread_context -> hits_indices2 = realloc(thread_context -> hits_indices2, sizeof(srInt_64) * thread_context -> hits_number_capacity);
3671
3672 thread_context -> scoring_buff_numbers = realloc(thread_context -> scoring_buff_numbers, sizeof(int)*2*thread_context -> hits_number_capacity);
3673 thread_context -> scoring_buff_flags = realloc(thread_context -> scoring_buff_flags, sizeof(int)*2*thread_context -> hits_number_capacity);
3674 thread_context -> scoring_buff_overlappings = realloc(thread_context -> scoring_buff_overlappings, sizeof(int)*2*thread_context -> hits_number_capacity);
3675 thread_context -> scoring_buff_exon_ids = realloc(thread_context -> scoring_buff_exon_ids, sizeof(srInt_64)*2*thread_context -> hits_number_capacity);
3676
3677 if(global_context -> need_calculate_overlap_len){
3678 thread_context -> scoring_buff_gap_chros = realloc(thread_context -> scoring_buff_gap_chros, sizeof(char *) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3679 thread_context -> scoring_buff_gap_starts = realloc(thread_context -> scoring_buff_gap_starts, sizeof(int) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3680 thread_context -> scoring_buff_gap_lengths = realloc(thread_context -> scoring_buff_gap_lengths, sizeof(short) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3681 }
3682
3683 hits_indices = is_second_read?thread_context -> hits_indices2:thread_context -> hits_indices1;
3684 hits_start_pos = is_second_read?thread_context -> hits_start_pos2:thread_context -> hits_start_pos1;
3685 hits_length = is_second_read?thread_context -> hits_length2:thread_context -> hits_length1;
3686 hits_chro = is_second_read?thread_context -> hits_chro2:thread_context -> hits_chro1;
3687 //SUBREADprintf("RESIZE hits2: %d\n", thread_context -> hits_number_capacity);
3688 }
3689
3690 if(nhits <= MAX_HIT_NUMBER - 1) {
3691 hits_indices[nhits] = search_item_id;
3692
3693 if(global_context -> need_calculate_overlap_len) {
3694 hits_start_pos[nhits] = max(Starting_Chro_Points_1BASE[cigar_section_id], global_context -> exontable_start[search_item_id]);
3695 hits_length[nhits] = min(global_context -> exontable_stop[search_item_id] , section_end_pos)+1 - hits_start_pos[nhits] ;
3696 hits_chro[nhits] = ChroNames[cigar_section_id];
3697 if(0 && FIXLENstrcmp("V0112_0155:7:1101:10214:3701", read_name)==0)
3698 SUBREADprintf("QNAME: [%d] %s %d ~ %d\n", nhits, hits_chro[nhits], hits_start_pos[nhits], hits_start_pos[nhits]+hits_length[nhits]);
3699 }
3700
3701 nhits++;
3702 } else {
3703 SUBREADprintf("ERROR: the read overlapped with more than %d features.\n", nhits);
3704 global_context -> is_input_bad_format = 1;
3705 return ;
3706 }
3707 }
3708 }
3709 }
3710 }
3711 }
3712 }
3713 }
3714
3715
3716 if(is_second_read) nhits2 = nhits;
3717 else nhits1 = nhits;
3718 } // loop for is_second_read
3719
3720
3721 if(global_context -> do_junction_counting)// junction reads that passed the basic filters will be considered with the junction counting. Filters: Unmapped, Singleton, MAPQ, TemplateLength, Chimeric, Duplicate, Multimapping, Secondary alignment, Junction-containing status,
3722 process_line_junctions(global_context, thread_context, bin1, bin2);
3723
3724 if(global_context -> need_calculate_fragment_len )
3725 total_frag_len = calc_total_frag_len( global_context, thread_context, CIGAR_intervals_R1, CIGAR_intervals_R1_sections, CIGAR_intervals_R2, CIGAR_intervals_R2_sections , read_name);
3726
3727 //SUBREADprintf("FRAGLEN: %s %d; CIGARS=%d,%d\n", read_name, total_frag_len, CIGAR_intervals_R1_sections,CIGAR_intervals_R2_sections);
3728
3729 int fixed_fractional_count = ( global_context -> use_fraction_multi_mapping && ! global_context -> is_primary_alignment_only )?calc_fixed_fraction(maximum_NH_value): NH_FRACTION_INT;
3730
3731 // we have hits_indices1 and hits_indices2 and nhits1 and nhits2 here
3732 // we also have fixed_fractional_count which is the value to add
3733
3734 vote_and_add_count(global_context, thread_context,
3735 thread_context -> hits_indices1, nhits1, thread_context -> hits_indices2, nhits2, total_frag_len,
3736 thread_context -> hits_chro1, thread_context -> hits_chro2,
3737 thread_context -> hits_start_pos1, thread_context -> hits_start_pos2,
3738 thread_context -> hits_length1, thread_context ->hits_length2,
3739 fixed_fractional_count, read_name, RG_ptr, bin1, bin2);
3740 return;
3741 }
3742
/*
 * Marks the bit positions [start_base, start_base+len) as covered in x1_bitmap.
 * Position p is stored in bit (p % 8) of byte (p / 8).
 *
 * When the cursor sits on a byte boundary and more than 16 positions are still
 * to be marked, two whole bytes are filled at once; otherwise one bit is set.
 */
void add_bitmap_overlapping(char * x1_bitmap, short start_base, short len){
	const int end_pos = start_base + len;		// first position NOT to be marked
	const int two_byte_limit = end_pos - 16;	// the two-byte shortcut is only taken below this position
	int pos = start_base;

	while(pos < end_pos){
		const int bit_in_byte = pos % 8;
		const int byte_no = pos / 8;

		if(0 == bit_in_byte && pos < two_byte_limit){
			// byte-aligned with 16+ positions left: fill two bytes in one go
			x1_bitmap[byte_no] = -1;
			x1_bitmap[byte_no + 1] = -1;
			pos += 16;
		}else{
			x1_bitmap[byte_no] |= (1 << bit_in_byte);
			pos ++;
		}
	}
}
3758
/*
 * Counts how many of the first `rl` bit positions of x1_bitmap are set.
 * Position p lives in bit (p % 8) of byte (p / 8).
 *
 * Returns a value in [0, rl].
 *
 * Two corrections over the previous version:
 *  - the whole-byte shortcut is only taken when all 8 positions of the byte are
 *    below `rl`; before, a full last byte added 8 to the result even when `rl`
 *    ended in the middle of that byte, so the result could exceed `rl`;
 *  - bytes are inspected as unsigned char, so the "all bits set" test no longer
 *    depends on whether plain `char` is signed on the target platform.
 */
int count_bitmap_overlapping(char * x1_bitmap, unsigned short rl){
	const unsigned char * bits = (const unsigned char *) x1_bitmap;
	int x1 = 0;
	int ret = 0;

	while(x1 < rl){
		int byte = x1 / 8;
		int bit = x1 % 8;

		if(bit == 0 && x1 + 8 <= rl && bits[byte] == 0xFF){
			// whole byte is inside the range and fully set
			ret += 8;
			x1 += 8;
		}else{
			if(bits[byte] & (1 << bit)) ret ++;
			x1 ++;
		}
	}
	return ret;
}
3774
/*
 * Adds the junctions supported by one fragment to the junction counters.
 *
 * supported_junctions1[0..njunc1) and supported_junctions2[0..njunc2) are
 * treated as one combined list. A junction whose chromosome_name_left starts
 * with a NUL byte is ignored. Later entries that repeat an earlier junction
 * (same two chromosome names, same two boundary positions) are blanked in place
 * by zeroing chromosome_name_left[0], so each distinct junction is counted once.
 *
 * For every remaining junction three counters are increased by one:
 *   - junction table, key "chrL\tposL\tchrR\tposR"
 *   - splicing-point table, key "chrL\tposL"
 *   - splicing-point table, key "chrR\tposR"
 * Counters are stored in the hash tables as integers encoded in the value pointer.
 *
 * If RG_name is given, the tables at slots 2 and 3 of get_RG_tables() are used;
 * otherwise the tables owned by thread_context are used.
 *
 * The key strings are malloc'ed here and handed to HashTablePut.
 * NOTE(review): who frees a key that already exists in the table is decided by
 * the hash table configuration, which is not visible in this block.
 */
void add_fragment_supported_junction(	fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, fc_junction_info_t * supported_junctions1, int njunc1, fc_junction_info_t * supported_junctions2, int njunc2, char * RG_name){
	assert(njunc1 >= 0 && njunc1 <= global_context -> max_M -1 );
	assert(njunc2 >= 0 && njunc2 <= global_context -> max_M -1 );

	HashTable * pair_counts, * site_counts;
	if(NULL == RG_name){
		pair_counts = thread_context -> junction_counting_table;
		site_counts = thread_context -> splicing_point_table;
	}else{
		void ** rg_tables = get_RG_tables(global_context, thread_context, RG_name);
		pair_counts = rg_tables[2];
		site_counts = rg_tables[3];
	}

	const int n_all = njunc1 + njunc2;
	int cur;
	for(cur = 0; cur < n_all; cur ++){
		fc_junction_info_t * junc = (cur < njunc1)?(supported_junctions1+cur):supported_junctions2+(cur-njunc1);
		if(0 == junc->chromosome_name_left[0]) continue;	// blanked as a duplicate

		// blank every later copy of this junction
		int later;
		for(later = cur+1; later < n_all ; later ++){
			fc_junction_info_t * other = (later < njunc1)?(supported_junctions1+later):supported_junctions2+(later-njunc1);
			if(0 == other->chromosome_name_left[0]) continue;
			if(junc -> last_exon_base_left != other -> last_exon_base_left) continue;
			if(junc -> first_exon_base_right != other -> first_exon_base_right) continue;
			if(strcmp(junc -> chromosome_name_left, other -> chromosome_name_left) != 0) continue;
			if(strcmp(junc -> chromosome_name_right, other -> chromosome_name_right) != 0) continue;
			other -> chromosome_name_left[0]=0;
		}

		const size_t left_name_len = strlen(junc->chromosome_name_left);
		const size_t right_name_len = strlen(junc->chromosome_name_right);

		// whole junction: both ends in one key
		char * pair_key = malloc(left_name_len + right_name_len + 36);
		sprintf(pair_key, "%s\t%u\t%s\t%u", junc->chromosome_name_left, junc -> last_exon_base_left, junc->chromosome_name_right, junc -> first_exon_base_right);
		srInt_64 seen = HashTableGet(pair_counts, pair_key) - NULL;
		HashTablePut(pair_counts, pair_key, NULL+seen + 1);

		// each end on its own
		char * site_keys[2];
		site_keys[0] = malloc(left_name_len + 16);
		site_keys[1] = malloc(right_name_len + 16);
		sprintf(site_keys[0], "%s\t%u", junc->chromosome_name_left, junc -> last_exon_base_left);
		sprintf(site_keys[1], "%s\t%u", junc->chromosome_name_right, junc -> first_exon_base_right);

		int side;
		for(side = 0; side < 2; side ++){
			seen = HashTableGet(site_counts, site_keys[side]) - NULL;
			HashTablePut(site_counts, site_keys[side], NULL + seen + 1);
		}
	}
}
3828
/*
 * Comparison callback for an array of unsigned int pairs: orders item L against
 * item R by the first number of each pair (arr[2*L] versus arr[2*R]).
 *
 * Returns -1, 0 or 1. The previous version returned the unsigned difference
 * converted to int, which has the wrong sign whenever the two values are
 * 2^31 or more apart.
 */
int overlap_compare(void * arr, int L, int R){
	const unsigned int * pos = (const unsigned int *)arr;
	const unsigned int left_key = pos[ L*2 ];
	const unsigned int right_key = pos[ R*2 ];
	return (left_key > right_key) - (left_key < right_key);
}
3833
/*
 * Swap callback for an array of unsigned int pairs: exchanges pair L
 * (arr[2*L], arr[2*L+1]) with pair R (arr[2*R], arr[2*R+1]).
 */
void overlap_exchange(void * arr, int L, int R){
	unsigned int * pair_l = (unsigned int *)arr + L*2;
	unsigned int * pair_r = (unsigned int *)arr + R*2;
	int k;

	for(k = 0; k < 2; k++){
		unsigned int kept = pair_l[k];
		pair_l[k] = pair_r[k];
		pair_r[k] = kept;
	}
}
3844
/*
 * Looks up the sample number for a sample barcode seen on a given lane.
 *
 * Every element of global_context->scRNA_sample_barcode_list is an array of
 * three pointers: [0] lane number and [1] sample number (both integers encoded
 * as pointer values), [2] the known barcode string.
 *
 * Returns the sample number of the first entry on lane `read_laneno` for which
 * hamming_dist_ATGC_max2(sbc, known barcode) is at most 2, or -1 if there is none.
 */
int scRNA_get_sample_id(fc_thread_global_context_t *global_context, char * sbc, int read_laneno){
	ArrayList * known_entries = global_context -> scRNA_sample_barcode_list;
	int entry_no;

	for(entry_no = 0; entry_no < known_entries -> numOfElements; entry_no ++){
		char ** entry = ArrayListGet(known_entries, entry_no);

		int entry_lane = entry[0]-(char*)NULL;
		if(entry_lane != read_laneno) continue;		// barcode belongs to another lane

		int entry_sample = entry[1]-(char*)NULL;
		char * entry_barcode = entry[2];
		if(hamming_dist_ATGC_max2( sbc, entry_barcode ) <= 2) return entry_sample;
	}
	return -1;
}
3863
/*
 * Returns the 0-based number of a UMI in thread_context->scRNA_registered_UMI_table,
 * registering it first if it has not been seen by this table yet.
 *
 * The UMI is the leading run of alphabetic characters of `ubc`. `ubc` is cut
 * at the end of that run for the duration of the lookup and restored before
 * returning, so the caller's buffer is unchanged on exit (but must be writable).
 *
 * The table stores (number + 1) encoded in the value pointer, so that a missing
 * key (NULL) decodes to -1. A newly registered UMI gets the current element
 * count of the table as its number, and the table receives a strdup'ed key.
 *
 * Asserts that a newly registered UMI is at most MAX_UMI_LEN characters long.
 */
int scRNA_register_umi_id(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * ubc){

	int xk1 = 0, nch;
	// isalpha() is only defined for unsigned-char values (or EOF); plain char may be negative.
	while(0 != (nch = ubc [xk1]) && isalpha((unsigned char)nch)) xk1++;

	ubc[xk1]=0;	// temporarily terminate the string right after the UMI
	int uno = HashTableGet(thread_context -> scRNA_registered_UMI_table, ubc ) -NULL -1;
	if(uno<0) {
		uno = thread_context -> scRNA_registered_UMI_table -> numOfElements;
		assert(strlen(ubc) <=MAX_UMI_LEN);
		HashTablePut( thread_context -> scRNA_registered_UMI_table, strdup(ubc) , NULL+ uno +1);
	}

	ubc[xk1]=nch;	// put back the character that was overwritten
	return uno;
}
3881
3882 #define IMPOSSIBLE_MEMORY_SPACE 0x5CAFEBABE0000000llu
scRNA_get_cell_id(fc_thread_global_context_t * global_context,fc_thread_thread_context_t * thread_context,char * cbc)3883 int scRNA_get_cell_id(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * cbc){
3884 //return -1;
3885 char tmpc [MAX_READ_NAME_LEN];
3886 int xx1;
3887 ArrayList * ret=NULL;
3888
3889 for(xx1=0;xx1<3;xx1++){
3890 int xx2;
3891 if(xx1==1) ret = ArrayListCreate(100);
3892
3893 if(xx1>0){
3894 tmpc[0] = (xx1==2)?'S':'F';
3895 for(xx2=0; xx2<global_context -> known_cell_barcode_length/2 ; xx2++)
3896 tmpc[1+xx2] = cbc[2*xx2+xx1-1];
3897 tmpc[1+global_context -> known_cell_barcode_length/2]=0;
3898 }else{
3899 memcpy(tmpc, cbc, global_context -> known_cell_barcode_length);
3900 tmpc[global_context -> known_cell_barcode_length]=0;
3901 }
3902
3903 void *xrawarr = HashTableGet(global_context -> scRNA_cell_barcode_head_tail_table, tmpc);
3904
3905 if(xx1 == 0){
3906 //if(xrawarr) SUBREADprintf("CAFE ? %p\n", xrawarr);
3907 srInt_64 xint = xrawarr - NULL;
3908 if(( xint & 0xFFFFFFFFF0000000llu)== IMPOSSIBLE_MEMORY_SPACE){
3909 int only_cell_id = xint - IMPOSSIBLE_MEMORY_SPACE;
3910 // no memory was allocated.
3911 return only_cell_id;
3912 }
3913 }else{
3914 ArrayList * rawarr = xrawarr;
3915 if(rawarr){
3916 int xx3,xx2, found;
3917 for(xx2=0; xx2<rawarr->numOfElements; xx2++){
3918 int bcno = ArrayListGet(rawarr, xx2)-NULL;
3919 found=0;
3920 for(xx3=0;xx3<ret -> numOfElements;xx3++){
3921 if(ArrayListGet(ret, xx3)==NULL+bcno){
3922 found=1;
3923 break;
3924 }
3925 }
3926
3927 if(!found)ArrayListPush(ret, NULL+bcno);
3928 }
3929 }
3930 }
3931 }
3932
3933
3934 int tb1=-1;
3935 for(xx1=0; xx1<ret -> numOfElements; xx1++){
3936 int tbcn = ArrayListGet(ret,xx1)-NULL;
3937 char * known_cbc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, tbcn);
3938 int hc = hamming_dist_ATGC_max2( known_cbc, cbc );
3939
3940 // cbc[16]=0; if(hc <=3)SUBREADprintf("TEST_CBC %s ~ %s = %d\n", known_cbc, cbc, hc);
3941 if(hc==1){
3942 tb1 = tbcn;
3943 break;
3944 }
3945 }
3946 //SUBREADprintf("CANDIDATE CELL BARCODES=%ld ; hit = %d\n", ret->numOfElements, tb1);
3947 ArrayListDestroy(ret);
3948
3949 return tb1;
3950 }
3951
3952 #define SCRNA_READ_NAME_SPLIT_CHAR '|'
3953
scRNA_move_barcodes_to_tags(fc_thread_global_context_t * global_context,fc_thread_thread_context_t * thread_context,char * inbin,char ** outbin_pr,char * fixed_cell_barcode,char * fixed_UMI)3954 void scRNA_move_barcodes_to_tags(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * inbin, char ** outbin_pr, char * fixed_cell_barcode, char * fixed_UMI){
3955 int inbin_len=0;
3956 memcpy(&inbin_len, inbin, 4);
3957 char* outbin = malloc(inbin_len + 400);
3958 *outbin_pr = outbin;
3959
3960 int l_read_name=0, new_l_read_name=0, x1;
3961 memcpy(&l_read_name, inbin+12, 1);
3962
3963 char * BC_seq=NULL, * BC_qual=NULL, *UMI_seq=NULL, *UMI_qual=NULL, *RG=NULL;
3964 int BC_len=global_context -> known_cell_barcode_length, UMI_len=0, RG_len=0, field_i = 0;
3965
3966 for(x1=1; x1<l_read_name-1; x1++){
3967 char rnchar = inbin [ 36 + x1];
3968 if(rnchar == SCRNA_READ_NAME_SPLIT_CHAR || (rnchar== ':' && global_context -> scRNA_input_mode == GENE_INPUT_BCL)){
3969 field_i ++;
3970 if(field_i == 1){
3971 new_l_read_name = x1+1;
3972 BC_seq = inbin+36 + x1+1;
3973 UMI_seq = BC_seq + BC_len;
3974 }else if(field_i == 2){
3975 BC_qual = inbin+36 + x1+1;
3976 UMI_len = BC_qual - BC_seq - 1 - global_context -> known_cell_barcode_length;
3977 UMI_qual = BC_qual + BC_len;
3978 }else if(field_i == 5){
3979 RG = inbin+36 + x1+1;
3980 RG_len = l_read_name - x1 - 2;
3981 }
3982 }
3983 }
3984
3985 memcpy(outbin, inbin, 36+new_l_read_name);
3986 outbin[36+new_l_read_name-1]=0;
3987 memcpy(outbin + 12,&new_l_read_name, 1);
3988 memcpy(outbin + 36 + new_l_read_name, inbin + 36 + l_read_name, inbin_len +4 - 36 - l_read_name);
3989
3990 int ext_ptr = inbin_len +4 - (l_read_name - new_l_read_name);
3991 for(x1 = 0;x1<7;x1++){
3992 int this_len = BC_len;
3993 char * this_tag = "CR", * this_val = BC_seq;
3994
3995 if(x1 == 1){this_tag = "CY"; this_val = BC_qual;}
3996 if(x1 == 2){this_tag = "CB"; this_val = fixed_cell_barcode;}
3997 if(x1 == 3){this_tag = "UR"; this_val = UMI_seq;}
3998 if(x1 == 4){this_tag = "UY"; this_val = UMI_qual;}
3999 if(x1 == 5){this_tag = "UB"; this_val = fixed_UMI;}
4000 if(x1 == 6){this_tag = "RG"; this_val = RG;}
4001
4002 if(x1 == 3 || x1 == 4 || x1 == 5) this_len = UMI_len;
4003 if(x1 == 6) this_len = RG_len;
4004
4005 outbin[ext_ptr]= this_tag[0];
4006 outbin[ext_ptr+1]= this_tag[1];
4007 outbin[ext_ptr+2]= 'Z';
4008 memcpy(outbin+ext_ptr+3, this_val, this_len);
4009 outbin[ext_ptr+3+this_len]= 0;
4010 ext_ptr += 3+1+this_len;
4011 }
4012
4013 ext_ptr -=4; // block_size excl itself
4014 memcpy(outbin, &ext_ptr, 4);
4015 }
4016
/*
 * Locates the barcode fields that are embedded in a read name.
 *
 * The name is scanned from its second character and split at
 * SCRNA_READ_NAME_SPLIT_CHAR (also at ':' when the input mode is GENE_INPUT_BCL):
 *   separator 1: *rname_trimmed_len = length of the name in front of it;
 *                *BC_seq = text after it; *UMI_seq = *BC_seq + known_cell_barcode_length
 *   separator 2: *BC_qual / *UMI_qual, same layout
 *   separator 3: *sample_seq, and *RG is set to the same location
 *   separator 4: *sample_qual
 *   separator 5: *lane_str (a leading "@RgLater@" is skipped); scanning stops here
 * All stored pointers point into the name buffer; nothing is copied.
 *
 * If read_name is NULL and read_bin is given, the name is taken from read_bin + 36.
 * If fewer than three separators were found and read_bin is given, the CR, UR,
 * CY, UY and RG tags of the binary record are looked up for the corresponding
 * non-NULL output parameters.
 *
 * Every output parameter may be NULL. Outputs for fields that are not found are
 * left untouched, so callers should initialise them.
 *
 * Returns the number of separators found (at most 5).
 *
 * Changes over the previous version: sample_seq and lane_str are now optional
 * like all the other outputs (they used to be written unconditionally); a NULL
 * or empty read name is no longer scanned; the "@RgLater@" prefix test uses
 * strncmp so that it cannot look beyond the end of a shorter lane string.
 */
int scRNA_scan_read_name_str(fc_thread_global_context_t * global_context, char * read_name, char * read_bin, char ** sample_seq, char ** sample_qual, char ** BC_seq, char ** BC_qual, char ** UMI_seq, char ** UMI_qual, char ** lane_str, char ** RG, int * rname_trimmed_len){
	char * testi;
	int field_i=0;
	if(NULL == read_name && read_bin) read_name = read_bin + 36;

	if(read_name && read_name[0]){
		for(testi = read_name +1; * testi; testi ++){
			if((*testi)== SCRNA_READ_NAME_SPLIT_CHAR || ((*testi)== ':' && global_context -> scRNA_input_mode == GENE_INPUT_BCL )){
				field_i++;
				if(field_i == 1) {
					if(rname_trimmed_len) (*rname_trimmed_len)=testi-read_name;
					if(BC_seq)(*BC_seq) = testi+1;
					if(UMI_seq)(*UMI_seq) = testi+1+global_context -> known_cell_barcode_length;
				}else if(field_i == 2){
					if(BC_qual)(*BC_qual) = testi+1;
					if(UMI_qual)(*UMI_qual) = testi+1+global_context -> known_cell_barcode_length;
				}else if(field_i == 3){
					if(sample_seq)(*sample_seq) = testi + 1;
					if(RG)(*RG) = testi + 1;
				}else if(field_i == 4){
					if(sample_qual)(*sample_qual) = testi + 1;
				}else if(field_i == 5){
					char * lane_start = testi + 1;
					if(strncmp(lane_start, "@RgLater@", 9)==0) lane_start += 9;
					if(lane_str)(*lane_str) = lane_start;
					break;
				}
			}
		}
	}

	if(field_i < 3 && read_bin){
		// the name does not carry the barcodes: fall back to the tags of the record
		int bin_len = 0;
		char tag_type = 0;
		int bintag_start = SAM_pairer_get_tag_bin_start(read_bin);
		memcpy(&bin_len, read_bin, 4);
		bin_len = bin_len +4 -bintag_start;

		if(BC_seq) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "CR", &tag_type, BC_seq);
		if(UMI_seq) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "UR", &tag_type, UMI_seq);

		if(BC_qual) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "CY", &tag_type, BC_qual);
		if(UMI_qual) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "UY", &tag_type, UMI_qual);

		if(RG) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "RG", &tag_type, RG);
	}

	return field_i;
}
4062
/*
 * Extracts the cell barcode, UMI and (optionally) read group of a read through
 * scRNA_scan_read_name_str(), and works out which sample the read belongs to.
 *
 * sample_id may be NULL; then only *BC_seq, *UMI_seq and *RG are filled in.
 * Otherwise *sample_id is first set to -1 and then, depending on
 * global_context->scRNA_input_mode:
 *   GENE_INPUT_SCRNA_BAM   : always 1.
 *   GENE_INPUT_SCRNA_FASTQ : the sample field must look like "input#<digits>";
 *                            (<digits> + 1) is translated through
 *                            scRNA_lineno1B_to_sampleno1B_tab.
 *   any other mode         : the lane field must look like "L<digits>"; the
 *                            sample is looked up from the sample barcode and the
 *                            lane number by scRNA_get_sample_id().
 * When the required field is missing, an error line is printed and *sample_id
 * stays -1.
 *
 * Changes over the previous version:
 *  - the "Cannot get UMI or BC" message no longer dereferences the very
 *    pointers that were just found to be NULL;
 *  - a read without a lane field is reported and skipped; before, the lane
 *    string was walked although it had never been set;
 *  - NULL strings are no longer passed to "%s", and the name printed in
 *    SPBCFMT_ERR falls back to the name stored in read_bin when read_name is NULL;
 *  - isdigit() is called with unsigned-char values only.
 */
void scRNA_find_sample_cell_umi_from_readname(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * read_name, char * read_bin,
	int * sample_id, char ** BC_seq, char ** UMI_seq, char ** RG){
	int field_i = 0, laneno = 0;
	char * testi, * lane_str = NULL, *sample_barcode = NULL;

	if(sample_id)*sample_id = -1;

	field_i = scRNA_scan_read_name_str(global_context, read_name, read_bin, &sample_barcode, NULL, BC_seq, NULL, UMI_seq, NULL, &lane_str, RG, NULL);

	if(!sample_id){
		if(!(UMI_seq && BC_seq)){
			const char * umi_shown = (UMI_seq && *UMI_seq)? *UMI_seq : "(null)";
			const char * bc_shown = (BC_seq && *BC_seq)? *BC_seq : "(null)";
			SUBREADprintf("ERROR: Cannot get UMI or BC: %s, %s\n", umi_shown, bc_shown);
		}
		return;
	}

	if(global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_BAM){
		*sample_id = 1; // on the BAM mode, every featureCounts run only has one sample
	}else if(global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ){
		// strncmp stops at the end of a short string, so index 6 is only read when "input#" matched
		if(sample_barcode == NULL || strncmp(sample_barcode, "input#", 6) || !isdigit((unsigned char)sample_barcode[6])){
			const char * name_shown = read_name ? read_name : (read_bin ? read_bin + 36 : "(null)");
			size_t tail_offset = 13 + global_context -> known_cell_barcode_length;
			const char * tail_shown = (strlen(name_shown) > tail_offset) ? name_shown + tail_offset : "";
			SUBREADprintf("SPBCFMT_ERR %d // %s in %s // %s\n", field_i, sample_barcode ? sample_barcode : "(null)", name_shown, tail_shown);
		}else{
			int lineno = atoi(sample_barcode +6) +1;
			*sample_id = (HashTableGet(global_context -> scRNA_lineno1B_to_sampleno1B_tab, NULL+lineno)-NULL);
		}
	}else{
		if(field_i !=5 || lane_str == NULL){
			// no lane field at all: nothing to parse, *sample_id stays -1
			SUBREADprintf("LANESTR_ERR %d , %s\n", field_i, lane_str ? lane_str : "(null)");
			return;
		}
		if((*lane_str)!='L')
			SUBREADprintf("LANESTR_ERR %d , %s\n", field_i, lane_str);

		// skip the leading letter (if the string is not empty), then read the digits
		for(testi = lane_str + ((*lane_str)?1:0); *testi; testi++){
			if(!isdigit((unsigned char)*testi))break;
			laneno = laneno*10 + (*testi)-'0';
		}

		*sample_id = scRNA_get_sample_id(global_context, sample_barcode, laneno);
	}
}
4099
/*
 * Per-read bookkeeping for scRNA samples.
 *
 * The sample of the read is global_context->this_input_number + 1 when
 * scRNA_rerun_on_persample_BAM is set; otherwise it is derived from the read
 * name / record by scRNA_find_sample_cell_umi_from_readname().
 *
 * If global_context->scRNA_UMI_length has not been set yet (< 1), it is set to
 * the length of the leading alphabetic run of this read's UMI.
 * NOTE(review): this writes a field of the shared global context without a
 * lock from what looks like per-thread code - confirm that this is intended.
 *
 * For a read with a valid sample (sample_id > 0):
 *   step == 0: the per-sample read counter of this thread is increased. If
 *              per-sample output is enabled and the input mode is not
 *              GENE_INPUT_SCRNA_FASTQ, the read is appended to the three gzip
 *              writers of the sample; when one of this thread's writer buffers
 *              is nearly full, all three are compressed and then flushed
 *              while holding the sample's spin lock.
 *   step == 1: the per-sample mapped-read counter of this thread is increased.
 *
 * Changes over the previous version: the UMI is not inspected when it could
 * not be extracted (NULL); after reporting an unknown sample id the function
 * returns instead of going on to use the missing writer table; isalpha() is
 * called with unsigned-char values only.
 */
void add_scRNA_read_tota1_no( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * read_name, char * bambin, int step){
	int sample_id= -1;
	char * cell_bc = NULL, * umi = NULL;
	int known_sample_id = 0;
	if(global_context -> scRNA_rerun_on_persample_BAM) known_sample_id = global_context -> this_input_number+1;

	scRNA_find_sample_cell_umi_from_readname(global_context, thread_context, read_name, bambin, (known_sample_id>0)?NULL:&sample_id, &cell_bc, &umi, NULL);
	if(known_sample_id>0) sample_id = known_sample_id;

	if(global_context -> scRNA_UMI_length <1 && umi){
		int umi_end_pos=0;
		while(umi [umi_end_pos] && isalpha((unsigned char)umi [umi_end_pos])) umi_end_pos++;
		global_context -> scRNA_UMI_length = umi_end_pos;
	}

	if(sample_id <= 0) return;

	if(step==1){
		thread_context -> scRNA_mapped_reads_per_sample[sample_id-1] ++;
		return;
	}
	if(step!=0) return;

	thread_context -> scRNA_reads_per_sample[sample_id-1] ++;
	if(!global_context -> is_scRNA_BAM_FQ_out_generated) return;

	void ** sample_bam_2fps = HashTableGet(global_context -> scRNA_sample_BAM_writers, NULL+(sample_id-1) + 1); // sample_id-1: 0,1,2,...
	if(sample_bam_2fps==NULL){
		SUBREADprintf("Error: unknown sample id = %d\n", sample_id);
		return;	// no writers for this sample: nothing can be written
	}

	if(GENE_INPUT_SCRNA_FASTQ == global_context -> scRNA_input_mode) return;

	// slots 1..3 of the per-sample table are the three gzip writers, slot 4 is their lock
	parallel_gzip_writer_t **gz3fps = (parallel_gzip_writer_t **)sample_bam_2fps+1;
	const int my_thread = thread_context -> thread_id;
	int writer_no, buffer_nearly_full = 0;

	parallel_gzip_writer_add_read_fqs_scRNA(gz3fps, bambin, my_thread);

	for(writer_no = 0; writer_no < 3 && !buffer_nearly_full; writer_no++)
		buffer_nearly_full = gz3fps[writer_no]-> thread_objs[my_thread].in_buffer_used >= PARALLEL_GZIP_TXT_BUFFER_SIZE - PARALLEL_GZIP_TXT_BUFFER_MARGIN;
	if(!buffer_nearly_full) return;

	// compression happens outside of the lock; only the flush is serialised
	for(writer_no = 0; writer_no < 3; writer_no++)
		parallel_gzip_zip_texts(gz3fps[writer_no], my_thread, 0);

	pthread_spin_lock(sample_bam_2fps[4]);
	for(writer_no = 0; writer_no < 3; writer_no++)
		parallel_gzip_writer_flush(gz3fps[writer_no], my_thread);
	pthread_spin_unlock(sample_bam_2fps[4]);
}
4142
scRNA_do_one_batch_write_extend_rbin(fc_thread_global_context_t * global_context,char * rbin,int binlen,FILE * fp,char * fixedbc_seq,char * fixedumi_seq,srInt_64 gene_no,srInt_64 * genes)4143 void scRNA_do_one_batch_write_extend_rbin(fc_thread_global_context_t * global_context, char * rbin, int binlen, FILE * fp, char * fixedbc_seq, char * fixedumi_seq, srInt_64 gene_no, srInt_64 * genes){
4144 char * cellbc_seq=NULL,*umi_seq=NULL, * cellbc_qual=NULL,*umi_qual=NULL, *sample_seq=NULL, *sample_qual=NULL, *lane_str=NULL;
4145 int rname_trimmed_len=0;
4146 scRNA_scan_read_name_str(global_context, NULL, rbin, & sample_seq, & sample_qual, & cellbc_seq, & cellbc_qual, & umi_seq, & umi_qual, &lane_str, NULL, &rname_trimmed_len);
4147 char new_rbin_stake[ binlen + 150 ]; // removed barcodes/qual from read names, add them to extra fields if they weren't there. Gene names are not put here.
4148 char * new_rbin = new_rbin_stake;
4149 int new_rbin_len = 0, n_cigar_op =0, l_read_name=0, l_seq=0;
4150
4151 memcpy(new_rbin, rbin, 36);
4152 new_rbin_len += 36;
4153
4154 memcpy(&n_cigar_op, rbin+16,2);
4155 memcpy(&l_seq, rbin+20,4);
4156 l_read_name=((unsigned char*)rbin)[12];
4157 new_rbin[12] = rname_trimmed_len+1;
4158 memcpy(new_rbin+new_rbin_len, rbin+36, rname_trimmed_len);
4159 new_rbin[36+rname_trimmed_len]=0;
4160 new_rbin_len+= rname_trimmed_len+1;
4161 memcpy(new_rbin+new_rbin_len, rbin +36 + l_read_name, 4*n_cigar_op + l_seq + (l_seq+1)/2);
4162 new_rbin_len += 4*n_cigar_op + l_seq + (l_seq+1)/2;
4163 char * ext_bin_ptr = rbin + 36 + l_read_name +4*n_cigar_op + l_seq + (l_seq+1)/2;
4164
4165 int CR_found=0, CB_found=0, CY_found=0, UR_found=0, UY_found=0, UB_found=0;
4166 while(ext_bin_ptr < rbin+binlen+4){
4167 char * tagstr = NULL;
4168 int taglen = 0;
4169 if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='R' && ext_bin_ptr[2]=='Z'){
4170 CR_found = 1;
4171 tagstr = cellbc_seq;
4172 taglen = global_context -> known_cell_barcode_length;
4173 }else if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='B' && ext_bin_ptr[2]=='Z'){
4174 CB_found = 1;
4175 tagstr = fixedbc_seq;
4176 taglen = global_context -> known_cell_barcode_length;
4177 }else if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='Y' && ext_bin_ptr[2]=='Z'){
4178 CY_found = 1;
4179 tagstr = cellbc_qual;
4180 taglen = global_context -> known_cell_barcode_length;
4181 }else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='R' && ext_bin_ptr[2]=='Z'){
4182 UR_found = 1;
4183 tagstr = umi_seq;
4184 taglen = global_context -> scRNA_UMI_length;
4185 }else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='B' && ext_bin_ptr[2]=='Z'){
4186 UB_found = 1;
4187 tagstr = fixedumi_seq;
4188 taglen = global_context -> scRNA_UMI_length;
4189 }else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='Y' && ext_bin_ptr[2]=='Z'){
4190 UY_found = 1;
4191 tagstr = umi_qual;
4192 taglen = global_context -> scRNA_UMI_length;
4193 }
4194
4195 if(tagstr){
4196 new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
4197 new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
4198 new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
4199 int taglenold = strlen(ext_bin_ptr);
4200 memcpy(new_rbin+new_rbin_len,tagstr, taglen);
4201 *(new_rbin+new_rbin_len+taglen)=0;
4202 ext_bin_ptr += taglenold+1;
4203 new_rbin_len += taglen+1;
4204 }else{
4205 int content_len = SAP_pairer_skip_tag_body_len(ext_bin_ptr);
4206 memcpy(new_rbin + new_rbin_len, ext_bin_ptr, content_len );
4207 new_rbin_len += content_len;
4208 ext_bin_ptr += content_len;
4209 }
4210 }
4211 if(!CR_found){
4212 new_rbin[new_rbin_len++]='C';new_rbin[new_rbin_len++]='R';new_rbin[new_rbin_len++]='Z';
4213 memcpy(new_rbin+new_rbin_len, cellbc_seq, global_context -> known_cell_barcode_length);
4214 *(new_rbin+new_rbin_len+global_context -> known_cell_barcode_length)=0;
4215 new_rbin_len += global_context -> known_cell_barcode_length+1;
4216 }
4217 if(fixedbc_seq && !CB_found){
4218 new_rbin[new_rbin_len++]='C';new_rbin[new_rbin_len++]='B';new_rbin[new_rbin_len++]='Z';
4219 memcpy(new_rbin+new_rbin_len, fixedbc_seq, global_context -> known_cell_barcode_length);
4220 *(new_rbin+new_rbin_len+global_context -> known_cell_barcode_length)=0;
4221 new_rbin_len += global_context -> known_cell_barcode_length+1;
4222 }
4223 if(!CY_found){
4224 new_rbin[new_rbin_len++]='C';new_rbin[new_rbin_len++]='Y';new_rbin[new_rbin_len++]='Z';
4225 memcpy(new_rbin+new_rbin_len, cellbc_qual, global_context -> known_cell_barcode_length);
4226 *(new_rbin+new_rbin_len+global_context -> known_cell_barcode_length)=0;
4227 new_rbin_len += global_context -> known_cell_barcode_length+1;
4228 }
4229
4230 if(!UR_found){
4231 new_rbin[new_rbin_len++]='U';new_rbin[new_rbin_len++]='R';new_rbin[new_rbin_len++]='Z';
4232 memcpy(new_rbin+new_rbin_len, umi_seq, global_context -> scRNA_UMI_length);
4233 *(new_rbin+new_rbin_len+global_context -> scRNA_UMI_length)=0;
4234 new_rbin_len += global_context -> scRNA_UMI_length+1;
4235 }
4236 if(fixedumi_seq && !UB_found){
4237 new_rbin[new_rbin_len++]='U';new_rbin[new_rbin_len++]='B';new_rbin[new_rbin_len++]='Z';
4238 memcpy(new_rbin+new_rbin_len, fixedumi_seq, global_context -> scRNA_UMI_length);
4239 *(new_rbin+new_rbin_len+global_context -> scRNA_UMI_length)=0;
4240 new_rbin_len += global_context -> scRNA_UMI_length+1;
4241 }
4242 if(!UY_found){
4243 new_rbin[new_rbin_len++]='U';new_rbin[new_rbin_len++]='Y';new_rbin[new_rbin_len++]='Z';
4244 memcpy(new_rbin+new_rbin_len, umi_qual, global_context -> scRNA_UMI_length);
4245 *(new_rbin+new_rbin_len+global_context -> scRNA_UMI_length)=0;
4246 new_rbin_len += global_context -> scRNA_UMI_length+1;
4247 }
4248
4249 new_rbin_len-=4;
4250 memcpy(new_rbin, &new_rbin_len,4);
4251 fwrite(new_rbin, 1, new_rbin_len+4, fp);
4252
4253 if(new_rbin!=new_rbin_stake)free(new_rbin);
4254 }
4255
4256 //int cttt = 0;
4257
// Appends one read to the temporary per-batch file ("bin") it belongs to.
//
// assign_target_number : index of the assigned gene or exon when >= 0. When it is
//                        negative the targets are taken from target_list instead
//                        (which may be NULL, meaning an empty list).
// read_name / read_bin : the read name and its binary record; the first four bytes of
//                        read_bin hold the length of the rest of the record, and bytes
//                        4..7 are read here as the chromosome number.
//
// Bin selection:
//   has cell barcode, assigned                      -> bin (cell_id % bin_no)
//   has cell barcode, not assigned, has a location  -> same hashed bin, empty gene list
//   no cell barcode, has a location                 -> bin number bin_no
//   no location                                     -> bin number bin_no+1; that bin only
//                                                      receives sample id + rewritten rbin.
// Reads without a valid (positive) sample id only update the counters and are not written.
void add_scRNA_read_to_pool( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, srInt_64 assign_target_number, char * read_name, char * read_bin, ArrayList * target_list ){
	char * cell_barcode = NULL, * umi_barcode = NULL;
	int sample_id = -1;
	int known_sample_id = global_context -> scRNA_rerun_on_persample_BAM ? global_context -> this_input_number+1 : 0;

	// When the sample is already known (re-run on a per-sample BAM) it is not looked up again.
	int * sample_id_out = (known_sample_id >0) ? NULL : &sample_id;
	scRNA_find_sample_cell_umi_from_readname(global_context, thread_context, read_name, read_bin, sample_id_out, &cell_barcode, &umi_barcode, NULL);
	if(known_sample_id >0) sample_id = known_sample_id;

	int cell_id = scRNA_get_cell_id(global_context, thread_context, cell_barcode);
	int has_sample = sample_id >0;

	thread_context -> scRNA_pooled_reads ++;
	if(has_sample) thread_context -> scRNA_has_valid_sample_index ++;
	if(cell_id >=0) thread_context -> scRNA_has_valid_cell_barcode ++;

	// One-off quality report, printed by thread 0 after its 20,000th pooled read.
	if(thread_context -> thread_id == 0 && thread_context -> scRNA_pooled_reads == 20000){
		print_in_box(80,0,0," scRNA quality control in first 20,000 reads:");
		print_in_box(80,0,0," %.1f pct reads have valid sample indices.", thread_context->scRNA_has_valid_sample_index*100./thread_context -> scRNA_pooled_reads);
		print_in_box(80,0,0," %.1f pct reads have valid cell barcodes.", thread_context->scRNA_has_valid_cell_barcode*100./thread_context -> scRNA_pooled_reads);
		print_in_box(80,0,0,"");
	}

	if(!has_sample) return;
	thread_context -> scRNA_assigned_reads_per_sample[sample_id-1] ++;

	// Choose the bin.
	int barcode_hashed_key;
	int has_target = (assign_target_number>=0 || target_list);
	if(cell_id >=0 && has_target){
		barcode_hashed_key = cell_id % global_context -> scRNA_barcode_batched_bin_no;
	}else{
		int chro_no = -1;
		memcpy(&chro_no, read_bin + 4, 4);
		if(chro_no < 0) barcode_hashed_key = global_context -> scRNA_barcode_batched_bin_no+1;
		else if(cell_id >= 0) barcode_hashed_key = cell_id % global_context -> scRNA_barcode_batched_bin_no;
		else barcode_hashed_key = global_context -> scRNA_barcode_batched_bin_no;
	}

	// Each bin file has its own spin lock; the whole record is written under it.
	pthread_spin_lock(global_context -> scRNA_barcode_batched_locks+barcode_hashed_key);
	FILE * bin_fp = global_context -> scRNA_barcode_batched_bins[barcode_hashed_key];
	fwrite(&sample_id,1,4,bin_fp);

	if(barcode_hashed_key<=global_context -> scRNA_barcode_batched_bin_no){
		// Header: cell id, gene word (single target, or bit 63 + count followed by the list), UMI.
		fwrite(&cell_id,1,4,bin_fp);
		if(assign_target_number>=0){
			fwrite(&assign_target_number,1,8,bin_fp);
		}else{
			srInt_64 gene_count_word = target_list?target_list -> numOfElements:0;
			gene_count_word = gene_count_word | (1llu << 63);
			fwrite(&gene_count_word,1,8,bin_fp);
			if(target_list){
				int gi;
				for(gi=0;gi<target_list -> numOfElements;gi++){
					srInt_64 geneno_0B = ArrayListGet(target_list, gi) -NULL;
					fwrite(&geneno_0B,1,8,bin_fp);
				}
			}
		}
		fwrite(umi_barcode,1,global_context -> scRNA_UMI_length,bin_fp);
	}

	int read_bin_len=0;
	memcpy(&read_bin_len , read_bin, 4);

	if(barcode_hashed_key==global_context -> scRNA_barcode_batched_bin_no+1){
		// The read is unmapped. It can still have a fixed cell barcode.
		char * new_cellbc = NULL;
		if(cell_id>=0)new_cellbc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cell_id);
		scRNA_do_one_batch_write_extend_rbin(global_context, read_bin, read_bin_len, bin_fp, new_cellbc, NULL, -1, NULL);
	}else{
		fwrite(read_bin, 1, read_bin_len+4, bin_fp);
	}
	pthread_spin_unlock(global_context -> scRNA_barcode_batched_locks+barcode_hashed_key);
}
4330
scRNA_do_one_batch_sort_compare(void * ar,int l,int r)4331 int scRNA_do_one_batch_sort_compare(void * ar, int l, int r){
4332 void ** arr = ar;
4333 void ** bin_ptrs = arr[0];
4334 fc_thread_global_context_t * global_context = arr[1];
4335
4336 char * Lptr = bin_ptrs[l];
4337 char * Rptr = bin_ptrs[r];
4338 srInt_64 Lgenes=0, Rgenes=0;
4339 memcpy(&Lgenes, Lptr+8, 8);
4340 memcpy(&Rgenes, Rptr+8, 8);
4341 if(Lgenes & (1LLU<<63))Lgenes=Lgenes & 0x7fffffffllu; else Lgenes=0;
4342 if(Rgenes & (1LLU<<63))Rgenes=Rgenes & 0x7fffffffllu; else Rgenes=0;
4343 srInt_64 Lpos= ((0LLU+*(int*)(Lptr+16+Lgenes*8+global_context->scRNA_UMI_length+4))<<32) | *(unsigned int*)(Lptr+16+Lgenes*8+global_context->scRNA_UMI_length+4+4);
4344 srInt_64 Rpos= ((0LLU+*(int*)(Rptr+16+Rgenes*8+global_context->scRNA_UMI_length+4))<<32) | *(unsigned int*)(Rptr+16+Rgenes*8+global_context->scRNA_UMI_length+4+4);
4345 if(Lpos>Rpos)return 1;
4346 if(Lpos<Rpos)return -1;
4347 return 0;
4348 }
4349
// Exchange callback for merge_sort(): swaps the record pointers at positions l and r.
//   ar : array whose first element is the record pointer array.
void scRNA_do_one_batch_sort_exchange(void * ar, int l, int r){
	void ** slots = ((void **) ar)[0];
	void * held = slots[r];
	slots[r] = slots[l];
	slots[l] = held;
}
4357
// Merge callback for merge_sort(): merges the two adjacent sorted runs
// [start, start+items) and [start+items, start+items+items2) of the record pointer array.
// When two records compare equal the one from the first run is taken first.
//   ar : two-element array { record pointer array, global context }.
void scRNA_do_one_batch_sort_merge(void * ar, int start, int items, int items2){
	void ** run = ((void **)((void **) ar)[0]) + start;
	int total = items2 + items;
	void ** merged = malloc(sizeof(void*)*total);
	int left = 0, right = items, out = 0;

	while(left < items || right < total){
		int take_left;
		if(right == total) take_left = 1;		// second run exhausted
		else if(left == items) take_left = 0;	// first run exhausted
		else take_left = scRNA_do_one_batch_sort_compare(ar, start + left, start + right) <= 0;

		if(take_left) merged[out++] = run[left++];
		else merged[out++] = run[right++];
	}

	memcpy(run, merged, sizeof(void*)*total);
	free(merged);
}
4374
4375 struct cell_gene_umi_supp{
4376 int cellbc;
4377 srInt_64 gene_no;
4378 char umi[MAX_UMI_LEN];
4379 int supp_reads;
4380 };
4381
// Counts the positions at which the first ulen bytes of u1 and u2 differ, giving up as soon
// as a second difference is seen. The result is therefore 0, 1 or 2 ("two or more").
int scRNA_hamming_max2_fixlen(char * u1, char * u2, int ulen){
	int mismatches = 0;
	int pos = 0;

	while(pos < ulen){
		if(u1[pos] != u2[pos] && ++mismatches > 1) return mismatches;
		pos++;
	}
	return mismatches;
}
4390
// Adds `no` to the value stored for (cell barcode `bc`, gene `gn`) in the hash table that
// is named cellBCp0_genep0_P1_to_UMIs at the point of use.
// The key is the integer ((bc << 32) | gn) + 1 carried in a pointer, and the value is the
// running total, also carried in a pointer. `bc` and `gn` are evaluated twice.
#define ADD_count_hash(bc,gn,no) \
	HashTablePut(cellBCp0_genep0_P1_to_UMIs, \
		NULL +1+(((1LLU*(bc))<<32)| (gn) ), \
		HashTableGet( cellBCp0_genep0_P1_to_UMIs, NULL +1+(((1LLU*(bc))<<32)| (gn))) +(no) )
4392
scRNA_do_one_batch_UMI_merge_one_cell(ArrayList * structs,int sec_start,int sec_end,int is_UMI_step2,HashTable * filtered_CGU_table)4393 void scRNA_do_one_batch_UMI_merge_one_cell(ArrayList* structs, int sec_start, int sec_end, int is_UMI_step2, HashTable * filtered_CGU_table){
4394 int x1;
4395 void ** app1 = structs -> appendix1;
4396 fc_thread_global_context_t * global_context = app1[0];
4397 HashTable * cellBCp0_genep0_P1_to_UMIs = app1[2];
4398 int sample_id = app1[3]-NULL;
4399
4400 if(is_UMI_step2){
4401 // NB: when this function is called, sec_end - sec_start MUST be >=2.
4402 for(x1 = sec_start; x1<sec_end; x1++) {
4403 struct cell_gene_umi_supp * str1 = ArrayListGet(structs, x1);
4404 if(x1 == sec_start){
4405 struct cell_gene_umi_supp * str2 = ArrayListGet(structs, sec_start+1);
4406 if(str1 -> supp_reads > str2 -> supp_reads){
4407 ADD_count_hash(str1->cellbc, str1->gene_no,1);
4408 continue;
4409 }
4410 }
4411
4412 str1 -> cellbc = -1;
4413 char replaced_key[40+MAX_UMI_LEN];
4414 #ifdef __MINGW32__
4415 int keyptr = sprintf(replaced_key,"%d-%I64d-", str1 -> cellbc, str1 -> gene_no);
4416 #else
4417 int keyptr = sprintf(replaced_key,"%d-%lld-", str1 -> cellbc, str1 -> gene_no);
4418 #endif
4419 memcpy(replaced_key+keyptr, str1 -> umi, global_context -> scRNA_UMI_length);
4420 replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
4421 HashTablePut(filtered_CGU_table, strdup(replaced_key), NULL-1);
4422 }
4423 }else{
4424 ArrayList * accepted_list =NULL;
4425 HashTable * looktable = NULL;
4426 if(sec_end - sec_start >30){
4427 looktable = StringTableCreate((sec_end - sec_start)/5);
4428 HashTableSetDeallocationFunctions(looktable, free, (void (*)(void *value))ArrayListDestroy);
4429 }else accepted_list = ArrayListCreate(sec_end - sec_start);
4430
4431 for(x1=sec_start; x1<sec_end; x1++){
4432 struct cell_gene_umi_supp * try_str = ArrayListGet(structs , x1);
4433 int x2, found = 0;
4434 ArrayList * test_accs;
4435 int hx;
4436
4437 if(looktable){
4438 for(hx = 0; hx<2; hx++){
4439 char test_ky[MAX_UMI_LEN];
4440 test_ky[0] = hx?'S':'F';
4441 memcpy(test_ky +1, try_str -> umi + hx * global_context -> scRNA_UMI_length/2 , global_context -> scRNA_UMI_length/2);
4442 test_ky[1+global_context -> scRNA_UMI_length/2]=0;
4443
4444 test_accs = HashTableGet(looktable, test_ky);
4445 if(!test_accs)continue;
4446
4447 for(x2=0; x2<test_accs->numOfElements; x2++){
4448 struct cell_gene_umi_supp * acc_str = ArrayListGet(test_accs, x2);
4449 if(scRNA_hamming_max2_fixlen(acc_str -> umi, try_str -> umi, global_context -> scRNA_UMI_length)<2){
4450 found=1;
4451 acc_str -> supp_reads += try_str -> supp_reads;
4452 try_str -> cellbc = -1;
4453
4454 char replaced_key[55+MAX_UMI_LEN];
4455 #ifdef __MINGW32__
4456 int keyptr = sprintf(replaced_key,"%d-%d-%I64d-", sample_id, try_str -> cellbc, try_str -> gene_no);
4457 #else
4458 int keyptr = sprintf(replaced_key,"%d-%d-%lld-", sample_id, try_str -> cellbc, try_str -> gene_no);
4459 #endif
4460
4461 memcpy(replaced_key+keyptr, try_str -> umi, global_context -> scRNA_UMI_length);
4462 replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
4463 HashTablePut(filtered_CGU_table, strdup(replaced_key), acc_str -> umi);
4464 break;
4465 }
4466 }
4467 if(found)break;
4468 }
4469 }else{
4470 test_accs = accepted_list;
4471
4472 for(x2=0; x2<test_accs->numOfElements; x2++){
4473 struct cell_gene_umi_supp * acc_str = ArrayListGet(test_accs, x2);
4474 if(scRNA_hamming_max2_fixlen(acc_str -> umi, try_str -> umi, global_context -> scRNA_UMI_length)<2){
4475 found=1;
4476 acc_str -> supp_reads += try_str -> supp_reads;
4477 try_str -> cellbc = -1;
4478
4479 char replaced_key[55+MAX_UMI_LEN];
4480 #ifdef __MINGW32__
4481 int keyptr = sprintf(replaced_key,"%d-%d-%I64d-", sample_id, try_str -> cellbc, try_str -> gene_no);
4482 #else
4483 int keyptr = sprintf(replaced_key,"%d-%d-%lld-", sample_id, try_str -> cellbc, try_str -> gene_no);
4484 #endif
4485 memcpy(replaced_key+keyptr, try_str -> umi, global_context -> scRNA_UMI_length);
4486 replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
4487 HashTablePut(filtered_CGU_table, strdup(replaced_key), acc_str -> umi);
4488 break;
4489 }
4490 }
4491 }
4492 if(!found){
4493 if(looktable){
4494 for(hx = 0; hx<2; hx++){
4495 char test_ky[MAX_UMI_LEN];
4496 test_ky[0] = hx?'S':'F';
4497 memcpy(test_ky +1, try_str -> umi + hx * global_context -> scRNA_UMI_length/2 , global_context -> scRNA_UMI_length/2);
4498 test_ky[1+global_context -> scRNA_UMI_length/2]=0;
4499 test_accs = HashTableGet(looktable, test_ky);
4500 if(!test_accs){
4501 test_accs = ArrayListCreate(10);
4502 HashTablePut(looktable, strdup(test_ky), test_accs);
4503 }
4504 ArrayListPush(test_accs, try_str);
4505 }
4506 }else ArrayListPush(accepted_list, try_str);
4507 }
4508 }
4509
4510 if(looktable)HashTableDestroy(looktable);
4511 else ArrayListDestroy(accepted_list);
4512 }
4513 }
4514
// Walks the sorted structure list, cuts it into sections and hands every section to
// scRNA_do_one_batch_UMI_merge_one_cell().
//
// Step 1 (is_UMI_step2 == 0): a section is a run of entries with the same cell and gene.
// Step 2 (is_UMI_step2 != 0): a section is a run of entries with the same cell and UMI;
//   a section with a single entry is counted directly through ADD_count_hash.
// Entries whose cellbc is negative are ignored.
//
// structs->appendix1 is an array: [0] global context, [2] the count table used by
// ADD_count_hash (only read in step 2).
void scRNA_do_one_batch_UMI_merge_one_step(ArrayList* structs, int is_UMI_step2, HashTable * filtered_CGU_table){
	void ** app1 = structs -> appendix1;
	fc_thread_global_context_t * global_context = app1[0];
	// The count table is only needed (and only guaranteed to be set) in step 2.
	HashTable * cellBCp0_genep0_P1_to_UMIs = is_UMI_step2 ? app1[2] : NULL;
	srInt_64 x1, sec_start = 0;
	srInt_64 old_sec_key = -1;

	// The first section starts at element 0, so its key has to come from element 0.
	// Starting from -1 would close element 0 as a one-item section as soon as element 1 is
	// examined, and a list holding a single valid element would never be processed at all.
	if(structs -> numOfElements > 0){
		struct cell_gene_umi_supp * first = ArrayListGet(structs, 0);
		if(first -> cellbc >= 0){
			old_sec_key = first -> cellbc;
			old_sec_key = old_sec_key << 32;
			if(!is_UMI_step2) old_sec_key = old_sec_key | first -> gene_no;
		}
	}

	for(x1=1; x1<=structs -> numOfElements; x1++){
		srInt_64 sec_key = -1;
		int is_umi_changed = 0;

		if(x1<structs -> numOfElements){
			struct cell_gene_umi_supp * str1 = ArrayListGet(structs, x1);
			if(str1 -> cellbc <0) continue;
			sec_key = str1 -> cellbc;
			sec_key = sec_key << 32;
			if(is_UMI_step2 && sec_key == old_sec_key){
				struct cell_gene_umi_supp * strold = ArrayListGet(structs, sec_start);
				is_umi_changed = memcmp(strold -> umi, str1-> umi, global_context-> scRNA_UMI_length);
			}else if(!is_UMI_step2) sec_key = sec_key | str1 -> gene_no;
			// gene_no itself is 64-bit, but it is nearly impossible to have two neighbouring
			// structures that have the same last 32-bit of gene_no.
		}

		// When x1 == numOfElements, sec_key is -1, which closes the last open section
		// (if there is one: old_sec_key is -1 when no valid entry has been seen).
		if( (x1>sec_start && sec_key!=old_sec_key) || is_umi_changed){
			struct cell_gene_umi_supp * str0 = ArrayListGet(structs, sec_start);
			if(x1 - sec_start>1 && str0->cellbc>=0) scRNA_do_one_batch_UMI_merge_one_cell(structs, sec_start, x1, is_UMI_step2, filtered_CGU_table);
			else if(is_UMI_step2 && str0->cellbc>=0) ADD_count_hash(str0->cellbc,str0->gene_no,1);

			old_sec_key = sec_key;
			sec_start = x1;
		}
	}
}
4550
// Comparison callback for sorting a list of struct cell_gene_umi_supp.
//
// me->appendix1 is an array: [0] global context, [1] the sort mode carried in a pointer.
//   mode != 0 : cell, then gene, then supporting reads, then UMI
//   mode == 0 : cell, then UMI, then supporting reads, then gene
// Supporting reads are ordered large -> small; everything else small -> large.
// UMIs are compared with memcmp over scRNA_UMI_length bytes and its result is returned as is.
int scRNA_do_one_batch_tab_to_struct_list_compare(void * L_elem, void * R_elem, ArrayList * me){
	struct cell_gene_umi_supp * lhs = L_elem, * rhs = R_elem;
	void ** app1 = me -> appendix1;
	fc_thread_global_context_t * global_context = app1[0];
	int use_gene = app1[1] - NULL;
	int pass;

	if(lhs->cellbc != rhs->cellbc) return lhs->cellbc > rhs->cellbc ? 1 : -1;

	// Pass 0 applies the primary criterion, pass 1 the other one; the read support is
	// compared in between.
	for(pass = 0; pass < 2; pass++){
		int diff;
		if(use_gene) diff = (lhs->gene_no > rhs->gene_no) - (lhs->gene_no < rhs->gene_no);
		else diff = memcmp(lhs->umi, rhs->umi, global_context -> scRNA_UMI_length);
		if(diff) return diff;

		if(pass == 0 && lhs->supp_reads != rhs->supp_reads)
			return lhs->supp_reads < rhs->supp_reads ? 1 : -1;	// reversed by # supp reads

		use_gene = !use_gene;
	}
	return 0;
}
4581
// HashTableIteration callback: turns one entry of the supporting-read table into a
// struct cell_gene_umi_supp and pushes it onto the list of its sample.
//
//   ky  : key string "<sample>-<cell>-<gene>-<UMI>"; the sample id is one-based.
//   val : number of supporting reads, carried in a pointer.
//   tab : appendix1 is the array of per-sample lists (indexed by sample id - 1) and
//         counter1 is the UMI length.
//
// The numbers are parsed with strtol/strtoll so that the minus sign of a negative cell
// number is not mistaken for a field separator. An entry with a sample id below 1 is
// reported and dropped instead of being stored at a negative list index.
void scRNA_do_one_batch_tab_to_struct_list(void *ky, void *val, HashTable * tab){
	int supp_reads = val-NULL;
	ArrayList ** cell_gene_umi_list = tab -> appendix1;
	int UMI_length = tab -> counter1;
	char * cursor = ky;

	int sample_id = (int)strtol(cursor, &cursor, 10); // one-based sample id
	if(sample_id<1){
		SUBREADprintf("WRONG SAMPLE ID: %d from '%s'\n", sample_id, (char*)ky);
		return;
	}

	struct cell_gene_umi_supp * new_item = malloc(sizeof(struct cell_gene_umi_supp));
	if(*cursor == '-') cursor++;
	new_item -> cellbc = (int)strtol(cursor, &cursor, 10);
	if(*cursor == '-') cursor++;
	new_item -> gene_no = strtoll(cursor, &cursor, 10);
	if(*cursor == '-') cursor++;
	memcpy(new_item->umi, cursor, UMI_length);
	new_item -> supp_reads = supp_reads;

	ArrayListPush(cell_gene_umi_list[sample_id-1], new_item);
}
4602
// HashTableIteration callback: writes one entry of the count table to the file stored in
// me->appendix1 as two 8-byte values: the key minus one, then the value.
void scRNA_do_one_batch_write_UMIs(void * vcell_gene, void * vumis, HashTable * me){
	void * cell_gene_key = vcell_gene - 1;	// undo the +1 that was added when the key was stored
	FILE * out_fp = me->appendix1;

	fwrite(&cell_gene_key,1,8,out_fp);
	fwrite(&vumis,1,8,out_fp);
}
4609
two_long_hash(void * ky)4610 srInt_64 two_long_hash(void * ky){
4611 srInt_64 * ky2 = ky;
4612 return ky2[0]^ky2[1];
4613 }
4614
two_long_compare(void * k1,void * k2)4615 int two_long_compare(void * k1, void * k2){
4616 srInt_64 * k13 = k1, *k23 = k2;
4617 if(k13[0]!=k23[0])return 1;
4618 if(k13[1]!=k23[1])return 1;
4619 return 0;
4620 }
4621
// Key format of the supporting-read table: "<sample>-<cell>-<gene>-<UMI>".
#if defined(__MINGW32__)
#define ADD_key_FMT1 "%d-%d-%I64d-%s"
#else
#define ADD_key_FMT1 "%d-%d-%lld-%s"
#endif

// Adds one supporting read for the current (sample_id, cell_no, gene_no, UMI_str) to the
// table supp_reads_SCGU; all five names are taken from the scope in which the macro is used.
// The count is carried in the value pointer. A new key is stored as a strdup'ed copy.
#define ADD_key_struct { char scgu_key_buf [50+MAX_UMI_LEN]; \
	sprintf(scgu_key_buf,ADD_key_FMT1, sample_id, cell_no, gene_no, UMI_str); \
	srInt_64 scgu_prev_supp = HashTableGet(supp_reads_SCGU, scgu_key_buf)-NULL; \
	if(scgu_prev_supp>=1) HashTablePutReplaceEx(supp_reads_SCGU, scgu_key_buf, NULL+scgu_prev_supp+1, 0,0,0); \
	else HashTablePut(supp_reads_SCGU, strdup(scgu_key_buf), NULL+1); }
4632
scRNA_do_one_batch(void * paramsp1)4633 void * scRNA_do_one_batch(void * paramsp1){
4634 srInt_64 x1;
4635 void ** params = paramsp1;
4636 fc_thread_global_context_t * global_context = params[0];
4637 ArrayList * file_size_list = params[2];
4638 char *temp_dir = global_context -> temp_file_dir;
4639 free(paramsp1);
4640 int me_max_Rbin_len = 0;
4641 int me_max_genes = 0;
4642 char ** bin_ptrs = malloc(sizeof(char*) * 1500000), * batch_content=NULL;
4643 int bin_ptr_size = 1500000;
4644 while(1){
4645 int this_batch_no = -1;
4646 pthread_spin_lock(&global_context -> scRNA_do_one_batch_runner_lock);
4647 if(global_context -> scRNA_do_one_batch_runner_current < global_context -> scRNA_barcode_batched_bin_no +1){
4648 int this_batch_sorted_idx = (global_context -> scRNA_do_one_batch_runner_current ++);
4649 srInt_64 this_batch_size_and_no = ArrayListGet(file_size_list, file_size_list->numOfElements-1 -this_batch_sorted_idx)-NULL;
4650 this_batch_no = (int)(this_batch_size_and_no&0xfffffllu);
4651 }
4652 if(me_max_genes > global_context -> scRNA_barcode_batched_max_genes) global_context -> scRNA_barcode_batched_max_genes = me_max_genes;
4653 if(me_max_Rbin_len > global_context->scRNA_barcode_batched_max_Rbin_len) global_context->scRNA_barcode_batched_max_Rbin_len = me_max_Rbin_len;
4654 pthread_spin_unlock(&global_context -> scRNA_do_one_batch_runner_lock);
4655 if(0>this_batch_no)break;
4656 char tmp_fname[MAX_FILE_NAME_LENGTH+80];
4657 sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", temp_dir, getpid(), this_batch_no);
4658 FILE * fp = fopen(tmp_fname, "rb");
4659 fseek(fp, 0, SEEK_END);
4660 srInt_64 batch_fsize = ftello(fp);
4661 fseek(fp, 0, SEEK_SET);
4662 if(batch_content==NULL) batch_content = malloc(batch_fsize);
4663 srInt_64 batch_content_len = fread(batch_content, 1, batch_fsize, fp);
4664 fclose(fp);
4665 if(batch_content_len!=batch_fsize){
4666 SUBREADprintf("ERROR: Cannot load file at once: %d!\n", this_batch_no);
4667 return NULL;
4668 }
4669
4670 HashTable * supp_reads_SCGU = StringTableCreate(500000);
4671 HashTableSetDeallocationFunctions(supp_reads_SCGU, free, NULL);
4672 srInt_64 scanptr = 0;
4673 int rbin_no = 0;
4674 char UMI_str[MAX_UMI_LEN+1];
4675
4676 while(scanptr < batch_content_len-1){
4677 int cell_no=0, sample_id=0;
4678 srInt_64 gene_no=0;
4679 if(bin_ptr_size<=rbin_no){
4680 bin_ptr_size = bin_ptr_size*2;
4681 bin_ptrs = realloc(bin_ptrs, sizeof(char*)*bin_ptr_size);
4682 }
4683 bin_ptrs[rbin_no] = batch_content+scanptr;
4684 memcpy(&sample_id, batch_content+scanptr, 4);
4685 scanptr += 4; // sample_ID
4686 memcpy(&cell_no, batch_content+scanptr, 4);
4687 scanptr += 4; // cellbarcode_NO
4688 memcpy(&gene_no, batch_content+scanptr, 8);
4689 scanptr += 8; // gene_id
4690 if(gene_no & (1LLU<<63)){
4691 int genes = (int)(gene_no & 0x7fffffffllu);
4692 if(genes > me_max_genes)me_max_genes=genes;
4693
4694 memcpy(UMI_str, batch_content+scanptr+8*genes, global_context -> scRNA_UMI_length);
4695 UMI_str[global_context -> scRNA_UMI_length]=0;
4696
4697 for(x1=0; x1<genes; x1++){
4698 memcpy(&gene_no, batch_content+scanptr, 8);
4699 scanptr += 8;
4700 ADD_key_struct;
4701 }
4702 }else{
4703 UMI_str[global_context -> scRNA_UMI_length]=0;
4704 memcpy(UMI_str, batch_content+scanptr, global_context -> scRNA_UMI_length);
4705 ADD_key_struct;
4706 }
4707
4708 scanptr += global_context -> scRNA_UMI_length ; // UMI str
4709
4710 int rbinlen = 0;
4711 memcpy(&rbinlen, batch_content+scanptr, 4);
4712
4713 if(me_max_Rbin_len < rbinlen) me_max_Rbin_len = rbinlen;
4714 scanptr += rbinlen +4; // read_bin
4715
4716 // if(sample_id <0 || sample_id > 1000) SUBREADprintf("Wrong Sample: RNO=%d; ptr=%lld\n", rbin_no, scanptr);
4717 rbin_no++;
4718 }
4719 ArrayList ** cell_gene_umi_list = malloc(sizeof(void*)*global_context -> scRNA_sample_sheet_table -> numOfElements);
4720 for(x1 =0; x1< global_context -> scRNA_sample_sheet_table -> numOfElements; x1++){
4721 cell_gene_umi_list[x1]=ArrayListCreate(2000000);
4722 ArrayListSetDeallocationFunction(cell_gene_umi_list[x1], free);
4723 }
4724 supp_reads_SCGU -> appendix1 = cell_gene_umi_list;
4725 supp_reads_SCGU -> appendix2 = global_context;
4726 supp_reads_SCGU -> counter1 = global_context -> scRNA_UMI_length;
4727 HashTableIteration(supp_reads_SCGU, scRNA_do_one_batch_tab_to_struct_list);
4728 HashTable * filtered_SCGU_table = StringTableCreate(max(10000,cell_gene_umi_list[0] -> numOfElements / 10));
4729 HashTableSetDeallocationFunctions(filtered_SCGU_table, free, NULL);
4730
4731 fp = fopen(tmp_fname, "wb");
4732 for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements; x1++){
4733 HashTable * cellbcP0_to_geneno0B_P1_to_UMIs = HashTableCreate(500000);
4734
4735 void * app1[3];
4736 cell_gene_umi_list[x1] -> appendix1 = app1;
4737 app1[0] = global_context;
4738 app1[1] = NULL+1;
4739 // 0 : sorted by cell_bc, then UMIstr, then supported_reads, then gene
4740 // 1 : sorted by cell_bc, then gene, then supported_reads, then UMIstr
4741 // supported_reads : large -> small; the other: small -> large
4742 ArrayListSort(cell_gene_umi_list[x1], scRNA_do_one_batch_tab_to_struct_list_compare);
4743 scRNA_do_one_batch_UMI_merge_one_step(cell_gene_umi_list[x1], 0, filtered_SCGU_table);
4744
4745 app1[1] = NULL+0;
4746 app1[2] = cellbcP0_to_geneno0B_P1_to_UMIs;
4747 ArrayListSort(cell_gene_umi_list[x1], scRNA_do_one_batch_tab_to_struct_list_compare);
4748 scRNA_do_one_batch_UMI_merge_one_step(cell_gene_umi_list[x1], 1, filtered_SCGU_table);
4749
4750 cellbcP0_to_geneno0B_P1_to_UMIs -> appendix1 = fp;
4751 fwrite(&cellbcP0_to_geneno0B_P1_to_UMIs -> numOfElements,1,8,fp);
4752 HashTableIteration(cellbcP0_to_geneno0B_P1_to_UMIs, scRNA_do_one_batch_write_UMIs);
4753 HashTableDestroy(cellbcP0_to_geneno0B_P1_to_UMIs);
4754 }
4755
4756 void * sort_base[2];
4757 sort_base[0] = bin_ptrs;
4758 sort_base[1] = global_context;
4759 merge_sort(sort_base, rbin_no, scRNA_do_one_batch_sort_compare, scRNA_do_one_batch_sort_exchange, scRNA_do_one_batch_sort_merge);
4760
4761
4762
4763 for(x1 = 0; x1 < rbin_no; x1++){
4764 char * binptr = bin_ptrs[x1];
4765 int cellid =0, sampleid = 0;
4766 srInt_64 gene_no =0, genes = 0, geneno_0 = 0;
4767 char * umi, * glist_ptr =NULL;
4768 memcpy(&sampleid, binptr, 4);
4769 memcpy(&cellid, binptr+4, 4);
4770 memcpy(&gene_no, binptr+8, 8);
4771 if(gene_no & (1LLU<<63)){
4772 glist_ptr =binptr + 16;
4773 genes = (int)(gene_no & 0x7fffffff);
4774 memcpy(&geneno_0, binptr+16, 8);
4775 }
4776 umi = binptr + 16 + 8*genes;
4777 char SCGU_key [40+MAX_UMI_LEN];
4778
4779 #ifdef __MINGW32__
4780 int keyptr = sprintf(SCGU_key,"%d-%d-%I64d-", sampleid, cellid, (gene_no & (1LLU<<63))? geneno_0: gene_no);
4781 #else
4782 int keyptr = sprintf(SCGU_key,"%d-%d-%lld-", sampleid, cellid, (gene_no & (1LLU<<63))? geneno_0: gene_no);
4783 #endif
4784 memcpy(SCGU_key+keyptr, umi, global_context -> scRNA_UMI_length);
4785 SCGU_key[keyptr+global_context -> scRNA_UMI_length] = 0;
4786
4787 char * new_UMI = HashTableGet(filtered_SCGU_table, SCGU_key);
4788 if(new_UMI) umi = new_UMI;
4789 if(umi == NULL-1) umi="-----------------------------------------";
4790 fwrite(&sampleid, 1, 4, fp);
4791 fwrite(&cellid, 1, 4, fp);
4792 fwrite(&gene_no, 1, 8, fp);
4793 if(gene_no & (1LLU<<63)) fwrite( glist_ptr, 1, 8*genes, fp );
4794 fwrite(umi,1, global_context -> scRNA_UMI_length, fp);
4795 int binlen;
4796
4797 memcpy(&binlen, binptr+16+8*genes+global_context -> scRNA_UMI_length,4 );
4798 char * new_cellbc = NULL;
4799 if(cellid>=0)new_cellbc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cellid);
4800 scRNA_do_one_batch_write_extend_rbin(global_context, binptr+16+8*genes+global_context -> scRNA_UMI_length, binlen, fp, new_cellbc, umi[0]=='-'?NULL:umi, gene_no, (srInt_64*)glist_ptr);
4801 }
4802 fclose(fp);
4803 HashTableDestroy(supp_reads_SCGU);
4804 HashTableDestroy(filtered_SCGU_table);
4805 for(x1 =0; x1< global_context -> scRNA_sample_sheet_table -> numOfElements; x1++)ArrayListDestroy(cell_gene_umi_list[x1]);
4806 free(cell_gene_umi_list);
4807 }
4808 free(batch_content);
4809 free(bin_ptrs);
4810 return NULL;
4811 }
4812
// Returns the total number of bases covered by the given sections, counting bases covered
// by several sections of the same chromosome only once.
//
//   chros, start_poses, lens : chromosome name, start position and length of each section.
//   sections                 : number of sections; 0 is returned when it is not positive.
//
// Sections are grouped by chromosome name. Each group is sorted with basic_sort() and passed
// to mergeIntervals(), whose output is read here as (start, end) pairs.
// global_context, thread_context and read_name are not used.
unsigned int calc_score_overlaps(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char ** chros, unsigned int * start_poses, unsigned short * lens, int sections, char * read_name){
	// A variable-length array must have a positive size.
	if(sections <= 0) return 0;

	unsigned int in_intervals[ 2*sections ];
	unsigned int out_intervals[ 2*sections ];
	char used_interval[ sections ];
	unsigned int ret = 0;
	int x1;

	memset(used_interval, 0 , sections);

	for(x1 = 0 ; x1 < sections ; x1++){
		if( used_interval [x1] )continue;

		// Section x1 opens a new group ...
		in_intervals[0] = start_poses[x1];
		in_intervals[1] = start_poses[x1] + lens[x1];
		used_interval[x1]=1;

		// ... which collects every later section on the same chromosome.
		int x2, this_sections = 1;
		for(x2 = x1 + 1; x2 < sections; x2++){
			if(strcmp( chros[x2], chros[x1] ) == 0){
				in_intervals[this_sections*2] = start_poses[x2];
				in_intervals[this_sections*2 + 1] = start_poses[x2] + lens[x2];
				used_interval[x2]=1;
				this_sections++;
			}
		}

		basic_sort( in_intervals, this_sections, overlap_compare, overlap_exchange );

		int merged_secs = mergeIntervals( in_intervals, out_intervals, this_sections );
		for(x2 = 0; x2 < merged_secs; x2++)
			ret += ( out_intervals[x2*2+1] - out_intervals[x2*2] );
	}
	return ret;
}
4846
4847
/*
 * Helper of vote_and_add_count(): credit one fragment to exactly one feature.
 *
 * exon_id                 index into the count table that receives the count.
 * fixed_fractional_count  amount added to count_table[exon_id].
 * RG_name                 when non-NULL, the per-read-group tables returned by
 *                         get_RG_tables() are updated instead of the tables of
 *                         the thread context.
 * read_name, bin1, bin2   forwarded unchanged to add_scRNA_read_to_pool() /
 *                         write_read_details_FP().
 *
 * Besides the counters, the fragment is reported as "Assigned" in the read
 * detail output (if enabled) and is added to the scRNA pool (if enabled); the
 * pool receives the gene number instead of the exon index when is_gene_level
 * is set.
 */
static void vote_assign_single_feature(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,
			srInt_64 exon_id, int fixed_fractional_count, char * read_name, char * RG_name, char * bin1, char * bin2){
	if(RG_name){
		// slot 0 is used as the count table and slot 1 as the summary counters.
		void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
		fc_read_counters * sumtab = tab4s[1];
		read_count_type_t * count_table = tab4s[0];

		sumtab -> assigned_reads++;
		count_table[exon_id] += fixed_fractional_count;
	}else{
		thread_context->count_table[exon_id] += fixed_fractional_count;
		thread_context->read_counters.assigned_reads ++;
	}
	thread_context->nreads_mapped_to_exon++;

	if(global_context -> read_details_out_FP){
		int final_gene_number = global_context -> exontable_geneid[exon_id];
		char * final_feture_name = (char *)global_context -> gene_name_array[final_gene_number];
		write_read_details_FP(global_context, thread_context, "Assigned", 1, final_feture_name, bin1, bin2);
	}

	if(global_context -> do_scRNA_table){
		srInt_64 assignment_target_number = exon_id;
		if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[exon_id];
		add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name, bin1, NULL);
	}
}

/*
 * Decide which feature(s) a fragment is assigned to and update the counters.
 *
 * hits_indices1/2, nhits1/2     feature (exon) indices hit by the first / second
 *                               end and their numbers.  NOTE: entries may be
 *                               overwritten with -1 by this function when a
 *                               feature fails the feature-overlap thresholds.
 * total_frag_len                fragment length used for the per-read overlap
 *                               thresholds.
 * hits_chro*, hits_start_pos*,
 * hits_length*                  chromosome / start / length of every hit; they
 *                               are handed to calc_score_overlaps().
 * fixed_fractional_count        amount added to the count table for this fragment
 *                               (split by calculate_multi_overlap_fraction() when
 *                               several features receive the fragment).
 * read_name, bin1, bin2         forwarded to the scRNA pool / read detail writer.
 * RG_name                       read-group name; NULL selects the tables held in
 *                               the thread context.
 *
 * Outcome is one of: Assigned, Unassigned_NoFeatures,
 * Unassigned_Overlapping_Length or Unassigned_Ambiguity.
 */
void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,
			srInt_64 * hits_indices1, int nhits1, srInt_64 * hits_indices2, int nhits2, unsigned int total_frag_len,
			char ** hits_chro1, char ** hits_chro2, unsigned int * hits_start_pos1, unsigned int * hits_start_pos2, unsigned short * hits_length1, unsigned short * hits_length2, int fixed_fractional_count, char * read_name, char * RG_name, char * bin1, char * bin2){

	// Fast paths: no overlap length is needed and there is only one candidate.
	if(global_context -> need_calculate_overlap_len == 0){
		if(nhits2+nhits1==1){
			vote_assign_single_feature(global_context, thread_context, nhits2?hits_indices2[0]:hits_indices1[0], fixed_fractional_count, read_name, RG_name, bin1, bin2);
			return;
		}
		if(nhits2 == 1 && nhits1 == 1 && hits_indices2[0]==hits_indices1[0]){
			vote_assign_single_feature(global_context, thread_context, hits_indices1[0], fixed_fractional_count, read_name, RG_name, bin1, bin2);
			return;
		}
	}

	// Build a voting table.
	// The voting table should be:
	//      total_length [nhit_final] = total_length_overlapping
	//      final_id [nhit_final] = final_exon_id
	//
	// If is_gene_level, hits are merged by gene; scoring_exon_ids[] keeps the exon id where the count is added.
	//
	// After all, the count is added to all hits where the score has the maximum value.
	// If there are more than one locations having the same score, then the fragment is ambiguous.
	// Count is added when "-O" is specified.
	//
	// merge feature : if a read overlaps with an EXON twice or more times (by >=2 segments in cigar),
	//                 then the total length of the overlapped bases is calculated.
	//
	// two ends in a fragment is considered individually; the overlapping bases are not added up.

	unsigned int * scoring_numbers = thread_context -> scoring_buff_numbers;		// size is : MAX_HIT_NUMBER *2
	unsigned int * scoring_flags = thread_context -> scoring_buff_flags;			// size is : MAX_HIT_NUMBER *2
	unsigned int * scoring_overlappings = thread_context -> scoring_buff_overlappings;	// size is : MAX_HIT_NUMBER *2
	srInt_64 * scoring_exon_ids = thread_context -> scoring_buff_exon_ids;			// size is : MAX_HIT_NUMBER *2
	int scoring_count = 0, score_x1;

	if( global_context -> need_calculate_overlap_len ){
		int end1, end2, hit_x1, hit_x2;
		char ** scoring_gap_chros = thread_context -> scoring_buff_gap_chros;
		unsigned int * scoring_gap_starts = thread_context -> scoring_buff_gap_starts;		// size is : MAX_HIT_NUMBER *2;
		unsigned short * scoring_gap_lengths = thread_context -> scoring_buff_gap_lengths;	// size is : MAX_HIT_NUMBER *2* global_context -> max_M*2

		// A variable-length array must have a size greater than zero, and one
		// of the ends may have no hit at all, so at least one byte is reserved.
		char used_hit1 [nhits1 > 0 ? nhits1 : 1];
		char used_hit2 [nhits2 > 0 ? nhits2 : 1];

		// Stage 1: drop the features that are not sufficiently covered by the fragment.
		if( global_context -> fractional_minimum_feature_overlapping > 1E-10 || global_context -> max_missing_bases_in_feature >= 0){
			memset(used_hit1 , 0 , nhits1);
			memset(used_hit2 , 0 , nhits2);
			for(end1 = 0; end1 < global_context -> is_paired_end_mode_assign + 1 ; end1++){
				int allhits = end1?nhits2:nhits1;
				srInt_64 * hits_indices_X1 = end1?hits_indices2:hits_indices1;
				char * used_hit_X1 = end1?used_hit2:used_hit1;

				for(hit_x1 = 0; hit_x1 < allhits; hit_x1++){
					if(used_hit_X1[hit_x1])continue;

					srInt_64 tested_exon_id = hits_indices_X1[hit_x1];
					srInt_64 exon_span = global_context -> exontable_stop[tested_exon_id] +1;
					exon_span -= global_context -> exontable_start[tested_exon_id];

					// Thresholds are kept in units of 1/10000 base.
					srInt_64 applied_overlapping_threshold_frac = 0, applied_overlapping_threshold_missing = 0;
					if(global_context -> max_missing_bases_in_feature >= 0){
						if(exon_span <= global_context -> max_missing_bases_in_feature) applied_overlapping_threshold_missing = 0;
						else applied_overlapping_threshold_missing = 10000L * (exon_span - global_context -> max_missing_bases_in_feature);
					}

					applied_overlapping_threshold_frac = (srInt_64)(exon_span *10000.* global_context -> fractional_minimum_feature_overlapping + 0.9999);

					srInt_64 applied_overlapping_threshold = max(applied_overlapping_threshold_frac , applied_overlapping_threshold_missing);

					scoring_gap_chros[0 ] = (end1?hits_chro2:hits_chro1)[hit_x1];
					scoring_gap_starts[0 ] = (end1?hits_start_pos2:hits_start_pos1)[hit_x1];
					scoring_gap_lengths[0 ] = (end1?hits_length2:hits_length1)[hit_x1];
					int gaps=1;

					// Collect every segment (from either end) that hits the same feature.
					for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
						int allhits2 = end2?nhits2:nhits1;
						char * used_hit_X2 = end2?used_hit2:used_hit1;
						srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;

						for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
							if(used_hit_X2[hit_x2]) continue;
							srInt_64 other_exon_id = hits_indices_X2[hit_x2];
							if(other_exon_id == tested_exon_id){
								used_hit_X2[ hit_x2 ]=1;
								scoring_gap_chros[ gaps ] = (end2?hits_chro2:hits_chro1)[hit_x2];
								scoring_gap_starts[ gaps ] = (end2?hits_start_pos2:hits_start_pos1)[hit_x2];
								scoring_gap_lengths[ gaps ] = (end2?hits_length2:hits_length1)[hit_x2];
								gaps ++;
							}
						}
					}

					// The multiplication is carried out in 64 bits on every platform.
					srInt_64 tested_exon_overlap_any_read = (srInt_64)10000 * calc_score_overlaps(global_context, thread_context, scoring_gap_chros, scoring_gap_starts, scoring_gap_lengths, gaps, read_name);
					if(applied_overlapping_threshold > tested_exon_overlap_any_read){
						// remove this exon from lists
						for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
							int allhits2 = end2?nhits2:nhits1;
							srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;

							for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
								srInt_64 other_exon_id = hits_indices_X2[hit_x2];
								if(other_exon_id == tested_exon_id){
									hits_indices_X2[hit_x2] = -1;
								}
							}
						}
					}
				}
			}
		}

		// Stage 2: one scoring entry per merge key (gene or exon), with its overlapping length.
		memset(used_hit1 , 0 , nhits1);
		memset(used_hit2 , 0 , nhits2);

		for(end1 = 0; end1 < global_context -> is_paired_end_mode_assign + 1 ; end1++){
			srInt_64 * hits_indices_X1 = end1?hits_indices2:hits_indices1;
			char * used_hit_X1 = end1?used_hit2:used_hit1;
			int nhit_X1 = end1?nhits2:nhits1;

			for( hit_x1 = 0 ; hit_x1 < nhit_X1; hit_x1 ++ ){
				if(used_hit_X1[hit_x1])continue;

				int gaps = 0;
				srInt_64 tmp_exon_id = hits_indices_X1[hit_x1];
				if(tmp_exon_id < 0) continue;	// removed in stage 1
				srInt_64 score_merge_key;
				if (global_context -> is_gene_level )
					score_merge_key = global_context -> exontable_geneid[tmp_exon_id];
				else	score_merge_key = tmp_exon_id;

				scoring_gap_chros[0 ] = (end1?hits_chro2:hits_chro1)[hit_x1];
				scoring_gap_starts[0 ] = (end1?hits_start_pos2:hits_start_pos1)[hit_x1];
				scoring_gap_lengths[0 ] = (end1?hits_length2:hits_length1)[hit_x1];

				gaps=1;

				// flags: bit 1 = supported by the first end, bit 2 = supported by the second end.
				scoring_flags[scoring_count] = end1?2:1;
				scoring_numbers[scoring_count] =1;
				scoring_exon_ids[scoring_count] = tmp_exon_id;

				used_hit_X1[ hit_x1 ]=1;

				for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
					srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;
					char * used_hit_X2 = end2?used_hit2:used_hit1;
					int nhit_X2 = end2?nhits2:nhits1;

					for( hit_x2 = 0 ; hit_x2 < nhit_X2; hit_x2 ++ ){
						if(used_hit_X2[hit_x2])continue;
						if(hits_indices_X2[hit_x2] < 0) continue;

						srInt_64 X2_merge_key;
						if (global_context -> is_gene_level )
							X2_merge_key = global_context -> exontable_geneid[ hits_indices_X2[hit_x2] ];
						else	X2_merge_key = hits_indices_X2[hit_x2];

						if( X2_merge_key == score_merge_key ){
							used_hit_X2[ hit_x2 ]=1;
							scoring_gap_chros[ gaps ] = (end2?hits_chro2:hits_chro1)[hit_x2];
							scoring_gap_starts[ gaps ] = (end2?hits_start_pos2:hits_start_pos1)[hit_x2];
							scoring_gap_lengths[ gaps ] = (end2?hits_length2:hits_length1)[hit_x2];

							if((scoring_flags[scoring_count] & (end2?2:1))== 0 ){
								scoring_flags[scoring_count] |= end2?2:1;
								scoring_numbers[scoring_count] ++;
							}
							gaps ++;
						}
					}
				}

				scoring_overlappings [scoring_count] = calc_score_overlaps(global_context, thread_context, scoring_gap_chros, scoring_gap_starts, scoring_gap_lengths, gaps, read_name);
				if( global_context -> use_overlapping_break_tie )
					scoring_numbers[scoring_count] = scoring_overlappings [scoring_count];
				scoring_count++;
			}
		}
	}else{
		// No overlap length is needed: the score is the number of ends that hit the feature.
		int ends;
		for(ends =0 ; ends < global_context -> is_paired_end_mode_assign + 1 ; ends++){
			int nhits = ends?nhits2:nhits1;
			srInt_64 * hits_indices = ends?hits_indices2:hits_indices1;

			int hit_x1;
			for(hit_x1 = 0; hit_x1 < nhits; hit_x1++){
				srInt_64 tmp_exon_id = hits_indices[hit_x1], score_merge_key;
				int found = 0;
				if (global_context -> is_gene_level )
					score_merge_key = global_context -> exontable_geneid[tmp_exon_id];
				else	score_merge_key = tmp_exon_id;

				for(score_x1 = 0; score_x1 < scoring_count; score_x1 ++){
					srInt_64 score_x1_key ;
					if (global_context -> is_gene_level )
						score_x1_key = global_context -> exontable_geneid[ scoring_exon_ids[score_x1] ];
					else	score_x1_key = scoring_exon_ids[score_x1] ;

					if( score_x1_key == score_merge_key ){
						if((scoring_flags[score_x1] & ( ends?2:1 )) == 0) {
							scoring_flags[score_x1] |= (ends?2:1);
							scoring_numbers[score_x1] ++;
						}

						found = 1;
						break;
					}
				}

				if(0 == found){
					scoring_exon_ids[scoring_count] = tmp_exon_id;
					scoring_flags[scoring_count] = ends?2:1;
					scoring_numbers[scoring_count] = 1;

					scoring_count++;
				}
			}
		}
	}

	int maximum_score = 0;
	int maximum_total_count = 0;
	int maximum_score_x1 = 0;
	srInt_64 applied_fragment_minimum_overlapping_overlap = 1, applied_fragment_minimum_overlapping_missing = 1;
	srInt_64 applied_fragment_minimum_overlapping = 1;
	int overlapping_total_count = 0;

	// Per-read overlap threshold, in units of 1/10000 base.
	if( global_context -> fragment_minimum_overlapping > 1 || global_context -> need_calculate_fragment_len || global_context -> max_missing_bases_in_read >= 0){
		if(global_context -> max_missing_bases_in_read >=0){
			if(total_frag_len <= global_context -> max_missing_bases_in_read) applied_fragment_minimum_overlapping_missing = 0;
			else applied_fragment_minimum_overlapping_missing = (srInt_64)10000 * (total_frag_len - global_context -> max_missing_bases_in_read);
		}

		applied_fragment_minimum_overlapping_overlap = max( 10000L * global_context -> fragment_minimum_overlapping, 10000. * global_context -> fractional_minimum_overlapping * total_frag_len + 0.9999);

		applied_fragment_minimum_overlapping = max(applied_fragment_minimum_overlapping_overlap , applied_fragment_minimum_overlapping_missing);
	}

	if(scoring_count == 0){
		if(global_context -> read_details_out_FP)
			write_read_details_FP(global_context, thread_context,"Unassigned_NoFeatures",-1, NULL, bin1, bin2);
		if(RG_name){
			void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
			fc_read_counters * sumtab = tab4s[1];
			sumtab -> unassigned_nofeatures++;
		}else thread_context->read_counters.unassigned_nofeatures ++;

		if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
		return;
	}

	for(score_x1 = 0; score_x1 < scoring_count ; score_x1++){
		// NOTE(review): scoring_overlappings[] is only filled in this call when
		// need_calculate_overlap_len is set; presumably every option that raises
		// the threshold above 1 also enables that flag -- confirm.
		if( applied_fragment_minimum_overlapping > 1 )
			if( applied_fragment_minimum_overlapping > (srInt_64)10000 * scoring_overlappings[score_x1] ){
				scoring_numbers[score_x1] = 0;
				continue;
			}

		if( maximum_score < scoring_numbers[score_x1] ){
			maximum_total_count = 1;
			maximum_score = scoring_numbers[score_x1];
			maximum_score_x1 = score_x1;
		}else if( maximum_score == scoring_numbers[score_x1] )
			maximum_total_count++;
		overlapping_total_count ++;
	}

	if(maximum_total_count == 0){
		if(global_context -> read_details_out_FP)
			write_read_details_FP(global_context, thread_context,"Unassigned_Overlapping_Length", -1, NULL, bin1, bin2);

		if(RG_name){
			void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
			fc_read_counters * sumtab = tab4s[1];
			sumtab -> unassigned_overlapping_length++;
		}else thread_context->read_counters.unassigned_overlapping_length ++;

		if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
		return;
	}

	// final adding votes.
	if(1 == maximum_total_count && !global_context -> is_multi_overlap_allowed) {
		// simple add to the exon that has the single highest score.
		vote_assign_single_feature(global_context, thread_context, scoring_exon_ids[maximum_score_x1], fixed_fractional_count, read_name, RG_name, bin1, bin2);
		return;
	}

	if(!global_context -> is_multi_overlap_allowed) {
		// Several features share the highest score and multi-overlap is not allowed.
		if(global_context -> read_details_out_FP)
			write_read_details_FP(global_context, thread_context,"Unassigned_Ambiguity", -1, NULL, bin1, bin2);
		if(RG_name){
			fc_read_counters * sumtab = get_RG_tables(global_context, thread_context, RG_name)[1];
			sumtab -> unassigned_ambiguous++;
		}else{
			thread_context->read_counters.unassigned_ambiguous ++;
		}

		if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
		return;
	}

	// Multi-overlap is allowed: every surviving feature receives (a share of) the fragment.
	#define GENE_NAME_LIST_BUFFER_SIZE (FEATURE_NAME_LENGTH * 50)

	char final_feture_names[GENE_NAME_LIST_BUFFER_SIZE];
	int assigned_no = 0, xk1;
	final_feture_names[0]=0;
	int is_etc = 0;		// number of feature names that did not fit into the buffer

	ArrayList * assigned_list = NULL;
	if(global_context -> do_scRNA_table)assigned_list = ArrayListCreate(20);
	for(xk1 = 0; xk1 < scoring_count; xk1++)
	{
		// This change was made on 31/MAR/2016
		if( scoring_numbers[xk1] < 1 ) continue ;
		if( scoring_numbers[xk1] < maximum_score && global_context -> use_overlapping_break_tie ) continue ;

		srInt_64 tmp_voter_id = scoring_exon_ids[xk1];

		srInt_64 assignment_target_number = tmp_voter_id;
		if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[tmp_voter_id];

		if(global_context -> do_scRNA_table)ArrayListPush(assigned_list, NULL+assignment_target_number);

		if(RG_name){
			void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
			read_count_type_t * count_table = tab4s[0];
			count_table[tmp_voter_id] += calculate_multi_overlap_fraction(global_context, fixed_fractional_count, overlapping_total_count);
		}else	thread_context->count_table[tmp_voter_id] += calculate_multi_overlap_fraction(global_context, fixed_fractional_count, overlapping_total_count);

		if(global_context -> read_details_out_FP) {
			size_t names_len = strlen(final_feture_names);
			if(names_len < (GENE_NAME_LIST_BUFFER_SIZE - 40 - FEATURE_NAME_LENGTH)) {
				int final_gene_number = global_context -> exontable_geneid[tmp_voter_id];
				unsigned char * final_feture_name = global_context -> gene_name_array[final_gene_number];
				// Append "<name>," ; the write is bounded by the space that is really left.
				snprintf(final_feture_names + names_len, GENE_NAME_LIST_BUFFER_SIZE - names_len, "%s,", (char *)final_feture_name);
			}else{
				is_etc ++;
			}
			assigned_no++;
		}
	}

	if(global_context -> do_scRNA_table && assigned_list->numOfElements>0)
		add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, assigned_list);

	if(assigned_list)ArrayListDestroy(assigned_list);

	if(is_etc){
		size_t names_len = strlen(final_feture_names);
		snprintf(final_feture_names + names_len, GENE_NAME_LIST_BUFFER_SIZE - names_len, "... (%d names ommited),", is_etc);
	}
	final_feture_names[GENE_NAME_LIST_BUFFER_SIZE-1]=0;

	if(RG_name){
		void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
		fc_read_counters * sumtab = tab4s[1];
		sumtab -> assigned_reads++;
	}else{
		thread_context->read_counters.assigned_reads ++;
	}
	thread_context->nreads_mapped_to_exon++;

	if(global_context -> read_details_out_FP) {
		// Drop the trailing comma of the name list.
		int ffnn = strlen(final_feture_names);
		if(ffnn>0) final_feture_names[ffnn-1]=0;
		// overlapped but still assigned
		write_read_details_FP(global_context, thread_context, "Assigned", assigned_no, final_feture_names, bin1, bin2);
	}
}
5292
/*
 * HashTableIteration() callback run on one per-gene table of a thread.
 *
 * ky   (cell/UMI key + 1) stored as a pointer; the low 32 bits are the
 *      thread-local UMI number, the high 32 bits are kept as they are.
 * val  number of supporting reads, stored as a pointer.
 * tab  the table being iterated; appendix1..3 and counter1 carry the
 *      thread-to-global UMI map, the two merged tables and the gene number
 *      (they are set by scRNA_merge_thread_reads()).
 *
 * The key is translated to the global UMI numbering, pushed to the merged
 * per-gene list and its read count is accumulated in the merged per-gene
 * read table.
 */
void scRNA_merge_thread_reads_in(void *ky, void *val, HashTable * tab){
	int * local_to_global_umi = tab->appendix1;
	HashTable * gene_to_bcumi_lists = tab->appendix2;
	HashTable * gene_to_bcumi_reads = tab->appendix3;
	srInt_64 gene_no = tab->counter1;
	int reads_in_thread = val-NULL;

	// Replace the thread-local UMI number (low 32 bits) by the global one.
	srInt_64 cell_local_umi = (ky - NULL-1);
	srInt_64 cell_global_umi = (cell_local_umi & 0xffffffff00000000llu) + local_to_global_umi[ cell_local_umi & 0xffffffff ];

	// Keys are stored "plus one" so that they are never NULL.
	void * gene_key = NULL+1+gene_no;
	void * bcumi_key = NULL+cell_global_umi +1;

	ArrayList * bcumi_list = HashTableGet(gene_to_bcumi_lists, gene_key);
	if(!bcumi_list){
		bcumi_list = ArrayListCreate(10);
		HashTablePut(gene_to_bcumi_lists, gene_key, bcumi_list);
	}
	ArrayListPush(bcumi_list, bcumi_key);

	HashTable * bcumi_reads = HashTableGet(gene_to_bcumi_reads, gene_key);
	if(!bcumi_reads){
		bcumi_reads = HashTableCreate(10000);
		HashTablePut(gene_to_bcumi_reads, gene_key, bcumi_reads);
	}

	// Add the reads seen by this thread to what is already recorded.
	int merged_reads = HashTableGet(bcumi_reads, bcumi_key) - NULL;
	merged_reads += reads_in_thread;
	HashTablePut(bcumi_reads, bcumi_key, NULL+merged_reads);
}
5320
/*
 * HashTableIteration() callback run on the gene table of a thread.
 *
 * ky   (gene number + 1) stored as a pointer.
 * val  the per-gene HashTable of this thread.
 * tab  the table being iterated; its appendix1..3 are copied to the per-gene
 *      table so that scRNA_merge_thread_reads_in() can reach them.
 */
void scRNA_merge_thread_reads(void *ky, void *val, HashTable * tab){
	HashTable * per_gene_table = val;
	srInt_64 gene_no = ky-NULL -1;
	int * local_to_global_umi = tab->appendix1;
	HashTable * merged_lists = tab->appendix2;
	HashTable * merged_read_tabs = tab->appendix3;

	// Hand the merge context over to the inner iteration.
	per_gene_table -> counter1 = gene_no;
	per_gene_table -> appendix1 = local_to_global_umi;
	per_gene_table -> appendix2 = merged_lists;
	per_gene_table -> appendix3 = merged_read_tabs;

	HashTableIteration(per_gene_table, scRNA_merge_thread_reads_in);
}
5336
/*
 * HashTableIteration() callback run on the UMI table of a thread.
 *
 * ky   the UMI string.
 * val  (thread-local UMI number + 1) stored as a pointer.
 * tab  the table being iterated: appendix1 is the local-to-global number map
 *      that is filled here, appendix2 the merged UMI table, appendix3 the
 *      merged UMI list.
 *
 * A UMI that is not in the merged table yet gets the next global number; a
 * copy of its string is used as the key and is also appended to the list.
 */
void scRNA_merge_thread_umitables(void *ky, void *val, HashTable * tab){
	int * local_to_global_umi = tab->appendix1;
	HashTable * global_umi_table = tab->appendix2;
	ArrayList * global_umi_list = tab->appendix3;
	char * umi_seq = ky;

	int local_idx = val-NULL-1;
	assert(local_idx >= 0);

	// A missing key gives NULL, which turns into -1 here.
	int global_idx = HashTableGet(global_umi_table , umi_seq)-NULL-1;
	if(global_idx >= 0){
		local_to_global_umi[ local_idx ] = global_idx;
		return;
	}

	char * owned_seq = strdup(umi_seq);
	global_idx = global_umi_table -> numOfElements;
	HashTablePut(global_umi_table, owned_seq, NULL+global_idx+1);

	// The list index of a UMI has to be equal to its global number.
	assert(global_umi_list -> numOfElements == global_idx);
	ArrayListPush(global_umi_list, owned_seq);

	local_to_global_umi[ local_idx ] = global_idx;
}
5358
5359 #define MIN_EXPRESSED_UMIS_PER_CELL 100
5360 #define MIN_EXPRESSED_UMIS_PER_GENE (3-2)
5361
scRNA_merge_write_copy_gene_nos(void * ky,void * va,HashTable * tab)5362 void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
5363 HashTable * used_gene_table = tab -> appendix2;
5364 ArrayList * one_sampl_gene_to_cell_umis = va;
5365
5366 srInt_64 UMIs = HashTableGet(used_gene_table, ky)-NULL;
5367 HashTablePut(used_gene_table, ky, NULL + UMIs + one_sampl_gene_to_cell_umis -> numOfElements);
5368 }
/*
 * Write one "\t0" column per element of high_confid_barcode_index_list into
 * linebuf.
 *
 * Returns the number of characters written (terminating NUL not counted).
 * Nothing is written when the list is empty.  The caller has to provide a
 * buffer that is large enough; global_context is not used.
 */
int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * high_confid_barcode_index_list){
	int written = 0;
	srInt_64 col = 0;

	while(col < high_confid_barcode_index_list->numOfElements){
		// tab, '0' and the terminating NUL
		memcpy(linebuf + written, "\t0", 3);
		written += 2;
		col++;
	}
	return written;
}
5375
5376 //#warning "======== SCRNA_ALLOWED_MAX_HAMMING_DIFF IS ZERO !! ========"
5377 #define SCRNA_ALLOWED_MAX_HAMMING_DIFF 1
5378
scRNA_reduce_cellno_compare(void * arr,int l,int r)5379 int scRNA_reduce_cellno_compare(void * arr, int l, int r){
5380 void **sd = arr;
5381 ArrayList * cellno_umino_p1_list = sd[0];
5382 HashTable * cellno_umino_p1_to_reads_tab = sd[4];
5383 srInt_64 off = sd[1]-NULL;
5384
5385 srInt_64 bc_umi_p1_L = ArrayListGet(cellno_umino_p1_list, off+l) - NULL;
5386 srInt_64 bc_umi_p1_R = ArrayListGet(cellno_umino_p1_list, off+r) - NULL;
5387 int nreads_L = HashTableGet(cellno_umino_p1_to_reads_tab, NULL+bc_umi_p1_L) - NULL;
5388 int nreads_R = HashTableGet(cellno_umino_p1_to_reads_tab, NULL+bc_umi_p1_R) - NULL;
5389
5390 if(nreads_L<1 || nreads_R<1) SUBREADprintf("ERROR: No known read counts: %d, %d\n", nreads_L, nreads_R);
5391 if(nreads_L>nreads_R) return -1;
5392 if(nreads_L<nreads_R) return 1;
5393
5394 srInt_64 umiLno = (bc_umi_p1_L-1) & 0xffffffff;
5395 srInt_64 umiRno = (bc_umi_p1_R-1) & 0xffffffff;
5396 ArrayList * merged_umi_no_to_seq = sd[3];
5397 char * umiLseq = ArrayListGet(merged_umi_no_to_seq, umiLno);
5398 char * umiRseq = ArrayListGet(merged_umi_no_to_seq, umiRno);
5399 return strcmp(umiLseq, umiRseq);
5400 }
5401
scRNA_reduce_cellno_exchange(void * arr,int l,int r)5402 void scRNA_reduce_cellno_exchange(void * arr, int l, int r){
5403 void **sd = arr;
5404 ArrayList * cellno_umino_p1_list = sd[0];
5405 srInt_64 off = sd[1]-NULL;
5406
5407 void* ti = cellno_umino_p1_list->elementList[off+l];
5408 cellno_umino_p1_list->elementList[off+l] = cellno_umino_p1_list->elementList[off+r];
5409 cellno_umino_p1_list->elementList[off+r] = ti;
5410 }
5411
scRNA_reduce_cellno_merge(void * arr,int start,int items,int items2)5412 void scRNA_reduce_cellno_merge(void * arr, int start, int items, int items2){
5413 void **sd = arr;
5414 ArrayList * cellno_umino_p1_list = sd[0];
5415 srInt_64 off = sd[1]-NULL;
5416
5417 void ** tmpelem=malloc(sizeof(void*)*(items+items2));
5418 int i1_cursor = start, i2_cursor = items + start, tmp_cursor=0;
5419 while(1){
5420 if(i1_cursor == items + start && i2_cursor == items + items2 + start )break;
5421 int select_items_1 = (i2_cursor == start + items + items2) || (i1_cursor < items + start && scRNA_reduce_cellno_compare(arr, i1_cursor, i2_cursor) <= 0);
5422
5423 if(select_items_1)
5424 tmpelem[tmp_cursor++] = cellno_umino_p1_list->elementList[off+(i1_cursor++)];
5425 else
5426 tmpelem[tmp_cursor++] = cellno_umino_p1_list->elementList[off+(i2_cursor++)];
5427 }
5428
5429 memcpy(cellno_umino_p1_list -> elementList+off+start, tmpelem, sizeof(void*)*(items+items2));
5430 free(tmpelem);
5431 }
5432
5433 // #define DEBUG_FOR_EXACT
5434 #define MIN_UMIS_FOR_CANDIDATE_RESCUE 500
5435 #define SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION 0.01
scRNA_merged_ambient_rescure(fc_thread_global_context_t * global_context,HashTable * cellP1_to_geneP1_to_umis_tab,HashTable * cellnoP1_to_umis_tab,ArrayList * this_sample_45k_90k_barcode_no_P0,ArrayList * this_sample_ambient_rescure_candi,ArrayList * highconf_cellbc_list)5436 void scRNA_merged_ambient_rescure(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * this_sample_45k_90k_barcode_no_P0, ArrayList * this_sample_ambient_rescure_candi, ArrayList * highconf_cellbc_list){
5437 ArrayList * sorted_bcno_p1 = HashTableSortedIndexes( cellnoP1_to_umis_tab, 1);
5438 HashTable * highconf_cellbc_list_tab = ArrayListToLookupTable_Int(highconf_cellbc_list);
5439 srInt_64 x1, high_conf_cells = 0;
5440 for(x1=0; x1 < sorted_bcno_p1 -> numOfElements; x1++){
5441 void * this_bc_pnt = ArrayListGet(sorted_bcno_p1 , x1);
5442 if(HashTableGet(highconf_cellbc_list_tab, this_bc_pnt)) high_conf_cells = x1+1;
5443 else break; // assuming that all high-umi barcodes are high-confident, this makes x1 being the # of total high-confidence barcodes.
5444 }
5445 #ifdef DEBUG_FOR_EXACT
5446 #warning "============= EXT 1 ==========="
5447 FILE * tfp = fopen("/tmp/del4-YangLiao-rescue-cand.txt","w");
5448 #endif
5449 if(high_conf_cells >0){
5450 srInt_64 median_umis = HashTableGet(cellnoP1_to_umis_tab, ArrayListGet(sorted_bcno_p1 , (high_conf_cells-1)/2))-NULL;
5451 srInt_64 median_umis_001_cut = (srInt_64)(median_umis *1. *SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION +0.50000001);
5452 for(x1=0; x1 < sorted_bcno_p1 -> numOfElements; x1++){
5453 void * this_bc_pnt_p1 = ArrayListGet(sorted_bcno_p1 , x1);
5454 if(HashTableGet(highconf_cellbc_list_tab, this_bc_pnt_p1)){
5455 continue; // it is in high-conf list
5456 }
5457 srInt_64 this_bc_umis = HashTableGet(cellnoP1_to_umis_tab, this_bc_pnt_p1) - NULL;
5458 if(this_bc_umis < median_umis_001_cut) break;
5459 if(this_bc_umis < MIN_UMIS_FOR_CANDIDATE_RESCUE) break;
5460 if(x1 >= 45000) break;
5461 ArrayListPush(this_sample_ambient_rescure_candi, this_bc_pnt_p1-1);
5462 }
5463 #ifdef DEBUG_FOR_EXACT
5464 #warning "============= EXT 2 ==========="
5465 for(x1=0; x1<this_sample_ambient_rescure_candi->numOfElements; x1++){
5466 int this_bc_no_p0 = ArrayListGet(this_sample_ambient_rescure_candi, x1)-NULL;
5467 srInt_64 this_bc_umis = HashTableGet(used_cell_barcode_tab, NULL+this_bc_no_p0+1) - NULL;
5468 fprintf(tfp,"CAND %d %d\n", this_bc_no_p0+1, this_bc_umis);
5469 }
5470 #endif
5471 }
5472 for(x1=45000; x1 < sorted_bcno_p1 -> numOfElements; x1++){
5473 if(x1 >= 90000) break;
5474 ArrayListPush(this_sample_45k_90k_barcode_no_P0, ArrayListGet(sorted_bcno_p1 , x1)-1 );
5475 #ifdef DEBUG_FOR_EXACT
5476 #warning "============= EXT 3 ==========="
5477 int this_bc_no_p1 = ArrayListGet(sorted_bcno_p1, x1)-NULL;
5478 int this_bc_umis = HashTableGet(used_cell_barcode_tab, NULL+this_bc_no_p1) - NULL;
5479 fprintf(tfp,"45K90K %d %d\n", this_bc_no_p1, this_bc_umis);
5480 #endif
5481 }
5482 ArrayListDestroy(sorted_bcno_p1);
5483 HashTableDestroy(highconf_cellbc_list_tab);
5484 #ifdef DEBUG_FOR_EXACT
5485 #warning "============= EXT 4 ==========="
5486 fclose(tfp);
5487
5488 FILE * fp = fopen("/tmp/del4-YangLiao-from-python-rescue.txt","r");
5489
5490 x1=0;
5491 while(1){
5492 char * tpm=NULL;
5493 char fl[100];
5494 char * fr = fgets(fl, 99, fp);
5495 if(!fr) break;
5496 if(fl[0]!='4') continue;
5497 int bc_no = atoi(fl+7) -1;
5498 this_sample_45k_90k_barcode_no_P0 -> elementList[x1++] = NULL+bc_no;
5499 if(x1 >= 45000)break;
5500 }
5501 fclose(fp);
5502 fp = fopen("/tmp/del4-YangLiao-from-python-rescue.txt","r");
5503
5504 x1=0;
5505 this_sample_ambient_rescure_candi -> numOfElements = 0;
5506 while(1){
5507 char * tpm=NULL;
5508 char fl[100];
5509 char * fr = fgets(fl, 99, fp);
5510 if(!fr) break;
5511 if(fl[0]!='C') continue;
5512 int bc_no = atoi(fl+5) -1;
5513 ArrayListPush(this_sample_ambient_rescure_candi, NULL+bc_no);
5514 }
5515 fclose(fp);
5516 #endif
5517 }
5518
5519
5520 #define SCRNA_BOOTSTRAP_HIGH_INDEX 30
5521 #define SCRNA_BOOTSTRAP_SAMPLING_TIMES 100
5522
5523
/*
 * Choose the high-confidence cell barcodes of one sample.
 *
 * cellnoP1_to_umis_tab maps (cell barcode number + 1) to the UMI total of that
 * cell. Its keys are fetched through HashTableSortedIndexes(); the loops below
 * rely on that list being ordered by UMI count, largest first (presumably what
 * the second argument "1" requests -- confirm against HashTableSortedIndexes).
 *
 * Two modes:
 *   - global_context->scRNA_umi_cutoff >= 0: every barcode whose UMI total
 *     reaches the cutoff (with a 0.1 tolerance) is accepted.
 *   - otherwise: SCRNA_BOOTSTRAP_SAMPLING_TIMES resampling rounds are run. Each
 *     round draws as many barcodes as the sample has, using a deterministic
 *     stepping index instead of a random generator, and counts how many drawn
 *     UMI totals reach one tenth (rounded) of the total found
 *     SCRNA_BOOTSTRAP_HIGH_INDEX places from the top. The rounded mean of
 *     those counts is the number of top barcodes accepted.
 *
 * Accepted barcodes are appended to highconf_cellbc_list as 0-based barcode
 * numbers stored in the pointer value (key - 1).
 *
 * cellP1_to_geneP1_to_umis_tab is not used here.
 *
 * Returns the UMI total of the last accepted barcode, truncated to int. In
 * cutoff mode the result is -1 when nothing was accepted.
 */
int scRNA_merged_bootstrap_a_sample(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * highconf_cellbc_list){
	ArrayList * sorted_idx = HashTableSortedIndexes( cellnoP1_to_umis_tab, 1);
	srInt_64 x2, x1;
	float scRNA_umi_cutoff = global_context -> scRNA_umi_cutoff;

	/* NOTE: the macro body carries its own trailing semicolon. */
	#define SCRNA_IDX_PRIME_NUMBER_BIG 11218439llu;
	srInt_64 cells_reaching_threshold = 0;
	srInt_64 draw_pos = sorted_idx -> numOfElements/2;

	#ifdef DEBUG_FOR_EXACT
	#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
	ArrayListSort(sorted_idx, NULL);
	FILE * dfp = fopen("/tmp/del4-YangLiao-for-resample.txt","w");
	for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
		int bc_no_p1 = ArrayListGet(sorted_idx, x2)-NULL;
		int bc_umis = HashTableGet(used_cell_barcode_tab, NULL+bc_no_p1) - NULL;
		fprintf(dfp,"%d\t%d\t%s\n", bc_no_p1, bc_umis, ArrayListGet(global_context -> scRNA_cell_barcodes_array, bc_no_p1-1));
	}
	fclose(dfp);
	system("python /usr/local/work/liao/subread/scripts/Cellranger-replicate/CrepPY-resample.py");
	FILE * rfp = fopen("/tmp/del4-YangLiao-from-resample.txt","r");
	#endif

	int last_umi_no = -1;
	if(scRNA_umi_cutoff >= 0.0){
		/* Fixed cutoff: walk the UMI-ordered barcodes until one falls short. */
		for(x1 = 0; x1 < sorted_idx -> numOfElements ; x1++){
			void * cellbc_p1_ptr = ArrayListGet(sorted_idx, x1);
			srInt_64 cell_umis = HashTableGet(cellnoP1_to_umis_tab, cellbc_p1_ptr) - NULL;

			/* #UMI-sorted, so nothing after the first failure can pass. */
			if(!(cell_umis >= scRNA_umi_cutoff-0.1)) break;

			ArrayListPush(highconf_cellbc_list, ArrayListGet( sorted_idx, x1 ) - 1 );
			last_umi_no = cell_umis;
		}
	}else{
		/* Bootstrap: accumulate, over all rounds, the number of drawn cells
		 * that reach the per-round threshold. */
		for(x1 = 0; x1 < SCRNA_BOOTSTRAP_SAMPLING_TIMES; x1++){
			ArrayList * resampled_list_of_umis = ArrayListCreate( sorted_idx->numOfElements );

			#ifdef DEBUG_FOR_EXACT
			#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
			for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
				char fl [100];
				fgets(fl, 99, rfp);
				int bc_no_p1 = atoi(fl);
				int this_umis = HashTableGet(cellnoP1_to_umis_tab, NULL+bc_no_p1);
				ArrayListPush(resampled_list_of_umis, NULL+this_umis);
			}
			ArrayListSort(resampled_list_of_umis, NULL);
			#else
			for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
				/* Deterministic "random" draw: step through the index list
				 * by a large prime, wrapping around its length. */
				draw_pos %= sorted_idx -> numOfElements;
				void * drawn_key = ArrayListGet(sorted_idx, draw_pos);
				draw_pos += SCRNA_IDX_PRIME_NUMBER_BIG;
				srInt_64 drawn_umis = HashTableGet( cellnoP1_to_umis_tab, drawn_key ) - NULL;
				ArrayListPush(resampled_list_of_umis, NULL+drawn_umis);
			}
			#endif
			ArrayListSort( resampled_list_of_umis, NULL );

			/* Threshold = round( UMIs at the HIGH_INDEX-th place from the end / 10 ). */
			srInt_64 threshold = ArrayListGet(resampled_list_of_umis, resampled_list_of_umis -> numOfElements - SCRNA_BOOTSTRAP_HIGH_INDEX) - NULL;
			threshold = (srInt_64)(threshold*1./10 + 0.500000001);

			/* Scan from the end of the sorted list towards the front and stop
			 * at the first value below the threshold. */
			for(x2 = resampled_list_of_umis -> numOfElements - 1; x2 >= 0; x2--){
				srInt_64 drawn_umis = ArrayListGet(resampled_list_of_umis, x2) - NULL;
				if(drawn_umis < threshold) break;
				cells_reaching_threshold ++;
			}
			ArrayListDestroy(resampled_list_of_umis);
		}

		double mean_cells = cells_reaching_threshold*1. / SCRNA_BOOTSTRAP_SAMPLING_TIMES;
		if(0) SUBREADprintf("FINAL_5CODE SELECTION_IDX = %.5f\n",mean_cells);
		srInt_64 cells_to_accept = (int)(mean_cells + 0.500000001);

		#ifdef DEBUG_FOR_EXACT
		#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
		sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
		#endif

		/* Accept the leading barcodes of the UMI-ordered list. */
		void * last_ptr = NULL;
		srInt_64 accept_limit = min(sorted_idx -> numOfElements, cells_to_accept);
		for(x1 = 0; x1 < accept_limit; x1++){
			last_ptr = ArrayListGet( sorted_idx, x1 );
			ArrayListPush(highconf_cellbc_list, last_ptr - 1 );
		}
		/* NOTE(review): last_ptr is still NULL here when no barcode was accepted. */
		last_umi_no = HashTableGet(cellnoP1_to_umis_tab, last_ptr) - NULL;
	}
	ArrayListDestroy(sorted_idx);
	return last_umi_no;
}
5611
/*
 * Format the exon-level identifier of one feature into exon_name.
 *
 * sorted_order is translated to an index into loaded_features through
 * sorted_order_p1_to_i_p1_tab, which maps (sorted order + 1) to (index + 1).
 * The text written is
 *     <feature name>:fc@R@Spl:<chromosome>:fc@R@Spl:<start>:fc@R@Spl:<end>:fc@R@Spl:<strand>
 * where <strand> is 'N' for is_negative_strand == 1, 'P' for 0 and 'X' otherwise.
 *
 * exon_name must be large enough for the whole string; no bound is enforced
 * here (callers in this file pass FEATURE_NAME_LENGTH+60 bytes).
 */
void build_exon_name(fc_thread_global_context_t * global_context, fc_feature_info_t * loaded_features, int sorted_order, char * exon_name, HashTable * sorted_order_p1_to_i_p1_tab){
	srInt_64 feature_idx = HashTableGet( sorted_order_p1_to_i_p1_tab, NULL+1+sorted_order ) - NULL - 1;
	char strand_code;

	if(loaded_features[feature_idx].is_negative_strand == 1) strand_code = 'N';
	else if(loaded_features[feature_idx].is_negative_strand == 0) strand_code = 'P';
	else strand_code = 'X';

	/* The chromosome name is stored at an offset from the feature name
	 * inside the shared string buffer. */
	sprintf(exon_name, "%s:fc@R@Spl:%s:fc@R@Spl:%u:fc@R@Spl:%u:fc@R@Spl:%c",
		global_context -> unistr_buffer_space + loaded_features[feature_idx].feature_name_pos,
		global_context -> unistr_buffer_space + loaded_features[feature_idx].feature_name_pos + loaded_features[feature_idx].chro_name_pos_delta,
		loaded_features[feature_idx].start,
		loaded_features[feature_idx].end,
		strand_code);
}
5618
/*
 * HashTableIteration callback over a (cell number + 1) => gene table mapping.
 *
 *   ky  : cell barcode number + 1
 *   va  : HashTable mapping (gene number + 1) to the UMI count in this cell
 *   tab : the iterated table; it carries
 *           appendix1 - table collecting every gene key seen (value 1)
 *           appendix2 - optional filter table; when not NULL, cells whose
 *                       key is absent from it are skipped
 *           counter1  - running sum of all UMI counts visited
 */
void scRNA_merged_write_sparse_unique_genes(void * ky, void * va, HashTable * tab){
	HashTable * seen_genes_tab = tab -> appendix1;
	HashTable * wanted_cells_tab = tab -> appendix2;
	int cellbcP1 = ky - NULL;

	if(wanted_cells_tab != NULL && HashTableGet(wanted_cells_tab, NULL+cellbcP1) == NULL) return;

	HashTable * gene_to_umis = va;
	ArrayList * gene_keys = HashTableKeys(gene_to_umis);
	int gi = 0;
	while(gi < gene_keys -> numOfElements){
		void * gene_key = ArrayListGet(gene_keys, gi);

		/* Register the gene once, then add its UMIs to the grand total. */
		if(HashTableGet(seen_genes_tab, gene_key) == NULL)
			HashTablePut(seen_genes_tab, gene_key, NULL+1);
		tab -> counter1 += HashTableGet(gene_to_umis, gene_key) - NULL;
		gi++;
	}
	ArrayListDestroy(gene_keys);
}
5635
/*
 * Write one sparse count matrix of a sample as three files:
 *     <input>.scRNA.<NNN>.<tabtype>.BCtab    one cell barcode sequence per line (matrix columns)
 *     <input>.scRNA.<NNN>.<tabtype>.GENEtab  one gene / exon name per line (matrix rows)
 *     <input>.scRNA.<NNN>.<tabtype>.spmtx    MatrixMarket coordinate file, "row col UMIs"
 * where <NNN> is sample_index+1.
 *
 *   cellP1_to_geneP1_to_umis_tab : (cell number + 1) => table of (gene number + 1) => UMIs;
 *                                  its appendix1, appendix2 and counter1 are overwritten here
 *   cellnoP1_to_umis_tab         : not used by this function
 *   used_cell_barcodes           : 0-based cell numbers (stored in pointers) to be written
 *   loaded_features,
 *   sorted_order_p1_to_i_p1_tab  : only used to build exon names when the
 *                                  counting is not at gene level
 *
 * Returns 0 on success and -1 when any of the output files cannot be created
 * (nothing is written in that case).
 */
int scRNA_merged_write_sparse_matrix(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * used_cell_barcodes, int sample_index, char * tabtype, fc_feature_info_t* loaded_features, HashTable * sorted_order_p1_to_i_p1_tab){
	int x1,x2;

	char ofname[MAX_FILE_NAME_LENGTH + 100];
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.BCtab",global_context->input_file_name, sample_index+1,tabtype);
	FILE * ofp_bcs = fopen( ofname , "w" );
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.GENEtab",global_context->input_file_name, sample_index+1,tabtype);
	FILE * ofp_genes = fopen( ofname , "w" );
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.spmtx",global_context->input_file_name, sample_index+1,tabtype);
	FILE * ofp_mtx = fopen( ofname , "w" );

	/* Bail out before touching any stream if one of them could not be opened. */
	if(ofp_bcs == NULL || ofp_genes == NULL || ofp_mtx == NULL){
		SUBREADprintf("ERROR: unable to create the output files '%s.scRNA.%03d.%s.*'\n", global_context->input_file_name, sample_index+1, tabtype);
		if(ofp_bcs) fclose(ofp_bcs);
		if(ofp_genes) fclose(ofp_genes);
		if(ofp_mtx) fclose(ofp_mtx);
		return -1;
	}
	fprintf(ofp_mtx,"%%%%MatrixMarket matrix coordinate integer general\n");

	/* First pass: collect the genes that occur in the selected cells and
	 * sum their UMIs (done by the iteration callback through the appendix
	 * slots of the iterated table). */
	HashTable * used_cellnoP1_tab = ArrayListToLookupTable_Int(used_cell_barcodes);
	HashTable * unique_NZ_geneno1B_table = HashTableCreate(10000);
	cellP1_to_geneP1_to_umis_tab -> counter1 = 0;
	cellP1_to_geneP1_to_umis_tab -> appendix1 = unique_NZ_geneno1B_table;
	cellP1_to_geneP1_to_umis_tab -> appendix2 = used_cellnoP1_tab;
	HashTableIteration(cellP1_to_geneP1_to_umis_tab, scRNA_merged_write_sparse_unique_genes);
	srInt_64 total_UMIs = cellP1_to_geneP1_to_umis_tab -> counter1;
	ArrayList * unique_NZ_genenosP1_list = HashTableKeys(unique_NZ_geneno1B_table);
	HashTableDestroy(unique_NZ_geneno1B_table);
	HashTableDestroy(used_cellnoP1_tab);
	ArrayListSort(unique_NZ_genenosP1_list, NULL);

	/* Size line: rows (genes), columns (barcodes), then the UMI total.
	 * NOTE(review): the third field is the sum of UMIs, not the number of
	 * stored entries that the MatrixMarket format defines for this
	 * position -- confirm that downstream readers tolerate this. */
	#ifdef __MINGW32__
	fprintf(ofp_mtx, "%I64d %I64d %I64d\n", unique_NZ_genenosP1_list -> numOfElements , used_cell_barcodes -> numOfElements, total_UMIs );
	#else
	fprintf(ofp_mtx, "%lld %lld %lld\n", unique_NZ_genenosP1_list -> numOfElements , used_cell_barcodes -> numOfElements, total_UMIs );
	#endif

	/* Row names. */
	for(x2=0; x2 < unique_NZ_genenosP1_list -> numOfElements; x2++){
		int gene_index_0B = ArrayListGet(unique_NZ_genenosP1_list, x2) - NULL-1;
		if(global_context->is_gene_level){
			char* gene_name = (char*)global_context -> gene_name_array [gene_index_0B];
			fprintf(ofp_genes,"%s\n", gene_name);
		}else{
			char exon_name[FEATURE_NAME_LENGTH+60];
			build_exon_name(global_context, loaded_features, gene_index_0B, exon_name, sorted_order_p1_to_i_p1_tab);
			fprintf(ofp_genes,"%s\n", exon_name);
		}
	}

	/* Column names. */
	for(x1 = 0; x1 < used_cell_barcodes -> numOfElements; x1++){
		srInt_64 cellno = ArrayListGet(used_cell_barcodes, x1)-NULL;
		char * cellbc_seq = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cellno);
		fprintf(ofp_bcs,"%s\n", cellbc_seq);
	}

	/* Matrix entries: 1-based "row column value", non-zero values only. */
	for(x1 = 0; x1 < used_cell_barcodes -> numOfElements; x1++){
		srInt_64 cellno = ArrayListGet(used_cell_barcodes, x1)-NULL;
		HashTable * geneno1B_to_UMIs = HashTableGet(cellP1_to_geneP1_to_umis_tab, NULL+1+cellno);

		/* A barcode without any gene table contributes no entries. */
		if(geneno1B_to_UMIs == NULL) continue;

		for(x2=0; x2 < unique_NZ_genenosP1_list -> numOfElements; x2++){
			int geneno1B = ArrayListGet(unique_NZ_genenosP1_list, x2)-NULL;
			int this_umis = HashTableGet(geneno1B_to_UMIs, NULL+geneno1B) -NULL;
			if(this_umis>0)fprintf(ofp_mtx,"%d %d %d\n", x2+1, x1+1, this_umis);
		}
	}
	ArrayListDestroy(unique_NZ_genenosP1_list);
	fclose(ofp_bcs);
	fclose(ofp_genes);
	fclose(ofp_mtx);

	return 0;
}
5702
/*
 * HashTableIteration callback over one cell's (gene number + 1) => UMIs table:
 * adds this cell's UMI count of the gene to the running per-gene total kept
 * in the table found in m2->appendix1.
 */
void scRNA_merged_45K_to_90K_sum_SUM_Level2(void * GeneNo1B, void * vUMIs, HashTable * m2){
	HashTable * gene_totals = m2 -> appendix1;
	/* A gene not yet in the table yields NULL, i.e. a previous total of zero. */
	void * previous_total = HashTableGet(gene_totals, GeneNo1B);

	HashTablePut(gene_totals, GeneNo1B, vUMIs + (previous_total - NULL));
}
5707
/*
 * HashTableIteration callback over the (cell number + 1) => gene table mapping.
 * For cells whose key is present in the lookup table in me->appendix2, the
 * cell's per-gene UMIs are added into the summing table in me->appendix1.
 * Other cells are ignored.
 */
void scRNA_merged_45K_to_90K_sum_SUM(void * keyCellNoP1, void * Vgno_umi_tab, HashTable * me){
	HashTable * gene_totals = me -> appendix1;
	HashTable * selected_cells = me -> appendix2;
	HashTable * cell_gene_umis = Vgno_umi_tab;

	if(HashTableGet(selected_cells, keyCellNoP1) == NULL) return;

	/* Hand the summing table to the inner callback through the cell's own table. */
	cell_gene_umis -> appendix1 = gene_totals;
	HashTableIteration(cell_gene_umis, scRNA_merged_45K_to_90K_sum_SUM_Level2);
}
5717
/*
 * HashTableIteration callback that writes one "<name>\t<UMIs>" line.
 *
 *   kyGeneID : gene number + 1
 *   valUMIs  : summed UMI count, stored in the pointer value
 *   me       : iterated table; appendix1 = global context, appendix2 = output
 *              FILE, appendix3 = two-element array { loaded_features,
 *              sorted-order lookup table }
 *
 * The name is the gene name at gene level, otherwise the exon identifier
 * produced by build_exon_name().
 */
void scRNA_merged_45K_to_90K_sum_WRT(void * kyGeneID, void * valUMIs, HashTable * me){
	fc_thread_global_context_t * global_context = me -> appendix1;
	FILE * ofp = me -> appendix2;
	void ** extras = me->appendix3;
	fc_feature_info_t * loaded_features = extras[0];
	HashTable * sorted_order_p1_to_i_p1_tab = extras[1];

	if(!global_context -> is_gene_level){
		char exon_name[FEATURE_NAME_LENGTH+60];
		build_exon_name(global_context, loaded_features, kyGeneID-NULL-1, exon_name, sorted_order_p1_to_i_p1_tab);
		fprintf(ofp,"%s\t%u\n", exon_name, (unsigned int) (valUMIs-NULL));
		return;
	}

	unsigned char * gene_name = global_context -> gene_name_array[ kyGeneID - NULL-1 ];
	fprintf(ofp, "%s\t%u\n", gene_name, (unsigned int) (valUMIs-NULL));
}
5734
/*
 * Sum, gene by gene, the UMIs of the cells listed in bcid_P0_arr and write
 * the totals to "<input>.scRNA.<NNN>.AmbSum" (NNN = sample_no+1) as a
 * two-column table with the header "GeneID\tUMIs".
 *
 *   cellP1_geneP1_UMIs_tab      : (cell number + 1) => table of (gene number + 1) => UMIs;
 *                                 its appendix1..3 are overwritten here
 *   bcid_P0_arr                 : 0-based cell numbers to include
 *   loaded_features,
 *   sorted_index_p1_to_i_p1_tab : forwarded to the writer callback for
 *                                 building exon names
 *
 * If the output file cannot be created an error is reported and nothing is
 * written; the temporary tables are released in every case.
 */
void scRNA_merged_45K_to_90K_sum(fc_thread_global_context_t * global_context, HashTable * cellP1_geneP1_UMIs_tab, ArrayList * bcid_P0_arr, int sample_no, fc_feature_info_t * loaded_features, HashTable * sorted_index_p1_to_i_p1_tab){
	/* Pass 1: accumulate per-gene totals over the selected cells. */
	HashTable * summed_gene_to_umis = HashTableCreate( 3+cellP1_geneP1_UMIs_tab->numOfElements/6 );
	HashTable * bcid_look_tab = ArrayListToLookupTable_Int(bcid_P0_arr);
	cellP1_geneP1_UMIs_tab -> appendix1 = summed_gene_to_umis;
	cellP1_geneP1_UMIs_tab -> appendix2 = bcid_look_tab;
	cellP1_geneP1_UMIs_tab -> appendix3 = global_context;
	HashTableIteration( cellP1_geneP1_UMIs_tab, scRNA_merged_45K_to_90K_sum_SUM );

	/* Pass 2: dump the totals. */
	char ofname[MAX_FILE_NAME_LENGTH + 100];
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.AmbSum",global_context->input_file_name, sample_no+1);
	FILE * write_fp = fopen(ofname,"w");
	if(write_fp != NULL){
		/* The writer callback receives its context through the appendix
		 * slots of the table being iterated. */
		void * vp2[2];
		vp2[0]=loaded_features;
		vp2[1]=sorted_index_p1_to_i_p1_tab;

		fprintf(write_fp,"GeneID\tUMIs\n");
		summed_gene_to_umis -> appendix1 = global_context;
		summed_gene_to_umis -> appendix2 = write_fp;
		summed_gene_to_umis -> appendix3 = vp2;
		summed_gene_to_umis -> counter1 = sample_no;
		HashTableIteration( summed_gene_to_umis, scRNA_merged_45K_to_90K_sum_WRT );
		fclose(write_fp);
	}else{
		SUBREADprintf("ERROR: unable to create file '%s'\n", ofname);
	}

	HashTableDestroy(bcid_look_tab);
	HashTableDestroy(summed_gene_to_umis);
}
5759
/*
 * HashTableIteration callback that writes the name of one gene per line.
 *
 *   k  : gene number + 1
 *   v  : not used
 *   me : iterated table; appendix1 = output FILE, appendix2 = global context,
 *        appendix3 = two-element array { loaded_features, sorted-order
 *        lookup table }
 *
 * At gene level the gene name is written, otherwise the exon identifier
 * produced by build_exon_name().
 */
void scRNA_merged_write_nozero_geneids_WRT(void *k, void *v, HashTable* me){
	FILE * out_fp = me->appendix1;
	fc_thread_global_context_t * global_context = me->appendix2;
	void ** extras = me->appendix3;
	fc_feature_info_t * loaded_features = extras[0];
	HashTable * sorted_order_p1_to_i_p1_tab = extras[1];

	if(!global_context -> is_gene_level){
		char exon_name[FEATURE_NAME_LENGTH+60];
		build_exon_name(global_context, loaded_features, k-NULL-1, exon_name, sorted_order_p1_to_i_p1_tab);
		fprintf(out_fp,"%s\n", exon_name);
		return;
	}

	unsigned char* gene_symbol = global_context -> gene_name_array [k-NULL-1];
	fprintf(out_fp, "%s\n", gene_symbol);
}
5775
/*
 * Write the names of all genes held as keys of no0genes to
 * "<input>.scRNA.<NNN>.no0Genes" (NNN = samplenno+1), one name per line.
 *
 *   no0genes                    : table keyed by (gene number + 1); its
 *                                 appendix1..3 are overwritten here
 *   loaded_features,
 *   sorted_order_p1_to_i_p1_tab : forwarded to the writer callback for
 *                                 building exon names
 *
 * If the file cannot be created an error is reported and nothing is written.
 */
void scRNA_merged_write_nozero_geneids(fc_thread_global_context_t * global_context, HashTable * no0genes, int samplenno, fc_feature_info_t * loaded_features, HashTable * sorted_order_p1_to_i_p1_tab){
	char ofname[MAX_FILE_NAME_LENGTH + 100];
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.no0Genes",global_context->input_file_name, samplenno+1);
	FILE * fp = fopen( ofname , "w" );
	if(fp == NULL){
		SUBREADprintf("ERROR: unable to create file '%s'\n", ofname);
		return;
	}

	/* The writer callback receives its context through the appendix slots. */
	void * tv2[2];
	tv2[0]=loaded_features;
	tv2[1]=sorted_order_p1_to_i_p1_tab;
	no0genes -> appendix1 =fp;
	no0genes -> appendix2 =global_context;
	no0genes -> appendix3 =tv2;
	HashTableIteration(no0genes, scRNA_merged_write_nozero_geneids_WRT);
	fclose(fp);
}
5789
/*
 * HashTableIteration callback: adds the UMI count stored in the value pointer
 * to the counter1 accumulator of the iterated table. The key is not used.
 */
void scRNA_merged_to_tables_write_build_UMIcount_in(void * ky, void * val, HashTable * tab){
	(void) ky;
	tab -> counter1 += (val - NULL);
}
5793
/*
 * HashTableIteration callback over the (cell number + 1) => gene table mapping.
 * Sums all UMI counts of the cell and stores the total in the table found in
 * tab->appendix1, under the same (cell number + 1) key.
 *
 *   ky  : cell number + 1
 *   val : HashTable mapping (gene number + 1) to UMIs; its counter1 is used
 *         as scratch accumulator
 */
void scRNA_merged_to_tables_write_build_UMIcounts(void * ky, void * val, HashTable * tab){
	HashTable * gene_counts = val;
	HashTable * cell_totals = tab -> appendix1;
	int cell_no = ky - NULL - 1;

	gene_counts -> counter1 = 0;
	HashTableIteration(gene_counts, scRNA_merged_to_tables_write_build_UMIcount_in);

	HashTablePut(cell_totals, NULL+1+cell_no, NULL+gene_counts -> counter1);
}
5803
5804 // this function writes a single count table.
5805 // Rows: genes
5806 // Cols: Cell_Barcode +"."+ SampleName
scRNA_merged_to_tables_write(fc_thread_global_context_t * global_context,HashTable ** cellP1_to_geneP1_to_umis,fc_feature_info_t * loaded_features,srInt_64 nexons)5807 void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** cellP1_to_geneP1_to_umis, fc_feature_info_t * loaded_features, srInt_64 nexons){
5808 char ofname[MAX_FILE_NAME_LENGTH + 20];
5809 sprintf(ofname,"%s.scRNA.SampleTable",global_context->input_file_name);
5810 FILE * sample_tab_fp = fopen( ofname , "w" );
5811 int x1;
5812
5813 fprintf(sample_tab_fp,"SampleName\tUMICutoff\tTotalReads\tMappedReads\tAssignedReads\tIndex\n");
5814 for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
5815 srInt_64 mapped_reads = 0, all_reads = 0, assigned_reads = 0;
5816 int thrid;
5817 for(thrid=0; thrid<global_context-> thread_number; thrid++){
5818 mapped_reads += global_context -> thread_contexts[thrid].scRNA_mapped_reads_per_sample[x1];
5819 assigned_reads += global_context -> thread_contexts[thrid].scRNA_assigned_reads_per_sample[x1];
5820 all_reads += global_context -> thread_contexts[thrid].scRNA_reads_per_sample[x1];
5821 }
5822 ArrayList * high_confid_barcode_index_list = ArrayListCreate(20000);
5823 ArrayList * this_sample_ambient_rescure_candi = ArrayListCreate(10000);
5824 ArrayList * this_sample_45k_90k_barcode_no_P0 = ArrayListCreate(90000 - 45000 + 100);
5825
5826 HashTable * cellbcP1_to_umis_tab = HashTableCreate(cellP1_to_geneP1_to_umis[x1] -> numOfElements);
5827 cellP1_to_geneP1_to_umis[x1] -> appendix1 = cellbcP1_to_umis_tab;
5828 HashTableIteration(cellP1_to_geneP1_to_umis[x1], scRNA_merged_to_tables_write_build_UMIcounts);
5829
5830 int applied_umi_cut = scRNA_merged_bootstrap_a_sample(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, high_confid_barcode_index_list);
5831 global_context -> scRNA_applied_umi_cut[x1] = applied_umi_cut;
5832 scRNA_merged_ambient_rescure(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, this_sample_45k_90k_barcode_no_P0, this_sample_ambient_rescure_candi, high_confid_barcode_index_list);
5833
5834 int umi_cutoff = global_context -> scRNA_applied_umi_cut[x1];
5835 char * this_sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
5836 #ifdef __MINGW32__
5837 fprintf(sample_tab_fp,"%s\t%d\t%I64d\t%I64d\t%I64d\t%d\n", this_sample_name, umi_cutoff, all_reads, mapped_reads, assigned_reads,x1+1);
5838 #else
5839 fprintf(sample_tab_fp,"%s\t%d\t%lld\t%lld\t%lld\t%d\n", this_sample_name, umi_cutoff, all_reads, mapped_reads, assigned_reads, x1+1);
5840 #endif
5841 srInt_64 xk1;
5842 HashTable * sorted_order_p1_to_i_p1_tab = HashTableCreate(nexons/4);
5843 for(xk1 = 0; xk1 < nexons ; xk1++){
5844 HashTablePut(sorted_order_p1_to_i_p1_tab, NULL+loaded_features[xk1].sorted_order+1 , NULL+xk1+1 );
5845 }
5846
5847 #ifdef DEBUG_FOR_EXACT
5848 #warning " ======= Another debug ======"
5849 scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], NULL, x1, "RawMatrix", loaded_features, sorted_order_p1_to_i_p1_tab);
5850 //scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1] this_sample_45k_90k_barcode_no_P0, x1, "AmbProfCells", loaded_features, sorted_order_p1_to_i_p1_tab);
5851 #endif
5852
5853 scRNA_merged_write_sparse_matrix(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, high_confid_barcode_index_list, x1, "HighConf", loaded_features, sorted_order_p1_to_i_p1_tab);
5854 scRNA_merged_write_sparse_matrix(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, this_sample_ambient_rescure_candi, x1, "RescCand", loaded_features, sorted_order_p1_to_i_p1_tab);
5855 scRNA_merged_45K_to_90K_sum( global_context, cellP1_to_geneP1_to_umis[x1], this_sample_45k_90k_barcode_no_P0, x1 , loaded_features, sorted_order_p1_to_i_p1_tab);
5856 HashTable * no0genes = HashTableCreate(10000);
5857 cellP1_to_geneP1_to_umis[x1] -> appendix1 = no0genes;
5858 cellP1_to_geneP1_to_umis[x1] -> appendix2 = NULL;
5859 HashTableIteration(cellP1_to_geneP1_to_umis[x1], scRNA_merged_write_sparse_unique_genes);
5860 scRNA_merged_write_nozero_geneids(global_context, no0genes, x1, loaded_features, sorted_order_p1_to_i_p1_tab);
5861
5862 HashTableDestroy(no0genes);
5863 ArrayListDestroy(this_sample_ambient_rescure_candi);
5864 ArrayListDestroy(this_sample_45k_90k_barcode_no_P0);
5865 ArrayListDestroy(high_confid_barcode_index_list);
5866 HashTableDestroy(cellbcP1_to_umis_tab);
5867 HashTableDestroy(sorted_order_p1_to_i_p1_tab);
5868 }
5869
5870 fclose(sample_tab_fp);
5871 }
5872
/*
 * HashTableIteration callback that inverts a gene => [barcode+UMI keys]
 * mapping: for every barcode+UMI key listed under this gene, the 0-based gene
 * number is appended to that key's gene list in the table found in
 * tab->appendix1. Missing lists are created on demand.
 *
 *   ky_genep1        : gene number + 1
 *   val_arr_bc_umip1 : ArrayList of barcode+UMI keys
 */
void scRNA_find_gene_to_umi_do_merger(void * ky_genep1, void * val_arr_bc_umip1, HashTable * tab){
	HashTable * key_to_genes_tab = tab -> appendix1;
	ArrayList * bc_umi_keys = val_arr_bc_umip1;
	int gene_no = ky_genep1 - NULL - 1;
	int pos = 0;

	while(pos < bc_umi_keys -> numOfElements){
		void * bc_umi_key = ArrayListGet(bc_umi_keys, pos);
		ArrayList * genes_of_key = HashTableGet(key_to_genes_tab, bc_umi_key);
		if(genes_of_key == NULL){
			genes_of_key = ArrayListCreate(1);
			HashTablePut(key_to_genes_tab, bc_umi_key, genes_of_key);
		}
		ArrayListPush(genes_of_key, NULL+gene_no);
		pos++;
	}
}
5888
/*
 * ArrayListSort comparator for lists of 0-based gene numbers.
 *
 * me->appendix1 points to a two-element array { barcode+UMI key, table of
 * (gene number + 1) => (barcode+UMI key => supporting read count) }. A gene
 * with more supporting reads for that key compares as smaller.
 *
 * Returns -1, 1 or 0.
 */
int scRNA_find_gene_to_umi_sortCompare(void * L_elem, void * R_elem, ArrayList * me){
	void ** sort_ctx = me -> appendix1;
	void * key_bc_umi_p1 = sort_ctx[0];
	HashTable * gene_to_key_reads = sort_ctx[1];
	int left_gene = L_elem - NULL;
	int right_gene = R_elem - NULL;

	int left_reads = HashTableGet(HashTableGet(gene_to_key_reads, NULL+1+ left_gene), key_bc_umi_p1) - NULL;
	int right_reads = HashTableGet(HashTableGet(gene_to_key_reads, NULL+1+ right_gene), key_bc_umi_p1) - NULL;

	/* Larger read count first. */
	return (left_reads < right_reads) - (left_reads > right_reads);
}
5901
/*
 * HashTableIteration callback: sorts the gene list of one barcode+UMI key
 * with scRNA_find_gene_to_umi_sortCompare, using the read-count table found
 * in tab->appendix1. Lists with fewer than two genes are left untouched.
 */
void scRNA_find_gene_to_umi_sortByReads(void * key_bc_umi_p1, void * val_arr_genes, HashTable * tab){
	ArrayList * gene_list = val_arr_genes;

	if(gene_list -> numOfElements >= 2){
		/* Context for the comparator; it is only read while the sort runs. */
		void * sort_ctx[2];
		sort_ctx[0] = key_bc_umi_p1;
		sort_ctx[1] = tab -> appendix1;
		gene_list -> appendix1 = sort_ctx;
		ArrayListSort(gene_list, scRNA_find_gene_to_umi_sortCompare);
	}
}
5913
/*
 * HashTableIteration callback: for a barcode+UMI key assigned to two or more
 * genes, records which genes lose the key.
 *
 *   key_bc_umi_p1        : barcode+UMI key
 *   val_arr_genes_sorted : ArrayList of 0-based gene numbers, already ordered
 *                          by scRNA_find_gene_to_umi_sortByReads
 *   tab                  : appendix1 = key => ArrayList of deleted genes
 *                          (created here when missing), appendix2 = table of
 *                          (gene number + 1) => (key => read count)
 *
 * Every gene after the first one is recorded. The first gene is recorded too
 * when it has exactly as many reads as the second one.
 */
void scRNA_find_gene_to_umi_mark_deletee(void * key_bc_umi_p1, void * val_arr_genes_sorted, HashTable * tab){
	ArrayList * ranked_genes = val_arr_genes_sorted;
	if(ranked_genes -> numOfElements < 2) return;

	HashTable * key_to_deleted_genes_tab = tab->appendix1;
	HashTable * gene_to_key_reads_tab = tab->appendix2;
	int first_gene = ArrayListGet(ranked_genes, 0) - NULL;
	int second_gene = ArrayListGet(ranked_genes, 1) - NULL;

	if(0){
		/* Disabled diagnostic output. */
		int dbg;
		for(dbg = 0; dbg < ranked_genes -> numOfElements; dbg++){
			int gene_no = ArrayListGet(ranked_genes, dbg)-NULL;
			int nsupp = HashTableGet(HashTableGet(gene_to_key_reads_tab , NULL+gene_no+1), key_bc_umi_p1) - NULL;
			int cellno = ( key_bc_umi_p1-NULL-1 ) >> 32;
			SUBREADprintf("TESTING_SORT : %d of %d have %d reads\n", gene_no, cellno, nsupp);
		}
	}

	int first_reads = HashTableGet(HashTableGet(gene_to_key_reads_tab, NULL+first_gene+1), key_bc_umi_p1) - NULL;
	int second_reads = HashTableGet(HashTableGet(gene_to_key_reads_tab, NULL+second_gene+1), key_bc_umi_p1) - NULL;

	ArrayList * deleted_genes = HashTableGet(key_to_deleted_genes_tab, key_bc_umi_p1);
	if(deleted_genes == NULL){
		deleted_genes = ArrayListCreate(3);
		HashTablePut(key_to_deleted_genes_tab, key_bc_umi_p1, deleted_genes);
	}

	/* A tie at the top means the first gene is dropped as well. */
	if(first_reads == second_reads) ArrayListPush(deleted_genes, NULL+first_gene);

	int pos = 1;
	while(pos < ranked_genes -> numOfElements){
		ArrayListPush(deleted_genes, ArrayListGet(ranked_genes, pos));
		pos++;
	}
}
5948
/*
 * Find, for every barcode+UMI key that was assigned to more than one gene,
 * the genes from which the key has to be removed.
 *
 *   gene_to_bc_umi_p1_tab          : (gene number + 1) => ArrayList of
 *                                    barcode+UMI keys; its appendix1 and
 *                                    appendix2 are overwritten here
 *   gene_to_bc_umi_p1_to_reads_tab : (gene number + 1) => (key => read count)
 *
 * Returns a newly created table mapping a barcode+UMI key to an ArrayList of
 * 0-based gene numbers. No deallocation function is installed on it here, so
 * releasing the lists is left to the caller.
 */
HashTable * scRNA_find_gene_to_umi_merger(fc_thread_global_context_t * global_context, HashTable * gene_to_bc_umi_p1_tab, HashTable * gene_to_bc_umi_p1_to_reads_tab){
	/* Step 1: invert the mapping into key => list of genes. */
	HashTable * key_to_genes_tab = HashTableCreate( 1000000);
	HashTableSetDeallocationFunctions(key_to_genes_tab, NULL, (void (*) (void *))ArrayListDestroy);
	gene_to_bc_umi_p1_tab -> appendix1 = key_to_genes_tab;
	gene_to_bc_umi_p1_tab -> appendix2 = global_context;
	HashTableIteration(gene_to_bc_umi_p1_tab, scRNA_find_gene_to_umi_do_merger);

	/* Step 2: order each gene list by supporting reads. */
	key_to_genes_tab -> appendix1 = gene_to_bc_umi_p1_to_reads_tab;
	HashTableIteration(key_to_genes_tab, scRNA_find_gene_to_umi_sortByReads);

	/* Step 3: collect the genes to delete. appendix1 is reused for the
	 * result table, so the read counts move to appendix2. */
	HashTable * key_to_deleted_genes_tab = HashTableCreate( 1000000); // bc_um1_p1 => ArrayList (deleted_gene_1, deleted_gene_2, ...)
	key_to_genes_tab -> appendix1 = key_to_deleted_genes_tab;
	key_to_genes_tab -> appendix2 = gene_to_bc_umi_p1_to_reads_tab;
	HashTableIteration(key_to_genes_tab, scRNA_find_gene_to_umi_mark_deletee);

	HashTableDestroy(key_to_genes_tab);
	return key_to_deleted_genes_tab;
}
5966
/* One block of uncompressed data handed to a compression worker. */
struct scRNA_merge_batches_worker_task{
	int sample_id;		/* 1-based sample number; also the key used to look up the sample's BAM writer */
	int inbin_len;		/* number of bytes currently used in inbin */
	srInt_64 block_number;	/* passed on to the BAM writer together with the block */
	char inbin[MERGER_WORKER_BINSIZE];	/* uncompressed input: consecutive [4-byte length][payload] entries */
};
5973
/* Per-worker state: the task being compressed and the result of compressing it. */
struct scRNA_merge_batches_worker_current{
	struct scRNA_merge_batches_worker_task * task;	/* input block; set back to NULL once its result has been saved */
	char outbin[MERGER_WORKER_BINSIZE];	/* deflate output for the task */
	int outbin_len;		/* number of bytes produced in outbin */
	unsigned int crc32;	/* CRC32 of the uncompressed input block */

	z_stream strm;		/* zlib stream, initialised and released again for every job */
};
5982
scRNA_merge_batches_worker(void * vp)5983 void * scRNA_merge_batches_worker(void * vp){
5984 void **vpp = vp;
5985 fc_thread_global_context_t * global_context = vpp[0];
5986 worker_master_mutex_t * worker_mut = vpp[1];
5987 int my_worker_id = vpp[2] - NULL;
5988 struct scRNA_merge_batches_worker_current * my_current_job = vpp[3];
5989 free(vp);
5990
5991 int Z_DEFAULT_MEM_LEVEL = 8;
5992 worker_thread_start(worker_mut, my_worker_id);
5993 while(1){
5994 if(worker_wait_for_job(worker_mut, my_worker_id)) break;
5995 if(!global_context -> is_scRNA_BAM_FQ_out_generated) continue;
5996
5997 deflateInit2(&my_current_job -> strm , SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED, SAMBAM_GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
5998
5999 struct scRNA_merge_batches_worker_task * current_input = my_current_job -> task;
6000 my_current_job -> strm.avail_in = current_input -> inbin_len;
6001 my_current_job -> strm.next_in = (unsigned char*)current_input -> inbin;
6002 my_current_job -> strm.avail_out = MERGER_WORKER_BINSIZE;
6003 my_current_job -> strm.next_out = (unsigned char*)my_current_job -> outbin;
6004
6005 deflate(&my_current_job -> strm, Z_FINISH);
6006 my_current_job -> outbin_len = MERGER_WORKER_BINSIZE-my_current_job -> strm.avail_out;
6007 my_current_job -> crc32 = FC_CRC32(current_input -> inbin, current_input -> inbin_len);
6008 deflateEnd(&my_current_job -> strm);
6009 }
6010 return NULL;
6011 }
6012
/*
 * Hand a finished compression job to the BAM writer of its sample and mark
 * the job slot as free (task = NULL). Does nothing when the slot holds no task.
 *
 * When BAM/FASTQ output is enabled, the uncompressed input block is walked
 * entry by entry ([4-byte length][payload]) to update the writer's index,
 * then the compressed block is written.
 */
void scRNA_save_BAM_result(fc_thread_global_context_t * global_context, struct scRNA_merge_batches_worker_current * finished_job){
	struct scRNA_merge_batches_worker_task * done_task = finished_job -> task;
	if(done_task == NULL) return;

	if(global_context -> is_scRNA_BAM_FQ_out_generated){
		void ** writer_slots = HashTableGet(global_context -> scRNA_sample_BAM_writers, NULL+done_task -> sample_id);
		simple_bam_writer * wtr = writer_slots[0];
		int offset;

		for(offset = 0; offset < done_task -> inbin_len; ){
			int entry_len = 0;
			memcpy(&entry_len, done_task -> inbin + offset, 4);
			simple_bam_writer_update_index(wtr, done_task -> inbin + offset, entry_len, done_task -> block_number, offset);
			offset += 4 + entry_len;
		}
		simple_bam_write_compressed_block(wtr, finished_job -> outbin, finished_job -> outbin_len, done_task -> inbin_len, finished_job -> crc32, done_task -> block_number);
	}
	finished_job -> task = NULL;
}
6030
6031 // return the number of RG result sets
fc_thread_merge_results(fc_thread_global_context_t * global_context,read_count_type_t * nreads,srInt_64 * nreads_mapped_to_exon,fc_read_counters * my_read_counter,HashTable * junction_global_table,HashTable * splicing_global_table,HashTable * RGmerged_table,fc_feature_info_t * loaded_features,srInt_64 nexons)6032 int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table, fc_feature_info_t * loaded_features, srInt_64 nexons)
6033 {
6034 int xk1, xk2, ret = 0, sample_i;
6035
6036 srInt_64 total_input_reads = 0 ;
6037 (*nreads_mapped_to_exon)=0;
6038 SAM_pairer_destroy(&global_context -> read_pairer);
6039
6040 if(global_context -> do_scRNA_table){
6041 int compress_workers = max(1,global_context-> thread_number-1);
6042 HashTable * cellnoP1_to_genenoP1_to_UMIs[global_context -> scRNA_sample_sheet_table -> numOfElements];
6043 struct scRNA_merge_batches_worker_task * task_buffers = malloc(sizeof(struct scRNA_merge_batches_worker_task) * (1+compress_workers)* global_context->scRNA_sample_sheet_table -> numOfElements);
6044 int current_filling_worker_per_sample [global_context->scRNA_sample_sheet_table -> numOfElements];
6045 struct scRNA_merge_batches_worker_current * worker_current_jobs = calloc(sizeof(struct scRNA_merge_batches_worker_current), compress_workers);
6046
6047 ArrayList * file_size_list = ArrayListCreate(global_context-> scRNA_barcode_batched_bin_no +1);
6048 for(xk1=0; xk1<global_context-> scRNA_barcode_batched_bin_no +2; xk1++){
6049 if(xk1<global_context-> scRNA_barcode_batched_bin_no +1){
6050 srInt_64 batchsize = ftello(global_context -> scRNA_barcode_batched_bins[xk1]);
6051 ArrayListPush(file_size_list, NULL+( batchsize<<20 | xk1));
6052 }
6053 fclose(global_context -> scRNA_barcode_batched_bins[xk1]);
6054 }
6055 ArrayListSort(file_size_list, NULL);
6056
6057 srInt_64 block_numbers_current [global_context->scRNA_sample_sheet_table -> numOfElements];
6058 for(xk1=0; xk1<global_context->scRNA_sample_sheet_table -> numOfElements; xk1++){
6059 cellnoP1_to_genenoP1_to_UMIs[xk1] = HashTableCreate(10000);
6060 HashTableSetDeallocationFunctions(cellnoP1_to_genenoP1_to_UMIs[xk1], NULL,(void (*) (void*))HashTableDestroy);
6061 current_filling_worker_per_sample[xk1] = 0;
6062 task_buffers[xk1].inbin_len = 0;
6063 block_numbers_current[xk1] = 0;
6064 }
6065
6066 pthread_t *threads = malloc(sizeof(pthread_t)*global_context-> thread_number);
6067 for(xk1=0; xk1<compress_workers+1; xk1++)for(xk2 = 0; xk2 < global_context->scRNA_sample_sheet_table -> numOfElements; xk2++) task_buffers[xk1*global_context->scRNA_sample_sheet_table -> numOfElements + xk2].sample_id = xk2+1;
6068
6069 for(xk1=0; xk1<global_context-> thread_number; xk1++){
6070 void ** vpp = malloc(sizeof(void*)*3);
6071 vpp[0] = global_context;
6072 vpp[1] = global_context -> thread_contexts+xk1;
6073 vpp[2] = file_size_list;
6074 pthread_create(threads + xk1, NULL, scRNA_do_one_batch, vpp);
6075 }
6076
6077 for(xk1=0; xk1<global_context-> thread_number; xk1++)
6078 pthread_join(threads[xk1],NULL);
6079 ArrayListDestroy(file_size_list);
6080
6081 worker_master_mutex_t worker_mut;
6082 worker_master_mutex_init(&worker_mut, max(1,global_context-> thread_number-1));
6083
6084 for(xk1=0; xk1<max(1,global_context-> thread_number-1); xk1++){
6085 void ** vpp = malloc(sizeof(void*)*4);
6086 vpp[0] = global_context;
6087 vpp[1] = &worker_mut;
6088 vpp[2] = NULL + xk1;
6089 vpp[3] = worker_current_jobs + xk1;
6090 pthread_create(threads + xk1, NULL, scRNA_merge_batches_worker, vpp);
6091 }
6092
6093 FILE * input_fps[global_context -> scRNA_barcode_batched_bin_no+2];
6094 char * last_rbin_buffer[global_context -> scRNA_barcode_batched_bin_no+1];
6095 srInt_64 current_sorting_key[global_context -> scRNA_barcode_batched_bin_no+1];
6096
6097 for(xk1=0; xk1<global_context -> scRNA_barcode_batched_bin_no+2; xk1++){
6098 char tmp_fname[MAX_FILE_NAME_LENGTH+80];
6099 sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", global_context -> temp_file_dir, getpid(), xk1);
6100 input_fps[xk1] = fopen(tmp_fname,"rb");
6101 if(xk1 == global_context -> scRNA_barcode_batched_bin_no+1)break;
6102
6103 srInt_64 section1_items=0;
6104 for(sample_i = 0; sample_i < global_context -> scRNA_sample_sheet_table -> numOfElements; sample_i++){
6105 fread(§ion1_items,1, 8, input_fps[xk1]);
6106 for(xk2 = 0; xk2 < section1_items; xk2++){
6107 srInt_64 cellbcP0_geneno0B=0, umis=0;
6108 fread(&cellbcP0_geneno0B,1,8,input_fps[xk1]);
6109 fread(&umis,1,8,input_fps[xk1]);
6110
6111 int cellbc_no = cellbcP0_geneno0B>>32;
6112 int gene_no0B = (int)(cellbcP0_geneno0B&0xffffffffu);
6113 HashTable *gene_tab = HashTableGet(cellnoP1_to_genenoP1_to_UMIs[sample_i], NULL+cellbc_no+1);
6114 if(gene_tab==NULL){
6115 gene_tab = HashTableCreate(300);
6116 HashTablePut(cellnoP1_to_genenoP1_to_UMIs[sample_i], NULL+cellbc_no+1, gene_tab);
6117 }
6118 HashTablePut(gene_tab, NULL+gene_no0B+1 , NULL+umis);
6119 }
6120 }
6121 last_rbin_buffer[xk1] = malloc( global_context -> scRNA_barcode_batched_max_genes *8 + global_context -> scRNA_barcode_batched_max_Rbin_len + 4 + MAX_UMI_LEN + 16 + 10000);
6122 int rlen = fread(last_rbin_buffer[xk1], 1, 16, input_fps[xk1]);
6123 if(rlen >0){
6124 int binlen = 0;
6125 srInt_64 genes = 0;
6126 memcpy(&genes, last_rbin_buffer[xk1]+8, 8);
6127 if(genes & (1LLU<<63))genes = genes & 0x7fffffff;
6128 else genes= 0;
6129
6130 fread(last_rbin_buffer[xk1]+16, 1, 8*genes+ global_context -> scRNA_UMI_length + 4, input_fps[xk1]);
6131 memcpy(&binlen, last_rbin_buffer[xk1] +16 +8*genes+ global_context -> scRNA_UMI_length , 4);
6132 fread(last_rbin_buffer[xk1] + 16+ 8*genes+ global_context -> scRNA_UMI_length + 4, 1, binlen, input_fps[xk1]);
6133
6134 srInt_64 sorting_key = *(int*)(last_rbin_buffer[xk1] + 16 +8*genes+global_context -> scRNA_UMI_length +4);
6135 sorting_key = sorting_key << 32;
6136 sorting_key |= *(int*)(last_rbin_buffer[xk1] + 16+ 8*genes+global_context -> scRNA_UMI_length +8);
6137 current_sorting_key[xk1] = sorting_key;
6138 }else current_sorting_key[xk1] = 0x7fffffffffffffffLLU;
6139 }
6140
6141 int current_worker = 0;
6142 while(1){
6143 int selected_fp_no = 0;
6144 srInt_64 selected_fp_key = current_sorting_key[0];
6145 for(xk1=1; xk1<global_context -> scRNA_barcode_batched_bin_no+1; xk1++){
6146 if(current_sorting_key[xk1] < selected_fp_key){
6147 selected_fp_key = current_sorting_key[xk1] ;
6148 selected_fp_no = xk1;
6149 }
6150 }
6151 if(selected_fp_key == 0x7fffffffffffffffLLU) break;
6152
6153 int sample_id = 0, binlen = 0;
6154 srInt_64 genes = 0;
6155 memcpy(&sample_id, last_rbin_buffer[selected_fp_no], 4);
6156 memcpy(&genes, last_rbin_buffer[selected_fp_no]+8, 8);
6157 if(genes & (1LLU<<63)) genes = genes & 0x7fffffff;
6158 else genes = 0;
6159 memcpy(&binlen,last_rbin_buffer[selected_fp_no]+16+8*genes+global_context -> scRNA_UMI_length,4);
6160
6161 struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6162 memcpy(tofill->inbin + tofill-> inbin_len, last_rbin_buffer[selected_fp_no]+16+8*genes+global_context -> scRNA_UMI_length, binlen + 4);
6163 tofill -> inbin_len += (binlen + 4);
6164 //SUBREADprintf("ADDING BLOCKKK = %d WKR = %d IT THINK IT'S %d ; GENES=%d\n", tofill -> inbin_len, current_worker, tofill -> sample_id, genes);
6165 if(tofill-> inbin_len > 60000){
6166 master_wait_for_job_done(&worker_mut, current_worker);
6167 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6168 scRNA_save_BAM_result(global_context, my_finished_job);
6169 my_finished_job -> task = tofill;
6170 my_finished_job -> task -> block_number = (block_numbers_current[sample_id-1]++);
6171 my_finished_job -> outbin_len = 0;
6172 master_notify_worker(&worker_mut, current_worker);
6173
6174 current_filling_worker_per_sample[sample_id-1] ++;
6175 if(current_filling_worker_per_sample[sample_id-1] == compress_workers +1) current_filling_worker_per_sample[sample_id-1] = 0;
6176 tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6177 tofill -> inbin_len = 0;
6178
6179 current_worker ++;
6180 if(current_worker == compress_workers) current_worker=0;
6181 }
6182
6183 int rlen = fread(last_rbin_buffer[selected_fp_no], 1, 16, input_fps[selected_fp_no]);
6184 if(rlen >0){
6185 int binlen = 0;
6186 srInt_64 genes = 0;
6187 memcpy(&genes, last_rbin_buffer[selected_fp_no]+8, 8);
6188 if(genes & (1LLU<<63))genes = genes & 0x7fffffff;
6189 else genes= 0;
6190 fread(last_rbin_buffer[selected_fp_no]+16, 1, 8*genes+ global_context -> scRNA_UMI_length + 4, input_fps[selected_fp_no]);
6191 memcpy(&binlen, last_rbin_buffer[selected_fp_no] +16 +8*genes+ global_context -> scRNA_UMI_length , 4);
6192
6193 fread(last_rbin_buffer[selected_fp_no] + 16+ 8*genes+ global_context -> scRNA_UMI_length + 4, 1, binlen, input_fps[selected_fp_no]);
6194 srInt_64 sorting_key = *(int*)(last_rbin_buffer[selected_fp_no] + 16+8*genes +global_context -> scRNA_UMI_length +4);
6195 sorting_key = sorting_key << 32;
6196 sorting_key |= *(int*)(last_rbin_buffer[selected_fp_no] + 16 +8*genes+global_context -> scRNA_UMI_length +8);
6197 current_sorting_key[selected_fp_no] = sorting_key;
6198 } else current_sorting_key[selected_fp_no] = 0x7fffffffffffffffLLU;
6199 }
6200
6201 for(xk1=0; xk1<global_context -> scRNA_barcode_batched_bin_no+1; xk1++){
6202 fclose(input_fps[xk1]);
6203 free(last_rbin_buffer[xk1]);
6204 }
6205
6206 for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6207 struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[xk1] * global_context->scRNA_sample_sheet_table -> numOfElements +xk1);
6208 if(tofill->inbin_len<1) continue;
6209
6210 master_wait_for_job_done(&worker_mut, current_worker);
6211 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6212 scRNA_save_BAM_result(global_context, my_finished_job);
6213 my_finished_job -> task = tofill;
6214 my_finished_job -> task -> block_number = (block_numbers_current[xk1]++);
6215 my_finished_job -> outbin_len = 0;
6216 master_notify_worker(&worker_mut, current_worker);
6217 current_worker ++;
6218 if(current_worker == compress_workers) current_worker=0;
6219 }
6220 for(xk1=0; xk1<compress_workers; xk1++){
6221 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6222 if(my_finished_job -> task)master_wait_for_job_done(&worker_mut, current_worker);
6223 scRNA_save_BAM_result(global_context, my_finished_job);
6224
6225 current_worker ++;
6226 if(current_worker == compress_workers) current_worker=0;
6227 }
6228
6229 //TODO: add "scRNA_barcode_batched_bin_no+1" bin into "all unmapped"
6230
6231 for(xk1 = 0; xk1 < 1+compress_workers; xk1++) for(xk2 = 0; xk2 < global_context->scRNA_sample_sheet_table -> numOfElements;xk2++)
6232 task_buffers[ xk1 * global_context->scRNA_sample_sheet_table -> numOfElements + xk2 ].inbin_len = 0;
6233 current_worker = 0;
6234 FILE * notmapped_fp = input_fps[global_context -> scRNA_barcode_batched_bin_no+1];
6235 while(1){
6236 int sample_id = 0, binlen = 0;
6237 int rlen = fread(&sample_id, 1, 4, notmapped_fp);
6238 if(rlen < 4) break;
6239 struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[sample_id -1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6240 fread(&binlen, 1, 4, notmapped_fp);
6241 memcpy(tofill -> inbin + tofill -> inbin_len, &binlen, 4);
6242 tofill -> inbin_len += 4;
6243 fread(tofill -> inbin + tofill -> inbin_len, 1, binlen, notmapped_fp);
6244 tofill -> inbin_len += binlen;
6245 if(tofill-> inbin_len > 60000){
6246 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6247 if(my_finished_job -> task)master_wait_for_job_done(&worker_mut, current_worker);
6248 scRNA_save_BAM_result(global_context, my_finished_job);
6249 my_finished_job -> task = tofill;
6250 my_finished_job -> task -> block_number = (block_numbers_current[sample_id-1]++);
6251 my_finished_job -> outbin_len = 0;
6252 master_notify_worker(&worker_mut, current_worker);
6253
6254 current_filling_worker_per_sample[sample_id-1] ++;
6255 if(current_filling_worker_per_sample[sample_id-1] == compress_workers +1) current_filling_worker_per_sample[sample_id-1] = 0;
6256 tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6257 tofill -> inbin_len = 0;
6258
6259 current_worker ++;
6260 if(current_worker == compress_workers) current_worker=0;
6261 }
6262 }
6263
6264 for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6265 struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[xk1] * global_context->scRNA_sample_sheet_table -> numOfElements +xk1);
6266 if(tofill->inbin_len<1) continue;
6267
6268 master_wait_for_job_done(&worker_mut, current_worker);
6269 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6270 scRNA_save_BAM_result(global_context, my_finished_job);
6271 my_finished_job -> task = tofill;
6272 my_finished_job -> task -> block_number = (block_numbers_current[xk1]++);
6273 my_finished_job -> outbin_len = 0;
6274 master_notify_worker(&worker_mut, current_worker);
6275 current_worker ++;
6276 if(current_worker == compress_workers) current_worker=0;
6277 }
6278
6279 for(xk1=0; xk1<compress_workers; xk1++){
6280 master_wait_for_job_done(&worker_mut, current_worker);
6281 struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6282 scRNA_save_BAM_result(global_context, my_finished_job);
6283
6284 current_worker ++;
6285 if(current_worker == compress_workers) current_worker=0;
6286 }
6287
6288 fclose(notmapped_fp);
6289 terminate_workers(&worker_mut);
6290 free(task_buffers);
6291 free(worker_current_jobs);
6292
6293
6294 for(xk1=0; xk1< compress_workers; xk1++){
6295 pthread_join(threads[xk1],NULL);
6296 }
6297
6298 worker_master_mutex_destroy(&worker_mut);
6299 global_context -> scRNA_applied_umi_cut = calloc(sizeof(int), global_context -> scRNA_sample_sheet_table -> numOfElements);
6300 scRNA_merged_to_tables_write(global_context , cellnoP1_to_genenoP1_to_UMIs , loaded_features, nexons);
6301
6302 for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++)
6303 HashTableDestroy(cellnoP1_to_genenoP1_to_UMIs[xk1]);
6304
6305
6306 HashTable ** used_cell_no_tables = malloc(sizeof(HashTable*) * global_context -> scRNA_sample_sheet_table -> numOfElements);
6307 for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6308 used_cell_no_tables[xk1] = HashTableCreate(30000);
6309 used_cell_no_tables[xk1] -> appendix1 = malloc(sizeof(pthread_spinlock_t));
6310 pthread_spin_init((pthread_spinlock_t*)used_cell_no_tables[xk1] -> appendix1,1);
6311 }
6312
6313
6314
6315 for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6316 pthread_spin_destroy((pthread_spinlock_t*)used_cell_no_tables[xk1] -> appendix1);
6317 HashTableDestroy(used_cell_no_tables[xk1]);
6318 }
6319
6320 free(used_cell_no_tables);
6321 }
6322
6323 for(xk1=0; xk1<global_context-> thread_number; xk1++)
6324 {
6325 if(global_context -> assign_reads_to_RG){
6326 HashTable * thread_rg_tab = global_context -> thread_contexts[xk1].RG_table;
6327 int buck_i;
6328 for(buck_i = 0; buck_i < thread_rg_tab -> numOfBuckets; buck_i++){
6329 KeyValuePair *cursor = thread_rg_tab -> bucketArray[buck_i];
6330 while(cursor){
6331 char * rg_name = (char *)cursor -> key;
6332 void ** rg_thread_tabs = cursor -> value;
6333 void ** rg_old_tabs = HashTableGet(RGmerged_table, rg_name);
6334 if(!rg_old_tabs){
6335 rg_old_tabs = malloc(sizeof(char *)*4); // all_counts, sum_counts , junc_table, split_table
6336 rg_old_tabs[0] = calloc(global_context -> thread_contexts[xk1].count_table_size, sizeof(srInt_64));
6337 rg_old_tabs[1] = calloc(1, sizeof(fc_read_counters));
6338 if(global_context -> do_junction_counting){
6339 HashTable * junction_counting_table = HashTableCreate(131317);
6340 HashTableSetHashFunction(junction_counting_table,HashTableStringHashFunction);
6341 HashTableSetDeallocationFunctions(junction_counting_table, free, NULL);
6342 HashTableSetKeyComparisonFunction(junction_counting_table, fc_strcmp_chro);
6343
6344 HashTable * splicing_point_table = HashTableCreate(131317);
6345 HashTableSetHashFunction(splicing_point_table,HashTableStringHashFunction);
6346 HashTableSetDeallocationFunctions(splicing_point_table, free, NULL);
6347 HashTableSetKeyComparisonFunction(splicing_point_table, fc_strcmp_chro);
6348
6349 rg_old_tabs[2] = junction_counting_table;
6350 rg_old_tabs[3] = splicing_point_table;
6351 }else rg_old_tabs[2] = NULL;
6352
6353 HashTablePut(RGmerged_table, memstrcpy(rg_name), rg_old_tabs);
6354 }
6355 srInt_64 * rg_counts = rg_old_tabs[0];
6356 fc_read_counters * rg_sum_reads = rg_old_tabs[1];
6357 HashTable * rg_junc_tab = rg_old_tabs[2];
6358 HashTable * rg_split_tab = rg_old_tabs[3];
6359
6360 srInt_64 * rg_thread_counts = rg_thread_tabs[0];
6361 fc_read_counters * rg_thread_sum_reads = rg_thread_tabs[1];
6362 HashTable * rg_thread_junc_table = rg_thread_tabs[2];
6363 HashTable * rg_thread_split_table = rg_thread_tabs[3];
6364
6365 for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
6366 rg_counts[xk2] += rg_thread_counts[xk2];
6367
6368 rg_sum_reads->unassigned_ambiguous += rg_thread_sum_reads->unassigned_ambiguous;
6369 rg_sum_reads->unassigned_nofeatures += rg_thread_sum_reads->unassigned_nofeatures;
6370 rg_sum_reads->unassigned_overlapping_length += rg_thread_sum_reads->unassigned_overlapping_length;
6371 rg_sum_reads->unassigned_unmapped += rg_thread_sum_reads->unassigned_unmapped;
6372 rg_sum_reads->unassigned_singleton += rg_thread_sum_reads->unassigned_singleton;
6373 rg_sum_reads->unassigned_read_type += rg_thread_sum_reads->unassigned_read_type;
6374 rg_sum_reads->unassigned_mappingquality += rg_thread_sum_reads->unassigned_mappingquality;
6375 rg_sum_reads->unassigned_fragmentlength += rg_thread_sum_reads->unassigned_fragmentlength;
6376 rg_sum_reads->unassigned_chimericreads += rg_thread_sum_reads->unassigned_chimericreads;
6377 rg_sum_reads->unassigned_multimapping += rg_thread_sum_reads->unassigned_multimapping;
6378 rg_sum_reads->unassigned_secondary += rg_thread_sum_reads->unassigned_secondary;
6379 rg_sum_reads->unassigned_junction_condition += rg_thread_sum_reads->unassigned_junction_condition;
6380 rg_sum_reads->unassigned_duplicate += rg_thread_sum_reads->unassigned_duplicate;
6381 rg_sum_reads->assigned_reads += rg_thread_sum_reads->assigned_reads;
6382
6383 if(global_context -> do_junction_counting){
6384 int bucket_i;
6385 for(bucket_i = 0 ; bucket_i < rg_thread_junc_table -> numOfBuckets; bucket_i++){
6386 KeyValuePair * cursor;
6387 cursor = rg_thread_junc_table -> bucketArray[bucket_i];
6388 while(cursor){
6389 char * junckey = (char *) cursor -> key;
6390 void * globval = HashTableGet(rg_junc_tab, junckey);
6391 char * new_key = memstrcpy(junckey);
6392
6393 globval += (cursor -> value - NULL);
6394 HashTablePut(rg_junc_tab, new_key, globval);
6395 // new_key will be freed when it is replaced next time or when the global table is destroyed.
6396
6397 cursor = cursor->next;
6398 }
6399 }
6400
6401 for(bucket_i = 0 ; bucket_i < rg_thread_split_table -> numOfBuckets; bucket_i++){
6402 KeyValuePair * cursor;
6403 cursor = rg_thread_split_table -> bucketArray[bucket_i];
6404 while(cursor){
6405 char * junckey = (char *) cursor -> key;
6406 void * globval = HashTableGet(rg_split_tab, junckey);
6407 char * new_key = memstrcpy(junckey);
6408
6409 //if(xk1>0)
6410 //SUBREADprintf("MERGE THREAD-%d : %s VAL=%u, ADD=%u\n", xk1, junckey, globval - NULL, cursor -> value - NULL);
6411 globval += (cursor -> value - NULL);
6412 HashTablePut(rg_split_tab, new_key, globval);
6413 cursor = cursor->next;
6414 }
6415 }
6416 } // end : merge junc tables
6417 ret++;
6418 cursor = cursor -> next;
6419 }
6420 }
6421 }
6422
6423 for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
6424 nreads[xk2]+=global_context -> thread_contexts[xk1].count_table[xk2];
6425
6426 total_input_reads += global_context -> thread_contexts[xk1].all_reads;
6427 (*nreads_mapped_to_exon) += global_context -> thread_contexts[xk1].nreads_mapped_to_exon;
6428
6429 global_context -> read_counters.unassigned_ambiguous += global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous;
6430 global_context -> read_counters.unassigned_nofeatures += global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures;
6431 global_context -> read_counters.unassigned_overlapping_length += global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length;
6432 global_context -> read_counters.unassigned_unmapped += global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped;
6433 global_context -> read_counters.unassigned_singleton += global_context -> thread_contexts[xk1].read_counters.unassigned_singleton;
6434 global_context -> read_counters.unassigned_read_type += global_context -> thread_contexts[xk1].read_counters.unassigned_read_type;
6435 global_context -> read_counters.unassigned_mappingquality += global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality;
6436 global_context -> read_counters.unassigned_fragmentlength += global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength;
6437 global_context -> read_counters.unassigned_chimericreads += global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads;
6438 global_context -> read_counters.unassigned_multimapping += global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping;
6439 global_context -> read_counters.unassigned_secondary += global_context -> thread_contexts[xk1].read_counters.unassigned_secondary;
6440 global_context -> read_counters.unassigned_junction_condition += global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition;
6441 global_context -> read_counters.unassigned_duplicate += global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate;
6442 global_context -> read_counters.assigned_reads += global_context -> thread_contexts[xk1].read_counters.assigned_reads;
6443
6444 my_read_counter->unassigned_ambiguous += global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous;
6445 my_read_counter->unassigned_nofeatures += global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures;
6446 my_read_counter->unassigned_overlapping_length += global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length;
6447 my_read_counter->unassigned_unmapped += global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped;
6448 my_read_counter->unassigned_singleton += global_context -> thread_contexts[xk1].read_counters.unassigned_singleton;
6449 my_read_counter->unassigned_read_type += global_context -> thread_contexts[xk1].read_counters.unassigned_read_type;
6450 my_read_counter->unassigned_mappingquality += global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality;
6451 my_read_counter->unassigned_fragmentlength += global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength;
6452 my_read_counter->unassigned_chimericreads += global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads;
6453 my_read_counter->unassigned_multimapping += global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping;
6454 my_read_counter->unassigned_secondary += global_context -> thread_contexts[xk1].read_counters.unassigned_secondary;
6455 my_read_counter->unassigned_junction_condition += global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition;
6456 my_read_counter->unassigned_duplicate += global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate;
6457 my_read_counter->assigned_reads += global_context -> thread_contexts[xk1].read_counters.assigned_reads;
6458
6459 if(global_context -> do_junction_counting){
6460 int bucket_i;
6461 for(bucket_i = 0 ; bucket_i < global_context -> thread_contexts[xk1].junction_counting_table -> numOfBuckets; bucket_i++){
6462 KeyValuePair * cursor;
6463 cursor = global_context -> thread_contexts[xk1].junction_counting_table -> bucketArray[bucket_i];
6464 while(cursor){
6465 char * junckey = (char *) cursor -> key;
6466
6467 void * globval = HashTableGet(junction_global_table, junckey);
6468 char * new_key = malloc(strlen(junckey)+1);
6469 strcpy(new_key, junckey);
6470 globval += (cursor -> value - NULL);
6471 HashTablePut(junction_global_table, new_key, globval);
6472 // new_key will be freed when it is replaced next time or when the global table is destroyed.
6473
6474 cursor = cursor->next;
6475 }
6476 }
6477
6478 for(bucket_i = 0 ; bucket_i < global_context -> thread_contexts[xk1].splicing_point_table -> numOfBuckets; bucket_i++){
6479 KeyValuePair * cursor;
6480 cursor = global_context -> thread_contexts[xk1].splicing_point_table -> bucketArray[bucket_i];
6481 while(cursor){
6482 char * junckey = (char *) cursor -> key;
6483 void * globval = HashTableGet(splicing_global_table, junckey);
6484 char * new_key = malloc(strlen(junckey)+1);
6485 strcpy(new_key, junckey);
6486
6487 //if(xk1>0)
6488 //SUBREADprintf("MERGE THREAD-%d : %s VAL=%u, ADD=%u\n", xk1, junckey, globval - NULL, cursor -> value - NULL);
6489
6490 globval += (cursor -> value - NULL);
6491 HashTablePut(splicing_global_table, new_key, globval);
6492 cursor = cursor->next;
6493 }
6494 }
6495 }
6496 }
6497
6498
6499
6500 if(0 == global_context -> is_input_bad_format){
6501
6502 if(global_context -> is_paired_end_reads_expected){
6503 if(global_context -> is_mixed_PE_SE)
6504 print_in_box(80,0,0," WARNING: Single-end reads were found%s.", global_context -> is_strand_checked?" and excluded":"");
6505 else print_in_box(80,0,0," Paired-end reads are included.");
6506 if(!global_context -> is_paired_end_mode_assign)
6507 print_in_box(80,0,0, " The reads are assigned on the single-end mode.");
6508 }else{
6509 // paired-end reads in a single-end lib will result in error.
6510 print_in_box(80,0,0," Single-end reads are included.");
6511 }
6512
6513 char pct_str[10];
6514 if(total_input_reads>0)
6515 sprintf(pct_str,"(%.1f%%%%)", (*nreads_mapped_to_exon)*100./total_input_reads);
6516 else pct_str[0]=0;
6517
6518 int show_summary = 1;
6519 if(global_context -> assign_reads_to_RG){
6520 if(RGmerged_table -> numOfElements)
6521 print_in_box(80,0,0," Total read groups : %ld", RGmerged_table -> numOfElements);
6522 else{
6523 print_in_box(80,0,0," No read groups are found; no output is generated.");
6524 show_summary = 0;
6525 }
6526 }
6527 if(show_summary){
6528 print_in_box(80,0,0," Total alignments : %llu", total_input_reads);
6529 print_in_box(pct_str[0]?81:80,0,0," Successfully assigned alignments : %llu %s", *nreads_mapped_to_exon,pct_str);
6530 }
6531 print_in_box(80,0,0," Running time : %.2f minutes", (miltime() - global_context -> start_time)/60);
6532 print_in_box(80,0,0,"");
6533 }
6534 return ret;
6535 }
6536
// Extracts the directory part of `out` into `tmp`.
//   tmp : destination buffer; receives a NUL-terminated string. No size is
//         passed in, so the caller must provide enough room.
//   out : path of the output file.
// Everything before the LAST '/' in `out` is copied (the slash itself is
// dropped). When `out` contains no '/', `tmp` is set to "./".
void get_temp_dir_from_out(char * tmp, char * out){
	char * last_sep = strrchr(out, '/');

	if(last_sep != NULL){
		size_t dir_len = (size_t)(last_sep - out);
		memcpy(tmp, out, dir_len);
		tmp[dir_len] = '\0';
		return;
	}

	strcpy(tmp, "./");
}
6546
// Decides which file is going to be read and stores its name, as a newly
// malloc'ed string, in *out_ptr.
//   global_context : supplies use_stdin_file and temp_file_dir.
//   in_fnames      : the input file name given by the user.
//   out_ptr        : receives the malloc'ed file name.
//
// * use_stdin_file set, MAKE_STANDALONE build: all data available on <STDIN>
//   is copied into "<temp_file_dir>/temp-core-<pid>-<MAC or random>.sam" and
//   that name is stored in *out_ptr. If the temporary file cannot be created
//   or written, an error is printed; *out_ptr still holds the intended name.
// * use_stdin_file set, non-standalone build: the branch is compiled out and
//   *out_ptr is left untouched.
// * otherwise: *out_ptr receives a copy of in_fnames.
void fc_thread_init_input_files(fc_thread_global_context_t * global_context, char * in_fnames, char ** out_ptr ){
	if(global_context -> use_stdin_file){
		#ifdef MAKE_STANDALONE

		char MAC_or_random[13];

		(*out_ptr) = malloc(MAX_FILE_NAME_LENGTH);
		mac_or_rand_str(MAC_or_random);
		// Bounded formatting: the destination is exactly MAX_FILE_NAME_LENGTH bytes.
		snprintf(*out_ptr, MAX_FILE_NAME_LENGTH, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, getpid(), MAC_or_random);

		SUBREADprintf("\nReading data from <STDIN> for featureCounts ...\n\n");

		FILE * ifp = fopen(*out_ptr,"w");
		if(NULL == ifp){
			// Without this check fwrite()/fclose() would be called on a NULL stream.
			SUBREADprintf("ERROR: unable to create the temporary file '%s' for the data from <STDIN>.\n", *out_ptr);
			return;
		}

		while(1){
			char nchar[100];
			int rlen = fread(nchar, 1, 100, stdin);
			if(rlen <= 0) break;	// end of <STDIN> (or read error)

			if(fwrite(nchar, 1, rlen, ifp) < (size_t)rlen){
				// A short write (e.g. the disk is full) would otherwise silently truncate the input.
				SUBREADprintf("ERROR: unable to write the data from <STDIN> into '%s'.\n", *out_ptr);
				break;
			}
		}
		fclose(ifp);

		#endif
	}else{
		(*out_ptr) = malloc(strlen(in_fnames)+1);
		strcpy((*out_ptr), in_fnames);
	}

}
6576
// Releases a NULL-terminated array of heap pointers: every element up to
// (not including) the terminating NULL is free()'d, then the array itself.
// `vv` must not be NULL.
void fc_NCfree(void * vv){
	char ** item = vv;

	while(*item != NULL){
		free(*item);
		item++;
	}
	free(vv);
}
6583
// Flattens one sample of the scRNA sample sheet into the global lists.
// The (key, value, table) signature looks like a hash-table iteration
// callback -- presumably driven by a table walk; confirm at the call site.
//   key        : pushed onto scRNA_sample_id_to_name (presumably the sample name).
//   hashed_obj : ArrayList whose items are char*[3] arrays holding
//                [0] lane number, [1] sample barcode, [2] line number in the
//                sheet; [0] and [2] are integers encoded as pointers.
//   tab        : tab->appendix1 carries the global context.
// Integers are stored in pointer slots as "NULL + n" throughout.
void scRNA_convert_ss_to_arr( void * key, void * hashed_obj, HashTable * tab ){
	fc_thread_global_context_t * global_context = tab->appendix1;
	ArrayList * sample_rows = hashed_obj;

	ArrayListPush(global_context -> scRNA_sample_id_to_name, key);

	// The list length right after the push is this sample's one-based number.
	void * sample_no_1B = NULL + global_context -> scRNA_sample_id_to_name -> numOfElements;
	sample_rows -> appendix1 = sample_no_1B;

	srInt_64 row_i = 0;
	while(row_i < sample_rows -> numOfElements){
		char ** sheet_row = ArrayListGet(sample_rows, row_i);
		srInt_64 lane_no = sheet_row[0]-(char*)NULL;
		int sheet_line_no = sheet_row[2] - (char*)NULL;

		// New record: { lane number, one-based sample number, sample barcode }
		char ** record = malloc(sizeof(char*)*3);
		record[0] = NULL + lane_no;
		record[1] = sample_no_1B;
		record[2] = sheet_row[1];
		ArrayListPush(global_context -> scRNA_sample_barcode_list, record);

		// Sheet line number (as stored in the row) -> one-based sample number
		HashTablePut(global_context -> scRNA_lineno1B_to_sampleno1B_tab , NULL+sheet_line_no, sample_no_1B);

		row_i++;
	}
}
6607
6608
scRNA_cell_barcode_tabel_destroy(void * a)6609 void scRNA_cell_barcode_tabel_destroy(void *a){
6610 if(((a-NULL) & 0xfffffffff0000000llu ) ==IMPOSSIBLE_MEMORY_SPACE )return;
6611 ArrayListDestroy((ArrayList*)a);
6612 }
6613
// Builds global_context -> scRNA_cell_barcode_head_tail_table from
// global_context -> scRNA_cell_barcodes_array.
//
// For the barcode with index i the table receives three kinds of entries:
//   "<barcode>"            -> i, tagged by adding IMPOSSIBLE_MEMORY_SPACE
//   "F" + even-position bases -> ArrayList of barcode indices sharing that half
//   "S" + odd-position bases  -> ArrayList of barcode indices sharing that half
// Keys are heap copies released with free(); values are released with
// scRNA_cell_barcode_tabel_destroy.
//
// The first barcode seen fixes known_cell_barcode_length when it is still 0;
// every barcode is asserted to have that length.
//
// The scratch key buffer is sized from the barcode instead of being a fixed
// 20-byte array, so barcodes longer than 37 characters no longer overflow it,
// and the half-key length is derived from a single value (bcl/2) for both the
// copy loop and the terminator.
void scRNA_make_barcode_HT_table( fc_thread_global_context_t * global_context ){
	int xx1,xx2;
	char * bctmp = NULL;		// scratch buffer for the 'F'/'S' half-barcode keys
	int bctmp_capacity = 0;

	global_context -> scRNA_cell_barcode_head_tail_table = StringTableCreate(600000);
	HashTableSetDeallocationFunctions( global_context -> scRNA_cell_barcode_head_tail_table, free, scRNA_cell_barcode_tabel_destroy);

	for(xx1=0;xx1 < global_context-> scRNA_cell_barcodes_array -> numOfElements; xx1++){
		char * bc = ArrayListGet(global_context-> scRNA_cell_barcodes_array, xx1);
		int bcl =strlen(bc);
		if(global_context -> known_cell_barcode_length==0) global_context -> known_cell_barcode_length=bcl;
		if(bcl!=global_context -> known_cell_barcode_length){
			// SUBREADprintf("The cell barcodes have variable lengths. This may be a new protocol and we don't support it yet.\n");
			assert(bcl==global_context -> known_cell_barcode_length);
		}

		// Room needed: one prefix letter + bcl/2 bases + the terminating NUL.
		int half_len = bcl/2;
		if(half_len + 2 > bctmp_capacity){
			free(bctmp);
			bctmp_capacity = half_len + 2;
			bctmp = malloc(bctmp_capacity);
		}

		// Whole barcode -> its index, tagged so that it is never mistaken for an ArrayList pointer.
		HashTablePut(global_context -> scRNA_cell_barcode_head_tail_table, strdup(bc), NULL+xx1+IMPOSSIBLE_MEMORY_SPACE);
		for(xx2=0; xx2<2; xx2++){
			// xx2==0 : 'F' + bases at even positions ; xx2==1 : 'S' + bases at odd positions
			bctmp[0] = xx2?'S':'F';
			int xx3;
			for(xx3 = 0; xx3< half_len; xx3++)
				bctmp[xx3+1] = bc[ xx3*2+xx2 ];
			bctmp[half_len+1]=0;

			ArrayList * array_of_codes = HashTableGet(global_context -> scRNA_cell_barcode_head_tail_table, bctmp);
			if(!array_of_codes){
				array_of_codes = ArrayListCreate(4);
				HashTablePut(global_context -> scRNA_cell_barcode_head_tail_table, strdup(bctmp), array_of_codes);
			}
			ArrayListPush(array_of_codes, NULL+xx1);
		}
	}

	free(bctmp);
}
6645
6646
scRNA_close_sample_SamBam_writers(void * v)6647 void scRNA_close_sample_SamBam_writers(void *v){
6648 void ** vv = v;
6649 simple_bam_writer * wtr = vv[0];
6650 simple_bam_close(wtr);
6651
6652 if(vv[1]){
6653 parallel_gzip_writer_t* gzfp = vv[1];
6654 parallel_gzip_writer_close(gzfp);
6655
6656 gzfp = vv[2];
6657 parallel_gzip_writer_close(gzfp);
6658
6659 gzfp = vv[3];
6660 parallel_gzip_writer_close(gzfp);
6661 }
6662
6663 pthread_spinlock_t * gz_lock = vv[4];
6664 pthread_spin_destroy(gz_lock);
6665 free(gz_lock);
6666
6667 free(vv);
6668 }
6669
6670 #define SORT_BAM_FROM_SCRNA 1
// Creates the output writers of one sample and registers them in a table.
// The (key, value, table) signature looks like a hash-table iteration
// callback -- presumably driven by a table walk; confirm at the call site.
//   k   : sample name (char *); used as the prefix of every output file name.
//   v   : not used.
//   tab : tab->appendix1 = table receiving the writers,
//         tab->appendix2 = global context,
//         tab->appendix3 = ArrayList of sample names.
//
// "<sample>.bam" is always created. When scRNA_input_mode is GENE_INPUT_BCL or
// GENE_INPUT_SCRNA_BAM, "<sample>_R1.fastq.gz", "<sample>_I1.fastq.gz" and
// "<sample>_R2.fastq.gz" are created as well. The writers plus a new spin lock
// are stored as a 6-slot array { bam, R1, I1, R2, lock, NULL } under the key
// (index of the sample in the name list + 1). Nothing is registered when the
// name is not in the list.
//
// File names are formatted with snprintf so that an over-long sample name is
// truncated instead of overrunning the buffer; the formerly formatted but
// never used "del4-cC-tmp0-<sample>.del" name has been dropped.
void scRNA_sample_SamBam_writers_new_files(void *k, void *v, HashTable * tab){
	HashTable * fp_tab = tab -> appendix1;
	fc_thread_global_context_t * global_context = tab -> appendix2;
	ArrayList * scRNA_sample_id_to_name = tab -> appendix3;

	char * samplename = k;
	char fname [MAX_FILE_NAME_LENGTH+20];
	snprintf(fname, sizeof(fname), "%s.bam", samplename);
	simple_bam_writer * wtr = simple_bam_create(fname);
	parallel_gzip_writer_t * gzipR1fq=NULL, * gzipI1fq=NULL, * gzipR2fq=NULL;

	if(global_context -> scRNA_input_mode == GENE_INPUT_BCL || global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_BAM){
		gzipR1fq = calloc(sizeof(parallel_gzip_writer_t),1);
		gzipI1fq = calloc(sizeof(parallel_gzip_writer_t),1);
		gzipR2fq = calloc(sizeof(parallel_gzip_writer_t),1);
		snprintf(fname, sizeof(fname), "%s_R1.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipR1fq, fname, global_context -> thread_number);
		snprintf(fname, sizeof(fname), "%s_I1.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipI1fq, fname, global_context -> thread_number);
		snprintf(fname, sizeof(fname), "%s_R2.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipR2fq, fname, global_context -> thread_number);
	}

	pthread_spinlock_t * gzfp_lock = malloc(sizeof(pthread_spinlock_t));
	pthread_spin_init(gzfp_lock, PTHREAD_PROCESS_PRIVATE);
	int x1;
	for(x1=0; x1<scRNA_sample_id_to_name -> numOfElements; x1++){
		char * sample_name = ArrayListGet( scRNA_sample_id_to_name, x1 );
		if(strcmp(sample_name, samplename)==0){
			// Slot layout is what scRNA_close_sample_SamBam_writers reads back.
			void ** wtrptr = malloc(sizeof(void*)*6);
			wtrptr[0]=wtr;
			wtrptr[1]=gzipR1fq;
			wtrptr[2]=gzipI1fq;
			wtrptr[3]=gzipR2fq;
			wtrptr[4]=gzfp_lock;
			wtrptr[5]=NULL;
			HashTablePut(fp_tab, NULL+x1+1 , wtrptr);	// key is the one-based sample number
			break;
		}
	}
}
6713
/*
 * Fill a featureCounts global context from the run options.
 *
 * The context is zeroed first, so every field not assigned below starts as
 * all-zero bytes.  Besides copying scalar options, the function:
 *   - seeds the random generator through myrand_srand(time(NULL));
 *   - allocates the 2 MiB `unistr_buffer_space`;
 *   - resolves `temp_file_dir` (from `temp_dir`, or derived from
 *     `output_fname` when `temp_dir` is NULL);
 *   - when `scRNA_sample_sheet` is non-NULL, parses the sample sheet (and the
 *     cell-barcode list when given) and opens the per-bin batch files in the
 *     temporary directory, each paired with a spin lock;
 *   - creates the GC-content hash table;
 *   - loads the chromosome alias table when `alias_file_name` is non-empty;
 *   - derives `output_file_path` as the directory part of `output_fname`
 *     ("." when no directory part is found).
 *
 * `is_input_file_resort_needed` and `pair_orientations` are accepted but not
 * used here.  `strand_check_mode`, `cmd_rebuilt`, `debug_command` and
 * `extra_column_names` are stored by pointer, not copied, so they must
 * outlive the context.
 *
 * The string options are copied with strcpy into members of the context;
 * their capacities are declared elsewhere, so callers must keep the inputs
 * within those capacities.
 */
void fc_thread_init_global_context(fc_thread_global_context_t * global_context, unsigned int buffer_size, unsigned short threads, int line_length, int min_pe_dist, int max_pe_dist, int is_gene_level, int is_overlap_allowed, char * strand_check_mode, char * output_fname, int is_sam_out, int is_both_end_required, int is_chimertc_disallowed, int is_PE_distance_checked, char *feature_name_column, char * gene_id_column, int min_map_qual_score, int is_multi_mapping_allowed, int is_SAM, char * alias_file_name, char * cmd_rebuilt, int is_input_file_resort_needed, int feature_block_size, int isCVersion, int fiveEndExtension, int threeEndExtension, int minFragmentOverlap, int is_split_or_exonic_only, int reduce_5_3_ends_to_one, char * debug_command, int is_duplicate_ignored, int is_not_sort, int use_fraction_multimapping, int useOverlappingBreakTie, char * pair_orientations, int do_junction_cnt, int max_M, int isRestrictlyNoOvelrapping, float fracOverlap, char * temp_dir, int use_stdin_file, int assign_reads_to_RG, int long_read_minimum_length, int is_verbose, float frac_feature_overlap, int do_detection_call, int max_missing_bases_in_read, int max_missing_bases_in_feature, int is_primary_alignment_only, char * Rpath, char * extra_column_names , char * annotation_file_screen_output, int read_shift_type, int read_shift_size, char * scRNA_sample_sheet, char * scRNA_cell_barcode_list, int is_scRNA_BAM_FQ_out_generated, int scRNA_input_mode, int scRNA_rerun_on_persample_BAM, float scRNA_umi_cutoff) {
	int x1;
	myrand_srand(time(NULL));

	memset(global_context, 0, sizeof(fc_thread_global_context_t));
	global_context -> max_BAM_header_size = buffer_size;
	global_context -> all_reads = 0;
	global_context -> redo = 0;
	global_context -> read_details_out_FP = NULL;

	global_context -> reported_extra_columns = extra_column_names;
	global_context -> isCVersion = isCVersion;
	global_context -> is_read_details_out = is_sam_out;
	global_context -> is_multi_overlap_allowed = is_overlap_allowed;
	global_context -> restricted_no_multi_overlap = isRestrictlyNoOvelrapping;
	global_context -> is_gene_level = is_gene_level;
	global_context -> strand_check_mode = strand_check_mode;
	global_context -> is_both_end_required = is_both_end_required;
	global_context -> is_chimertc_disallowed = is_chimertc_disallowed;
	global_context -> is_PE_distance_checked = is_PE_distance_checked;
	global_context -> is_multi_mapping_allowed = is_multi_mapping_allowed;
	global_context -> is_primary_alignment_only = is_primary_alignment_only;
	global_context -> is_split_or_exonic_only = is_split_or_exonic_only;
	global_context -> is_duplicate_ignored = is_duplicate_ignored;
	global_context -> use_stdin_file = use_stdin_file;
	global_context -> assign_reads_to_RG = assign_reads_to_RG;
	global_context -> long_read_minimum_length = long_read_minimum_length;
	global_context -> is_verbose = is_verbose;
	global_context -> do_detection_call = do_detection_call;

	global_context -> reduce_5_3_ends_to_one = reduce_5_3_ends_to_one;
	global_context -> do_not_sort = is_not_sort;
	global_context -> is_SAM_file = is_SAM;
	global_context -> use_fraction_multi_mapping = use_fraction_multimapping;
	global_context -> do_junction_counting = do_junction_cnt;

	global_context -> thread_number = threads;
	global_context -> min_mapping_quality_score = min_map_qual_score;
	global_context -> unistr_buffer_size = 1024*1024*2;
	global_context -> unistr_buffer_used = 0;
	global_context -> unistr_buffer_space = malloc(global_context -> unistr_buffer_size);
	global_context -> BAM_chros_to_anno_table = NULL;
	global_context -> cmd_rebuilt = cmd_rebuilt;
	global_context -> feature_block_size = feature_block_size;
	global_context -> five_end_extension = fiveEndExtension;
	global_context -> three_end_extension = threeEndExtension;
	global_context -> read_shift_type = read_shift_type;
	global_context -> read_shift_size = read_shift_size;
	global_context -> fragment_minimum_overlapping = minFragmentOverlap;
	global_context -> fractional_minimum_overlapping = fracOverlap;
	global_context -> fractional_minimum_feature_overlapping = frac_feature_overlap;
	global_context -> max_missing_bases_in_read = max_missing_bases_in_read;
	global_context -> max_missing_bases_in_feature = max_missing_bases_in_feature;
	global_context -> use_overlapping_break_tie = useOverlappingBreakTie;
	/* Either overlap fraction, or either missing-base limit, being active
	 * (a limit is active when it is >= 0) requires the fragment length. */
	global_context -> need_calculate_fragment_len = ( global_context -> fractional_minimum_overlapping > 1E-10 ) || (global_context -> fractional_minimum_feature_overlapping > 1E-10) || ( global_context -> max_missing_bases_in_read >= 0 ) || ( global_context -> max_missing_bases_in_feature >= 0 );
	/* The overlap length is additionally needed for a minimum overlap above
	 * one base and for overlap-based tie breaking. */
	global_context -> need_calculate_overlap_len = (global_context -> fractional_minimum_overlapping > 1E-10) || (global_context -> fragment_minimum_overlapping > 1) || global_context -> use_overlapping_break_tie || (global_context -> fractional_minimum_feature_overlapping > 1E-10) || ( global_context -> max_missing_bases_in_read >= 0 ) || ( global_context -> max_missing_bases_in_feature >= 0 );
	global_context -> debug_command = debug_command;
	global_context -> max_M = max_M;

	/* Resolve the temporary directory before the scRNA set-up: `temp_dir`
	 * may be NULL, and the batch files below must not be named from a NULL
	 * string. */
	if(temp_dir == NULL)get_temp_dir_from_out(global_context -> temp_file_dir, output_fname);
	else strcpy(global_context -> temp_file_dir, temp_dir);

	if(scRNA_sample_sheet){
		global_context -> scRNA_umi_cutoff = scRNA_umi_cutoff;
		global_context -> do_scRNA_table = 1;
		global_context -> scRNA_sample_id_to_name = ArrayListCreate(64);
		global_context -> scRNA_lineno1B_to_sampleno1B_tab = HashTableCreate(10);
		strcpy(global_context->scRNA_sample_sheet,scRNA_sample_sheet);
		global_context-> scRNA_sample_sheet_table = input_BLC_parse_SampleSheet( global_context->scRNA_sample_sheet );
		/* Gives the iteration callback below access to this context. */
		global_context-> scRNA_sample_sheet_table -> appendix1 = global_context;
		global_context -> scRNA_sample_barcode_list = ArrayListCreate(64);
		ArrayListSetDeallocationFunction(global_context -> scRNA_sample_barcode_list, free);
		HashTableIteration(global_context-> scRNA_sample_sheet_table, scRNA_convert_ss_to_arr);

		if(scRNA_cell_barcode_list){
			strcpy(global_context->scRNA_cell_barcode_list,scRNA_cell_barcode_list);
			global_context-> scRNA_cell_barcodes_array = input_BLC_parse_CellBarcodes( global_context->scRNA_cell_barcode_list );
			scRNA_make_barcode_HT_table( global_context );
		}
		global_context -> is_scRNA_BAM_FQ_out_generated = is_scRNA_BAM_FQ_out_generated;
		global_context -> scRNA_input_mode = scRNA_input_mode;
		global_context -> scRNA_rerun_on_persample_BAM = scRNA_rerun_on_persample_BAM;
		global_context -> scRNA_barcode_batched_bin_no = 149;
		pthread_spin_init(&global_context -> scRNA_do_one_batch_runner_lock, PTHREAD_PROCESS_PRIVATE);
		/* Two more files and locks than `scRNA_barcode_batched_bin_no` are created. */
		global_context -> scRNA_barcode_batched_locks = malloc(sizeof(pthread_spinlock_t)*(global_context -> scRNA_barcode_batched_bin_no+2));
		global_context -> scRNA_barcode_batched_bins = malloc(sizeof(FILE*)*(global_context -> scRNA_barcode_batched_bin_no+2));
		for(x1=0; x1<global_context -> scRNA_barcode_batched_bin_no+2; x1++){
			char tmp_fname[MAX_FILE_NAME_LENGTH+20];
			snprintf(tmp_fname, sizeof tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", global_context -> temp_file_dir, getpid(), x1);
			global_context -> scRNA_barcode_batched_bins[x1]=fopen(tmp_fname, "wb");
			if(NULL == global_context -> scRNA_barcode_batched_bins[x1])
				SUBREADprintf("Unable to create temporary file '%s'.\n", tmp_fname);
			pthread_spin_init(global_context -> scRNA_barcode_batched_locks+x1, PTHREAD_PROCESS_PRIVATE);
		}
	}else{
		global_context -> do_scRNA_table = 0;
		global_context-> scRNA_cell_barcodes_array = NULL;
		global_context-> scRNA_sample_sheet_table = NULL;
	}

	global_context -> read_counters.unassigned_ambiguous=0;
	global_context -> read_counters.unassigned_nofeatures=0;
	global_context -> read_counters.unassigned_overlapping_length=0;
	global_context -> read_counters.unassigned_unmapped=0;
	global_context -> read_counters.unassigned_read_type=0;
	global_context -> read_counters.unassigned_singleton=0;
	global_context -> read_counters.unassigned_mappingquality=0;
	global_context -> read_counters.unassigned_fragmentlength=0;
	global_context -> read_counters.unassigned_chimericreads=0;
	global_context -> read_counters.unassigned_multimapping=0;
	global_context -> read_counters.unassigned_secondary=0;
	global_context -> read_counters.unassigned_junction_condition=0;
	global_context -> read_counters.unassigned_duplicate=0;
	global_context -> read_counters.assigned_reads=0;

	/* String-keyed table; both keys and values are released with free(). */
	global_context -> GCcontent_table = HashTableCreate(20000);
	HashTableSetHashFunction(global_context -> GCcontent_table, HashTableStringHashFunction);
	HashTableSetDeallocationFunctions(global_context -> GCcontent_table, free, free);
	HashTableSetKeyComparisonFunction(global_context -> GCcontent_table, fc_strcmp_chro);

	if(annotation_file_screen_output) strcpy(global_context -> annotation_file_screen_output, annotation_file_screen_output);
	else global_context ->annotation_file_screen_output[0]=0;

	if(alias_file_name && alias_file_name[0])
	{
		strcpy(global_context -> alias_file_name,alias_file_name);
		global_context -> BAM_chros_to_anno_table = load_alias_table(alias_file_name);
	}
	else global_context -> alias_file_name[0]=0;

	global_context -> read_details_path[0]=0;
	if(Rpath)strcpy(global_context -> read_details_path, Rpath);

	strcpy(global_context -> feature_name_column,feature_name_column);
	strcpy(global_context -> gene_id_column,gene_id_column);
	strcpy(global_context -> output_file_name, output_fname);

	/* The output directory is everything before the last '/' of the output
	 * file name; "." is used when that prefix is empty or there is no '/'. */
	global_context -> output_file_path[0]=0;
	for( x1 = strlen(output_fname)-1; x1 >= 0; x1 --){
		if(output_fname[x1]=='/'){
			memcpy(global_context -> output_file_path, output_fname, x1);
			global_context -> output_file_path[x1]=0;
			break;
		}
	}
	if(0 == global_context -> output_file_path[0]){
		strcpy(global_context -> output_file_path, ".");
	}

	global_context -> min_paired_end_distance = min_pe_dist;
	global_context -> max_paired_end_distance = max_pe_dist;
	global_context -> line_length = line_length;
}
6870
6871
6872
/*
 * Prepare a context for counting one input file.
 *
 * Steps, in order:
 *   1. reset the per-file warning flags and store `read_length`;
 *   2. when read-detail output is requested, open the detail file; if that
 *      fails a message is printed and processing continues without it;
 *   3. attach the caller's annotation arrays (the `et_*` arguments) to the
 *      context by pointer; they are not copied;
 *   4. allocate one thread context per worker with its count table and
 *      scratch buffers, plus the junction, read-group and scRNA tables that
 *      the enabled options call for;
 *   5. create the SAM/BAM pairer for the input file.
 *
 * Returns 0 on success, or 1 when the thread-context array or a per-thread
 * count table cannot be allocated.  Other allocations are not checked.
 */
int fc_thread_start_threads(fc_thread_global_context_t * global_context, int et_exons, int * et_geneid, char ** et_chr, srInt_64 * et_start, srInt_64 * et_stop, unsigned char * et_strand, char * et_anno_chr_2ch, char ** et_anno_chrs, srInt_64 * et_anno_chr_heads, srInt_64 * et_bk_end_index, srInt_64 * et_bk_min_start, srInt_64 * et_bk_max_end, int read_length)
{
	int xk1;

	global_context -> read_length = read_length;
	global_context -> is_unpaired_warning_shown = 0;
	global_context -> is_stake_warning_shown = 0;
	global_context -> is_read_too_long_to_SAM_BAM_shown = 0;

	if(global_context -> is_read_details_out)
	{
		char tmp_fname[MAX_FILE_NAME_LENGTH+20], *modified_fname;
		/* Full path that is actually opened; also used in the failure message. */
		char details_fname[MAX_FILE_NAME_LENGTH*2+100];
		int i=0;
		const char * details_suffix = global_context -> is_read_details_out == FILE_TYPE_BAM?".bam":(global_context -> is_read_details_out == FILE_TYPE_SAM?".sam":"");
		char * applied_detail_path = global_context -> output_file_path;
		if(global_context -> read_details_path[0]) applied_detail_path = global_context -> read_details_path;

		if( global_context -> input_file_unique ){
			snprintf(details_fname, sizeof details_fname, "%s/%s.featureCounts%s", applied_detail_path, global_context -> input_file_short_name, details_suffix);
		} else {
			/* Flatten the raw input path into a single file name: skip
			 * leading '/', '.' and '\\', then turn the remaining path
			 * separators and spaces into dots. */
			snprintf(tmp_fname, sizeof tmp_fname, "%s.featureCounts%s", global_context -> raw_input_file_name, details_suffix);
			modified_fname = tmp_fname;
			while(modified_fname[0]=='/' || modified_fname[0]=='.' || modified_fname[0]=='\\'){
				modified_fname ++;
			}
			while(modified_fname[i]){
				if(modified_fname[i]=='\\' || modified_fname[i]=='/'||modified_fname[i]==' ')modified_fname[i]='.';
				i++;
			}
			snprintf(details_fname, sizeof details_fname, "%s/%s", applied_detail_path, modified_fname);
		}
		global_context -> read_details_out_FP = f_subr_open(details_fname, "w");

		if(global_context -> read_details_out_FP){
			pthread_spin_init(&global_context -> read_details_out_lock, 1);
		}else{
			SUBREADprintf("Unable to create file '%s'; the read assignment details are not written.\n", details_fname);
		}
	}
	else
		global_context -> read_details_out_FP = NULL;

	global_context -> redo = 0;
	global_context -> exontable_geneid = et_geneid;
	global_context -> exontable_chr = et_chr;
	global_context -> exontable_start = et_start;
	global_context -> exontable_stop = et_stop;
	global_context -> exontable_strand = (char *)et_strand;
	global_context -> exontable_anno_chr_2ch = et_anno_chr_2ch;
	global_context -> exontable_anno_chrs = et_anno_chrs;
	global_context -> exontable_anno_chr_heads = et_anno_chr_heads;
	global_context -> exontable_block_end_index = et_bk_end_index;
	global_context -> exontable_block_max_end = et_bk_max_end;
	global_context -> exontable_block_min_start = et_bk_min_start;
	global_context -> sambam_chro_table_items = 0;
	global_context -> sambam_chro_table = NULL;

	global_context -> thread_contexts = malloc(sizeof(fc_thread_thread_context_t) * global_context -> thread_number);
	if(!global_context -> thread_contexts) return 1;

	for(xk1=0; xk1<global_context -> thread_number; xk1++)
	{
		fc_thread_thread_context_t * thr = global_context -> thread_contexts + xk1;

		thr -> thread_id = xk1;
		thr -> chunk_read_ptr = 0;
		thr -> count_table = calloc(sizeof(read_count_type_t), et_exons);
		thr -> count_table_size = et_exons;
		thr -> nreads_mapped_to_exon = 0;
		thr -> all_reads = 0;
		thr -> chro_name_buff = malloc(CHROMOSOME_NAME_LENGTH);

		thr -> read_counters.assigned_reads = 0;
		thr -> read_counters.unassigned_ambiguous = 0;
		thr -> read_counters.unassigned_nofeatures = 0;
		thr -> read_counters.unassigned_unmapped = 0;
		thr -> read_counters.unassigned_singleton = 0;
		thr -> read_counters.unassigned_read_type = 0;
		thr -> read_counters.unassigned_mappingquality = 0;
		thr -> read_counters.unassigned_fragmentlength = 0;
		thr -> read_counters.unassigned_chimericreads = 0;
		thr -> read_counters.unassigned_multimapping = 0;
		thr -> read_counters.unassigned_secondary = 0;
		thr -> read_counters.unassigned_junction_condition = 0;
		thr -> read_counters.unassigned_overlapping_length = 0;
		thr -> read_counters.unassigned_duplicate = 0;
		thr -> read_details_buff_used = 0;
		thr -> hits_number_capacity = 300 ;

		/* Per-mate hit arrays, each sized to the initial hit capacity. */
		thr -> hits_start_pos1 = malloc(sizeof(int)* thr -> hits_number_capacity);
		thr -> hits_start_pos2 = malloc(sizeof(int)* thr -> hits_number_capacity);
		thr -> hits_length1 = malloc(sizeof(short)* thr -> hits_number_capacity);
		thr -> hits_length2 = malloc(sizeof(short)* thr -> hits_number_capacity);
		thr -> hits_chro1 = malloc(sizeof(char*)* thr -> hits_number_capacity);
		thr -> hits_chro2 = malloc(sizeof(char*)* thr -> hits_number_capacity);
		thr -> hits_indices1 = malloc(sizeof(srInt_64)* thr -> hits_number_capacity);
		thr -> hits_indices2 = malloc(sizeof(srInt_64)* thr -> hits_number_capacity);

		/* Scoring buffers hold twice the hit capacity. */
		thr -> scoring_buff_numbers = malloc(sizeof(int)* thr -> hits_number_capacity * 2);
		thr -> scoring_buff_flags = malloc(sizeof(int)* thr -> hits_number_capacity * 2);
		thr -> scoring_buff_overlappings = malloc(sizeof(int)* thr -> hits_number_capacity * 2);
		thr -> scoring_buff_exon_ids =malloc(sizeof(srInt_64)* thr -> hits_number_capacity * 2);

		if(global_context -> read_details_out_FP){
			thr -> read_details_buff = malloc(70000 + 2 * MAX_FC_READ_LENGTH * 3);
			thr -> bam_compressed_buff = malloc(70000 + 2 * MAX_FC_READ_LENGTH * 3);
		}else{
			/* thread_contexts comes from malloc, so give these a defined
			 * value; the destroy routine may pass them to free(). */
			thr -> read_details_buff = NULL;
			thr -> bam_compressed_buff = NULL;
		}

		if(global_context -> need_calculate_overlap_len){
			thr -> scoring_buff_gap_chros = malloc( sizeof(char *) * thr -> hits_number_capacity * 2 * global_context -> max_M *2);
			thr -> scoring_buff_gap_starts = malloc( sizeof(unsigned int ) * thr -> hits_number_capacity * 2 * global_context -> max_M *2);
			thr -> scoring_buff_gap_lengths = malloc( sizeof(unsigned short) * thr -> hits_number_capacity * 2 * global_context -> max_M *2);
		} else {
			thr -> scoring_buff_gap_chros = NULL;
			thr -> scoring_buff_gap_starts = NULL;
			thr -> scoring_buff_gap_lengths = NULL;
		}

		if(global_context -> do_junction_counting)
		{
			thr -> junction_counting_table = HashTableCreate(131317);
			HashTableSetHashFunction(thr -> junction_counting_table,HashTableStringHashFunction);
			HashTableSetDeallocationFunctions(thr -> junction_counting_table, free, NULL);
			HashTableSetKeyComparisonFunction(thr -> junction_counting_table, fc_strcmp_chro);

			thr -> splicing_point_table = HashTableCreate(131317);
			HashTableSetHashFunction(thr -> splicing_point_table,HashTableStringHashFunction);
			HashTableSetDeallocationFunctions(thr -> splicing_point_table, free, NULL);
			HashTableSetKeyComparisonFunction(thr -> splicing_point_table, fc_strcmp_chro);
		}

		if(global_context -> assign_reads_to_RG){
			thr -> RG_table = HashTableCreate(97);
			HashTableSetHashFunction(thr -> RG_table,HashTableStringHashFunction);
			HashTableSetDeallocationFunctions(thr -> RG_table, free, disallocate_RG_tables);
			HashTableSetKeyComparisonFunction(thr -> RG_table, fc_strcmp_chro);
		}

		if(global_context -> do_scRNA_table){
			int xk2;

			thr -> scRNA_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
			thr -> scRNA_mapped_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
			thr -> scRNA_assigned_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
			thr -> scRNA_sample_bc_tables = malloc(sizeof(HashTable*) * global_context -> scRNA_sample_id_to_name -> numOfElements);
			thr -> scRNA_registered_UMI_table = StringTableCreate(100000);
			HashTableSetDeallocationFunctions(thr -> scRNA_registered_UMI_table, free, NULL);

			/* One table per sample; its values are themselves hash
			 * tables, released through HashTableDestroy. */
			for(xk2 = 0; xk2 < global_context -> scRNA_sample_id_to_name -> numOfElements; xk2++){
				HashTable *al = HashTableCreate(2000);
				HashTableSetDeallocationFunctions(al, NULL, (void (*)(void*))HashTableDestroy);
				thr -> scRNA_sample_bc_tables[xk2] = al;
			}
			thr -> scRNA_pooled_reads=0;
			thr -> scRNA_has_valid_sample_index =0;
			thr -> scRNA_has_valid_cell_barcode =0;
		}

		if(!thr -> count_table) return 1;
	}

	char new_fn[MAX_FILE_NAME_LENGTH+10];
	char MAC_or_random[13];
	mac_or_rand_str(MAC_or_random);
	char rand_prefix[MAX_FILE_NAME_LENGTH+100];
	snprintf(rand_prefix, sizeof rand_prefix, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, getpid(), MAC_or_random);
	/* Input taken from standard input is passed on with a '<' prefix. */
	if(global_context -> use_stdin_file) snprintf(new_fn, sizeof new_fn, "<%s", global_context -> input_file_name );
	else snprintf(new_fn, sizeof new_fn, "%s", global_context -> input_file_name );

	SAM_pairer_create(&global_context -> read_pairer, global_context -> thread_number , global_context -> max_BAM_header_size/1024/1024+2, !global_context-> is_SAM_file, !( global_context -> is_read_details_out == FILE_TYPE_BAM ||global_context -> is_read_details_out == FILE_TYPE_SAM ) , !global_context -> is_paired_end_mode_assign, global_context ->is_paired_end_mode_assign && global_context -> do_not_sort, global_context -> assign_reads_to_RG ,0, new_fn, process_pairer_reset, process_pairer_header, process_pairer_output, rand_prefix, global_context, global_context -> long_read_minimum_length);

	return 0;
}
7041
/*
 * Flush and close the read-detail output (if it is open) and release every
 * per-thread resource allocated by fc_thread_start_threads, then the
 * thread-context array itself.
 *
 * The detail file can be requested (`is_read_details_out` set) and still fail
 * to open.  In that case fc_thread_start_threads leaves
 * `read_details_out_FP` NULL, does not initialise the spin lock and does not
 * allocate the per-thread detail buffers, so all detail-related teardown is
 * keyed on the file handle and not on the request flag alone.
 */
void fc_thread_destroy_thread_context(fc_thread_global_context_t * global_context)
{
	int xk1;
	/* Captured up front because the handle is cleared after closing. */
	const int details_open = global_context -> is_read_details_out && global_context -> read_details_out_FP != NULL;

	if(details_open) {
		for(xk1=0; xk1<global_context-> thread_number; xk1++)
			write_read_detailed_remainder(global_context, global_context -> thread_contexts+xk1);

		if( global_context -> is_read_details_out == FILE_TYPE_BAM ){
			/* For BAM output, one final block produced from empty input
			 * is appended before the file is closed. */
			char bam_tail_block[1000];
			int tail_size = compress_read_detail_BAM( global_context, global_context -> thread_contexts, 0,0,bam_tail_block);
			assert(tail_size > 0);
			fwrite(bam_tail_block, 1, tail_size, global_context -> read_details_out_FP);
		}
		fclose(global_context -> read_details_out_FP);
		global_context -> read_details_out_FP = NULL;
		pthread_spin_destroy(&global_context -> read_details_out_lock);
	}

	for(xk1=0; xk1<global_context-> thread_number; xk1++) {
		fc_thread_thread_context_t * thr = global_context -> thread_contexts + xk1;

		free(thr -> count_table);
		free(thr -> chro_name_buff);
		free(thr -> hits_start_pos1);
		free(thr -> hits_start_pos2);
		free(thr -> hits_length1);
		free(thr -> hits_length2);
		free(thr -> hits_chro1);
		free(thr -> hits_chro2);
		free(thr -> hits_indices1);
		free(thr -> hits_indices2);
		free(thr -> scoring_buff_numbers);
		free(thr -> scoring_buff_flags);
		free(thr -> scoring_buff_overlappings);
		free(thr -> scoring_buff_exon_ids);

		/* The three gap buffers are allocated together; a non-NULL
		 * `scoring_buff_gap_chros` marks that all of them exist. */
		if(thr -> scoring_buff_gap_chros){
			free(thr -> scoring_buff_gap_chros);
			free(thr -> scoring_buff_gap_starts);
			free(thr -> scoring_buff_gap_lengths);
		}
		if(global_context -> do_junction_counting){
			HashTableDestroy(thr -> junction_counting_table);
			HashTableDestroy(thr -> splicing_point_table);
		}
		if(global_context -> assign_reads_to_RG)
			HashTableDestroy(thr -> RG_table);
		if(details_open){
			free(thr -> read_details_buff);
			free(thr -> bam_compressed_buff);
		}

		if(global_context -> do_scRNA_table){
			int xk2;
			for(xk2=0;xk2< global_context -> scRNA_sample_id_to_name -> numOfElements;xk2++) {
				HashTableDestroy(thr -> scRNA_sample_bc_tables[xk2]);
			}
			free(thr -> scRNA_reads_per_sample);
			free(thr -> scRNA_mapped_reads_per_sample);
			free(thr -> scRNA_assigned_reads_per_sample);
			free(thr -> scRNA_sample_bc_tables);
			HashTableDestroy(thr -> scRNA_registered_UMI_table);
		}
	}

	free(global_context -> thread_contexts);
}
/*
 * Run the read pairer and fold its return value into the context's
 * `is_input_bad_format` flag.  The flag is only ever OR-ed into, so a
 * non-zero result from an earlier call is kept.
 *
 * No message is printed here when the pairer returns non-zero.
 */
void fc_thread_wait_threads(fc_thread_global_context_t * global_context)
{
	const int pairer_status = SAM_pairer_run(&global_context -> read_pairer);

	/* OR-ing in zero changes nothing, so only a non-zero status is recorded. */
	if(pairer_status != 0){
		global_context -> is_input_bad_format |= pairer_status;
	}
}
7120
/*
 * Collapse a ';'-prefixed list whose entries are all identical down to a
 * single entry, in place: ";abc;abc;abc" becomes ";abc".
 *
 * The string is left untouched when it does not start with ';', when any
 * entry differs from the first one, or when the first entry is empty.
 */
void merge_repeated_extra_columns(char * cols){
	int first_len = -1;	/* length of the first entry; -1 until it is known */
	int entry_start = 1;	/* index just past the leading ';' */
	int pos;

	if(';' != cols[0]) return;

	for(pos = 1; ; pos++){
		const char ch = cols[pos];
		const int at_boundary = (ch == ';' || ch == '\0');

		if(at_boundary){
			if(first_len < 0){
				/* The first boundary after the leading ';' fixes the
				 * reference entry, which starts at cols+1. */
				first_len = pos - 1;
			}else{
				/* Compare lengths first so memcmp never reads beyond
				 * the entry being checked. */
				if(pos - entry_start != first_len) return;
				if(memcmp(cols + entry_start, cols + 1, first_len) != 0) return;
			}
			entry_start = pos + 1;
		}
		if(ch == '\0') break;
	}

	/* Every entry matched: cut the string right after the first one. */
	if(first_len > 0) cols[first_len + 1] = '\0';
}
7141
/*
 * Append `src` at a remembered write position, avoiding the rescan that
 * strcat would do.
 *
 * `*buf` is the current write position.  When it is NULL, writing starts at
 * `targ`.  After the call `*buf` points at the terminating NUL that was just
 * written, ready for the next append.  No bounds checking is done: the
 * destination must have room for the appended text and its terminator.
 */
void BUFstrcat(char * targ, char * src, char ** buf){
	char * write_at = (NULL == *buf) ? targ : *buf;
	const int n_bytes = strlen(src);

	memcpy(write_at, src, n_bytes);
	write_at += n_bytes;
	*write_at = 0;

	*buf = write_at;
}
7151
fc_write_final_gene_results(fc_thread_global_context_t * global_context,int * et_geneid,char ** et_chr,srInt_64 * et_start,srInt_64 * et_stop,unsigned char * et_strand,char ** et_extra_columns,const char * out_file,int features,ArrayList * column_numbers,ArrayList * column_names,fc_feature_info_t * loaded_features,int header_out)7152 void fc_write_final_gene_results(fc_thread_global_context_t * global_context, int * et_geneid, char ** et_chr, srInt_64 * et_start, srInt_64 * et_stop, unsigned char * et_strand, char ** et_extra_columns, const char * out_file, int features, ArrayList * column_numbers, ArrayList * column_names, fc_feature_info_t * loaded_features, int header_out)
7153 {
7154 int xk1,xk4;
7155 int genes = global_context -> gene_name_table -> numOfElements;
7156 read_count_type_t *gene_columns;
7157
7158 FILE * fp_out = f_subr_open(out_file,"w");
7159 if(!fp_out){
7160 SUBREADprintf("Failed to create file %s\n", out_file);
7161 return;
7162 }
7163
7164 if(header_out)
7165 {
7166 fprintf(fp_out, "# Program:featureCounts v%s", SUBREAD_VERSION);
7167 if(global_context->cmd_rebuilt)
7168 fprintf(fp_out, "; Command:%s", global_context->cmd_rebuilt);
7169 fprintf(fp_out, "\n");
7170 }
7171
7172 int i_files;
7173 fprintf(fp_out,"Geneid\t%sChr\tStart\tEnd\tStrand\tLength%s%s", global_context->do_detection_call?"GCfraction\t":"", global_context -> reported_extra_columns?"\t":"", global_context -> reported_extra_columns?global_context -> reported_extra_columns:"");
7174 for(i_files=0; i_files<column_names->numOfElements; i_files++)
7175 {
7176 char * next_fn = ArrayListGet(column_names, i_files);
7177 fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
7178 }
7179
7180 fprintf(fp_out,"\n");
7181
7182 gene_columns = calloc(sizeof(read_count_type_t) , genes * column_names->numOfElements);
7183 unsigned int * gene_exons_number = calloc(sizeof(unsigned int) , genes);
7184 unsigned int * gene_exons_pointer = calloc(sizeof(unsigned int) , genes);
7185 unsigned int * gene_exons_start = malloc(sizeof(unsigned int) * features);
7186 unsigned int * gene_exons_end = malloc(sizeof(unsigned int) * features);
7187 char ** gene_exons_chr = malloc(sizeof(char *) * features);
7188 char ** gene_exons_extra_columns = malloc(sizeof(char *) * features);
7189 char * gene_exons_strand = malloc(features);
7190
7191 for(xk1 = 0; xk1 < features; xk1++)
7192 {
7193 int gene_id = et_geneid[xk1];
7194 gene_exons_number[gene_id]++;
7195 }
7196
7197 unsigned int accumulative_no = 0;
7198 unsigned longest_gene_exons = 0;
7199 for(xk1 = 0 ; xk1 < genes; xk1++)
7200 {
7201 unsigned int this_gene_exons = gene_exons_number[xk1];
7202 longest_gene_exons = max(longest_gene_exons, this_gene_exons);
7203 gene_exons_number[xk1] = accumulative_no;
7204 accumulative_no += this_gene_exons;
7205 }
7206
7207 for(xk1 = 0; xk1 < features; xk1++)
7208 {
7209 int gene_id = et_geneid[xk1];
7210 int gene_write_ptr = gene_exons_number[gene_id] + gene_exons_pointer[gene_id];
7211
7212 gene_exons_chr[gene_write_ptr] = et_chr[xk1];
7213 gene_exons_start[gene_write_ptr] = et_start[xk1];
7214 gene_exons_end[gene_write_ptr] = et_stop[xk1];
7215 gene_exons_strand[gene_write_ptr] = et_strand[xk1];
7216 if(global_context -> reported_extra_columns!=NULL)gene_exons_extra_columns[gene_write_ptr] = et_extra_columns[xk1];
7217
7218 gene_exons_pointer[gene_id]++;
7219 }
7220
7221 for(xk1 = 0; xk1 < features; xk1++)
7222 {
7223 int gene_id = et_geneid[xk1], k_noempty = 0;
7224 for(i_files=0;i_files < column_names->numOfElements; i_files++)
7225 {
7226 srInt_64 * this_col = ArrayListGet(column_numbers, i_files);
7227 gene_columns[gene_id * column_names->numOfElements + k_noempty ] += this_col[xk1];
7228 k_noempty++;
7229 }
7230 }
7231
7232
7233 char *is_occupied = malloc(longest_gene_exons);
7234 unsigned int * input_start_stop_list = malloc(longest_gene_exons * sizeof(int) * 2);
7235 unsigned int * output_start_stop_list = malloc(longest_gene_exons * sizeof(int) * 2);
7236 int disk_is_full = 0;
7237
7238 char * out_chr_list = malloc(longest_gene_exons * (1+global_context -> longest_chro_name) + 1), * tmp_chr_list = NULL;
7239 char * out_start_list = malloc(11 * longest_gene_exons + 1), * tmp_start_list = NULL;
7240 char * out_end_list = malloc(11 * longest_gene_exons + 1), * tmp_end_list = NULL;
7241 char * out_strand_list = malloc(2 * longest_gene_exons + 1), * tmp_strand_list = NULL;
7242
7243 char * out_extra_columns[MAX_EXTRA_COLS];
7244 int out_extra_column_size[MAX_EXTRA_COLS];
7245 int total_extra_cols = 0;
7246 if(global_context -> reported_extra_columns){
7247 char * tnamep = global_context -> reported_extra_columns;
7248 total_extra_cols =1;
7249 while(*(tnamep++))
7250 total_extra_cols += '\t' ==(*tnamep);
7251 for(xk1=0; xk1<total_extra_cols; xk1++){
7252 out_extra_columns[xk1] = malloc(220);
7253 out_extra_column_size[xk1] = 220;
7254 }
7255 }
7256
7257
7258 for(xk1 = 0 ; xk1 < genes; xk1++)
7259 {
7260 int xk2;
7261
7262 memset(is_occupied,0,gene_exons_pointer[xk1]);
7263 tmp_chr_list = NULL;
7264 tmp_start_list = NULL;
7265 tmp_end_list = NULL;
7266 tmp_strand_list = NULL;
7267 out_chr_list[0]=0;
7268 out_start_list[0]=0;
7269 out_end_list[0]=0;
7270 out_strand_list[0]=0;
7271 for(xk4=0; xk4<total_extra_cols; xk4++)
7272 out_extra_columns[xk4][0]=0;
7273 int gene_nonoverlap_len =0;
7274
7275 unsigned char * gene_symbol = global_context -> gene_name_array [xk1];
7276 for(xk2=0; xk2<gene_exons_pointer[xk1]; xk2++)
7277 {
7278 if(!is_occupied[xk2])
7279 {
7280 int xk3;
7281 char * matched_chr = gene_exons_chr[xk2 + gene_exons_number[xk1]];
7282 char matched_strand = gene_exons_strand[xk2 + gene_exons_number[xk1]];
7283
7284 memset(input_start_stop_list, 0, gene_exons_pointer[xk1] * sizeof(int) * 2);
7285 int gap_merge_ptr = 1;
7286 input_start_stop_list[0] = gene_exons_start[xk2 + gene_exons_number[xk1]];
7287 input_start_stop_list[1] = gene_exons_end[xk2 + gene_exons_number[xk1]] + 1;
7288
7289 for(xk3 = xk2; xk3 < gene_exons_pointer[xk1]; xk3++)
7290 {
7291 if( global_context -> reported_extra_columns && (xk3==xk2 || (0 == is_occupied[xk3] && strcmp(matched_chr, gene_exons_chr[xk3+gene_exons_number[xk1]])==0 && matched_strand == gene_exons_strand[xk3 + gene_exons_number[xk1]] ))){
7292 char * this_col_ptr = NULL;
7293 char * this_col = strtok_r(gene_exons_extra_columns[xk3+gene_exons_number[xk1]], "\t", &this_col_ptr);
7294 for(xk4 = 0; xk4 < total_extra_cols; xk4++){
7295 int exlen = strlen( this_col), ollen = strlen(out_extra_columns[xk4]);
7296 if(ollen + exlen +2 > out_extra_column_size[xk4]){
7297 out_extra_column_size[xk4] = max(ollen + exlen +2, out_extra_column_size[xk4]);
7298 out_extra_columns[xk4] = realloc(out_extra_columns[xk4], out_extra_column_size[xk4]);
7299 }
7300 sprintf(out_extra_columns[xk4]+ollen,";%s", this_col);
7301 this_col = strtok_r(NULL, "\t", &this_col_ptr);
7302 }
7303 }
7304
7305 if(xk3==xk2)continue;
7306
7307 if((!is_occupied[xk3]) && strcmp(matched_chr, gene_exons_chr[xk3+gene_exons_number[xk1]])==0 && matched_strand == gene_exons_strand[xk3 + gene_exons_number[xk1]])
7308 {
7309 is_occupied[xk3]=1;
7310 input_start_stop_list[gap_merge_ptr*2] = gene_exons_start[xk3+gene_exons_number[xk1]];
7311 input_start_stop_list[gap_merge_ptr*2+1] = gene_exons_end[xk3+gene_exons_number[xk1]]+1;
7312
7313 gap_merge_ptr++;
7314 }
7315 }
7316
7317 {
7318 int merged_gaps = mergeIntervals(input_start_stop_list, output_start_stop_list, gap_merge_ptr);
7319
7320 for(xk3=0; xk3<gap_merge_ptr; xk3++)
7321 {
7322 char numbbuf[12];
7323 BUFstrcat(out_chr_list, matched_chr, &tmp_chr_list);
7324 BUFstrcat(out_chr_list, ";", &tmp_chr_list);
7325
7326 sprintf(numbbuf,"%u;", input_start_stop_list[xk3 * 2]);
7327 BUFstrcat(out_start_list, numbbuf, &tmp_start_list);
7328 sprintf(numbbuf,"%u;", input_start_stop_list[xk3 * 2 + 1] - 1);
7329 BUFstrcat(out_end_list, numbbuf, &tmp_end_list);
7330 sprintf(numbbuf,"%c;", (matched_strand==1)?'-':( ( matched_strand==0 )? '+':'.'));
7331 BUFstrcat(out_strand_list, numbbuf, &tmp_strand_list);
7332
7333 }
7334 for(xk3=0; xk3<merged_gaps; xk3++)
7335 gene_nonoverlap_len += output_start_stop_list[xk3 * 2 + 1] - output_start_stop_list[xk3 * 2];
7336 }
7337 }
7338 }
7339 #define _cut_tail(x) (x)[strlen(x)-1]=0
7340
7341 _cut_tail(out_chr_list);
7342 _cut_tail(out_start_list);
7343 _cut_tail(out_end_list);
7344 _cut_tail(out_strand_list);
7345
7346 char * QCcontent = "";
7347 char * QCtab = "";
7348 if(global_context -> do_detection_call){
7349 QCcontent = HashTableGet(global_context -> GCcontent_table, gene_symbol);
7350 QCtab = "\t";
7351 if(!QCcontent)QCcontent="nan";
7352 }
7353
7354 int wlen = fprintf(fp_out, "%s\t%s%s%s\t%s\t%s\t%s\t%d", gene_symbol, QCcontent, QCtab, out_chr_list, out_start_list, out_end_list, out_strand_list, gene_nonoverlap_len);
7355 for(xk4 = 0; xk4<total_extra_cols; xk4++){
7356 merge_repeated_extra_columns(out_extra_columns[xk4]);
7357 fprintf(fp_out, "\t%s", out_extra_columns[xk4]+1);
7358 }
7359
7360 for(i_files=0; i_files< column_names->numOfElements; i_files++)
7361 {
7362 read_count_type_t longlong_res = 0;
7363 double double_res = 0;
7364 int is_double_number = calc_float_fraction(gene_columns[i_files + column_names->numOfElements*xk1], &longlong_res, &double_res);
7365 if(is_double_number){
7366 fprintf(fp_out,"\t%.2f", double_res);
7367 }else{
7368 #ifdef __MINGW32__
7369 fprintf(fp_out,"\t%I64u", (srInt_64)longlong_res);
7370 #else
7371 fprintf(fp_out,"\t%lld", (srInt_64)longlong_res);
7372 #endif
7373 }
7374 }
7375 fprintf(fp_out,"\n");
7376 if(wlen < 6)disk_is_full = 1;
7377 }
7378
7379 for(xk1=0; xk1<total_extra_cols; xk1++) free(out_extra_columns[xk1]);
7380 free(is_occupied);
7381 free(input_start_stop_list);
7382 free(output_start_stop_list);
7383 free(out_chr_list);
7384 free(out_strand_list);
7385 free(out_start_list);
7386 free(out_end_list);
7387
7388 free(gene_exons_number);
7389 free(gene_exons_pointer);
7390 free(gene_columns);
7391 free(gene_exons_chr);
7392 free(gene_exons_extra_columns);
7393 free(gene_exons_start);
7394 free(gene_exons_end);
7395 free(gene_exons_strand);
7396 fclose(fp_out);
7397
7398 if(disk_is_full){
7399 SUBREADprintf("ERROR: disk is full; the count file cannot be generated.\n");
7400 unlink(out_file);
7401 }
7402 }
7403
/*
 * Write the per-file assignment summary table to "<out_file>.summary".
 *
 * The first row is a header: "Status" followed by one column per entry of
 * column_names (or "STDIN" when global_context->use_stdin_file is set).
 * Each following row starts with an assignment category name and then holds,
 * for every input file, the counter found at the matching index of that
 * file's srInt_64 array in read_counters.
 *
 * global_context : supplies use_stdin_file and is_split_or_exonic_only.
 * out_file       : name of the main count file; ".summary" is appended to it.
 * column_names   : ArrayList of input file names.
 * read_counters  : ArrayList of srInt_64 arrays, one array per input file.
 * isCVersion     : not used by this function.
 *
 * When writing fails, an error is reported and out_file (the main count
 * file, not the summary file) is removed.
 */
void fc_write_final_counts(fc_thread_global_context_t * global_context, const char * out_file, ArrayList * column_names, ArrayList * read_counters, int isCVersion)
{
	char fname[MAX_FILE_NAME_LENGTH];
	int i_files, xk1, disk_is_full = 0;

	/* Bounded formatting: an over-long out_file must not overflow fname. */
	int fname_len = snprintf(fname, sizeof fname, "%s.summary", out_file);
	if(fname_len < 0 || (size_t)fname_len >= sizeof fname){
		SUBREADprintf("Unable to create summary file '%s.summary'\n", out_file);
		return;
	}

	FILE * fp_out = f_subr_open(fname,"w");
	if(!fp_out){
		SUBREADprintf("Unable to create summary file '%s'\n", fname);
		return;
	}

	fprintf(fp_out,"Status");
	for(i_files=0; i_files<column_names->numOfElements; i_files++)
	{
		char * next_fn = ArrayListGet(column_names, i_files);
		fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}
	fprintf(fp_out,"\n");

	/* Row labels; the position of a label is also the index of its counter. */
	const char * keys [] ={ "Assigned" , "Unassigned_Unmapped", "Unassigned_Read_Type", "Unassigned_Singleton", "Unassigned_MappingQuality", "Unassigned_Chimera", "Unassigned_FragmentLength", "Unassigned_Duplicate", "Unassigned_MultiMapping" , "Unassigned_Secondary", (global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", "Unassigned_NoFeatures", "Unassigned_Overlapping_Length", "Unassigned_Ambiguity"};
	const int key_count = (int)(sizeof keys / sizeof keys[0]);

	for(xk1=0; xk1<key_count; xk1++)
	{
		fprintf(fp_out,"%s", keys[xk1]);
		for(i_files = 0; i_files < column_names->numOfElements; i_files ++)
		{
			srInt_64 * array_0 = ArrayListGet(read_counters,i_files);
			srInt_64 * cntr = array_0 + xk1;
			#ifdef __MINGW32__
			fprintf(fp_out,"\t%I64u", (srInt_64)*cntr);
			#else
			fprintf(fp_out,"\t%lld", (srInt_64)*cntr);
			#endif
		}
		int wlen = fprintf(fp_out,"\n");
		if(wlen < 1)disk_is_full = 1;
	}

	/* Output is buffered, so a failed write may only become visible on the
	 * stream's error indicator or when fclose() flushes the buffer. */
	if(ferror(fp_out)) disk_is_full = 1;
	if(fclose(fp_out) != 0) disk_is_full = 1;

	if(disk_is_full){
		SUBREADprintf("ERROR: disk is full; the count file cannot be generated.\n");
		unlink(out_file);
	}

}
/*
 * Write the feature-level count table to out_file.
 *
 * Layout of the output:
 *   - an optional "# Program:featureCounts v..." comment line (header_out),
 *     including the rebuilt command line when one is available;
 *   - a column header: Geneid, Chr, Start, End, Strand, Length, the reported
 *     extra columns (if any) and one column per entry of column_names
 *     ("STDIN" when global_context->use_stdin_file is set);
 *   - one row per entry of loaded_features.
 *
 * The count of feature i for file f is read from the f-th array of
 * column_numbers at index loaded_features[i].sorted_order.  Each raw value
 * goes through calc_float_fraction(); when that returns non-zero the value
 * is printed with two decimals, otherwise as an integer.
 *
 * When the file cannot be created a message is printed and nothing else is
 * done.  When writing fails, an error is reported and out_file is removed.
 */
void fc_write_final_results(fc_thread_global_context_t * global_context, const char * out_file, int features, ArrayList* column_numbers, ArrayList * column_names,fc_feature_info_t * loaded_features, int header_out)
{
	FILE * fp_out;
	int i, i_files = 0, disk_is_full =0;

	fp_out = f_subr_open(out_file,"w");
	if(!fp_out){
		SUBREADprintf("Failed to create file %s\n", out_file);
		return;
	}

	if(header_out)
	{
		fprintf(fp_out, "# Program:featureCounts v%s", SUBREAD_VERSION);
		if(global_context->cmd_rebuilt)
			fprintf(fp_out, "; Command:%s", global_context->cmd_rebuilt);
		fprintf(fp_out, "\n");
	}

	/* Column header. */
	fprintf(fp_out,"Geneid\tChr\tStart\tEnd\tStrand\tLength");
	if(global_context -> reported_extra_columns)fprintf(fp_out,"\t%s", global_context -> reported_extra_columns);

	for(i_files = 0; i_files < column_names -> numOfElements; i_files++){
		char * next_fn = ArrayListGet(column_names, i_files);
		fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}
	fprintf(fp_out,"\n");

	/* One row per feature. */
	for(i=0;i<features;i++)
	{
		/* Strand is printed as '-' for 1, '+' for 0 and '.' for anything else. */
		fprintf(fp_out,"%s\t%s\t%u\t%u\t%c\t%d%s%s", global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos,
			global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos + loaded_features[i].chro_name_pos_delta,
			loaded_features[i].start, loaded_features[i].end, loaded_features[i].is_negative_strand == 1?'-':( loaded_features[i].is_negative_strand == 0? '+':'.'),
			loaded_features[i].end-loaded_features[i].start+1, global_context -> reported_extra_columns ?"\t":"", global_context -> reported_extra_columns ?loaded_features[i].extra_columns:"");

		for(i_files=0; i_files < column_names -> numOfElements; i_files++)
		{
			srInt_64 * this_list = ArrayListGet(column_numbers, i_files);
			int sorted_exon_no = loaded_features[i].sorted_order;
			srInt_64 count_frac_raw = this_list[sorted_exon_no], longlong_res = 0;

			double double_res = 0;
			int is_double_number = calc_float_fraction(count_frac_raw, &longlong_res, &double_res);
			if(is_double_number){
				fprintf(fp_out,"\t%.2f", double_res);
			}else{
				#ifdef __MINGW32__
				fprintf(fp_out,"\t%I64d", (srInt_64)longlong_res);
				#else
				fprintf(fp_out,"\t%lld", (srInt_64)longlong_res);
				#endif
			}
		}
		int wlen = fprintf(fp_out,"\n");
		if(wlen < 1)disk_is_full = 1;
	}

	/* Output is buffered, so a failed write may only become visible on the
	 * stream's error indicator or when fclose() flushes the buffer. */
	if(ferror(fp_out)) disk_is_full = 1;
	if(fclose(fp_out) != 0) disk_is_full = 1;

	if(disk_is_full){
		SUBREADprintf("ERROR: disk is full; unable to write into the output file.\n");
		unlink(out_file);
	}
}
7518
/*
 * Long command-line options in the layout expected by getopt_long().
 *
 * Every entry has a NULL flag pointer, so getopt_long() returns the entry's
 * last field: 'S' for --order, 'G' for --genome and 0 for all the others.
 * The order of the entries (including the repeated "readExtension5" row) is
 * kept exactly as it was so that option indices stay unchanged.
 */
static struct option long_options[] =
{
	{"primary",               no_argument,       NULL, 0},
	{"readShiftSize",         required_argument, NULL, 0},
	{"readShiftType",         required_argument, NULL, 0},
	{"readExtension5",        required_argument, NULL, 0},
	{"readExtension5",        required_argument, NULL, 0},
	{"readExtension3",        required_argument, NULL, 0},
	{"read2pos",              required_argument, NULL, 0},
	{"minOverlap",            required_argument, NULL, 0},
	{"fracOverlap",           required_argument, NULL, 0},
	{"nonOverlap",            required_argument, NULL, 0},
	{"nonOverlapFeature",     required_argument, NULL, 0},
	{"fracOverlapFeature",    required_argument, NULL, 0},
	{"splitOnly",             no_argument,       NULL, 0},
	{"nonSplitOnly",          no_argument,       NULL, 0},
	{"debugCommand",          required_argument, NULL, 0},
	{"ignoreDup",             no_argument,       NULL, 0},
	{"donotsort",             no_argument,       NULL, 0},
	{"restrictedlyNoOverlap", no_argument,       NULL, 0},
	{"fraction",              no_argument,       NULL, 0},
	{"order",                 required_argument, NULL, 'S'},
	{"genome",                required_argument, NULL, 'G'},
	{"maxMOp",                required_argument, NULL, 0},
	{"tmpDir",                required_argument, NULL, 0},
	{"extraAttributes",       required_argument, NULL, 0},
	{"largestOverlap",        no_argument,       NULL, 0},
	{"countReadPairs",        no_argument,       NULL, 0},
	{"byReadGroup",           no_argument,       NULL, 0},
	{"verbose",               no_argument,       NULL, 0},
	{"detectionCall",         no_argument,       NULL, 0},
	{"Rpath",                 required_argument, NULL, 0},
	{"scSampleSheet",         required_argument, NULL, 0},
	{"scInputMode",           required_argument, NULL, 0},
	{"scCellBarcodeFile",     required_argument, NULL, 0},
	/* all-zero terminator required by getopt_long() */
	{NULL, 0, NULL, 0}
};
7556
print_usage()7557 void print_usage()
7558 {
7559 SUBREADprintf("\nVersion %s\n\n", SUBREAD_VERSION);
7560
7561 SUBREADputs("Usage: featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... \n");
7562 SUBREADputs("## Mandatory arguments:");
7563 SUBREADputs("");
7564 SUBREADputs(" -a <string> Name of an annotation file. GTF/GFF format by default. See");
7565 SUBREADputs(" -F option for more format information. Inbuilt annotations");
7566 SUBREADputs(" (SAF format) is available in 'annotation' directory of the");
7567 SUBREADputs(" package. Gzipped file is also accepted.");
7568 SUBREADputs("");
7569 SUBREADputs(" -o <string> Name of output file including read counts. A separate file");
7570 SUBREADputs(" including summary statistics of counting results is also");
7571 SUBREADputs(" included in the output ('<string>.summary'). Both files");
7572 SUBREADputs(" are in tab delimited format.");
7573 SUBREADputs("");
7574 SUBREADputs(" input_file1 [input_file2] ... A list of SAM or BAM format files. They can be");
7575 SUBREADputs(" either name or location sorted. If no files provided,");
7576 SUBREADputs(" <stdin> input is expected. Location-sorted paired-end reads");
7577 SUBREADputs(" are automatically sorted by read names.");
7578 SUBREADputs("");
7579
7580 SUBREADputs("## Optional arguments:");
7581 SUBREADputs("# Annotation");
7582 SUBREADputs("");
7583 SUBREADputs(" -F <string> Specify format of the provided annotation file. Acceptable");
7584 SUBREADputs(" formats include 'GTF' (or compatible GFF format) and");
7585 SUBREADputs(" 'SAF'. 'GTF' by default. For SAF format, please refer to");
7586 SUBREADputs(" Users Guide.");
7587 SUBREADputs("");
7588 SUBREADputs(" -t <string> Specify feature type(s) in a GTF annotation. If multiple");
7589 SUBREADputs(" types are provided, they should be separated by ',' with");
7590 SUBREADputs(" no space in between. 'exon' by default. Rows in the");
7591 SUBREADputs(" annotation with a matched feature will be extracted and");
7592 SUBREADputs(" used for read mapping. ");
7593 SUBREADputs("");
7594 SUBREADputs(" -g <string> Specify attribute type in GTF annotation. 'gene_id' by ");
7595 SUBREADputs(" default. Meta-features used for read counting will be ");
7596 SUBREADputs(" extracted from annotation using the provided value.");
7597 SUBREADputs("");
7598 SUBREADputs(" --extraAttributes Extract extra attribute types from the provided GTF");
7599 SUBREADputs(" annotation and include them in the counting output. These");
7600 SUBREADputs(" attribute types will not be used to group features. If");
7601 SUBREADputs(" more than one attribute type is provided they should be");
7602 SUBREADputs(" separated by comma.");
7603 SUBREADputs("");
7604 SUBREADputs(" -A <string> Provide a chromosome name alias file to match chr names in");
7605 SUBREADputs(" annotation with those in the reads. This should be a two-");
7606 SUBREADputs(" column comma-delimited text file. Its first column should");
7607 SUBREADputs(" include chr names in the annotation and its second column");
7608 SUBREADputs(" should include chr names in the reads. Chr names are case");
7609 SUBREADputs(" sensitive. No column header should be included in the");
7610 SUBREADputs(" file.");
7611 SUBREADputs("");
7612
7613 SUBREADputs("# Level of summarization");
7614 SUBREADputs("");
7615 SUBREADputs(" -f Perform read counting at feature level (eg. counting ");
7616 SUBREADputs(" reads for exons rather than genes).");
7617 SUBREADputs("");
7618
7619 SUBREADputs("# Overlap between reads and features");
7620 SUBREADputs("");
7621 SUBREADputs(" -O Assign reads to all their overlapping meta-features (or ");
7622 SUBREADputs(" features if -f is specified).");
7623 SUBREADputs("");
7624 SUBREADputs(" --minOverlap <int> Minimum number of overlapping bases in a read that is");
7625 SUBREADputs(" required for read assignment. 1 by default. Number of");
7626 SUBREADputs(" overlapping bases is counted from both reads if paired");
7627 SUBREADputs(" end. If a negative value is provided, then a gap of up");
7628 SUBREADputs(" to specified size will be allowed between read and the");
7629 SUBREADputs(" feature that the read is assigned to.");
7630 SUBREADputs("");
7631 SUBREADputs(" --fracOverlap <float> Minimum fraction of overlapping bases in a read that is");
7632 SUBREADputs(" required for read assignment. Value should be within range");
7633 SUBREADputs(" [0,1]. 0 by default. Number of overlapping bases is");
7634 SUBREADputs(" counted from both reads if paired end. Both this option");
7635 SUBREADputs(" and '--minOverlap' option need to be satisfied for read");
7636 SUBREADputs(" assignment.");
7637 SUBREADputs("");
7638 SUBREADputs(" --fracOverlapFeature <float> Minimum fraction of overlapping bases in a");
7639 SUBREADputs(" feature that is required for read assignment. Value");
7640 SUBREADputs(" should be within range [0,1]. 0 by default.");
7641 SUBREADputs("");
7642 SUBREADputs(" --largestOverlap Assign reads to a meta-feature/feature that has the ");
7643 SUBREADputs(" largest number of overlapping bases.");
7644 SUBREADputs("");
7645 SUBREADputs(" --nonOverlap <int> Maximum number of non-overlapping bases in a read (or a");
7646 SUBREADputs(" read pair) that is allowed when being assigned to a");
7647 SUBREADputs(" feature. No limit is set by default.");
7648 SUBREADputs("");
7649 SUBREADputs(" --nonOverlapFeature <int> Maximum number of non-overlapping bases in a feature");
7650 SUBREADputs(" that is allowed in read assignment. No limit is set by");
7651 SUBREADputs(" default.");
7652 SUBREADputs("");
7653 SUBREADputs(" --readExtension5 <int> Reads are extended upstream by <int> bases from their");
7654 SUBREADputs(" 5' end.");
7655 SUBREADputs("");
7656 SUBREADputs(" --readExtension3 <int> Reads are extended upstream by <int> bases from their");
7657 SUBREADputs(" 3' end.");
7658 SUBREADputs("");
7659 SUBREADputs(" --read2pos <5:3> Reduce reads to their 5' most base or 3' most base. Read");
7660 SUBREADputs(" counting is then performed based on the single base the ");
7661 SUBREADputs(" read is reduced to.");
7662 SUBREADputs("");
7663
7664 SUBREADputs("# Multi-mapping reads");
7665 SUBREADputs("");
7666 SUBREADputs(" -M Multi-mapping reads will also be counted. For a multi-");
7667 SUBREADputs(" mapping read, all its reported alignments will be ");
7668 SUBREADputs(" counted. The 'NH' tag in BAM/SAM input is used to detect ");
7669 SUBREADputs(" multi-mapping reads.");
7670 SUBREADputs("");
7671 SUBREADputs("# Fractional counting");
7672 SUBREADputs("");
7673 SUBREADputs(" --fraction Assign fractional counts to features. This option must");
7674 SUBREADputs(" be used together with '-M' or '-O' or both. When '-M' is");
7675 SUBREADputs(" specified, each reported alignment from a multi-mapping");
7676 SUBREADputs(" read (identified via 'NH' tag) will carry a fractional");
7677 SUBREADputs(" count of 1/x, instead of 1 (one), where x is the total");
7678 SUBREADputs(" number of alignments reported for the same read. When '-O'");
7679 SUBREADputs(" is specified, each overlapping feature will receive a");
7680 SUBREADputs(" fractional count of 1/y, where y is the total number of");
7681 SUBREADputs(" features overlapping with the read. When both '-M' and");
7682 SUBREADputs(" '-O' are specified, each alignment will carry a fractional");
7683 SUBREADputs(" count of 1/(x*y).");
7684 SUBREADputs("");
7685
7686
7687 SUBREADputs("# Read filtering");
7688 SUBREADputs("");
7689 SUBREADputs(" -Q <int> The minimum mapping quality score a read must satisfy in");
7690 SUBREADputs(" order to be counted. For paired-end reads, at least one");
7691 SUBREADputs(" end should satisfy this criteria. 0 by default.");
7692 SUBREADputs("");
7693 SUBREADputs(" --splitOnly Count split alignments only (ie. alignments with CIGAR");
7694 SUBREADputs(" string containing 'N'). An example of split alignments is");
7695 SUBREADputs(" exon-spanning reads in RNA-seq data.");
7696 SUBREADputs("");
7697 SUBREADputs(" --nonSplitOnly If specified, only non-split alignments (CIGAR strings do");
7698 SUBREADputs(" not contain letter 'N') will be counted. All the other");
7699 SUBREADputs(" alignments will be ignored.");
7700 SUBREADputs("");
7701 SUBREADputs(" --primary Count primary alignments only. Primary alignments are ");
7702 SUBREADputs(" identified using bit 0x100 in SAM/BAM FLAG field.");
7703 SUBREADputs("");
7704 SUBREADputs(" --ignoreDup Ignore duplicate reads in read counting. Duplicate reads ");
7705 SUBREADputs(" are identified using bit Ox400 in BAM/SAM FLAG field. The ");
7706 SUBREADputs(" whole read pair is ignored if one of the reads is a ");
7707 SUBREADputs(" duplicate read for paired end data.");
7708 SUBREADputs("");
7709
7710 SUBREADputs("# Strandness");
7711 SUBREADputs("");
7712 SUBREADputs(" -s <int or string> Perform strand-specific read counting. A single integer");
7713 SUBREADputs(" value (applied to all input files) or a string of comma-");
7714 SUBREADputs(" separated values (applied to each corresponding input");
7715 SUBREADputs(" file) should be provided. Possible values include:");
7716 SUBREADputs(" 0 (unstranded), 1 (stranded) and 2 (reversely stranded).");
7717 SUBREADputs(" Default value is 0 (ie. unstranded read counting carried");
7718 SUBREADputs(" out for all input files).");
7719 SUBREADputs("");
7720
7721 SUBREADputs("# Exon-exon junctions");
7722 SUBREADputs("");
7723 SUBREADputs(" -J Count number of reads supporting each exon-exon junction.");
7724 SUBREADputs(" Junctions were identified from those exon-spanning reads");
7725 SUBREADputs(" in the input (containing 'N' in CIGAR string). Counting");
7726 SUBREADputs(" results are saved to a file named '<output_file>.jcounts'");
7727 SUBREADputs("");
7728 SUBREADputs(" -G <string> Provide the name of a FASTA-format file that contains the");
7729 SUBREADputs(" reference sequences used in read mapping that produced the");
7730 SUBREADputs(" provided SAM/BAM files. This optional argument can be used");
7731 SUBREADputs(" with '-J' option to improve read counting for junctions.");
7732 SUBREADputs("");
7733
7734 SUBREADputs("# Parameters specific to paired end reads");
7735 SUBREADputs("");
7736 SUBREADputs(" -p If specified, libraries are assumed to contain paired-end");
7737 SUBREADputs(" reads. For any library that contains paired-end reads, the");
7738 SUBREADputs(" 'countReadPairs' parameter controls if read pairs or reads");
7739 SUBREADputs(" should be counted.");
7740 SUBREADputs("");
7741 SUBREADputs(" --countReadPairs If specified, fragments (or templates) will be counted");
7742 SUBREADputs(" instead of reads. This option is only applicable for");
7743 SUBREADputs(" paired-end reads. For single-end data, it is ignored.");
7744 SUBREADputs("");
7745 SUBREADputs(" -B Only count read pairs that have both ends aligned.");
7746 SUBREADputs("");
7747 SUBREADputs(" -P Check validity of paired-end distance when counting read ");
7748 SUBREADputs(" pairs. Use -d and -D to set thresholds.");
7749 SUBREADputs("");
7750 SUBREADputs(" -d <int> Minimum fragment/template length, 50 by default.");
7751 SUBREADputs("");
7752 SUBREADputs(" -D <int> Maximum fragment/template length, 600 by default.");
7753 SUBREADputs("");
7754 SUBREADputs(" -C Do not count read pairs that have their two ends mapping ");
7755 SUBREADputs(" to different chromosomes or mapping to same chromosome ");
7756 SUBREADputs(" but on different strands.");
7757 SUBREADputs("");
7758 SUBREADputs(" --donotsort Do not sort reads in BAM/SAM input. Note that reads from ");
7759 SUBREADputs(" the same pair are required to be located next to each ");
7760 SUBREADputs(" other in the input.");
7761 SUBREADputs("");
7762
7763 SUBREADputs("# Number of CPU threads");
7764 SUBREADputs("");
7765 SUBREADputs(" -T <int> Number of the threads. 1 by default.");
7766 SUBREADputs("");
7767
7768 SUBREADputs("# Read groups");
7769 SUBREADputs("");
7770 SUBREADputs(" --byReadGroup Assign reads by read group. \"RG\" tag is required to be");
7771 SUBREADputs(" present in the input BAM/SAM files.");
7772 SUBREADputs(" ");
7773 SUBREADputs("");
7774
7775 SUBREADputs("# Long reads");
7776 SUBREADputs("");
7777 SUBREADputs(" -L Count long reads such as Nanopore and PacBio reads. Long");
7778 SUBREADputs(" read counting can only run in one thread and only reads");
7779 SUBREADputs(" (not read-pairs) can be counted. There is no limitation on");
7780 SUBREADputs(" the number of 'M' operations allowed in a CIGAR string in");
7781 SUBREADputs(" long read counting.");
7782 SUBREADputs("");
7783
7784 SUBREADputs("# Assignment results for each read");
7785 SUBREADputs("");
7786 SUBREADputs(" -R <format> Output detailed assignment results for each read or read-");
7787 SUBREADputs(" pair. Results are saved to a file that is in one of the");
7788 SUBREADputs(" following formats: CORE, SAM and BAM. See Users Guide for");
7789 SUBREADputs(" more info about these formats.");
7790 SUBREADputs("");
7791 SUBREADputs(" --Rpath <string> Specify a directory to save the detailed assignment");
7792 SUBREADputs(" results. If unspecified, the directory where counting");
7793 SUBREADputs(" results are saved is used.");
7794 SUBREADputs("");
7795
7796 SUBREADputs("# Miscellaneous");
7797 SUBREADputs("");
7798 SUBREADputs(" --tmpDir <string> Directory under which intermediate files are saved (later");
7799 SUBREADputs(" removed). By default, intermediate files will be saved to");
7800 SUBREADputs(" the directory specified in '-o' argument.");
7801 SUBREADputs("");
7802 SUBREADputs(" --maxMOp <int> Maximum number of 'M' operations allowed in a CIGAR");
7803 SUBREADputs(" string. 10 by default. Both 'X' and '=' are treated as 'M'");
7804 SUBREADputs(" and adjacent 'M' operations are merged in the CIGAR");
7805 SUBREADputs(" string.");
7806 SUBREADputs("");
7807 SUBREADputs(" --verbose Output verbose information for debugging, such as un-");
7808 SUBREADputs(" matched chromosome/contig names.");
7809 SUBREADputs("");
7810 SUBREADputs(" -v Output version of the program.");
7811 SUBREADputs("");
7812
7813 }
7814
junckey_sort_compare(void * inptr,int i,int j)7815 int junckey_sort_compare(void * inptr, int i, int j){
7816 char ** inp = (char **) inptr;
7817 int x1;
7818
7819 int chrI=-1, chrJ=-1;
7820
7821 if(atoi(inp[i])>0) chrI = atoi(inp[i]);
7822 if(atoi(inp[j])>0) chrJ = atoi(inp[j]);
7823
7824 if(inp[i][0]=='X' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 90;
7825 if(inp[i][0]=='Y' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 91;
7826 if(inp[i][0]=='M' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 99;
7827 if(inp[j][0]=='X' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 90;
7828 if(inp[j][0]=='Y' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 91;
7829 if(inp[j][0]=='M' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 99;
7830
7831
7832
7833 if(memcmp(inp[i], "chr", 3)==0){
7834 chrI=atoi(inp[i]+3);
7835 if(0 == chrI && inp[i][3] == 'X') chrI = 90;
7836 if(0 == chrI && inp[i][3] == 'Y') chrI = 91;
7837 if(0 == chrI && inp[i][3] == 'M') chrI = 99;
7838 }
7839 if(memcmp(inp[j], "chr", 3)==0){
7840 chrJ=atoi(inp[j]+3);
7841 if(0 == chrJ && inp[j][3] == 'X') chrJ = 90;
7842 if(0 == chrJ && inp[j][3] == 'Y') chrJ = 91;
7843 if(0 == chrJ && inp[j][3] == 'M') chrJ = 99;
7844 }
7845
7846 int len_I_long = 9;
7847 for(x1 = 0 ; x1 < FEATURE_NAME_LENGTH + 15 ; x1++){
7848 int c1 = inp[i][x1];
7849 int c2 = inp[j][x1];
7850 if(c1 == '\t' && c2 != '\t')
7851 len_I_long = -1;
7852 else if(c1 != '\t' && c2 == '\t')
7853 len_I_long = 1;
7854 else if(c1 == '\t' && c2 == '\t')
7855 len_I_long = 0;
7856
7857 if(len_I_long != 9) break;
7858 }
7859
7860 if(chrI != chrJ || len_I_long != 0){
7861 return (chrI * 100 + len_I_long) - (chrJ * 100);
7862 }
7863
7864 for(x1 = 0 ; x1 < FEATURE_NAME_LENGTH + 15 ; x1++){
7865 int c1 = inp[i][x1];
7866 int c2 = inp[j][x1];
7867 if(c1 != c2){
7868 return c1 - c2;
7869 }else if(c1 == '\t' && c1 == c2){
7870 int pos1 = atoi(inp[i]+x1+1);
7871 int pos2 = atoi(inp[j]+x1+1);
7872 if( pos1 == pos2)
7873 return strcmp(inp[i], inp[j]);
7874 else
7875 return pos1 - pos2;
7876 }
7877
7878 if(c1 == 0 || c2 == 0)return c1 - c2;
7879 }
7880 return 0;
7881 }
7882
/*
 * Swap callback of the key sort: exchanges the key pointers stored at
 * positions i and j of the char * array passed as inptr.
 */
void junckey_sort_exchange(void * inptr, int i, int j){
	char ** keys = (char **) inptr;
	char * held = keys[i];

	keys[i] = keys[j];
	keys[j] = held;
}
7890
/*
 * Merge callback of the key sort.
 *
 * Merges two adjacent runs of the key array -- items1 keys beginning at
 * index start, directly followed by items2 keys -- into one run ordered by
 * junckey_sort_compare().  The merged order is built in a temporary array
 * and then copied back over the two runs.  When two keys compare equal, the
 * one from the first run is placed first.
 */
void junckey_sort_merge(void * inptr, int start, int items1, int items2){
	char ** keys = (char **) inptr;
	const int first_end = start + items1;
	const int second_end = first_end + items2;
	const int total = items1 + items2;
	char ** merged = malloc(sizeof(char *) * total);
	int left = start, right = first_end, filled;

	for(filled = 0; filled < total; filled++){
		int take_second;

		if(left == first_end)
			take_second = 1;	/* first run is used up */
		else if(right < second_end)
			take_second = junckey_sort_compare(inptr, left, right) > 0;
		else
			take_second = 0;	/* second run is used up */

		if(take_second)
			merged[filled] = keys[right++];
		else
			merged[filled] = keys[left++];
	}

	memcpy(keys + start, merged, sizeof(char *) * total);
	free(merged);
}
7908
/*
 * Equality test on the gene names of two junction-gene records.
 * Returns 0 when the names are identical and 1 otherwise; the result
 * carries no ordering information.
 */
int junccmp(fc_junction_gene_t * j1, fc_junction_gene_t * j2){
	return strcmp(j1 -> gene_name, j2 -> gene_name) != 0;
}
7914
7915
fc_write_final_junctions(fc_thread_global_context_t * global_context,char * output_file_name,ArrayList * column_names,ArrayList * junction_global_table_list,ArrayList * splicing_global_table_list)7916 void fc_write_final_junctions(fc_thread_global_context_t * global_context, char * output_file_name, ArrayList * column_names, ArrayList * junction_global_table_list, ArrayList * splicing_global_table_list){
7917 int infile_i, disk_is_full = 0;
7918
7919 HashTable * merged_junction_table = HashTableCreate(156679);
7920
7921 HashTableSetHashFunction(merged_junction_table,HashTableStringHashFunction);
7922 HashTableSetDeallocationFunctions(merged_junction_table, NULL, NULL);
7923 HashTableSetKeyComparisonFunction(merged_junction_table, fc_strcmp_chro);
7924
7925 HashTable * merged_splicing_table = HashTableCreate(156679);
7926
7927 HashTableSetHashFunction(merged_splicing_table,HashTableStringHashFunction);
7928 HashTableSetDeallocationFunctions(merged_splicing_table, NULL, NULL);
7929 HashTableSetKeyComparisonFunction(merged_splicing_table, fc_strcmp_chro);
7930
7931
7932 for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
7933 KeyValuePair * cursor;
7934 int bucket;
7935 HashTable * spl_table = ArrayListGet(splicing_global_table_list, infile_i);
7936 for(bucket=0; bucket < spl_table -> numOfBuckets; bucket++)
7937 {
7938 cursor = spl_table -> bucketArray[bucket];
7939 while (cursor)
7940 {
7941 char * ky = (char *)cursor -> key;
7942 unsigned int old_supp = HashTableGet(merged_splicing_table, ky) - NULL;
7943 old_supp += (cursor -> value - NULL);
7944 HashTablePut(merged_splicing_table, ky, NULL+old_supp);
7945 cursor = cursor -> next;
7946 }
7947 }
7948 }
7949
7950 for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
7951 KeyValuePair * cursor;
7952 int bucket;
7953 HashTable * junc_table = ArrayListGet(junction_global_table_list, infile_i);
7954 for(bucket=0; bucket < junc_table -> numOfBuckets; bucket++)
7955 {
7956 cursor = junc_table -> bucketArray[bucket];
7957 while (cursor)
7958 {
7959 char * ky = (char *)cursor -> key;
7960
7961 if(HashTableGet(merged_junction_table, ky)==NULL)
7962 HashTablePut(merged_junction_table, ky, NULL+1);
7963 cursor = cursor -> next;
7964 }
7965 }
7966 }
7967
7968 char ** key_list;
7969 key_list = malloc(sizeof(char *) * merged_junction_table -> numOfElements);
7970
7971 KeyValuePair * cursor;
7972 int bucket, ky_i = 0;
7973 for(bucket=0; bucket < merged_junction_table -> numOfBuckets; bucket++){
7974 cursor = merged_junction_table -> bucketArray[bucket];
7975 while (cursor){
7976 char * ky = (char *)cursor -> key;
7977
7978 key_list[ky_i ++] = ky;
7979 cursor = cursor -> next;
7980 }
7981 }
7982
7983 merge_sort(key_list, merged_junction_table -> numOfElements , junckey_sort_compare, junckey_sort_exchange, junckey_sort_merge);
7984
7985 char outfname[MAX_FILE_NAME_LENGTH];
7986 sprintf(outfname, "%s.jcounts", output_file_name);
7987
7988 int max_junction_genes = 3000;
7989 char * gene_names = malloc(max_junction_genes * FEATURE_NAME_LENGTH), * gene_name_tail;
7990 fc_junction_gene_t ** ret_juncs_small = malloc(sizeof(fc_junction_gene_t *) * max_junction_genes);
7991 fc_junction_gene_t ** ret_juncs_large = malloc(sizeof(fc_junction_gene_t *) * max_junction_genes);
7992 fc_junction_gene_t ** junction_key_list = malloc(sizeof(fc_junction_gene_t *)* max_junction_genes * 2);
7993 unsigned int * junction_support_list = malloc(sizeof(int)* max_junction_genes * 2);
7994 unsigned char * junction_source_list = malloc(sizeof(char)* max_junction_genes * 2 );
7995
7996 int ky_i1, ky_i2;
7997 FILE * ofp = fopen(outfname, "w");
7998 char * tmpp = NULL;
7999
8000 fprintf(ofp, "PrimaryGene\tSecondaryGenes\tSite1_chr\tSite1_location\tSite1_strand\tSite2_chr\tSite2_location\tSite2_strand");
8001
8002 for(infile_i=0; infile_i < column_names -> numOfElements; infile_i++)
8003 {
8004 char * next_fn = ArrayListGet(column_names, infile_i);
8005 fprintf(ofp,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
8006 }
8007 fprintf(ofp, "\n");
8008
8009 for(ky_i = 0; ky_i < merged_junction_table -> numOfElements ; ky_i ++){
8010
8011 //SUBREADprintf("KY=%s\n", key_list[ky_i]);
8012
8013 int unique_junctions = 0;
8014 char * chro_small = strtok_r( key_list[ky_i] , "\t", &tmpp);
8015 char * pos_small_str = strtok_r( NULL, "\t", &tmpp);
8016 char * chro_large = strtok_r( NULL, "\t", &tmpp);
8017 char * pos_large_str = strtok_r( NULL, "\t", &tmpp);
8018
8019 unsigned int pos_small = atoi(pos_small_str);
8020 unsigned int pos_large = atoi(pos_large_str);
8021
8022 int found_features_small = locate_junc_features(global_context, chro_small, pos_small, ret_juncs_small , max_junction_genes);
8023 int found_features_large = locate_junc_features(global_context, chro_large, pos_large, ret_juncs_large , max_junction_genes);
8024
8025 char * strand = "NA";
8026 if(global_context -> fasta_contigs){
8027 char donor[3], receptor[3];
8028 donor[2]=receptor[2]=0;
8029 int has = !get_contig_fasta(global_context -> fasta_contigs, chro_small, pos_small, 2, donor);
8030 has = has && !get_contig_fasta(global_context -> fasta_contigs, chro_large, pos_large-3, 2, receptor);
8031 if(has){
8032 if(donor[0]=='G' && donor[1]=='T' && receptor[0]=='A' && receptor[1]=='G') strand = "+";
8033 else if(donor[0]=='C' && donor[1]=='T' && receptor[0]=='A' && receptor[1]=='C') strand = "-";
8034 }else if(!global_context ->is_junction_no_chro_shown){
8035 global_context ->is_junction_no_chro_shown = 1;
8036 print_in_box(80,0,0, " WARNING contig '%s' is not found in the", chro_small);
8037 print_in_box(80,0,0, " provided genome file.");
8038 print_in_box(80,0,0,"");
8039
8040 }
8041 }
8042
8043 //SUBREADprintf("FOUND=%d, %d\n", found_features_small, found_features_large);
8044
8045 gene_name_tail = gene_names;
8046 gene_names[0]=0;
8047
8048 // rules to choose the primary gene:
8049 // (1) if some genes have one support but the other have multiple supporting reads: remove the lowly supported genes
8050 // (2) if all genes have only one support but from different ends of the fragment, then remove the genes that are assigned to the end having lower supporting fragments
8051 // (3) choose the gene that have the smallest coordinate.
8052
8053 int max_supp = 0;
8054 for(ky_i1 = 0; ky_i1 < found_features_small + found_features_large; ky_i1++){
8055 int is_duplicate = 0;
8056 fc_junction_gene_t * tested_key = (ky_i1 < found_features_small)?ret_juncs_small[ky_i1] :ret_juncs_large[ky_i1 - found_features_small];
8057 for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
8058 if(junccmp( tested_key, junction_key_list[ky_i2] )==0){
8059 junction_support_list[ ky_i2 ] ++;
8060 junction_source_list[ky_i2] |= ( (ky_i1 < found_features_small)? 1 : 2 );
8061 is_duplicate = 1;
8062
8063 max_supp = max(junction_support_list[ky_i2], max_supp);
8064 break;
8065 }
8066 }
8067
8068 if(!is_duplicate){
8069 junction_key_list[unique_junctions] = tested_key;
8070 junction_support_list[unique_junctions] = 1;
8071 junction_source_list[unique_junctions] = ( (ky_i1 < found_features_small)? 1 : 2 );
8072 max_supp = max(junction_support_list[unique_junctions], max_supp);
8073 unique_junctions++;
8074 }
8075 }
8076
8077 if(1 == max_supp){
8078 if(found_features_small > 0 && found_features_large > 0){
8079 char junc_key [FEATURE_NAME_LENGTH + 15];
8080 sprintf(junc_key, "%s\t%u", chro_small, pos_small);
8081 unsigned int supp_small = HashTableGet(merged_splicing_table, junc_key) - NULL;
8082 sprintf(junc_key, "%s\t%u", chro_large, pos_large);
8083 unsigned int supp_large = HashTableGet(merged_splicing_table, junc_key) - NULL;
8084
8085 if(supp_small !=supp_large){
8086 for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
8087 if(supp_small > supp_large && junction_source_list[ky_i2] == 1) junction_key_list[ky_i2] = NULL;
8088 else if(supp_small < supp_large && junction_source_list[ky_i2] == 2) junction_key_list[ky_i2] = NULL;
8089 }
8090 }
8091 }
8092 }
8093
8094 int smallest_coordinate_gene = 0x7fffffff;
8095 fc_junction_gene_t * primary_gene = NULL;
8096
8097 for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
8098 fc_junction_gene_t * tested_key = junction_key_list[ky_i2];
8099 if(tested_key != NULL && junction_support_list[ky_i2] == max_supp && tested_key -> pos_first_base < smallest_coordinate_gene){
8100 primary_gene = tested_key;
8101 smallest_coordinate_gene = tested_key -> pos_first_base;
8102 }
8103 }
8104
8105 if(primary_gene == NULL){
8106 strcpy(gene_names, "NA");
8107 }else{
8108 strcpy(gene_names, primary_gene -> gene_name);
8109 }
8110
8111 *(pos_small_str-1)='\t';
8112 *(pos_large_str-1)='\t';
8113
8114 fprintf(ofp, "%s", gene_names);
8115
8116 gene_name_tail = gene_names;
8117 gene_names[0]=0;
8118 for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
8119 fc_junction_gene_t * tested_key = junction_key_list[ky_i2];
8120 if(tested_key && tested_key != primary_gene)
8121 gene_name_tail += sprintf(gene_name_tail, "%s,", tested_key -> gene_name);
8122 }
8123 if( gene_names[0] ) gene_name_tail[-1]=0;
8124 else strcpy(gene_names, "NA");
8125 fprintf(ofp, "\t%s", gene_names);
8126
8127 fprintf(ofp, "\t%s\t%s\t%s\t%s", chro_small, strand, chro_large, strand);
8128
8129 chro_large[-1]='\t';
8130
8131 for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
8132 HashTable * junc_table = ArrayListGet(junction_global_table_list, infile_i);
8133 srInt_64 count = HashTableGet(junc_table, key_list[ky_i]) - NULL;
8134 #ifdef __MINGW32__
8135 fprintf(ofp,"\t%I64d", count);
8136 #else
8137 fprintf(ofp,"\t%lld", count);
8138 #endif
8139 }
8140 int wlen = fprintf(ofp, "\n");
8141 if(wlen < 1) disk_is_full = 1;
8142 }
8143 fclose(ofp);
8144 free(junction_key_list);
8145 free(gene_names);
8146 free(ret_juncs_small);
8147 free(ret_juncs_large);
8148 free(junction_support_list);
8149 free(key_list);
8150 free(junction_source_list);
8151
8152 //print_in_box(80,0,PRINT_BOX_CENTER,"Found %llu junctions in all the input files.", merged_junction_table -> numOfElements);
8153 //print_in_box(80,0,0,"");
8154
8155 HashTableDestroy(merged_junction_table);
8156 HashTableDestroy(merged_splicing_table);
8157 if(disk_is_full){
8158 unlink(outfname);
8159 SUBREADprintf("ERROR: disk is full; no junction counting table is generated.\n");
8160 }
8161 }
8162
scRNA_copy_loaded_features(srInt_64 nexons,fc_feature_info_t * loaded_features)8163 HashTable * scRNA_copy_loaded_features(srInt_64 nexons, fc_feature_info_t* loaded_features){
8164 HashTable * ret = HashTableCreate(50000);
8165 srInt_64 x1;
8166 for(x1 =0; x1<nexons; x1++)
8167 HashTablePut(ret , NULL +1 +loaded_features[x1].sorted_order, NULL +1 +x1);
8168 return ret;
8169 }
8170
8171 int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table, fc_feature_info_t * loaded_features);
8172
Input_Files_And_Strand_Mode_Pair(char * fnames,char * smodes)8173 int Input_Files_And_Strand_Mode_Pair(char * fnames, char * smodes){
8174 int ret = 0, ch, bad_fmt = 0, numbs = 0;
8175 //SUBREADputs(fnames);
8176 //SUBREADputs(smodes);
8177 if(strstr(smodes, ".")==NULL){
8178 bad_fmt = smodes[0]<'0' || smodes[0]>'2';
8179 }else{
8180 while('\0'!=(ch=*(fnames++)))if(ch == FC_FLIST_SPLITOR[0])ret++;
8181 while('\0'!=(ch=*(smodes++))){
8182 if(ch == '.'){
8183 if(numbs != 1) bad_fmt = 1;
8184 numbs = 0;
8185 ret--;
8186 }else if(ch >= '0' && ch <= '2') numbs++;
8187 }
8188 if(numbs != 1) bad_fmt = 1;
8189 }
8190 if(bad_fmt) SUBREADputs("Error: The strand mode list has a wrong format.");
8191 if(ret) SUBREADputs("Error: The length of strand mode list differs from the length of input file list");
8192 ret |= bad_fmt;
8193 return ret;
8194 }
8195
readSummary(int argc,char * argv[])8196 int readSummary(int argc,char *argv[]){
8197
8198 /*
8199 This function counts the number of reads falling into each exon region.
8200 The order of exons in the output is the same as that of exons included in the annotation.
8201 The annotation, if provided as a file, should be sorted by chromosome name.
8202
8203 Parameters passed from the featureCounts R function:
8204 0: "readSummary"
8205 1: ann
8206 2: files[i]
8207 3: fout
8208 4: as.numeric(isPairedEnd)
8209 5: min.distance
8210 6: max.distance
8211 7: as.numeric(tolower(file.type)=="sam")
8212 8: as.numeric(allowMultiOverlap)
8213 9: as.numeric(isGeneLevel)
8214 10: as.numeric(nthreads)
8215 11: as.numeric(isGTFannotation)
8216 12: isStrandChecked
8217 13: as.numeric(isReadSummaryReported)
8218 14: as.numeric(isBothEndMapped)
8219 15: as.numeric(isChimericDisallowed)
8220 16: as.numeric(isPEDistChecked)
8221 17: nameFeatureTypeColumn
8222 18: nameGeneIDColumn
8223 19: min.MappingQualityScore
8224 20: as.numeric(isMultiMappingAllowed) # "1" : NH tag > 1 is allowed ; "0" : not allowed (by default)
8225 21: Annotation Chromosome Alias Name File. If the file is not specified, set this value to NULL or a zero-length string.
8226 22: Command line for CfeatureCounts header output; RfeatureCounts should set this value to NULL or a zero-length string or a space (' ').
8227 23: as.numeric(isInputFileResortNeeded)
8228 24: NOT IN USE: as.numeric(feature_block_size) # This parameter is no longer used. Give "14" for safe.
8229 25: as.numeric(Five_End_Extension_Length) # 5' end extension
8230 26: as.numeric(Three_End_Extension_Length) # 3' end extension
8231 27: as.numeric(Minimum_Overlap_Between_Read_And_Feature) # 1 by default
8232 28: as.numeric(is_Split_or_Exonic_Only) # 0 by default; 0: all reads are counted ; 1: only split (Cigar has "N") reads are counted ; 2: only exonic (no "N" in Cigar) are counted.
8233 29: as.numeric(reduce_5_3_ends_to_one) # 0= no reduction; 1= reduce to 5' end; 2= reduce to 3' end
8234 30: debug_command # This is for debug only; RfeatureCounts should pass a space (" ") to this parameter, disabling the debug command.
8235 31: as.numeric(is_duplicate_ignored) # 0 = INCLUDE DUPLICATE READS; 1 = IGNORE DUPLICATE READS (0x400 FLAG IS SET) ; "0" by default.
8236 32: as.numeric(do_not_sort) # 1 = NEVER SORT THE PE BAM/SAM FILES; 0 = SORT THE BAM/SAM FILE IF IT IS FOUND NOT SORTED.
8237 33: as.numeric(fractionMultiMapping) # 1 = calculate fraction numbers if a read overlaps with multiple features or meta-features. "-M" must be specified when fractions are calculated.
8238 34: as.numeric(useOverlappingBreakTie) # 1 = Select features or meta-features with a longer overlapping length; 0 = just use read-voting strategy: one overlapping read = 1 vote
8239 35: Pair_Orientations # FF, FR, RF or RR. This parameter matters only if "-s" option is 1 or 2.
8240 36: as.numeric(doJunctionCounting) # 1 = count the number of junction reads spanning each exon-exon pair; 0 = do not.
8241 37: file name of genome fasta (for determining the strandedness of junctions by looking for GT/AG or CT/AC).
8242 38: as.numeric(max_M_Ops) # maximum "M" sections allowed in the CIGAR string. This parameter is needed in parse_BIN()
8243 39: as.numeric(is_Restrictly_No_Overlapping) # when "1", disable the voting-based tie breaking (e.g., when the reads are paired-end and one gene receives two votes but the other gene only has one.). "0" by default.
8244 40: as.numeric(min_Fractional_Overlap) # A fractional number. 0.00 : at least 1 bp overlapping
8245 41: temp_directory # the directory to put temp files. "<use output directory>" by default, namely find it from the output file dir.
8246 42: as.numeric(use_stdin_stdout) # only for CfeatureCounts. When use_stdin_stdout & 0x01 > 0, the input file is from stdin (stored in a temporary file); when use_stdin_stdout & 0x02 > 0, the output should be written to STDOUT instead of a file.
8247 43: as.numeric(assign_reads_to_RG) # 1: reads with "RG" tags will be assigned to read groups; 0: default setting
8248 44: as.numeric(long_read_minimum_length) # "1": treat the input BAM or SAM files as containing long reads. No multi-threading. "0": classic behaviour.
8249 45: as.numeric(is_verbose) # 1: show the mismatched chromosome names on screen; 0: don't do so
8250 46: as.numeric(frac_feature_overlap) # fraction of the feature to be overlapped with a read
8251 47: as.numeric(do_detection_call) # do detectionCalls : put the GC fraction into the 2nd column.
8252 48: as.numeric(max_missing_bases_in_read) # maximum # of bases in a read or fragment not overlapping with an exon ; default value: "-1" means no limit
8253 49: as.numeric(max_missing_bases_in_feature) # maximum # of bases in an exon not overlapping with a read or fragment ; default value: "-1" means no limit
8254 50: as.numeric(is_Primary_Alignment_only) # "1" : only count the primary alignment (FLAG doesn't have 0x100 bit); "0" : count alignments regardless of the 0x100 bit (by default)
8255 51: Rpath : the path where the assignment details per read are stored.
8256 52: AdditionalColumnList: the names of additional columns written after "Length". Comma delimited.
8257 53: annotation_file_screen_output : just for displaying the annotation file name or inbuilt (mm10/hg39/...) or R data.frame.
8258 54: read_shift_type : how to shift reads? "upstream" : to the 5' end; "downstream" : to the 3' end; "left" : to the smaller coordinates in chromosome ; "right" : to the larger coordinates in chromosome.
8259 55: as.numeric(read_shift_size) : how many bases to shift. Must be a positive number or zero.
8260 */
8261
8262 int isCVersion, isChimericDisallowed, isPEDistChecked, minMappingQualityScore=0, isInputFileResortNeeded, feature_block_size = 20, reduce_5_3_ends_to_one, useStdinFile, assignReadsToRG, long_read_minimum_length, is_verbose, do_detectionCall, max_missing_bases_in_feature, max_missing_bases_in_read, is_Primary_Alignment_only, read_shift_size, read_shift_type, scRNA_input_mode;
8263 float fracOverlap, fracOverlapFeature, umi_cutoff;
8264 char **chr;
8265 srInt_64 *start, *stop;
8266 int *geneid;
8267
8268 char *nameFeatureTypeColumn, *nameGeneIDColumn,*debug_command, *pair_orientations="fr", *temp_dir, *file_name_ptr =NULL, *strand_check_mode = NULL, *extra_column_names = NULL, *scRNA_sample_sheet = NULL, *scRNA_cell_barcode_list = NULL ;
8269 srInt_64 nexons;
8270
8271
8272 srInt_64 * anno_chr_head, * block_min_start, *block_max_end, *block_end_index;
8273 char ** anno_chrs, * anno_chr_2ch;
8274 char * fasta_contigs_fname, *annotation_file_screen_output;
8275 unsigned char * sorted_strand;
8276
8277 int minPEDistance, maxPEDistance, isReadSummaryReport, isBothEndRequired, isMultiMappingAllowed, fiveEndExtension, threeEndExtension, minFragmentOverlap, isSplitOrExonicOnly, is_duplicate_ignored, doNotSort, fractionMultiMapping, useOverlappingBreakTie, doJuncCounting, max_M, isRestrictlyNoOvelrapping ,is_scRNA_BAM_FQ_out_generated, scRNA_rerun_on_persample_BAM;
8278 char * isPEassign, *is_paired_end_reads_expected;
8279
8280 int isGTF, n_input_files=0;
8281 char * alias_file_name = NULL, * cmd_rebuilt = NULL, * Rpath = NULL;
8282
8283 int isMultiOverlapAllowed, isGeneLevel;
8284
8285 isCVersion = ((argv[0][0]=='C')?1:0);
8286
8287 isPEassign = argv[4];
8288 minPEDistance = atoi(argv[5]);
8289 maxPEDistance = atoi(argv[6]);
8290
8291 // isSAM = atoi(argv[7]);
8292 isMultiOverlapAllowed = atoi(argv[8]);
8293 isGeneLevel = atoi(argv[9]);
8294 unsigned short thread_number;
8295 if(argc > 10)
8296 thread_number = atoi(argv[10]);
8297 else thread_number = 4;
8298 if(argc > 11)
8299 isGTF = atoi(argv[11]);
8300 else isGTF = 0;
8301 if(argc > 12)
8302 strand_check_mode = argv[12];
8303 else strand_check_mode = NULL;
8304 if(argc > 13)
8305 isReadSummaryReport = atoi(argv[13]);
8306 else isReadSummaryReport = 0;
8307 if(argc > 14)
8308 isBothEndRequired = atoi(argv[14]);
8309 else isBothEndRequired = 0;
8310 if(argc > 15)
8311 isChimericDisallowed = atoi(argv[15]);
8312 else isChimericDisallowed = 0;
8313 if(argc > 16)
8314 isPEDistChecked = atoi(argv[16]);
8315 else isPEDistChecked = 0;
8316
8317
8318 if(isPEDistChecked && 0==isBothEndRequired){
8319 #ifdef MAKE_STANDALONE
8320 SUBREADprintf("ERROR: when the '-P' option is specified for checking fragment lengths, the '-B' option must also be specified to require both ends mapped.\n");
8321 #else
8322 SUBREADprintf("ERROR: when parameter checkFragLength is set to TRUE, parameter requireBothEndMapped also needs to be set to TRUE.\n");
8323 #endif
8324 return -1;
8325 }
8326
8327 if(argc > 17)
8328 nameFeatureTypeColumn = argv[17];
8329 else nameFeatureTypeColumn = "exon";
8330 if(argc > 18)
8331 nameGeneIDColumn = argv[18];
8332 else nameGeneIDColumn = "gene_id";
8333 if(argc > 19)
8334 minMappingQualityScore = atoi(argv[19]);
8335 else minMappingQualityScore = 0;
8336 if(argc > 20)
8337 isMultiMappingAllowed = atoi(argv[20]);
8338 else isMultiMappingAllowed = 0;
8339 if(argc > 21)
8340 {
8341 alias_file_name = argv[21];
8342 if(alias_file_name == NULL || alias_file_name[0]==' ' || alias_file_name[0]==0)
8343 alias_file_name = NULL;
8344 }
8345 else alias_file_name = NULL;
8346 if(argc > 22)
8347 {
8348 cmd_rebuilt = argv[22];
8349 if(cmd_rebuilt == NULL || cmd_rebuilt[0]==' '||cmd_rebuilt[0]==0)
8350 cmd_rebuilt=NULL;
8351 }
8352 else cmd_rebuilt = NULL;
8353 if(argc>23)
8354 isInputFileResortNeeded = atoi(argv[23]);
8355 else isInputFileResortNeeded = 0;
8356 if(thread_number<1) thread_number=1;
8357 if(thread_number>FC_MAX_THREADS)thread_number=FC_MAX_THREADS;
8358
8359 int Param_fiveEndExtension, Param_threeEndExtension;
8360 if(argc>25)
8361 Param_fiveEndExtension = atoi(argv[25]);
8362 else Param_fiveEndExtension = 0;
8363
8364 if(argc>26)
8365 Param_threeEndExtension = atoi(argv[26]);
8366 else Param_threeEndExtension = 0;
8367
8368 if(argc>27)
8369 minFragmentOverlap = atoi(argv[27]);
8370 else minFragmentOverlap = 1;
8371
8372 if(minFragmentOverlap <1){
8373 fiveEndExtension = 1 - minFragmentOverlap;
8374 threeEndExtension = 1 - minFragmentOverlap;
8375 minFragmentOverlap = 1;
8376 }else{
8377 fiveEndExtension = Param_fiveEndExtension;
8378 threeEndExtension = Param_threeEndExtension;
8379 }
8380
8381 if(argc>28)
8382 isSplitOrExonicOnly = atoi(argv[28]);
8383 else isSplitOrExonicOnly = 0;
8384
8385 if(argc>29)
8386 reduce_5_3_ends_to_one = atoi(argv[29]); // 0 : no reduce; 1: reduce to 5' end; 2: reduce to 3' end.
8387 else reduce_5_3_ends_to_one = 0;
8388
8389
8390 if(argc>30 && strlen(argv[30])>0 && argv[30][0]!=' ')
8391 debug_command = argv[30];
8392 else
8393 debug_command = " ";
8394
8395 if(argc>31)
8396 is_duplicate_ignored = atoi(argv[31]);
8397 else
8398 is_duplicate_ignored = 0;
8399
8400 if(argc>32)
8401 doNotSort = atoi(argv[32]);
8402 else
8403 doNotSort = 0;
8404
8405 if(argc>33)
8406 fractionMultiMapping = atoi(argv[33]);
8407 else
8408 fractionMultiMapping = 0;
8409
8410 if(argc>34)
8411 useOverlappingBreakTie = atoi(argv[34]);
8412 else useOverlappingBreakTie = 0;
8413
8414
8415 /*if(argc>35) "-S" is deprecated.
8416 pair_orientations = argv[35];
8417 else pair_orientations = "FR";
8418 */
8419
8420 if(argc>36)
8421 doJuncCounting = atoi(argv[36]);
8422 else doJuncCounting = 0;
8423
8424 fasta_contigs_fname = NULL;
8425 if(argc>37)
8426 if(argv[37][0] != 0 && argv[37][0]!=' ')
8427 fasta_contigs_fname = argv[37];
8428
8429 if(argc>38)
8430 max_M = atoi(argv[38]);
8431 else max_M = 10;
8432
8433 if(argc>39)
8434 isRestrictlyNoOvelrapping = atoi(argv[39]);
8435 else isRestrictlyNoOvelrapping = 0;
8436
8437 if(argc>40)
8438 fracOverlap = atof(argv[40]);
8439 else fracOverlap= 0.0;
8440
8441 if(argc>41){
8442 if(strcmp("<use output directory>", argv[41])!=0)temp_dir = argv[41];
8443 else temp_dir = NULL;
8444 }
8445 else temp_dir = NULL;// get_temp_dir_from_out(temp_dir, (char *)argv[3]);
8446
8447 if(argc>42){
8448 useStdinFile = (atoi(argv[42]) & 1)!=0;
8449 }else useStdinFile = 0;
8450
8451 if(argc>43)
8452 assignReadsToRG = (argv[43][0]=='1');
8453 else assignReadsToRG = 0;
8454
8455 if(argc>44)
8456 long_read_minimum_length = atoi(argv[44])?1:1999999999;
8457 else long_read_minimum_length = 1999999999;
8458
8459 if(long_read_minimum_length < 2 && isPEassign[0]=='1'){
8460 SUBREADputs("ERROR: long read assignment can only be done on single-end mode");
8461 return -1;
8462 }
8463
8464 if(argc>45)
8465 is_verbose = (argv[45][0]=='1');
8466 else is_verbose = 0;
8467
8468 if(argc>46)
8469 fracOverlapFeature = atof(argv[46]);
8470 else fracOverlapFeature = 0.0;
8471
8472 if(argc>47)
8473 do_detectionCall = (argv[47][0]=='1');
8474 else do_detectionCall = 0;
8475
8476 if(argc>48) max_missing_bases_in_read = atoi(argv[48]);
8477 else max_missing_bases_in_read = -1;
8478
8479 if(argc>49) max_missing_bases_in_feature = atoi(argv[49]);
8480 else max_missing_bases_in_feature = -1;
8481
8482 if(argc>50) is_Primary_Alignment_only = atoi(argv[50]);
8483 else is_Primary_Alignment_only = 0;
8484
8485 if(argc>51 && argv[51]!=NULL && argv[51][0]!=0 && argv[51][0]!=' ') Rpath = argv[51];
8486 else Rpath = NULL;
8487
8488 if(argc>52 && argv[52]!=NULL && argv[52][0]!=0 && argv[52][0]!=' ') extra_column_names = argv[52];
8489 else extra_column_names = NULL;
8490
8491 annotation_file_screen_output = NULL;
8492 #ifndef MAKE_STANDALONE
8493 if(argc>53) annotation_file_screen_output = argv[53];
8494 #endif
8495
8496 if(argc>54){
8497 read_shift_type = -1;
8498 if(strcmp(argv[54], "upstream")==0)read_shift_type = READ_SHIFT_UPSTREAM;
8499 if(strcmp(argv[54], "downstream")==0) read_shift_type = READ_SHIFT_DOWNSTREAM;
8500 if(strcmp(argv[54], "left")==0) read_shift_type = READ_SHIFT_LEFT;
8501 if(strcmp(argv[54], "right")==0) read_shift_type = READ_SHIFT_RIGHT;
8502 } else read_shift_type = READ_SHIFT_UPSTREAM;
8503
8504 if(argc>55) read_shift_size = atoi(argv[55]);
8505 else read_shift_size = 0;
8506
8507 if(argc>56 && strlen(argv[56])>0 && argv[56][0]!=' ') scRNA_sample_sheet = argv[56];
8508 else scRNA_sample_sheet = NULL;
8509
8510 if(argc>57 && strlen(argv[57])>0 && argv[57][0]!=' ') scRNA_cell_barcode_list = argv[57];
8511 else scRNA_cell_barcode_list = NULL;
8512
8513 if(argc>58 && strlen(argv[58])>0 && argv[58][0]!=' ') is_paired_end_reads_expected = argv[58];
8514 else is_paired_end_reads_expected = "0";
8515
8516 if(argc>59 && strlen(argv[59])>0 && argv[59][0]!=' ') is_scRNA_BAM_FQ_out_generated = atoi(argv[59]);
8517 else is_scRNA_BAM_FQ_out_generated = 1;
8518
8519 if(argc>60) scRNA_input_mode = (argv[60][0]-'0');
8520 else scRNA_input_mode = GENE_INPUT_BCL;
8521
8522 if(argc>61) scRNA_rerun_on_persample_BAM = (argv[61][0]-'0');
8523 else scRNA_rerun_on_persample_BAM = 0;
8524
8525 if(argc>62) umi_cutoff = atof(argv[62]);
8526 else umi_cutoff = -1;
8527
8528 if(read_shift_size<0){
8529 SUBREADprintf("ERROR: why the value for read_shift_size is negative?\n");
8530 return -1;
8531 }
8532
8533 if(read_shift_type<0){
8534 SUBREADprintf("ERROR: why the value for read_shift_type is %s?\n", argv[54]);
8535 return -1;
8536 }
8537
8538 if(SAM_pairer_warning_file_open_limit()) return -1;
8539 if(strand_check_mode != NULL && Input_Files_And_Strand_Mode_Pair(argv[2],strand_check_mode)) return -1;
8540 if(extra_column_names){
8541 if(!isGTF){
8542 SUBREADputs("ERROR: only GTF files contain additional attributes");
8543 return -1;
8544 }
8545 int xk1, total_cols =1;
8546 for(xk1=0; extra_column_names[xk1]; xk1++)
8547 if(extra_column_names[xk1] == ';' || extra_column_names[xk1]==',' || extra_column_names[xk1]=='\t'){
8548 extra_column_names[xk1]='\t';
8549 total_cols ++;
8550 }
8551 if(total_cols>MAX_EXTRA_COLS){
8552 SUBREADprintf("ERROR: there are more than %d additional attributes required\n", MAX_EXTRA_COLS);
8553 return -1;
8554 }
8555 }
8556
8557 fc_thread_global_context_t global_context;
8558
8559 fc_thread_init_global_context(& global_context, FEATURECOUNTS_BUFFER_SIZE, thread_number, MAX_LINE_LENGTH, minPEDistance, maxPEDistance,isGeneLevel, isMultiOverlapAllowed, strand_check_mode, (char *)argv[3] , isReadSummaryReport, isBothEndRequired, isChimericDisallowed, isPEDistChecked, nameFeatureTypeColumn, nameGeneIDColumn, minMappingQualityScore,isMultiMappingAllowed, 0, alias_file_name, cmd_rebuilt, isInputFileResortNeeded, feature_block_size, isCVersion, fiveEndExtension, threeEndExtension , minFragmentOverlap, isSplitOrExonicOnly, reduce_5_3_ends_to_one, debug_command, is_duplicate_ignored, doNotSort, fractionMultiMapping, useOverlappingBreakTie, pair_orientations, doJuncCounting, max_M, isRestrictlyNoOvelrapping, fracOverlap, temp_dir, useStdinFile, assignReadsToRG, long_read_minimum_length, is_verbose, fracOverlapFeature, do_detectionCall, max_missing_bases_in_read, max_missing_bases_in_feature, is_Primary_Alignment_only, Rpath, extra_column_names, annotation_file_screen_output, read_shift_type, read_shift_size, scRNA_sample_sheet, scRNA_cell_barcode_list, is_scRNA_BAM_FQ_out_generated, scRNA_input_mode, scRNA_rerun_on_persample_BAM, umi_cutoff);
8560
8561 fc_thread_init_input_files( & global_context, argv[2], &file_name_ptr );
8562
8563 if( print_FC_configuration(&global_context, argv[1], file_name_ptr, argv[3], global_context.is_SAM_file, isGTF, & n_input_files, isReadSummaryReport, is_paired_end_reads_expected, isPEassign) )
8564 return -1;
8565 // Loading the annotations.
8566 // Nothing is done if the annotation does not exist.
8567 fc_feature_info_t * loaded_features;
8568 print_in_box(84,0,0,"Load annotation file %s %c[0m...", get_short_fname(argv[1]), CHAR_ESC);
8569 nexons = load_feature_info(&global_context,argv[1], isGTF?FILE_TYPE_GTF:FILE_TYPE_RSUBREAD, &loaded_features);
8570 if(nexons<1){
8571 if(nexons >= -1) SUBREADprintf("Failed to open the annotation file %s, or its format is incorrect, or it contains no '%s' features.\n",argv[1], nameFeatureTypeColumn);
8572 return -1;
8573 }
8574
8575 sort_feature_info(&global_context, nexons, loaded_features, &chr, &geneid, &start, &stop, &sorted_strand, &anno_chr_2ch, &anno_chrs, &anno_chr_head, & block_end_index, & block_min_start, & block_max_end);
8576 if((!global_context.do_scRNA_table) || global_context.is_gene_level) global_context.lineno_2_sortedno_tab = NULL;
8577 else global_context.lineno_2_sortedno_tab = scRNA_copy_loaded_features(nexons, loaded_features);
8578 if(global_context.do_junction_counting){
8579 sort_bucket_table(&global_context);
8580 }
8581 print_in_box(80,0,0," Meta-features : %d", global_context . gene_name_table -> numOfElements);
8582 print_in_box(80,0,0," Chromosomes/contigs : %d", global_context . exontable_nchrs);
8583
8584 print_in_box(80,0,0,"");
8585
8586 if(global_context.do_scRNA_table){
8587 print_in_box(80,0,0,"Load scRNA-related files...");
8588 print_in_box(80,0,0," scRNA samples : %d", global_context.scRNA_sample_sheet_table->numOfElements);
8589 print_in_box(80,0,0," scRNA cell barcodes : %d", global_context.scRNA_cell_barcodes_array -> numOfElements);
8590 print_in_box(80,0,0,"");
8591 }
8592
8593 if(fasta_contigs_fname){
8594 print_in_box(80,0,0,"Load FASTA contigs from %s...", get_short_fname(fasta_contigs_fname));
8595 global_context.fasta_contigs = malloc(sizeof(fasta_contigs_t));
8596 int ret_fq = read_contig_fasta(global_context.fasta_contigs, fasta_contigs_fname);
8597 if(ret_fq){
8598 print_in_box(80,0,0," WARNING unable to open the FASTA file.");
8599 print_in_box(80,0,0,"");
8600 free(global_context.fasta_contigs);
8601 global_context.fasta_contigs = NULL;
8602 }else{
8603 print_in_box(80,0,0," %lu contigs were loaded", global_context.fasta_contigs -> contig_table -> numOfElements);
8604 print_in_box(80,0,0,"");
8605 }
8606 }else global_context.fasta_contigs = NULL;
8607
8608
8609 global_context.exontable_exons = nexons;
8610 unsigned int x1, total_written_coulmns=0;
8611
8612
8613
8614
8615 char * tmp_pntr = NULL, *tmp_smode_ptr = NULL;
8616 char * strand_mode_list = strdup(global_context.strand_check_mode);
8617 char * file_list_used = malloc(strlen(file_name_ptr)+1);
8618 char * file_list_used2 = malloc(strlen(file_name_ptr)+1);
8619 char * is_unique = malloc(strlen(file_name_ptr)+1);
8620 strcpy(file_list_used, file_name_ptr);
8621 for(x1 = 0;;x1++){
8622 char * test_fn = strtok_r(x1?NULL:file_list_used, FC_FLIST_SPLITOR, &tmp_pntr);
8623 if(NULL == test_fn) break;
8624 char * short_fname = get_short_fname(test_fn);
8625 strcpy(file_list_used2, file_name_ptr);
8626
8627 is_unique[x1]=1;
8628 char * loop_ptr = NULL;
8629 int x2;
8630 for(x2 = 0;;x2++){
8631 char * test_loopfn = strtok_r(x2?NULL:file_list_used2, FC_FLIST_SPLITOR, &loop_ptr);
8632 if(NULL == test_loopfn) break;
8633 if(x1==x2)continue;
8634
8635 char * short_loop_fname = get_short_fname(test_loopfn);
8636
8637 if(strcmp(short_loop_fname, short_fname)==0) {
8638 is_unique[x1] = 0;
8639 break;
8640 }
8641 }
8642 }
8643 free(file_list_used2);
8644
8645 tmp_pntr = NULL;
8646 strcpy(file_list_used, file_name_ptr);
8647 char * next_fn = strtok_r(file_list_used, FC_FLIST_SPLITOR, &tmp_pntr);
8648 char * next_strand_mode = strtok_r(strand_mode_list, ".", &tmp_smode_ptr);
8649 int one_single_strand_mode = -1;
8650 if(NULL == strstr( global_context.strand_check_mode, "." )){
8651 one_single_strand_mode = next_strand_mode[0] - '0';
8652 assert(one_single_strand_mode >= 0 && one_single_strand_mode < 3);
8653 }
8654
8655 ArrayList * table_columns = ArrayListCreate(n_input_files+1);
8656 ArrayList * table_column_names = ArrayListCreate(n_input_files+1);
8657 ArrayList * read_counters = ArrayListCreate(n_input_files+1);
8658 ArrayListSetDeallocationFunction(table_columns, free);
8659 ArrayListSetDeallocationFunction(table_column_names, free);
8660 ArrayListSetDeallocationFunction(read_counters, free);
8661
8662 ArrayList * junction_global_table_list = NULL;
8663 ArrayList * splicing_global_table_list = NULL;
8664
8665 if(global_context.do_junction_counting){
8666 junction_global_table_list = ArrayListCreate(n_input_files+1);
8667 splicing_global_table_list = ArrayListCreate(n_input_files+1);
8668 ArrayListSetDeallocationFunction(junction_global_table_list, (void (*)(void *))HashTableDestroy);
8669 ArrayListSetDeallocationFunction(splicing_global_table_list, (void (*)(void *))HashTableDestroy);
8670 }
8671
8672 int ret_int = 0;
8673
8674 #ifdef MAKE_STANDALONE
8675 #define NO_SORT_OPTION_NAME "donotsort"
8676 #else
8677 #define NO_SORT_OPTION_NAME "autosort"
8678 #endif
8679
8680 for(x1 = 0;;x1++){
8681 int orininal_isPE = global_context.is_paired_end_mode_assign;
8682 if(next_fn==NULL || strlen(next_fn)<1 || global_context.disk_is_full) break;
8683 int this_file_isPEassign = isPEassign[1]?isPEassign[x1] == '1' :(isPEassign[0]=='1');
8684 int this_file_isPEexpected = is_paired_end_reads_expected[1]?is_paired_end_reads_expected[x1]=='1' :(is_paired_end_reads_expected[0]=='1');
8685 global_context.is_paired_end_reads_expected = this_file_isPEexpected;
8686 global_context.is_paired_end_mode_assign = this_file_isPEassign;
8687 if(global_context.do_not_sort && 0==this_file_isPEassign){
8688 print_in_box(80,0,0," WARNING the %s option is ignored when single-end reads", NO_SORT_OPTION_NAME);
8689 print_in_box(80,0,0," are being counted.");
8690 }
8691
8692 read_count_type_t * column_numbers = calloc(nexons, sizeof(read_count_type_t));
8693 HashTable * junction_global_table = NULL;
8694 HashTable * splicing_global_table = NULL;
8695
8696 strcpy(global_context.input_file_name, next_fn);
8697 strcpy(global_context.raw_input_file_name, next_fn);
8698 global_context.this_input_number = x1;
8699 global_context.input_file_unique = is_unique[x1];
8700 global_context.input_file_short_name = get_short_fname(next_fn);
8701 if(strstr( global_context.strand_check_mode, "." )){
8702 global_context.is_strand_checked = next_strand_mode[0]-'0';
8703 assert(global_context.is_strand_checked >=0 && global_context.is_strand_checked <=2);
8704 }else global_context.is_strand_checked = one_single_strand_mode;
8705 global_context.redo=0;
8706
8707 if(global_context.is_scRNA_BAM_FQ_out_generated && global_context.scRNA_sample_sheet_table){
8708 global_context.scRNA_sample_BAM_writers = HashTableCreate(global_context.scRNA_sample_sheet_table -> numOfElements);
8709 HashTableSetDeallocationFunctions(global_context.scRNA_sample_BAM_writers, NULL, scRNA_close_sample_SamBam_writers);
8710 global_context.scRNA_sample_sheet_table ->appendix1 = global_context.scRNA_sample_BAM_writers;
8711 global_context.scRNA_sample_sheet_table ->appendix2 = &global_context;
8712 global_context.scRNA_sample_sheet_table ->appendix3 = global_context.scRNA_sample_id_to_name;
8713 HashTableIteration( global_context.scRNA_sample_sheet_table, scRNA_sample_SamBam_writers_new_files);
8714 }
8715
8716 if(global_context.do_junction_counting){
8717 junction_global_table = HashTableCreate(156679);
8718 splicing_global_table = HashTableCreate(156679);
8719
8720 HashTableSetHashFunction(junction_global_table,HashTableStringHashFunction);
8721 HashTableSetDeallocationFunctions(junction_global_table, free, NULL);
8722 HashTableSetKeyComparisonFunction(junction_global_table, fc_strcmp_chro);
8723
8724 HashTableSetHashFunction(splicing_global_table,HashTableStringHashFunction);
8725 HashTableSetDeallocationFunctions(splicing_global_table, free, NULL);
8726 HashTableSetKeyComparisonFunction(splicing_global_table, fc_strcmp_chro);
8727 }
8728
8729 HashTable * merged_RG_table = NULL;
8730 if(global_context.assign_reads_to_RG){
8731 merged_RG_table = HashTableCreate(97);
8732 HashTableSetHashFunction(merged_RG_table,HashTableStringHashFunction);
8733 HashTableSetDeallocationFunctions(merged_RG_table, NULL, free); // the names are put into the column_names table, but the 4-pointer arrays are not used anymore.
8734 HashTableSetKeyComparisonFunction(merged_RG_table, fc_strcmp_chro);
8735 }
8736
8737 fc_read_counters * my_read_counter = calloc(1, sizeof(fc_read_counters));
8738 global_context.is_read_details_out = isReadSummaryReport;
8739 global_context.max_M = max_M;
8740
8741 ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
8742 if(global_context.disk_is_full){
8743 SUBREADprintf("ERROR: disk is full. Please check the free space in the output directory.\n");
8744 }
8745 if(ret_int!=0){
8746 // give up this file.
8747 if(global_context.do_junction_counting){
8748 HashTableDestroy(junction_global_table);
8749 HashTableDestroy(splicing_global_table);
8750 }
8751 free(column_numbers);
8752 } else {
8753 // finished
8754
8755 char * mem_file_name = memstrcpy(next_fn);
8756 if(!global_context.assign_reads_to_RG){
8757 ArrayListPush(table_columns, column_numbers);
8758 ArrayListPush(table_column_names, mem_file_name);
8759 ArrayListPush(read_counters, my_read_counter);
8760 if(global_context.do_junction_counting){
8761 ArrayListPush(junction_global_table_list,junction_global_table);
8762 ArrayListPush(splicing_global_table_list,splicing_global_table);
8763 }
8764 }
8765
8766 if(global_context.assign_reads_to_RG){
8767 int rgcur;
8768 char * rg_name = global_context.RGnames_set;
8769 for(rgcur = 0; rgcur < global_context.RGnames_ptr+1; rgcur ++){
8770 if(global_context.RGnames_set[rgcur] == '\t'||global_context.RGnames_set[rgcur] == '\0'){
8771 global_context.RGnames_set[rgcur] = 0;
8772 int rg_name_len = strlen(rg_name);
8773 if(rg_name_len > 0){
8774 // SUBREADprintf("GET 4Tab:'%s'\n", rg_name);
8775 void ** tab4 = HashTableGet(merged_RG_table, rg_name);
8776 int file_len = strlen(mem_file_name);
8777
8778 char * rg_file_name = malloc(rg_name_len + 3 + file_len);
8779 sprintf(rg_file_name, "%s:%s", mem_file_name, rg_name);
8780
8781 ArrayListPush(table_column_names, rg_file_name);
8782 ArrayListPush(table_columns, tab4[0]);
8783 ArrayListPush(read_counters, tab4[1]);
8784 if(global_context.do_junction_counting){
8785 ArrayListPush(junction_global_table_list,tab4[2]);
8786 ArrayListPush(splicing_global_table_list,tab4[3]);
8787 }
8788 rg_name = global_context.RGnames_set + rgcur + 1;
8789 }
8790 }
8791 }
8792 free(mem_file_name);
8793 }
8794 total_written_coulmns ++;
8795 }
8796 global_context.is_paired_end_mode_assign = orininal_isPE;
8797 next_fn = strtok_r(NULL, FC_FLIST_SPLITOR, &tmp_pntr);
8798
8799 if(strstr( global_context.strand_check_mode, "." )) next_strand_mode = strtok_r(NULL, ".", &tmp_smode_ptr);
8800 if(global_context.assign_reads_to_RG) free(global_context.RGnames_set);
8801 if(merged_RG_table) HashTableDestroy(merged_RG_table);
8802 }
8803
8804 free(file_list_used);
8805 free(is_unique);
8806
8807 if(global_context.is_input_bad_format){
8808 // SUBREADprintf("\nEEROR: The program has to terminate and no counting file is generated.\n\n");
8809 }else if(!global_context.disk_is_full){
8810 print_in_box(80,0,0,"Write the final count table.");
8811 if(isGeneLevel){
8812 char ** sorted_extra_columns = NULL;
8813 if(global_context.reported_extra_columns != NULL){
8814 sorted_extra_columns = malloc(sizeof(char**) * nexons);
8815 int ii;
8816 for(ii = 0; ii < nexons; ii++){
8817 sorted_extra_columns[loaded_features[ii].sorted_order] = loaded_features[ii].extra_columns;
8818 //SUBREADprintf("SSMQ: %d = %s\n", loaded_features[ii].sorted_order, loaded_features[ii].extra_columns);
8819 }
8820 }
8821
8822 fc_write_final_gene_results(&global_context, geneid, chr, start, stop, sorted_strand, sorted_extra_columns, argv[3], nexons, table_columns, table_column_names, loaded_features, isCVersion);
8823
8824 if(sorted_extra_columns) free(sorted_extra_columns);
8825 } else
8826 fc_write_final_results(&global_context, argv[3], nexons, table_columns, table_column_names, loaded_features, isCVersion);
8827 }
8828 if(global_context.do_junction_counting && global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
8829 print_in_box(80,0,0,"Write the junction count table.");
8830 fc_write_final_junctions(&global_context, argv[3], table_column_names, junction_global_table_list, splicing_global_table_list);
8831 }
8832
8833 if(global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
8834 print_in_box(80,0,0,"Write the read assignment summary.");
8835 fc_write_final_counts(&global_context, argv[3], table_column_names, read_counters, isCVersion);
8836 }
8837
8838 ArrayListDestroy(table_columns);
8839 ArrayListDestroy(table_column_names);
8840 ArrayListDestroy(read_counters);
8841 if(global_context.do_junction_counting){
8842 ArrayListDestroy(junction_global_table_list);
8843 ArrayListDestroy(splicing_global_table_list);
8844 }
8845 free(file_name_ptr);
8846
8847 if(global_context.is_input_bad_format == 0) print_FC_results(&global_context, (char *)argv[3]/*out file name*/);
8848 KeyValuePair * cursor;
8849 int bucket;
8850 for(bucket=0; bucket < global_context.exontable_chro_table -> numOfBuckets; bucket++)
8851 {
8852 cursor = global_context.exontable_chro_table -> bucketArray[bucket];
8853 while (1)
8854 {
8855 if (!cursor) break;
8856 fc_chromosome_index_info * del_chro_info = cursor->value;
8857 free(del_chro_info->reverse_table_start_index);
8858 //free(del_chro_info->reverse_table_end_index);
8859 free((void *)cursor -> key);
8860 free(del_chro_info);
8861 cursor = cursor->next;
8862 }
8863 }
8864
8865 if(global_context.read_details_out_FP) fclose(global_context. read_details_out_FP);
8866 HashTableDestroy(global_context.gene_name_table);
8867 HashTableDestroy(global_context.GCcontent_table);
8868 if(global_context.scRNA_sample_sheet_table){
8869 HashTableDestroy(global_context.scRNA_sample_sheet_table);
8870 ArrayListDestroy(global_context.scRNA_sample_barcode_list);
8871 ArrayListDestroy(global_context.scRNA_sample_id_to_name);
8872 HashTableDestroy(global_context.scRNA_lineno1B_to_sampleno1B_tab);
8873
8874 for(x1=0; x1<global_context.scRNA_barcode_batched_bin_no +2; x1++){
8875 char tmp_fname[MAX_FILE_NAME_LENGTH+20];
8876 sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", temp_dir, getpid(), x1);
8877 unlink(tmp_fname);
8878 pthread_spin_destroy(global_context.scRNA_barcode_batched_locks+x1);
8879 }
8880 pthread_spin_destroy(&global_context.scRNA_do_one_batch_runner_lock);
8881
8882 if(global_context.is_scRNA_BAM_FQ_out_generated){
8883 HashTableDestroy(global_context.scRNA_sample_BAM_writers);
8884 }
8885 }
8886 if(global_context.scRNA_cell_barcodes_array){
8887 SUBREADprintf("DESTROYING global_context.scRNA_cell_barcodes_array : %p and %p, having %lld\n", global_context.scRNA_cell_barcodes_array, global_context.scRNA_cell_barcodes_array->elemDeallocator, global_context.scRNA_cell_barcodes_array-> numOfElements);
8888 ArrayListDestroy(global_context.scRNA_cell_barcodes_array);
8889 HashTableDestroy(global_context.scRNA_cell_barcode_head_tail_table);
8890 }
8891 free(global_context.gene_name_array);
8892
8893 HashTableDestroy(global_context.exontable_chro_table);
8894 if(global_context.fasta_contigs){
8895 destroy_contig_fasta(global_context.fasta_contigs);
8896 free(global_context.fasta_contigs);
8897 }
8898 if(global_context.BAM_chros_to_anno_table)
8899 HashTableDestroy(global_context.BAM_chros_to_anno_table);
8900 if(global_context.do_junction_counting){
8901 HashTableDestroy(global_context.junction_bucket_table);
8902 HashTableDestroy(global_context.junction_features_table);
8903 }
8904
8905
8906 free(global_context.unistr_buffer_space);
8907
8908 if(global_context.reported_extra_columns){
8909 for(bucket = 0; bucket < nexons; bucket++)
8910 free(loaded_features[bucket].extra_columns);
8911 }
8912 if(global_context.lineno_2_sortedno_tab)HashTableDestroy(global_context.lineno_2_sortedno_tab);
8913
8914 free(loaded_features);
8915 free(geneid);
8916 free(chr);
8917 free(start);
8918 free(sorted_strand);
8919 free(anno_chr_2ch);
8920 free(anno_chrs);
8921 free(anno_chr_head);
8922 free(block_min_start);
8923 free(block_max_end);
8924 free(block_end_index);
8925 free(stop);
8926 free(strand_mode_list);
8927
8928 return total_written_coulmns?0:-1;
8929 }
8930
/*
 * Register every gene stored in gene_feature_table in the junction bucket table.
 *
 * For each gene, the range from the JUNCTION_BUCKET_STEP boundary at or below
 * pos_first_base up to pos_last_base is walked in steps of JUNCTION_BUCKET_STEP.
 * For every such position the gene pointer is appended to the list stored in
 * global_context->junction_bucket_table under the key "<chro_name>:<position>".
 * A list (and a heap copy of its key) is created the first time a key is seen.
 *
 * global_context     : supplies junction_bucket_table, which is modified.
 * gene_feature_table : hash table whose values are fc_junction_gene_t pointers.
 * chro_name          : name used as the prefix of every bucket key.
 *
 * Allocation results are not checked here.
 */
void register_buckets(fc_thread_global_context_t * global_context , HashTable * gene_feature_table, char * chro_name){
	int bucket;
	for(bucket = 0; bucket < gene_feature_table -> numOfBuckets; bucket++){
		KeyValuePair * cursor;
		for(cursor = gene_feature_table -> bucketArray[bucket]; cursor; cursor = cursor -> next){
			fc_junction_gene_t * gene = (fc_junction_gene_t *) cursor -> value;
			unsigned int x1;

			// start from the bucket boundary that contains the first base of the gene.
			for(x1 = gene -> pos_first_base - gene -> pos_first_base % JUNCTION_BUCKET_STEP; x1 <= gene -> pos_last_base ; x1 += JUNCTION_BUCKET_STEP){
				char bucket_key[CHROMOSOME_NAME_LENGTH + 20];
				// bounded write: an over-long chro_name must not overrun the stack buffer.
				snprintf(bucket_key, sizeof(bucket_key), "%s:%u", chro_name, x1);

				gene_info_list_t * list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
				if(list == NULL){
					// first gene in this bucket: create an empty list with room for three genes.
					list = malloc(sizeof(gene_info_list_t));
					list -> space = 3;
					list -> used = 0;
					list -> genes = malloc(sizeof(void *) * list -> space);

					// the table keeps the key pointer, so it needs its own heap copy.
					char * mem_bucket_key = malloc(strlen(bucket_key) + 1);
					strcpy(mem_bucket_key , bucket_key);
					HashTablePut(global_context -> junction_bucket_table, mem_bucket_key , list);
				}

				if(list -> used == list -> space){
					// grow by three slots, or by 30% when that is larger.
					list -> space = max(list -> space + 3, list -> space * 1.3);
					list -> genes = realloc(list -> genes , list -> space * sizeof(void *));
				}
				list -> genes[list -> used++] = gene;
			}
		}
	}
}
8965
/*
 * Visit every entry of global_context->junction_features_table and pass it to
 * register_buckets(): the entry's value is used as the gene table and the
 * entry's key as the chromosome name.
 */
void sort_bucket_table(fc_thread_global_context_t * global_context){
	int slot;
	for(slot = 0; slot < global_context -> junction_features_table -> numOfBuckets; slot++){
		KeyValuePair * entry = global_context -> junction_features_table -> bucketArray[slot];
		for(; entry != NULL; entry = entry -> next){
			HashTable * genes_of_chro = entry -> value;
			char * name_of_chro = (char *)entry -> key;
			register_buckets(global_context, genes_of_chro, name_of_chro);
		}
	}
}
8980
8981
/*
 * Run a SAM pairer over global_context->input_file_name, with
 * process_pairer_scRNAr2_output as the output callback.
 *
 * The pairer's temporary-file prefix is built from the temp directory, the
 * process id and the string produced by mac_or_rand_str(). The pairer is
 * created, run and destroyed within this call.
 */
void scRNA_generate_BAM_FASTQ(fc_thread_global_context_t * global_context){
	char unique_tag[13];
	char tmp_prefix[MAX_FILE_NAME_LENGTH+100];

	mac_or_rand_str(unique_tag);
	sprintf(tmp_prefix, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, getpid(), unique_tag);

	SAM_pairer_create(
		&global_context -> scRNA_read_pairer,
		global_context -> thread_number,
		global_context -> max_BAM_header_size/1024/1024+2,
		1,	/* is bam */
		0,	/* do not drop seq/qual */
		1,	/* single end */
		0,	/* do not sort */
		0,	/* no RG */
		0,
		global_context -> input_file_name,
		NULL,
		NULL,
		process_pairer_scRNAr2_output,
		tmp_prefix,
		global_context,
		9999);
	SAM_pairer_run(&global_context -> scRNA_read_pairer);
	SAM_pairer_destroy(&global_context -> scRNA_read_pairer);
}
8992
readSummary_single_file(fc_thread_global_context_t * global_context,read_count_type_t * column_numbers,srInt_64 nexons,int * geneid,char ** chr,srInt_64 * start,srInt_64 * stop,unsigned char * sorted_strand,char * anno_chr_2ch,char ** anno_chrs,srInt_64 * anno_chr_head,srInt_64 * block_end_index,srInt_64 * block_min_start,srInt_64 * block_max_end,fc_read_counters * my_read_counter,HashTable * junction_global_table,HashTable * splicing_global_table,HashTable * merged_RG_table,fc_feature_info_t * loaded_features)8993 int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons, int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table, fc_feature_info_t * loaded_features)
8994 {
8995 int read_length = 0;
8996 int is_first_read_PE=0;
8997 char * line = (char*)calloc(MAX_LINE_LENGTH, 1);
8998 char * file_str = "";
8999
9000 int file_probe = is_certainly_bam_file(global_context->input_file_name, &is_first_read_PE, NULL);
9001
9002 // a Singel-end SAM/BAM file cannot be assigned as a PE SAM/BAM file;
9003 // but a PE SAM/BAM file may be assigned as a SE file if the user wishes to do so.
9004
9005 global_context->is_SAM_file = 1;
9006 if(file_probe == 1) global_context->is_SAM_file = 0;
9007 global_context->is_mixed_PE_SE = 0;
9008 global_context->any_reads_are_PE = 0;
9009 global_context -> start_time = miltime();
9010
9011 file_str = "SAM";
9012 if(file_probe == 1) file_str = "BAM" ;
9013 if(file_probe == -1) file_str = "Unknown";
9014
9015 if(!global_context->redo)
9016 {
9017 print_in_box(80,0,0,"Process %s file %s...", file_str, global_context -> use_stdin_file? "<STDIN>":get_short_fname(global_context->input_file_name));
9018 if(global_context->is_strand_checked)
9019 print_in_box(80,0,0," Strand specific : %s", global_context->is_strand_checked==1?"stranded":"reversely stranded");
9020 }
9021
9022 // Open the SAM/BAM file
9023 // Nothing is done if the file does not exist.
9024
9025 fc_thread_start_threads(global_context, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start , block_max_end, read_length);
9026 fc_thread_wait_threads(global_context);
9027 if(global_context -> is_paired_end_reads_expected && !global_context -> any_reads_are_PE){
9028 SUBREADprintf("ERROR: No paired-end reads were detected in paired-end read library : %s\n", global_context -> input_file_name);
9029 global_context -> is_input_bad_format=1;
9030 return -1;
9031 }
9032
9033 srInt_64 nreads_mapped_to_exon = 0;
9034 fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features, nexons);
9035 if(global_context -> do_scRNA_table){
9036 scRNA_generate_BAM_FASTQ(global_context);
9037 free(global_context -> scRNA_applied_umi_cut);
9038 }
9039 fc_thread_destroy_thread_context(global_context);
9040
9041 if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
9042 global_context -> sambam_chro_table = NULL;
9043
9044 free(line);
9045 if(global_context -> is_input_bad_format) return -1;
9046 return 0;
9047 }
9048
9049
9050 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)9051 int main(int argc, char ** argv)
9052 #else
9053 int feature_count_main(int argc, char ** argv)
9054 #endif
9055 {
9056 char * Rargv[61];
9057 char annot_name[MAX_FILE_NAME_LENGTH];
9058 char temp_dir[MAX_FILE_NAME_LENGTH];
9059 char * out_name = malloc(MAX_FILE_NAME_LENGTH);
9060 char * fasta_contigs_name = malloc(MAX_FILE_NAME_LENGTH);
9061 char * alias_file_name = malloc(MAX_FILE_NAME_LENGTH);
9062 char * Rpath = malloc(MAX_FILE_NAME_LENGTH);
9063 char * scRNA_sample_sheet = malloc(MAX_FILE_NAME_LENGTH);
9064 char * scRNA_cell_barcode_list = malloc(MAX_FILE_NAME_LENGTH);
9065
9066 int cmd_rebuilt_size = 2000;
9067 char * cmd_rebuilt = malloc(cmd_rebuilt_size);
9068 char max_M_str[8];
9069 char nameFeatureTypeColumn[2000];
9070 char nameGeneIDColumn[66];
9071 int min_qual_score = 0;
9072 int min_dist = 50;
9073 int max_dist = 600;
9074 int read_shift_size = 0;
9075 char debug_command[15];
9076 char max_missing_bases_in_read_str[15];
9077 char max_missing_bases_in_feature_str[15];
9078 char min_dist_str[15];
9079 char max_dist_str[15];
9080 char read_shift_size_str[15];
9081 char read_shift_type[15];
9082 char min_qual_score_str[15];
9083 char feature_block_size_str[15];
9084 char * Strand_Sensitive_Str = "0";
9085 char * old_zero_smode = Strand_Sensitive_Str;
9086 char strFeatureFracOverlap[15];
9087 char Pair_Orientations[3];
9088 char * extra_column_names = NULL;
9089 char * very_long_file_names;
9090 char is_paired_end_reads_expected[2];
9091 int is_Input_Need_Reorder = 0;
9092 int is_PE = 0;
9093 int is_SAM = 1;
9094 int is_primary_alignment_only = 0;
9095 int is_GeneLevel = 1;
9096 int is_Overlap = 0;
9097 int is_Both_End_Mapped = 0;
9098 int is_Restrictedly_No_Overlap = 0;
9099 int feature_block_size = 14;
9100 int is_ReadSummary_Report = 0;
9101 int is_Chimeric_Disallowed = 0;
9102 int is_PE_Dist_Checked = 0;
9103 int is_Multi_Mapping_Allowed = 0;
9104 int is_Split_or_Exonic_Only = 0;
9105 int is_duplicate_ignored = 0;
9106 int assign_reads_to_RG = 0;
9107 int do_not_sort = 0;
9108 int do_junction_cnt = 0;
9109 int do_detection_call = 0;
9110 int reduce_5_3_ends_to_one = 0;
9111 int use_fraction_multimapping = 0;
9112 int threads = 1;
9113 int isGTF = 1;
9114 int use_overlapping_length_break_tie = 0;
9115 char nthread_str[4];
9116 int option_index = 0;
9117 int max_missing_bases_in_feature = -1;
9118 int max_missing_bases_in_read = -1;
9119 int scRNA_input_mode = GENE_INPUT_BCL;
9120 int c;
9121 int very_long_file_names_size = 200;
9122 int fiveEndExtension = 0, threeEndExtension = 0, minFragmentOverlap = 1;
9123 float fracOverlap = 0.0, fracOverlapFeature = 0.0;
9124 int std_input_output_mode = 0, long_read_mode = 0, is_verbose = 0;
9125 int is_scRNA_BAM_FQ_out_generated = 1;
9126 char strFiveEndExtension[11], strThreeEndExtension[11], strMinFragmentOverlap[11], fracOverlapStr[20], std_input_output_mode_str[16], long_read_mode_str[16];
9127 very_long_file_names = malloc(very_long_file_names_size);
9128 very_long_file_names [0] = 0;
9129 fasta_contigs_name[0]=0;
9130 scRNA_cell_barcode_list[0]=0;
9131 scRNA_sample_sheet[0]=0;
9132 is_paired_end_reads_expected[0]='0';
9133 is_paired_end_reads_expected[1]='\0';
9134
9135 alias_file_name[0]=0;
9136 debug_command[0] = 0;
9137
9138 strcpy(read_shift_type,"upstream");
9139 strcpy(nameFeatureTypeColumn,"exon");
9140 strcpy(nameGeneIDColumn,"gene_id");
9141 strcpy(temp_dir, "<use output directory>");
9142 annot_name[0]=0;out_name[0]=0;Rpath[0]=0;
9143
9144
9145 cmd_rebuilt[0]=0;
9146 for(c = 0; c<argc;c++)
9147 {
9148 if(strlen(cmd_rebuilt) + 1000 > cmd_rebuilt_size)
9149 {
9150 cmd_rebuilt_size*=2;
9151 cmd_rebuilt = realloc(cmd_rebuilt, cmd_rebuilt_size);
9152 }
9153 sprintf(cmd_rebuilt+strlen(cmd_rebuilt), "\"%s\" ", argv[c]);
9154 }
9155
9156 optind=0;
9157 opterr=1;
9158 optopt=63;
9159 strcpy(max_M_str, "10");
9160 strcpy(Pair_Orientations,"fr");
9161
9162 while ((c = getopt_long (argc, argv, "G:A:g:t:T:o:a:d:D:LQ:pbF:fs:S:CBJPMOR:v?", long_options, &option_index)) != -1)
9163 switch(c)
9164 {
9165 case 'S':
9166 /*
9167 if(strlen(optarg)!=2 || (strcmp(optarg, "ff")!=0 && strcmp(optarg, "rf")!=0 && strcmp(optarg, "fr")!=0)){
9168 SUBREADprintf("The order parameter can only be ff, fr or rf.\n");
9169 print_usage();
9170 return -1;
9171 }
9172 Pair_Orientations[0]=(optarg[0]=='r'?'r':'f');
9173 Pair_Orientations[1]=(optarg[1]=='f'?'f':'r');
9174 Pair_Orientations[2]=0;
9175 */
9176 SUBREADprintf("The \"-S\" option has been depreciated.\n");
9177
9178 break;
9179 case 'G':
9180 strcpy(fasta_contigs_name , optarg);
9181 break;
9182 case 'J':
9183 do_junction_cnt = 1;
9184 break;
9185 case 'A':
9186 strcpy(alias_file_name, optarg);
9187 break;
9188 case 'M':
9189 is_Multi_Mapping_Allowed = 1;
9190 break;
9191 case 'v':
9192 core_version_number("featureCounts");
9193 return 0;
9194 case 'Q':
9195 if(!is_valid_digit_range(optarg, "Q", 0 , 255))
9196 STANDALONE_exit(-1);
9197
9198 min_qual_score = atoi(optarg);
9199 break;
9200 case 't':
9201 strcpy(nameFeatureTypeColumn, optarg);
9202 break;
9203 case 'g':
9204 while((*optarg) == ' ') optarg++;
9205 strcpy(nameGeneIDColumn, optarg);
9206 break;
9207 case 'T':
9208 if(!is_valid_digit_range(optarg, "T", 1, FC_MAX_THREADS))
9209 STANDALONE_exit(-1);
9210
9211 threads = atoi(optarg);
9212 break;
9213 case 'd':
9214 if(!is_valid_digit(optarg, "d"))
9215 STANDALONE_exit(-1);
9216
9217 min_dist = atoi(optarg);
9218 break;
9219 case 'D':
9220 if(!is_valid_digit(optarg, "D"))
9221 STANDALONE_exit(-1);
9222
9223 max_dist = atoi(optarg);
9224 break;
9225 case 'p':
9226 is_paired_end_reads_expected[0]='1';
9227 break;
9228 case 'C':
9229 is_Chimeric_Disallowed = 1;
9230 break;
9231 case 'P':
9232 is_PE_Dist_Checked = 1;
9233 break;
9234 case 'B':
9235 is_Both_End_Mapped = 1;
9236 break;
9237 case 'f':
9238 is_GeneLevel = 0;
9239 break;
9240 case 'F':
9241 isGTF = 1;
9242 if(strcmp("SAF", optarg)==0) isGTF=0;
9243 else if(strcmp("GTF", optarg)==0) isGTF=1;
9244 else SUBREADprintf("\nWarning: Unknown annotation format: %s. GTF format is used.\n\n", optarg);
9245 break;
9246 case 'O':
9247 is_Overlap = 1;
9248 break;
9249 case 'R':
9250 if(strcmp(optarg, "SAM")==0) is_ReadSummary_Report = FILE_TYPE_SAM;
9251 else if(strcmp(optarg, "BAM")==0) is_ReadSummary_Report = FILE_TYPE_BAM;
9252 else if(strcmp(optarg, "CORE")==0) is_ReadSummary_Report = FILE_TYPE_RSUBREAD;
9253 else{
9254 SUBREADprintf("\nERROR: unknown output format: '%s'\n\n", optarg);
9255 STANDALONE_exit(-1);
9256 }
9257 break;
9258 case 's':
9259 Strand_Sensitive_Str = strdup(optarg);
9260 int xx;
9261 for(xx =0; Strand_Sensitive_Str[xx]!='\0'; xx++) if(Strand_Sensitive_Str[xx]==',') Strand_Sensitive_Str[xx]='.';
9262 break;
9263 // case 'i':
9264 // term_strncpy(sam_name, optarg,299);
9265 // break;
9266 case 'o':
9267 term_strncpy(out_name, optarg,MAX_FILE_NAME_LENGTH-1);
9268 break;
9269 case 'a':
9270 term_strncpy(annot_name, optarg,MAX_FILE_NAME_LENGTH-1);
9271 break;
9272 case 'L':
9273 long_read_mode = 1;
9274 break;
9275 case 0 : // long options
9276
9277 if(strcmp("countReadPairs", long_options[option_index].name)==0){
9278 is_PE=1;
9279 }
9280
9281 if(strcmp("primary", long_options[option_index].name)==0)
9282 {
9283 is_primary_alignment_only = 1;
9284 }
9285
9286 if(strcmp("readExtension5", long_options[option_index].name)==0)
9287 {
9288 if(!is_valid_digit_range(optarg, "readExtension5", 0, 0x7fffffff))
9289 STANDALONE_exit(-1);
9290 fiveEndExtension = atoi(optarg);
9291 fiveEndExtension = max(0, fiveEndExtension);
9292 }
9293
9294 if(strcmp("readExtension3", long_options[option_index].name)==0)
9295 {
9296 if(!is_valid_digit_range(optarg, "readExtension3", 0, 0x7fffffff))
9297 STANDALONE_exit(-1);
9298 threeEndExtension = atoi(optarg);
9299 threeEndExtension = max(0, threeEndExtension);
9300 }
9301
9302 if(strcmp("fracOverlap", long_options[option_index].name)==0)
9303 {
9304 if(!is_valid_float(optarg, "fracOverlap"))
9305 STANDALONE_exit(-1);
9306 fracOverlap = atof(optarg);
9307 }
9308
9309
9310 if(strcmp("fracOverlapFeature", long_options[option_index].name)==0)
9311 {
9312 if(!is_valid_float(optarg, "fracOverlapFeature"))
9313 STANDALONE_exit(-1);
9314 fracOverlapFeature = atof(optarg);
9315 }
9316
9317 if(strcmp("nonOverlapFeature", long_options[option_index].name)==0){
9318 if(!is_valid_digit_range(optarg, "nonOverlapFeature", 0, 0x7fffffff))
9319 STANDALONE_exit(-1);
9320 max_missing_bases_in_feature = atoi(optarg);
9321 }
9322
9323 if(strcmp("nonOverlap", long_options[option_index].name)==0){
9324 if(!is_valid_digit_range(optarg, "nonOverlap", 0, 0x7fffffff))
9325 STANDALONE_exit(-1);
9326 max_missing_bases_in_read = atoi(optarg);
9327 }
9328
9329 if(strcmp("scCellBarcodeFile", long_options[option_index].name)==0)
9330 {
9331 strcpy(scRNA_cell_barcode_list,optarg);
9332 }
9333
9334 if(strcmp("scSampleSheet", long_options[option_index].name)==0)
9335 {
9336 strcpy(scRNA_sample_sheet,optarg);
9337 }
9338
9339 if(strcmp("scInputMode", long_options[option_index].name)==0)
9340 {
9341 if(strcmp("FASTQ", optarg)==0)
9342 scRNA_input_mode=GENE_INPUT_SCRNA_FASTQ;
9343 if(strcmp("BAM", optarg)==0)
9344 scRNA_input_mode=GENE_INPUT_SCRNA_BAM;
9345 }
9346
9347
9348 if(strcmp("extraAttributes", long_options[option_index].name)==0)
9349 {
9350 extra_column_names = strdup(optarg);
9351 }
9352
9353 if(strcmp("Rpath", long_options[option_index].name)==0)
9354 {
9355 strcpy(Rpath, optarg);
9356 }
9357
9358 if(strcmp("minOverlap", long_options[option_index].name)==0)
9359 {
9360 if(!is_valid_digit(optarg, "minOverlap"))
9361 STANDALONE_exit(-1);
9362 minFragmentOverlap = atoi(optarg);
9363 }
9364
9365 if(strcmp("debugCommand", long_options[option_index].name)==0)
9366 {
9367 strcpy(debug_command, optarg);
9368 }
9369
9370
9371 if(strcmp("ignoreDup", long_options[option_index].name)==0)
9372 {
9373 is_duplicate_ignored = 1 ;
9374 }
9375
9376 if(strcmp("fraction", long_options[option_index].name)==0)
9377 {
9378 use_fraction_multimapping = 1;
9379 }
9380 if(strcmp("tmpDir", long_options[option_index].name)==0){
9381 strcpy(temp_dir, optarg);
9382 }
9383 if(strcmp("maxMOp", long_options[option_index].name)==0){
9384 if(!is_valid_digit_range(optarg, "maxMOp", 1 , 65555))
9385 STANDALONE_exit(-1);
9386 strcpy(max_M_str, optarg);
9387 }
9388 if(strcmp("read2pos", long_options[option_index].name)==0)
9389 {
9390 if(optarg[0]=='3')
9391 reduce_5_3_ends_to_one = REDUCE_TO_3_PRIME_END;
9392 else if(optarg[0]=='5')
9393 reduce_5_3_ends_to_one = REDUCE_TO_5_PRIME_END;
9394 else{
9395 SUBREADprintf("Invalide parameter to the --read2pos option: %s\n", optarg);
9396 STANDALONE_exit(-1);
9397 }
9398 }
9399
9400 if(strcmp("largestOverlap", long_options[option_index].name)==0)
9401 {
9402 use_overlapping_length_break_tie = 1;
9403 }
9404
9405 if(strcmp("detectionCall", long_options[option_index].name)==0)
9406 {
9407 do_detection_call = 1;
9408 }
9409
9410 if(strcmp("donotsort", long_options[option_index].name)==0)
9411 {
9412 do_not_sort = 1;
9413 }
9414
9415 if(strcmp("readShiftSize", long_options[option_index].name)==0)
9416 {
9417 if(!is_valid_digit_range(optarg, "readShiftSize", 1 , 0x7fffffff))
9418 STANDALONE_exit(-1);
9419 read_shift_size = atoi(optarg);
9420 }
9421
9422 if(strcmp("readShiftType", long_options[option_index].name)==0)
9423 {
9424 if(strcmp(optarg,"upstream")!=0 && strcmp(optarg,"downstream")!=0 && strcmp(optarg,"left")!=0 && strcmp(optarg,"right")!=0){
9425 SUBREADprintf("Error: the readShiftType parameter can only be 'upstream', 'downstream', 'left' or 'right'\n");
9426 STANDALONE_exit(-1);
9427 }
9428 strcpy(read_shift_type, optarg);
9429 }
9430
9431 if(strcmp("splitOnly", long_options[option_index].name)==0)
9432 {
9433 if(is_Split_or_Exonic_Only == 2) {
9434 SUBREADprintf("Error: You can not specify both splitOnly and nonSplitOnly\n");
9435 return -1;
9436 }
9437 is_Split_or_Exonic_Only = 1;
9438 }
9439
9440 if(strcmp("restrictedlyNoOverlap", long_options[option_index].name)==0)
9441 {
9442 is_Restrictedly_No_Overlap = 1;
9443 }
9444 if(strcmp("nonSplitOnly", long_options[option_index].name)==0)
9445 {
9446 if(is_Split_or_Exonic_Only == 1) {
9447 SUBREADprintf("Error: You can not specify both splitOnly and nonSplitOnly\n");
9448 return -1;
9449 }
9450 is_Split_or_Exonic_Only = 2;
9451 }
9452
9453 if(strcmp("verbose", long_options[option_index].name)==0){
9454 is_verbose = 1;
9455 }
9456
9457 if(strcmp("byReadGroup", long_options[option_index].name)==0){
9458 assign_reads_to_RG = 1;
9459 }
9460 break;
9461 case '?':
9462 default :
9463 print_usage();
9464 return -1;
9465 break;
9466 }
9467
9468
9469 if(minFragmentOverlap<1)
9470 {
9471 fiveEndExtension = - minFragmentOverlap + 1;
9472 threeEndExtension = - minFragmentOverlap + 1;
9473 minFragmentOverlap = 1;
9474 }
9475
9476 if(out_name[0]==0 || annot_name[0]==0)
9477 {
9478 print_usage();
9479 return -1;
9480 }
9481
9482 for(; optind < argc; optind++)
9483 {
9484 int curr_strlen = strlen(very_long_file_names);
9485 if( very_long_file_names_size - curr_strlen < MAX_FILE_NAME_LENGTH+1)
9486 {
9487 very_long_file_names_size *=2;
9488 //printf("CL=%d ; NS=%d\n", curr_strlen , very_long_file_names_size);
9489 very_long_file_names=realloc(very_long_file_names , very_long_file_names_size);
9490 }
9491
9492 strcat(very_long_file_names, argv[optind]);
9493 strcat(very_long_file_names, FC_FLIST_SPLITOR);
9494 }
9495
9496 very_long_file_names[strlen(very_long_file_names)-1]=0;
9497 std_input_output_mode = (strcmp(very_long_file_names, "") == 0?1:0);
9498
9499 sprintf(strFiveEndExtension, "%d", fiveEndExtension);
9500 sprintf(strThreeEndExtension, "%d", threeEndExtension);
9501 sprintf(strMinFragmentOverlap, "%d", minFragmentOverlap);
9502 sprintf(nthread_str,"%d", threads);
9503 sprintf(min_dist_str,"%d",min_dist);
9504 sprintf(max_dist_str,"%d",max_dist);
9505 sprintf(min_qual_score_str,"%d", min_qual_score);
9506 sprintf(feature_block_size_str,"%d", feature_block_size);
9507 sprintf(fracOverlapStr, "%g", fracOverlap);
9508 sprintf(std_input_output_mode_str,"%d",std_input_output_mode);
9509 sprintf(long_read_mode_str, "%d", long_read_mode);
9510 sprintf(strFeatureFracOverlap, "%g", fracOverlapFeature);
9511 sprintf(max_missing_bases_in_feature_str, "%d", max_missing_bases_in_feature);
9512 sprintf(max_missing_bases_in_read_str, "%d", max_missing_bases_in_read);
9513 sprintf(read_shift_size_str, "%d", read_shift_size);
9514
9515 Rargv[0] = "CreadSummary";
9516 Rargv[1] = annot_name;
9517 Rargv[2] = very_long_file_names;
9518 Rargv[3] = out_name;
9519 Rargv[4] = is_PE?"1":"0";
9520 Rargv[5] = min_dist_str;
9521 Rargv[6] = max_dist_str;
9522 Rargv[7] = is_SAM?"1":"0";
9523 Rargv[8] = is_Overlap?"1":"0";
9524 Rargv[9] = is_GeneLevel?"1":"0";
9525 Rargv[10] = nthread_str;
9526 Rargv[11] = isGTF?"1":"0";
9527 Rargv[12] = Strand_Sensitive_Str;
9528 Rargv[13] = is_ReadSummary_Report == 0 ? "0":(is_ReadSummary_Report == FILE_TYPE_RSUBREAD?"10":(is_ReadSummary_Report == FILE_TYPE_BAM?"500":"50"));
9529 Rargv[14] = is_Both_End_Mapped?"1":"0";
9530 Rargv[15] = is_Chimeric_Disallowed?"1":"0";
9531 Rargv[16] = is_PE_Dist_Checked?"1":"0";
9532 Rargv[17] = nameFeatureTypeColumn;
9533 Rargv[18] = nameGeneIDColumn;
9534 Rargv[19] = min_qual_score_str;
9535 Rargv[20] = is_Multi_Mapping_Allowed?"1":"0";
9536 Rargv[21] = alias_file_name;
9537 Rargv[22] = cmd_rebuilt;
9538 Rargv[23] = is_Input_Need_Reorder?"1":"0";
9539 Rargv[24] = feature_block_size_str;
9540 Rargv[25] = strFiveEndExtension;
9541 Rargv[26] = strThreeEndExtension;
9542 Rargv[27] = strMinFragmentOverlap;
9543 Rargv[28] = is_Split_or_Exonic_Only == 1?"1":(is_Split_or_Exonic_Only == 2 ? "2":"0");
9544 Rargv[29] = (reduce_5_3_ends_to_one == 0?"0":(reduce_5_3_ends_to_one==REDUCE_TO_3_PRIME_END?"3":"5"));
9545 Rargv[30] = debug_command;
9546 Rargv[31] = is_duplicate_ignored?"1":"0";
9547 Rargv[32] = do_not_sort?"1":"0";
9548 Rargv[33] = use_fraction_multimapping?"1":"0";
9549 Rargv[34] = use_overlapping_length_break_tie?"1":"0";
9550 Rargv[35] = Pair_Orientations;
9551 Rargv[36] = do_junction_cnt?"1":"0";
9552 Rargv[37] = fasta_contigs_name;
9553 Rargv[38] = max_M_str;
9554 Rargv[39] = is_Restrictedly_No_Overlap?"1":"0";
9555 Rargv[40] = fracOverlapStr;
9556 Rargv[41] = temp_dir;
9557 Rargv[42] = std_input_output_mode_str;
9558 Rargv[43] = assign_reads_to_RG?"1":"0";
9559 Rargv[44] = long_read_mode_str;
9560 Rargv[45] = is_verbose?"1":"0";
9561 Rargv[46] = strFeatureFracOverlap;
9562 Rargv[47] = do_detection_call?"1":"0";
9563 Rargv[48] = max_missing_bases_in_read_str;
9564 Rargv[49] = max_missing_bases_in_feature_str;
9565 Rargv[50] = is_primary_alignment_only?"1":"0";
9566 Rargv[51] = Rpath;
9567 Rargv[52] = extra_column_names;
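// NOTE(review): slot 53 is never assigned in this sequence and the store below is overwritten by the next line; the index here was presumably meant to be 53 -- confirm.
Rargv[54] = "NA"; // C featureCounts doesn't need the display_annotation_name.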
9569 Rargv[54] = read_shift_type;
9570 Rargv[55] = read_shift_size_str;
9571 Rargv[56] = scRNA_sample_sheet;
9572 Rargv[57] = scRNA_cell_barcode_list;
9573 Rargv[58] = is_paired_end_reads_expected;
9574 Rargv[59] = is_scRNA_BAM_FQ_out_generated?"1":"0";
9575
9576 Rargv[60] = "3";
9577 if(scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ) Rargv[60] = "4";
9578 if(scRNA_input_mode == GENE_INPUT_SCRNA_BAM) Rargv[60] = "5";
9579
9580 int retvalue = -1;
9581 if(is_ReadSummary_Report && (std_input_output_mode & 1)==1) SUBREADprintf("ERROR: no detailed assignment results can be written when the input is from STDIN. Please remove the '-R' option.\n");
9582 else retvalue = readSummary(61, Rargv);
9583
9584 free(very_long_file_names);
9585 free(out_name);
9586 free(alias_file_name);
9587 free(fasta_contigs_name);
9588 if(old_zero_smode != Strand_Sensitive_Str)free(Strand_Sensitive_Str);
9589 free(cmd_rebuilt);
9590 free(Rpath);
9591 free(scRNA_sample_sheet);
9592 free(scRNA_cell_barcode_list);
9593 if(extra_column_names)free(extra_column_names);
9594
9595 return retvalue;
9596
9597 }
9598
9599
9600