1 /***************************************************************
2 
3    The Subread and Rsubread software packages are free
4    software packages:
5 
6    you can redistribute it and/or modify it under the terms
7    of the GNU General Public License as published by the
8    Free Software Foundation, either version 3 of the License,
9    or (at your option) any later version.
10 
11    Subread is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty
13    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 
15    See the GNU General Public License for more details.
16 
17    Authors: Drs Yang Liao and Wei Shi
18 
19   ***************************************************************/
20 
21 
22 #define _GNU_SOURCE
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdarg.h>
26 #include <assert.h>
27 #include <string.h>
28 #include <unistd.h>
29 #include <ctype.h>
30 #include <time.h>
31 
32 
33 #ifndef MAKE_STANDALONE
34   #include <R.h>
35 #endif
36 
37 #include <zlib.h>
38 #include <math.h>
39 #include <pthread.h>
40 #include <getopt.h>
41 #include "subread.h"
42 #include "interval_merge.h"
43 #include "core.h"
44 #include "gene-algorithms.h"
45 #include "sambam-file.h"
46 #include "input-files.h"
47 #include "input-blc.h"
48 #include "hashtable.h"
49 #include "seek-zlib.h"
50 #include "HelperFunctions.h"
51 
52 /********************************************************************/
53 /********************************************************************/
54 /********************************************************************/
55 //  NEW FUNCTION FOR MULTI-THREADING
56 /********************************************************************/
57 /********************************************************************/
58 /********************************************************************/
// Capacity (excluding the terminating NUL) of the chromosome-name arrays in fc_junction_info_t.
59 #define CHROMOSOME_NAME_LENGTH 256
// NOTE(review): the next four limits are not referenced in this part of the file;
// their names suggest caps on read length, hit count, extra annotation columns and
// UMI length -- confirm at their use sites.
60 #define MAX_FC_READ_LENGTH 10001
61 #define MAX_HIT_NUMBER (1000*1000*1000)
62 #define MAX_EXTRA_COLS 15
63 #define MAX_UMI_LEN 14
// Separator between file names in the combined input-file list: octal 026 (0x16),
// a control character. print_FC_configuration() splits the list on it with strtok_r().
64 #define FC_FLIST_SPLITOR "\026"
65 
// A gene name paired with two base positions.
// NOTE(review): presumably the first and last base covered by the gene; the
// coordinate convention and the owner of gene_name are not visible here -- confirm.
66 typedef struct{
67 	char * gene_name;
68 	unsigned int pos_first_base;
69 	unsigned int pos_last_base;
70 } fc_junction_gene_t;
71 
72 
// Number of slots in the per-interval insertion arrays of CIGAR_interval_t.
73 #define MAXIMUM_INSERTION_IN_SECTION 8
74 
// One chromosomal interval, with room to record up to
// MAXIMUM_INSERTION_IN_SECTION insertions (start position and length each).
// NOTE(review): presumably `insertions` counts the filled slots and `chro` points
// to a name owned elsewhere -- confirm where the intervals are built.
75 typedef struct {
76 	char * chro;
77 	unsigned int start_pos;
78 	unsigned int chromosomal_length;
79 	short insertions;
80 	unsigned int insertion_start_pos[ MAXIMUM_INSERTION_IN_SECTION ];
81 	unsigned short insertion_lengths[ MAXIMUM_INSERTION_IN_SECTION ];
82 } CIGAR_interval_t;
83 
84 
85 
// A list of pointers to fc_junction_gene_t records.
// NOTE(review): `space` / `used` look like allocated capacity and current element
// count of `genes` -- confirm against the code that appends to the list.
86 typedef struct {
87 	int space;
88 	int used;
89 	fc_junction_gene_t ** genes;
90 } gene_info_list_t;
91 
// One junction reported by fetch_boundaries() / calc_junctions_from_cigar().
// For an 'N' CIGAR operation, last_exon_base_left is the chromosomal position just
// before the skipped region and first_exon_base_right is the first position after
// it; both name fields then hold the same chromosome. Junctions assembled from
// matching soft-clip boundaries may carry different names on the two sides.
// Each name array holds CHROMOSOME_NAME_LENGTH characters plus the terminating NUL.
92 typedef struct {
93 	char chromosome_name_left[CHROMOSOME_NAME_LENGTH + 1];
94 	char chromosome_name_right[CHROMOSOME_NAME_LENGTH + 1];
95 	unsigned int last_exon_base_left;
96 	unsigned int first_exon_base_right;
97 } fc_junction_info_t;
98 
// One annotation feature: its interval, strand flag, ordering index and optional
// extra column text.
// NOTE(review): feature_name_pos has the same type unistr_cpy() returns, so it is
// presumably an offset into the global unistr_buffer_space; chro_name_pos_delta
// looks like an offset relative to that position -- confirm at the loader.
99 typedef struct {
100 	srInt_64 feature_name_pos;
101 	unsigned int start;
102 	unsigned int end;
103 	unsigned int sorted_order;
104 
105 	unsigned short chro_name_pos_delta;
106 	char is_negative_strand;
107 	char * extra_columns;
108 } fc_feature_info_t;
109 
// A set of 64-bit read tallies: one for assigned reads and one per "unassigned"
// category. An instance is embedded in both the per-thread context and the global
// context (field `read_counters` in each).
// NOTE(review): the exact condition behind each unassigned_* category is decided
// by code outside this part of the file.
110 typedef struct {
111 	srInt_64 assigned_reads;
112 
113 	srInt_64 unassigned_unmapped;
114 	srInt_64 unassigned_read_type;
115 	srInt_64 unassigned_singleton;
116 	srInt_64 unassigned_mappingquality;
117 	srInt_64 unassigned_chimericreads;
118 	srInt_64 unassigned_fragmentlength;
119 	srInt_64 unassigned_duplicate;
120 	srInt_64 unassigned_multimapping;
121 	srInt_64 unassigned_secondary;
122 	srInt_64 unassigned_junction_condition;
123 	srInt_64 unassigned_nofeatures;
124 	srInt_64 unassigned_overlapping_length;
125 	srInt_64 unassigned_ambiguous;
126 } fc_read_counters;
127 
// Element type of the per-thread count_table.
128 typedef srInt_64 read_count_type_t;
129 
// Working state belonging to a single thread. The global context refers to these
// through its `thread_contexts` pointer together with `thread_number`.
//
// Size warning: several 65536-entry arrays (including two arrays of
// CIGAR_interval_t) are embedded by value, so one instance occupies multiple
// megabytes.
// NOTE(review): which of the pointer members are owned (allocated/freed) by the
// thread is not visible in this part of the file.
130 typedef struct {
131 	unsigned short thread_id;
132 	srInt_64 nreads_mapped_to_exon;
133 	srInt_64 all_reads;
134 	//unsigned short current_read_length1;
135 	//unsigned short current_read_length2;
136 	unsigned int count_table_size;
137 	read_count_type_t * count_table;
138 	unsigned int chunk_read_ptr;
139 	pthread_t thread_object;
140 
	// Parallel arrays describing hits for the two reads of a pair (suffix 1 / 2).
	// NOTE(review): presumably each holds hits_number_capacity entries -- confirm.
141 	int hits_number_capacity;
142 	unsigned int * hits_start_pos1;
143 	unsigned int * hits_start_pos2;
144 
145 	unsigned short * hits_length1;
146 	unsigned short * hits_length2;
147 
148 	char ** hits_chro1;
149 	char ** hits_chro2;
150 
151 	srInt_64 * hits_indices1;
152 	srInt_64 * hits_indices2;
153 
	// Fixed-size scratch arrays, 65536 entries each, embedded in the struct.
154 	unsigned int proc_Starting_Chro_Points_1BASE[65536];
155 	unsigned short proc_Starting_Read_Points[65536];
156 	unsigned short proc_Section_Read_Lengths[65536];
157 	char * proc_ChroNames[65536];
158 	char proc_Event_After_Section[65536];
159 	CIGAR_interval_t proc_CIGAR_intervals_R1[65536], proc_CIGAR_intervals_R2[65536];
160 
161 	char ** scoring_buff_gap_chros;
162 	unsigned int * scoring_buff_gap_starts;
163 	unsigned short * scoring_buff_gap_lengths;
164 	char * read_details_buff;
165 	char * bam_compressed_buff;
166 	int read_details_buff_used;
167 
168 	unsigned int * scoring_buff_numbers;
169 	unsigned int * scoring_buff_flags;
170 	unsigned int * scoring_buff_overlappings;
171 	srInt_64 * scoring_buff_exon_ids;
172 	srInt_64 del4_added_reads;
173 
174 	char * chro_name_buff;
175 	z_stream bam_file_output_stream;
176 
177 	HashTable ** scRNA_sample_bc_tables; // sample_ID ==> int64s: cell barcode id <<32 | umi barcode id
178 	HashTable * scRNA_registered_UMI_table; // UMI bases ==> UMI_no +1 in this thread
179 	HashTable * junction_counting_table;   // key: string chro_name \t last_base_previous_exon \t first_base_next_exon
180 	HashTable * splicing_point_table;
181 	HashTable * RG_table;	// rg_name -> [ count_table, sum_fc_read_counters, junction_counting_table,  splicing_point_table]
182 				// NOTE: some reads have no RG tag. These reads are put into the tables in this object but not in the RG_table -> tables.
183 	srInt_64 scRNA_pooled_reads;
184 	srInt_64 *scRNA_reads_per_sample;
185 	srInt_64 *scRNA_mapped_reads_per_sample;
186 	srInt_64 *scRNA_assigned_reads_per_sample;
187 	srInt_64 scRNA_has_valid_sample_index;
188 	srInt_64 scRNA_has_valid_cell_barcode;
	// Read tallies kept by this thread (same layout as the global read_counters).
189 	fc_read_counters read_counters;
190 } fc_thread_thread_context_t;
191 
// Values of fc_thread_global_context_t.read_shift_type. print_FC_configuration()
// reports them as "upstream", "downstream", "left" and "right" respectively.
192 #define READ_SHIFT_UPSTREAM 10
193 #define READ_SHIFT_DOWNSTREAM 20
194 #define READ_SHIFT_LEFT 30
195 #define READ_SHIFT_RIGHT 40
// NOTE(review): not referenced in this part of the file; presumably the bucket
// width used by the reverse_table_start_index in fc_chromosome_index_info -- confirm.
196 #define REVERSE_TABLE_BUCKET_LENGTH 131072
// Values of fc_thread_global_context_t.reduce_5_3_ends_to_one (0 means no reduction
// is reported); print_FC_configuration() shows them as the 5' or 3' end.
197 #define REDUCE_TO_5_PRIME_END 5
198 #define REDUCE_TO_3_PRIME_END 3
199 
// Per-chromosome index record; instances are the values stored in the global
// context's exontable_chro_table (see the comment on that field).
// NOTE(review): the chro_*_table_start/end members look like index ranges into the
// global exontable_* arrays, and reverse_table_start_index like a bucketed lookup
// table with reverse_table_start_index_size entries -- confirm at the builder.
200 typedef struct {
201 	unsigned int chro_number;
202 	unsigned int chro_features;
203 	unsigned int chro_feature_table_start;
204 	unsigned int chro_block_table_start;
205 	unsigned int chro_block_table_end;
206 	unsigned int chro_possible_length;
207 
208 	unsigned short chro_reverse_table_current_size;
209 	unsigned int * reverse_table_start_index;
210 	int reverse_table_start_index_size;
211 	//unsigned int * reverse_table_end_index;
212 } fc_chromosome_index_info;
213 
// Run-wide state of one featureCounts invocation: option flags and thresholds,
// file names, annotation tables, scRNA bookkeeping, the per-thread contexts and
// the global read tallies. Functions in this file receive it as `global_context`.
//
// Members whose role is visible in this part of the file:
//   max_M                - sizes the scratch arrays in calc_junctions_from_cigar()
//                          and caps the number of junctions it reports.
//   unistr_buffer_space/ - one growing character buffer holding NUL-terminated
//   _size/_used            strings; unistr_cpy() appends to it and returns offsets.
//   temp_file_dir        - print_FC_configuration() creates and removes a probe
//                          file here to check that the directory is writable.
//   max_BAM_header_size  - raised by print_FC_configuration() to the largest
//                          probed BAM header size plus 180000.
// Most remaining option members are only echoed by print_FC_configuration().
214 typedef struct {
215 	int is_gene_level;
216 	int is_paired_end_mode_assign;
217 	int is_paired_end_reads_expected;
218 	int is_multi_overlap_allowed;
219 	int restricted_no_multi_overlap;
220 	char * strand_check_mode;
221 	int is_strand_checked;
222 	int is_both_end_required;
223 	int is_chimertc_disallowed;
224 	int is_PE_distance_checked;
225 	int is_multi_mapping_allowed;
226 	int is_primary_alignment_only;
227 	int is_SAM_file;
228 	int is_read_details_out;
229 	int is_junction_no_chro_shown;
230 	int is_unpaired_warning_shown;
231 	int is_stake_warning_shown;
232 	int is_read_too_long_to_SAM_BAM_shown;
233 	int is_split_or_exonic_only;
234 	int is_duplicate_ignored;
235 	int is_first_read_reversed;
236 	int is_second_read_straight;
237 	int is_verbose;
238 	int long_read_minimum_length;
239 	int assign_reads_to_RG;
240 	int use_stdin_file;
241 	int is_mixed_PE_SE;
242 	int disk_is_full;
243 	int do_not_sort;
244 	int reduce_5_3_ends_to_one;
245 	int isCVersion;
246 	int use_fraction_multi_mapping;
247 	int do_junction_counting;
248 	int do_detection_call;
249 	int this_input_number;
250 
251 	int need_calculate_overlap_len;
252 	int need_calculate_fragment_len;
253 
254 	int min_mapping_quality_score;
255 	int min_paired_end_distance;
256 	int max_paired_end_distance;
257 	int max_M;
258 	int feature_block_size;
259 	int read_length;
260 	int line_length;
261 	int longest_chro_name;
262 	int five_end_extension;
263 	int three_end_extension;
264 	int read_shift_type;
265 	int read_shift_size;
266 	int fragment_minimum_overlapping;
267 	int do_scRNA_table;
268 	float fractional_minimum_overlapping;
269 	float fractional_minimum_feature_overlapping;
270 	int use_overlapping_break_tie;
271 	int max_missing_bases_in_read, max_missing_bases_in_feature;
272 
273 	srInt_64 all_reads;
274 
275 	unsigned short thread_number;
276 	fc_thread_thread_context_t * thread_contexts;
277 	int sambam_chro_table_items;
278 	int is_input_bad_format;
279 	int any_reads_are_PE;
280 	SamBam_Reference_Info * sambam_chro_table;
281 	pthread_spinlock_t read_details_out_lock;
282 
283 	SAM_pairer_context_t read_pairer;
284 	SAM_pairer_context_t scRNA_read_pairer;
285 
286 	char * debug_command;
287 	char * unistr_buffer_space;
288 	srInt_64 max_BAM_header_size;
289 	srInt_64 unistr_buffer_size;
290 	srInt_64 unistr_buffer_used;
291 	int is_scRNA_BAM_FQ_out_generated;
292 	int scRNA_input_mode;
293 	float scRNA_umi_cutoff;
294 	int scRNA_rerun_on_persample_BAM;
295 	int * scRNA_applied_umi_cut;
296 	HashTable * scRNA_sample_sheet_table;
297 	ArrayList * scRNA_sample_barcode_list;
298 	ArrayList * scRNA_cell_barcodes_array;
299 	HashTable * scRNA_cell_barcode_head_tail_table;
300 	HashTable * scRNA_lineno1B_to_sampleno1B_tab;
301 	ArrayList * scRNA_sample_id_to_name;
302 	HashTable * scRNA_sample_BAM_writers; // sample_ID(1-base) ==> SamBam_writer
303 	HashTable * lineno_2_sortedno_tab;
304 	int known_cell_barcode_length;
305 	HashTable * junction_features_table;
306 	HashTable * junction_bucket_table;
307 	fasta_contigs_t * fasta_contigs;
308 	HashTable * gene_name_table;	// gene_name -> gene_number
309 	HashTable * BAM_chros_to_anno_table;	// name in annotation file -> alias name
310 	HashTable * GCcontent_table; // gene_name -> "qc_content_frac"
311 	int scRNA_do_one_batch_runner_current;
312 	pthread_spinlock_t scRNA_do_one_batch_runner_lock;
313 	FILE ** scRNA_barcode_batched_bins;
314 	pthread_spinlock_t * scRNA_barcode_batched_locks;
315 	int scRNA_barcode_batched_bin_no;
316 	int scRNA_barcode_batched_max_Rbin_len;
317 	int scRNA_barcode_batched_max_genes;
318 	int scRNA_UMI_length;
319 
320 
321 	char * RGnames_set;
322 	int RGnames_capacity;
323 	int RGnames_ptr;
324 
325 	char alias_file_name[MAX_FILE_NAME_LENGTH];
326 	char input_file_name[MAX_FILE_NAME_LENGTH];
327 	char * input_file_short_name;
328 	char raw_input_file_name[MAX_FILE_NAME_LENGTH];
329 	char output_file_name[MAX_FILE_NAME_LENGTH];
330 	char output_file_path[MAX_FILE_NAME_LENGTH];
331 	char temp_file_dir[MAX_FILE_NAME_LENGTH];
332 	char read_details_path[MAX_FILE_NAME_LENGTH];
333 	char annotation_file_screen_output[MAX_FILE_NAME_LENGTH];
334 	char scRNA_sample_sheet[MAX_FILE_NAME_LENGTH];
335 	char scRNA_cell_barcode_list[MAX_FILE_NAME_LENGTH];
336 	unsigned char ** gene_name_array;	// gene_internal_number -> gene_name
337 	int input_file_unique;
338 
339 	char * reported_extra_columns;
	// NOTE(review): the key below is documented as gene_name, but the value type
	// is a per-chromosome record -- the key may actually be the chromosome name; confirm.
340 	HashTable * exontable_chro_table;	// gene_name -> fc_chromosome_index_info structure (contains chro_number, feature_number, block_start, block_end, etc)
341 	int exontable_nchrs;
342 	int exontable_exons;
343 	int * exontable_geneid;
344 	char * exontable_strand;
345 	char ** exontable_chr;
346 	srInt_64 * exontable_start;
347 	srInt_64 * exontable_stop;
348 	char feature_name_column[2000];
349 	char gene_id_column[100];
350 
351 	srInt_64 * exontable_block_end_index;
352 	srInt_64 * exontable_block_max_end;
353 	srInt_64 * exontable_block_min_start;
354 
355 	char ** exontable_anno_chrs;
356 	char * exontable_anno_chr_2ch;
357 	srInt_64 * exontable_anno_chr_heads;
358 
359 	FILE * read_details_out_FP;
360 	double start_time;
361 
362 	char * cmd_rebuilt;
363 	char   redo;
364 
	// Run-wide read tallies (same layout as each thread's read_counters).
365 	fc_read_counters read_counters;
366 
367 } fc_thread_global_context_t;
368 
// File-scope tunable with external linkage, initialised to 1000.
// NOTE(review): not referenced in this part of the file; the name suggests a
// timing interval, but its unit and users are not visible here -- confirm.
369 unsigned int tick_time = 1000;
370 
/*
 * Walks one CIGAR string whose first aligned base sits at chromosomal position
 * `pos`, and reports
 *   - every 'N' operation as a junction in result_junctions, and
 *   - the soft-clipped ends of the alignment through the has_/left_/right_ outputs.
 *
 * Parameters
 *   chroname         chromosome name stored on both sides of every junction found;
 *                    names longer than the destination array are truncated.
 *   cigar            NUL-terminated CIGAR text. S, M, D, N and I are interpreted;
 *                    any other operation only discards its length.
 *   pos              chromosomal position of the first aligned base.
 *   strand           accepted for interface compatibility; not used.
 *   has_left         set to 1 if a non-zero soft clip precedes the first M/D/N,
 *                    else 0. When 1, *left_on_read receives the clip length and
 *                    *left_pos receives `pos`.
 *   has_right        set to 1 if a non-zero soft clip follows, else 0. When 1,
 *                    *right_on_read receives (read length - clip length - 1) and
 *                    *right_pos the chromosomal position of the last M/D base.
 *   result_junctions output array; may hold at least junction_space elements.
 *   junction_space   number of junctions that may be written; further 'N'
 *                    operations are still walked but not recorded.
 *
 * The left_* and right_* outputs are written only when the matching has_* flag is 1.
 *
 * Returns the number of junctions written to result_junctions.
 */
int fetch_boundaries(char * chroname,char * cigar, unsigned int pos, char strand, int *has_left, unsigned short *left_on_read, unsigned int *left_pos, int *has_right, unsigned short *right_on_read, unsigned int *right_pos, fc_junction_info_t *  result_junctions, int junction_space){

	int cigar_cursor, nch, read_len = 0, ret = 0;
	unsigned int chro_cursor = pos, tmpi = 0;
	unsigned int right_boundary = 0;
	unsigned short left_clipped = 0;
	unsigned short right_clipped = 0;

	(void) strand;
	*has_right = 0;
	*has_left = 0;

	// Read each byte as unsigned char: isdigit() is undefined for negative values.
	for(cigar_cursor = 0; (nch = (unsigned char) cigar[cigar_cursor]) != 0; cigar_cursor++){
		if(isdigit(nch)){
			tmpi = tmpi*10 + (nch - '0');
			continue;
		}

		if(nch == 'S'){
			// A clip seen before the chromosome cursor has moved is the leading clip.
			if(chro_cursor == pos) left_clipped = tmpi;
			else right_clipped = tmpi;
			read_len += tmpi;
		}else if(nch == 'M' || nch == 'D'){
			if(nch == 'M') read_len += tmpi;

			chro_cursor += tmpi;
			right_boundary = chro_cursor - 1;
		}else if(nch == 'N'){
			if(ret < junction_space){
				fc_junction_info_t * junc = result_junctions + ret;

				junc -> last_exon_base_left = chro_cursor - 1;
				junc -> first_exon_base_right = chro_cursor + tmpi;
				// Bounded copies: the name arrays have a fixed size.
				snprintf(junc -> chromosome_name_left, sizeof(junc -> chromosome_name_left), "%s", chroname);
				snprintf(junc -> chromosome_name_right, sizeof(junc -> chromosome_name_right), "%s", chroname);

				ret ++;
			}
			chro_cursor += tmpi;
		}else if(nch == 'I'){
			read_len += tmpi;
		}

		tmpi = 0;
	}

	if(left_clipped){
		*has_left = 1;
		*left_on_read = left_clipped;
		*left_pos = pos;
	}
	if(right_clipped){
		*has_right = 1;
		*right_on_read = read_len - right_clipped - 1;
		*right_pos = right_boundary;
	}
	return ret;
}
424 
425 // This function parses the cigar string and returns the number of exon-exon junctions found in the cigar.
426 // It returns 0 if no junctions are found.
calc_junctions_from_cigar(fc_thread_global_context_t * global_context,int flag,char * chroname,unsigned int pos,char * cigar,char * extra_tags,fc_junction_info_t * result_junctions)427 int calc_junctions_from_cigar(fc_thread_global_context_t * global_context, int flag, char * chroname, unsigned int pos, char * cigar , char * extra_tags, fc_junction_info_t * result_junctions){
428 	unsigned short boundaries_inclusive_base_on_read[global_context -> max_M];
429 	unsigned int boundaries_inclusive_base_pos[global_context -> max_M];
430 	char boundaries_chromosomes[global_context -> max_M][MAX_CHROMOSOME_NAME_LEN];
431 	char boundaries_extend_to_left_on_read[global_context -> max_M];
432 	int boundaries = 0;
433 
434 	int cigar_cursor = 0, nch, ret = 0, read_len = 0, x1, x2;
435 	unsigned int chro_cursor = pos, tmpi = 0;
436 	unsigned short left_clipped = 0;
437 	unsigned short right_clipped = 0;
438 
439 	for(; (nch = cigar[cigar_cursor])!=0 ; cigar_cursor++){
440 		if(isdigit(nch)){
441 			tmpi = tmpi*10 + (nch - '0');
442 		} else {
443 			if (nch == 'S'){
444 				if(chro_cursor == pos) left_clipped = tmpi;else right_clipped=tmpi;
445 				read_len += tmpi;
446 			} else if(nch == 'M' || nch == 'D'){
447 				if(nch == 'M')read_len += tmpi;
448 
449 				chro_cursor += tmpi;
450 			} else if(nch == 'N'){
451 				unsigned int last_exon_last_base = chro_cursor - 1;
452 				unsigned int next_exon_first_base = chro_cursor + tmpi;
453 				if(ret <= global_context -> max_M - 1){
454 					result_junctions[ret].last_exon_base_left = last_exon_last_base;
455 					result_junctions[ret].first_exon_base_right = next_exon_first_base;
456 					strcpy(result_junctions[ret].chromosome_name_left, chroname);
457 					strcpy(result_junctions[ret].chromosome_name_right, chroname);
458 
459 					ret ++;
460 				}
461 				chro_cursor += tmpi;
462 			} else if(nch == 'I') read_len += tmpi;
463 			tmpi = 0;
464 		}
465 	}
466 	if(left_clipped){
467 		strcpy(boundaries_chromosomes[boundaries] , chroname);
468 		boundaries_extend_to_left_on_read[boundaries] = 0;
469 		boundaries_inclusive_base_pos[boundaries] = pos;
470 		boundaries_inclusive_base_on_read[boundaries++] = left_clipped;
471 	}
472 	if(right_clipped){
473 		strcpy(boundaries_chromosomes[boundaries] , chroname);
474 		boundaries_extend_to_left_on_read[boundaries] = 1;
475 		boundaries_inclusive_base_pos[boundaries] = chro_cursor - 1;
476 		boundaries_inclusive_base_on_read[boundaries++] = read_len - right_clipped - 1;
477 	}
478 
479 	int tag_cursor=0;
480 
481 	//if(strstr(extra_tags, "CG:Z")) {
482 	//	SUBREADprintf("CIGAR=%s, EXTRA=%s\n", cigar, extra_tags);
483 	//}
484 	int status = PARSE_STATUS_TAGNAME;
485 	char tag_name[2], typechar=0;
486 	int tag_inner_cursor=0;
487 
488 	char read_main_strand = (((flag & 0x10) == 0x10) == ((flag & 0x40)==0x40))?'-':'+';
489 	char current_fusion_char[MAX_CHROMOSOME_NAME_LEN];
490 	unsigned int current_fusion_pos = 0;
491 	char current_fusion_strand = 0;
492 	char current_fusion_cigar[global_context -> max_M * 15];
493 	current_fusion_cigar [0] =0;
494 	current_fusion_char [0]=0;
495 
496 	while(1){
497 		int nch = extra_tags[tag_cursor];
498 		if(status == PARSE_STATUS_TAGNAME){
499 			tag_name[tag_inner_cursor++] = nch;
500 			if(tag_inner_cursor == 2){
501 				status = PARSE_STATUS_TAGTYPE;
502 				tag_cursor += 1;
503 				assert(extra_tags[tag_cursor] == ':');
504 			}
505 		}else if(status == PARSE_STATUS_TAGTYPE){
506 			typechar = nch;
507 			tag_cursor +=1;
508 			assert(extra_tags[tag_cursor] == ':');
509 			tag_inner_cursor = 0;
510 			status = PARSE_STATUS_TAGVALUE;
511 		}else if(status == PARSE_STATUS_TAGVALUE){
512 			if(nch == '\t' || nch == 0){
513 				if(current_fusion_cigar[0] && current_fusion_char[0] && current_fusion_pos && current_fusion_strand){
514 
515 					unsigned int left_pos = 0, right_pos = 0;
516 					unsigned short left_on_read = 0, right_on_read = 0;
517 					int has_left = 0, has_right = 0;
518 
519 					unsigned int start_pos = current_fusion_pos;
520 					if(current_fusion_strand!=read_main_strand)
521 						start_pos = find_left_end_cigar(current_fusion_pos, current_fusion_cigar);
522 
523 					ret += fetch_boundaries(current_fusion_char, current_fusion_cigar, start_pos, current_fusion_strand, &has_left, &left_on_read, &left_pos, &has_right, &right_on_read, &right_pos, result_junctions + ret, global_context -> max_M - ret );
524 
525 					if(has_left){
526 						strcpy(boundaries_chromosomes[boundaries] , current_fusion_char);
527 						boundaries_extend_to_left_on_read[boundaries] = 0;
528 						boundaries_inclusive_base_pos[boundaries] = left_pos;
529 						boundaries_inclusive_base_on_read[boundaries++] = left_on_read;
530 					}
531 					if(has_right){
532 						strcpy(boundaries_chromosomes[boundaries] , current_fusion_char);
533 						boundaries_extend_to_left_on_read[boundaries] = 1;
534 						boundaries_inclusive_base_pos[boundaries] = right_pos;
535 						boundaries_inclusive_base_on_read[boundaries++] = right_on_read;
536 					}
537 
538 
539 			//		SUBREADprintf("BOUND_EXT: %s:%u (at %u) (%c)  ~  %s:%u (at %u) (%c)\n", current_fusion_char, left_pos, left_on_read, has_left?'Y':'X' , current_fusion_char, right_pos, right_on_read,  has_right?'Y':'X');
540 
541 					current_fusion_pos = 0;
542 					current_fusion_strand = 0;
543 					current_fusion_cigar [0] =0;
544 					current_fusion_char [0]=0;
545 				}
546 
547 				tag_inner_cursor = 0;
548 				status = PARSE_STATUS_TAGNAME;
549 			}else{
550 				if(tag_name[0]=='C' && tag_name[1]=='C' && typechar == 'Z'){
551 					current_fusion_char[tag_inner_cursor++]=nch;
552 					current_fusion_char[tag_inner_cursor]=0;
553 				}else if(tag_name[0]=='C' && tag_name[1]=='G' && typechar == 'Z'){
554 					current_fusion_cigar[tag_inner_cursor++]=nch;
555 					current_fusion_cigar[tag_inner_cursor]=0;
556 				}else if(tag_name[0]=='C' && tag_name[1]=='P' && typechar == 'i'){
557 					current_fusion_pos = current_fusion_pos * 10 + (nch - '0');
558 				}else if(tag_name[0]=='C' && tag_name[1]=='T' && typechar == 'Z'){
559 					current_fusion_strand = nch;
560 				}
561 			}
562 		}
563 
564 		if(nch == 0){
565 			assert(status == PARSE_STATUS_TAGNAME);
566 			break;
567 		}
568 
569 		tag_cursor++;
570 	}
571 
572 
573 	//for(x1 = 0; x1 < boundaries; x1++)
574 	//	SUBREADprintf("HAS: LR:%d, READ:%d\n", boundaries_extend_to_left_on_read[x1], boundaries_inclusive_base_on_read[x1]);
575 
576 	for(x1 = 0; x1 < boundaries; x1++)
577 		for(x2 = 0; x2 < boundaries; x2++){
578 			if(x1==x2) continue;
579 			if(boundaries_chromosomes[x1][0]==0 || boundaries_chromosomes[x2][0]==0) continue;
580 			if(boundaries_extend_to_left_on_read[x1] == 1 && boundaries_extend_to_left_on_read[x2] == 0){
581 				if( boundaries_inclusive_base_on_read[x1] == boundaries_inclusive_base_on_read[x2]-1 ){
582 
583 					if(ret <= global_context -> max_M - 1){
584 						result_junctions[ret].last_exon_base_left = boundaries_inclusive_base_pos[x1];
585 						result_junctions[ret].first_exon_base_right = boundaries_inclusive_base_pos[x2];
586 						strcpy(result_junctions[ret].chromosome_name_left, boundaries_chromosomes[x1]);
587 						strcpy(result_junctions[ret].chromosome_name_right, boundaries_chromosomes[x2]);
588 						ret++;
589 					}
590 
591 
592 	//				SUBREADprintf("MATCH: %d ~ %d\n", boundaries_inclusive_base_on_read[x1], boundaries_inclusive_base_on_read[x2]);
593 					boundaries_chromosomes[x1][0]=0;
594 					boundaries_chromosomes[x2][0]=0;
595 				}
596 			}
597 		}
598 
599 	//for(x1 = 0; x1 < boundaries; x1++)
600 	//	if(boundaries_chromosomes[x1][0])
601 	//		SUBREADprintf("LEFT: LR:%d, READ:%d\n", boundaries_extend_to_left_on_read[x1], boundaries_inclusive_base_on_read[x1]);
602 	return ret;
603 }
604 
605 
unistr_cpy(fc_thread_global_context_t * global_context,char * str,int strl)606 srInt_64 unistr_cpy(fc_thread_global_context_t * global_context, char * str, int strl)
607 {
608 	srInt_64 ret;
609 	if(global_context->unistr_buffer_used + strl >= global_context->unistr_buffer_size-1)
610 	{
611 		if( global_context->unistr_buffer_size < (1000llu*1000u*1000u*32)) // 32GB
612 		{
613 			global_context -> unistr_buffer_size = global_context->unistr_buffer_size /2 *3;
614 			global_context -> unistr_buffer_space = realloc(global_context -> unistr_buffer_space, global_context->unistr_buffer_size);
615 		}
616 		else
617 		{
618 			SUBREADprintf("Error: exceed memory limit (32GB) for storing feature names.\n");
619 			return 0xffffffffu;
620 		}
621 	}
622 
623 	strcpy(global_context -> unistr_buffer_space + global_context->unistr_buffer_used, str);
624 	ret = global_context->unistr_buffer_used;
625 
626 	global_context->unistr_buffer_used += strl +1;
627 
628 	return ret;
629 }
630 
print_FC_configuration(fc_thread_global_context_t * global_context,char * annot,char * sam,char * out,int is_sam,int is_GTF,int * n_input_files,int isReadSummaryReport,char * PE_exp,char * PE_ass)631 int print_FC_configuration(fc_thread_global_context_t * global_context, char * annot, char * sam, char * out, int is_sam, int is_GTF, int *n_input_files, int isReadSummaryReport, char * PE_exp, char * PE_ass)
632 {
633 	char * tmp_ptr1 = NULL , * next_fn, *sam_used = malloc(strlen(sam)+MAX_FILE_NAME_LENGTH), sam_ntxt[30],bam_ntxt[30], next_ntxt[50];
634 	int nfiles=1, nBAMfiles = 0, nNonExistFiles = 0, x1;
635 	char MAC_or_random[13];
636 	mac_or_rand_str(MAC_or_random);
637 
638 	/*
639 	if(global_context -> max_missing_bases_in_read >= 0 && global_context -> fractional_minimum_overlapping > 0.000001){
640 		SUBREADprintf("\nERROR: multiple filtering conditions on overlapping bases in reads\n");
641 		return 1;
642 	}
643 
644 	if(global_context -> max_missing_bases_in_feature >= 0 && global_context -> fractional_minimum_feature_overlapping > 0.000001){
645 		SUBREADprintf("\nERROR: multiple filtering conditions on overlapping bases in features\n");
646 		return 1;
647 	}*/
648 
649 	sprintf(sam_used, "%s/featureCounts_test_file_writable-%06d-%s.tmp", global_context -> temp_file_dir, getpid(), MAC_or_random);
650 	FILE * fp = fopen(sam_used,"w");
651 	if(fp){
652 		fclose(fp);
653 		unlink(sam_used);
654 	}else{
655 		SUBREADprintf("\nERROR: temporary directory is not writable: '%s'\n\n", global_context -> temp_file_dir);
656 		return 1;
657 	}
658 
659 	strcpy(sam_used, sam);
660 	nfiles = 0;
661 	while(1)
662 	{
663 		next_fn = strtok_r(nfiles==0?sam_used:NULL, FC_FLIST_SPLITOR, &tmp_ptr1);
664 		if(next_fn == NULL || strlen(next_fn)<1) break;
665 		nfiles++;
666 
667 		srInt_64 BAM_header_size = -1;
668 		int file_probe = is_certainly_bam_file(next_fn, NULL, &BAM_header_size);
669 		if(BAM_header_size>0) global_context -> max_BAM_header_size = max( global_context -> max_BAM_header_size , BAM_header_size + 180000);
670 		if(file_probe==-1){
671 			nNonExistFiles++;
672 			if(global_context -> use_stdin_file){
673 				SUBREADprintf("\nERROR: no valid SAM or BAM file is received from <STDIN>\n\n");
674 			}else{
675 				SUBREADprintf("\nERROR: invalid parameter: '%s'\n\n", next_fn);
676 			}
677 			return 1;
678 		}
679 		if(file_probe == 1) nBAMfiles++;
680 	}
681 
682 	SUBREADputs("");
683 	print_subread_logo();
684 	SUBREADputs("");
685 	print_in_box(80,1,1,"featureCounts setting");
686 	print_in_box(80,0,0,"");
687 
688 	sam_ntxt[0]=0;
689 	bam_ntxt[0]=0;
690 	next_ntxt[0]=0;
691 
692 	if(nNonExistFiles)
693 		sprintf(next_ntxt, "%d unknown file%s", nNonExistFiles, nNonExistFiles>1?"s":"");
694 	if(nBAMfiles)
695 		sprintf(bam_ntxt, "%d BAM file%s  ", nBAMfiles, nBAMfiles>1?"s":"");
696 	if(nfiles-nNonExistFiles-nBAMfiles)
697 		sprintf(sam_ntxt, "%d SAM file%s  ", nfiles-nNonExistFiles-nBAMfiles , (nfiles-nNonExistFiles-nBAMfiles)>1?"s":"");
698 
699 
700 	strcpy(sam_used, sam);
701 
702 	print_in_box(80,0,0,"            Input files : %s%s%s", sam_ntxt, bam_ntxt, next_ntxt);
703 	print_in_box(80,0,0,"");
704 	nfiles=0;
705 
706 	while(1){
707 		next_fn = strtok_r(nfiles==0?sam_used:NULL, FC_FLIST_SPLITOR, &tmp_ptr1);
708 		if(next_fn == NULL || strlen(next_fn)<1) break;
709 		//int is_first_read_PE = 0 , file_probe = is_certainly_bam_file(next_fn, &is_first_read_PE, NULL);
710 		print_in_box(89,0,0,"                          %c[36m%s%c[0m",CHAR_ESC, global_context -> use_stdin_file?"<STDIN>":get_short_fname(next_fn),CHAR_ESC);
711 		nfiles++;
712 	}
713 
714 	(*n_input_files) = nfiles;
715 	print_in_box(80,0,0,"");
716 
717 	if(global_context -> annotation_file_screen_output[0]==0){
718 		print_in_box(80,0,0,"            Output file : %s", get_short_fname(out));
719 		print_in_box(80,0,0,"                Summary : %s.summary", get_short_fname(out));
720 	}
721 
722 	char * PEassignStr = malloc(nfiles * 6);
723 	char * PEexpectStr = malloc(nfiles * 6);
724 
725 	int exp_all_same = 1, ass_all_same = 1;
726 
727 	sprintf(PEexpectStr,"%s, ", (PE_exp[0]=='1')?"yes":"no");
728 	char * Ystr = nfiles>5?"Y":"yes";
729 	char * Nstr = nfiles>5?"N":"no";
730 	if(PE_exp[1]){
731 		for(x1=1; PE_exp[x1]; x1++)
732 			if(PE_exp[x1]!=PE_exp[0]) exp_all_same=0;
733 
734 		if(!exp_all_same){
735 			PEexpectStr[0]=0;
736 			for(x1=0; PE_exp[x1]; x1++)
737 				sprintf(PEexpectStr+strlen(PEexpectStr), "%s, ", (PE_exp[x1]=='1')?Ystr:Nstr);
738 		}
739 	}
740 	PEexpectStr[strlen(PEexpectStr)-2]=0;
741 
742 	sprintf(PEassignStr,"%s, ", (PE_ass[0]=='1')?"yes":"no");
743 	if(PE_ass[1]){
744 		for(x1=1; PE_ass[x1]; x1++)
745 			if(PE_ass[x1]!=PE_ass[0]) ass_all_same=0;
746 
747 		if(!ass_all_same){
748 			PEassignStr[0]=0;
749 			for(x1=0; PE_ass[x1]; x1++)
750 				sprintf(PEassignStr+strlen(PEassignStr), "%s, ", (PE_ass[x1]=='1')?Ystr:Nstr);
751 		}
752 	}
753 	PEassignStr[strlen(PEassignStr)-2]=0;
754 
755 	print_in_box(80,0,0,"             Paired-end : %s",PEexpectStr);
756 	print_in_box(80,0,0,"       Count read pairs : %s",PEassignStr);
757 	free(PEassignStr);
758 	free(PEexpectStr);
759 
760 	if(global_context -> annotation_file_screen_output[0])
761 		print_in_box(80,0,0,"             Annotation : %s",global_context -> annotation_file_screen_output);
762 	else
763 		print_in_box(80,0,0,"             Annotation : %s (%s)", get_short_fname(annot), is_GTF?"GTF":"SAF");
764 	print_in_box(80,0,0,"     Dir for temp files : %s", global_context->temp_file_dir);
765 
766 	if(global_context -> do_scRNA_table){
767 		print_in_box(80,0,0,"");
768 		print_in_box(80,0,0,"      scRNA count table : <input_file>.scRNA.table");
769 		print_in_box(80,0,0,"     scRNA sample sheet : %s", get_short_fname(global_context->scRNA_sample_sheet));
770 		print_in_box(80,0,0,"     scRNA barcode list : %s", get_short_fname(global_context->scRNA_cell_barcode_list));
771 	}
772 
773 	if(isReadSummaryReport){
774 		print_in_box(80,0,0,"     Assignment details : <input_file>.featureCounts%s", isReadSummaryReport == FILE_TYPE_BAM?".bam":(isReadSummaryReport == FILE_TYPE_SAM?".sam":""));
775 		if(global_context -> read_details_path[0])
776 			print_in_box(80,0,0,"    Details output path : %s", global_context ->read_details_path);
777 		else
778 			print_in_box(80,0,0,"                     (Note that files are saved to the output directory)");
779 		print_in_box(80,0,0,"");
780 	}
781 
782 	if(global_context -> do_junction_counting)
783 		print_in_box(80,0,0,"      Junction Counting : <output_file>.jcounts");
784 	#ifdef MAKE_STANDALONE
785 	#endif
786 
787 	if(global_context -> alias_file_name[0])
788 		print_in_box(80,0,0,"  Chromosome alias file : %s", get_short_fname(global_context -> alias_file_name));
789 
790 	#ifdef MAKE_STANDALONE
791 	print_in_box(80,0,0,"");
792 	#endif
793 	print_in_box(80,0,0,"                Threads : %d", global_context->thread_number);
794 	print_in_box(80,0,0,"                  Level : %s level", global_context->is_gene_level?"meta-feature":"feature");
795 //	print_in_box(80,0,0,"             Paired-end : %s", global_context->is_paired_end_mode_assign?"yes":"no");
796 	if(global_context -> do_not_sort && global_context->is_paired_end_mode_assign) {
797 		print_in_box(80,0,0,"       Sorting PE Reads : never");
798 		print_in_box(80,0,0,"");
799 	}
800 
801 	char * multi_mapping_allow_mode = "not counted";
802 	if(global_context->is_multi_mapping_allowed)
803 		multi_mapping_allow_mode = global_context -> use_fraction_multi_mapping?"counted (fractional)": "counted";
804 
805 	print_in_box(80,0,0,"     Multimapping reads : %s", multi_mapping_allow_mode);
806 
807 	if(global_context-> is_primary_alignment_only)
808 		print_in_box(80,0,0,"    Multiple alignments : primary alignment only");
809 
810 	print_in_box(80,0,0,"Multi-overlapping reads : %s", global_context->is_multi_overlap_allowed?"counted":"not counted");
811 	if(global_context -> is_split_or_exonic_only)
812 		print_in_box(80,0,0,"       Split alignments : %s", (1 == global_context -> is_split_or_exonic_only)?"only split alignments":"only exonic alignments");
813 	print_in_box(80,0,0,"  Min overlapping bases : %d", global_context -> fragment_minimum_overlapping);
814 	if(global_context -> max_missing_bases_in_read >= 0)
815 		print_in_box(80,0,0,"      Max missing bases : %d in reads", global_context -> max_missing_bases_in_read);
816 	if(global_context -> max_missing_bases_in_feature >= 0)
817 		print_in_box(80,0,0,"      Max missing bases : %d in features", global_context -> max_missing_bases_in_feature);
818 	if(global_context -> fractional_minimum_overlapping > 0.000001)
819 		print_in_box(81,0,0,"  Min overlapping frac. : %0.1f%%%% to reads", global_context -> fractional_minimum_overlapping*100);
820 	if(global_context -> fractional_minimum_feature_overlapping > 0.000001)
821 		print_in_box(81,0,0,"  Min overlapping frac. : %0.1f%%%% to features", global_context -> fractional_minimum_feature_overlapping*100);
822 	if(global_context -> read_shift_size >0)
823 		print_in_box(80,0,0,"             Read shift : %d to %s", global_context -> read_shift_size, global_context -> read_shift_type==READ_SHIFT_UPSTREAM?"upstream":( global_context -> read_shift_type==READ_SHIFT_DOWNSTREAM?"downstream":( global_context -> read_shift_type==READ_SHIFT_LEFT?"left":"right")));
824 	if(global_context -> five_end_extension || global_context -> three_end_extension)
825 		print_in_box(80,0,0,"         Read extension : %d on 5' and %d on 3' ends", global_context -> five_end_extension , global_context -> three_end_extension);
826 	if(global_context -> reduce_5_3_ends_to_one)
827 		print_in_box(80,0,0,"         Read reduction : to %d' end" , global_context -> reduce_5_3_ends_to_one == REDUCE_TO_5_PRIME_END ?5:3);
828 	if(global_context -> is_duplicate_ignored)
829 		print_in_box(80,0,0,"       Duplicated Reads : ignored");
830 	if(global_context -> long_read_minimum_length < 5000)
831 		print_in_box(80,0,0,"         Long read mode : yes");
832 	//print_in_box(80,0,0,"      Read orientations : %c%c", global_context->is_first_read_reversed?'r':'f', global_context->is_second_read_straight?'f':'r' );
833 
834 	if(global_context->is_paired_end_mode_assign)
835 	{
836 		print_in_box(80,0,0,"");
837 		print_in_box(80,0,0,"         Chimeric reads : %s", global_context->is_chimertc_disallowed?"not counted":"counted");
838 		print_in_box(80,0,0,"       Both ends mapped : %s", global_context->is_both_end_required?"required":"not required");
839 
840 		if(global_context->is_PE_distance_checked)
841 			print_in_box(80,0,0,"        Fragment length : %d - %d", global_context -> min_paired_end_distance, global_context -> max_paired_end_distance);
842 	}
843 
844 	print_in_box(80,0,0,"");
845 	print_in_box(80,2,1,"");
846 	SUBREADputs("");
847 	print_in_box(80,1,1,"Running");
848 	print_in_box(80,0,0,"");
849 	if( global_context -> max_BAM_header_size > 32 * 1024 * 1024 ){
850 	}
851 	if(global_context->BAM_chros_to_anno_table)
852 		print_in_box(80,0,0,"%ld chromosome name aliases are loaded.", global_context -> BAM_chros_to_anno_table ->numOfElements);
853 
854 	free(sam_used);
855 	return 0;
856 }
857 
/*
 * Prints the closing lines of the on-screen report once counting is done.
 *
 * In standalone builds (MAKE_STANDALONE) the user is pointed to the
 * "<out>.summary" file; in other builds only the closing lines are printed.
 *
 * global_context : unused here; kept so that the signature stays stable.
 * out            : output file name used to build the ".summary" hint.
 */
void print_FC_results(fc_thread_global_context_t * global_context, char * out)
{
	print_in_box(80,0,0,"");
	#ifdef MAKE_STANDALONE
	print_in_box(80,0,PRINT_BOX_WRAPPED,"Summary of counting results can be found in file \"%s.summary\"", out);
	print_in_box(80,0,0,"");
	#endif
	print_in_box(80,2,1,"");
	SUBREADputs("");
	/* The statement that used to follow an unconditional return here could
	 * never run and has been removed. */
}
872 
/*
 * Comparison callback over two NUL-terminated strings passed as untyped
 * pointers (it is installed with HashTableSetKeyComparisonFunction in this file).
 *
 * Returns the strcmp() result: 0 when both strings are equal, a negative or
 * positive value otherwise. The arguments are only read, so the const
 * qualifier is kept on the casts.
 */
int fc_strcmp(const void * s1, const void * s2)
{
	return strcmp((const char *)s1, (const char *)s2);
}
877 
junc_gene_free(void * vv)878 void junc_gene_free(void *vv){
879 	fc_junction_gene_t *v = vv;
880 	free(v -> gene_name);
881 	free(v);
882 }
883 
/*
 * Records the span of one annotated feature for junction counting.
 *
 * junction_features_table maps a chromosome name to a second hash table that
 * maps a feature (gene) name to an fc_junction_gene_t. The first time a
 * chromosome is seen its inner table is created and stored under a private
 * copy of the chromosome name. The first time a feature name is seen on that
 * chromosome a record is created with [start, stop]; afterwards the stored
 * interval is widened so that it covers every registered piece.
 *
 * The inner table is keyed by the record's own gene_name copy and frees its
 * values through junc_gene_free().
 */
void register_junc_feature(fc_thread_global_context_t *global_context, char * feature_name, char * chro, unsigned int start, unsigned int stop){
	HashTable * genes_on_chro = HashTableGet(global_context -> junction_features_table, chro);

	if(genes_on_chro == NULL){
		/* First feature on this chromosome: build its gene table. */
		genes_on_chro = HashTableCreate(48367);
		HashTableSetDeallocationFunctions(genes_on_chro, NULL, junc_gene_free);
		HashTableSetKeyComparisonFunction(genes_on_chro, fc_strcmp);
		HashTableSetHashFunction(genes_on_chro, fc_chro_hash);

		size_t key_bytes = strlen(chro)+1;
		char * chro_key = malloc(key_bytes);
		memcpy(chro_key, chro, key_bytes);
		HashTablePut(global_context -> junction_features_table, chro_key, genes_on_chro);
	}

	fc_junction_gene_t * known_gene = HashTableGet(genes_on_chro, feature_name);
	if(known_gene != NULL){
		/* Already registered: only stretch the covered interval. */
		known_gene -> pos_first_base = min(start, known_gene -> pos_first_base);
		known_gene -> pos_last_base = max(stop, known_gene -> pos_last_base);
		return;
	}

	fc_junction_gene_t * new_gene = malloc(sizeof(fc_junction_gene_t));
	new_gene -> gene_name = strdup(feature_name);
	new_gene -> pos_first_base = start;
	new_gene -> pos_last_base = stop;
	HashTablePut(genes_on_chro, new_gene -> gene_name, new_gene);
}
910 
free_bucket_table_list(void * pv)911 void free_bucket_table_list(void * pv){
912 	gene_info_list_t * list = (gene_info_list_t*) pv;
913 	free(list -> genes);
914 	free(list);
915 }
916 
/*
 * Tells whether a feature type appears in a comma-separated list.
 *
 * infile : the feature type read from the annotation line (e.g. "exon").
 * needed : comma-separated list of accepted feature types (e.g. "exon,CDS").
 *
 * Returns 1 when `infile` is exactly equal to one of the list items and 0
 * otherwise. Empty items (",," or a leading/trailing comma) are ignored, so
 * an empty `infile` never matches. A NULL argument yields 0 instead of a
 * crash; this happens when an annotation line has too few columns.
 *
 * The list is scanned in place: neither string is modified and no copy of
 * the list is placed on the stack.
 */
int match_feature_name_column(char * infile, char * needed){
	if(infile == NULL || needed == NULL) return 0;

	size_t wanted_len = strlen(infile);
	const char * item = needed;

	while(*item){
		const char * comma = strchr(item, ',');
		size_t item_len = comma ? (size_t)(comma - item) : strlen(item);

		/* item_len > 0 keeps the "empty items are skipped" rule. */
		if(item_len > 0 && item_len == wanted_len && memcmp(item, infile, item_len) == 0)
			return 1;

		if(comma == NULL) break;
		item = comma + 1;
	}
	return 0;
}
928 
929 #define JUNCTION_BUCKET_STEP (128*1024)
930 
locate_junc_features(fc_thread_global_context_t * global_context,char * chro,unsigned int pos,fc_junction_gene_t ** ret_info,int max_ret_info_size)931 int locate_junc_features(fc_thread_global_context_t *global_context, char * chro, unsigned int pos, fc_junction_gene_t ** ret_info, int max_ret_info_size){
932 	gene_info_list_t * list = NULL;
933 	char bucket_key[CHROMOSOME_NAME_LENGTH + 20];
934 
935 	if(global_context -> BAM_chros_to_anno_table) {
936 		char * anno_chro_name = HashTableGet( global_context -> BAM_chros_to_anno_table , chro);
937 		if(anno_chro_name){
938 			sprintf(bucket_key, "%s:%u", anno_chro_name, pos - pos % JUNCTION_BUCKET_STEP);
939 			list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
940 		}
941 	}
942 
943 	if(list == NULL){
944 		sprintf(bucket_key, "%s:%u", chro, pos - pos % JUNCTION_BUCKET_STEP);
945 		list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
946 	}
947 
948 	if(list == NULL && strlen(chro)>3 && memcmp(chro, "chr", 3)==0){
949 		sprintf(bucket_key, "%s:%u", chro+3, pos - pos % JUNCTION_BUCKET_STEP);
950 		list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
951 	}
952 
953 	if(list == NULL){
954 		sprintf(bucket_key, "chr%s:%u", chro, pos - pos % JUNCTION_BUCKET_STEP);
955 		list = HashTableGet(global_context -> junction_bucket_table, bucket_key);
956 	}
957 
958 	int ret = 0;
959 
960 	if(list){
961 		int x1;
962 		for(x1 = 0; x1 < list -> used; x1++){
963 			fc_junction_gene_t * gene_info = list -> genes[x1];
964 			if(gene_info -> pos_first_base <= pos && gene_info -> pos_last_base >= pos){
965 				if(ret < max_ret_info_size)
966 					ret_info [ret ++] = gene_info;
967 			}
968 		}
969 	}
970 
971 	return ret;
972 }
973 
// load_feature_info() loads annotations from the file.
// It returns the number of features loaded, or a negative value (-1 or -2) if something is wrong.
// Memory will be allocated in this function. The pointer is saved in *loaded_features.
// The invoker must release the memory itself.
978 
979 #define MAX_ANNOT_LINE_LENGTH 1000000
load_feature_info(fc_thread_global_context_t * global_context,const char * annotation_file,int file_type,fc_feature_info_t ** loaded_features)980 int load_feature_info(fc_thread_global_context_t *global_context, const char * annotation_file, int file_type, fc_feature_info_t ** loaded_features)
981 {
982 	unsigned int features = 0, xk1 = 0, lineno=0;
983 	char * file_line = malloc(MAX_ANNOT_LINE_LENGTH+1);
984 	autozip_fp anno_fp;
985 	int apret = autozip_open(annotation_file, &anno_fp);
986 	int is_GFF_warned = 0;
987 	if(apret < 0) return -1;
988 
989 	HashTable * chro_name_table = HashTableCreate(1603);
990 	HashTableSetHashFunction(chro_name_table, fc_chro_hash);
991 	HashTableSetKeyComparisonFunction(chro_name_table, fc_strcmp_chro);
992 	global_context -> longest_chro_name = 0;
993 
994 	if(global_context -> do_junction_counting){
995 		global_context -> junction_bucket_table = HashTableCreate(76037);
996 		HashTableSetDeallocationFunctions(global_context -> junction_bucket_table, free, free_bucket_table_list);
997 		HashTableSetKeyComparisonFunction(global_context -> junction_bucket_table, fc_strcmp);
998 		HashTableSetHashFunction(global_context -> junction_bucket_table, fc_chro_hash);
999 
1000 		global_context -> junction_features_table = HashTableCreate(1603);
1001 		HashTableSetDeallocationFunctions(global_context -> junction_features_table, free, (void (*)(void *))HashTableDestroy);
1002 		HashTableSetKeyComparisonFunction(global_context -> junction_features_table, fc_strcmp);
1003 		HashTableSetHashFunction(global_context -> junction_features_table, fc_chro_hash);
1004 	}
1005 
1006 
1007 	// first scan: get the chromosome size (that have exons), total number of features
1008 	// also create chro_name_table : chro_name => fc_chromosome_index_info
1009 	while(0)
1010 	{
1011 		int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
1012 		char * token_temp = NULL, *chro_name;
1013 		fc_chromosome_index_info * chro_stab;
1014 		unsigned int feature_pos = 0;
1015 		if(rchars < 1) break;
1016 
1017 		lineno++;
1018 		if(is_comment_line(file_line, file_type, lineno-1))continue;
1019 		if(file_type == FILE_TYPE_GTF)
1020 		{
1021 			chro_name = strtok_r(file_line,"\t",&token_temp);
1022 			strtok_r(NULL,"\t", &token_temp); // lib_name (not needed)
1023 			char * feature_type = strtok_r(NULL,"\t", &token_temp);
1024 			if(match_feature_name_column(feature_type, global_context -> feature_name_column))
1025 			{
1026 				strtok_r(NULL,"\t", &token_temp); // feature_start
1027 				feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
1028 				features++;
1029 			}
1030 			else chro_name = NULL;
1031 		}
1032 		else
1033 		{
1034 			strtok_r(file_line,"\t", &token_temp);
1035 			chro_name = strtok_r(NULL,"\t",&token_temp);
1036 			strtok_r(NULL,"\t",&token_temp);	// feature_start
1037 			feature_pos = atoi(strtok_r(NULL,"\t", &token_temp));// feature_end
1038 
1039 			features++;
1040 		}
1041 
1042 		if(chro_name)
1043 		{
1044 			if(strlen(chro_name)>=CHROMOSOME_NAME_LENGTH)
1045 				chro_name[CHROMOSOME_NAME_LENGTH-1]=0;
1046 			chro_stab = HashTableGet(chro_name_table, chro_name);
1047 
1048 			if(chro_stab)
1049 			{
1050 				chro_stab -> chro_possible_length = max(chro_stab -> chro_possible_length , feature_pos+1);
1051 			}else
1052 			{
1053 				char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1054 				term_strncpy(tmp_chro_name, chro_name, CHROMOSOME_NAME_LENGTH);
1055 				chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1056 				chro_stab -> chro_number = chro_name_table->numOfElements;
1057 				chro_stab -> chro_possible_length = feature_pos+1;
1058 				chro_stab -> reverse_table_start_index_size = 5000000;
1059 				chro_stab -> reverse_table_start_index = NULL;
1060 				HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1061 			}
1062 
1063 			chro_stab -> chro_features ++;
1064 		}
1065 	}
1066 
1067 	//autozip_rewind(&anno_fp);
1068 
1069 	unsigned int ret_features_size = 400000;
1070 	fc_feature_info_t * ret_features = malloc(sizeof(fc_feature_info_t) * ret_features_size);
1071 	char * tmpnameex = malloc(50001);
1072 
1073 	lineno = 0;
1074 	while(1)
1075 	{
1076 		int is_gene_id_found = 0;
1077 		int rchars = autozip_gets(&anno_fp, file_line, MAX_ANNOT_LINE_LENGTH);
1078 		if(rchars < 1) break;
1079 		if(rchars >= MAX_ANNOT_LINE_LENGTH - 1){
1080 			SUBREADprintf("\nERROR: the %u-th line in your GTF file is extremely long (longer than %d bytes).\nThe program cannot parse this line.\n", lineno+1, MAX_ANNOT_LINE_LENGTH-1);
1081 			return -2;
1082 		}
1083 
1084 		lineno++;
1085 		char * token_temp = NULL;
1086 		if(is_comment_line(file_line, file_type, lineno-1))continue;
1087 
1088 		if(file_type == FILE_TYPE_RSUBREAD){
1089 			if(xk1 >= ret_features_size) {
1090 				ret_features_size *=2;
1091 				ret_features = realloc(ret_features, sizeof(fc_feature_info_t) * ret_features_size);
1092 			}
1093 			char * feature_name = strtok_r(file_line,"\t",&token_temp);
1094 			int feature_name_len = strlen(feature_name);
1095 			if(feature_name_len > FEATURE_NAME_LENGTH-2){
1096 				SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH -2);
1097 				feature_name[FEATURE_NAME_LENGTH -2 ] = 0;
1098 			}
1099 
1100 			srInt_64 genename_pos = unistr_cpy(global_context, (char *)feature_name, feature_name_len);
1101 
1102 	//		SUBREADprintf("REALL: '%s'=%d  [%d] %p  POS=%d\t\tOLD_NAME_POS=%d\n" , feature_name, feature_name_len , xk1, ret_features+xk1, genename_pos, xk1>0?ret_features[xk1-1].feature_name_pos:-1);
1103 			ret_features[xk1].feature_name_pos = genename_pos;
1104 
1105 			char * seq_name = strtok_r(NULL,"\t", &token_temp);
1106 			int chro_name_len = strlen(seq_name);
1107 			if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
1108 			srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
1109 			global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
1110 
1111 
1112 			char * start_ptr = strtok_r(NULL,"\t", &token_temp);
1113 			char * end_ptr = strtok_r(NULL,"\t", &token_temp);
1114 
1115 			if(start_ptr == NULL || end_ptr == NULL){
1116 				SUBREADprintf("\nWarning: the format on the %d-th line is wrong.\n", lineno);
1117 			}
1118 			srInt_64 tv1 = atoll(start_ptr);
1119 			srInt_64 tv2 = atoll(end_ptr);
1120 
1121 			if( isdigit(start_ptr[0]) && isdigit(end_ptr[0]) ){
1122 				if(strlen(start_ptr) > 10 || strlen(end_ptr) > 10 || tv1 > 0x7fffffff || tv2> 0x7fffffff){
1123 					SUBREADprintf("\nError: Line %d contains a coordinate greater than 2^31.\n", lineno);
1124 					return -2;
1125 				}
1126 
1127 				if(tv1 >tv2){
1128 					SUBREADprintf("\nError: Line %d contains a feature that do not have a positive length.\n", lineno);
1129 					return -2;
1130 				}
1131 			}else{
1132 				SUBREADprintf("\nError: Line %d contains a format error. The expected annotation format is SAF.\n", lineno);
1133 				return -2;
1134 			}
1135 
1136 			ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
1137 			ret_features[xk1].start = atoi( start_ptr );// start
1138 			if(ret_features[xk1].start>0x7fffffff)
1139 			{
1140 				ret_features[xk1].start = 0;
1141 				print_in_box(80,0,0,"WARNING the %d-th line has a negative chro coordinate.", lineno);
1142 			}
1143 
1144 			ret_features[xk1].end = atoi( end_ptr );//end
1145 			if(ret_features[xk1].end>0x7fffffff)
1146 			{
1147 				ret_features[xk1].end = 0;
1148 				print_in_box(80,0,0,"WARNING the %d-th line has a negative chro coordinate.", lineno);
1149 			}
1150 
1151 			char * strand_str = strtok_r(NULL,"\t", &token_temp);
1152 			if(strand_str == NULL)
1153 				ret_features[xk1].is_negative_strand = 0;
1154 			else
1155 				ret_features[xk1].is_negative_strand = ('+' ==strand_str[0])?0:(('-' ==strand_str[0])?1:-1);
1156 
1157 			if(global_context -> do_detection_call){
1158 				char * GCcontent = strtok_r(NULL,"\t", &token_temp);
1159 				if(GCcontent){
1160 					int gclen = strlen(GCcontent);
1161 					if(gclen>0)GCcontent[gclen-1]=0;
1162 					HashTablePut(global_context -> GCcontent_table, strdup(feature_name) , strdup(GCcontent));
1163 				}
1164 			}
1165 
1166 			ret_features[xk1].sorted_order = xk1;
1167 
1168 			int bin_location = ret_features[xk1].start / REVERSE_TABLE_BUCKET_LENGTH;
1169 
1170 			fc_chromosome_index_info * chro_stab = HashTableGet(chro_name_table, seq_name);
1171 			int feature_pos = ret_features[xk1].end;
1172 			if(NULL == chro_stab){
1173 				char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1174 				term_strncpy(tmp_chro_name, seq_name, CHROMOSOME_NAME_LENGTH);
1175 				chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1176 				chro_stab -> chro_number = chro_name_table->numOfElements;
1177 				chro_stab -> chro_possible_length = feature_pos+1;
1178 				chro_stab -> reverse_table_start_index_size = 5000000;
1179 				chro_stab -> reverse_table_start_index = calloc( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2, sizeof(int));
1180 				HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1181 			}else chro_stab -> chro_possible_length = max(feature_pos+1, chro_stab -> chro_possible_length);
1182 			chro_stab -> chro_features ++;
1183 
1184 			if( chro_stab -> chro_possible_length >= chro_stab -> reverse_table_start_index_size ) {
1185 				int old_end = sizeof(int) *( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2);
1186 				chro_stab -> reverse_table_start_index_size = max(chro_stab -> reverse_table_start_index_size * 2, (int)(chro_stab -> chro_possible_length * 1.3));
1187 				int new_size = sizeof(int) *( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2);
1188 				chro_stab -> reverse_table_start_index = realloc( chro_stab -> reverse_table_start_index , new_size);
1189 				memset(chro_stab -> reverse_table_start_index + old_end / sizeof(int), 0, new_size - old_end);
1190 			}
1191 			chro_stab -> reverse_table_start_index[bin_location]++;
1192 			is_gene_id_found = 1;
1193 
1194 			assert(feature_name);
1195 			if(global_context -> do_junction_counting)
1196 				register_junc_feature(global_context , feature_name, seq_name, ret_features[xk1].start, ret_features[xk1].end);
1197 
1198 			xk1++;
1199 		} else if(file_type == FILE_TYPE_GTF) {
1200 			char feature_name_tmp[FEATURE_NAME_LENGTH];
1201 			sprintf(feature_name_tmp, "LINE_%07u", xk1 + 1);
1202 			char * seq_name = strtok_r(file_line,"\t",&token_temp);
1203 			strtok_r(NULL,"\t", &token_temp);// source
1204 			char * feature_type = strtok_r(NULL,"\t", &token_temp);// feature_type
1205 			if(match_feature_name_column(feature_type, global_context -> feature_name_column)) {
1206 
1207 				if(xk1 >= ret_features_size) {
1208 					ret_features_size *=2;
1209 					ret_features = realloc(ret_features, sizeof(fc_feature_info_t) * ret_features_size);
1210 				}
1211 
1212 				char * start_ptr = strtok_r(NULL,"\t", &token_temp);
1213 				char * end_ptr = strtok_r(NULL,"\t", &token_temp);
1214 
1215 				if(start_ptr == NULL || end_ptr == NULL){
1216 					SUBREADprintf("\nWarning: the format on the %d-th line is wrong.\n", lineno);
1217 				}
1218 				srInt_64 tv1 = atoll(start_ptr);
1219 				srInt_64 tv2 = atoll(end_ptr);
1220 
1221 				if( isdigit(start_ptr[0]) && isdigit(end_ptr[0]) ){
1222 					if(strlen(start_ptr) > 10 || strlen(end_ptr) > 10 || tv1 > 0x7fffffff || tv2> 0x7fffffff){
1223 						SUBREADprintf("\nError: Line %d contains a coordinate greater than 2^31.\n", lineno);
1224 						return -2;
1225 					}
1226 				}else{
1227 					SUBREADprintf("\nError: Line %d contains a format error. The expected annotation format is GTF/GFF.\n", lineno);
1228 					return -2;
1229 				}
1230 				ret_features[xk1].start = atoi(start_ptr);// start
1231 				ret_features[xk1].end = atoi(end_ptr);//end
1232 
1233 				if(ret_features[xk1].start < 1 || ret_features[xk1].end<1 ||  ret_features[xk1].start > 0x7fffffff ||  ret_features[xk1].end > 0x7fffffff || ret_features[xk1].start > ret_features[xk1].end){
1234 					SUBREADprintf("\Error: the feature on the %d-th line has zero coordinate or zero lengths\n\n", lineno);
1235 					return -2;
1236 				}
1237 
1238 
1239 				strtok_r(NULL,"\t", &token_temp);// score
1240 				char * strand_str = strtok_r(NULL,"\t", &token_temp);
1241 				ret_features[xk1].is_negative_strand = ('-' == strand_str[0])?1:( ('+' == strand_str[0])?0:-1 );//strand
1242 				ret_features[xk1].sorted_order = xk1;
1243 				strtok_r(NULL,"\t",&token_temp);	// "frame"
1244 				char * extra_attrs = strtok_r(NULL,"\t",&token_temp);	// name_1 "val1"; name_2 "val2"; ...
1245 				ret_features[xk1].extra_columns = NULL;
1246 				if(extra_attrs && (strlen(extra_attrs)>2))
1247 				{
1248 					int attr_val_len = GTF_extra_column_value(extra_attrs , global_context -> gene_id_column , feature_name_tmp, FEATURE_NAME_LENGTH);
1249 					if(attr_val_len>0) is_gene_id_found=1;
1250 			//		printf("V=%s\tR=%d\n", extra_attrs , attr_val_len);
1251 
1252 					if(global_context -> reported_extra_columns){
1253 						char * extcols = malloc(30);
1254 						int extcols_size = 30, extcols_len = 0;
1255 
1256 						char * this_exname_ptr=global_context -> reported_extra_columns;
1257 						while(1){
1258 							int padd0, is_last=1;
1259 							for(padd0=0; this_exname_ptr[padd0]; padd0++)
1260 								if(this_exname_ptr[padd0]=='\t'){
1261 									this_exname_ptr[padd0]=0;
1262 									is_last=0;
1263 									break;
1264 								}
1265 
1266 							attr_val_len = GTF_extra_column_value(extra_attrs , this_exname_ptr , tmpnameex, 50000);
1267 
1268 							if(attr_val_len<0){
1269 								attr_val_len=2;
1270 								strcpy(tmpnameex,"NA");
1271 							}
1272 							if(attr_val_len + extcols_len + 2 > extcols_size){
1273 								extcols_size = max(extcols_size*2, attr_val_len + extcols_len+2);
1274 								extcols = realloc(extcols, extcols_size);
1275 							}
1276 							memcpy(extcols+extcols_len, tmpnameex, attr_val_len);
1277 							extcols_len += attr_val_len;
1278 							extcols[extcols_len]='\t';
1279 							extcols_len += 1;
1280 
1281 							if(is_last)break;
1282 							this_exname_ptr[padd0]='\t';
1283 							this_exname_ptr += padd0+1;
1284 						}
1285 						extcols[extcols_len-1]=0;
1286 						ret_features[xk1].extra_columns = extcols;
1287 					}
1288 				}
1289 
1290 				if(!is_gene_id_found) {
1291 					if(!is_GFF_warned)
1292 					{
1293 						int ext_att_len = strlen(extra_attrs);
1294 						if(extra_attrs[ext_att_len-1] == '\n') extra_attrs[ext_att_len-1] =0;
1295 						SUBREADprintf("\nERROR: failed to find the gene identifier attribute in the 9th column of the provided GTF file.\nThe specified gene identifier attribute is '%s' \nAn example of attributes included in your GTF annotation is '%s'.\n\n",  global_context -> gene_id_column, extra_attrs);
1296 					}
1297 					is_GFF_warned++;
1298 				}
1299 
1300 				int feature_name_len = strlen(feature_name_tmp);
1301 				if(feature_name_len > FEATURE_NAME_LENGTH-2){
1302 					SUBREADprintf("WARNING: feature name on the %d-th line is longer than %d bytes. The name is truncated\n", lineno, FEATURE_NAME_LENGTH-2);
1303 					feature_name_tmp[FEATURE_NAME_LENGTH -2 ] = 0;
1304 				}
1305 				ret_features[xk1].feature_name_pos = unistr_cpy(global_context, (char *)feature_name_tmp, feature_name_len);
1306 
1307 				int chro_name_len = strlen(seq_name);
1308 				if(chro_name_len > CHROMOSOME_NAME_LENGTH) seq_name[CHROMOSOME_NAME_LENGTH -1 ] = 0;
1309 				srInt_64 chro_name_pos = unistr_cpy(global_context, (char *)seq_name, chro_name_len);
1310 				global_context -> longest_chro_name = max(chro_name_len, global_context -> longest_chro_name);
1311 
1312 				ret_features[xk1].chro_name_pos_delta = chro_name_pos - ret_features[xk1].feature_name_pos;
1313 
1314 				int bin_location = ret_features[xk1].start / REVERSE_TABLE_BUCKET_LENGTH;
1315 				fc_chromosome_index_info * chro_stab = HashTableGet(chro_name_table, seq_name);
1316 				int feature_pos = ret_features[xk1].end;
1317 
1318 				if(NULL == chro_stab){
1319 					char * tmp_chro_name = malloc(CHROMOSOME_NAME_LENGTH);
1320 					term_strncpy(tmp_chro_name, seq_name, CHROMOSOME_NAME_LENGTH);
1321 					chro_stab = calloc(sizeof(fc_chromosome_index_info),1);
1322 					chro_stab -> chro_number = chro_name_table->numOfElements;
1323 					chro_stab -> chro_possible_length = feature_pos+1;
1324 					chro_stab -> reverse_table_start_index_size = 5000000;
1325 					chro_stab -> reverse_table_start_index = calloc( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2 , sizeof(int));
1326 					HashTablePut(chro_name_table, tmp_chro_name, chro_stab);
1327 				}else chro_stab -> chro_possible_length = max(feature_pos+1, chro_stab -> chro_possible_length);
1328 				chro_stab -> chro_features ++;
1329 
1330 				if( chro_stab -> chro_possible_length >= chro_stab -> reverse_table_start_index_size ) {
1331 					int old_end = sizeof(int) *( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2);
1332 					chro_stab -> reverse_table_start_index_size = max(chro_stab -> reverse_table_start_index_size * 2, (int)(chro_stab -> chro_possible_length * 1.3));
1333 					int new_size = sizeof(int) *( chro_stab -> reverse_table_start_index_size  / REVERSE_TABLE_BUCKET_LENGTH +2);
1334 					chro_stab -> reverse_table_start_index = realloc(chro_stab -> reverse_table_start_index, new_size);
1335 					memset(chro_stab -> reverse_table_start_index + old_end / sizeof(int), 0, new_size - old_end);
1336 				}
1337 
1338 				chro_stab -> reverse_table_start_index[bin_location]++;
1339 
1340 				if(global_context -> do_junction_counting)
1341 					register_junc_feature(global_context , feature_name_tmp, seq_name, ret_features[xk1].start, ret_features[xk1].end);
1342 
1343 				xk1++;
1344 			}
1345 		}
1346 	}
1347 	features = xk1;
1348 	autozip_close(&anno_fp);
1349 	free(file_line);
1350 	free(tmpnameex);
1351 
1352 	(*loaded_features) = ret_features;
1353 	global_context -> exontable_nchrs = (int)chro_name_table-> numOfElements;
1354 	global_context -> exontable_chro_table = chro_name_table;
1355 
1356 
1357 	if(is_GFF_warned) return -2;
1358 	if(features < 1){
1359 		if(global_context -> annotation_file_screen_output[0]){
1360 			SUBREADprintf("ERROR: no features were loaded in format %s. The annotation format can be specified by the 'isGTFAnnotationFile' option%s.\n", file_type == FILE_TYPE_GTF?"GTF":"SAF", file_type == FILE_TYPE_GTF?", and the required feature type can be specified by the 'GTF.featureType' option":"");
1361 		}else{
1362 			SUBREADprintf("ERROR: no features were loaded in format %s. The annotation format can be specified by the '-F' option%s.\n", file_type == FILE_TYPE_GTF?"GTF":"SAF", file_type == FILE_TYPE_GTF?", and the required feature type can be specified by the '-t' option.":"");
1363 		}
1364 		SUBREADprintf("\n\n");
1365 		return -2;
1366 	}
1367 
1368 	print_in_box(80,0,0,"   Features : %d\n", features);
1369 	return features;
1370 }
1371 
/*
 * Returns the zero-based index of a gene name, registering the name first
 * when it has not been seen before.
 *
 * gene_name_table stores (index + 1) encoded in the value pointer, so that a
 * NULL lookup result means "not present". The encoding and decoding use
 * intptr_t casts; arithmetic on NULL / void pointers is not defined by ISO C.
 * intptr_t is available through <unistd.h>, which this file includes.
 *
 * A new name receives the current element count of the table as its index
 * and is also written to gene_name_array at that index. Both the table and
 * the array keep the `feature_name` pointer itself, not a copy, so the
 * string must outlive them.
 */
int find_or_insert_gene_name(fc_thread_global_context_t * global_context, unsigned char * feature_name)
{
	HashTable * genetable = global_context -> gene_name_table;

	srInt_64 gene_number = (srInt_64)(intptr_t) HashTableGet(genetable, feature_name);
	if(gene_number>0)
		return gene_number-1;

	gene_number = genetable -> numOfElements;
	HashTablePut(genetable, feature_name, (void *)(intptr_t)(gene_number+1));
	global_context -> gene_name_array[gene_number] = feature_name;

	return gene_number;
}
1390 
/*
 * Lowers the reverse-table entry of every bucket touched by a block.
 *
 * The block spans [this_block_min_start, this_block_max_end]; each bucket of
 * REVERSE_TABLE_BUCKET_LENGTH bases that the span overlaps gets
 * min(current entry, block_no) stored in chro_inf->reverse_table_start_index.
 *
 * Asserts that the span is not reversed and that the last bucket number is
 * below chro_inf->chro_possible_length.
 */
void register_reverse_table(int block_no, srInt_64 this_block_min_start, srInt_64 this_block_max_end, fc_chromosome_index_info * chro_inf)
{
	unsigned int first_bucket = this_block_min_start /  REVERSE_TABLE_BUCKET_LENGTH;
	unsigned int last_bucket = this_block_max_end / REVERSE_TABLE_BUCKET_LENGTH;

	assert(this_block_min_start <= this_block_max_end);
	assert(last_bucket < chro_inf -> chro_possible_length);

	int bucket = first_bucket;
	while(bucket <= last_bucket)
	{
		chro_inf->reverse_table_start_index[bucket] = min(chro_inf->reverse_table_start_index[bucket], block_no);
		bucket++;
	}
}
1406 
feature_merge(void * arrv,int start,int items,int items2)1407 void feature_merge(void * arrv, int start, int items, int items2)
1408 {
1409 
1410 	void ** arr = (void **) arrv;
1411 
1412 	srInt_64 * ret_start = (srInt_64 *) arr[0];
1413 	srInt_64 * ret_end = (srInt_64 *) arr[1];
1414 	unsigned char * ret_strand = (unsigned char *) arr[2];
1415 	int * ret_entyrez = (int *) arr[3];
1416 	fc_feature_info_t ** old_info_ptr = (fc_feature_info_t **) arr[4];
1417 
1418 	int total_items = items+items2;
1419 	srInt_64 * tmp_start = malloc(sizeof(srInt_64) * total_items);
1420 	srInt_64 * tmp_end = malloc(sizeof(srInt_64) * total_items);
1421 	unsigned char * tmp_strand = malloc(sizeof(char) * total_items);
1422 	int * tmp_entyrez = malloc(sizeof(int) * total_items);
1423 	fc_feature_info_t ** tmp_info_ptr = malloc(sizeof(fc_feature_info_t*) * total_items);
1424 
1425 	int read_1_ptr = start;
1426 	int read_2_ptr = start+items;
1427 	int write_ptr;
1428 
1429 	for(write_ptr=0; write_ptr<total_items; write_ptr++)
1430 	{
1431 		if((read_1_ptr >= start+items)||(read_2_ptr < start+total_items && ret_start[read_1_ptr] >= ret_start[read_2_ptr]))
1432 		{
1433 			tmp_start[write_ptr] = ret_start[read_2_ptr];
1434 			tmp_end[write_ptr] = ret_end[read_2_ptr];
1435 			tmp_strand[write_ptr] = ret_strand[read_2_ptr];
1436 			tmp_entyrez[write_ptr] = ret_entyrez[read_2_ptr];
1437 			tmp_info_ptr[write_ptr] = old_info_ptr[read_2_ptr];
1438 			read_2_ptr++;
1439 		}
1440 		else
1441 		{
1442 			tmp_start[write_ptr] = ret_start[read_1_ptr];
1443 			tmp_end[write_ptr] = ret_end[read_1_ptr];
1444 			tmp_strand[write_ptr] = ret_strand[read_1_ptr];
1445 			tmp_entyrez[write_ptr] = ret_entyrez[read_1_ptr];
1446 			tmp_info_ptr[write_ptr] = old_info_ptr[read_1_ptr];
1447 			read_1_ptr++;
1448 		}
1449 	}
1450 
1451 	memcpy(ret_start+ start, tmp_start, sizeof(srInt_64) * total_items);
1452 	memcpy(ret_end+ start, tmp_end, sizeof(srInt_64) * total_items);
1453 	memcpy(ret_strand+ start, tmp_strand, sizeof(char) * total_items);
1454 	memcpy(ret_entyrez+ start, tmp_entyrez, sizeof(int) * total_items);
1455 	memcpy(old_info_ptr+ start, tmp_info_ptr, sizeof(fc_feature_info_t*) * total_items);
1456 
1457 	free(tmp_start);
1458 	free(tmp_end);
1459 	free(tmp_strand);
1460 	free(tmp_entyrez);
1461 	free(tmp_info_ptr);
1462 }
1463 
1464 
feature_sort_compare(void * arrv,int l,int r)1465 int feature_sort_compare(void * arrv, int l, int r)
1466 {
1467 	void ** arr = (void **) arrv;
1468 	srInt_64 * ret_start = (srInt_64 *)arr[0];
1469 	srInt_64 ll = ret_start[l];
1470 	srInt_64 rl = ret_start[r];
1471 
1472 	if(ll==rl) return 0;
1473 	else if(ll>rl) return 1;
1474 	else return -1;
1475 }
1476 
feature_sort_exchange(void * arrv,int l,int r)1477 void feature_sort_exchange(void * arrv, int l, int r)
1478 {
1479 	void ** arr = (void **) arrv;
1480 	srInt_64 tmp;
1481 	fc_feature_info_t * tmpptr;
1482 
1483 	srInt_64 * ret_start = (srInt_64 *) arr[0];
1484 	srInt_64 * ret_end = (srInt_64 *) arr[1];
1485 	unsigned char * ret_strand = (unsigned char *) arr[2];
1486 	int * ret_entyrez = (int *) arr[3];
1487 	fc_feature_info_t ** old_info_ptr = (fc_feature_info_t **) arr[4];
1488 
1489 
1490 	tmp = ret_start[r];
1491 	ret_start[r]=ret_start[l];
1492 	ret_start[l]=tmp;
1493 
1494 	tmp = ret_end[r];
1495 	ret_end[r]=ret_end[l];
1496 	ret_end[l]=tmp;
1497 
1498 	tmp = ret_strand[r];
1499 	ret_strand[r]=ret_strand[l];
1500 	ret_strand[l]=tmp;
1501 
1502 	tmp = ret_entyrez[r];
1503 	ret_entyrez[r]=ret_entyrez[l];
1504 	ret_entyrez[l]=tmp;
1505 
1506 	tmpptr = old_info_ptr[r];
1507 	old_info_ptr[r]=old_info_ptr[l];
1508 	old_info_ptr[l]=tmpptr;
1509 
1510 }
1511 
1512 
1513 
/*
 * Groups the loaded annotation features by chromosome, sorts every
 * chromosome's features by start position and cuts each sorted run into
 * "blocks" that are registered in the chromosome's reverse table.
 *
 * Inputs:
 *   global_context  - supplies the chromosome hash table (exontable_chro_table),
 *                     the chromosome count (exontable_nchrs) and the string
 *                     buffer holding feature/chromosome names; its
 *                     gene_name_array and gene_name_table are (re)created here.
 *   features        - number of entries in loaded_features.
 *   loaded_features - the unsorted feature records; each record's sorted_order
 *                     field is filled in with its final position.
 *
 * Outputs (all allocated here and handed to the caller, which is responsible
 * for releasing them; nothing in this function frees them):
 *   sorted_chr_names, sorted_entrezid, sorted_start, sorted_end, sorted_strand
 *                   - parallel arrays of length `features`, grouped by
 *                     chromosome number. Start/end/strand/gene id are sorted
 *                     by start within a chromosome.
 *   block_end_index, block_min_start_pos, block_max_end_pos
 *                   - per-block tables: first feature index after the block,
 *                     smallest start and largest end inside the block.
 *   anno_chr_head   - array of exontable_nchrs entries; see the note where it
 *                     is written.
 *   anno_chrs, anno_chr_2ch
 *                   - allocated with room for exontable_nchrs entries but not
 *                     written by this function.
 *
 * None of the allocations are checked for failure.
 */
void sort_feature_info(fc_thread_global_context_t * global_context, unsigned int features, fc_feature_info_t * loaded_features, char *** sorted_chr_names, int ** sorted_entrezid, srInt_64 ** sorted_start, srInt_64 ** sorted_end, unsigned char ** sorted_strand, char ** anno_chr_2ch, char *** anno_chrs, srInt_64 ** anno_chr_head, srInt_64 ** block_end_index, srInt_64 ** block_min_start_pos, srInt_64 ** block_max_end_pos)
{
	unsigned int chro_pnt;
	unsigned int xk1,xk2;
	int * ret_entrez = malloc(sizeof(int) * features);
	srInt_64 * ret_start = malloc(sizeof(srInt_64) * features);
	srInt_64 * ret_end = malloc(sizeof(srInt_64) * features);
	// Initial capacity of the three per-block tables; they grow by 1.3x on demand.
	int current_block_buffer_size = 2000;

	srInt_64 * ret_block_end_index = malloc(sizeof(srInt_64) * current_block_buffer_size);
	srInt_64 * ret_block_min_start = malloc(sizeof(srInt_64) * current_block_buffer_size);
	srInt_64 * ret_block_max_end = malloc(sizeof(srInt_64) * current_block_buffer_size);
	unsigned char * ret_strand = malloc(features);
	char ** ret_char_name = malloc(sizeof(void *) * features);
	// Maps each slot of the sorted arrays back to the loaded feature record it came from.
	fc_feature_info_t ** old_info_ptr = malloc(sizeof(void *) * features);
	(*anno_chrs) = malloc(sizeof(void *) * global_context -> exontable_nchrs);
	(*anno_chr_head) = malloc(sizeof(srInt_64) * global_context -> exontable_nchrs);
	(*anno_chr_2ch) = malloc(sizeof(char) * global_context -> exontable_nchrs*2);
	// Per-chromosome cursor: first a feature count, then (after the prefix sum below)
	// the next free slot of that chromosome's segment in the sorted arrays.
	unsigned int * chro_feature_ptr = calloc(sizeof(int) * global_context -> exontable_nchrs,1);
	// Chromosome records indexed by chro_number.
	fc_chromosome_index_info ** tmp_chro_info_ptrs = malloc(global_context -> exontable_nchrs * sizeof(fc_chromosome_index_info *));

	global_context -> gene_name_array = malloc(sizeof(char *) * features);	// there should be much less identical names.
	global_context -> gene_name_table = HashTableCreate(5000);
	HashTableSetHashFunction(global_context -> gene_name_table, HashTableStringHashFunction);
	HashTableSetKeyComparisonFunction(global_context -> gene_name_table, fc_strcmp);

	// init start positions of each chromosome block.
	if(1)
	{
		KeyValuePair * cursor;
		int bucket;
		unsigned int sum_ptr = 0;
		// Walk every bucket chain of the chromosome table to collect, per chromosome
		// number, its feature count and its record.
		for(bucket=0; bucket < global_context -> exontable_chro_table  -> numOfBuckets; bucket++)
		{
			cursor = global_context -> exontable_chro_table -> bucketArray[bucket];
			while (1)
			{
				if (!cursor) break;
				fc_chromosome_index_info * tmp_chro_inf = cursor -> value;
				cursor = cursor->next;
				//tmp_chro_inf -> reverse_table_end_index = calloc(sizeof(int), tmp_chro_inf->chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH +2);
				chro_feature_ptr [tmp_chro_inf -> chro_number] = tmp_chro_inf -> chro_features;
				tmp_chro_info_ptrs[tmp_chro_inf -> chro_number] = tmp_chro_inf;
			}
		}

		// Exclusive prefix sum: turn the counts into the start offset of each
		// chromosome's segment and remember that offset in the chromosome record.
		for(xk1 = 0; xk1 < global_context -> exontable_nchrs; xk1++)
		{
			unsigned int tmpv = chro_feature_ptr[xk1];
			chro_feature_ptr[xk1] = sum_ptr;
			tmp_chro_info_ptrs[xk1] -> chro_feature_table_start = sum_ptr;
		//		printf("SII=%u  +  %u\n", sum_ptr, tmpv);
			sum_ptr += tmpv;
		}

	}
	int current_block_id = 0, sort_i = 0;

	// Publish the per-feature arrays now; they are filled and sorted in place below.
	(*sorted_chr_names) = ret_char_name;
	(*sorted_entrezid) = ret_entrez;
	(*sorted_start) = ret_start;
	(*sorted_end) = ret_end;
	(*sorted_strand) = ret_strand;
	int curr_chro_number = 0;

	// Bucket pass: drop every loaded feature into the next free slot of its
	// chromosome's segment.
	for(chro_pnt=0; chro_pnt < features; chro_pnt++)
	{
		// The chromosome name lives in the shared string buffer, chro_name_pos_delta
		// bytes after the feature name.
		char * this_chro_name = global_context -> unistr_buffer_space + loaded_features[chro_pnt].feature_name_pos + loaded_features[chro_pnt].chro_name_pos_delta;
		fc_chromosome_index_info * this_chro_info = HashTableGet(global_context -> exontable_chro_table , this_chro_name);
		assert(this_chro_info);
		unsigned int this_chro_number = this_chro_info -> chro_number;
		unsigned int this_chro_table_ptr = chro_feature_ptr[this_chro_number];

		ret_char_name[this_chro_table_ptr] = this_chro_name;// (char *)loaded_features[chro_pnt].chro;
		ret_entrez[this_chro_table_ptr] = find_or_insert_gene_name(global_context, (unsigned char *)(global_context -> unistr_buffer_space + loaded_features[chro_pnt].feature_name_pos));
		ret_start[this_chro_table_ptr] = loaded_features[chro_pnt].start;
		ret_end[this_chro_table_ptr] = loaded_features[chro_pnt].end;
		ret_strand[this_chro_table_ptr] = loaded_features[chro_pnt].is_negative_strand;
		old_info_ptr[this_chro_table_ptr] = &loaded_features[chro_pnt];

		chro_feature_ptr[this_chro_number]++;
	}

	// Per chromosome: sort its segment by start, then cut it into blocks.
	for(xk1 = 0; xk1 < global_context -> exontable_nchrs; xk1++)
	{
		fc_chromosome_index_info * tmp_chro_inf = tmp_chro_info_ptrs[xk1];
		int bins_in_chr = ( tmp_chro_inf->chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH +2);
		// Target block size for each bin: the square root (rounded up) of the value
		// currently held in reverse_table_start_index[bin], clamped to 1..1000.
		// NOTE(review): that value is presumably the per-bin feature count gathered
		// while the annotation was loaded -- confirm against the loader.
		short * features_per_block_bins = malloc(sizeof(short)*bins_in_chr);
		for(xk2=0; xk2<bins_in_chr; xk2++)
		{
			features_per_block_bins[xk2] = max(1,min(1000,(int)(0.9999999+sqrt(tmp_chro_inf -> reverse_table_start_index[xk2]))));
			//printf("CHR%d : SQR[%d]=%d (%d)\n",  tmp_chro_inf -> chro_number,xk2, features_per_block_bins[xk2], tmp_chro_inf -> reverse_table_start_index[xk2] );
		}

		// The table has served its purpose above; overwrite every entry with 0xff
		// bytes before register_reverse_table() is called on it.
		memset(tmp_chro_inf -> reverse_table_start_index, 0xff, sizeof(int) *bins_in_chr);

		tmp_chro_inf -> chro_block_table_start = current_block_id;
		unsigned int this_block_items = 0;
		// 0x7fffffff / 0 are the "empty block" sentinels for the running min/max.
		srInt_64 this_block_min_start = 0x7fffffff, this_block_max_end = 0;
		unsigned int this_chro_tab_end =  tmp_chro_inf -> chro_features + tmp_chro_inf -> chro_feature_table_start;

		// Bundle of parallel arrays, each offset to this chromosome's segment, in the
		// slot order expected by feature_sort_compare / feature_sort_exchange.
		void * in_array[5];
		in_array[0] = ret_start + tmp_chro_inf -> chro_feature_table_start;
		in_array[1] = ret_end + tmp_chro_inf -> chro_feature_table_start;
		in_array[2] = ret_strand + tmp_chro_inf -> chro_feature_table_start;
		in_array[3] = ret_entrez + tmp_chro_inf -> chro_feature_table_start;
		in_array[4] = old_info_ptr + tmp_chro_inf -> chro_feature_table_start;

		merge_sort(in_array, this_chro_tab_end - tmp_chro_inf -> chro_feature_table_start, feature_sort_compare, feature_sort_exchange, feature_merge);

		for(sort_i = tmp_chro_inf -> chro_feature_table_start; sort_i< this_chro_tab_end ; sort_i++)
		{
			// NOW THE FEATURES (ret_start, ret_end, ret_strand, ret_entrez, old_info_ptr) ARE ALL SORTED!
			//printf("NT=%lu\tCHRO=%d\n", ret_start[sort_i], tmp_chro_inf->chro_number);
			old_info_ptr[sort_i]->sorted_order = sort_i;

			int feature_bin_location = ret_start[sort_i] / REVERSE_TABLE_BUCKET_LENGTH;
			// When the block is empty this is derived from the sentinel and is not a
			// valid bin; the this_block_items test below short-circuits before it is used.
			int block_bin_location = this_block_min_start / REVERSE_TABLE_BUCKET_LENGTH;

			// Close the current block when it already holds more features than its
			// bin's target size, or when this feature starts in a different bin.
			if(this_block_items && (this_block_items > features_per_block_bins[block_bin_location] || feature_bin_location != block_bin_location))//global_context -> feature_block_size)
			{

				// Grow the block tables while one spare slot is still left.
				if(current_block_id >= current_block_buffer_size - 1)
				{
					current_block_buffer_size *= 1.3;
					ret_block_min_start = realloc(ret_block_min_start, sizeof(srInt_64)*current_block_buffer_size);
					ret_block_max_end = realloc(ret_block_max_end, sizeof(srInt_64)*current_block_buffer_size);
					ret_block_end_index = realloc(ret_block_end_index, sizeof(srInt_64)*current_block_buffer_size);
				}


				ret_block_end_index[current_block_id] = sort_i;	// FIRST UNWANTED ID
				ret_block_min_start[current_block_id] = this_block_min_start;
				ret_block_max_end[current_block_id] = this_block_max_end;
				register_reverse_table(current_block_id, this_block_min_start, this_block_max_end, tmp_chro_inf);
				//printf("B=%d; ST=%ld, END=%ld, ITM=%d\n", current_block_id, this_block_min_start, this_block_max_end, this_block_items);
				current_block_id++;
				this_block_max_end = 0;
				this_block_items = 0;
				this_block_min_start = 0x7fffffff;
			}

			// Add the current feature to the (possibly just reset) open block.
			this_block_max_end = max(this_block_max_end, ret_end[sort_i]);
			this_block_min_start = min(this_block_min_start, ret_start[sort_i]);
			this_block_items ++;

		}
		// Flush the last, still open block of this chromosome.
		if(this_block_items)
		{
			if(current_block_id >= current_block_buffer_size)
			{
				current_block_buffer_size *= 1.3;
				ret_block_min_start = realloc(ret_block_min_start, sizeof(srInt_64)*current_block_buffer_size);
				ret_block_max_end = realloc(ret_block_max_end, sizeof(srInt_64)*current_block_buffer_size);
				ret_block_end_index = realloc(ret_block_end_index, sizeof(srInt_64)*current_block_buffer_size);
			}

			ret_block_end_index[current_block_id] = this_chro_tab_end;	// FIRST UNWANTED ID
			ret_block_min_start[current_block_id] = this_block_min_start;
			ret_block_max_end[current_block_id] = this_block_max_end;
			register_reverse_table(current_block_id, this_block_min_start, this_block_max_end, tmp_chro_inf);
			current_block_id++;
		}

		// NOTE(review): curr_chro_number is never changed in this function, so this
		// always writes element 0 -- confirm whether indexing by xk1 was intended.
		(*anno_chr_head) [curr_chro_number] = current_block_id;
		tmp_chro_inf -> chro_block_table_end = current_block_id;
		free(features_per_block_bins);
	}

	(*block_end_index) = ret_block_end_index;
	(*block_min_start_pos) = ret_block_min_start;
	(*block_max_end_pos) = ret_block_max_end;

	//print_in_box(80, 0,0,"The %u features are sorted.\n", sort_i);
	// Only the scratch arrays are released; everything else is owned by the caller
	// or by global_context from here on.
	free(old_info_ptr);
	free(tmp_chro_info_ptrs);
	free(chro_feature_ptr);
}
1692 
/*
 * Compares s1 with s2, looking at s1 only up to its first '/' (or its end).
 *
 * Returns 0 when every character of s1 before the stop point equals the
 * corresponding character of s2 AND s2 holds the same stop character at that
 * position (NUL if s1 ended, '/' if s1 stopped on a slash). Returns 1
 * otherwise. The result carries no ordering information.
 */
int strcmp_slash(char * s1, char * s2)
{
	char current;

	for(;; s1++, s2++){
		current = *s1;
		if(current == 0 || current == '/') break;
		if(current != *s2) return 1;
	}
	return current != *s2;
}
1703 
1704 #define NH_FRACTION_INT 65536
1705 
/*
 * Returns the score contributed to each feature a read overlaps.
 *
 * When global_context->use_fraction_multi_mapping is set, the fixed-point
 * count is divided by maximum_total_count; otherwise it is returned
 * unchanged.
 */
unsigned int calculate_multi_overlap_fraction(fc_thread_global_context_t * global_context, unsigned int fixed_fractional_count, int maximum_total_count){
	if(!global_context -> use_fraction_multi_mapping) return fixed_fractional_count;
	return fixed_fractional_count / maximum_total_count;
}
1711 
calc_fixed_fraction(int nh)1712 unsigned int calc_fixed_fraction(int nh){
1713 	if(nh==1) return NH_FRACTION_INT;
1714 	else if(nh == 2) return NH_FRACTION_INT>>1;
1715 	else return NH_FRACTION_INT / nh;
1716 }
1717 
1718 
/*
 * Converts a fixed-point score (in units of 1/NH_FRACTION_INT) to a count.
 *
 * If the score is a whole multiple of NH_FRACTION_INT, the quotient is stored
 * in *integer_count and 0 is returned; *float_count is left untouched.
 * Otherwise the fractional value is stored in *float_count and 1 is returned;
 * *integer_count is left untouched.
 */
int calc_float_fraction(read_count_type_t score, read_count_type_t * integer_count, double * float_count){
	int has_fraction = (score % NH_FRACTION_INT != 0);

	if(has_fraction)
		(*float_count) = score * 1./NH_FRACTION_INT;
	else
		(*integer_count) = score / NH_FRACTION_INT;

	return has_fraction;
}
1728 
1729 
print_read_wrapping(char * rl,int is_second)1730 void print_read_wrapping(char * rl, int is_second){
1731 	int refill_spaces = 3;
1732 
1733 	int read_length = 0, x1 = 0, spaces=0;
1734 
1735 	for(x1 = 0; x1 < 3100; x1++){
1736 		if(rl[x1]==0 && rl[x1+1]==0)break;
1737 		if(rl[x1]=='0' || rl[x1]=='\t') spaces++;
1738 		read_length ++;
1739 	}
1740 
1741 	char *out_buf1 = malloc(read_length + spaces * refill_spaces + 1), out_buf2[100];
1742 	int ox=0;
1743 
1744 	for(x1 = 0; x1 < 3000; x1++){
1745 		if(rl[x1]=='\n' || (rl[x1]==0 && rl[x1+1]==0)){
1746 			out_buf1[ox]=0;
1747 			break;
1748 		} else if((rl[x1]==0 && rl[x1+1]!=0) || rl[x1] == '\t'){
1749 			int x2;
1750 			for(x2 = 0; x2 < refill_spaces ; x2++){
1751 				out_buf1[ox]=' ';
1752 				ox++;
1753 			}
1754 		} else {
1755 			out_buf1[ox]=rl[x1];
1756 			ox++;
1757 		}
1758 	}
1759 	out_buf1[ox] = 0;
1760 
1761 	x1=0;
1762 
1763 	while(1){
1764 		int x2;
1765 		for(x2 = 0; x2 < 67 ; x2 ++){
1766 			char nch = out_buf1[x1];
1767 			if(nch == 0) break;
1768 			out_buf2[x2] = nch;
1769 			x1++;
1770 		}
1771 		out_buf2[x2] = 0;
1772 
1773 		print_in_box(80,0,PRINT_BOX_NOCOLOR_FOR_COLON,"      %s", out_buf2);
1774 		if(out_buf1[x1] == 0)break;
1775 	}
1776 
1777 	free(out_buf1);
1778 
1779 }
1780 
1781 
/*
 * Value deallocator for the per-thread read-group table.
 *
 * pt is an array of four pointers: slots 0 and 1 are heap buffers, slots 2
 * and 3 are hash tables that exist only when slot 2 is non-NULL. Everything,
 * including the array itself, is released.
 */
void disallocate_RG_tables(void * pt){
	void ** tables = pt;
	int slot;

	for(slot = 0; slot < 2; slot++)
		free(tables[slot]);

	if(tables[2] != NULL){
		HashTableDestroy(tables[2]);
		HashTableDestroy(tables[3]);
	}
	free(pt);
}
1792 
1793 
process_pairer_reset(void * pairer_vp)1794 void process_pairer_reset(void * pairer_vp){
1795 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
1796 	fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
1797 	if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
1798 	global_context -> sambam_chro_table = NULL;
1799 	global_context -> sambam_chro_table_items = 0;
1800 	if(global_context -> assign_reads_to_RG) free(global_context -> RGnames_set);
1801 	if(global_context -> do_scRNA_table){
1802 		SUBREADprintf("ERROR: the BAM input is incompatible with scRNA\n");
1803 		assert(0);
1804 	}
1805 
1806 	int xk1, xk2;
1807 	for(xk1=0; xk1<global_context-> thread_number; xk1++)
1808 	{
1809 		for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
1810 		{
1811 			global_context -> thread_contexts[xk1].count_table[xk2] = 0;
1812 		}
1813 
1814 		global_context -> thread_contexts[xk1].del4_added_reads = 0;
1815 
1816 		global_context -> thread_contexts[xk1].all_reads = 0;
1817 		global_context -> thread_contexts[xk1].nreads_mapped_to_exon = 0;
1818 
1819 		global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous = 0;
1820 		global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures = 0;
1821 		global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped = 0;
1822 		global_context -> thread_contexts[xk1].read_counters.unassigned_singleton = 0;
1823 		global_context -> thread_contexts[xk1].read_counters.unassigned_read_type = 0;
1824 		global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality = 0;
1825 		global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength = 0;
1826 		global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads = 0;
1827 		global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping = 0;
1828 		global_context -> thread_contexts[xk1].read_counters.unassigned_secondary = 0;
1829 		global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition = 0;
1830 		global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate = 0;
1831 		global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length = 0;
1832 		global_context -> thread_contexts[xk1].read_counters.assigned_reads = 0;
1833 		global_context -> thread_contexts[xk1].read_details_buff_used = 0;
1834 
1835 		if(global_context -> do_junction_counting)
1836 		{
1837 			HashTableDestroy(global_context -> thread_contexts[xk1].junction_counting_table);
1838 			global_context -> thread_contexts[xk1].junction_counting_table = HashTableCreate(131317);
1839 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].junction_counting_table,HashTableStringHashFunction);
1840 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].junction_counting_table, free, NULL);
1841 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].junction_counting_table, fc_strcmp_chro);
1842 
1843 			HashTableDestroy(global_context -> thread_contexts[xk1].splicing_point_table);
1844 			global_context -> thread_contexts[xk1].splicing_point_table = HashTableCreate(131317);
1845 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].splicing_point_table,HashTableStringHashFunction);
1846 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].splicing_point_table, free, NULL);
1847 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].splicing_point_table, fc_strcmp_chro);
1848 		}
1849 
1850 		if(global_context -> assign_reads_to_RG){
1851 			HashTableDestroy(global_context -> thread_contexts[xk1].RG_table);
1852 			global_context -> thread_contexts[xk1].RG_table = HashTableCreate(97);
1853 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].RG_table,HashTableStringHashFunction);
1854 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].RG_table, free, disallocate_RG_tables);
1855 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].RG_table, fc_strcmp_chro);
1856 		}
1857 
1858 
1859 	}
1860 
1861 	if(global_context -> read_details_out_FP){
1862 		int tranc_ret = ftruncate(fileno(global_context -> read_details_out_FP), 0);
1863 		if(0 != tranc_ret) SUBREADprintf("ERROR: Unable to truncate assignment detail file\n");
1864 		fseek(global_context -> read_details_out_FP, 0 , SEEK_SET);
1865 	}
1866 }
1867 
/*
 * Checks the first l bytes of n for characters that may not appear in a
 * contig name.
 *
 * NUL bytes are skipped; every other byte must lie in the printable,
 * non-space ASCII range '!'..'~'. Returns 1 when all bytes pass (also when
 * l <= 0), 0 at the first offending byte.
 */
int is_value_contig_name(char * n, int l){
	int idx = 0;

	while(idx < l){
		char ch = n[idx++];
		if(ch != 0 && (ch < '!' || ch > '~')) return 0;
	}
	return 1;
}
1876 
/*
 * Returns the zlib CRC-32 of the len bytes at dat, starting from zlib's
 * initial CRC value (obtained by calling crc32 with a NULL buffer).
 */
unsigned int FC_CRC32(char * dat, int len){
	unsigned int seed = (unsigned int) crc32(0, NULL, 0);
	return (unsigned int) crc32(seed, (unsigned char *)dat, len);
}
1882 
/* Index data collected for one chromosome while a BAM file is being written;
 * filled by simple_bam_writer_update_index() and dumped to the .bai file by
 * simple_bam_close(). */
struct simple_bam_writer_index_per_chro{
	HashTable * index_binP1_table;	// key: bin number + 1 (stored as a pointer); value: ArrayList of chunk start/end virtual offsets
	ArrayList * index_binP0_list;	// the bin numbers (0-based, stored as pointers) in the order they were first seen
	ArrayList * win16k_list;	// one virtual offset per 16 kb window (position >> 14)
};
1888 
simple_bam_writer_deallocate_index_per_chro(void * p)1889 void simple_bam_writer_deallocate_index_per_chro(void * p){
1890 	struct simple_bam_writer_index_per_chro * ch = p;
1891 	HashTableDestroy(ch->index_binP1_table);
1892 	ArrayListDestroy(ch->index_binP0_list);
1893 	ArrayListDestroy(ch->win16k_list);
1894 	free(ch);
1895 }
1896 
1897 
simple_bam_writer_new_index_per_chro()1898 struct simple_bam_writer_index_per_chro * simple_bam_writer_new_index_per_chro(){
1899 	struct simple_bam_writer_index_per_chro * ret = malloc(sizeof(struct simple_bam_writer_index_per_chro ));
1900 	ret -> index_binP1_table = HashTableCreate(4000);
1901 	HashTableSetDeallocationFunctions(ret -> index_binP1_table, NULL, (void (*) (void*)) ArrayListDestroy);
1902 	ret -> index_binP0_list = ArrayListCreate(20000);
1903 	ret -> win16k_list = ArrayListCreate(20000);
1904 	return ret;
1905 }
1906 
// Size of the writer's staging buffer and of the compression output buffer.
// simple_bam_write() flushes once 63000 bytes are staged, so both stay below it.
#define MERGER_WORKER_BINSIZE 66000
/* State of one BAM file (plus its .bai index) being produced by the
 * simple_bam_* functions. */
typedef struct {
	FILE * bam_FP;	// the BAM output, opened by simple_bam_create()
	FILE * bai_FP;	// the "<name>.bai" index output
	z_stream strm;	// zlib stream, re-initialised for every block in simple_bam_write()
	char inbin[MERGER_WORKER_BINSIZE];	// uncompressed bytes waiting to be written as the next block
	int inbin_len;	// number of bytes currently held in inbin
	int total_chromosomes;	// reference sequence count; written to the .bai header on close
	HashTable * bam_blockP1_to_offset0B_table;	// key: block number + 1; value: file offset of that block in the BAM
	HashTable * index_per_chro;	// key: chromosome number + 1; value: struct simple_bam_writer_index_per_chro *
} simple_bam_writer;
1918 
1919 #define MAX_ALLOWED_GAP_IN_BAI_CHUNK 10  // 10 blocks
1920 
/*
 * Records one alignment in the in-memory BAI index of `writer`.
 *
 *   rbin         - the binary alignment record; 4-byte integers are read at
 *                  byte offsets 4, 8 and 12 (chromosome number, position, and
 *                  the packed bin/mapq/name-length word).
 *   binlen       - length of the record; binlen + 4 is added to the virtual
 *                  offset to mark the end of the chunk.
 *   block_number - number of the compressed block the record is stored in.
 *   inbin_pos    - offset of the record inside that (uncompressed) block.
 *
 * Records with a negative chromosome number are not indexed. Integers are
 * stored in the hash tables and lists as pointers (NULL + value); keys are
 * shifted by one so that 0 does not become a NULL key.
 */
void simple_bam_writer_update_index(simple_bam_writer * writer, char * rbin, int binlen, srInt_64 block_number, int inbin_pos){
	int chro_no=0;
	memcpy(&chro_no, rbin + 4, 4);
	if(chro_no<0)return;

	unsigned int pos=0, bin_mq_nl=0;
	memcpy(&pos, rbin + 8, 4);
	memcpy(&bin_mq_nl, rbin + 12, 4);

	// Fetch the index of this chromosome, creating it on first use.
	struct simple_bam_writer_index_per_chro * index_chro = HashTableGet(writer -> index_per_chro, NULL+chro_no+1);
	if(NULL==index_chro){
		index_chro = simple_bam_writer_new_index_per_chro();
		HashTablePut(writer -> index_per_chro, NULL+chro_no+1, index_chro);
	}

	// The bin number is the upper 16 bits of the packed word.
	unsigned int binno = bin_mq_nl>>16;
	int cigar_span = SamBam_writer_calc_cigar_span(rbin +4);
	int this_w16_no = (pos + cigar_span) >>14;      // WIN is calculated on 0-based pos.
	// Virtual position: block number in the high bits, in-block offset in the low 16 bits.
	unsigned long long this_Vpos = block_number<<16 | inbin_pos;
	ArrayList * win16k_list = index_chro -> win16k_list;
	// if this read is after the maximum coordinate in the win16k list: all elements before last one and this one starts at this read.
	if(this_w16_no > win16k_list->numOfElements){
		int bbi;
		for(bbi = win16k_list->numOfElements; bbi <=this_w16_no; bbi++)
			ArrayListPush(win16k_list, NULL+ this_Vpos);
	}

	// Fetch the chunk list of this bin, creating it (and registering the bin number) on first use.
	ArrayList * this_bin_chunks = HashTableGet(index_chro -> index_binP1_table, NULL+binno+1);
	if(NULL == this_bin_chunks){
		this_bin_chunks = ArrayListCreate(4);
		HashTablePut(index_chro -> index_binP1_table, NULL+binno+1, this_bin_chunks);
		ArrayListPush(index_chro -> index_binP0_list, NULL+binno);
	}
	int found = 0;
	// a bin is not necessarily continuous. Say, a top-level bin only contains a few reads (most reads a in low-level bins), but their locations are everywhere

	// The chunk list holds (start, end) pairs, so its last element is the end of the
	// most recent chunk. If this record lies fewer than MAX_ALLOWED_GAP_IN_BAI_CHUNK
	// blocks after it, move that end forward instead of opening a new chunk.
	if(this_bin_chunks -> numOfElements > 0){
		long long diff = this_Vpos >>16;
		diff -=(this_bin_chunks -> elementList [ this_bin_chunks -> numOfElements - 1] - NULL)>>16;
		if(diff < MAX_ALLOWED_GAP_IN_BAI_CHUNK){
			this_bin_chunks -> elementList [ this_bin_chunks -> numOfElements - 1] = NULL+this_Vpos + binlen + 4;
			found = 1;
		}
	}
	// if the last chunk in this bin isn't good to be extended (too far from the file location of the new read), a new chunk is created.
	if(!found){
		ArrayListPush(this_bin_chunks, NULL + this_Vpos);
		ArrayListPush(this_bin_chunks, NULL + this_Vpos + binlen+4);
	}
}
1971 
/*
 * Appends one compressed block to the BAM file of `writer`.
 *
 *   obuf / olen  - the raw deflate output and its length.
 *   ilen         - length of the uncompressed data, stored in the trailer.
 *   crcval       - CRC-32 of the uncompressed data, stored in the trailer.
 *   block_number - when >= 0, the current file offset is remembered under
 *                  block_number + 1 so that simple_bam_close() can turn
 *                  block numbers into file offsets; pass a negative value to
 *                  skip the bookkeeping.
 *
 * The multi-byte fields are written straight from memory, i.e. in host byte
 * order. Write errors are not detected.
 */
void simple_bam_write_compressed_block(simple_bam_writer * writer,char *obuf, int olen, int ilen, unsigned int crcval, srInt_64 block_number){
	FILE * out = writer -> bam_FP;
	// Total block length minus one: 18 header bytes + olen + 8 trailer bytes - 1.
	int block_size_field = olen+19+6;

	if(block_number >= 0)HashTablePut(writer -> bam_blockP1_to_offset0B_table, NULL+1+block_number, NULL+ftello(out));

	// gzip header with a 6-byte extra field, then the "BC" subfield announcing 2 bytes.
	fwrite("\x1f\x8b\x8\x4\0\0\0\0\0\0\x6\0", 1, 12, out);
	fwrite("\x42\x43\x2\0", 1, 4, out);
	fwrite(&block_size_field, 1, 2, out);

	fwrite(obuf, 1, olen, out);

	// Trailer: CRC-32 and length of the uncompressed data.
	fwrite(&crcval, 1, 4, out);
	fwrite(&ilen, 1, 4, out);
}
1983 
/*
 * Appends binlen bytes from bin to the writer's staging buffer and writes a
 * compressed block to the BAM file whenever 63000 bytes have been staged.
 *
 * With force_flush set, whatever is staged after the copy is written out as
 * a block immediately, even if it is shorter than the threshold. The block
 * is written with block number -1, so it is not entered into the
 * block-offset table. Return codes of the zlib calls are not checked.
 */
void simple_bam_write(void * bin, int binlen, simple_bam_writer * writer, int force_flush){
	const int mem_level = 8;
	const int block_payload_limit = 63000;
	char * src = bin;

	while(binlen > 0 || (force_flush && writer->inbin_len)){
		// Stage as much of the input as still fits below the threshold.
		int room = block_payload_limit - writer->inbin_len;
		int take = binlen < room ? binlen : room;

		memcpy(writer->inbin + writer->inbin_len, src, take);
		src += take;
		binlen -= take;
		writer->inbin_len += take;

		if(!force_flush && writer->inbin_len < block_payload_limit) continue;

		// Compress the staged bytes as one raw-deflate stream (window bits -15).
		z_stream * zs = &writer -> strm;
		deflateInit2(zs, Z_BEST_SPEED, Z_DEFLATED, -15, mem_level, Z_DEFAULT_STRATEGY);
		char obuf[MERGER_WORKER_BINSIZE];
		zs -> next_in = (unsigned char *)writer->inbin;
		zs -> avail_in = writer->inbin_len;
		zs -> next_out = (unsigned char *)obuf;
		zs -> avail_out = MERGER_WORKER_BINSIZE;
		deflate(zs, Z_FINISH);

		int compressed_len = MERGER_WORKER_BINSIZE - zs -> avail_out;
		simple_bam_write_compressed_block(writer, obuf, compressed_len, writer->inbin_len, FC_CRC32(writer->inbin, writer->inbin_len), -1);
		writer->inbin_len = 0;
		deflateEnd(zs);
	}
}
2008 
simple_bam_create(char * fname)2009 simple_bam_writer * simple_bam_create(char * fname){
2010 	simple_bam_writer * ret = calloc(sizeof(simple_bam_writer), 1);
2011 	ret -> bam_FP = fopen(fname, "wb");
2012 	ret -> bam_blockP1_to_offset0B_table = HashTableCreate(100000);
2013 	simple_bam_write("BAM\1", 4, ret, 0);
2014 
2015 	char bainame [strlen(fname)+10];
2016 	strcpy(bainame , fname);
2017 	strcat(bainame, ".bai");
2018 	ret -> bai_FP = fopen(bainame, "wb");
2019 	fwrite("BAI\1", 1, 4, ret -> bai_FP);
2020 	ret -> index_per_chro = HashTableCreate(1000);
2021 	HashTableSetDeallocationFunctions(ret -> index_per_chro , NULL , simple_bam_writer_deallocate_index_per_chro);
2022 	return ret;
2023 }
2024 
2025 #define vpos_to_rpos rposone = (vposone & 0xffff ) + ( (HashTableGet(writer -> bam_blockP1_to_offset0B_table, NULL+1+(vposone >>16)) - NULL) << 16 )
2026 
2027 #define BAM_EOF_MARKER "\x1f\x8b\x08\x04\0\0\0\0\0\xff\x06\0\x42\x43\x02\0\x1b\0\x03\0\0\0\0\0\0\0\0\0"
/*
 * Finishes a BAM file produced through `writer` and writes its .bai index,
 * then releases the writer and everything it owns.
 *
 * The BAM end-of-file marker block is appended and the BAM file closed. The
 * index file receives the chromosome count, then per chromosome its bins
 * (each with its chunk list) and its 16 kb window offsets. Chromosomes
 * without any indexed read get an empty entry. Virtual positions collected
 * while writing carry block numbers; the vpos_to_rpos macro replaces them
 * with the file offsets recorded in bam_blockP1_to_offset0B_table (it relies
 * on the local names writer, vposone and rposone).
 *
 * `writer` must not be used after this call.
 */
void simple_bam_close(simple_bam_writer * writer){
	fwrite(BAM_EOF_MARKER, 1, 28, writer -> bam_FP);
	fclose(writer -> bam_FP);

	fwrite(&writer -> total_chromosomes, 1, 4, writer -> bai_FP);
	int chri;
	for(chri=0; chri<writer -> total_chromosomes; chri++){
		struct simple_bam_writer_index_per_chro *this_idx = HashTableGet(writer -> index_per_chro , NULL+1+chri);
		if(NULL == this_idx ){
			fwrite("\0\0\0\0\0\0\0\0", 1, 8, writer -> bai_FP);//0 intervals and 0 bins
		}else{
			// Let the shared BAM writer code reorganise the bins; the index then
			// owns the returned table and list.
			HashTable * new_tab=NULL;
			ArrayList * new_arr=NULL;
			SamBam_writer_optimize_bins(this_idx -> index_binP1_table, this_idx -> index_binP0_list ,& new_tab, & new_arr);
			this_idx -> index_binP1_table = new_tab;
			this_idx -> index_binP0_list = new_arr;
			fwrite(&this_idx -> index_binP0_list->numOfElements ,1, 4, writer -> bai_FP);
			int bini;
			for(bini = 0; bini < this_idx -> index_binP0_list -> numOfElements; bini ++){
				int binno = ArrayListGet(this_idx -> index_binP0_list, bini)-NULL;
				ArrayList * bingaps = HashTableGet(this_idx -> index_binP1_table, NULL+1+binno);
				// The list stores start/end pairs, so the chunk count is half its length.
				srInt_64 gapi = bingaps -> numOfElements/2;
				fwrite(&binno, 1, 4, writer -> bai_FP);
				fwrite(&gapi ,1, 4, writer -> bai_FP);
				for(gapi = 0; gapi < bingaps -> numOfElements; gapi++){
					srInt_64 rposone, vposone = ArrayListGet(bingaps , gapi)-NULL;
					vpos_to_rpos;
					fwrite(&rposone, 1, 8, writer -> bai_FP);
				}
			}

			fwrite(&this_idx -> win16k_list -> numOfElements ,1, 4, writer -> bai_FP);
			for(bini = 0; bini < this_idx -> win16k_list -> numOfElements; bini ++){
				srInt_64 rposone, vposone = ArrayListGet(this_idx -> win16k_list , bini )-NULL;
				vpos_to_rpos;
				fwrite(&rposone, 1, 8, writer -> bai_FP);
			}
		}
	}
	HashTableDestroy(writer -> index_per_chro);
	// The block-offset table was created in simple_bam_create() and is reachable
	// only through the writer, so it has to be released before the writer is freed.
	HashTableDestroy(writer -> bam_blockP1_to_offset0B_table);
	fclose(writer -> bai_FP);
	free(writer);
}
2071 
2072 
2073 #ifdef __MINGW32__
2074 #define this_memmem windows_memmem
2075 #else
2076 #define this_memmem memmem
2077 #endif
2078 
2079 
2080 void ** get_RG_tables(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * rg_name);
2081 int compress_read_detail_BAM(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, int write_start, int write_end, char * bam_buf);
2082 
/*
 * HashTableIteration callback: writes the BAM header of one per-sample BAM
 * writer.
 *
 *   k   - the table key (unused).
 *   v   - an array of pointers whose slot 0 is the simple_bam_writer.
 *   tab - the table being iterated; tab->appendix1 is the SAM header text and
 *         tab->counter1 its length in bytes.
 *
 * The header text is scanned line by line. "@SQ" lines contribute a
 * (name, length) pair taken from their SN: and LN: fields. A line containing
 * "Per-sample-BAM-output:cellCounts" means the input was itself produced by
 * cellCounts, so the marker comment is not added again. The writer receives:
 * text length, the text (with a trailing newline added if missing and the
 * marker comment if needed), the sequence count, and for each sequence its
 * name length, NUL-terminated name and length. The staged data is flushed as
 * one block and the sequence count is stored in writer->total_chromosomes.
 *
 * Sequence names that do not fit in MAX_CHROMOSOME_NAME_LEN - 1 characters
 * are truncated.
 */
void scRNA_sample_SamBam_writers_add_header(void * k, void * v, HashTable * tab){
	char *bin = tab -> appendix1;
	int bin_len = tab -> counter1;
	int txt_ptr = 0, old_ptr=0, xk1, bam_is_from_cellCounts = 0;
	void ** vv = v;
	simple_bam_writer * writer = vv[0];
	char * cellcounts_tag = "Per-sample-BAM-output:cellCounts";

	// Flat list of pairs: strdup'ed sequence name, then its length stored as a pointer.
	ArrayList * chroname_size_list = ArrayListCreate(2000);
	for(txt_ptr=0; txt_ptr <= bin_len; txt_ptr++){
		// Test for the end of the buffer first so that bin[bin_len] is never read.
		if(txt_ptr == bin_len || bin[txt_ptr] == '\n'){
			int line_len = txt_ptr - old_ptr;
			if(this_memmem(bin+old_ptr, line_len, cellcounts_tag, strlen(cellcounts_tag)))bam_is_from_cellCounts = 1;
			else if(line_len >= 4 && memcmp("@SQ\t",bin+old_ptr, 4)==0){
				unsigned int seqlen = 0;
				char seqname[MAX_CHROMOSOME_NAME_LEN];
				int seqname_len = 0, state = 0;
				// Small state machine over the line, starting at the tab after "@SQ":
				//   0  inside an uninteresting field      1  just after a tab
				//   2  seen "S"    3  seen "SN"   4  copying the name after "SN:"
				//   12 seen "L"    13 seen "LN"   14 reading digits after "LN:"
				for(xk1=3; xk1 + old_ptr < txt_ptr; xk1++){
					int nch = bin[xk1 + old_ptr];
					switch(state){
						case 4:
							if(nch == '\t' ) state = 1;
							// keep one byte free for the terminator; excess characters are dropped
							else if(seqname_len < MAX_CHROMOSOME_NAME_LEN - 1) seqname[seqname_len++]=nch;
						break;

						case 14:
							if(isdigit((unsigned char)nch)) seqlen = seqlen*10+nch-'0';
							else state = nch == '\t'?1:0;
						break;

						case 13:
							if(nch == ':') state = 14;
							else state = nch == '\t'?1:0;
						break;

						case 12:
							if(nch == 'N') state = 13;
							else state = nch == '\t'?1:0;
						break;

						case 3:
							if(nch == ':') state = 4;
							else state = nch == '\t'?1:0;
						break;

						case 2:
							if(nch == 'N') state = 3;
							else state = nch == '\t'?1:0;
						break;

						case 1:
							// Only the first SN / LN field of a line is honoured.
							if(nch == 'S' && seqname_len<1) state = 2;
							else if(nch == 'L' && seqlen<1) state = 12;
							else state = nch == '\t'?1:0;
						break;

						default:
							if(nch == '\t') state = 1;
					}
				}
				seqname[seqname_len]=0;
				if(seqlen && seqname_len){
					ArrayListPush(chroname_size_list,strdup(seqname));
					ArrayListPush(chroname_size_list,NULL+seqlen);
				}
			}
			old_ptr = txt_ptr+1;
		}
	}

	char * ncoline = "@CO\tPer-sample-BAM-output:cellCounts\n";

	// An empty header has no last character to inspect and gets no extra newline.
	int needs_newline = bin_len > 0 && bin[bin_len-1]!='\n';
	int binlen_wtr = bin_len;
	if(needs_newline) binlen_wtr++;
	if(!bam_is_from_cellCounts) binlen_wtr += strlen(ncoline);

	simple_bam_write(&binlen_wtr,4,writer, 0);
	simple_bam_write(bin,bin_len, writer, 0);
	if(needs_newline) simple_bam_write("\n",1, writer, 0);
	if(!bam_is_from_cellCounts) simple_bam_write(ncoline,strlen(ncoline), writer, 0);

	int seq_count = chroname_size_list -> numOfElements/2;
	simple_bam_write(&seq_count, 4, writer, 0);
	for(xk1 =0; xk1 < chroname_size_list -> numOfElements; xk1+=2){
		char * seqname = ArrayListGet(chroname_size_list, xk1);
		unsigned int seqlen = ArrayListGet(chroname_size_list, xk1+1)-NULL;
		int seqname_len = 1+strlen(seqname);	// the name is written with its terminating NUL
		simple_bam_write(&seqname_len,4,writer,0);
		simple_bam_write(seqname,seqname_len,writer,0);
		simple_bam_write(&seqlen,4,writer,0);
		free(seqname);
	}
	ArrayListDestroy(chroname_size_list);
	// Force the header out as its own block.
	simple_bam_write("",0,writer,1);
	writer -> total_chromosomes = seq_count;
}
2177 
process_pairer_header(void * pairer_vp,int thread_no,int is_text,unsigned int items,char * bin,unsigned int bin_len)2178 int process_pairer_header (void * pairer_vp, int thread_no, int is_text, unsigned int items, char * bin, unsigned int bin_len){
2179 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2180 	fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2181 	fc_thread_thread_context_t * thread_context = global_context -> thread_contexts;
2182 
2183 	//SUBREADprintf("ENTER PROCESS (THRD %d): IS_TXT=%d,  ITEMS = %d, CURRENT_ITEMS=%d\n", thread_no, is_text, items, global_context -> sambam_chro_table_items);
2184 
2185 	if(global_context -> is_scRNA_BAM_FQ_out_generated && global_context -> do_scRNA_table && is_text) {
2186 		global_context -> scRNA_sample_BAM_writers -> appendix1 = bin;
2187 		global_context -> scRNA_sample_BAM_writers -> counter1 = bin_len;
2188 		global_context -> scRNA_sample_BAM_writers -> appendix2 = global_context;
2189 		HashTableIteration( global_context -> scRNA_sample_BAM_writers, scRNA_sample_SamBam_writers_add_header);
2190 	}
2191 	if(global_context -> is_read_details_out == FILE_TYPE_BAM){
2192 		int write_cursor;
2193 		int first_block = 1;
2194 		for(write_cursor = 0; write_cursor < bin_len; write_cursor += 55000){
2195 			int wlen = min(55000, bin_len - write_cursor);
2196 
2197 			if( first_block ){
2198 				if(is_text)memcpy(thread_context -> read_details_buff, "BAM\1", 4);
2199 				memcpy(thread_context -> read_details_buff + (is_text?4:0), is_text?(&bin_len):(&items), 4);
2200 			}
2201 
2202 			memcpy(thread_context -> read_details_buff + (first_block?4*(1+is_text):0), bin + write_cursor, wlen);
2203 			int blen = compress_read_detail_BAM(global_context, thread_context, 0, wlen + (first_block?4*(1+is_text):0), thread_context -> bam_compressed_buff);
2204 			fwrite( thread_context -> bam_compressed_buff, 1, blen, global_context -> read_details_out_FP);
2205 			first_block = 0;
2206 		}
2207 	}else if( global_context -> is_read_details_out == FILE_TYPE_SAM && is_text ){
2208 		fwrite( bin, 1, bin_len, global_context -> read_details_out_FP);
2209 	}
2210 	if(is_text ){
2211 		if( global_context -> assign_reads_to_RG ){
2212 			global_context->RGnames_capacity = 10000;
2213 			global_context->RGnames_ptr = 0;
2214 			global_context->RGnames_set =  malloc( global_context->RGnames_capacity );
2215 
2216 			int rcursor=0;
2217 			for(;rcursor<bin_len; rcursor++){
2218 				assert(bin[rcursor] == '@'&& bin[rcursor+3] == '\t');
2219 				if(bin[rcursor+1]=='R' && bin[rcursor+2]=='G'){
2220 					int id_start = -1, id_end = -1;
2221 					for(; rcursor < bin_len; rcursor++){
2222 						if(bin[rcursor]=='I' && bin[rcursor+1]=='D'){
2223 							id_start = rcursor + 3;
2224 							id_end = 0;
2225 						}
2226 						for(; rcursor < bin_len; rcursor++){
2227 							if(bin[rcursor]=='\t' || bin[rcursor]=='\n'){
2228 								if(id_end < 1)id_end = rcursor;
2229 								break;
2230 							}
2231 						}
2232 						if(bin[rcursor]=='\n') break;
2233 					}
2234 
2235 					if(id_start > 0){
2236 						int id_len = id_end - id_start;
2237 						if(global_context->RGnames_capacity < global_context->RGnames_ptr + id_len + 3){
2238 							global_context->RGnames_capacity = global_context->RGnames_capacity * 17 / 10;
2239 							global_context->RGnames_set = realloc( global_context->RGnames_set , global_context->RGnames_capacity );
2240 						}
2241 						memcpy(global_context->RGnames_set + global_context->RGnames_ptr, bin + id_start, id_len);
2242 						global_context->RGnames_set[global_context->RGnames_ptr+id_len]='\t';
2243 						global_context->RGnames_ptr += id_len+1;
2244 					}
2245 				}
2246 				for( ;rcursor<bin_len; rcursor++ ) if(bin[rcursor] == '\n')break;
2247 			}
2248 			if(global_context->RGnames_ptr>0){
2249 				global_context->RGnames_set[global_context->RGnames_ptr-1]=0;
2250 				global_context->RGnames_ptr--;
2251 			}
2252 			//SUBREADprintf("RGList: %s\n", global_context->RGnames_set);
2253 
2254 			int thread_no;
2255 			for(thread_no = 0; thread_no < global_context -> thread_number; thread_no ++){
2256 				fc_thread_thread_context_t * RGthread_context = global_context -> thread_contexts + thread_no;
2257 				int RGcursor = 0;
2258 				char *lastRGptr = global_context->RGnames_set;
2259 				for(; RGcursor < global_context->RGnames_ptr+1; RGcursor++){
2260 					if(global_context->RGnames_set[ RGcursor ] == '\t' || global_context->RGnames_set[ RGcursor ] == 0){
2261 						global_context->RGnames_set[ RGcursor ] = 0;
2262 						if(strlen(lastRGptr)>0){
2263 					//		SUBREADprintf("PUT 4Tab:'%s'\n", lastRGptr);
2264 							get_RG_tables(global_context, RGthread_context, lastRGptr);
2265 							lastRGptr = global_context->RGnames_set + RGcursor +1;
2266 							if(RGcursor < global_context->RGnames_ptr)
2267 								global_context->RGnames_set[ RGcursor ] = '\t';
2268 						}
2269 					}
2270 				}
2271 			}
2272 		}
2273 	}else{
2274 		if(global_context -> sambam_chro_table)
2275 			global_context -> sambam_chro_table = delay_realloc(global_context -> sambam_chro_table, global_context -> sambam_chro_table_items * sizeof(SamBam_Reference_Info), (items + global_context -> sambam_chro_table_items) * sizeof(SamBam_Reference_Info));
2276 		else global_context -> sambam_chro_table = malloc(items * sizeof(SamBam_Reference_Info));
2277 
2278 		int x1, bin_ptr = 0;
2279 		for(x1 =  global_context -> sambam_chro_table_items; x1 <  global_context -> sambam_chro_table_items+items; x1++){
2280 			int l_name;
2281 			memcpy(&l_name, bin + bin_ptr, 4);
2282 			bin_ptr += 4;
2283 
2284 			if( !is_value_contig_name(bin + bin_ptr, l_name)){
2285 				SUBREADprintf("The chromosome name contains unexpected characters: \"%s\" (%d chars)\nfeatureCounts has to stop running\n", bin + bin_ptr, l_name);
2286 				return -1;
2287 			}
2288 			if(l_name >= MAX_CHROMOSOME_NAME_LEN){
2289 				SUBREADprintf("The chromosome name of \"%s\" contains %d characters, longer than the upper limit of %d\nfeatureCounts has to stop running\n",  bin + bin_ptr , l_name, MAX_CHROMOSOME_NAME_LEN - 1);
2290 				return -1;
2291 			}
2292 			memcpy(global_context -> sambam_chro_table[x1].chro_name ,  bin + bin_ptr, l_name);
2293 			//SUBREADprintf("The %d-th is '%s'\n", x1, global_context -> sambam_chro_table[x1].chro_name);
2294 			bin_ptr += l_name;
2295 			memcpy(&global_context -> sambam_chro_table[x1].chro_length ,  bin + bin_ptr, 4);
2296 			bin_ptr += 4;
2297 		}
2298 		global_context -> sambam_chro_table_items += items;
2299 	}
2300 	return 0;
2301 }
2302 
2303 void process_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2);
2304 
/*
 * Fetch the next '\027'-separated field of the string being tokenised through `state`
 * and convert it with atoi. A missing field gives -1, the value that the caller
 * treats as "no chromosome" / "no HI tag".
 */
static int make_dummy_next_int(char ** state){
	char * field = strtok_r(NULL, "\027", state);
	return field ? atoi(field) : -1;
}

/*
 * Build a placeholder SAM-like text line for a read whose own record is missing,
 * from the composite name `rname` and the BAM record `bin1` of its mate.
 *
 *   rname    : "name \027 r1_chro \027 r1_pos \027 r2_chro \027 r2_pos \027 HI" ('\027' is
 *              the separator byte); it is modified in place by strtok_r.
 *   bin1     : BAM record of the mate; only its FLAG (upper 16 bits of the 32-bit word at
 *              offset 16) is used.
 *   out_txt2 : receives the text line. NOTE(review): the size of this buffer is not known
 *              here, so the final sprintf is unbounded exactly as before; confirm at the caller.
 *   sambam_chro_table : reference table indexed by the chromosome numbers found in rname.
 *
 * The dummy read takes the R1/R2 slot that the mate does not occupy, and its FLAG mirrors
 * the mate's: unmapped <-> mate unmapped, reverse <-> mate reverse. Negative chromosome
 * numbers are written as "*", negative positions as 0. A HI tag is appended when HI >= 0.
 * Fields missing from rname are treated as -1, and a missing name is written as "*".
 */
void make_dummy(char * rname, char * bin1, char * out_txt2,  SamBam_Reference_Info * sambam_chro_table){
	char * tmptr = NULL;

	//SUBREADprintf("S=%s  ", rname);
	char * realname = strtok_r(rname, "\027", &tmptr);
	if(!realname) realname = "*";
	int r1_chro = make_dummy_next_int(&tmptr);
	int r1_pos = make_dummy_next_int(&tmptr);
	int r2_chro = make_dummy_next_int(&tmptr);
	int r2_pos = make_dummy_next_int(&tmptr);
	int HItag = make_dummy_next_int(&tmptr);

	int mate_FLAG = 0;
	memcpy(&mate_FLAG, bin1 + 16, 4);
	mate_FLAG = 0xffff&(mate_FLAG >>16);

	if(r1_chro<0) r1_pos=-1;
	if(r2_chro<0) r2_pos=-1;

	// the mate is R1 (0x40) => the dummy is R2 and takes the r2_* values, and vice versa
	int mate_is_first = mate_FLAG & 0x40;
	int my_chro = mate_is_first ? r2_chro : r1_chro;
	int my_pos = mate_is_first ? r2_pos : r1_pos;
	int mate_chro = mate_is_first ? r1_chro : r2_chro;
	int mate_pos = mate_is_first ? r1_pos : r2_pos;

	int my_flag = mate_is_first ? 0x80:0x40;
	my_flag |= 1;
	if(mate_FLAG & 8)my_flag |=4;
	if(mate_FLAG & 4)my_flag |=8;
	if(mate_FLAG & 0x10) my_flag |= 0x20;
	if(mate_FLAG & 0x20) my_flag |= 0x10;

	char HItagStr[20];
	if(HItag>=0){
		snprintf(HItagStr, sizeof HItagStr, "\tHI:i:%d", HItag);
	}else{
		HItagStr[0]=0;
	}

	char * my_chro_str = "*";
	if(my_chro >= 0) my_chro_str = sambam_chro_table[my_chro].chro_name;

	char * mate_chro_str = "*";
	if(mate_chro >= 0) mate_chro_str = sambam_chro_table[mate_chro].chro_name;

	sprintf(out_txt2, "%s\t%d\t%s\t%d\t0\t*\t%s\t%d\t0\tN\tI\t%s", realname, my_flag, my_chro_str, max(0, my_pos),
		mate_chro_str, max(0,mate_pos), HItagStr);
}
2354 
/*
 * Derive the FLAG of a read from the FLAG `mf` of its mate.
 *
 * Bits 0x1 and 0x2 are copied. The pairs 0x4/0x8, 0x10/0x20 and 0x40/0x80 are swapped.
 * If bit 0x1 is not set in `mf`, bit 0x4 is set in the result.
 * Every other bit of `mf` is dropped.
 */
int reverse_flag(int mf){
	// {bit tested in the mate flag, bit set in the result}
	static const int mirrored_bits[6][2] = {
		{0x04, 0x08}, {0x08, 0x04},
		{0x10, 0x20}, {0x20, 0x10},
		{0x40, 0x80}, {0x80, 0x40}
	};
	int own_flag = mf & 0x3;
	unsigned int pair_i;

	if(!(mf & 0x1)) own_flag |= 0x4;

	for(pair_i = 0; pair_i < 6; pair_i++){
		if(mf & mirrored_bits[pair_i][0])
			own_flag |= mirrored_bits[pair_i][1];
	}
	return own_flag;
}
2368 
/*
 * Total length covered by the `intvn` CIGAR sections in `intvs`: for every section its
 * chromosomal_length plus the lengths of all insertions recorded in it.
 * `read_name` is only used by the disabled debugging trace.
 */
int calc_total_frag_one_len(CIGAR_interval_t * intvs, int intvn, char * read_name){
	int total_len = 0;
	int sec_i = 0;

	while(sec_i < intvn){
		CIGAR_interval_t * section = intvs + sec_i;
		int ins_i = 0;

		// debugging trace for one hard-coded read name; the constant 0 keeps it switched off.
		if(0 && FIXLENstrcmp("V0112_0155:7:1101:20072:12961#ATCAC", read_name)==0){
			SUBREADprintf("READ %s SINGLE: chro_len = %d, secs = %d\n" , read_name, section -> chromosomal_length, section -> insertions);
		}

		while(ins_i < section -> insertions){
			total_len += section -> insertion_lengths[ins_i];
			ins_i++;
		}
		total_len += section -> chromosomal_length;
		sec_i++;
	}
	return total_len;
}
2382 
/*
 * Test whether the half-open ranges [r1_start, r1_end) and [r2_start, r2_end) overlap.
 *
 * Returns 1 and stores the common range in *overlap_start / *overlap_end when the range
 * that starts first (or at the same position) extends beyond the start of the other one.
 * Returns 0 otherwise, leaving both outputs untouched.
 */
int calc_total_has_overlap(unsigned int r1_start, unsigned int r1_end, unsigned int r2_start, unsigned int r2_end, unsigned int * overlap_start, unsigned int * overlap_end){
	int r1_reaches_r2 = (r1_start <= r2_start) && (r2_start < r1_end);
	int r2_reaches_r1 = (r2_start <= r1_start) && (r1_start < r2_end);

	if(!r1_reaches_r2 && !r2_reaches_r1) return 0;

	// the common part runs from the later start to the earlier end
	*overlap_start = (r1_start > r2_start) ? r1_start : r2_start;
	*overlap_end = (r1_end < r2_end) ? r1_end : r2_end;
	return 1;
}
2391 
/*
 * Remove the first `first_kept` insertions of a section: the insertions from index
 * `first_kept` on (at most MAXIMUM_INSERTION_IN_SECTION entries are looked at) are moved
 * to the front and the insertion count is updated.
 */
static void frag_len_drop_leading_insertions(CIGAR_interval_t * sec, int first_kept){
	int src, dst = 0;
	for(src = first_kept; src < min(MAXIMUM_INSERTION_IN_SECTION, sec -> insertions); src++){
		sec -> insertion_start_pos[dst] = sec -> insertion_start_pos[src];
		sec -> insertion_lengths[dst] = sec -> insertion_lengths[src];
		dst++;
	}
	sec -> insertions = dst;
}

/*
 * Length of the part of `sec` that lies before `overlap_start`: the chromosomal distance
 * from the section start to `overlap_start` plus the insertions located before it.
 * When an insertion at or after `overlap_start` is found, the insertions counted so far
 * are removed from the section.
 * NOTE(review): when every insertion lies before `overlap_start` nothing is removed, as in
 * the original code, so these insertions stay in the section for the later steps — confirm
 * that this is intended.
 */
static int frag_len_take_leading_part(CIGAR_interval_t * sec, unsigned int overlap_start){
	int part_len = (int)(overlap_start - sec -> start_pos);
	int ins_limit = min(MAXIMUM_INSERTION_IN_SECTION, sec -> insertions);
	int ins_i;

	for(ins_i = 0; ins_i < ins_limit; ins_i++){
		if(sec -> insertion_start_pos[ins_i] >= overlap_start){
			if(ins_i > 0){
				assert(sec -> insertions <= MAXIMUM_INSERTION_IN_SECTION);
				frag_len_drop_leading_insertions(sec, ins_i);
			}
			break;
		}
		part_len += sec -> insertion_lengths[ins_i];
	}
	return part_len;
}

/*
 * Walk the insertion lists of two overlapping sections in parallel and return the summed
 * length of the insertions that both sections have at the same position with the same
 * length. The insertions that were walked over are removed from each section afterwards.
 */
static int frag_len_merge_shared_insertions(CIGAR_interval_t * r1, CIGAR_interval_t * r2){
	int shared_len = 0;
	int i1 = 0, i2 = 0;

	while(i1 < r1 -> insertions && i2 < r2 -> insertions){
		if(r1 -> insertion_start_pos[i1] > r2 -> insertion_start_pos[i2]) i2++;
		else if(r1 -> insertion_start_pos[i1] < r2 -> insertion_start_pos[i2]) i1++;
		else{
			if(r1 -> insertion_lengths[i1] == r2 -> insertion_lengths[i2])
				shared_len += r1 -> insertion_lengths[i1];
			i1++;
			i2++;
		}
	}
	if(i1 > 0) frag_len_drop_leading_insertions(r1, i1);
	if(i2 > 0) frag_len_drop_leading_insertions(r2, i2);
	return shared_len;
}

/*
 * Length of the fragment formed by the CIGAR sections of read 1 and read 2, counting the
 * bases covered by both reads only once.
 *
 *   - only one read has sections: the length of that read's sections;
 *   - neither has sections: 0;
 *   - the first sections of the two reads are on different chromosomes: the sum of the
 *     two reads' lengths;
 *   - otherwise the two section lists are walked in parallel. A section that does not
 *     overlap the current section of the other read is added in full. For an overlapping
 *     pair, the part before the overlap, the overlap itself and the insertions shared by
 *     both reads are added; the section that ends later is trimmed to start at the end of
 *     the overlap and stays current.
 *
 * The section arrays are modified in place (start_pos, chromosomal_length, insertions).
 * global_context and thread_context are not used; read_name is only passed on to
 * calc_total_frag_one_len.
 *
 * The total is accumulated directly. The fixed-size scratch arrays used before had no
 * bound check on the number of merged sections and could be overrun; they also stored
 * section lengths as unsigned short, unlike calc_total_frag_one_len.
 */
int calc_total_frag_len( fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, CIGAR_interval_t * CIGAR_intervals_R1, int CIGAR_intervals_R1_sections, CIGAR_interval_t * CIGAR_intervals_R2, int CIGAR_intervals_R2_sections, char * read_name){
	if     ( CIGAR_intervals_R1_sections == 0 && CIGAR_intervals_R2_sections > 0) return calc_total_frag_one_len( CIGAR_intervals_R2,CIGAR_intervals_R2_sections , read_name);
	else if( CIGAR_intervals_R1_sections  > 0 && CIGAR_intervals_R2_sections== 0) return calc_total_frag_one_len( CIGAR_intervals_R1,CIGAR_intervals_R1_sections , read_name);
	else if( CIGAR_intervals_R1_sections == 0 && CIGAR_intervals_R2_sections== 0) return 0;

	if(CIGAR_intervals_R1_sections > 0 && CIGAR_intervals_R2_sections > 0){
		// a chromosome name may be NULL; a NULL name only equals another NULL name
		const char * chro_R1 = CIGAR_intervals_R1[0].chro;
		const char * chro_R2 = CIGAR_intervals_R2[0].chro;
		int same_chro = (chro_R1 == chro_R2) || (chro_R1 && chro_R2 && strcmp(chro_R1, chro_R2) == 0);
		if(!same_chro)
			// two reads are from different chromosomes
			return calc_total_frag_one_len( CIGAR_intervals_R2,CIGAR_intervals_R2_sections , read_name) + calc_total_frag_one_len( CIGAR_intervals_R1,CIGAR_intervals_R1_sections , read_name);
	}

	int ret = 0;
	int R1_i = 0 , R2_i = 0;
	while( R1_i < CIGAR_intervals_R1_sections || R2_i < CIGAR_intervals_R2_sections ){
		if( R1_i < CIGAR_intervals_R1_sections && R2_i < CIGAR_intervals_R2_sections){
			CIGAR_interval_t * r1 = CIGAR_intervals_R1 + R1_i;
			CIGAR_interval_t * r2 = CIGAR_intervals_R2 + R2_i;
			unsigned int overlapping_start = 0, overlapping_end = 0;

			int is_r1r2_overlap = calc_total_has_overlap( r1 -> start_pos, r1 -> start_pos + r1 -> chromosomal_length, r2 -> start_pos, r2 -> start_pos + r2 -> chromosomal_length, & overlapping_start, & overlapping_end);

			if( is_r1r2_overlap ){
				// the part of the earlier-starting section that precedes the overlap
				if(r1 -> start_pos > r2 -> start_pos) ret += frag_len_take_leading_part(r2, overlapping_start);
				else if(r1 -> start_pos < r2 -> start_pos) ret += frag_len_take_leading_part(r1, overlapping_start);

				// the overlap itself, with the insertions that both reads agree on
				ret += (int)(overlapping_end - overlapping_start);
				ret += frag_len_merge_shared_insertions(r1, r2);

				// The section that ends later keeps its tail for the next round.
				// The tail is NOT added here.
				if(r1 -> start_pos + r1 -> chromosomal_length > r2 -> start_pos + r2 -> chromosomal_length){
					r1 -> chromosomal_length -= ( overlapping_end - r1 -> start_pos );
					r1 -> start_pos = overlapping_end;
					R2_i ++;
				}else if(r1 -> start_pos + r1 -> chromosomal_length < r2 -> start_pos + r2 -> chromosomal_length){
					r2 -> chromosomal_length -= ( overlapping_end - r2 -> start_pos );
					r2 -> start_pos = overlapping_end;
					R1_i ++;
				}else{
					R1_i ++;
					R2_i ++;
				}
			}else if(r1 -> start_pos > r2 -> start_pos){
				// no overlap: the section that starts first is added in full
				ret += calc_total_frag_one_len(r2, 1, read_name);
				R2_i ++;
			}else{
				ret += calc_total_frag_one_len(r1, 1, read_name);
				R1_i ++;
			}
		}else if(R1_i < CIGAR_intervals_R1_sections){
			// read 2 is exhausted: the remaining sections of read 1 are added in full
			ret += calc_total_frag_one_len(CIGAR_intervals_R1 + R1_i, 1, read_name);
			R1_i ++;
		}else{
			// read 1 is exhausted: the remaining sections of read 2 are added in full
			ret += calc_total_frag_one_len(CIGAR_intervals_R2 + R2_i, 1, read_name);
			R2_i ++;
		}
	}

	return ret;
}
2608 
/*
 * Store in *read_name the address of the byte at offset 36 of the record `bin`
 * (the place where parse_bin also takes the read name from). Nothing is copied.
 */
void get_readname_from_bin(char * bin, char ** read_name){
	char * name_field = &bin[36];
	*read_name = name_field;
}
2612 
/*
 * Decode one BAM alignment record into its fields and into CIGAR-derived sections.
 *
 * Exactly one of `bin` (the record of the read itself) and `bin2` (the record of its mate,
 * used when the read itself is missing) is expected to be used: `bin` wins when both are
 * given, and at least one must be non-NULL.
 *
 * When `bin` is given:
 *   - read_name, flag, chro, pos (1-based), mapq, mate_chro, mate_pos (1-based), tlen,
 *     ret_me_refID and ret_mate_refID are taken from the fixed part of the record;
 *     chro / mate_chro are NULL for a negative reference number;
 *   - the CIGAR is split into sections at every I, D and N operation. Up to max_M sections
 *     are written to Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths,
 *     ChroNames and Event_After_Section ('I', 'D' or 'N' following the section), and their
 *     number is stored in *cigar_sect. *is_junction_read is set to 1 if an N is present;
 *   - when intervals_buffer is not NULL, the same sections are appended to it starting at
 *     index *intervals_i (advanced accordingly), with the insertions recorded per section.
 *     A leading soft clip moves the first interval start to the left and a later soft clip
 *     is added to the length of the last interval;
 *   - *NH_value is the NH tag of the record, or 1 if there is none;
 *   - when assign_reads_to_RG is set, *RG_ptr is the RG tag value if it is of type 'Z',
 *     else NULL.
 *
 * When only `bin2` is given: flag is derived with reverse_flag(), the "mate" fields of
 * bin2 become the read's own fields and vice versa, tlen is the negated mate tlen, no
 * section is produced (*cigar_sect = 0, *NH_value = 1), *mapq is left untouched, and RG is
 * looked up in bin2.
 *
 * NOTE(review): 32-bit fields are copied into the low bytes of zeroed 64-bit outputs
 * (pos, mate_pos), which presumes a little-endian host and turns a stored -1 into
 * 4294967296 after the +1 — kept as it was.
 */
void parse_bin(SamBam_Reference_Info * sambam_chro_table, char * bin, char * bin2, char ** read_name, int * flag, char ** chro, srInt_64 * pos, int * mapq, char ** mate_chro, srInt_64 * mate_pos, srInt_64 * tlen, int * is_junction_read, int * cigar_sect, unsigned int * Starting_Chro_Points_1BASE, unsigned short * Starting_Read_Points, unsigned short * Section_Read_Lengths, char ** ChroNames, char * Event_After_Section, int * NH_value, int max_M, CIGAR_interval_t * intervals_buffer, int * intervals_i, int assign_reads_to_RG, char ** RG_ptr, int * ret_me_refID, int * ret_mate_refID){
	int x1, len_of_S1 = 0;
	*cigar_sect = 0;
	*NH_value = 1;
	*flag = 0;
	*is_junction_read = 0;
	assert(bin||bin2);

	if(bin){
		(*read_name) = bin + 36;

		// the word at offset 16 holds the FLAG in its upper and the number of CIGAR operations in its lower 16 bits
		memcpy(flag, bin + 16, 4);
		int cigar_opts = (*flag) & 0xffff;
		(*flag) = (*flag) >> 16;
		int refID, mate_refID;
		memcpy(&refID, bin + 4, 4);
		if(refID >= 0) (*chro) = sambam_chro_table[refID].chro_name;
		else (*chro) = NULL;

		(*pos) = 0;
		memcpy(pos, bin+8, 4);
		(*pos) ++;

		// the word at offset 12: lowest byte = length of the read name, next byte = mapping quality
		memcpy(mapq, bin+12, 4);
		int l_read_name = (*mapq)& 0xff;
		(*mapq) = ((*mapq)>>8)&0xff;

		int seq_len;
		memcpy(&seq_len, bin + 20,4);
		memcpy(&mate_refID, bin+24, 4);
		if(mate_refID>=0) (*mate_chro) = sambam_chro_table[mate_refID].chro_name;
		else	(*mate_chro) = NULL;

		*ret_mate_refID = mate_refID;
		*ret_me_refID = refID;

		(*mate_pos)=0;
		memcpy(mate_pos, bin+28, 4);
		(*mate_pos)++;

		int tlen_int;
		memcpy(&tlen_int, bin+32, 4);
		(*tlen) = tlen_int;

		// The CIGAR words follow the read name, whose length is arbitrary, so they may be
		// unaligned: each one is fetched with memcpy instead of through an int pointer.
		char * cigar_bytes = bin + 36 + l_read_name;
		unsigned int chro_cursor = (*pos), section_start_chro = (*pos);
		unsigned short read_cursor = 0, this_section_length = 0, section_start_read = 0;

		if(intervals_buffer){
			intervals_buffer[ *intervals_i ].start_pos = chro_cursor;
			intervals_buffer[ *intervals_i ].chro = *chro;
		}

		for(x1 = 0 ; x1 < cigar_opts; x1++){
			unsigned int cigar_op;
			memcpy(&cigar_op, cigar_bytes + 4 * x1, 4);
			int optype = cigar_op & 0xf;
			int optval = (cigar_op >> 4) & 0xfffffff;
			if(optype == 0 || optype == 7 || optype == 8){ // 'M' , '=', 'X'
				chro_cursor += optval;
				read_cursor += optval;
				this_section_length += optval;
			}else if(optype == 1 || optype == 2 || optype == 3){ // 'I', 'D' or 'N'
				if(3 == optype)
					(*is_junction_read) = 1;
				char event_char=0;
				if(optype == 3) event_char = 'N';
				if(optype == 2) event_char = 'D';
				else if(optype == 1){
					// remember the insertion in the interval that is currently being built
					if(intervals_buffer && intervals_buffer[ *intervals_i ].insertions < MAXIMUM_INSERTION_IN_SECTION){
						intervals_buffer[ *intervals_i ].insertion_start_pos[  intervals_buffer[ *intervals_i ].insertions  ] = chro_cursor;
						intervals_buffer[ *intervals_i ].insertion_lengths[ intervals_buffer[ *intervals_i ].insertions ] = optval;
						intervals_buffer[ *intervals_i ].insertions ++;
					}
					event_char = 'I';
				}

				// close the section that ends at this operation
				if( (*cigar_sect) < max_M){
					Event_After_Section[*cigar_sect] = event_char;
					Starting_Chro_Points_1BASE[*cigar_sect] = section_start_chro;
					Starting_Read_Points[*cigar_sect] = section_start_read;
					Section_Read_Lengths[*cigar_sect] = this_section_length;
					ChroNames[*cigar_sect] = (*chro);
					(*cigar_sect)++;

					if(intervals_buffer){
						intervals_buffer[ *intervals_i ].chromosomal_length = chro_cursor - intervals_buffer[ *intervals_i ].start_pos;
						(*intervals_i) ++;
					}
				}

				if(optype == 2 || optype == 3)// N or D
					chro_cursor += optval;
				else
					read_cursor += optval;

				// open the next interval right after the operation
				if(intervals_buffer && (*cigar_sect) < max_M){
					intervals_buffer[ *intervals_i ].start_pos = chro_cursor;
					intervals_buffer[ *intervals_i ].chro = *chro;
				}

				section_start_chro = chro_cursor;
				section_start_read = read_cursor;
				this_section_length = 0;
			}else if(optype == 4){ // 'S'
				if(read_cursor==0)
				{
					// leading soft clip: the read section starts after it, the interval is extended to the left
					read_cursor += optval;
					section_start_read = read_cursor;

					if(intervals_buffer){
						if(intervals_buffer[ *intervals_i ].start_pos > optval) intervals_buffer[ *intervals_i ].start_pos -= optval;
						else intervals_buffer[ *intervals_i ].start_pos = 0;
					}
				}else	len_of_S1 = optval;	// later soft clip: added to the last interval below
			}	// H and P do not have effect on cigar parsing.
		}
		if(this_section_length>0){
			// add new section
			if( (*cigar_sect) < max_M){
				if(intervals_buffer){
					intervals_buffer[ *intervals_i ].chromosomal_length = chro_cursor - intervals_buffer[ *intervals_i ].start_pos + len_of_S1;
					(*intervals_i)++;
				}
				Starting_Chro_Points_1BASE[*cigar_sect] = section_start_chro;
				Starting_Read_Points[*cigar_sect] = section_start_read;
				Section_Read_Lengths[*cigar_sect] = this_section_length ;
				ChroNames[*cigar_sect] = (*chro);
				(*cigar_sect)++;
			}
		}

		// the optional tags start after the name, the CIGAR, the packed bases and the qualities
		int bin_ptr = 36 + l_read_name + seq_len + (seq_len+1)/2 + 4 * cigar_opts;
		int block_len;
		memcpy(&block_len, bin, 4);
		int found_NH = SAM_pairer_iterate_int_tags((unsigned char *)bin+bin_ptr, block_len + 4 - bin_ptr, "NH", NH_value);
		if(!found_NH) *(NH_value) = 1;

		if(assign_reads_to_RG){
			char RG_type = 0;
			SAM_pairer_iterate_tags((unsigned char *)bin+bin_ptr, block_len + 4 - bin_ptr, "RG", &RG_type, RG_ptr);
			if(RG_type != 'Z') (*RG_ptr) = NULL;
		}
		//SUBREADprintf("FOUND=%d, NH=%d, TAG=%.*s\n", found_NH, *(NH_value), 3 , bin+bin_ptr);
	}else{
		(*read_name) = bin2 + 36;
		int mate_flag;
		memcpy(&mate_flag, bin2 + 16, 4);
		mate_flag = mate_flag >> 16;
		(*flag) = reverse_flag(mate_flag);

		// the mate's "mate" fields describe this read, and the mate's own fields describe the mate
		int refID, mate_refID;
		memcpy(&refID, bin2 + 24, 4);
		memcpy(&mate_refID, bin2 + 4, 4);
		if(refID < 0) *chro = NULL;
		else (*chro) = sambam_chro_table[refID].chro_name;

		if(mate_refID < 0) *mate_chro = NULL;
		else (*mate_chro) = sambam_chro_table[mate_refID].chro_name;
		*ret_mate_refID = mate_refID;
		*ret_me_refID = refID;

		*pos=0;
		memcpy(pos, bin2+28, 4);
		(*pos)++;

		*mate_pos=0;
		memcpy(mate_pos, bin2+8, 4);
		(*mate_pos)++;

		// tlen is a signed 32-bit field: read it as such so that a negative value of the
		// mate is sign-extended before being negated (as done for `bin` above).
		int mate_tlen_int = 0;
		memcpy(&mate_tlen_int, bin2+32, 4);
		(*tlen) = -(srInt_64)mate_tlen_int;

		if(assign_reads_to_RG){
			char RG_type = 0;
			int block2_len = 0;
			memcpy(&block2_len, bin2, 4);
			int rname2len = 0, cigar2len = 0, seq2len = 0;
			memcpy(&rname2len, bin2+12, 1);
			memcpy(&cigar2len, bin2+16, 2);
			memcpy(&seq2len, bin2+20, 4);

			int bin2_ptr = 36 + rname2len + 4 * cigar2len + seq2len + (seq2len+1)/2;
			SAM_pairer_iterate_tags((unsigned char *)bin2+bin2_ptr, block2_len + 4 - bin2_ptr, "RG", &RG_type, RG_ptr);
			if(RG_type != 'Z') (*RG_ptr) = NULL;
		}

	}
}
2804 
2805 /*
2806 typedef struct {
2807 	char chromosome_name_left[CHROMOSOME_NAME_LENGTH + 1];
2808 	char chromosome_name_right[CHROMOSOME_NAME_LENGTH + 1];
2809 	unsigned int last_exon_base_left;
2810 	unsigned int first_exon_base_right;
2811 } fc_junction_info_t;
2812 
2813 */
/*
 * Collect the junctions described by the CIGAR sections of one read.
 *
 * For every section (from the second one on) that follows an 'N' event, one entry is
 * written to junctions_current: the last base of the preceding section, the first base
 * of this section, and the chromosome name of this section on both sides.
 * Sections whose chromosome name is NULL are skipped entirely.
 * global_context, alignment_masks and Starting_Read_Points are not used.
 *
 * Returns the number of entries written.
 */
int calc_junctions_from_cigarInts(fc_thread_global_context_t * global_context, int alignment_masks , int cigar_sections, unsigned int * Starting_Chro_Points_1BASE, unsigned short * Starting_Read_Points, unsigned short * Section_Lengths, char ** ChroNames, char * Event_After_Section, fc_junction_info_t * junctions_current){
	int n_junctions = 0;
	int sec_i = 1;
	// last base of the most recent section that was not skipped
	unsigned int prev_last_base = Starting_Chro_Points_1BASE[0] + Section_Lengths[0] - 1;

	while(sec_i < cigar_sections){
		char * sec_chro = ChroNames[sec_i];

		// NULL chro name for https://groups.google.com/forum/#!topic/subread/QDT6npjAZuE
		if(sec_chro != NULL){
			if('N' == Event_After_Section[sec_i - 1]){
				fc_junction_info_t * junction = junctions_current + n_junctions;
				junction -> last_exon_base_left = prev_last_base;
				junction -> first_exon_base_right = Starting_Chro_Points_1BASE[sec_i];
				strcpy(junction -> chromosome_name_left, sec_chro);
				strcpy(junction -> chromosome_name_right, sec_chro);
				n_junctions ++;
			}
			prev_last_base = Starting_Chro_Points_1BASE[sec_i] + Section_Lengths[sec_i] - 1;
		}
		sec_i ++;
	}
	return n_junctions;
}
2832 
2833 void add_fragment_supported_junction(	fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, fc_junction_info_t * supported_junctions1, int njunc1, fc_junction_info_t * supported_junctions2, int njunc2, char * RG_name);
2834 
/*
 * Extract the junctions supported by one read (or read pair) and hand them
 * to add_fragment_supported_junction().
 *
 * bin1 is always parsed; bin2 is parsed as well only when
 * global_context->is_paired_end_mode_assign is set. Each end is decoded with
 * parse_bin() and its junctions are gathered by
 * calc_junctions_from_cigarInts(). Nothing is reported when neither end
 * yields a junction.
 *
 * The read-group pointer passed on is the last non-NULL one returned by
 * parse_bin() over the parsed ends.
 */
void process_line_junctions(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2) {
	fc_junction_info_t supported_junctions1[global_context -> max_M], supported_junctions2[global_context -> max_M];
	int njunc1 = 0, njunc2 = 0;
	int alignment_masks, mapping_qual, NH_value, is_junction_read, cigar_sections;
	char * fragment_RG = NULL;
	int end_no;

	for(end_no = 0; end_no < 2; end_no++){
		// The second end is only examined in paired-end assignment mode.
		if(end_no == 1 && !global_context -> is_paired_end_mode_assign) break;

		char * read_chr, * read_name, * mate_chr;
		srInt_64 read_pos, fragment_length = 0, mate_pos;
		unsigned int Starting_Chro_Points_1BASE[global_context -> max_M];
		unsigned short Starting_Read_Points[global_context -> max_M];
		unsigned short Section_Read_Lengths[global_context -> max_M];
		char * ChroNames[global_context -> max_M];
		char Event_After_Section[global_context -> max_M];
		char * RG_this_end = NULL;
		int me_refID, mate_refID;

		// "this" is the end being decoded, "other" is its mate.
		char * this_bin = bin1;
		char * other_bin = bin2;
		if(end_no){
			this_bin = bin2;
			other_bin = bin1;
		}

		parse_bin(global_context -> sambam_chro_table, this_bin, other_bin, &read_name, &alignment_masks, &read_chr, &read_pos, &mapping_qual, &mate_chr, &mate_pos, &fragment_length, &is_junction_read, &cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, &NH_value, global_context -> max_M, NULL, NULL, global_context -> assign_reads_to_RG, &RG_this_end, &me_refID, &mate_refID);
		assert(cigar_sections <= global_context -> max_M);

		if(RG_this_end) fragment_RG = RG_this_end;

		if(end_no)
			njunc2 = calc_junctions_from_cigarInts(global_context, alignment_masks, cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, supported_junctions2);
		else
			njunc1 = calc_junctions_from_cigarInts(global_context, alignment_masks, cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, supported_junctions1);
	}

	if(njunc1 <= 0 && njunc2 <= 0) return;

	add_fragment_supported_junction(global_context, thread_context, supported_junctions1, njunc1, supported_junctions2, njunc2, fragment_RG);
}
2869 
/*
 * Return the per-read-group tables of this thread for rg_name, creating and
 * registering them in thread_context->RG_table on first use.
 *
 * The returned array has four slots:
 *   [0] zero-filled count table (count_table_size entries of read_count_type_t)
 *   [1] zero-filled fc_read_counters
 *   [2] junction counting hash table, or NULL when junction counting is off
 *   [3] splicing point hash table,    or NULL when junction counting is off
 *
 * The key stored in RG_table is a private copy of rg_name.
 *
 * NOTE(review): allocation failures are not handled here; the callers visible
 * in this file dereference the result unconditionally.
 */
void ** get_RG_tables(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * rg_name){
	void ** ret = HashTableGet(thread_context->RG_table, rg_name);
	if(ret) return ret;

	ret = malloc(sizeof(void *)*4);

	// calloc zero-fills and also guards the count * size multiplication.
	ret[0] = calloc(thread_context -> count_table_size, sizeof(read_count_type_t));
	ret[1] = calloc(1, sizeof(fc_read_counters));

	// Both junction slots get a defined value even when junction counting is off
	// (slot 3 used to be left uninitialised in that case).
	ret[2] = NULL;
	ret[3] = NULL;

	if(global_context -> do_junction_counting){
		int slot;
		// Slot 2 (junction counting) and slot 3 (splicing points) are configured identically.
		for(slot = 2; slot <= 3; slot++){
			HashTable * string_keyed_table = HashTableCreate(131317);
			HashTableSetHashFunction(string_keyed_table, HashTableStringHashFunction);
			HashTableSetDeallocationFunctions(string_keyed_table, free, NULL);
			HashTableSetKeyComparisonFunction(string_keyed_table, fc_strcmp_chro);
			ret[slot] = string_keyed_table;
		}
	}

	char * rg_name_mem = malloc(strlen(rg_name)+1);
	strcpy(rg_name_mem, rg_name);
	HashTablePut(thread_context->RG_table, rg_name_mem, ret);
	return ret;
}
2902 
2903 void add_scRNA_read_tota1_no( fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, char * read_name, char * bin1, int step);
/*
 * Forward one record to add_scRNA_read_tota1_no() with step 2.
 * The read name is taken to start 36 bytes into bin1; bin2 is not used.
 */
void process_scRNAr2_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,  char * bin1, char * bin2){
	add_scRNA_read_tota1_no(global_context, thread_context, bin1 + 36, bin1, 2);
}
2908 
process_pairer_scRNAr2_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)2909 int process_pairer_scRNAr2_output(void * pairer_vp, int thread_no, char * bin1, char * bin2){
2910 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2911 	fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2912 	fc_thread_thread_context_t * thread_context = global_context -> thread_contexts + thread_no;
2913 	process_scRNAr2_line_buffer(global_context, thread_context, bin1, bin2);
2914 	return 0;
2915 }
2916 
2917 
process_pairer_output(void * pairer_vp,int thread_no,char * bin1,char * bin2)2918 int process_pairer_output(void * pairer_vp, int thread_no, char * bin1, char * bin2){
2919 	SAM_pairer_context_t * pairer = (SAM_pairer_context_t *) pairer_vp;
2920 	fc_thread_global_context_t * global_context = (fc_thread_global_context_t * )pairer -> appendix1;
2921 	fc_thread_thread_context_t * thread_context = global_context -> thread_contexts + thread_no;
2922 
2923 	if(pairer -> long_cigar_mode){
2924 		if(global_context -> max_M < 65536){
2925 			//SUBREADprintf("SWITCHED INTO LONG-READ MODE\n");
2926 			global_context -> max_M = 65536;
2927 		}
2928 		if(!global_context->is_read_too_long_to_SAM_BAM_shown &&(global_context -> is_read_details_out == FILE_TYPE_SAM || global_context -> is_read_details_out == FILE_TYPE_BAM)){
2929 			global_context -> is_read_details_out = 0;
2930 			SUBREADprintf("ERROR: The read is too long to the SAM or BAM output.\nPlease use the 'CORE' mode for the assignment detail output.\n");
2931 			global_context->is_read_too_long_to_SAM_BAM_shown = 1;
2932 		}
2933 	}
2934 
2935 	process_line_buffer(global_context, thread_context, bin1, bin2);
2936 	if(0 && global_context -> do_junction_counting){
2937 		process_line_junctions(global_context, thread_context, bin1, bin2);
2938 	}
2939 	return 0;
2940 }
2941 
2942 void sort_bucket_table(fc_thread_global_context_t * global_context);
2943 void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,
2944 			srInt_64 * hits_indices1, int nhits1, srInt_64 * hits_indices2, int nhits2, unsigned int total_frag_len,
2945 			char ** hits_chro1, char ** hits_chro2, unsigned int * hits_start_pos1, unsigned int * hits_start_pos2, unsigned short * hits_length1, unsigned short * hits_length2, int fixed_fractional_count, char * read_name, char * RG_name, char * bin1, char * bin2);
2946 
/*
 * Build a copy of a BAM record with extra optional tags appended.
 *
 * oldbin : record starting with its 4-byte block size (the size excludes
 *          those 4 bytes).
 * newbin : receives a malloc'ed buffer holding the extended record; the
 *          caller frees it. It is set to NULL if the allocation fails.
 * tags   : NULL-terminated list of two-character tag names.
 * types  : one type character per tag. 'i' stores a 4-byte integer; every
 *          other type is written as a NUL-terminated string.
 * vals   : one value per tag. For 'i' the integer is carried in the pointer
 *          value itself; otherwise it points to the string to store.
 *
 * The 4-byte block size at the start of the new record is updated to cover
 * the appended tags.
 */
void add_bin_new_tags(char * oldbin, char **newbin, char ** tags, char * types, void ** vals){
	int tagi;
	int extra_len = 0;

	// Each tag costs 2 (name) + 1 (type) + payload; strings also store their NUL.
	for(tagi = 0; tags[tagi]; tagi++){
		if(types[tagi] == 'i') extra_len += 2 + 1 + 4;
		else extra_len += 2 + 1 + (int)strlen((char *)vals[tagi]) + 1;
	}

	int old_total = 0;
	memcpy(&old_total, oldbin, 4);
	old_total += 4;	// include the length field itself

	int new_total = old_total + extra_len;
	(*newbin) = malloc(new_total);
	if(NULL == (*newbin)) return;	// previously this fell through to memcpy on NULL

	memcpy(*newbin, oldbin, old_total);
	int new_block_size = new_total - 4;
	memcpy(*newbin, &new_block_size, 4);

	char * cursor = (*newbin) + old_total;
	for(tagi = 0; tags[tagi]; tagi++){
		memcpy(cursor, tags[tagi], 2);
		cursor[2] = types[tagi];
		cursor += 3;

		if(types[tagi] == 'i'){
			// The integer travels inside the pointer value. Decode it with a cast:
			// subtracting NULL from a void pointer is not valid ISO C.
			int intv = (int)(size_t)vals[tagi];
			memcpy(cursor, &intv, 4);
			cursor += 4;
		}else{
			size_t vlen = strlen((char *)vals[tagi]) + 1;
			memcpy(cursor, vals[tagi], vlen);
			cursor += vlen;
		}
	}
}
2981 
2982 
2983 
/*
 * Encode the records stored in thread_context->read_details_buff between
 * write_start and write_end into bam_buf, and return the number of bytes
 * produced.
 *
 * SAM output: every record in the range is converted to text with
 * convert_BAM_binary_to_SAM() and terminated by '\n'; a NUL byte is written
 * after the last newline but is not counted in the return value.
 *
 * Otherwise: the whole range is raw-deflated into bam_buf + 18 (at most 66600
 * bytes of output), preceded by an 18-byte gzip header carrying a "BC" extra
 * subfield and followed by the CRC32 and the uncompressed length. The layout
 * appears to follow the BGZF block format.
 *
 * The multi-byte header fields are copied from the low-addressed bytes of an
 * int, exactly as before.
 */
int compress_read_detail_BAM(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, int write_start, int write_end, char * bam_buf){
	if(global_context -> is_read_details_out != FILE_TYPE_SAM){
		// Binary path: there may be multiple reads in the range.
		int raw_len = write_end - write_start;
		char * deflate_out = bam_buf + 18;
		int deflated_len;
		unsigned int raw_crc;
		int mem_level = 8;
		int field;

		thread_context -> bam_file_output_stream.avail_out = 66600;
		thread_context -> bam_file_output_stream.avail_in = raw_len;
		raw_crc = FC_CRC32(thread_context -> read_details_buff + write_start, raw_len);

		thread_context -> bam_file_output_stream.zalloc = Z_NULL;
		thread_context -> bam_file_output_stream.zfree = Z_NULL;
		thread_context -> bam_file_output_stream.opaque = Z_NULL;

		// windowBits of -15 selects a raw deflate stream without zlib/gzip framing.
		deflateInit2(&thread_context -> bam_file_output_stream, raw_len ? Z_BEST_SPEED : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, mem_level, Z_DEFAULT_STRATEGY);

		thread_context -> bam_file_output_stream.next_in = (unsigned char*) thread_context -> read_details_buff + write_start;
		thread_context -> bam_file_output_stream.next_out = (unsigned char*) deflate_out;

		deflate(&thread_context -> bam_file_output_stream, Z_FINISH);
		deflateEnd(&thread_context -> bam_file_output_stream);

		deflated_len = 66600 - thread_context -> bam_file_output_stream.avail_out;

		bam_buf[0] = 31;		// ID1
		bam_buf[1] = -117;		// ID2 (0x8b)
		bam_buf[2] = 8;			// CM: deflate
		bam_buf[3] = 4;			// FLG: extra field present
		memset(bam_buf + 4, 0, 5);	// MTIME and XFL
		bam_buf[9] = 0xff;		// OS

		field = 6;
		memcpy(bam_buf + 10, &field, 2);	// XLEN
		bam_buf[12] = 66;			// SI1 'B'
		bam_buf[13] = 67;			// SI2 'C'
		field = 2;
		memcpy(bam_buf + 14, &field, 2);	// SLEN
		field = deflated_len + 19 + 6;
		memcpy(bam_buf + 16, &field, 2);	// BSIZE: total block size minus one

		memcpy(bam_buf + 18 + deflated_len, &raw_crc, 4);
		memcpy(bam_buf + 18 + deflated_len + 4, &raw_len, 4);
		return 18 + deflated_len + 8;
	}

	// Text path: the original note says there MUST be only one read in the buffer.
	int sam_used = 0;
	int rec_ofs = write_start;
	while(rec_ofs < write_end){
		int rec_bytes = 0;
		memcpy(&rec_bytes, thread_context -> read_details_buff + rec_ofs, 4);

		int line_len = convert_BAM_binary_to_SAM(global_context -> sambam_chro_table, thread_context -> read_details_buff + rec_ofs, bam_buf + sam_used);
		char * line_end = bam_buf + sam_used + line_len;
		line_end[0] = '\n';
		line_end[1] = 0;

		sam_used += line_len + 1;
		rec_ofs += rec_bytes + 4;	// 4 accounts for the length field itself
	}
	return sam_used;
}
3050 
/*
 * Flush the per-thread read-detail buffer to global_context->read_details_out_FP.
 *
 * BAM output with fewer than 64000 buffered bytes is compressed as one block.
 * Otherwise the buffer is walked record by record: for SAM every record is
 * converted on its own, for BAM records are grouped until a group exceeds
 * 64000 bytes or the buffer ends. All encoded output is gathered in
 * thread_context->bam_compressed_buff and written in one fwrite() while
 * holding read_details_out_lock.
 *
 * A short write now sets global_context->disk_is_full; the fwrite() result
 * used to be ignored.
 *
 * NOTE(review): on a record-length format error the function returns without
 * writing anything and without resetting read_details_buff_used - confirm
 * that this is intended.
 */
void write_read_detailed_remainder(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context){
	int write_bin_ptr = 0;
	int last_written_ptr = 0;
	int bam_compressed_buff_ptr = 0;

	if(thread_context -> read_details_buff_used <1)return;

	if(global_context -> is_read_details_out == FILE_TYPE_BAM && thread_context -> read_details_buff_used < 64000){
		bam_compressed_buff_ptr = compress_read_detail_BAM(global_context, thread_context, 0, thread_context -> read_details_buff_used, thread_context -> bam_compressed_buff);
	}else while(1){
		if(write_bin_ptr >= thread_context -> read_details_buff_used ) break;
		int tmplen = 0;
		memcpy(&tmplen, thread_context -> read_details_buff + write_bin_ptr, 4);
		if(tmplen < 9 || tmplen > 3*MAX_FC_READ_LENGTH){
			SUBREADprintf("ERROR: Format error : len = %d\n", tmplen);
			return ;
		}
		tmplen +=4;	// include the 4-byte length field
		write_bin_ptr += tmplen;
		if(write_bin_ptr - last_written_ptr > 64000 || write_bin_ptr >= thread_context -> read_details_buff_used || global_context -> is_read_details_out == FILE_TYPE_SAM){
			bam_compressed_buff_ptr += compress_read_detail_BAM(global_context, thread_context, last_written_ptr, write_bin_ptr, thread_context -> bam_compressed_buff + bam_compressed_buff_ptr);
			last_written_ptr = write_bin_ptr;
		}
	}

	pthread_spin_lock(&global_context -> read_details_out_lock);
	size_t bytes_written = fwrite(thread_context -> bam_compressed_buff, 1, bam_compressed_buff_ptr , global_context -> read_details_out_FP);
	// A short write means the output could not be stored completely.
	if(bytes_written != (size_t)bam_compressed_buff_ptr) global_context -> disk_is_full = 1;
	pthread_spin_unlock(&global_context -> read_details_out_lock);

	thread_context -> read_details_buff_used =0;
}
3081 
3082 
/*
 * Append one BAM record to the thread's read-detail buffer.
 *
 * The record size is its 4-byte block size plus the 4 bytes of that field.
 * Records larger than 3 * MAX_FC_READ_LENGTH are rejected with -1 (the error
 * message is printed only once per run). When do_write is non-zero the
 * buffer is flushed right away for SAM output, or once it holds at least
 * 55000 bytes otherwise.
 *
 * Returns 0 on success, -1 if the record was rejected.
 */
int add_read_detail_bin_buff(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,  char * bin, int do_write){
	int record_bytes = 0;
	memcpy(&record_bytes, bin, 4);
	record_bytes += 4;

	if(record_bytes > MAX_FC_READ_LENGTH * 3){
		if(!global_context->is_read_too_long_to_SAM_BAM_shown){
			SUBREADprintf("ERROR: The read is too long to the SAM or BAM output.\nPlease use the 'CORE' mode for the assignment detail output.\n");
			global_context->is_read_too_long_to_SAM_BAM_shown = 1;
		}
		return -1;
	}

	char * append_at = thread_context -> read_details_buff + thread_context -> read_details_buff_used;
	memcpy(append_at, bin, record_bytes);
	thread_context -> read_details_buff_used += record_bytes;

	if(!do_write) return 0;

	int flush_now = global_context -> is_read_details_out == FILE_TYPE_SAM || thread_context -> read_details_buff_used >= 55000;
	if(flush_now) write_read_detailed_remainder(global_context, thread_context);

	return 0;
}
3104 
/*
 * Record the assignment outcome of a read (or read pair).
 *
 * FILE_TYPE_RSUBREAD: one tab-separated text line (read name, status, feature
 * count, feature list or "NA") is printed to read_details_out_FP.
 *
 * Any other output type: each non-NULL record gets the tags XS (status) and,
 * when feature_count > 0, XN (feature count) and XT (feature list) appended,
 * and is queued with add_read_detail_bin_buff(). The buffer is flushed after
 * the last record of the pair.
 *
 * Returns 1 on success. If the text line cannot be written the function
 * returns 0 and sets global_context->disk_is_full. The result of fprintf()
 * used to be ignored, so that flag could never be raised from here.
 */
int write_read_details_FP(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * status, int feature_count, char * features, char * bin1, char * bin2){
	int ret = 1;

	if(global_context -> is_read_details_out == FILE_TYPE_RSUBREAD){
		char * read_name;
		get_readname_from_bin(bin1?bin1:bin2, &read_name);
		if(fprintf(global_context -> read_details_out_FP, "%s\t%s\t%d\t%s\n", read_name, status, feature_count, features?features:"NA") < 0)
			ret = 0;
	}else{
		char * out_bin1 = NULL, *out_bin2 = NULL;
		char * tags[4];
		char types[4];
		void * vals[4];

		// The tag list is NULL-terminated: XN and XT are dropped when nothing was assigned.
		tags[0]="XS";
		tags[1]=feature_count >0?"XN":NULL;
		tags[2]=feature_count >0?"XT":NULL;
		tags[3]=NULL;
		types[0]='Z';
		types[1]='i';
		types[2]='Z';
		types[3]=0;
		vals[0]=status;
		// The integer is carried in the pointer value; a cast replaces the
		// non-standard arithmetic on a NULL void pointer.
		vals[1]=(void *)(size_t)feature_count;
		// Same fallback as the text branch, so a NULL list cannot reach strlen().
		vals[2]=features?features:"NA";
		vals[3]=NULL;

		if(bin1){
			add_bin_new_tags(bin1, &out_bin1, tags, types, vals);
			if(out_bin1){
				add_read_detail_bin_buff(global_context, thread_context, out_bin1, bin2 == NULL);
				free(out_bin1);
			}
		}

		if(bin2){
			add_bin_new_tags(bin2, &out_bin2, tags, types, vals);
			if(out_bin2){
				add_read_detail_bin_buff(global_context, thread_context, out_bin2, 1);
				free(out_bin2);
			}
		}
	}
	if(ret < 1) global_context -> disk_is_full = 1;
	return ret;
}
3145 
/*
 * In verbose mode, report chromosomes/contigs that appear in the annotation
 * but not in the input file, and the other way round.
 *
 * Input-file names are translated through BAM_chros_to_anno_table first when
 * that table exists and has an entry for the name. The comparison itself is
 * done by warning_hash_hash() on two temporary hash tables.
 *
 * Nothing is reported unless global_context->is_verbose is set, so the
 * function now returns before building the tables in that case. They used
 * to be filled with every chromosome and every exon and then thrown away.
 */
void warning_anno_BAM_chromosomes(fc_thread_global_context_t * global_context){
	if(!global_context -> is_verbose) return;

	int x1;
	// Non-NULL marker stored for every present name; same value as the former NULL+1.
	void * present = (void *)1;

	HashTable * BAM_chro_tab = HashTableCreate(1117);
	HashTableSetHashFunction(BAM_chro_tab,HashTableStringHashFunction);
	HashTableSetKeyComparisonFunction(BAM_chro_tab,fc_strcmp_chro );

	for(x1 = 0; x1 < global_context -> sambam_chro_table_items; x1++){
		char * BAM_chro = global_context -> sambam_chro_table[x1].chro_name;
		if( global_context -> BAM_chros_to_anno_table){
			char * tmp_chro = HashTableGet(global_context -> BAM_chros_to_anno_table, global_context -> sambam_chro_table[x1].chro_name);
			if(tmp_chro) BAM_chro = tmp_chro;
		}
		HashTablePut(BAM_chro_tab, BAM_chro, present);
	}

	HashTable * ANNO_chro_tab = HashTableCreate(1117);
	HashTableSetHashFunction(ANNO_chro_tab,HashTableStringHashFunction);
	HashTableSetKeyComparisonFunction(ANNO_chro_tab,fc_strcmp_chro );

	for(x1 = 0 ; x1 < global_context -> exontable_exons ; x1++)
		HashTablePut(ANNO_chro_tab, global_context -> exontable_chr[x1], present);

	warning_hash_hash(ANNO_chro_tab, BAM_chro_tab, "Chromosomes/contigs in annotation but not in input file");
	warning_hash_hash(BAM_chro_tab, ANNO_chro_tab, "Chromosomes/contigs in input file but not in annotation");

	HashTableDestroy(BAM_chro_tab);
	HashTableDestroy(ANNO_chro_tab);
}
3175 
3176 void add_scRNA_read_to_pool( fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, srInt_64 assign_target_number, char * read_name, char * read_bin, ArrayList * target_list );
3177 
process_line_buffer(fc_thread_global_context_t * global_context,fc_thread_thread_context_t * thread_context,char * bin1,char * bin2)3178 void process_line_buffer(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, char * bin1, char * bin2)
3179 {
3180 	if(global_context -> is_input_bad_format) return;
3181 	char * read_chr, *read_name, *mate_chr;
3182 	srInt_64 read_pos, fragment_length = 0, mate_pos;
3183 	unsigned int search_start = 0, search_end;
3184 	int nhits1 = 0, nhits2 = 0, alignment_masks, search_block_id, search_item_id, mapping_qual;
3185 
3186 
3187 	//long * hits_indices1 = thread_context -> hits_indices1, * hits_indices2 = thread_context -> hits_indices2;
3188 	//unsigned int * hits_start_pos1 = thread_context -> hits_start_pos1 ,  * hits_start_pos2 = thread_context -> hits_start_pos2;
3189 	//unsigned short * hits_length1 = thread_context -> hits_length1 ,  * hits_length2 = thread_context -> hits_length2;
3190 	//char ** hits_chro1 = thread_context -> hits_chro1 , **hits_chro2 = thread_context -> hits_chro2;
3191 
3192 	unsigned int  total_frag_len =0;
3193 
3194 	int cigar_sections, is_junction_read;
3195 	unsigned int * Starting_Chro_Points_1BASE = thread_context -> proc_Starting_Chro_Points_1BASE;
3196 	unsigned short * Starting_Read_Points = thread_context -> proc_Starting_Read_Points;
3197 	unsigned short * Section_Read_Lengths = thread_context -> proc_Section_Read_Lengths;
3198 	char ** ChroNames = thread_context -> proc_ChroNames;
3199 	char * Event_After_Section = thread_context -> proc_Event_After_Section;
3200 
3201 	CIGAR_interval_t * CIGAR_intervals_R1 = thread_context -> proc_CIGAR_intervals_R1;
3202 	CIGAR_interval_t * CIGAR_intervals_R2 = thread_context -> proc_CIGAR_intervals_R2;
3203 
3204 	int is_second_read;
3205 	int maximum_NH_value = 1, NH_value;
3206 	int skipped_for_exonic = 0;
3207 	int first_read_quality_score = 0, CIGAR_intervals_R1_sections = 0, CIGAR_intervals_R2_sections = 0;
3208 
3209 	if(thread_context -> thread_id == 0 && thread_context -> all_reads < 1){
3210 		warning_anno_BAM_chromosomes(global_context);
3211 	}
3212 
3213 	if(global_context -> need_calculate_overlap_len ){
3214 		memset( CIGAR_intervals_R1, 0, sizeof(CIGAR_interval_t) *  global_context -> max_M  );
3215 		memset( CIGAR_intervals_R2, 0, sizeof(CIGAR_interval_t) *  global_context -> max_M  );
3216 	}
3217 
3218 	thread_context->all_reads++;
3219 	//if(thread_context->all_reads>1000000) printf("TA=%llu\n%s\n",thread_context->all_reads, thread_context -> line_buffer1);
3220 
3221 
3222 	char * RG_ptr;
3223 	int me_refID =-1, mate_refID =-1, this_is_inconsistent_read_type = 0;
3224 	for(is_second_read = 0 ; is_second_read < 2; is_second_read++)
3225 	{
3226 		if(is_second_read && !global_context -> is_paired_end_mode_assign) break;
3227 
3228 		RG_ptr = NULL;
3229 		parse_bin(global_context -> sambam_chro_table, is_second_read?bin2:bin1, is_second_read?bin1:bin2 , &read_name,  &alignment_masks , &read_chr, &read_pos, &mapping_qual, &mate_chr, &mate_pos, &fragment_length, &is_junction_read, &cigar_sections, Starting_Chro_Points_1BASE, Starting_Read_Points, Section_Read_Lengths, ChroNames, Event_After_Section, &NH_value, global_context -> max_M , global_context -> need_calculate_overlap_len?(is_second_read?CIGAR_intervals_R2:CIGAR_intervals_R1):NULL, is_second_read?&CIGAR_intervals_R2_sections:&CIGAR_intervals_R1_sections, global_context -> assign_reads_to_RG, &RG_ptr, &me_refID, &mate_refID);
3230 
3231 		// this will be done in the other function.
3232 		if(global_context -> is_paired_end_mode_assign && (alignment_masks&1)==0) alignment_masks|=8;
3233 
3234 		//#warning "========= DEBUG OUTPUT =============="
3235 		if(0 && FIXLENstrcmp("SEV0112_0155:7:1303:14436:74270", read_name)==0){
3236 			SUBREADprintf("RTEST:%s R_%d   %p, %p    FLAGS %d\n", read_name, 1+is_second_read, bin1, bin2, alignment_masks);
3237 		}
3238 
3239 		if(global_context -> assign_reads_to_RG && NULL == RG_ptr)return;
3240 
3241 		if(  ( alignment_masks & SAM_FLAG_PAIRED_TASK ) && !global_context -> any_reads_are_PE ) global_context -> any_reads_are_PE=1;
3242 		if(((!global_context -> is_paired_end_reads_expected)  && ( alignment_masks & SAM_FLAG_PAIRED_TASK )) || ((global_context -> is_paired_end_reads_expected)  && 0 == ( alignment_masks & SAM_FLAG_PAIRED_TASK ))){
3243 			if(global_context -> is_mixed_PE_SE == 0) global_context -> is_mixed_PE_SE =1;
3244 			if(!global_context -> is_paired_end_reads_expected){
3245 				SUBREADprintf("ERROR: Paired-end reads were detected in single-end read library : %s\n", global_context -> input_file_name);
3246 				global_context -> is_input_bad_format = 1;
3247 				return;
3248 			}
3249 			this_is_inconsistent_read_type = 1;
3250 		}
3251 
3252 		if(global_context -> do_scRNA_table)add_scRNA_read_tota1_no(global_context, thread_context, read_name, bin1, 0);
3253 
3254 		if(is_second_read == 0)
3255 		{
3256 			//skip the read if unmapped (its mate will be skipped as well if paired-end)
3257 			if( ((!global_context -> is_paired_end_mode_assign) &&  (alignment_masks & SAM_FLAG_UNMAPPED) ) ||
3258 			    ((alignment_masks & SAM_FLAG_UNMAPPED)   &&  (alignment_masks & SAM_FLAG_MATE_UNMATCHED) && global_context -> is_paired_end_mode_assign)) {
3259 				if(RG_ptr){
3260 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3261 					fc_read_counters * sumtab = tab4s[1];
3262 					sumtab -> unassigned_unmapped++;
3263 				}else
3264 					thread_context->read_counters.unassigned_unmapped ++;
3265 
3266 				if(global_context -> read_details_out_FP)
3267 					write_read_details_FP(global_context , thread_context ,"Unassigned_Unmapped",0, NULL, bin1, bin2);
3268 
3269 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3270 				return;	// do nothing if a read is unmapped, or the first read in a pair of reads is unmapped.
3271 			}
3272 		}
3273 
3274 		if(global_context -> do_scRNA_table)add_scRNA_read_tota1_no(global_context, thread_context, read_name, bin1, 1);
3275 		if(((alignment_masks & SAM_FLAG_UNMAPPED) || (alignment_masks & SAM_FLAG_MATE_UNMATCHED)) && global_context -> is_paired_end_mode_assign && global_context -> is_both_end_required){
3276 				if(RG_ptr){
3277 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3278 					fc_read_counters * sumtab = tab4s[1];
3279 					sumtab -> unassigned_singleton++;
3280 				}else
3281 					thread_context->read_counters.unassigned_singleton ++;
3282 
3283 				if(global_context -> read_details_out_FP)
3284 					write_read_details_FP(global_context , thread_context ,"Unassigned_Singleton",0, NULL, bin1, bin2);
3285 
3286 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3287 				return;
3288 		}
3289 
3290 		if(this_is_inconsistent_read_type){
3291 			if(global_context -> is_both_end_required && 0 == ( alignment_masks & SAM_FLAG_PAIRED_TASK )){
3292 				if(global_context -> read_details_out_FP)
3293 					write_read_details_FP(global_context , thread_context ,"Unassigned_Singleton",0, NULL, bin1, bin2);
3294 				if(RG_ptr){
3295 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3296 					fc_read_counters * sumtab = tab4s[1];
3297 					sumtab -> unassigned_singleton++;
3298 				}else thread_context->read_counters.unassigned_singleton ++;
3299 
3300 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3301 				return; // when running on PE mode, SE reads are seen as "only one end mapped"
3302 			}
3303 		}
3304 
3305 		if(global_context -> min_mapping_quality_score>0)
3306 		{
3307 			//printf("SECOND=%d; FIRST=%d; THIS=%d; Q=%d\n", is_second_read, first_read_quality_score, mapping_qual, );
3308 			if(( mapping_qual < global_context -> min_mapping_quality_score  && ! global_context -> is_paired_end_mode_assign)||( is_second_read  && max( first_read_quality_score, mapping_qual ) < global_context -> min_mapping_quality_score))
3309 			{
3310 				thread_context->read_counters.unassigned_mappingquality ++;
3311 
3312 				if(global_context -> read_details_out_FP)
3313 				{
3314 					write_read_details_FP(global_context, thread_context, "Unassigned_MappingQuality", 0, NULL, bin1, bin2);
3315 				}
3316 
3317 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3318 				return;
3319 			}
3320 			if(is_second_read==0 && global_context -> is_paired_end_mode_assign)
3321 			{
3322 				first_read_quality_score = mapping_qual;
3323 			}
3324 		}
3325 
3326 		if(is_second_read == 0 && global_context -> is_paired_end_mode_assign &&
3327 	   	  (global_context -> is_PE_distance_checked || global_context -> is_chimertc_disallowed)
3328 		  )
3329 		{
3330 			int is_half_mapped = (alignment_masks & SAM_FLAG_UNMAPPED) || (alignment_masks & SAM_FLAG_MATE_UNMATCHED);
3331 
3332 			if(!is_half_mapped)
3333 			{
3334 				fragment_length = abs( fragment_length ); //get the fragment length
3335 
3336 				int is_first_read_negative_strand = (alignment_masks & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
3337 				int is_second_read_negative_strand = (alignment_masks & SAM_FLAG_MATE_REVERSE_STRAND_MATCHED)?1:0;
3338 
3339 				if(mate_chr == read_chr && is_first_read_negative_strand!=is_second_read_negative_strand) {
3340 				 //^^^^^^^^^^^^^^^^^^^^ They are directly compared because they are both pointers in the same contig name table.
3341 				 //
3342 					if(global_context -> is_PE_distance_checked && ((fragment_length > global_context -> max_paired_end_distance) || (fragment_length < global_context -> min_paired_end_distance))) {
3343 						if(RG_ptr){
3344 							void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3345 							fc_read_counters * sumtab = tab4s[1];
3346 							sumtab -> unassigned_fragmentlength++;
3347 						}else
3348 							thread_context->read_counters.unassigned_fragmentlength ++;
3349 
3350 						if(global_context -> read_details_out_FP)
3351 							write_read_details_FP(global_context, thread_context, "Unassigned_FragmentLength", -1, NULL, bin1, bin2);
3352 						if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3353 						return;
3354 					}
3355 				} else {
3356 					if(global_context -> is_chimertc_disallowed) {
3357 						if(RG_ptr){
3358 							void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3359 							fc_read_counters * sumtab = tab4s[1];
3360 							sumtab -> unassigned_chimericreads++;
3361 						}else
3362 							thread_context->read_counters.unassigned_chimericreads ++;
3363 
3364 						if(global_context -> read_details_out_FP)
3365 							write_read_details_FP(global_context, thread_context, "Unassigned_Chimera", -1, NULL, bin1, bin2);
3366 						if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3367 						return;
3368 					}
3369 				}
3370 			}
3371 		}
3372 
3373 		// This filter has to be put here because the 0x400 FLAG is not about mapping but about sequencing.
3374 		// A unmapped read with 0x400 FLAG should be able to kill the mapped mate which may have no 0x400 FLAG.
3375 		if(global_context -> is_duplicate_ignored)
3376 		{
3377 			if(alignment_masks & SAM_FLAG_DUPLICATE)
3378 			{
3379 				if(RG_ptr){
3380 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3381 					fc_read_counters * sumtab = tab4s[1];
3382 					sumtab -> unassigned_duplicate++;
3383 				}else thread_context->read_counters.unassigned_duplicate ++;
3384 				if(global_context -> read_details_out_FP)
3385 					write_read_details_FP(global_context, thread_context, "Unassigned_Duplicate", -1, NULL, bin1, bin2);
3386 
3387 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3388 				return;
3389 			}
3390 
3391 		}
3392 
3393 		if(SAM_FLAG_UNMAPPED & alignment_masks) continue;
3394 
3395 		if( NH_value > 1 ) {
3396 			if(global_context -> is_multi_mapping_allowed == 0) {
3397 				// now it is a NH>1 read!
3398 				// not allow multimapping -> discard!
3399 				if(RG_ptr){
3400 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3401 					fc_read_counters * sumtab = tab4s[1];
3402 					sumtab -> unassigned_multimapping++;
3403 				}else thread_context->read_counters.unassigned_multimapping ++;
3404 
3405 				if(global_context -> read_details_out_FP)
3406 					write_read_details_FP(global_context, thread_context, "Unassigned_MultiMapping", -1, NULL, bin1, bin2);
3407 
3408 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3409 				return;
3410 			}
3411 		}
3412 
3413 		maximum_NH_value = max(maximum_NH_value, NH_value);
3414 
3415 		// if a pair of reads have one secondary, the entire fragment is seen as secondary.
3416 		if((alignment_masks & SAM_FLAG_SECONDARY_MAPPING) && (global_context -> is_primary_alignment_only)) {
3417 			if(RG_ptr){
3418 				void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3419 				fc_read_counters * sumtab = tab4s[1];
3420 				sumtab -> unassigned_secondary++;
3421 			}else thread_context->read_counters.unassigned_secondary ++;
3422 
3423 			if(global_context -> read_details_out_FP)
3424 				write_read_details_FP(global_context, thread_context, "Unassigned_Secondary", -1, NULL, bin1, bin2);
3425 			if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3426 			return;
3427 		}
3428 
3429 		int is_this_negative_strand = (alignment_masks & SAM_FLAG_REVERSE_STRAND_MATCHED)?1:0;
3430 		int is_fragment_negative_strand = is_this_negative_strand;
3431 
3432 		if(1 || global_context -> is_paired_end_mode_assign){ // On 20 JULY 2020: If strand-specific counting is on, isPairedEnd = TRUE, countReadPairs = FALSE and the BAM file contains mixed reads, then the single-end reads and the R1 reads in read-pairs will be directly compared with the strand of the gene, but the R2 reads in read-pairs will be compared with the opposite strand of the gene. A read-pair will be counted twice no matter if the strand-specific mode is on or off. If the argument to the strand-specific option is "1", then R1 must have the same strand of the gene and R2 must have the opposite strand of the gene to be counted.
3433 			int is_second_read_in_pair = alignment_masks & SAM_FLAG_SECOND_READ_IN_PAIR;
3434 			//is_fragment_negative_strand = is_second_read_in_pair?(!is_this_negative_strand):is_this_negative_strand;
3435 			if(is_second_read_in_pair)
3436 				is_fragment_negative_strand = global_context -> is_second_read_straight?is_this_negative_strand:(!is_this_negative_strand);
3437 			else
3438 				is_fragment_negative_strand = global_context -> is_first_read_reversed?(!is_this_negative_strand):is_this_negative_strand;
3439 		}
3440 
3441 		int nhits = 0;
3442 
3443 		int cigar_section_id;
3444 		srInt_64 * hits_indices = is_second_read?thread_context -> hits_indices2:thread_context -> hits_indices1;
3445 		unsigned int * hits_start_pos = is_second_read?thread_context -> hits_start_pos2:thread_context -> hits_start_pos1;
3446 		unsigned short * hits_length = is_second_read?thread_context -> hits_length2:thread_context -> hits_length1;
3447 		char ** hits_chro = is_second_read?thread_context -> hits_chro2:thread_context -> hits_chro1;
3448 
3449 		if(global_context->is_split_or_exonic_only == 1 && !is_junction_read) {
3450 			skipped_for_exonic ++;
3451 
3452 			if(skipped_for_exonic == 1 + global_context -> is_paired_end_mode_assign){
3453 				if(global_context -> read_details_out_FP)
3454 					write_read_details_FP(global_context, thread_context, (global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", -1, NULL, bin1, bin2);
3455 
3456 				if(RG_ptr){
3457 					void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3458 					fc_read_counters * sumtab = tab4s[1];
3459 					sumtab -> unassigned_junction_condition++;
3460 				}else thread_context->read_counters.unassigned_junction_condition ++;
3461 
3462 				if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3463 				return;
3464 			}
3465 		}
3466 
3467 
3468 		if(global_context->is_split_or_exonic_only == 2 && is_junction_read) {
3469 			if(global_context -> read_details_out_FP)
3470 				write_read_details_FP(global_context, thread_context,(global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", -1, NULL, bin1, bin2);
3471 			if(RG_ptr){
3472 				void ** tab4s = get_RG_tables(global_context, thread_context, RG_ptr);
3473 				fc_read_counters * sumtab = tab4s[1];
3474 				sumtab -> unassigned_junction_condition++;
3475 			}else thread_context->read_counters.unassigned_junction_condition ++;
3476 
3477 			if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
3478 			return;
3479 		}
3480 
3481 		if(1) {
3482 
3483 			if(0)SUBREADprintf("MAPPED R_%d to %s : CHR_POS=%u + %u, CHR_LEN=%u\n", is_second_read+1, global_context -> sambam_chro_table[me_refID]. chro_name, Starting_Chro_Points_1BASE[0], Section_Read_Lengths[0], global_context -> sambam_chro_table[me_refID] .chro_length );
3484 
3485 			if(global_context -> read_shift_size>0){
3486 				int shifting_applied_length = 0;
3487 				int shifting_i;
3488 
3489 				if((global_context -> read_shift_type == READ_SHIFT_UPSTREAM   && (!is_this_negative_strand))||
3490 					(global_context -> read_shift_type == READ_SHIFT_DOWNSTREAM &&   is_this_negative_strand ))
3491 					shifting_applied_length = -global_context -> read_shift_size;
3492 
3493 				if((global_context -> read_shift_type == READ_SHIFT_UPSTREAM   &&   is_this_negative_strand)||
3494 					(global_context -> read_shift_type == READ_SHIFT_DOWNSTREAM && (!is_this_negative_strand)))
3495 					shifting_applied_length = global_context -> read_shift_size;
3496 
3497 				if(global_context -> read_shift_type == READ_SHIFT_LEFT) shifting_applied_length = -global_context -> read_shift_size;
3498 				if(global_context -> read_shift_type == READ_SHIFT_RIGHT) shifting_applied_length = global_context -> read_shift_size;
3499 
3500 				if(shifting_applied_length < 0 && Starting_Chro_Points_1BASE[0] <=-shifting_applied_length) shifting_applied_length = - Starting_Chro_Points_1BASE[0]+1;
3501 				if(shifting_applied_length > 0 && Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + shifting_applied_length > global_context -> sambam_chro_table[me_refID].chro_length +1 )
3502 					shifting_applied_length =  global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3503 
3504 				for(shifting_i = 0; shifting_i < cigar_sections ; shifting_i++)
3505 						Starting_Chro_Points_1BASE[shifting_i] += shifting_applied_length;
3506 			}
3507 
3508 			if(global_context -> five_end_extension)
3509 			{
3510 				if(is_this_negative_strand){
3511 					int applied_ext = global_context -> five_end_extension;
3512 
3513 					if( Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + applied_ext > global_context -> sambam_chro_table[me_refID].chro_length +1  )
3514 						applied_ext =  global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3515 
3516 					Section_Read_Lengths [cigar_sections - 1] += applied_ext;
3517 				}else{
3518 					//SUBREADprintf("5-end extension: %d [%d]\n", Starting_Chro_Points_1BASE[0], Section_Lengths[0]);
3519 					if( read_pos > global_context -> five_end_extension)
3520 					{
3521 						Section_Read_Lengths [0] += global_context -> five_end_extension;
3522 						Starting_Chro_Points_1BASE [0] -= global_context -> five_end_extension;
3523 					}
3524 					else
3525 					{
3526 						Section_Read_Lengths [0] += read_pos-1;
3527 						Starting_Chro_Points_1BASE [0] -= read_pos-1;
3528 					}
3529 				}
3530 			}
3531 
3532 			if(global_context -> three_end_extension) {
3533 
3534 				if(is_this_negative_strand){
3535 					if( read_pos > global_context -> three_end_extension)
3536 					{
3537 						Section_Read_Lengths [0] += global_context -> three_end_extension;
3538 						Starting_Chro_Points_1BASE [0] -= global_context -> three_end_extension;
3539 					}
3540 					else
3541 					{
3542 						Section_Read_Lengths [0] += read_pos - 1;
3543 						Starting_Chro_Points_1BASE [0] -= read_pos - 1;
3544 					}
3545 				} else{
3546 					int applied_ext = global_context -> three_end_extension;
3547 					if( Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] + applied_ext > global_context -> sambam_chro_table[me_refID].chro_length +1 )
3548 						applied_ext = global_context -> sambam_chro_table[me_refID].chro_length +1 - (Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1]);
3549 					Section_Read_Lengths [cigar_sections - 1] += applied_ext;
3550 				}
3551 
3552 			}
3553 
3554 			if(global_context -> reduce_5_3_ends_to_one) {
3555 				if((REDUCE_TO_5_PRIME_END == global_context -> reduce_5_3_ends_to_one) + is_this_negative_strand == 1) // reduce to 5' end (small coordinate if positive strand / large coordinate if negative strand)
3556 				{
3557 					Section_Read_Lengths[0]=1;
3558 				}
3559 				else
3560 				{
3561 					Starting_Chro_Points_1BASE[0] = Starting_Chro_Points_1BASE[cigar_sections-1] + Section_Read_Lengths[cigar_sections-1] - 1;
3562 					Section_Read_Lengths[0]=1;
3563 				}
3564 				cigar_sections = 1;
3565 			}
3566 
3567 			for(cigar_section_id = 0; cigar_section_id<cigar_sections; cigar_section_id++)
3568 			{
3569 
3570 				if(!ChroNames[ cigar_section_id ]) continue; // NULL chro name for https://groups.google.com/forum/#!topic/subread/QDT6npjAZuE
3571 				srInt_64 section_begin_pos = Starting_Chro_Points_1BASE[cigar_section_id];
3572 				srInt_64 section_end_pos = Section_Read_Lengths[cigar_section_id] + section_begin_pos - 1;
3573 
3574 
3575 				int start_reverse_table_index = section_begin_pos / REVERSE_TABLE_BUCKET_LENGTH;
3576 				int end_reverse_table_index = (1+section_end_pos) / REVERSE_TABLE_BUCKET_LENGTH;
3577 
3578 				/*if(ChroNames[cigar_section_id] < (char *)NULL + 0xfffff){
3579 					unsigned char * tbbin = is_second_read?bin2:bin1;
3580 					int * refid = (int*)(tbbin);
3581 
3582 					SUBREADprintf("DANGEROUS! RNAME=%s, REC_LEN=%d,  CNAME=[%d]%p,  LEN_P=%d,  SECID=%d\n", read_name, refid[0], refid[1], ChroNames[cigar_section_id], Section_Read_Lengths[cigar_section_id], cigar_section_id);
3583 				}*/
3584 
3585 				fc_chromosome_index_info * this_chro_info = HashTableGet(global_context -> exontable_chro_table, ChroNames[cigar_section_id]);
3586 				if(this_chro_info == NULL)
3587 				{
3588 					if(global_context -> BAM_chros_to_anno_table)
3589 					{
3590 						char * anno_chro_name = HashTableGet( global_context -> BAM_chros_to_anno_table , ChroNames[cigar_section_id]);
3591 						if(anno_chro_name)
3592 							this_chro_info = HashTableGet(global_context -> exontable_chro_table, anno_chro_name);
3593 					}
3594 					if(this_chro_info == NULL && memcmp(ChroNames[cigar_section_id], "chr", 3)==0)
3595 					{
3596 						this_chro_info = HashTableGet(global_context -> exontable_chro_table, ChroNames[cigar_section_id]+3);
3597 					//	SUBREADprintf("INQ: %p : '%s'\n", this_chro_info , ChroNames[cigar_section_id]+3);
3598 					}
3599 					if(this_chro_info == NULL && strlen(ChroNames[cigar_section_id])<=2)
3600 					{
3601 						strcpy(thread_context -> chro_name_buff, "chr");
3602 						strcpy(thread_context -> chro_name_buff+3, ChroNames[cigar_section_id]);
3603 						this_chro_info = HashTableGet(global_context -> exontable_chro_table, thread_context -> chro_name_buff);
3604 					}
3605 				}
3606 
3607 				//SUBREADprintf("INF: %p : %s\n", this_chro_info , ChroNames[cigar_section_id]);
3608 
3609 				if(this_chro_info)
3610 				{
3611 					start_reverse_table_index = min(start_reverse_table_index, this_chro_info-> chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH);
3612 					end_reverse_table_index = min(end_reverse_table_index, this_chro_info-> chro_possible_length / REVERSE_TABLE_BUCKET_LENGTH+ 1);
3613 
3614 					while(start_reverse_table_index<=end_reverse_table_index)
3615 					{
3616 						search_start = this_chro_info -> reverse_table_start_index [start_reverse_table_index];
3617 						if(search_start<0xffffff00)break;
3618 						start_reverse_table_index++;
3619 					}
3620 					if(search_start>0xffffff00) continue;
3621 
3622 					//search_start = this_chro_info -> chro_block_table_start;
3623 
3624 					search_end = this_chro_info -> chro_block_table_end;//reverse_table_end_index [end_reverse_table_index];
3625 
3626 					for(search_block_id=search_start;search_block_id<search_end;search_block_id++){
3627 						if (global_context -> exontable_block_min_start[search_block_id] > section_end_pos) break;
3628 						if (global_context -> exontable_block_max_end[search_block_id] < section_begin_pos) continue;
3629 
3630 						int search_item_start = 0, search_item_end = global_context -> exontable_block_end_index[search_block_id];
3631 						if(search_block_id>0)search_item_start = global_context -> exontable_block_end_index[search_block_id-1];
3632 
3633 						// search_item_id is the inner number of the exons.
3634 						// the exontables in global_index has search_item_id as the index.
3635 
3636 						for(search_item_id = search_item_start ; search_item_id < search_item_end; search_item_id++)
3637 						{
3638 							if (global_context -> exontable_stop[search_item_id] >= section_begin_pos)
3639 							{
3640 								if (global_context -> exontable_start[search_item_id] > section_end_pos) break;
3641 								// there is an overlap >=1 between read and feature.
3642 								// the overlap length is min(end_r, end_F) - max(start_r, start_F) + 1
3643 
3644 								int is_strand_ok =1;
3645 
3646 								if(global_context->is_strand_checked){
3647 									if(global_context->is_strand_checked == 1)
3648 										is_strand_ok = (is_fragment_negative_strand == global_context -> exontable_strand[search_item_id]);
3649 									else// if(global_context->is_strand_checked == 2)
3650 										is_strand_ok = (is_fragment_negative_strand != global_context -> exontable_strand[search_item_id]);
3651 									//SUBREADprintf("%d = %d == %d\n", is_strand_ok, is_fragment_negative_strand, global_context -> exontable_strand[search_item_id]);
3652 								}
3653 
3654 								if(is_strand_ok){
3655 
3656 									if(nhits >= thread_context -> hits_number_capacity - 1){
3657 										//SUBREADprintf("RESIZE hits: %d\n", thread_context -> hits_number_capacity);
3658 										thread_context -> hits_number_capacity = thread_context -> hits_number_capacity/2 * 3;
3659 										thread_context -> hits_number_capacity = max(10, thread_context -> hits_number_capacity);
3660 										thread_context -> hits_start_pos1 = realloc(thread_context -> hits_start_pos1 , sizeof(int) * thread_context -> hits_number_capacity);
3661 										thread_context -> hits_start_pos2 = realloc(thread_context -> hits_start_pos2 , sizeof(int) * thread_context -> hits_number_capacity);
3662 
3663 										thread_context -> hits_length1 = realloc(thread_context -> hits_length1, sizeof(short) * thread_context -> hits_number_capacity);
3664 										thread_context -> hits_length2 = realloc(thread_context -> hits_length2, sizeof(short) * thread_context -> hits_number_capacity);
3665 
3666 										thread_context -> hits_chro1 = realloc(thread_context -> hits_chro1, sizeof(char *) * thread_context -> hits_number_capacity);
3667 										thread_context -> hits_chro2 = realloc(thread_context -> hits_chro2, sizeof(char *) * thread_context -> hits_number_capacity);
3668 
3669 										thread_context -> hits_indices1 = realloc(thread_context -> hits_indices1, sizeof(srInt_64) * thread_context -> hits_number_capacity);
3670 										thread_context -> hits_indices2 = realloc(thread_context -> hits_indices2, sizeof(srInt_64) * thread_context -> hits_number_capacity);
3671 
3672 										thread_context -> scoring_buff_numbers = realloc(thread_context -> scoring_buff_numbers, sizeof(int)*2*thread_context -> hits_number_capacity);
3673 										thread_context -> scoring_buff_flags = realloc(thread_context -> scoring_buff_flags, sizeof(int)*2*thread_context -> hits_number_capacity);
3674 										thread_context -> scoring_buff_overlappings = realloc(thread_context -> scoring_buff_overlappings, sizeof(int)*2*thread_context -> hits_number_capacity);
3675 										thread_context -> scoring_buff_exon_ids = realloc(thread_context -> scoring_buff_exon_ids, sizeof(srInt_64)*2*thread_context -> hits_number_capacity);
3676 
3677 										if(global_context -> need_calculate_overlap_len){
3678 											thread_context -> scoring_buff_gap_chros = realloc(thread_context -> scoring_buff_gap_chros, sizeof(char *) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3679 											thread_context -> scoring_buff_gap_starts = realloc(thread_context -> scoring_buff_gap_starts, sizeof(int) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3680 											thread_context -> scoring_buff_gap_lengths = realloc(thread_context -> scoring_buff_gap_lengths, sizeof(short) * 2 * global_context -> max_M *2 * thread_context -> hits_number_capacity);
3681 										}
3682 
3683 										hits_indices = is_second_read?thread_context -> hits_indices2:thread_context -> hits_indices1;
3684 										hits_start_pos = is_second_read?thread_context -> hits_start_pos2:thread_context -> hits_start_pos1;
3685 										hits_length = is_second_read?thread_context -> hits_length2:thread_context -> hits_length1;
3686 										hits_chro = is_second_read?thread_context -> hits_chro2:thread_context -> hits_chro1;
3687 										//SUBREADprintf("RESIZE hits2: %d\n", thread_context -> hits_number_capacity);
3688 									}
3689 
3690 									if(nhits <= MAX_HIT_NUMBER - 1) {
3691 										hits_indices[nhits] = search_item_id;
3692 
3693 										if(global_context -> need_calculate_overlap_len) {
3694 											hits_start_pos[nhits] = max(Starting_Chro_Points_1BASE[cigar_section_id], global_context -> exontable_start[search_item_id]);
3695 											hits_length[nhits] =  min(global_context -> exontable_stop[search_item_id] , section_end_pos)+1 - hits_start_pos[nhits] ;
3696 											hits_chro[nhits] = ChroNames[cigar_section_id];
3697 											if(0 && FIXLENstrcmp("V0112_0155:7:1101:10214:3701", read_name)==0)
3698 												SUBREADprintf("QNAME: [%d] %s %d ~ %d\n", nhits, hits_chro[nhits],  hits_start_pos[nhits],  hits_start_pos[nhits]+hits_length[nhits]);
3699 										}
3700 
3701 										nhits++;
3702 									} else {
3703 										SUBREADprintf("ERROR: the read overlapped with more than %d features.\n", nhits);
3704 										global_context -> is_input_bad_format = 1;
3705 										return ;
3706 									}
3707 								}
3708 							}
3709 						}
3710 					}
3711 				}
3712 			}
3713 		}
3714 
3715 
3716 		if(is_second_read) nhits2 = nhits;
3717 		else	nhits1 = nhits;
3718 	}	// loop for is_second_read
3719 
3720 
3721 	if(global_context -> do_junction_counting)// junction reads that passed the basic filters will be considered with the junction counting. Filters: Unmapped, Singleton, MAPQ, TemplateLength, Chimeric, Duplicate, Multimapping, Secondary alignment, Junction-containing status,
3722 	        process_line_junctions(global_context, thread_context, bin1, bin2);
3723 
3724 	if(global_context -> need_calculate_fragment_len )
3725 		total_frag_len = calc_total_frag_len( global_context, thread_context, CIGAR_intervals_R1, CIGAR_intervals_R1_sections, CIGAR_intervals_R2, CIGAR_intervals_R2_sections , read_name);
3726 
3727 	//SUBREADprintf("FRAGLEN: %s %d; CIGARS=%d,%d\n", read_name, total_frag_len, CIGAR_intervals_R1_sections,CIGAR_intervals_R2_sections);
3728 
3729 	int fixed_fractional_count = ( global_context -> use_fraction_multi_mapping && ! global_context -> is_primary_alignment_only )?calc_fixed_fraction(maximum_NH_value): NH_FRACTION_INT;
3730 
3731 	// we have hits_indices1 and hits_indices2 and nhits1 and nhits2 here
3732 	// we also have fixed_fractional_count which is the value to add
3733 
3734 	vote_and_add_count(global_context, thread_context,
3735 			    thread_context -> hits_indices1,  nhits1, thread_context -> hits_indices2,  nhits2, total_frag_len,
3736 			    thread_context -> hits_chro1, thread_context -> hits_chro2,
3737 				thread_context -> hits_start_pos1, thread_context -> hits_start_pos2,
3738 				thread_context -> hits_length1, thread_context ->hits_length2,
3739 			    fixed_fractional_count, read_name, RG_ptr, bin1, bin2);
3740 	return;
3741 }
3742 
/*
 * Mark the positions [start_base, start_base+len) as covered in a bitmap.
 *
 * Position p is stored in byte p/8, bit p%8 (least-significant bit first).
 *
 * x1_bitmap  : bitmap to update in place; the caller must provide enough bytes
 *              for position start_base+len-1.
 * start_base : first position to mark.
 * len        : number of consecutive positions to mark.
 */
void add_bitmap_overlapping(char * x1_bitmap, short start_base, short len){
	const int range_end = start_base + len;
	// A byte-aligned position below this limit has more than 16 positions left
	// in the range, so two whole bytes can be filled at once.
	const int two_byte_limit = range_end - 16;
	int pos = start_base;

	while(pos < range_end){
		const int byte_no = pos / 8;
		const int bit_no = pos % 8;

		if(bit_no != 0 || pos >= two_byte_limit){
			// Unaligned position, or too close to the end: set a single bit.
			x1_bitmap[byte_no] |= (1 << bit_no);
			pos ++;
			continue;
		}

		// Aligned fast path: 16 positions are covered by two all-ones bytes.
		x1_bitmap[byte_no] = -1;
		x1_bitmap[byte_no + 1] = -1;
		pos += 16;
	}
}
3758 
/*
 * Count how many of the first rl positions are marked in a bitmap.
 *
 * Position p is read from byte p/8, bit p%8 (least-significant bit first).
 *
 * x1_bitmap : bitmap to inspect; must hold at least (rl+7)/8 bytes.
 * rl        : number of positions (starting from 0) to examine.
 *
 * Returns the number of set bits among positions 0 .. rl-1; the result never
 * exceeds rl.
 */
int count_bitmap_overlapping(char * x1_bitmap, unsigned short rl){
	int x1 = 0;
	int ret = 0;

	while(x1 < rl){
		// Read the byte as unsigned so that the all-ones test does not depend on
		// whether plain char is signed on this platform.
		unsigned char this_byte = (unsigned char) x1_bitmap[x1 / 8];
		int bit = x1 % 8;

		// A fully-set byte can be counted at once, but only when all of its eight
		// positions lie inside [0, rl); otherwise the trailing bits beyond rl
		// would be counted too.
		if(bit == 0 && this_byte == 0xFF && x1 + 8 <= rl){
			ret += 8;
			x1 += 8;
		}else{
			if(this_byte & (1 << bit)) ret ++;
			x1 ++;
		}
	}
	return ret;
}
3774 
/*
 * Record the junctions supported by one fragment.
 *
 * The junctions of both reads (njunc1 entries in supported_junctions1 followed
 * by njunc2 entries in supported_junctions2) are treated as one list. Every
 * distinct junction in that list adds one to:
 *   - the junction counter stored under "chrL\tposL\tchrR\tposR", and
 *   - the splicing-point counters stored under "chrL\tposL" and "chrR\tposR".
 *
 * When RG_name is given, the tables come from slots 2 and 3 of the array
 * returned by get_RG_tables(); otherwise the tables of thread_context are used.
 *
 * Side effects: a junction that repeats an earlier one is disabled by zeroing
 * the first character of its chromosome_name_left, so the input arrays are
 * modified. Counters are kept in the tables as pointer-sized integers
 * (NULL + count). The key strings are malloc()ed here and handed to
 * HashTablePut(); NOTE(review): presumably the tables own and free the keys --
 * confirm against the table set-up code.
 */
void add_fragment_supported_junction(	fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context, fc_junction_info_t * supported_junctions1, int njunc1, fc_junction_info_t * supported_junctions2, int njunc2, char * RG_name){
	assert(njunc1 >= 0 && njunc1 <= global_context -> max_M -1 );
	assert(njunc2 >= 0 && njunc2 <= global_context -> max_M -1 );

	HashTable * junction_counting_table, *splicing_point_table;

	if(!RG_name){
		junction_counting_table = thread_context -> junction_counting_table;
		splicing_point_table = thread_context -> splicing_point_table;
	}else{
		void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
		junction_counting_table = tab4s[2];
		splicing_point_table = tab4s[3];
	}

	int all_junctions = njunc1 + njunc2;
	int this_i, later_i, side;

	for(this_i = 0; this_i < all_junctions; this_i ++){
		fc_junction_info_t * current = (this_i < njunc1)?(supported_junctions1 + this_i):(supported_junctions2 + (this_i - njunc1));

		// An empty left chromosome name marks a junction already counted as a duplicate.
		if(current -> chromosome_name_left[0] == 0) continue;

		// Disable every later junction that is identical to the current one.
		for(later_i = this_i + 1; later_i < all_junctions; later_i ++){
			fc_junction_info_t * other = (later_i < njunc1)?(supported_junctions1 + later_i):(supported_junctions2 + (later_i - njunc1));
			if(other -> chromosome_name_left[0] == 0) continue;

			int is_same_junction =
				current -> last_exon_base_left == other -> last_exon_base_left &&
				current -> first_exon_base_right == other -> first_exon_base_right &&
				strcmp(current -> chromosome_name_left, other -> chromosome_name_left) == 0 &&
				strcmp(current -> chromosome_name_right, other -> chromosome_name_right) == 0;

			if(is_same_junction) other -> chromosome_name_left[0] = 0;
		}

		// Count the junction itself.
		char * junction_key = malloc(strlen(current -> chromosome_name_left) + strlen(current -> chromosome_name_right) + 36);
		sprintf(junction_key, "%s\t%u\t%s\t%u", current -> chromosome_name_left, current -> last_exon_base_left, current -> chromosome_name_right, current -> first_exon_base_right);
		srInt_64 count_junc = HashTableGet(junction_counting_table, junction_key) - NULL;
		HashTablePut(junction_counting_table, junction_key, NULL+count_junc + 1);

		// Count the two splicing points (left side first, then right side).
		char * left_key = malloc(strlen(current -> chromosome_name_left) + 16);
		char * right_key = malloc(strlen(current -> chromosome_name_right) + 16);
		sprintf(left_key, "%s\t%u", current -> chromosome_name_left, current -> last_exon_base_left);
		sprintf(right_key, "%s\t%u", current -> chromosome_name_right, current -> first_exon_base_right);

		char * side_keys[2] = { left_key, right_key };
		for(side = 0; side < 2; side ++){
			count_junc = HashTableGet(splicing_point_table, side_keys[side]) - NULL;
			HashTablePut(splicing_point_table, side_keys[side], NULL + count_junc + 1);
		}
	}
}
3828 
/*
 * Comparison callback for sorting an array of (start, value) pairs of
 * unsigned ints by their start.
 *
 * arr  : array of unsigned int laid out as pairs; pair i occupies
 *        elements 2*i and 2*i+1.
 * L, R : indices of the two pairs to compare.
 *
 * Returns a negative value, zero or a positive value when the first element
 * of pair L is smaller than, equal to or larger than that of pair R.
 *
 * The result is computed by comparison rather than by subtraction: the
 * difference of two unsigned ints does not fit in an int when the values are
 * more than INT_MAX apart, which would flip the sign of the result.
 */
int overlap_compare(void * arr, int L, int R){
	unsigned int * pos = (unsigned int *)arr;
	unsigned int left_start = pos[L*2];
	unsigned int right_start = pos[R*2];

	return (left_start > right_start) - (left_start < right_start);
}
3833 
/*
 * Swap callback for sorting an array of unsigned int pairs.
 *
 * arr  : array of unsigned int laid out as pairs; pair i occupies
 *        elements 2*i and 2*i+1.
 * L, R : indices of the two pairs whose contents are exchanged.
 */
void overlap_exchange(void * arr, int L, int R){
	unsigned int * left_pair = (unsigned int *)arr + L*2;
	unsigned int * right_pair = (unsigned int *)arr + R*2;
	int member;

	// Both members of the pair move together.
	for(member = 0; member < 2; member++){
		unsigned int saved = left_pair[member];
		left_pair[member] = right_pair[member];
		right_pair[member] = saved;
	}
}
3844 
/*
 * Find the sample that a sample barcode belongs to.
 *
 * Each entry of global_context -> scRNA_sample_barcode_list is an array of
 * three pointers: [0] the lane number and [1] the sample number, both stored
 * as pointer-sized integers, and [2] the known barcode string.
 *
 * sbc         : sample barcode observed in the read.
 * read_laneno : lane number of the read; only entries of this lane are tested.
 *
 * Returns the sample number of the first entry of that lane for which
 * hamming_dist_ATGC_max2() reports a distance of at most 2, or -1 when no
 * entry matches.
 */
int scRNA_get_sample_id(fc_thread_global_context_t *global_context, char * sbc, int read_laneno){
	int entry_i = 0;

	while(entry_i < global_context -> scRNA_sample_barcode_list -> numOfElements){
		char ** lane_and_barcode = ArrayListGet(global_context -> scRNA_sample_barcode_list, entry_i);
		entry_i ++;

		int known_lane = lane_and_barcode[0]-(char*)NULL;
		if(known_lane != read_laneno) continue;

		int sample_no = lane_and_barcode[1]-(char*)NULL;
		int mismatches = hamming_dist_ATGC_max2( sbc, lane_and_barcode[2] );
		if(mismatches <= 2) return sample_no;
	}

	// No known barcode on this lane is close enough.
	return -1;
}
3863 
/*
 * Return the numeric id of a UMI, registering the UMI on first sight.
 *
 * The UMI is the leading run of alphabetic characters in ubc. The string is
 * temporarily cut after that run for the table lookup and is restored before
 * returning, so ubc is unchanged on return but must be writable.
 *
 * thread_context -> scRNA_registered_UMI_table maps UMI strings to (id + 1)
 * stored as a pointer-sized integer; a new UMI receives the current number of
 * elements in the table as its id and is stored under a strdup()ed key.
 *
 * Returns the id of the UMI.
 */
int scRNA_register_umi_id(fc_thread_global_context_t * global_context, fc_thread_thread_context_t *  thread_context, char * ubc){

	int xk1=0,nch;
	for(xk1=0; 0!=(nch = ubc [xk1]); xk1++){
		// <ctype.h> functions require a value representable as unsigned char (or
		// EOF); a negative plain char passed directly is undefined behaviour.
		if(!isalpha((unsigned char)nch))break;
	}

	// Cut the string after the alphabetic run; nch keeps the overwritten character.
	ubc[xk1]=0;
	int uno = HashTableGet(thread_context -> scRNA_registered_UMI_table, ubc ) -NULL -1;
	if(uno<0) {
		// Not registered yet: the next free id is the current table size.
		uno =  thread_context -> scRNA_registered_UMI_table -> numOfElements;
		assert(strlen(ubc) <=MAX_UMI_LEN);
		HashTablePut( thread_context -> scRNA_registered_UMI_table, strdup(ubc) , NULL+ uno +1);
	}

	// Restore the caller's string.
	ubc[xk1]=nch;
	return uno;
}
3881 
3882 #define IMPOSSIBLE_MEMORY_SPACE 0x5CAFEBABE0000000llu
/*
 * Find the id of the known cell barcode that matches the barcode of a read.
 *
 * Lookups go through global_context -> scRNA_cell_barcode_head_tail_table:
 *   1. The first known_cell_barcode_length characters of cbc are looked up as
 *      they are. If the stored value carries the IMPOSSIBLE_MEMORY_SPACE tag in
 *      its upper bits, the lower 28 bits are the cell id and are returned
 *      immediately.
 *   2. Otherwise two half-barcode keys are looked up: 'F' followed by the
 *      characters at even positions of cbc, and 'S' followed by those at odd
 *      positions. Each hit is a list of candidate barcode numbers (stored as
 *      pointer-sized integers); the candidates of both lists are merged without
 *      duplicates.
 *   3. The first candidate whose known barcode (taken from
 *      scRNA_cell_barcodes_array) has hamming_dist_ATGC_max2() == 1 against cbc
 *      is returned.
 *
 * Returns the cell barcode number, or -1 when no candidate qualifies.
 */
int scRNA_get_cell_id(fc_thread_global_context_t * global_context, fc_thread_thread_context_t *  thread_context, char * cbc){
	char lookup_key [MAX_READ_NAME_LEN];

	// Step 1: lookup with the whole barcode.
	memcpy(lookup_key, cbc, global_context -> known_cell_barcode_length);
	lookup_key[global_context -> known_cell_barcode_length]=0;

	void * exact_value = HashTableGet(global_context -> scRNA_cell_barcode_head_tail_table, lookup_key);
	srInt_64 exact_int = exact_value - NULL;
	if(( exact_int & 0xFFFFFFFFF0000000llu)== IMPOSSIBLE_MEMORY_SPACE){
		// The value is a tagged cell id, not a pointer to a list.
		int only_cell_id = exact_int - IMPOSSIBLE_MEMORY_SPACE;
		return only_cell_id;
	}

	// Step 2: gather candidates through the two half-barcode keys.
	ArrayList * candidates = ArrayListCreate(100);
	int half_no;
	for(half_no = 0; half_no < 2; half_no++){
		int ch_i;

		lookup_key[0] = half_no?'S':'F';
		for(ch_i=0; ch_i<global_context -> known_cell_barcode_length/2 ; ch_i++)
			lookup_key[1+ch_i] = cbc[2*ch_i+half_no];
		lookup_key[1+global_context -> known_cell_barcode_length/2]=0;

		ArrayList * bucket = HashTableGet(global_context -> scRNA_cell_barcode_head_tail_table, lookup_key);
		if(!bucket) continue;

		int bucket_i;
		for(bucket_i=0; bucket_i<bucket->numOfElements; bucket_i++){
			int bcno = ArrayListGet(bucket, bucket_i)-NULL;
			int seen_i, already_listed = 0;

			for(seen_i=0; seen_i<candidates -> numOfElements; seen_i++){
				if(ArrayListGet(candidates, seen_i)==NULL+bcno){
					already_listed = 1;
					break;
				}
			}

			if(!already_listed)ArrayListPush(candidates, NULL+bcno);
		}
	}

	// Step 3: accept the first candidate that differs from cbc by a distance of one.
	int matched_cell = -1;
	int cand_i;
	for(cand_i=0; cand_i<candidates -> numOfElements; cand_i++){
		int candidate_no = ArrayListGet(candidates,cand_i)-NULL;
		char * known_cbc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, candidate_no);

		if(hamming_dist_ATGC_max2( known_cbc, cbc ) == 1){
			matched_cell = candidate_no;
			break;
		}
	}
	ArrayListDestroy(candidates);

	return matched_cell;
}
3951 
3952 #define SCRNA_READ_NAME_SPLIT_CHAR '|'
3953 
/*
 * Build a copy of a binary alignment record in which the barcode fields that
 * were appended to the read name are moved into optional tags.
 *
 * inbin starts with a 4-byte length (record size excluding these 4 bytes); the
 * read-name length byte is read at offset 12 and the read name itself at
 * offset 36. NOTE(review): these offsets agree with the BAM alignment record
 * layout when the block_size prefix is included -- confirm against the caller.
 *
 * The read name is split at SCRNA_READ_NAME_SPLIT_CHAR (and at ':' as well when
 * scRNA_input_mode is GENE_INPUT_BCL):
 *   after the 1st splitter: cell-barcode bases followed by UMI bases,
 *   after the 2nd splitter: cell-barcode qualities followed by UMI qualities,
 *   after the 5th splitter: read-group text running to the end of the name.
 * The read name is cut at the first splitter; the rest of the record is copied
 * unchanged and seven 'Z' tags are appended in this order:
 *   CR, CY, CB (fixed_cell_barcode), UR, UY, UB (fixed_UMI), RG.
 *
 * outbin_pr receives a malloc()ed buffer of inbin_len + 400 bytes holding the
 * new record, whose leading 4-byte length is updated. NOTE(review): presumably
 * the caller frees it -- nothing in this function does.
 */
void scRNA_move_barcodes_to_tags(fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, char * inbin, char ** outbin_pr, char * fixed_cell_barcode, char * fixed_UMI){
	int inbin_len=0;
	memcpy(&inbin_len, inbin, 4);

	char* outbin = malloc(inbin_len + 400);
	*outbin_pr = outbin;

	int l_read_name=0, new_l_read_name=0;
	memcpy(&l_read_name, inbin+12, 1);

	char * name_base = inbin + 36;
	char * BC_seq=NULL, * BC_qual=NULL, *UMI_seq=NULL, *UMI_qual=NULL, *RG=NULL;
	int BC_len=global_context -> known_cell_barcode_length, UMI_len=0, RG_len=0;
	int field_i = 0, name_i;

	// Locate the barcode, quality and read-group fields inside the read name.
	for(name_i=1; name_i<l_read_name-1; name_i++){
		char rnchar = name_base[name_i];
		int is_splitter = rnchar == SCRNA_READ_NAME_SPLIT_CHAR || (rnchar== ':' && global_context -> scRNA_input_mode == GENE_INPUT_BCL);
		if(!is_splitter) continue;

		char * field_start = name_base + name_i + 1;
		field_i ++;
		switch(field_i){
			case 1:
				// The trimmed name keeps name_i characters plus its terminator.
				new_l_read_name = name_i+1;
				BC_seq = field_start;
				UMI_seq = BC_seq + BC_len;
				break;
			case 2:
				BC_qual = field_start;
				// Whatever follows the cell barcode in the first field is the UMI.
				UMI_len = BC_qual - BC_seq - 1 - global_context -> known_cell_barcode_length;
				UMI_qual = BC_qual + BC_len;
				break;
			case 5:
				RG = field_start;
				RG_len = l_read_name - name_i - 2;
				break;
			default:
				break;
		}
	}

	// Fixed part and trimmed read name, then everything that followed the old name.
	memcpy(outbin, inbin, 36+new_l_read_name);
	outbin[36+new_l_read_name-1]=0;
	memcpy(outbin + 12,&new_l_read_name, 1);
	memcpy(outbin + 36 + new_l_read_name, inbin + 36 + l_read_name, inbin_len +4 - 36 - l_read_name);

	// Append the tags right after the copied record.
	int ext_ptr = inbin_len +4 - (l_read_name - new_l_read_name);

	char * tag_names[7] = { "CR", "CY", "CB", "UR", "UY", "UB", "RG" };
	char * tag_values[7] = { BC_seq, BC_qual, fixed_cell_barcode, UMI_seq, UMI_qual, fixed_UMI, RG };
	int tag_lengths[7] = { BC_len, BC_len, BC_len, UMI_len, UMI_len, UMI_len, RG_len };
	int tag_i;

	for(tag_i = 0; tag_i < 7; tag_i++){
		char * out_tag = outbin + ext_ptr;
		int this_len = tag_lengths[tag_i];

		out_tag[0] = tag_names[tag_i][0];
		out_tag[1] = tag_names[tag_i][1];
		out_tag[2] = 'Z';
		memcpy(out_tag + 3, tag_values[tag_i], this_len);
		out_tag[3 + this_len] = 0;
		ext_ptr += 3+1+this_len;
	}

	ext_ptr -=4;	// block_size excl itself
	memcpy(outbin, &ext_ptr, 4);
}
4016 
/*
 * Locate the barcode / UMI / sample / lane fields that are packed into a read name.
 *
 * The name is scanned from its second character for SCRNA_READ_NAME_SPLIT_CHAR
 * (and also ':' when scRNA_input_mode is GENE_INPUT_BCL).  The text after the
 *   1st delimiter is reported as the cell barcode, with the UMI starting
 *       known_cell_barcode_length bytes later; the offset of this delimiter is
 *       reported through rname_trimmed_len;
 *   2nd delimiter is reported as the barcode quality (UMI quality follows it);
 *   3rd delimiter is reported as the sample sequence (and as RG);
 *   4th delimiter is reported as the sample quality;
 *   5th delimiter is reported as the lane string (a leading "@RgLater@" is skipped),
 *       and scanning stops.
 * The returned pointers point into the read name itself; nothing is copied.
 *
 * When fewer than three fields were found and read_bin is given, the CR/UR/CY/UY/RG
 * tags of the BAM record are looked up through SAM_pairer_iterate_tags instead.
 *
 * read_name may be NULL when read_bin is given (the name is then taken at read_bin+36).
 * Every output pointer may be NULL, in which case that field is simply not reported.
 *
 * Returns the number of delimiters consumed (0 to 5).
 */
int scRNA_scan_read_name_str(fc_thread_global_context_t * global_context, char * read_name, char * read_bin, char ** sample_seq, char ** sample_qual, char ** BC_seq, char ** BC_qual, char ** UMI_seq, char ** UMI_qual, char ** lane_str, char ** RG, int * rname_trimmed_len){
	char * testi;
	int field_i=0;
	if(NULL == read_name && read_bin) read_name = read_bin + 36;
	if(NULL == read_name) return 0; // nothing to scan at all

	// The first character is never treated as a delimiter; an empty name must not be stepped over.
	for(testi = read_name + ((*read_name)?1:0); * testi; testi ++){
		if((*testi)== SCRNA_READ_NAME_SPLIT_CHAR || ((*testi)== ':' && global_context -> scRNA_input_mode == GENE_INPUT_BCL )){
			field_i++;
			if(field_i == 1) {
				if(rname_trimmed_len) (*rname_trimmed_len)=testi-read_name;
				if(BC_seq)(*BC_seq) = testi+1;
				if(UMI_seq)(*UMI_seq) = testi+1+global_context -> known_cell_barcode_length;
			}else if(field_i == 2){
				if(BC_qual)(*BC_qual) = testi+1;
				if(UMI_qual)(*UMI_qual) = testi+1+global_context -> known_cell_barcode_length;
			}else if(field_i == 3){
				if(sample_seq)(*sample_seq) = testi + 1;
				if(RG)(*RG) = testi + 1;
			}else if(field_i == 4){
				if(sample_qual)(*sample_qual) = testi + 1;
			}else if(field_i == 5){
				if(lane_str){
					(*lane_str) = testi + 1;
					// strncmp stops at the terminator, so a lane string shorter than 9 bytes is not over-read.
					if(strncmp(*lane_str, "@RgLater@", 9)==0) (*lane_str) += 9;
				}
				break;
			}
		}
	}

	if(field_i < 3 && read_bin){
		// The name did not carry the barcodes: fall back to the optional tags of the BAM record.
		int bin_len = 0;
		char tag_type = 0;
		int bintag_start = SAM_pairer_get_tag_bin_start(read_bin);
		memcpy(&bin_len, read_bin, 4);
		bin_len = bin_len +4 -bintag_start;

		if(BC_seq) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "CR", &tag_type, BC_seq);
		if(UMI_seq) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "UR", &tag_type, UMI_seq);

		if(BC_qual) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "CY", &tag_type, BC_qual);
		if(UMI_qual) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "UY", &tag_type, UMI_qual);

		if(RG) SAM_pairer_iterate_tags((unsigned char*)read_bin+bintag_start, bin_len , "RG", &tag_type, RG);
	}

	return field_i;
}
4062 
/*
 * Extract the cell barcode, the UMI and (optionally) the sample id of one read.
 *
 * The barcode/UMI/RG pointers are obtained through scRNA_scan_read_name_str and are
 * written to *BC_seq, *UMI_seq and *RG.
 *
 * sample_id may be NULL when the caller already knows the sample; in that case only the
 * barcode/UMI lookup is done and an error is printed if either could not be found.
 * Otherwise *sample_id is set to -1 first and then resolved according to the input mode:
 *   GENE_INPUT_SCRNA_BAM   : always 1;
 *   GENE_INPUT_SCRNA_FASTQ : the sample field must look like "input#<digits>"; the number
 *                            (+1) is looked up in scRNA_lineno1B_to_sampleno1B_tab;
 *   otherwise              : the lane number is parsed from the lane field ("L<digits>")
 *                            and passed with the sample barcode to scRNA_get_sample_id.
 * *sample_id stays -1 when the sample/lane field is missing or malformed.
 *
 * thread_context is not used by this function.
 */
void scRNA_find_sample_cell_umi_from_readname(fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, char * read_name, char * read_bin,
  int * sample_id, char ** BC_seq, char ** UMI_seq, char ** RG){
	int field_i = 0, laneno = 0;
	char * testi, * lane_str = NULL, *sample_barcode = NULL;

	if(sample_id)*sample_id = -1;

	field_i = scRNA_scan_read_name_str(global_context, read_name, read_bin, &sample_barcode, NULL, BC_seq, NULL, UMI_seq, NULL, &lane_str, RG, NULL);

	if(!sample_id){
		// Test the values that were found, not the addresses of the output slots,
		// and never hand a NULL pointer to "%s".
		char * found_umi = UMI_seq ? *UMI_seq : NULL;
		char * found_bc = BC_seq ? *BC_seq : NULL;
		if(!(found_umi && found_bc))SUBREADprintf("ERROR: Cannot get UMI or BC: %s, %s\n", found_umi?found_umi:"(null)", found_bc?found_bc:"(null)");
		return;
	}

	if(global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_BAM){
		*sample_id = 1; // on the BAM mode, every featureCounts run only has one sample
	}else if(global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ){
		// strncmp (unlike memcmp) does not read beyond the end of a short sample field.
		if(sample_barcode == NULL || strncmp(sample_barcode, "input#", 6) || !isdigit((unsigned char)sample_barcode[6])){
			int tail_offset = 13 +global_context -> known_cell_barcode_length;
			char * name_tail = "";
			if(read_name && tail_offset >= 0 && strlen(read_name) > (size_t)tail_offset) name_tail = read_name + tail_offset;
			SUBREADprintf("SPBCFMT_ERR %d // %s in %s // %s\n", field_i, sample_barcode?sample_barcode:"(null)", read_name?read_name:"(null)", name_tail);
		}else{

			int lineno = atoi(sample_barcode +6) +1;
			*sample_id = (HashTableGet(global_context -> scRNA_lineno1B_to_sampleno1B_tab, NULL+lineno)-NULL);
		}
	}else{
		if(field_i !=5 || lane_str == NULL || (*lane_str)!='L')
			SUBREADprintf("LANESTR_ERR %d , %s\n", field_i, lane_str?lane_str:"(null)");

		// Without a lane field there is nothing to parse; the sample stays unknown (-1).
		if(lane_str == NULL) return;

		// The first character is the lane prefix; skip it only if it exists.
		if(*lane_str) for(testi = lane_str+1; *testi; testi++){
			if(!isdigit((unsigned char)*testi))break;
			laneno = laneno*10 + (*testi)-'0';
		}

		*sample_id = scRNA_get_sample_id(global_context, sample_barcode, laneno);
		//Rprintf("LOOKUP SAMPLE %d by %s-%s-%d\n", *sample_id, sample_barcode, lane_str, laneno);
 	}
}
4099 
/*
 * Per-read bookkeeping for the scRNA counters.
 *
 * The sample of the read is either known in advance (scRNA_rerun_on_persample_BAM: the
 * sample is this_input_number+1) or resolved from the read name / BAM record.
 *
 * If scRNA_UMI_length has not been determined yet (<1) it is set here to the length of
 * the leading run of alphabetic characters of the UMI of this read.
 *
 * For a read with a valid sample:
 *   step == 0 : scRNA_reads_per_sample is incremented and, when per-sample FASTQ output is
 *               generated and the input is not FASTQ, the read is appended to the three
 *               gzip writers of the sample; the writers are compressed and flushed once
 *               any of their per-thread buffers is nearly full.
 *   step == 1 : scRNA_mapped_reads_per_sample is incremented.
 */
void add_scRNA_read_tota1_no( fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, char * read_name, char * bambin, int step){
	int sample_id= -1;
	char * cell_bc = NULL, * umi = NULL;
	int known_sample_id = 0;
	if(global_context -> scRNA_rerun_on_persample_BAM) known_sample_id = global_context -> this_input_number+1;

	scRNA_find_sample_cell_umi_from_readname(global_context, thread_context, read_name, bambin, (known_sample_id>0)?NULL:&sample_id, &cell_bc, &umi, NULL);
	if(known_sample_id>0) sample_id = known_sample_id;

	// The UMI may be missing (NULL); in that case the length is left undetermined.
	if(global_context -> scRNA_UMI_length <1 && umi){
		int umi_end_pos=0;
		// <ctype.h> functions require a value representable as unsigned char.
		while(umi[umi_end_pos] && isalpha((unsigned char)umi[umi_end_pos])) umi_end_pos++;
		global_context -> scRNA_UMI_length = umi_end_pos;
	}

	if(sample_id>0){
		if(step==0){
			thread_context -> scRNA_reads_per_sample[sample_id-1] ++;
			if(global_context -> is_scRNA_BAM_FQ_out_generated){
				void ** sample_bam_2fps = HashTableGet(global_context -> scRNA_sample_BAM_writers, NULL+(sample_id-1) + 1); // sample_id-1: 0,1,2,...
				if(sample_bam_2fps==NULL){
					SUBREADprintf("Error: unknown sample id = %d\n", sample_id);
					return; // there are no writers to use for this sample
				}

				if(GENE_INPUT_SCRNA_FASTQ != global_context -> scRNA_input_mode){
					// Slots 1..3 of the per-sample array are the gzip writers; slot 4 is the lock used for flushing.
					parallel_gzip_writer_t **gz3fps = (parallel_gzip_writer_t **)sample_bam_2fps+1;
					parallel_gzip_writer_add_read_fqs_scRNA(gz3fps, bambin, thread_context -> thread_id);
					if( gz3fps[0]-> thread_objs[thread_context -> thread_id].in_buffer_used >= PARALLEL_GZIP_TXT_BUFFER_SIZE - PARALLEL_GZIP_TXT_BUFFER_MARGIN ||
					    gz3fps[1]-> thread_objs[thread_context -> thread_id].in_buffer_used >= PARALLEL_GZIP_TXT_BUFFER_SIZE - PARALLEL_GZIP_TXT_BUFFER_MARGIN ||
					    gz3fps[2]-> thread_objs[thread_context -> thread_id].in_buffer_used >= PARALLEL_GZIP_TXT_BUFFER_SIZE - PARALLEL_GZIP_TXT_BUFFER_MARGIN ){
						// Compression is done outside the lock; only the flush is serialised.
						parallel_gzip_zip_texts(gz3fps[0], thread_context -> thread_id, 0);
						parallel_gzip_zip_texts(gz3fps[1], thread_context -> thread_id, 0);
						parallel_gzip_zip_texts(gz3fps[2], thread_context -> thread_id, 0);
						pthread_spin_lock(sample_bam_2fps[4]);
						parallel_gzip_writer_flush(gz3fps[0], thread_context -> thread_id);
						parallel_gzip_writer_flush(gz3fps[1], thread_context -> thread_id);
						parallel_gzip_writer_flush(gz3fps[2], thread_context -> thread_id);
						pthread_spin_unlock(sample_bam_2fps[4]);
					}
				}
			}
		}else if(step==1) thread_context -> scRNA_mapped_reads_per_sample[sample_id-1] ++;
	}
}
4142 
/*
 * Append one string ('Z') optional field to a BAM record under construction.
 * Writes the two tag letters, the type letter, val_len bytes of val and a terminating NUL
 * at buf+buf_used, and returns the new number of used bytes.
 */
static int scRNA_rbin_append_Z_tag(char * buf, int buf_used, char tag1, char tag2, const char * val, int val_len){
	buf[buf_used++] = tag1;
	buf[buf_used++] = tag2;
	buf[buf_used++] = 'Z';
	memcpy(buf + buf_used, val, val_len);
	buf[buf_used + val_len] = 0;
	return buf_used + val_len + 1;
}

/*
 * Rewrite one BAM record so that the barcodes live in optional fields instead of the read
 * name, and write the result to fp.
 *
 *   rbin          the input record, starting with its 4-byte block_size
 *   binlen        block_size of the input record (the record occupies binlen+4 bytes)
 *   fixedbc_seq   corrected cell barcode for the CB tag, or NULL to not add/replace CB
 *   fixedumi_seq  corrected UMI for the UB tag, or NULL to not add/replace UB
 *   gene_no/genes unused here (gene names are not written by this function)
 *
 * The read name is cut at the first delimiter found by scRNA_scan_read_name_str; a name
 * without any delimiter is kept unchanged.  The CIGAR, sequence and quality bytes are
 * copied verbatim.  Existing CR/CB/CY/UR/UB/UY string tags are rewritten with the values
 * found for this read; the ones that are absent are appended.  A tag whose value is not
 * available (NULL) is neither rewritten nor appended.
 */
void scRNA_do_one_batch_write_extend_rbin(fc_thread_global_context_t * global_context, char * rbin, int binlen, FILE * fp, char * fixedbc_seq, char * fixedumi_seq, srInt_64 gene_no, srInt_64 * genes){
	char * cellbc_seq=NULL,*umi_seq=NULL, * cellbc_qual=NULL,*umi_qual=NULL, *sample_seq=NULL, *sample_qual=NULL, *lane_str=NULL;
	int rname_trimmed_len=0;
	int bc_len = global_context -> known_cell_barcode_length;
	int umi_len = global_context -> scRNA_UMI_length;
	scRNA_scan_read_name_str(global_context, NULL, rbin, & sample_seq, & sample_qual, & cellbc_seq, & cellbc_qual, & umi_seq, & umi_qual, &lane_str, NULL, &rname_trimmed_len);

	// Worst case the record grows by six tags (4 bytes of framing each plus three barcode-sized
	// and three UMI-sized values), so the slack has to scale with the barcode and UMI lengths.
	int growth_slack = 150 + 3*(bc_len>0?bc_len:0) + 3*(umi_len>0?umi_len:0);
	char new_rbin[ binlen + growth_slack ]; // removed barcodes/qual from read names, add them to extra fields if they weren't there. Gene names are not put here.
	int new_rbin_len = 0, n_cigar_op =0, l_read_name=0, l_seq=0;

	memcpy(new_rbin, rbin, 36);
	new_rbin_len += 36;

	memcpy(&n_cigar_op, rbin+16,2);
	memcpy(&l_seq, rbin+20,4);
	l_read_name=((unsigned char*)rbin)[12];

	// rname_trimmed_len stays 0 only when the name holds no delimiter at all (the scan starts
	// at the second character).  There is nothing to trim then: keep the whole name instead
	// of emitting an empty one.
	if(rname_trimmed_len < 1 && l_read_name > 0) rname_trimmed_len = l_read_name - 1;

	new_rbin[12] = rname_trimmed_len+1;
	memcpy(new_rbin+new_rbin_len, rbin+36, rname_trimmed_len);
	new_rbin[36+rname_trimmed_len]=0;
	new_rbin_len+= rname_trimmed_len+1;

	// CIGAR (4 bytes per op), packed sequence ((l_seq+1)/2 bytes) and qualities (l_seq bytes).
	int body_len = 4*n_cigar_op + l_seq + (l_seq+1)/2;
	memcpy(new_rbin+new_rbin_len, rbin +36 + l_read_name, body_len);
	new_rbin_len += body_len;
	char * ext_bin_ptr = rbin + 36 + l_read_name + body_len;

	int CR_found=0, CB_found=0, CY_found=0, UR_found=0, UY_found=0, UB_found=0;
	while(ext_bin_ptr < rbin+binlen+4){
		char * tagstr = NULL;
		int taglen = 0;
		if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='R' && ext_bin_ptr[2]=='Z'){
			CR_found = 1;
			tagstr = cellbc_seq;
			taglen = bc_len;
		}else if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='B' && ext_bin_ptr[2]=='Z'){
			CB_found = 1;
			tagstr = fixedbc_seq;
			taglen = bc_len;
		}else if(ext_bin_ptr[0]=='C' && ext_bin_ptr[1]=='Y' && ext_bin_ptr[2]=='Z'){
			CY_found = 1;
			tagstr = cellbc_qual;
			taglen = bc_len;
		}else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='R' && ext_bin_ptr[2]=='Z'){
			UR_found = 1;
			tagstr = umi_seq;
			taglen = umi_len;
		}else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='B' && ext_bin_ptr[2]=='Z'){
			UB_found = 1;
			tagstr = fixedumi_seq;
			taglen = umi_len;
		}else if(ext_bin_ptr[0]=='U' && ext_bin_ptr[1]=='Y' && ext_bin_ptr[2]=='Z'){
			UY_found = 1;
			tagstr = umi_qual;
			taglen = umi_len;
		}

		if(tagstr){
			// A barcode tag with a known replacement: keep the 3-byte tag header, swap the value.
			new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
			new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
			new_rbin[new_rbin_len++]=*(ext_bin_ptr++);
			int taglenold = strlen(ext_bin_ptr);
			memcpy(new_rbin+new_rbin_len,tagstr, taglen);
			*(new_rbin+new_rbin_len+taglen)=0;
			ext_bin_ptr += taglenold+1;
			new_rbin_len += taglen+1;
		}else{
			// Any other tag (or a barcode tag without a replacement value) is copied verbatim.
			int content_len = SAP_pairer_skip_tag_body_len(ext_bin_ptr);
			memcpy(new_rbin + new_rbin_len, ext_bin_ptr, content_len );
			new_rbin_len += content_len;
			ext_bin_ptr += content_len;
		}
	}

	// Append the tags that the record did not carry; a missing value is skipped rather than copied from NULL.
	if(cellbc_seq && !CR_found)   new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'C', 'R', cellbc_seq, bc_len);
	if(fixedbc_seq && !CB_found)  new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'C', 'B', fixedbc_seq, bc_len);
	if(cellbc_qual && !CY_found)  new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'C', 'Y', cellbc_qual, bc_len);

	if(umi_seq && !UR_found)      new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'U', 'R', umi_seq, umi_len);
	if(fixedumi_seq && !UB_found) new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'U', 'B', fixedumi_seq, umi_len);
	if(umi_qual && !UY_found)     new_rbin_len = scRNA_rbin_append_Z_tag(new_rbin, new_rbin_len, 'U', 'Y', umi_qual, umi_len);

	new_rbin_len-=4; // block_size does not count its own four bytes
	memcpy(new_rbin, &new_rbin_len,4);
	fwrite(new_rbin, 1, new_rbin_len+4, fp);
}
4255 
4256 //int cttt = 0;
4257 
/*
 * Store one read in the temporary per-barcode batch files.
 *
 * assign_target_number is the index of the gene (or exon) the read was assigned to; when it
 * is negative, target_list (may be NULL) holds the genes instead.
 *
 * Batch selection:
 *   has cell barcode and is assigned or has a mapping location : bin  cell_id % bin_no
 *   no cell barcode but has a mapping location                 : bin  bin_no
 *   no mapping location                                        : bin  bin_no + 1
 *
 * Record layout written under the lock of the chosen bin:
 *   sample_id (4 bytes)
 *   for bins <= bin_no only:
 *       cell_id (4 bytes)
 *       either the assigned target (8 bytes), or the number of genes with bit 63 set
 *       (8 bytes) followed by each gene number (8 bytes each)
 *       the UMI (scRNA_UMI_length bytes)
 *   the BAM record; for the unmapped bin it is rewritten by
 *   scRNA_do_one_batch_write_extend_rbin so that the barcodes become tags.
 *
 * Reads without a valid sample are only counted, not stored.
 */
void add_scRNA_read_to_pool( fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, srInt_64 assign_target_number, char * read_name, char * read_bin, ArrayList * target_list ){ // the index of gene or the index of exon
	char * cell_barcode = NULL, * umi_barcode = NULL;
	int sample_id = -1;
	int known_sample_id = global_context -> scRNA_rerun_on_persample_BAM ? global_context -> this_input_number+1 : 0;

	scRNA_find_sample_cell_umi_from_readname(global_context, thread_context, read_name, read_bin, (known_sample_id >0)?NULL:&sample_id, &cell_barcode, &umi_barcode, NULL);
	if(known_sample_id >0) sample_id = known_sample_id;

	int cell_id = scRNA_get_cell_id(global_context, thread_context, cell_barcode);

	// Quality-control counters, reported once by thread 0 after its 20,000th read.
	thread_context -> scRNA_pooled_reads ++;
	if(sample_id >0)thread_context -> scRNA_has_valid_sample_index ++;
	if(cell_id >=0)thread_context -> scRNA_has_valid_cell_barcode ++;

	if(thread_context -> thread_id == 0 && thread_context -> scRNA_pooled_reads == 20000){
		print_in_box(80,0,0,"   scRNA quality control in first 20,000 reads:");
		print_in_box(80,0,0,"     %.1f pct reads have valid sample indices.", thread_context->scRNA_has_valid_sample_index*100./thread_context -> scRNA_pooled_reads);
		print_in_box(80,0,0,"     %.1f pct reads have valid cell barcodes.", thread_context->scRNA_has_valid_cell_barcode*100./thread_context -> scRNA_pooled_reads);
		print_in_box(80,0,0,"");
	}

	if(sample_id <= 0) return;
	thread_context -> scRNA_assigned_reads_per_sample[sample_id-1] ++;

	// The reference id of the BAM record tells whether the read has a mapping location.
	int ref_id = -1;
	memcpy(&ref_id, read_bin + 4, 4);
	int has_location = ref_id >= 0;
	int has_targets = assign_target_number>=0 || target_list;

	int batch_key;
	if(cell_id >= 0 && (has_targets || has_location)) batch_key = cell_id % global_context -> scRNA_barcode_batched_bin_no;
	else if(has_location) batch_key = global_context -> scRNA_barcode_batched_bin_no;
	else batch_key = global_context -> scRNA_barcode_batched_bin_no+1;

	pthread_spin_lock(global_context -> scRNA_barcode_batched_locks+batch_key);
	FILE * batch_fp = global_context -> scRNA_barcode_batched_bins[batch_key];
	fwrite(&sample_id,1,4,batch_fp);
	if(batch_key<=global_context -> scRNA_barcode_batched_bin_no){
		fwrite(&cell_id,1,4,batch_fp);
		if(assign_target_number>=0){
			fwrite(&assign_target_number,1,8,batch_fp);
		}else{
			// Bit 63 marks "a gene list follows" as opposed to a single target number.
			srInt_64 gene_count = target_list?target_list -> numOfElements:0;
			gene_count = gene_count | (1llu << 63);
			fwrite(&gene_count,1,8,batch_fp);
			if(target_list){
				int gi;
				for(gi=0;gi<target_list -> numOfElements;gi++){
					srInt_64 geneno_0B = ArrayListGet(target_list, gi) -NULL;
					fwrite(&geneno_0B,1,8,batch_fp);
				}
			}
		}
		fwrite(umi_barcode,1,global_context -> scRNA_UMI_length,batch_fp);
	}

	int read_bin_len=0;
	memcpy(&read_bin_len , read_bin, 4);

	if(batch_key==global_context -> scRNA_barcode_batched_bin_no+1){ // the read is unmapped. It can still have a fixed cell barcode
		char * corrected_bc = NULL;
		if(cell_id>=0)corrected_bc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cell_id);
		scRNA_do_one_batch_write_extend_rbin(global_context, read_bin, read_bin_len, batch_fp, corrected_bc, NULL, -1, NULL);
	}else{
		fwrite(read_bin, 1, read_bin_len+4, batch_fp);
	}
	pthread_spin_unlock(global_context -> scRNA_barcode_batched_locks+batch_key);
}
4330 
scRNA_do_one_batch_sort_compare(void * ar,int l,int r)4331 int scRNA_do_one_batch_sort_compare(void * ar, int l, int r){
4332 	void ** arr = ar;
4333 	void ** bin_ptrs = arr[0];
4334 	fc_thread_global_context_t * global_context = arr[1];
4335 
4336 	char * Lptr = bin_ptrs[l];
4337 	char * Rptr = bin_ptrs[r];
4338 	srInt_64 Lgenes=0, Rgenes=0;
4339 	memcpy(&Lgenes, Lptr+8, 8);
4340 	memcpy(&Rgenes, Rptr+8, 8);
4341 	if(Lgenes & (1LLU<<63))Lgenes=Lgenes & 0x7fffffffllu; else Lgenes=0;
4342 	if(Rgenes & (1LLU<<63))Rgenes=Rgenes & 0x7fffffffllu; else Rgenes=0;
4343 	srInt_64 Lpos= ((0LLU+*(int*)(Lptr+16+Lgenes*8+global_context->scRNA_UMI_length+4))<<32) | *(unsigned int*)(Lptr+16+Lgenes*8+global_context->scRNA_UMI_length+4+4);
4344 	srInt_64 Rpos= ((0LLU+*(int*)(Rptr+16+Rgenes*8+global_context->scRNA_UMI_length+4))<<32) | *(unsigned int*)(Rptr+16+Rgenes*8+global_context->scRNA_UMI_length+4+4);
4345 	if(Lpos>Rpos)return 1;
4346 	if(Lpos<Rpos)return -1;
4347 	return 0;
4348 }
4349 
/*
 * Swap callback for the batch sort: exchanges the record pointers at indexes l and r.
 * ar is a void*[] whose first element is the array of record pointers.
 */
void scRNA_do_one_batch_sort_exchange(void * ar, int l, int r){
	void ** slots = ((void **)ar)[0];
	void * held = slots[r];
	slots[r] = slots[l];
	slots[l] = held;
}
4357 
/*
 * Merge callback for the batch sort.
 * Merges the two adjacent sorted runs [start, start+items) and
 * [start+items, start+items+items2) of the record-pointer array (ar[0]) in place,
 * using scRNA_do_one_batch_sort_compare for the ordering.  On ties the element of the
 * first run is taken first.
 */
void scRNA_do_one_batch_sort_merge(void * ar, int start, int items, int items2){
	void ** run = ((void **)ar)[0];
	run += start;

	int total = items + items2;
	void ** merged = malloc(sizeof(void*)*(items2+items));
	int left = 0, right = items, out = 0;

	while(left < items || right < total){
		int take_left;
		if(right == total) take_left = 1;		// second run exhausted
		else if(left < items) take_left = scRNA_do_one_batch_sort_compare(ar, start+ left, start + right) <= 0;
		else take_left = 0;				// first run exhausted

		if(take_left) merged[out++] = run[left++];
		else merged[out++] = run[right++];
	}

	memcpy(run, merged, sizeof(void*)*(items2+items));
	free(merged);
}
4374 
// One (cell barcode, gene, UMI) combination together with its read support.
struct cell_gene_umi_supp{
	int cellbc;		// cell barcode number; the UMI merging code sets it to -1 to mark an entry as merged away / removed
	srInt_64 gene_no;	// gene number
	char umi[MAX_UMI_LEN];	// UMI bases; filled and compared by length (scRNA_UMI_length bytes), not NUL-terminated by the code that fills it
	int supp_reads;		// number of supporting reads; grows when other UMIs are merged into this one
};
4381 
/*
 * Capped Hamming distance between the first ulen bytes of u1 and u2.
 * Counting stops as soon as a second mismatch is seen, so the result is 0, 1 or 2
 * (2 meaning "two or more").
 */
int scRNA_hamming_max2_fixlen(char * u1, char * u2, int ulen){
	int mismatches = 0;
	int pos = 0;
	while(pos < ulen && mismatches < 2){
		mismatches += (u1[pos] != u2[pos]);
		pos++;
	}
	return mismatches;
}
4390 
// Adds `no` to the counter kept for the (cell barcode `bc`, gene `gn`) pair.
// The table key is the pointer-encoded integer 1 + ((bc << 32) | gn); the counter itself is
// stored as a pointer-encoded integer value.  The macro expects a HashTable* variable named
// cellBCp0_genep0_P1_to_UMIs to be in scope at the point of use.
// NOTE(review): a missing key is presumably returned as NULL (i.e. counter 0) by HashTableGet -- confirm.
#define ADD_count_hash(bc,gn,no)   HashTablePut(cellBCp0_genep0_P1_to_UMIs, NULL +1+(((1LLU*(bc))<<32)| (gn) ),  HashTableGet(   cellBCp0_genep0_P1_to_UMIs, NULL +1+(((1LLU*(bc))<<32)| (gn))) +(no) )
4392 
/*
 * Process one section [sec_start, sec_end) of the cell/gene/UMI list.
 *
 * structs->appendix1 is a void*[]: [0] the global context, [2] the (cell,gene)->count table
 * used by ADD_count_hash, [3] the sample id stored as a pointer-encoded integer.
 *
 * Step 1 (is_UMI_step2 == 0): UMI merging inside the section.  The entries are visited in
 * list order; an entry whose UMI has fewer than two mismatches against an already accepted
 * entry is merged into it: its supp_reads are added to the accepted entry, its cellbc is set
 * to -1, and a key is put into filtered_CGU_table whose value is the UMI of the accepted
 * entry.  Entries that match nothing become accepted themselves.  For sections of more than
 * 30 entries the accepted entries are indexed in a string table under two keys ('F' + first
 * half of the UMI, 'S' + second half), so only entries sharing a half are compared; smaller
 * sections use a plain list that is scanned linearly.
 *
 * Step 2 (is_UMI_step2 != 0): the first entry of the section is counted (ADD_count_hash
 * with 1) only if it has strictly more supporting reads than the second entry; every other
 * entry -- and the first one too when it does not win -- gets cellbc = -1 and a key in
 * filtered_CGU_table with the value NULL-1.
 *
 * NOTE(review): in both steps cellbc is set to -1 *before* the key is formatted, so the
 * cellbc part of every key is "-1"; also the step-2 key has no sample-id prefix while the
 * step-1 key has one.  The consumer of filtered_CGU_table is not visible here -- confirm
 * that this is what it expects.
 */
void scRNA_do_one_batch_UMI_merge_one_cell(ArrayList* structs, int sec_start, int sec_end, int is_UMI_step2, HashTable * filtered_CGU_table){
	int x1;
	void ** app1 = structs -> appendix1;
	fc_thread_global_context_t * global_context = app1[0];
	HashTable * cellBCp0_genep0_P1_to_UMIs = app1[2];	// referenced by name inside ADD_count_hash
	int sample_id = app1[3]-NULL;

	if(is_UMI_step2){
		// NB: when this function is called, sec_end - sec_start MUST be >=2.
		for(x1 = sec_start; x1<sec_end; x1++) {
			struct cell_gene_umi_supp * str1 = ArrayListGet(structs, x1);
			if(x1 == sec_start){
				// Only the first entry can survive, and only with strictly more support than the second.
				struct cell_gene_umi_supp * str2 = ArrayListGet(structs, sec_start+1);
				if(str1 -> supp_reads > str2 -> supp_reads){
					ADD_count_hash(str1->cellbc, str1->gene_no,1);
					continue;
				}
			}

			// Mark the entry as removed and record it in the filter table.
			str1 -> cellbc = -1;
			char replaced_key[40+MAX_UMI_LEN];
#ifdef __MINGW32__
			int keyptr = sprintf(replaced_key,"%d-%I64d-", str1 -> cellbc, str1 -> gene_no);
#else
			int keyptr = sprintf(replaced_key,"%d-%lld-", str1 -> cellbc, str1 -> gene_no);
#endif
			memcpy(replaced_key+keyptr, str1 -> umi, global_context -> scRNA_UMI_length);
			replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
			HashTablePut(filtered_CGU_table, strdup(replaced_key), NULL-1);
		}
	}else{
		// Exactly one of the two containers is used, depending on the size of the section.
		ArrayList * accepted_list =NULL;
		HashTable * looktable = NULL;
		if(sec_end - sec_start >30){
			looktable = StringTableCreate((sec_end - sec_start)/5);
			HashTableSetDeallocationFunctions(looktable, free, (void (*)(void *value))ArrayListDestroy);
		}else accepted_list = ArrayListCreate(sec_end - sec_start);

		for(x1=sec_start; x1<sec_end; x1++){
			struct cell_gene_umi_supp * try_str = ArrayListGet(structs , x1);
			int x2, found = 0;
			ArrayList * test_accs;
			int hx;

			if(looktable){
				// hx == 0 looks up the first half of the UMI ('F' key), hx == 1 the second half ('S' key).
				for(hx = 0; hx<2; hx++){
					char test_ky[MAX_UMI_LEN];
					test_ky[0] = hx?'S':'F';
					memcpy(test_ky +1, try_str -> umi + hx * global_context -> scRNA_UMI_length/2 , global_context -> scRNA_UMI_length/2);
					test_ky[1+global_context -> scRNA_UMI_length/2]=0;

					test_accs = HashTableGet(looktable, test_ky);
					if(!test_accs)continue;

					for(x2=0; x2<test_accs->numOfElements; x2++){
						struct cell_gene_umi_supp * acc_str = ArrayListGet(test_accs, x2);
						if(scRNA_hamming_max2_fixlen(acc_str -> umi, try_str -> umi, global_context -> scRNA_UMI_length)<2){
							// At most one mismatch: merge this entry into the accepted one.
							found=1;
							acc_str -> supp_reads += try_str -> supp_reads;
							try_str -> cellbc = -1;

							char replaced_key[55+MAX_UMI_LEN];
#ifdef __MINGW32__
							int keyptr = sprintf(replaced_key,"%d-%d-%I64d-", sample_id, try_str -> cellbc, try_str -> gene_no);
#else
							int keyptr = sprintf(replaced_key,"%d-%d-%lld-", sample_id, try_str -> cellbc, try_str -> gene_no);
#endif

							memcpy(replaced_key+keyptr, try_str -> umi, global_context -> scRNA_UMI_length);
							replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
							// The stored value points at the UMI inside the accepted struct (not a copy).
							HashTablePut(filtered_CGU_table, strdup(replaced_key), acc_str -> umi);
							break;
						}
					}
					if(found)break;
				}
			}else{
				test_accs = accepted_list;

				for(x2=0; x2<test_accs->numOfElements; x2++){
					struct cell_gene_umi_supp * acc_str = ArrayListGet(test_accs, x2);
					if(scRNA_hamming_max2_fixlen(acc_str -> umi, try_str -> umi, global_context -> scRNA_UMI_length)<2){
						// At most one mismatch: merge this entry into the accepted one.
						found=1;
						acc_str -> supp_reads += try_str -> supp_reads;
						try_str -> cellbc = -1;

						char replaced_key[55+MAX_UMI_LEN];
#ifdef __MINGW32__
						int keyptr = sprintf(replaced_key,"%d-%d-%I64d-", sample_id, try_str -> cellbc, try_str -> gene_no);
#else
						int keyptr = sprintf(replaced_key,"%d-%d-%lld-", sample_id, try_str -> cellbc, try_str -> gene_no);
#endif
						memcpy(replaced_key+keyptr, try_str -> umi, global_context -> scRNA_UMI_length);
						replaced_key[keyptr+global_context -> scRNA_UMI_length]=0;
						// The stored value points at the UMI inside the accepted struct (not a copy).
						HashTablePut(filtered_CGU_table, strdup(replaced_key), acc_str -> umi);
						break;
					}
				}
			}
			if(!found){
				// Nothing close enough was accepted before: this entry becomes an accepted one.
				if(looktable){
					// It is registered under both of its half-UMI keys.
					for(hx = 0; hx<2; hx++){
						char test_ky[MAX_UMI_LEN];
						test_ky[0] = hx?'S':'F';
						memcpy(test_ky +1, try_str -> umi + hx * global_context -> scRNA_UMI_length/2 , global_context -> scRNA_UMI_length/2);
						test_ky[1+global_context -> scRNA_UMI_length/2]=0;
						test_accs = HashTableGet(looktable, test_ky);
						if(!test_accs){
							test_accs = ArrayListCreate(10);
							HashTablePut(looktable, strdup(test_ky), test_accs);
						}
						ArrayListPush(test_accs, try_str);
					}
				}else ArrayListPush(accepted_list, try_str);
			}
		}

		if(looktable)HashTableDestroy(looktable);
		else ArrayListDestroy(accepted_list);
	}
}
4514 
/*
 * Split the cell/gene/UMI list into sections of neighbouring entries and process each one.
 *
 * structs->appendix1 is a void*[]: [0] the global context, [2] the (cell,gene)->count table
 * used by ADD_count_hash.
 *
 * Step 1 (is_UMI_step2 == 0): a section is a run of entries with the same cell barcode and
 * gene.  Sections of two or more entries are handed to scRNA_do_one_batch_UMI_merge_one_cell.
 * Step 2 (is_UMI_step2 != 0): a section is a run of entries with the same cell barcode and
 * the same UMI.  Sections of two or more entries are handed to
 * scRNA_do_one_batch_UMI_merge_one_cell; a section of a single entry is counted directly.
 *
 * Entries with a negative cellbc (removed entries) never start a section and are skipped
 * while scanning.
 */
void scRNA_do_one_batch_UMI_merge_one_step(ArrayList* structs, int is_UMI_step2, HashTable * filtered_CGU_table){
	void ** app1 = structs -> appendix1;
	fc_thread_global_context_t * global_context = app1[0];
	HashTable * cellBCp0_genep0_P1_to_UMIs = app1[2];	// referenced by name inside ADD_count_hash
	srInt_64 x1, sec_start = 0;
	srInt_64 old_sec_key = -1;

	// The scan below starts at the second element and compares every element with the key of
	// the section that is currently open.  The open section starts at element 0, so its key
	// has to come from element 0.  Leaving it at -1 would always close a one-element section
	// at index 0 (the first entry would never be grouped with its neighbours) and would
	// never process a list that has exactly one element.
	if(structs -> numOfElements > 0){
		struct cell_gene_umi_supp * first_str = ArrayListGet(structs, 0);
		if(first_str -> cellbc >= 0){
			old_sec_key = first_str -> cellbc;
			old_sec_key = old_sec_key << 32;
			if(!is_UMI_step2) old_sec_key = old_sec_key | first_str -> gene_no;
		}
	}

	for(x1=1; x1<=structs -> numOfElements; x1++){
		srInt_64 sec_key = -1;
		int is_umi_changed = 0;

		struct cell_gene_umi_supp * str1 =NULL;
		if(x1<structs -> numOfElements){
			str1 = ArrayListGet(structs, x1);
			if(str1 -> cellbc <0) continue;
			sec_key = str1 -> cellbc;
			sec_key = sec_key << 32;
			if(is_UMI_step2 && sec_key == old_sec_key){
				// Same cell: in step 2 the section also ends when the UMI changes.
				struct cell_gene_umi_supp * strold = ArrayListGet(structs, sec_start);
				is_umi_changed = memcmp(strold -> umi, str1-> umi, global_context-> scRNA_UMI_length);
			}else if(!is_UMI_step2) sec_key = sec_key | str1 -> gene_no;
				// gene_no itself is 64-bit, but it is nearly impossible to have two neighbouring
				// structures that have the same last 32-bit of gene_no.
		}

		if( (x1>sec_start && sec_key!=old_sec_key) || is_umi_changed){ // when x1 == numOfElements, sec_key is -1. If old_sec_key is also -1, no valid section is open. If old_sec_key is >=0, the last sec is processed.
			struct cell_gene_umi_supp * str0 = ArrayListGet(structs, sec_start);
			if(x1 - sec_start>1 && str0->cellbc>=0) scRNA_do_one_batch_UMI_merge_one_cell(structs, sec_start, x1, is_UMI_step2, filtered_CGU_table);
			else if(is_UMI_step2 && str0->cellbc>=0) ADD_count_hash(str0->cellbc,str0->gene_no,1);

			old_sec_key = sec_key;
			sec_start = x1;
		}
	}
}
4550 
/*
 * Ordering callback for lists of struct cell_gene_umi_supp.
 *
 * me->appendix1 is a void*[]: [0] the global context, [1] a pointer-encoded flag that
 * selects the order of the keys.
 *   flag != 0 : cell barcode, gene number, supporting reads (descending), UMI
 *   flag == 0 : cell barcode, UMI, supporting reads (descending), gene number
 * UMIs are compared over scRNA_UMI_length bytes.
 */
int scRNA_do_one_batch_tab_to_struct_list_compare(void * L_elem, void * R_elem, ArrayList * me){
	struct cell_gene_umi_supp *lhs = L_elem, *rhs = R_elem;
	void ** ctx = me -> appendix1;
	fc_thread_global_context_t * global_context = ctx[0];
	int gene_before_umi = ctx[1] - NULL;

	if(lhs->cellbc != rhs->cellbc) return (lhs->cellbc > rhs->cellbc) ? 1 : -1;

	int gene_order = (lhs->gene_no > rhs->gene_no) - (lhs->gene_no < rhs->gene_no);
	int umi_order = memcmp(lhs->umi, rhs->umi, global_context -> scRNA_UMI_length);

	int first_key = gene_before_umi ? gene_order : umi_order;
	int last_key = gene_before_umi ? umi_order : gene_order;

	if(first_key) return first_key;

	// More supporting reads sort first.
	if(lhs->supp_reads != rhs->supp_reads) return (lhs->supp_reads < rhs->supp_reads) ? 1 : -1;

	return last_key;
}
4581 
/*
 * Hash-table iteration callback: turns one "<sample>-<cell>-<gene>-<UMI>" entry into a
 * struct cell_gene_umi_supp and pushes it to the list of its sample.
 *
 *   ky   the key string; <sample> is the one-based sample id
 *   val  the number of supporting reads, stored as a pointer-encoded integer
 *   tab  appendix1 is the array of per-sample lists (indexed by sample id - 1) and
 *        counter1 is the UMI length
 *
 * The numbers are parsed with strtol/strtoll so that a negative cell or gene number (which
 * itself starts with '-') is not mistaken for a field separator, and so that a key with a
 * missing separator is rejected instead of being scanned beyond its end.  Entries with a
 * malformed key or an invalid sample id are reported and skipped.
 */
void scRNA_do_one_batch_tab_to_struct_list(void *ky, void *val, HashTable * tab){
	int supp_reads = val-NULL;
	ArrayList ** cell_gene_umi_list = tab -> appendix1;
	int UMI_length = tab -> counter1;

	char * kyptr = ky, * numend = NULL;
	long parsed_sample = 0, parsed_cellbc = 0;
	long long parsed_gene = 0;

	// Each numeric field must be non-empty and followed by the '-' separator.
	parsed_sample = strtol(kyptr, &numend, 10);
	int key_ok = (numend != kyptr && *numend == '-');
	if(key_ok){
		kyptr = numend + 1;
		parsed_cellbc = strtol(kyptr, &numend, 10);
		key_ok = (numend != kyptr && *numend == '-');
	}
	if(key_ok){
		kyptr = numend + 1;
		parsed_gene = strtoll(kyptr, &numend, 10);
		key_ok = (numend != kyptr && *numend == '-');
	}

	struct cell_gene_umi_supp * new_item = NULL;
	if(key_ok && (UMI_length < 0 || (size_t)UMI_length > sizeof(new_item -> umi))) key_ok = 0;
	if(!key_ok){
		SUBREADprintf("ERROR: malformed cell-gene-UMI key: '%s'\n", (char*)ky);
		return;
	}

	int sample_id = (int)parsed_sample; // one-based sample id
	if(sample_id<1){
		SUBREADprintf("WRONG SAMPLE ID: %d from '%s'\n", sample_id, (char*)ky);
		return; // sample_id-1 would index before the start of the list array
	}

	new_item = malloc(sizeof(struct cell_gene_umi_supp));
	if(!new_item){
		SUBREADprintf("ERROR: out of memory when loading key '%s'\n", (char*)ky);
		return;
	}

	// Copy at most UMI_length bytes, but never beyond the end of the key.
	char * umi_text = numend + 1;
	size_t umi_avail = strlen(umi_text);
	size_t umi_copy = umi_avail < (size_t)UMI_length ? umi_avail : (size_t)UMI_length;
	memset(new_item -> umi, 0, sizeof(new_item -> umi));
	memcpy(new_item -> umi, umi_text, umi_copy);

	new_item -> cellbc = (int)parsed_cellbc;
	new_item -> gene_no = parsed_gene;
	new_item -> supp_reads = supp_reads;
	ArrayListPush(cell_gene_umi_list[sample_id-1], new_item);
}
4602 
/*
 * Hash-table iteration callback: writes one (cell/gene key, UMI count) pair to the file
 * held in me->appendix1 as two 8-byte values.
 * The key is written after subtracting 1 from it (the table stores keys with an offset
 * of one); the value is written unchanged.
 */
void scRNA_do_one_batch_write_UMIs(void * vcell_gene, void * vumis, HashTable * me){
	FILE * out = me->appendix1;
	void * pair[2];
	pair[0] = vcell_gene - 1;
	pair[1] = vumis;
	fwrite(&pair[0],1,8,out);
	fwrite(&pair[1],1,8,out);
}
4609 
two_long_hash(void * ky)4610 srInt_64 two_long_hash(void * ky){
4611 	srInt_64 * ky2 = ky;
4612 	return ky2[0]^ky2[1];
4613 }
4614 
two_long_compare(void * k1,void * k2)4615 int two_long_compare(void * k1, void * k2){
4616 	srInt_64 * k13 = k1, *k23 = k2;
4617 	if(k13[0]!=k23[0])return 1;
4618 	if(k13[1]!=k23[1])return 1;
4619 	return 0;
4620 }
4621 
// ADD_key_FMT1 is the printf format of a "<sample_id>-<cell_no>-<gene_no>-<UMI>" key
// (the 64-bit conversion differs on MinGW).
//
// ADD_key_struct increments the supporting-read counter of one such key.  It expects the
// variables sample_id, cell_no, gene_no, UMI_str and supp_reads_SCGU (the table) to be in
// scope.  The counter is stored as a pointer-encoded integer: the first occurrence inserts
// a strdup'ed key with the value 1, later occurrences store the old value plus one under
// the same key text.
// NOTE(review): the later occurrences pass the stack buffer as the key to
// HashTablePutReplaceEx with three zero flags -- presumably only the value is replaced and
// the key already owned by the table is kept; confirm against HashTablePutReplaceEx.
#ifdef __MINGW32__
#define ADD_key_FMT1 "%d-%d-%I64d-%s"
#else
#define ADD_key_FMT1 "%d-%d-%lld-%s"
#endif
#define ADD_key_struct { char my_key [50+MAX_UMI_LEN]; \
	sprintf(my_key,ADD_key_FMT1, sample_id, cell_no, gene_no, UMI_str); \
	srInt_64 supp_reads = HashTableGet(supp_reads_SCGU, my_key)-NULL; \
	if(1>supp_reads) HashTablePut(supp_reads_SCGU, strdup(my_key), NULL+1); \
	else HashTablePutReplaceEx(supp_reads_SCGU, my_key, NULL+supp_reads+1, 0,0,0); }
4632 
scRNA_do_one_batch(void * paramsp1)4633 void * scRNA_do_one_batch(void * paramsp1){
4634 	srInt_64 x1;
4635 	void ** params = paramsp1;
4636 	fc_thread_global_context_t * global_context = params[0];
4637 	ArrayList * file_size_list = params[2];
4638 	char *temp_dir = global_context -> temp_file_dir;
4639 	free(paramsp1);
4640 	int me_max_Rbin_len = 0;
4641 	int me_max_genes = 0;
4642 	char ** bin_ptrs = malloc(sizeof(char*) * 1500000), * batch_content=NULL;
4643 	int bin_ptr_size = 1500000;
4644 	while(1){
4645 		int this_batch_no = -1;
4646 		pthread_spin_lock(&global_context -> scRNA_do_one_batch_runner_lock);
4647 		if(global_context -> scRNA_do_one_batch_runner_current < global_context -> scRNA_barcode_batched_bin_no +1){
4648 			int this_batch_sorted_idx = (global_context -> scRNA_do_one_batch_runner_current ++);
4649 			srInt_64 this_batch_size_and_no = ArrayListGet(file_size_list, file_size_list->numOfElements-1 -this_batch_sorted_idx)-NULL;
4650 			this_batch_no = (int)(this_batch_size_and_no&0xfffffllu);
4651 		}
4652 		if(me_max_genes > global_context -> scRNA_barcode_batched_max_genes) global_context -> scRNA_barcode_batched_max_genes = me_max_genes;
4653 		if(me_max_Rbin_len > global_context->scRNA_barcode_batched_max_Rbin_len) global_context->scRNA_barcode_batched_max_Rbin_len = me_max_Rbin_len;
4654 		pthread_spin_unlock(&global_context -> scRNA_do_one_batch_runner_lock);
4655 		if(0>this_batch_no)break;
4656 		char tmp_fname[MAX_FILE_NAME_LENGTH+80];
4657 		sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", temp_dir, getpid(), this_batch_no);
4658 		FILE * fp = fopen(tmp_fname, "rb");
4659 		fseek(fp, 0, SEEK_END);
4660 		srInt_64 batch_fsize = ftello(fp);
4661 		fseek(fp, 0, SEEK_SET);
4662 		if(batch_content==NULL) batch_content = malloc(batch_fsize);
4663 		srInt_64 batch_content_len = fread(batch_content, 1, batch_fsize, fp);
4664 		fclose(fp);
4665 		if(batch_content_len!=batch_fsize){
4666 			SUBREADprintf("ERROR: Cannot load file at once: %d!\n", this_batch_no);
4667 			return NULL;
4668 		}
4669 
4670 		HashTable * supp_reads_SCGU = StringTableCreate(500000);
4671 		HashTableSetDeallocationFunctions(supp_reads_SCGU, free, NULL);
4672 		srInt_64 scanptr = 0;
4673 		int rbin_no = 0;
4674 		char UMI_str[MAX_UMI_LEN+1];
4675 
4676 		while(scanptr < batch_content_len-1){
4677 			int cell_no=0, sample_id=0;
4678 			srInt_64 gene_no=0;
4679 			if(bin_ptr_size<=rbin_no){
4680 				bin_ptr_size = bin_ptr_size*2;
4681 				bin_ptrs = realloc(bin_ptrs, sizeof(char*)*bin_ptr_size);
4682 			}
4683 			bin_ptrs[rbin_no] = batch_content+scanptr;
4684 			memcpy(&sample_id, batch_content+scanptr, 4);
4685 			scanptr += 4; // sample_ID
4686 			memcpy(&cell_no, batch_content+scanptr, 4);
4687 			scanptr += 4; // cellbarcode_NO
4688 			memcpy(&gene_no, batch_content+scanptr, 8);
4689 			scanptr += 8; // gene_id
4690 			if(gene_no & (1LLU<<63)){
4691 				int genes = (int)(gene_no & 0x7fffffffllu);
4692 				if(genes > me_max_genes)me_max_genes=genes;
4693 
4694 				memcpy(UMI_str, batch_content+scanptr+8*genes, global_context -> scRNA_UMI_length);
4695 				UMI_str[global_context -> scRNA_UMI_length]=0;
4696 
4697 				for(x1=0; x1<genes; x1++){
4698 					memcpy(&gene_no, batch_content+scanptr, 8);
4699 					scanptr += 8;
4700 					ADD_key_struct;
4701 				}
4702 			}else{
4703 				UMI_str[global_context -> scRNA_UMI_length]=0;
4704 				memcpy(UMI_str, batch_content+scanptr, global_context -> scRNA_UMI_length);
4705 				ADD_key_struct;
4706 			}
4707 
4708 			scanptr += global_context -> scRNA_UMI_length ; // UMI str
4709 
4710 			int rbinlen = 0;
4711 			memcpy(&rbinlen, batch_content+scanptr, 4);
4712 
4713 			if(me_max_Rbin_len < rbinlen) me_max_Rbin_len = rbinlen;
4714 			scanptr += rbinlen +4; // read_bin
4715 
4716 //			if(sample_id <0 || sample_id > 1000) SUBREADprintf("Wrong Sample: RNO=%d; ptr=%lld\n", rbin_no, scanptr);
4717 			rbin_no++;
4718 		}
4719 		ArrayList ** cell_gene_umi_list = malloc(sizeof(void*)*global_context -> scRNA_sample_sheet_table -> numOfElements);
4720 		for(x1 =0; x1< global_context -> scRNA_sample_sheet_table -> numOfElements; x1++){
4721 			cell_gene_umi_list[x1]=ArrayListCreate(2000000);
4722 			ArrayListSetDeallocationFunction(cell_gene_umi_list[x1], free);
4723 		}
4724 		supp_reads_SCGU -> appendix1 = cell_gene_umi_list;
4725 		supp_reads_SCGU -> appendix2 = global_context;
4726 		supp_reads_SCGU -> counter1 = global_context -> scRNA_UMI_length;
4727 		HashTableIteration(supp_reads_SCGU, scRNA_do_one_batch_tab_to_struct_list);
4728 		HashTable * filtered_SCGU_table = StringTableCreate(max(10000,cell_gene_umi_list[0] -> numOfElements / 10));
4729 		HashTableSetDeallocationFunctions(filtered_SCGU_table, free, NULL);
4730 
4731 		fp = fopen(tmp_fname, "wb");
4732 		for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements; x1++){
4733 			HashTable * cellbcP0_to_geneno0B_P1_to_UMIs = HashTableCreate(500000);
4734 
4735 			void * app1[3];
4736 			cell_gene_umi_list[x1] -> appendix1 = app1;
4737 			app1[0] = global_context;
4738 			app1[1] = NULL+1;
4739 						// 0 : sorted by cell_bc, then UMIstr, then supported_reads, then gene
4740 						// 1 : sorted by cell_bc, then gene, then supported_reads, then UMIstr
4741 						// supported_reads : large -> small; the other: small -> large
4742 			ArrayListSort(cell_gene_umi_list[x1],  scRNA_do_one_batch_tab_to_struct_list_compare);
4743 			scRNA_do_one_batch_UMI_merge_one_step(cell_gene_umi_list[x1], 0, filtered_SCGU_table);
4744 
4745 			app1[1] = NULL+0;
4746 			app1[2] = cellbcP0_to_geneno0B_P1_to_UMIs;
4747 			ArrayListSort(cell_gene_umi_list[x1], scRNA_do_one_batch_tab_to_struct_list_compare);
4748 			scRNA_do_one_batch_UMI_merge_one_step(cell_gene_umi_list[x1], 1, filtered_SCGU_table);
4749 
4750 			cellbcP0_to_geneno0B_P1_to_UMIs -> appendix1 = fp;
4751 			fwrite(&cellbcP0_to_geneno0B_P1_to_UMIs -> numOfElements,1,8,fp);
4752 			HashTableIteration(cellbcP0_to_geneno0B_P1_to_UMIs, scRNA_do_one_batch_write_UMIs);
4753 			HashTableDestroy(cellbcP0_to_geneno0B_P1_to_UMIs);
4754 		}
4755 
4756 		void * sort_base[2];
4757 		sort_base[0] = bin_ptrs;
4758 		sort_base[1] = global_context;
4759 		merge_sort(sort_base, rbin_no, scRNA_do_one_batch_sort_compare, scRNA_do_one_batch_sort_exchange, scRNA_do_one_batch_sort_merge);
4760 
4761 
4762 
4763 		for(x1 = 0; x1 < rbin_no; x1++){
4764 			char * binptr = bin_ptrs[x1];
4765 			int cellid =0, sampleid = 0;
4766 			srInt_64 gene_no =0, genes = 0, geneno_0 = 0;
4767 			char * umi, * glist_ptr =NULL;
4768 			memcpy(&sampleid, binptr, 4);
4769 			memcpy(&cellid, binptr+4, 4);
4770 			memcpy(&gene_no, binptr+8, 8);
4771 			if(gene_no & (1LLU<<63)){
4772 				glist_ptr =binptr + 16;
4773 				genes = (int)(gene_no & 0x7fffffff);
4774 				memcpy(&geneno_0, binptr+16, 8);
4775 			}
4776 			umi = binptr + 16 + 8*genes;
4777 			char SCGU_key [40+MAX_UMI_LEN];
4778 
4779 #ifdef __MINGW32__
4780 			int keyptr = sprintf(SCGU_key,"%d-%d-%I64d-", sampleid, cellid,  (gene_no & (1LLU<<63))? geneno_0: gene_no);
4781 #else
4782 			int keyptr = sprintf(SCGU_key,"%d-%d-%lld-", sampleid, cellid,  (gene_no & (1LLU<<63))? geneno_0: gene_no);
4783 #endif
4784 			memcpy(SCGU_key+keyptr, umi, global_context -> scRNA_UMI_length);
4785 			SCGU_key[keyptr+global_context -> scRNA_UMI_length] = 0;
4786 
4787 			char * new_UMI = HashTableGet(filtered_SCGU_table, SCGU_key);
4788 			if(new_UMI) umi = new_UMI;
4789 			if(umi == NULL-1) umi="-----------------------------------------";
4790 			fwrite(&sampleid, 1, 4, fp);
4791 			fwrite(&cellid, 1, 4, fp);
4792 			fwrite(&gene_no, 1, 8, fp);
4793 			if(gene_no & (1LLU<<63)) fwrite( glist_ptr, 1, 8*genes, fp );
4794 			fwrite(umi,1, global_context -> scRNA_UMI_length, fp);
4795 			int binlen;
4796 
4797 			memcpy(&binlen, binptr+16+8*genes+global_context -> scRNA_UMI_length,4 );
4798 			char * new_cellbc = NULL;
4799 			if(cellid>=0)new_cellbc = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cellid);
4800 			scRNA_do_one_batch_write_extend_rbin(global_context, binptr+16+8*genes+global_context -> scRNA_UMI_length, binlen, fp, new_cellbc, umi[0]=='-'?NULL:umi, gene_no, (srInt_64*)glist_ptr);
4801 		}
4802 		fclose(fp);
4803 		HashTableDestroy(supp_reads_SCGU);
4804 		HashTableDestroy(filtered_SCGU_table);
4805 		for(x1 =0; x1< global_context -> scRNA_sample_sheet_table -> numOfElements; x1++)ArrayListDestroy(cell_gene_umi_list[x1]);
4806 		free(cell_gene_umi_list);
4807 	}
4808 	free(batch_content);
4809 	free(bin_ptrs);
4810 	return NULL;
4811 }
4812 
/*
 * Returns the total length covered by the given sections, where the sections
 * located on the same chromosome are first sorted (basic_sort) and combined
 * (mergeIntervals), and the lengths (end - start) of the resulting intervals
 * are summed over all chromosomes.
 *
 *   chros, start_poses, lens : parallel arrays of `sections` items giving the
 *                              chromosome name, start and length of a section.
 *   global_context, thread_context, read_name : not used in this function.
 */
unsigned int calc_score_overlaps(fc_thread_global_context_t * global_context,  fc_thread_thread_context_t * thread_context, char ** chros, unsigned int * start_poses, unsigned short * lens, int sections, char * read_name){
	unsigned int chro_intervals[ 2*sections ];
	unsigned int merged_intervals[ 2*sections ];
	char is_consumed[ sections ];
	unsigned int covered_bases = 0, seed;

	memset(is_consumed, 0 , sections);

	for(seed = 0; seed < sections; seed++){
		if(is_consumed[seed]) continue;

		// the first untouched section opens the group of its chromosome
		int other, n_on_chro = 1;
		chro_intervals[0] = start_poses[seed];
		chro_intervals[1] = start_poses[seed] + lens[seed];
		is_consumed[seed] = 1;

		// collect all the later sections that are on the same chromosome
		other = seed + 1;
		while(other < sections){
			if(0 == strcmp( chros[other], chros[seed] )){
				unsigned int * slot = chro_intervals + 2*n_on_chro;
				slot[0] = start_poses[other];
				slot[1] = start_poses[other] + lens[other];
				is_consumed[other] = 1;
				n_on_chro++;
			}
			other++;
		}

		basic_sort( chro_intervals, n_on_chro, overlap_compare, overlap_exchange );

		int n_merged = mergeIntervals( chro_intervals, merged_intervals, n_on_chro );
		for(other = 0; other < n_merged; other++){
			unsigned int * span = merged_intervals + 2*other;
			covered_bases += ( span[1] - span[0] );
		}
	}
	return covered_bases;
}
4846 
4847 
/*
 * Decides which feature(s) a read / fragment is assigned to, adds the count
 * and updates the assignment statistics.
 *
 *   hits_indices1 / nhits1, hits_indices2 / nhits2 :
 *        exon indices hit by the first / second end of the fragment. Entries
 *        may be overwritten with -1 by the feature-overlap filter below.
 *   total_frag_len : fragment length used for the minimum-overlap thresholds.
 *   hits_chro*, hits_start_pos*, hits_length* :
 *        chromosome, start and length of each hit (parallel to hits_indices*),
 *        used to compute the overlapped bases with calc_score_overlaps().
 *   fixed_fractional_count : the amount added to the count table for the read.
 *   read_name, bin1, bin2 : passed on to write_read_details_FP() and
 *        add_scRNA_read_to_pool().
 *   RG_name : when not NULL, the tables returned by get_RG_tables() are
 *        updated (item 0: count table, item 1: read counters) instead of the
 *        tables in thread_context.
 *
 * Two fast paths handle the cases where no overlap length is needed and the
 * fragment hits exactly one exon (from one end, or the same exon from both
 * ends). All the other cases go through the scoring ("voting") table.
 */
void vote_and_add_count(fc_thread_global_context_t * global_context, fc_thread_thread_context_t * thread_context,
			srInt_64 * hits_indices1, int nhits1, srInt_64 * hits_indices2, int nhits2, unsigned int total_frag_len,
			char ** hits_chro1, char ** hits_chro2, unsigned int * hits_start_pos1, unsigned int * hits_start_pos2, unsigned short * hits_length1, unsigned short * hits_length2, int fixed_fractional_count, char * read_name, char * RG_name, char * bin1, char * bin2){
	// Fast path 1: a single hit in total and no overlap length required.
	if(global_context -> need_calculate_overlap_len == 0 && nhits2+nhits1==1) {
		srInt_64 hit_exon_id = nhits2?hits_indices2[0]:hits_indices1[0];

		//SUBREADprintf("V_AND_A: '%p'\n", RG_name);

		if(RG_name){
			void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
			fc_read_counters * sumtab = tab4s[1];
			sumtab -> assigned_reads++;

			read_count_type_t * count_table = tab4s[0];
			count_table[hit_exon_id] += fixed_fractional_count;
		}else{
			thread_context->count_table[hit_exon_id] += fixed_fractional_count;
			thread_context->read_counters.assigned_reads ++;
		}
		thread_context->nreads_mapped_to_exon++;
		if(global_context -> read_details_out_FP){
			int final_gene_number = global_context -> exontable_geneid[hit_exon_id];
			char * final_feture_name = (char *)global_context -> gene_name_array[final_gene_number];
			write_read_details_FP(global_context, thread_context, "Assigned", 1, final_feture_name, bin1, bin2);
		}
		if(global_context -> do_scRNA_table){
			// the scRNA pool receives the gene number in gene-level mode, the exon index otherwise
			srInt_64 assignment_target_number = hit_exon_id;
			if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[hit_exon_id];
			add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name, bin1, NULL);
		}
	// Fast path 2: one hit from each end, both on the same exon.
	} else if(global_context -> need_calculate_overlap_len == 0 && nhits2 == 1 && nhits1 == 1 && hits_indices2[0]==hits_indices1[0]) {
		srInt_64 hit_exon_id = hits_indices1[0];

		if(RG_name){
			void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
			fc_read_counters * sumtab = tab4s[1];
			sumtab -> assigned_reads++;

			read_count_type_t * count_table = tab4s[0];
			count_table[hit_exon_id] += fixed_fractional_count;
		}else{
			thread_context->count_table[hit_exon_id] += fixed_fractional_count;
			thread_context->read_counters.assigned_reads ++;
		}
		thread_context->nreads_mapped_to_exon++;
		if(global_context -> read_details_out_FP)
		{
			int final_gene_number = global_context -> exontable_geneid[hit_exon_id];
			char * final_feture_name = (char *)global_context -> gene_name_array[final_gene_number];
			write_read_details_FP(global_context, thread_context, "Assigned", 1, final_feture_name, bin1, bin2);
		}

		if(global_context -> do_scRNA_table){
			srInt_64 assignment_target_number = hit_exon_id;
			if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[hit_exon_id];
			add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name, bin1, NULL);
		}
	} else {
		// Build a voting table.
		// The voting table should be:
		//      total_length [nhit_final] = total_length_overlapping
		//      final_id [nhit_final] = final_exon_id

		// if is_gene_level, then decision_table_exon_ids[nhit_final] is the exon id where the count is added.

		// After all, the count is added to all hits where total_length has the maximum value.
		// If there are more than one locations having the same total_length, then the fragment is ambiguous.
		// Count is added when "-O" is specified.

		// merge feature : if a read overlaps with an EXON twice or more times (by >=2 segments in cigar),
		//                 then the total length of the overlapped bases is calculated.
		//
		// two ends in a fragment is considered individually; the overlapping bases are not added up.
		//


		// Per-candidate columns of the voting table, all indexed by 0 .. scoring_count-1:
		//   scoring_numbers      : the score (number of ends that hit the candidate, or the
		//                          overlapped bases when use_overlapping_break_tie is set)
		//   scoring_flags        : bit 1 = hit by the first end, bit 2 = hit by the second end
		//   scoring_overlappings : overlapped bases (filled only when need_calculate_overlap_len)
		//   scoring_exon_ids     : the exon index that represents the candidate
		unsigned int * scoring_numbers = thread_context -> scoring_buff_numbers;	// size is : MAX_HIT_NUMBER *2
		unsigned int * scoring_flags = thread_context -> scoring_buff_flags;		// size is : MAX_HIT_NUMBER *2
		unsigned int * scoring_overlappings = thread_context -> scoring_buff_overlappings;		// size is : MAX_HIT_NUMBER *2
		srInt_64 * scoring_exon_ids = thread_context -> scoring_buff_exon_ids;		// size is : MAX_HIT_NUMBER *2
		int scoring_count = 0,  score_x1;


		if( global_context -> need_calculate_overlap_len ){
			int end1, end2, hit_x1, hit_x2;
			char ** scoring_gap_chros = thread_context -> scoring_buff_gap_chros;
			unsigned int * scoring_gap_starts = thread_context -> scoring_buff_gap_starts; // size is : MAX_HIT_NUMBER *2;
			unsigned short * scoring_gap_lengths = thread_context -> scoring_buff_gap_lengths; 	// size is : MAX_HIT_NUMBER *2*  global_context -> max_M*2

			// marks the hits that were already merged into a group
			char used_hit1 [nhits1];
			char used_hit2 [nhits2];

			// Step A (optional): drop the exons of which the read does not cover enough bases.
			// The thresholds and the overlap are multiplied by 10000 so that the fractional
			// threshold can be compared with integers.
			if( global_context ->  fractional_minimum_feature_overlapping > 1E-10 || global_context -> max_missing_bases_in_feature >= 0){
				memset(used_hit1 , 0 , nhits1);
				memset(used_hit2 , 0 , nhits2);
				for(end1 = 0; end1 < global_context -> is_paired_end_mode_assign + 1 ; end1++){
					int allhits = end1?nhits2:nhits1;
					srInt_64 * hits_indices_X1 = end1?hits_indices2:hits_indices1;
					char * used_hit_X1 = end1?used_hit2:used_hit1;

					for(hit_x1 = 0; hit_x1 < allhits; hit_x1++){
						if(used_hit_X1[hit_x1])continue;

						srInt_64 tested_exon_id = hits_indices_X1[hit_x1];
						srInt_64 exon_span = global_context -> exontable_stop[tested_exon_id] +1;
						exon_span -= global_context -> exontable_start[tested_exon_id];

						srInt_64 applied_overlapping_threshold_frac = 0, applied_overlapping_threshold_missing = 0;
						if(global_context -> max_missing_bases_in_feature >= 0){
							if(exon_span <= global_context -> max_missing_bases_in_feature) applied_overlapping_threshold_missing = 0;
							else applied_overlapping_threshold_missing = 10000L * (exon_span - global_context -> max_missing_bases_in_feature);
						}

						applied_overlapping_threshold_frac = (srInt_64)(exon_span *10000.* global_context ->  fractional_minimum_feature_overlapping + 0.9999);

						// the stricter of the two thresholds is applied
						srInt_64 applied_overlapping_threshold = max(applied_overlapping_threshold_frac , applied_overlapping_threshold_missing);

						scoring_gap_chros[0 ] = (end1?hits_chro2:hits_chro1)[hit_x1];
						scoring_gap_starts[0 ] = (end1?hits_start_pos2:hits_start_pos1)[hit_x1];
						scoring_gap_lengths[0 ] = (end1?hits_length2:hits_length1)[hit_x1];
						int gaps=1;

						// gather every unused hit (from both ends) on the same exon; the tested hit
						// itself is not yet marked as used, so it is gathered again in this loop
						for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
							int allhits2 = end2?nhits2:nhits1;
							char * used_hit_X2 = end2?used_hit2:used_hit1;
							srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;


							for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
								if(used_hit_X2[hit_x2]) continue;
								srInt_64 other_exon_id =  hits_indices_X2[hit_x2];
								if(other_exon_id == tested_exon_id){
									used_hit_X2[ hit_x2 ]=1;
									scoring_gap_chros[ gaps ] = (end2?hits_chro2:hits_chro1)[hit_x2];
									scoring_gap_starts[ gaps ] = (end2?hits_start_pos2:hits_start_pos1)[hit_x2];
									scoring_gap_lengths[ gaps ] = (end2?hits_length2:hits_length1)[hit_x2];
									gaps ++;
								}
							}
						}


						srInt_64 tested_exon_overlap_any_read = 10000L*calc_score_overlaps(global_context, thread_context, scoring_gap_chros, scoring_gap_starts, scoring_gap_lengths, gaps, read_name);
						if(applied_overlapping_threshold > tested_exon_overlap_any_read){
							// remove this exon from lists

							for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
								int allhits2 = end2?nhits2:nhits1;
								srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;

								for(hit_x2 = 0; hit_x2 < allhits2; hit_x2++){
									srInt_64 other_exon_id =  hits_indices_X2[hit_x2];
									if(other_exon_id == tested_exon_id){
										// -1 marks a removed hit; it is skipped in step B
										hits_indices_X2[hit_x2] = -1;
									}
								}
							}
						}
					}
				}
			}

			// Step B: group the remaining hits by exon (or by gene in gene-level mode) and
			// create one candidate in the voting table per group.
			memset(used_hit1 , 0 , nhits1);
			memset(used_hit2 , 0 , nhits2);

			for(end1 = 0; end1 < global_context -> is_paired_end_mode_assign + 1 ; end1++){
				srInt_64 * hits_indices_X1 = end1?hits_indices2:hits_indices1;
				char * used_hit_X1 = end1?used_hit2:used_hit1;
				int nhit_X1 = end1?nhits2:nhits1;

				for( hit_x1 = 0 ; hit_x1 < nhit_X1; hit_x1 ++ ){
					if(used_hit_X1[hit_x1])continue;

					int gaps = 0;
					srInt_64 tmp_exon_id = hits_indices_X1[hit_x1];
					if(tmp_exon_id < 0) continue;
					srInt_64 score_merge_key;
					if (global_context -> is_gene_level )
						score_merge_key = global_context -> exontable_geneid[tmp_exon_id];
					else	score_merge_key = tmp_exon_id;


					scoring_gap_chros[0 ] = (end1?hits_chro2:hits_chro1)[hit_x1];
					scoring_gap_starts[0 ] = (end1?hits_start_pos2:hits_start_pos1)[hit_x1];
					scoring_gap_lengths[0 ] = (end1?hits_length2:hits_length1)[hit_x1];

					gaps=1;

					scoring_flags[scoring_count] = end1?2:1;
					scoring_numbers[scoring_count] =1;
					scoring_exon_ids[scoring_count] = tmp_exon_id;

					used_hit_X1[ hit_x1 ]=1;

					for(end2 = 0; end2 < global_context -> is_paired_end_mode_assign + 1 ; end2++){
						srInt_64 * hits_indices_X2 = end2?hits_indices2:hits_indices1;
						char * used_hit_X2 = end2?used_hit2:used_hit1;
						int nhit_X2 = end2?nhits2:nhits1;

						for( hit_x2 = 0 ; hit_x2 < nhit_X2; hit_x2 ++ ){
							if(used_hit_X2[hit_x2])continue;
							if(hits_indices_X2[hit_x2] < 0) continue;

							srInt_64 X2_merge_key;
							if (global_context -> is_gene_level )
								X2_merge_key = global_context -> exontable_geneid[ hits_indices_X2[hit_x2] ];
							else	X2_merge_key = hits_indices_X2[hit_x2];

							if( X2_merge_key == score_merge_key ){
								used_hit_X2[ hit_x2 ]=1;
								scoring_gap_chros[ gaps ] = (end2?hits_chro2:hits_chro1)[hit_x2];
								scoring_gap_starts[ gaps ] = (end2?hits_start_pos2:hits_start_pos1)[hit_x2];
								scoring_gap_lengths[ gaps ] = (end2?hits_length2:hits_length1)[hit_x2];

								// each end adds at most one vote to a candidate
								if((scoring_flags[scoring_count] & (end2?2:1))== 0 ){
									scoring_flags[scoring_count] |= end2?2:1;
									scoring_numbers[scoring_count] ++;
								}
								gaps ++;
							}
						}
					}

					scoring_overlappings [scoring_count] = calc_score_overlaps(global_context, thread_context, scoring_gap_chros, scoring_gap_starts, scoring_gap_lengths, gaps, read_name);
					// with use_overlapping_break_tie the overlapped bases replace the vote count as the score
					if( global_context -> use_overlapping_break_tie )
						scoring_numbers[scoring_count] = scoring_overlappings [scoring_count];
					scoring_count++;
				}
			}
		}else{
			// No overlap length required: one candidate per exon (or gene), scored by the
			// number of ends that hit it. scoring_overlappings is not filled in this branch.
			int ends;
			for(ends =0 ; ends < global_context -> is_paired_end_mode_assign + 1 ; ends++){
				int nhits = ends?nhits2:nhits1;
				srInt_64 * hits_indices = ends?hits_indices2:hits_indices1;

				int hit_x1;
				for(hit_x1 = 0; hit_x1 < nhits; hit_x1++){
					srInt_64 tmp_exon_id = hits_indices[hit_x1], score_merge_key;
					int found = 0;
					if (global_context -> is_gene_level )
						score_merge_key = global_context -> exontable_geneid[tmp_exon_id];
					else	score_merge_key = tmp_exon_id;

					// linear search for an existing candidate with the same key
					for(score_x1 = 0; score_x1 < scoring_count; score_x1 ++){
						srInt_64 score_x1_key ;
						if (global_context -> is_gene_level )
							score_x1_key = global_context -> exontable_geneid[ scoring_exon_ids[score_x1] ];
						else	score_x1_key = scoring_exon_ids[score_x1] ;

						if( score_x1_key == score_merge_key ){
							if((scoring_flags[score_x1] & ( ends?2:1 )) == 0) {
								scoring_flags[score_x1] |= (ends?2:1);
								scoring_numbers[score_x1] ++;
							}

							found = 1;
							break;
						}
					}

					if(0 == found){
						scoring_exon_ids[scoring_count] = tmp_exon_id;
						scoring_flags[scoring_count] = ends?2:1;
						scoring_numbers[scoring_count] = 1;

						scoring_count++;
					}
				}
			}
		}


		int maximum_score = 0;
		int maximum_total_count = 0;
		int maximum_score_x1 = 0;
		// Minimum overlap required between the fragment and a candidate, multiplied by
		// 10000 like the overlap it is compared with. The value 1 means "no threshold".
		srInt_64 applied_fragment_minimum_overlapping_overlap = 1, applied_fragment_minimum_overlapping_missing = 1;
		srInt_64 applied_fragment_minimum_overlapping = 1;
		int overlapping_total_count = 0;

		if( global_context -> fragment_minimum_overlapping > 1 ||  global_context -> need_calculate_fragment_len || global_context -> max_missing_bases_in_read >= 0){
			if(global_context -> max_missing_bases_in_read >=0){
				if(total_frag_len <= global_context -> max_missing_bases_in_read) applied_fragment_minimum_overlapping_missing = 0;
				else applied_fragment_minimum_overlapping_missing = 10000L * (total_frag_len - global_context -> max_missing_bases_in_read);
			}

			applied_fragment_minimum_overlapping_overlap = max( 10000L * global_context -> fragment_minimum_overlapping, 10000. * global_context -> fractional_minimum_overlapping * total_frag_len + 0.9999);

			applied_fragment_minimum_overlapping = max(applied_fragment_minimum_overlapping_overlap , applied_fragment_minimum_overlapping_missing);
		}

		if(scoring_count == 0){
			// no candidate at all
			if(global_context -> read_details_out_FP)
				write_read_details_FP(global_context, thread_context,"Unassigned_NoFeatures",-1, NULL, bin1, bin2);
			if(RG_name){
				void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
				fc_read_counters * sumtab = tab4s[1];
				sumtab -> unassigned_nofeatures++;
			}else thread_context->read_counters.unassigned_nofeatures ++;

			if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
		}else{
				// Find the highest score and how many candidates share it; the candidates
				// below the minimum fragment overlap get score 0 and are left out.
				for(score_x1 = 0; score_x1 < scoring_count ; score_x1++){
//					#warning "======= DEBUG OUT ================"
					if(0 && FIXLENstrcmp("V0112_0155:7:1101:20072:12961", read_name)==0)
						SUBREADprintf("READ: %s  FRAG_LEN=%d,  THIS_OVERLAP=%d\n", read_name, total_frag_len, scoring_overlappings[score_x1]);
					if( applied_fragment_minimum_overlapping > 1 )
						if( applied_fragment_minimum_overlapping > 10000L*scoring_overlappings[score_x1] ){
							scoring_numbers[score_x1] = 0;
							continue;
						}

					if( maximum_score < scoring_numbers[score_x1] ){
						maximum_total_count = 1;
						maximum_score = scoring_numbers[score_x1];
						maximum_score_x1 = score_x1;
					}else if( maximum_score == scoring_numbers[score_x1] )
						maximum_total_count++;
					overlapping_total_count ++;
				}

				if(maximum_total_count == 0){
					// every candidate was removed by the minimum-overlap threshold
					if(global_context -> read_details_out_FP)
						write_read_details_FP(global_context, thread_context,"Unassigned_Overlapping_Length", -1, NULL, bin1, bin2);

					if(RG_name){
						void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
						fc_read_counters * sumtab = tab4s[1];
						sumtab -> unassigned_overlapping_length++;
					}else thread_context->read_counters.unassigned_overlapping_length ++;

					if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
				}else{

					// final adding votes.
					if(1 == maximum_total_count && !global_context -> is_multi_overlap_allowed) {
						// simple add to the exon ( EXON_ID = decision_table_exon_ids[maximum_decision_no])
						srInt_64 max_exon_id = scoring_exon_ids[maximum_score_x1];

						if(RG_name){
							void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
							fc_read_counters * sumtab = tab4s[1];
							sumtab -> assigned_reads++;

							read_count_type_t * count_table = tab4s[0];
							count_table[max_exon_id] += fixed_fractional_count;
						}else{
							thread_context->count_table[max_exon_id] += fixed_fractional_count;
							thread_context->read_counters.assigned_reads ++;
						}
						thread_context->nreads_mapped_to_exon++;
						if(global_context -> read_details_out_FP) {
							int final_gene_number = global_context -> exontable_geneid[max_exon_id];
							char * final_feture_name = (char *)global_context -> gene_name_array[final_gene_number];
							write_read_details_FP(global_context, thread_context,"Assigned", 1, final_feture_name, bin1, bin2);
						}

						if(global_context -> do_scRNA_table){
							srInt_64 assignment_target_number = max_exon_id;
							if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[max_exon_id];
							add_scRNA_read_to_pool(global_context, thread_context, assignment_target_number, read_name, bin1, NULL);
						}
					}else if(global_context -> is_multi_overlap_allowed) {
						// Multi-overlap mode: the read is counted for every remaining candidate
						// (only the top-scored ones when use_overlapping_break_tie is set).
						#define GENE_NAME_LIST_BUFFER_SIZE (FEATURE_NAME_LENGTH * 50)

						char final_feture_names[GENE_NAME_LIST_BUFFER_SIZE];
						int assigned_no = 0, xk1;
						final_feture_names[0]=0;
						int is_etc = 0;	// number of names that did not fit into final_feture_names

						ArrayList * assigned_list = NULL;
						if(global_context -> do_scRNA_table)assigned_list = ArrayListCreate(20);
						for(xk1 = 0; xk1 < scoring_count; xk1++)
						{

							// This change was made on 31/MAR/2016
							if( scoring_numbers[xk1] < 1 ) continue ;
							if( scoring_numbers[xk1] < maximum_score && global_context -> use_overlapping_break_tie ) continue ;

							srInt_64 tmp_voter_id = scoring_exon_ids[xk1];

							srInt_64 assignment_target_number = tmp_voter_id;
							if(global_context->is_gene_level) assignment_target_number = global_context -> exontable_geneid[tmp_voter_id];

							// the target number is stored in the list as a pointer offset from NULL
							if(global_context -> do_scRNA_table)ArrayListPush(assigned_list, NULL+assignment_target_number);
							//if(1 && FIXLENstrcmp( read_name , "V0112_0155:7:1101:5467:23779#ATCACG" )==0)
							//	SUBREADprintf("CountsFrac = %d ; add=%d\n", overlapping_total_count, calculate_multi_overlap_fraction(global_context, fixed_fractional_count, overlapping_total_count) );
							if(RG_name){
								void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
								read_count_type_t * count_table = tab4s[0];
								count_table[tmp_voter_id] += calculate_multi_overlap_fraction(global_context, fixed_fractional_count, overlapping_total_count);
							}else thread_context->count_table[tmp_voter_id] += calculate_multi_overlap_fraction(global_context, fixed_fractional_count, overlapping_total_count);

							if(global_context -> read_details_out_FP) {
								// a name is appended only while room for one more name plus the
								// "names omitted" suffix is left in the buffer
								if(strlen(final_feture_names)< (GENE_NAME_LIST_BUFFER_SIZE - 40 - FEATURE_NAME_LENGTH)) {
									int final_gene_number = global_context -> exontable_geneid[tmp_voter_id];
									unsigned char * final_feture_name = global_context -> gene_name_array[final_gene_number];
									strncat(final_feture_names, (char *)final_feture_name, GENE_NAME_LIST_BUFFER_SIZE-1);
									strncat(final_feture_names, ",", GENE_NAME_LIST_BUFFER_SIZE-1);
								}else{
									is_etc ++;
								}
								assigned_no++;
							}
						}

						if(global_context -> do_scRNA_table && assigned_list->numOfElements>0)
							add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, assigned_list);

						if(assigned_list)ArrayListDestroy(assigned_list);

						if(is_etc) sprintf(final_feture_names + strlen(final_feture_names), "... (%d names ommited),", is_etc);
						final_feture_names[GENE_NAME_LIST_BUFFER_SIZE-1]=0;

						if(RG_name){
							void ** tab4s = get_RG_tables(global_context, thread_context, RG_name);
							fc_read_counters * sumtab = tab4s[1];
							sumtab -> assigned_reads++;
						}else{
							thread_context->read_counters.assigned_reads ++;
						}
						thread_context->nreads_mapped_to_exon++;

						if(global_context -> read_details_out_FP) {
							// remove the trailing comma of the name list
							int ffnn = strlen(final_feture_names);
							if(ffnn>0) final_feture_names[ffnn-1]=0;
							// overlapped but still assigned
							write_read_details_FP(global_context, thread_context, "Assigned", assigned_no, final_feture_names, bin1, bin2);
						}
					} else {
						// more than one top-scored candidate and multi-overlap is not allowed
						if(global_context -> read_details_out_FP)
							write_read_details_FP(global_context, thread_context,"Unassigned_Ambiguity", -1, NULL, bin1, bin2);
						if(RG_name){
							fc_read_counters * sumtab = get_RG_tables(global_context, thread_context, RG_name)[1];
							sumtab -> unassigned_ambiguous++;
						}else{
							thread_context->read_counters.unassigned_ambiguous ++;
						}

						if(global_context -> do_scRNA_table) add_scRNA_read_to_pool(global_context, thread_context, -1, read_name, bin1, NULL);
					}
				}
		}
	}
}
5292 
// HashTableIteration callback: folds one entry of a per-thread, per-gene table into the
// merged per-gene structures.
//   ky  : packed key, stored plus one. Its low 32 bits are a thread-local UMI number; the
//         upper 32 bits are carried over unchanged (presumably the cell barcode number).
//   val : number of reads supporting this entry, stored as a pointer-sized integer.
//   tab : the table being iterated. appendix1 is the int array mapping thread-local UMI
//         numbers to global ones, appendix2 the merged (gene+1) -> list-of-keys table,
//         appendix3 the merged (gene+1) -> (key -> reads) table, counter1 the gene number.
void scRNA_merge_thread_reads_in(void *ky, void *val, HashTable * tab){
	int * local_to_global_umi = tab->appendix1;
	HashTable * gene_to_keylist_tab = tab->appendix2;
	HashTable * gene_to_reads_tab = tab->appendix3;

	srInt_64 gene_idx = tab->counter1;
	void * gene_key = NULL+1+gene_idx;

	// swap the thread-local UMI number in the low half for its global number
	srInt_64 packed_local = (ky - NULL-1);
	srInt_64 packed_global = (packed_local & 0xffffffff00000000llu) + local_to_global_umi[ packed_local & 0xffffffff ];
	void * bc_umi_key = NULL+packed_global +1;

	// the key is appended on every call; no de-duplication happens here
	ArrayList * key_list = HashTableGet(gene_to_keylist_tab, gene_key);
	if(!key_list){
		key_list = ArrayListCreate(10);
		HashTablePut(gene_to_keylist_tab, gene_key, key_list);
	}
	ArrayListPush(key_list, bc_umi_key);

	HashTable * key_to_reads = HashTableGet(gene_to_reads_tab, gene_key);
	if(!key_to_reads){
		key_to_reads = HashTableCreate(10000);
		HashTablePut(gene_to_reads_tab, gene_key, key_to_reads);
	}

	// accumulate the supporting reads under the global key
	int added_reads = val-NULL;
	int total_reads = HashTableGet(key_to_reads, bc_umi_key) - NULL;
	total_reads += added_reads;
	HashTablePut(key_to_reads, bc_umi_key, NULL+total_reads);
}
5320 
// HashTableIteration callback over a per-thread (gene+1) -> table mapping.
// Copies the three appendix pointers of the outer table onto the per-gene table, records the
// 0-based gene number in its counter1, and merges every entry of the per-gene table through
// scRNA_merge_thread_reads_in.
//   ky  : gene number plus one, stored as a pointer-sized integer.
//   val : the per-gene HashTable to be merged.
void scRNA_merge_thread_reads(void *ky, void *val, HashTable * tab){
	HashTable * per_gene_tab = val;

	// the inner callback reads its context from the table it iterates over
	per_gene_tab -> appendix1 = tab->appendix1;
	per_gene_tab -> appendix2 = tab->appendix2;
	per_gene_tab -> appendix3 = tab->appendix3;
	per_gene_tab -> counter1 = ky-NULL -1;

	HashTableIteration(per_gene_tab, scRNA_merge_thread_reads_in);
}
5336 
// HashTableIteration callback over a per-thread UMI table (UMI string -> local number + 1).
// Looks the UMI string up in the merged table, registering it with the next free global
// number when it is new, and records local number -> global number in the int array held
// in tab->appendix1.
//   tab->appendix2 : merged table, UMI string -> global number + 1.
//   tab->appendix3 : merged list, global number -> UMI string (shares the strdup'ed key).
void scRNA_merge_thread_umitables(void *ky, void *val, HashTable * tab){
	int * local_to_global = tab->appendix1;
	HashTable * global_umi_tab = tab->appendix2;
	ArrayList * global_umi_list = tab->appendix3;
	char * umi_seq = ky;

	int local_idx = val-NULL-1;
	assert(local_idx >= 0);

	int global_idx = HashTableGet(global_umi_tab , umi_seq)-NULL-1;
	if(global_idx >= 0){
		// already known globally
		local_to_global[ local_idx ] = global_idx;
		return;
	}

	// first sighting: the copy is owned by the merged table and list from here on
	char * owned_seq = strdup(umi_seq);
	global_idx = global_umi_tab -> numOfElements;
	HashTablePut(global_umi_tab, owned_seq, NULL+global_idx+1);

	assert(global_umi_list -> numOfElements == global_idx);
	ArrayListPush(global_umi_list, owned_seq);

	local_to_global[ local_idx ] = global_idx;
}
5358 
// NOTE(review): presumably the minimum numbers of UMIs for a cell / a gene to count as
// expressed; neither macro is referenced in the functions that follow — confirm at the use sites.
#define MIN_EXPRESSED_UMIS_PER_CELL 100
#define MIN_EXPRESSED_UMIS_PER_GENE (3-2)	// evaluates to 1
5361 
// HashTableIteration callback: adds the length of the list stored under a key to the
// running total kept for the same key in the table held in tab->appendix2.
//   ky : the key, used unchanged in both tables.
//   va : ArrayList whose element count is added.
void scRNA_merge_write_copy_gene_nos(void * ky, void * va , HashTable *tab){
	ArrayList * entries_for_key = va;
	HashTable * totals_tab = tab -> appendix2;

	srInt_64 running_total = HashTableGet(totals_tab, ky)-NULL;
	running_total += entries_for_key -> numOfElements;
	HashTablePut(totals_tab, ky, NULL + running_total);
}
// Writes one "\t0" field into linebuf for every element of high_confid_barcode_index_list
// and returns the number of characters written (the terminating NUL is not counted).
// The caller must supply a buffer large enough; global_context is not used.
int scRNA_merge_write_zero_gene(fc_thread_global_context_t * global_context, char * linebuf, ArrayList * high_confid_barcode_index_list){
	char * write_at = linebuf;
	srInt_64 fields_left = high_confid_barcode_index_list->numOfElements;

	while(fields_left-- > 0)
		write_at += sprintf(write_at, "\t0");

	return (int)(write_at - linebuf);
}
5375 
//#warning "======== SCRNA_ALLOWED_MAX_HAMMING_DIFF IS ZERO !! ========"
// NOTE(review): presumably the largest Hamming distance at which two sequences are still
// treated as the same; the macro is not referenced in the functions that follow — confirm at the use sites.
#define SCRNA_ALLOWED_MAX_HAMMING_DIFF 1
5378 
scRNA_reduce_cellno_compare(void * arr,int l,int r)5379 int scRNA_reduce_cellno_compare(void * arr, int l, int r){
5380 	void **sd = arr;
5381 	ArrayList * cellno_umino_p1_list = sd[0];
5382 	HashTable * cellno_umino_p1_to_reads_tab = sd[4];
5383 	srInt_64 off = sd[1]-NULL;
5384 
5385 	srInt_64 bc_umi_p1_L = ArrayListGet(cellno_umino_p1_list, off+l) - NULL;
5386 	srInt_64 bc_umi_p1_R = ArrayListGet(cellno_umino_p1_list, off+r) - NULL;
5387 	int nreads_L = HashTableGet(cellno_umino_p1_to_reads_tab, NULL+bc_umi_p1_L) - NULL;
5388 	int nreads_R = HashTableGet(cellno_umino_p1_to_reads_tab, NULL+bc_umi_p1_R) - NULL;
5389 
5390 	if(nreads_L<1 || nreads_R<1) SUBREADprintf("ERROR: No known read counts: %d, %d\n", nreads_L, nreads_R);
5391 	if(nreads_L>nreads_R) return -1;
5392 	if(nreads_L<nreads_R) return 1;
5393 
5394 	srInt_64 umiLno = (bc_umi_p1_L-1) & 0xffffffff;
5395 	srInt_64 umiRno = (bc_umi_p1_R-1) & 0xffffffff;
5396 	ArrayList * merged_umi_no_to_seq = sd[3];
5397 	char * umiLseq = ArrayListGet(merged_umi_no_to_seq, umiLno);
5398 	char * umiRseq = ArrayListGet(merged_umi_no_to_seq, umiRno);
5399 	return strcmp(umiLseq, umiRseq);
5400 }
5401 
scRNA_reduce_cellno_exchange(void * arr,int l,int r)5402 void scRNA_reduce_cellno_exchange(void * arr, int l, int r){
5403 	void **sd = arr;
5404 	ArrayList * cellno_umino_p1_list = sd[0];
5405 	srInt_64 off = sd[1]-NULL;
5406 
5407 	void* ti = cellno_umino_p1_list->elementList[off+l];
5408 	cellno_umino_p1_list->elementList[off+l] = cellno_umino_p1_list->elementList[off+r];
5409 	cellno_umino_p1_list->elementList[off+r] = ti;
5410 }
5411 
scRNA_reduce_cellno_merge(void * arr,int start,int items,int items2)5412 void scRNA_reduce_cellno_merge(void * arr, int start, int items, int items2){
5413 	void **sd = arr;
5414 	ArrayList * cellno_umino_p1_list = sd[0];
5415 	srInt_64 off = sd[1]-NULL;
5416 
5417 	void ** tmpelem=malloc(sizeof(void*)*(items+items2));
5418 	int i1_cursor = start, i2_cursor = items + start, tmp_cursor=0;
5419 	while(1){
5420 		if(i1_cursor == items + start && i2_cursor == items + items2 + start )break;
5421 		int select_items_1 = (i2_cursor == start + items + items2) || (i1_cursor < items + start && scRNA_reduce_cellno_compare(arr, i1_cursor, i2_cursor) <= 0);
5422 
5423 		if(select_items_1)
5424 			tmpelem[tmp_cursor++] = cellno_umino_p1_list->elementList[off+(i1_cursor++)];
5425 		else
5426 			tmpelem[tmp_cursor++] = cellno_umino_p1_list->elementList[off+(i2_cursor++)];
5427 	}
5428 
5429 	memcpy(cellno_umino_p1_list -> elementList+off+start, tmpelem, sizeof(void*)*(items+items2));
5430 	free(tmpelem);
5431 }
5432 
// Uncommenting the next line compiles the DEBUG_FOR_EXACT blocks in the functions below;
// those blocks read and write fixed files under /tmp.
// #define DEBUG_FOR_EXACT
// A barcode outside the high-confidence list needs at least this many UMIs to be kept as a rescue candidate.
#define MIN_UMIS_FOR_CANDIDATE_RESCUE 500
// A rescue candidate must also reach this fraction (rounded to an integer UMI count) of the
// median UMI count of the high-confidence barcodes.
#define SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION 0.01
// Selects, for one sample, the barcodes that are candidates for ambient rescue and the
// barcodes sitting at positions 45000..89999 of the sorted barcode list.
//   cellnoP1_to_umis_tab               : (barcode number + 1) -> UMI count of that barcode.
//   highconf_cellbc_list               : barcodes already selected as high-confidence.
//   this_sample_ambient_rescure_candi  : output; receives (key - 1) of every candidate.
//   this_sample_45k_90k_barcode_no_P0  : output; receives (key - 1) for positions 45000..89999.
//   global_context, cellP1_to_geneP1_to_umis_tab : not referenced by this function.
// NOTE(review): the code relies on HashTableSortedIndexes(tab, 1) returning the keys ordered by
// decreasing UMI count, and on ArrayListToLookupTable_Int making a list element findable under
// (element + 1) — confirm both in their definitions.
void scRNA_merged_ambient_rescure(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * this_sample_45k_90k_barcode_no_P0, ArrayList * this_sample_ambient_rescure_candi, ArrayList * highconf_cellbc_list){
	ArrayList * sorted_bcno_p1 = HashTableSortedIndexes( cellnoP1_to_umis_tab, 1);
	HashTable * highconf_cellbc_list_tab = ArrayListToLookupTable_Int(highconf_cellbc_list);
	srInt_64 x1, high_conf_cells = 0;
	// count the leading run of sorted barcodes that are in the high-confidence list
	for(x1=0; x1 < sorted_bcno_p1 -> numOfElements; x1++){
		void * this_bc_pnt = ArrayListGet(sorted_bcno_p1 ,  x1);
		if(HashTableGet(highconf_cellbc_list_tab, this_bc_pnt)) high_conf_cells = x1+1;
		else break; // assuming that all high-UMI barcodes are high-confidence, this makes x1 the total number of high-confidence barcodes.
	}
	#ifdef DEBUG_FOR_EXACT
	#warning "============= EXT 1 ==========="
	FILE * tfp = fopen("/tmp/del4-YangLiao-rescue-cand.txt","w");
	#endif
	if(high_conf_cells >0){
		// UMI count of the barcode in the middle of the high-confidence run, and the rounded
		// fraction of it that a candidate has to reach
		srInt_64 median_umis = HashTableGet(cellnoP1_to_umis_tab, ArrayListGet(sorted_bcno_p1 ,  (high_conf_cells-1)/2))-NULL;
		srInt_64 median_umis_001_cut = (srInt_64)(median_umis *1. *SCRNA_AMBIENT_RESCURE_MEDIAN_FRACTION +0.50000001);
		for(x1=0; x1 < sorted_bcno_p1 -> numOfElements; x1++){
			void * this_bc_pnt_p1 = ArrayListGet(sorted_bcno_p1 ,  x1);
			if(HashTableGet(highconf_cellbc_list_tab, this_bc_pnt_p1)){
				continue; // it is in high-conf list
			}
			srInt_64 this_bc_umis = HashTableGet(cellnoP1_to_umis_tab, this_bc_pnt_p1) - NULL;
			// the scan stops at the first barcode that fails any of the three limits
			if(this_bc_umis < median_umis_001_cut) break;
			if(this_bc_umis < MIN_UMIS_FOR_CANDIDATE_RESCUE) break;
			if(x1 >= 45000) break;
			ArrayListPush(this_sample_ambient_rescure_candi, this_bc_pnt_p1-1);
		}
		#ifdef DEBUG_FOR_EXACT
		#warning "============= EXT 2 ==========="
		// NOTE(review): used_cell_barcode_tab is not declared in this function, so this
		// debug block does not compile as written.
		for(x1=0; x1<this_sample_ambient_rescure_candi->numOfElements; x1++){
			int this_bc_no_p0 = ArrayListGet(this_sample_ambient_rescure_candi, x1)-NULL;
			srInt_64 this_bc_umis = HashTableGet(used_cell_barcode_tab, NULL+this_bc_no_p0+1) - NULL;
			fprintf(tfp,"CAND %d %d\n", this_bc_no_p0+1, this_bc_umis);
		}
		#endif
	}
	// barcodes at sorted positions 45000..89999, stored as (key - 1)
	for(x1=45000; x1 < sorted_bcno_p1 -> numOfElements; x1++){
		if(x1 >= 90000) break;
		ArrayListPush(this_sample_45k_90k_barcode_no_P0, ArrayListGet(sorted_bcno_p1 ,  x1)-1 );
		#ifdef DEBUG_FOR_EXACT
		#warning "============= EXT 3 ==========="
		int this_bc_no_p1 = ArrayListGet(sorted_bcno_p1, x1)-NULL;
		int this_bc_umis = HashTableGet(used_cell_barcode_tab, NULL+this_bc_no_p1) - NULL;
		fprintf(tfp,"45K90K %d %d\n", this_bc_no_p1, this_bc_umis);
		#endif
	}
	ArrayListDestroy(sorted_bcno_p1);
	HashTableDestroy(highconf_cellbc_list_tab);
	#ifdef DEBUG_FOR_EXACT
	#warning "============= EXT 4 ==========="
	fclose(tfp);

	// debug only: overwrite both output lists with barcode numbers read back from a file
	FILE * fp = fopen("/tmp/del4-YangLiao-from-python-rescue.txt","r");

	x1=0;
	while(1){
		char * tpm=NULL;
		char fl[100];
		char * fr = fgets(fl, 99, fp);
		if(!fr) break;
		if(fl[0]!='4') continue;
		int bc_no = atoi(fl+7) -1;
		this_sample_45k_90k_barcode_no_P0 -> elementList[x1++] = NULL+bc_no;
		if(x1 >= 45000)break;
	}
	fclose(fp);
	fp = fopen("/tmp/del4-YangLiao-from-python-rescue.txt","r");

	x1=0;
	this_sample_ambient_rescure_candi -> numOfElements = 0;
	while(1){
		char * tpm=NULL;
		char fl[100];
		char * fr = fgets(fl, 99, fp);
		if(!fr) break;
		if(fl[0]!='C') continue;
		int bc_no = atoi(fl+5) -1;
		ArrayListPush(this_sample_ambient_rescure_candi, NULL+bc_no);
	}
	fclose(fp);
	#endif
}
5518 
5519 
// In each bootstrap round the reference UMI count is read this many positions before the
// end of the sorted resampled list.
#define SCRNA_BOOTSTRAP_HIGH_INDEX 30
// Number of resampling rounds whose results are averaged.
#define SCRNA_BOOTSTRAP_SAMPLING_TIMES 100
5522 
5523 
// Fills highconf_cellbc_list with the high-confidence barcodes of one sample and returns the
// UMI count of the last barcode that was added.
//   cellnoP1_to_umis_tab : (barcode number + 1) -> UMI count of that barcode.
//   highconf_cellbc_list : output; receives (key - 1) of every selected barcode.
//   cellP1_to_geneP1_to_umis_tab : not referenced by this function.
// Two modes, chosen by global_context->scRNA_umi_cutoff:
//   >= 0 : every barcode whose UMI count reaches the cutoff (with a 0.1 tolerance) is selected;
//          the function returns -1 when none does.
//   <  0 : the number of barcodes to select is estimated by SCRNA_BOOTSTRAP_SAMPLING_TIMES
//          resampling rounds and that many barcodes are taken from the top of the sorted list.
// NOTE(review): the code relies on HashTableSortedIndexes(tab, 1) returning the keys ordered by
// decreasing UMI count and on ArrayListSort(list, NULL) sorting in ascending order — confirm.
int scRNA_merged_bootstrap_a_sample(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * highconf_cellbc_list){
	ArrayList * sorted_idx = HashTableSortedIndexes( cellnoP1_to_umis_tab, 1);
	srInt_64 x2, x1;
	float scRNA_umi_cutoff = global_context -> scRNA_umi_cutoff;

	// NOTE(review): the macro body ends with ';', so it only expands cleanly where a
	// statement may end; it also stays defined for the rest of the file.
	#define SCRNA_IDX_PRIME_NUMBER_BIG 11218439llu;
	srInt_64 this_total = 0, seed_rand = sorted_idx -> numOfElements/2;

	#ifdef DEBUG_FOR_EXACT
	#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
	// NOTE(review): used_cell_barcode_tab is not declared in this function, so the debug
	// blocks do not compile as written.
	ArrayListSort(sorted_idx, NULL);
	FILE * dfp = fopen("/tmp/del4-YangLiao-for-resample.txt","w");
	for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
		int bc_no_p1 = ArrayListGet(sorted_idx, x2)-NULL;
		int bc_umis = HashTableGet(used_cell_barcode_tab, NULL+bc_no_p1) - NULL;
		fprintf(dfp,"%d\t%d\t%s\n", bc_no_p1, bc_umis, ArrayListGet(global_context -> scRNA_cell_barcodes_array, bc_no_p1-1));
	}
	fclose(dfp);
	system("python /usr/local/work/liao/subread/scripts/Cellranger-replicate/CrepPY-resample.py");
	FILE * rfp = fopen("/tmp/del4-YangLiao-from-resample.txt","r");
	#endif


	int last_umi_no= -1;
	if(scRNA_umi_cutoff >= 0.0){
		// fixed-cutoff mode
		for(x1 = 0; x1 < sorted_idx -> numOfElements ; x1++){
			void * cellbc_p1_ptr = ArrayListGet(sorted_idx,x1);
			srInt_64 this_umis = HashTableGet(cellnoP1_to_umis_tab, cellbc_p1_ptr )-NULL;
			if(this_umis >= scRNA_umi_cutoff-0.1){
				ArrayListPush(highconf_cellbc_list, ArrayListGet( sorted_idx, x1 ) - 1 );
				last_umi_no = this_umis;
			}else break;	// #UMI-sorted so no need to scan more
		}
	}else{
		// bootstrap mode: this_total accumulates, over all rounds, how many resampled values
		// reach one tenth of the round's reference value
		for(x1 = 0; x1 < SCRNA_BOOTSTRAP_SAMPLING_TIMES; x1++){
			ArrayList * resampled_list_of_umis = ArrayListCreate( sorted_idx->numOfElements );

			#ifdef DEBUG_FOR_EXACT
			#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
			for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
				char fl [100];
				fgets(fl, 99, rfp);
				int bc_no_p1 = atoi(fl);
				int this_umis = HashTableGet(cellnoP1_to_umis_tab, NULL+bc_no_p1);
				ArrayListPush(resampled_list_of_umis, NULL+this_umis);
			}
			ArrayListSort(resampled_list_of_umis, NULL);
			#else
			// deterministic resampling: the index walks through sorted_idx in steps of
			// SCRNA_IDX_PRIME_NUMBER_BIG, wrapped to the list length; seed_rand is carried
			// over from one round to the next
			for(x2 = 0; x2 < sorted_idx -> numOfElements ; x2++){
				seed_rand %= sorted_idx -> numOfElements;
				void * cellbc_p1_ptr = ArrayListGet(sorted_idx, seed_rand);
				seed_rand += SCRNA_IDX_PRIME_NUMBER_BIG;
				srInt_64 this_umis = HashTableGet( cellnoP1_to_umis_tab, cellbc_p1_ptr )-NULL;
				ArrayListPush(resampled_list_of_umis,NULL+this_umis);
			}
			#endif
			ArrayListSort( resampled_list_of_umis, NULL );
			// reference value: SCRNA_BOOTSTRAP_HIGH_INDEX positions before the end of the list,
			// divided by ten and rounded.
			// NOTE(review): with fewer than SCRNA_BOOTSTRAP_HIGH_INDEX elements the index is
			// negative — confirm how ArrayListGet treats an out-of-range index.
			srInt_64 UMIs_30th_div10 = ArrayListGet(resampled_list_of_umis, resampled_list_of_umis -> numOfElements - SCRNA_BOOTSTRAP_HIGH_INDEX) -NULL;
			UMIs_30th_div10 = (srInt_64)(UMIs_30th_div10*1./10 + 0.500000001);

			// walk from the end of the list and count the values that reach the reference
			for(x2 =0; x2< resampled_list_of_umis -> numOfElements; x2++){
				srInt_64 lli = resampled_list_of_umis -> numOfElements -1 -x2;
				srInt_64 this_umis = ArrayListGet(resampled_list_of_umis, lli)-NULL;
				if(this_umis >= UMIs_30th_div10) this_total ++;
				else break;
			}
			ArrayListDestroy(resampled_list_of_umis);
		}
		// average over the rounds, rounded to the nearest integer
		double total_f = this_total*1. / SCRNA_BOOTSTRAP_SAMPLING_TIMES;
		if(0) SUBREADprintf("FINAL_5CODE SELECTION_IDX =  %.5f\n",total_f);
		this_total = (int)(total_f + 0.500000001);

		#ifdef DEBUG_FOR_EXACT
		#warning "============== THIS BUILD IS ONLY FOR DEBUGGING EXACT RESULTS !!!! ================="
		sorted_idx = HashTableSortedIndexes( used_cell_barcode_tab, 1);
		#endif

		// take that many barcodes from the top of the sorted list
		void * last_ptr =NULL;
		for(x1 = 0; x1 < min(sorted_idx -> numOfElements, this_total) ; x1++){
			last_ptr = ArrayListGet( sorted_idx, x1 );
			ArrayListPush(highconf_cellbc_list, last_ptr - 1 );
		}
		// NOTE(review): last_ptr is still NULL here when no barcode was selected — confirm
		// that HashTableGet accepts a NULL key.
		last_umi_no = HashTableGet(cellnoP1_to_umis_tab ,last_ptr)-NULL;
	}
	ArrayListDestroy(sorted_idx);
	return last_umi_no;
}
5611 
// Formats the identifier of one feature into exon_name as
//   <feature name>:fc@R@Spl:<chromosome name>:fc@R@Spl:<start>:fc@R@Spl:<end>:fc@R@Spl:<strand>
// where <strand> is 'N' when is_negative_strand is 1, 'P' when it is 0 and 'X' otherwise.
//   sorted_order                : sorted-order number of the feature.
//   sorted_order_p1_to_i_p1_tab : maps (sorted order + 1) to (index into loaded_features + 1).
//   exon_name                   : output buffer; its size is not checked here.
void build_exon_name(fc_thread_global_context_t * global_context, fc_feature_info_t * loaded_features, int sorted_order, char * exon_name, HashTable * sorted_order_p1_to_i_p1_tab){
	srInt_64 feature_idx = HashTableGet( sorted_order_p1_to_i_p1_tab , NULL+1+sorted_order )-NULL-1;
	fc_feature_info_t * feature = loaded_features + feature_idx;

	char strand_code = 'X';
	if(feature -> is_negative_strand == 1) strand_code = 'N';
	else if(feature -> is_negative_strand == 0) strand_code = 'P';

	// both names live in the shared string buffer; the chromosome name is addressed
	// relative to the feature name
	sprintf(exon_name, "%s:fc@R@Spl:%s:fc@R@Spl:%u:fc@R@Spl:%u:fc@R@Spl:%c",
	   global_context -> unistr_buffer_space + feature -> feature_name_pos,
	   global_context -> unistr_buffer_space + feature -> feature_name_pos + feature -> chro_name_pos_delta,
	   feature -> start, feature -> end, strand_code);
}
5618 
// HashTableIteration callback over a (cell+1) -> (gene+1 -> UMIs) table.
// For every gene of the visited cell it marks the gene key in the table held in
// tab->appendix1 and adds the gene's UMI count to tab->counter1.
// When tab->appendix2 is non-NULL it acts as a filter: cells whose key is absent from it
// are skipped entirely.
void scRNA_merged_write_sparse_unique_genes(void * ky, void * va, HashTable * tab){
	HashTable * cell_filter_tab = tab -> appendix2;
	int cell_key_int = ky-NULL;
	if(cell_filter_tab && !HashTableGet(cell_filter_tab, NULL+cell_key_int))return;

	HashTable * seen_genes_tab = tab -> appendix1;
	HashTable * gene_to_umis = va;
	ArrayList * gene_keys = HashTableKeys(gene_to_umis);

	int gi;
	for(gi=0; gi<gene_keys->numOfElements; gi++){
		void * gene_key = ArrayListGet(gene_keys,gi);
		if(NULL == HashTableGet(seen_genes_tab, gene_key))
			HashTablePut(seen_genes_tab, gene_key, NULL+1);
		tab -> counter1 += HashTableGet(gene_to_umis, gene_key)-NULL;
	}
	ArrayListDestroy(gene_keys);
}
5635 
// Writes the UMI counts of the selected cell barcodes of one sample as three files named
// "<input_file_name>.scRNA.<sample_index+1, 3 digits>.<tabtype>.<suffix>":
//   .BCtab   : one barcode sequence per line, in the order of used_cell_barcodes;
//   .GENEtab : one gene name (gene level) or exon identifier per line, for every gene that
//              has a count in at least one selected cell, ordered by ArrayListSort;
//   .spmtx   : a MatrixMarket-style coordinate file with one "gene cell UMIs" line per
//              non-zero count, both indices 1-based.
//   cellP1_to_geneP1_to_umis_tab : (cell+1) -> table of (gene+1) -> UMIs; its counter1,
//                                  appendix1 and appendix2 are overwritten here.
//   used_cell_barcodes           : 0-based numbers of the barcodes to write.
//   cellnoP1_to_umis_tab         : not used by this function.
// Returns 0 on success and -1 when one of the output files cannot be created.
// NOTE(review): the third number of the .spmtx size line is the sum of the UMI counts of the
// selected cells, not the number of entry lines — kept as is, confirm what the reader expects.
int scRNA_merged_write_sparse_matrix(fc_thread_global_context_t * global_context, HashTable * cellP1_to_geneP1_to_umis_tab, HashTable * cellnoP1_to_umis_tab, ArrayList * used_cell_barcodes, int sample_index, char * tabtype, fc_feature_info_t* loaded_features, HashTable * sorted_order_p1_to_i_p1_tab){
	int x1,x2;
	char ofname[MAX_FILE_NAME_LENGTH + 100];
	FILE * ofp_bcs = NULL, * ofp_genes = NULL, * ofp_mtx = NULL;

	// open the three outputs one after another; stop at the first failure so that ofname
	// still holds the name of the file that could not be created
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.BCtab",global_context->input_file_name, sample_index+1,tabtype);
	ofp_bcs = fopen( ofname , "w" );
	if(ofp_bcs){
		snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.GENEtab",global_context->input_file_name, sample_index+1,tabtype);
		ofp_genes = fopen( ofname , "w" );
	}
	if(ofp_genes){
		snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.%s.spmtx",global_context->input_file_name, sample_index+1,tabtype);
		ofp_mtx = fopen( ofname , "w" );
	}
	if(NULL == ofp_mtx){
		SUBREADprintf("ERROR: unable to create the output file '%s'\n", ofname);
		if(ofp_genes) fclose(ofp_genes);
		if(ofp_bcs) fclose(ofp_bcs);
		return -1;
	}
	fprintf(ofp_mtx,"%%%%MatrixMarket matrix coordinate integer general\n");

	// collect the genes seen in the selected cells and the sum of their UMI counts
	HashTable * used_cellnoP1_tab = ArrayListToLookupTable_Int(used_cell_barcodes);
	HashTable * unique_NZ_geneno1B_table = HashTableCreate(10000);
	cellP1_to_geneP1_to_umis_tab -> counter1 = 0;
	cellP1_to_geneP1_to_umis_tab -> appendix1 = unique_NZ_geneno1B_table;
	cellP1_to_geneP1_to_umis_tab -> appendix2 = used_cellnoP1_tab;
	HashTableIteration(cellP1_to_geneP1_to_umis_tab, scRNA_merged_write_sparse_unique_genes);
	srInt_64 total_UMIs = cellP1_to_geneP1_to_umis_tab -> counter1;
	ArrayList * unique_NZ_genenosP1_list = HashTableKeys(unique_NZ_geneno1B_table);
	HashTableDestroy(unique_NZ_geneno1B_table);
	HashTableDestroy(used_cellnoP1_tab);
	ArrayListSort(unique_NZ_genenosP1_list, NULL);

	#ifdef __MINGW32__
	fprintf(ofp_mtx, "%I64d %I64d %I64d\n", unique_NZ_genenosP1_list -> numOfElements , used_cell_barcodes -> numOfElements,  total_UMIs );
	#else
	fprintf(ofp_mtx, "%lld %lld %lld\n", unique_NZ_genenosP1_list -> numOfElements , used_cell_barcodes -> numOfElements,  total_UMIs );
	#endif

	// row labels
	for(x2=0; x2 < unique_NZ_genenosP1_list -> numOfElements; x2++){
		int gene_index_0B = ArrayListGet(unique_NZ_genenosP1_list, x2) - NULL-1;
		if(global_context->is_gene_level){
			char* gene_name = (char*)global_context -> gene_name_array [gene_index_0B];
			fprintf(ofp_genes,"%s\n", gene_name);
		}else{
			char exon_name[FEATURE_NAME_LENGTH+60];
			build_exon_name(global_context, loaded_features, gene_index_0B, exon_name, sorted_order_p1_to_i_p1_tab);
			fprintf(ofp_genes,"%s\n", exon_name);
		}
	}

	// column labels
	for(x1 = 0; x1 < used_cell_barcodes -> numOfElements; x1++){
		srInt_64 cellno = ArrayListGet(used_cell_barcodes, x1)-NULL;
		char * cellbc_seq = ArrayListGet(global_context -> scRNA_cell_barcodes_array, cellno);
		fprintf(ofp_bcs,"%s\n", cellbc_seq);
	}

	// entries, cell by cell and gene by gene within a cell
	for(x1 = 0; x1 < used_cell_barcodes -> numOfElements; x1++){
		srInt_64 cellno = ArrayListGet(used_cell_barcodes, x1)-NULL;
		HashTable * geneno1B_to_UMIs = HashTableGet(cellP1_to_geneP1_to_umis_tab, NULL+1+cellno);
		if(NULL == geneno1B_to_UMIs) continue;	// a cell without a gene table has no entries

		for(x2=0; x2 < unique_NZ_genenosP1_list -> numOfElements; x2++){
			int geneno1B = ArrayListGet(unique_NZ_genenosP1_list, x2)-NULL;
			int this_umis = HashTableGet(geneno1B_to_UMIs, NULL+geneno1B) -NULL;
			if(this_umis>0)fprintf(ofp_mtx,"%d %d %d\n", x2+1, x1+1, this_umis);
		}
	}
	ArrayListDestroy(unique_NZ_genenosP1_list);
	fclose(ofp_bcs);
	fclose(ofp_genes);
	fclose(ofp_mtx);

	return 0;
}
5702 
// HashTableIteration callback over one cell's (gene+1) -> UMIs table: adds the cell's UMI
// count for the gene to the per-gene total kept in the table held in m2->appendix1.
void scRNA_merged_45K_to_90K_sum_SUM_Level2(void * GeneNo1B, void * vUMIs, HashTable * m2){
	HashTable * gene_totals_tab = m2 -> appendix1;
	srInt_64 total_so_far = HashTableGet(gene_totals_tab, GeneNo1B)-NULL;
	void * new_total = vUMIs + total_so_far;
	HashTablePut(gene_totals_tab, GeneNo1B, new_total);
}
5707 
// HashTableIteration callback over a (cell+1) -> (gene+1 -> UMIs) table.
// Cells whose key is not present in the lookup table held in me->appendix2 are ignored; for
// the others every gene count is added to the per-gene totals table held in me->appendix1.
void scRNA_merged_45K_to_90K_sum_SUM(void * keyCellNoP1, void * Vgno_umi_tab, HashTable * me){
	HashTable * gene_totals_tab = me -> appendix1;
	HashTable * wanted_cells_tab = me -> appendix2;
	HashTable * cell_gene_tab = Vgno_umi_tab;

	if(NULL == HashTableGet(wanted_cells_tab, keyCellNoP1))return;

	// the inner callback finds the totals table through the table it iterates over
	cell_gene_tab -> appendix1 = gene_totals_tab;
	HashTableIteration(cell_gene_tab ,scRNA_merged_45K_to_90K_sum_SUM_Level2 );
}
5717 
// HashTableIteration callback over the per-gene totals table: writes one
// "<name>\t<UMIs>" line to the FILE held in me->appendix2.
//   me->appendix1 : the global context.
//   me->appendix3 : two-element pointer array, [0] the loaded features and [1] the
//                   (sorted order + 1) -> (feature index + 1) table.
// <name> is the gene name at gene level and the build_exon_name identifier otherwise.
void scRNA_merged_45K_to_90K_sum_WRT(void * kyGeneID, void * valUMIs, HashTable * me){
	fc_thread_global_context_t * global_context = me -> appendix1;
	FILE * out_fp = me -> appendix2;
	void ** feature_ctx = me->appendix3;
	fc_feature_info_t * loaded_features = feature_ctx[0];
	HashTable * sorted_order_p1_to_i_p1_tab = feature_ctx[1];

	srInt_64 gene_idx = kyGeneID - NULL-1;
	unsigned int umis = (unsigned int) (valUMIs-NULL);

	char exon_name[FEATURE_NAME_LENGTH+60];
	char * row_label;
	if(global_context -> is_gene_level){
		row_label = (char *) global_context -> gene_name_array[ gene_idx ];
	}else{
		build_exon_name(global_context, loaded_features, gene_idx, exon_name, sorted_order_p1_to_i_p1_tab);
		row_label = exon_name;
	}
	fprintf(out_fp, "%s\t%u\n", row_label, umis);
}
5734 
// Sums, gene by gene, the UMI counts of the barcodes listed in bcid_P0_arr and writes the
// totals to "<input_file_name>.scRNA.<sample_no+1, 3 digits>.AmbSum" as a two-column,
// tab-separated table with the header "GeneID\tUMIs".
//   cellP1_geneP1_UMIs_tab : (cell+1) -> table of (gene+1) -> UMIs; its appendix1..3 are
//                            overwritten here.
//   bcid_P0_arr            : 0-based numbers of the barcodes to include.
// If the output file cannot be created an error is reported and nothing is written.
void scRNA_merged_45K_to_90K_sum(fc_thread_global_context_t * global_context, HashTable * cellP1_geneP1_UMIs_tab, ArrayList * bcid_P0_arr, int sample_no, fc_feature_info_t * loaded_features, HashTable * sorted_index_p1_to_i_p1_tab){
	HashTable * summed_gene_to_umis = HashTableCreate( 3+cellP1_geneP1_UMIs_tab->numOfElements/6 );
	HashTable * bcid_look_tab = ArrayListToLookupTable_Int(bcid_P0_arr);
	cellP1_geneP1_UMIs_tab -> appendix1 = summed_gene_to_umis;
	cellP1_geneP1_UMIs_tab -> appendix2 = bcid_look_tab;
	cellP1_geneP1_UMIs_tab -> appendix3 = global_context;
	HashTableIteration( cellP1_geneP1_UMIs_tab, scRNA_merged_45K_to_90K_sum_SUM );

	char ofname[MAX_FILE_NAME_LENGTH + 100];
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.AmbSum",global_context->input_file_name, sample_no+1);
	FILE * write_fp = fopen(ofname,"w");
	if(write_fp){
		fprintf(write_fp,"GeneID\tUMIs\n");
		// context for scRNA_merged_45K_to_90K_sum_WRT; vp2 only has to live until the
		// iteration below has finished
		void * vp2[2];
		vp2[0]=loaded_features;
		vp2[1]=sorted_index_p1_to_i_p1_tab;
		summed_gene_to_umis -> appendix1 = global_context;
		summed_gene_to_umis -> appendix2 = write_fp;
		summed_gene_to_umis -> appendix3 = vp2;
		summed_gene_to_umis -> counter1 = sample_no;
		HashTableIteration( summed_gene_to_umis, scRNA_merged_45K_to_90K_sum_WRT );
		fclose(write_fp);
	}else{
		SUBREADprintf("ERROR: unable to create the output file '%s'\n", ofname);
	}

	HashTableDestroy(bcid_look_tab);
	HashTableDestroy(summed_gene_to_umis);
}
5759 
// HashTableIteration callback: writes the name of one gene, followed by a newline, to the
// FILE held in me->appendix1.
//   k             : gene number plus one.
//   me->appendix2 : the global context.
//   me->appendix3 : two-element pointer array, [0] the loaded features and [1] the
//                   (sorted order + 1) -> (feature index + 1) table.
// The name is the gene name at gene level and the build_exon_name identifier otherwise.
void scRNA_merged_write_nozero_geneids_WRT(void *k, void *v, HashTable* me){
	FILE * out_fp = me->appendix1;
	fc_thread_global_context_t * global_context = me->appendix2;
	void ** feature_ctx = me->appendix3;
	fc_feature_info_t * loaded_features = feature_ctx[0];
	HashTable * sorted_order_p1_to_i_p1_tab = feature_ctx[1];

	srInt_64 gene_idx = k-NULL-1;

	char exon_name[FEATURE_NAME_LENGTH+60];
	char * row_label;
	if(global_context -> is_gene_level){
		row_label = (char *) global_context -> gene_name_array [gene_idx];
	}else{
		build_exon_name(global_context, loaded_features, gene_idx, exon_name, sorted_order_p1_to_i_p1_tab);
		row_label = exon_name;
	}
	fprintf(out_fp, "%s\n", row_label);
}
5775 
// Writes the name of every gene that is a key of no0genes, one per line, to
// "<input_file_name>.scRNA.<samplenno+1, 3 digits>.no0Genes".
//   no0genes : table keyed by (gene number + 1); its appendix1..3 are overwritten here.
// If the output file cannot be created an error is reported and nothing is written.
void scRNA_merged_write_nozero_geneids(fc_thread_global_context_t * global_context, HashTable * no0genes, int samplenno, fc_feature_info_t * loaded_features, HashTable * sorted_order_p1_to_i_p1_tab){
	char ofname[MAX_FILE_NAME_LENGTH + 100];
	snprintf(ofname, sizeof(ofname), "%s.scRNA.%03d.no0Genes",global_context->input_file_name, samplenno+1);
	FILE * fp = fopen( ofname , "w" );
	if(NULL == fp){
		SUBREADprintf("ERROR: unable to create the output file '%s'\n", ofname);
		return;
	}

	// context for scRNA_merged_write_nozero_geneids_WRT; tv2 only has to live until the
	// iteration below has finished
	void * tv2[2];
	tv2[0]=loaded_features;
	tv2[1]=sorted_order_p1_to_i_p1_tab;
	no0genes -> appendix1 =fp;
	no0genes -> appendix2 =global_context;
	no0genes -> appendix3 =tv2;
	HashTableIteration(no0genes, scRNA_merged_write_nozero_geneids_WRT);
	fclose(fp);
}
5789 
// HashTableIteration callback: adds the visited value, read as an integer, to the
// counter1 field of the table being iterated. The key is not used.
void scRNA_merged_to_tables_write_build_UMIcount_in(void * ky, void * val, HashTable * tab){
	srInt_64 umis_of_entry = val-NULL;
	tab -> counter1 = tab -> counter1 + umis_of_entry;
}
5793 
// HashTableIteration callback over a (cell+1) -> (gene+1 -> UMIs) table: sums all gene
// counts of the visited cell (using the per-cell table's counter1 as accumulator) and
// stores the total under the same (cell+1) key in the table held in tab->appendix1.
void scRNA_merged_to_tables_write_build_UMIcounts(void * ky, void * val, HashTable * tab){
	HashTable * cell_totals_tab = tab -> appendix1;
	HashTable * gene_counts_tab = val;
	int cell_idx = ky-NULL-1;
	void * cell_key = NULL+1+cell_idx;

	gene_counts_tab -> counter1 = 0;
	HashTableIteration(gene_counts_tab, scRNA_merged_to_tables_write_build_UMIcount_in);

	HashTablePut(cell_totals_tab, cell_key, NULL+gene_counts_tab -> counter1);
}
5803 
5804 // this function writes a single count table.
5805 // Rows: genes
5806 // Cols: Cell_Barcode +"."+ SampleName
scRNA_merged_to_tables_write(fc_thread_global_context_t * global_context,HashTable ** cellP1_to_geneP1_to_umis,fc_feature_info_t * loaded_features,srInt_64 nexons)5807 void scRNA_merged_to_tables_write( fc_thread_global_context_t * global_context, HashTable ** cellP1_to_geneP1_to_umis, fc_feature_info_t * loaded_features, srInt_64 nexons){
5808 	char ofname[MAX_FILE_NAME_LENGTH + 20];
5809 	sprintf(ofname,"%s.scRNA.SampleTable",global_context->input_file_name);
5810 	FILE * sample_tab_fp = fopen( ofname , "w" );
5811 	int x1;
5812 
5813 	fprintf(sample_tab_fp,"SampleName\tUMICutoff\tTotalReads\tMappedReads\tAssignedReads\tIndex\n");
5814 	for(x1 = 0; x1 < global_context -> scRNA_sample_sheet_table -> numOfElements ; x1++){
5815 		srInt_64 mapped_reads = 0, all_reads = 0, assigned_reads = 0;
5816 		int thrid;
5817 		for(thrid=0; thrid<global_context-> thread_number; thrid++){
5818 			mapped_reads += global_context -> thread_contexts[thrid].scRNA_mapped_reads_per_sample[x1];
5819 			assigned_reads += global_context -> thread_contexts[thrid].scRNA_assigned_reads_per_sample[x1];
5820 			all_reads += global_context -> thread_contexts[thrid].scRNA_reads_per_sample[x1];
5821 		}
5822 		ArrayList * high_confid_barcode_index_list = ArrayListCreate(20000);
5823 		ArrayList * this_sample_ambient_rescure_candi = ArrayListCreate(10000);
5824 		ArrayList * this_sample_45k_90k_barcode_no_P0 = ArrayListCreate(90000 - 45000 + 100);
5825 
5826 		HashTable * cellbcP1_to_umis_tab = HashTableCreate(cellP1_to_geneP1_to_umis[x1] -> numOfElements);
5827 		cellP1_to_geneP1_to_umis[x1] -> appendix1 = cellbcP1_to_umis_tab;
5828 		HashTableIteration(cellP1_to_geneP1_to_umis[x1], scRNA_merged_to_tables_write_build_UMIcounts);
5829 
5830 		int applied_umi_cut = scRNA_merged_bootstrap_a_sample(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, high_confid_barcode_index_list);
5831 		global_context -> scRNA_applied_umi_cut[x1] = applied_umi_cut;
5832 		scRNA_merged_ambient_rescure(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, this_sample_45k_90k_barcode_no_P0, this_sample_ambient_rescure_candi, high_confid_barcode_index_list);
5833 
5834 		int umi_cutoff = global_context -> scRNA_applied_umi_cut[x1];
5835 		char * this_sample_name = ArrayListGet(global_context -> scRNA_sample_id_to_name, x1);
5836 #ifdef __MINGW32__
5837 		fprintf(sample_tab_fp,"%s\t%d\t%I64d\t%I64d\t%I64d\t%d\n", this_sample_name, umi_cutoff, all_reads, mapped_reads, assigned_reads,x1+1);
5838 #else
5839 		fprintf(sample_tab_fp,"%s\t%d\t%lld\t%lld\t%lld\t%d\n", this_sample_name, umi_cutoff, all_reads, mapped_reads, assigned_reads, x1+1);
5840 #endif
5841 		srInt_64 xk1;
5842 		HashTable * sorted_order_p1_to_i_p1_tab = HashTableCreate(nexons/4);
5843 		for(xk1 = 0; xk1 < nexons ; xk1++){
5844 			HashTablePut(sorted_order_p1_to_i_p1_tab, NULL+loaded_features[xk1].sorted_order+1 , NULL+xk1+1 );
5845 		}
5846 
5847 		#ifdef DEBUG_FOR_EXACT
5848 		#warning " ======= Another debug ======"
5849 		scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1], NULL, x1, "RawMatrix",  loaded_features, sorted_order_p1_to_i_p1_tab);
5850 		//scRNA_merged_write_sparse_matrix(global_context, merged_tables_gene_to_cell_umis[x1], used_cell_barcode_tabs[x1]  this_sample_45k_90k_barcode_no_P0, x1, "AmbProfCells",  loaded_features, sorted_order_p1_to_i_p1_tab);
5851 		#endif
5852 
5853 		scRNA_merged_write_sparse_matrix(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, high_confid_barcode_index_list, x1, "HighConf",  loaded_features, sorted_order_p1_to_i_p1_tab);
5854 		scRNA_merged_write_sparse_matrix(global_context, cellP1_to_geneP1_to_umis[x1], cellbcP1_to_umis_tab, this_sample_ambient_rescure_candi, x1, "RescCand",  loaded_features, sorted_order_p1_to_i_p1_tab);
5855 		scRNA_merged_45K_to_90K_sum( global_context, cellP1_to_geneP1_to_umis[x1], this_sample_45k_90k_barcode_no_P0, x1 , loaded_features, sorted_order_p1_to_i_p1_tab);
5856 		HashTable * no0genes = HashTableCreate(10000);
5857 		cellP1_to_geneP1_to_umis[x1] -> appendix1 = no0genes;
5858 		cellP1_to_geneP1_to_umis[x1] -> appendix2 = NULL;
5859 		HashTableIteration(cellP1_to_geneP1_to_umis[x1], scRNA_merged_write_sparse_unique_genes);
5860 		scRNA_merged_write_nozero_geneids(global_context, no0genes, x1, loaded_features, sorted_order_p1_to_i_p1_tab);
5861 
5862 		HashTableDestroy(no0genes);
5863 		ArrayListDestroy(this_sample_ambient_rescure_candi);
5864 		ArrayListDestroy(this_sample_45k_90k_barcode_no_P0);
5865 		ArrayListDestroy(high_confid_barcode_index_list);
5866 		HashTableDestroy(cellbcP1_to_umis_tab);
5867 		HashTableDestroy(sorted_order_p1_to_i_p1_tab);
5868 	}
5869 
5870 	fclose(sample_tab_fp);
5871 }
5872 
// HashTableIteration callback that builds the inverted index "barcode+UMI key => list of genes".
//   ky_genep1        : gene number plus one, encoded as a pointer offset from NULL.
//   val_arr_bc_umip1 : ArrayList holding the barcode+UMI keys seen for this gene.
//   tab              : the iterated table; tab->appendix1 is the inverted index being filled.
// Each gene is appended (as a zero-based number encoded as a pointer) to the list of every
// key it owns; a list is created on first sight of a key.
void scRNA_find_gene_to_umi_do_merger(void * ky_genep1, void * val_arr_bc_umip1, HashTable * tab){
	ArrayList * keys_of_this_gene = val_arr_bc_umip1;
	HashTable * key_to_gene_lists = tab -> appendix1;
	int zero_based_gene = ky_genep1 - NULL - 1;
	int key_i;

	for(key_i = 0; key_i < keys_of_this_gene -> numOfElements; key_i++){
		void * this_key = ArrayListGet(keys_of_this_gene, key_i);
		ArrayList * genes_for_key = HashTableGet(key_to_gene_lists, this_key);

		if(NULL == genes_for_key){
			// first gene seen for this barcode+UMI key
			genes_for_key = ArrayListCreate(1);
			HashTablePut(key_to_gene_lists, this_key, genes_for_key);
		}
		ArrayListPush(genes_for_key, NULL + zero_based_gene);
	}
}
5888 
// ArrayListSort comparator for lists of zero-based gene numbers (encoded as pointers).
// me->appendix1 must point at a two-slot array: [0] the barcode+UMI key being processed,
// [1] the table "gene number plus one => (barcode+UMI key => supporting count)".
// Returns -1 when the left gene has more support than the right one, 1 when it has less,
// and 0 on a tie.
int scRNA_find_gene_to_umi_sortCompare(void * L_elem, void * R_elem, ArrayList * me){
	void ** sort_ctx = me -> appendix1;
	void * bc_umi_key = sort_ctx[0];
	HashTable * gene_to_key_counts = sort_ctx[1];

	int left_gene = L_elem - NULL;
	int right_gene = R_elem - NULL;

	// the outer table is keyed by gene number plus one
	HashTable * left_counts = HashTableGet(gene_to_key_counts, NULL + 1 + left_gene);
	int left_support = HashTableGet(left_counts, bc_umi_key) - NULL;
	HashTable * right_counts = HashTableGet(gene_to_key_counts, NULL + 1 + right_gene);
	int right_support = HashTableGet(right_counts, bc_umi_key) - NULL;

	return (left_support < right_support) - (left_support > right_support);
}
5901 
// HashTableIteration callback: orders the gene list attached to one barcode+UMI key with
// scRNA_find_gene_to_umi_sortCompare. Lists with fewer than two genes are left untouched.
//   key_bc_umi_p1 : the barcode+UMI key of this entry.
//   val_arr_genes : ArrayList of gene numbers for that key.
//   tab           : the iterated table; tab->appendix1 is the per-gene count table that the
//                   comparator looks the support values up in.
void scRNA_find_gene_to_umi_sortByReads(void * key_bc_umi_p1, void * val_arr_genes, HashTable * tab){
	ArrayList * genes_of_key = val_arr_genes;

	if(genes_of_key -> numOfElements >= 2){
		// context handed to the comparator through the list's appendix1;
		// it only needs to live for the duration of the sort.
		void * sort_ctx[2];
		sort_ctx[0] = key_bc_umi_p1;
		sort_ctx[1] = tab -> appendix1;
		genes_of_key -> appendix1 = sort_ctx;
		ArrayListSort(genes_of_key, scRNA_find_gene_to_umi_sortCompare);
	}
}
5913 
// HashTableIteration callback: for one barcode+UMI key that is shared by two or more genes,
// records which genes must lose that key.
//   key_bc_umi_p1        : the barcode+UMI key of this entry.
//   val_arr_genes_sorted : ArrayList of zero-based gene numbers (encoded as pointers);
//                          presumably already ordered by scRNA_find_gene_to_umi_sortByReads.
//   tab                  : the iterated table; tab->appendix1 is the output table
//                          "barcode+UMI key => ArrayList of genes to delete", and
//                          tab->appendix2 is the table
//                          "gene number plus one => (barcode+UMI key => supporting count)".
// Every gene from position 1 onwards is marked for deletion. The gene at position 0 is also
// marked when its support equals that of the gene at position 1 (a tie has no winner).
// Keys owned by fewer than two genes are ignored.
void scRNA_find_gene_to_umi_mark_deletee(void * key_bc_umi_p1, void * val_arr_genes_sorted, HashTable * tab){
	ArrayList * arr_genes = val_arr_genes_sorted;
	if(arr_genes -> numOfElements<2) return;

	HashTable * gene_bc_umi_to_deleted_genes_tab = tab->appendix1;
	HashTable * gene_to_bc_umi_p1_to_reads_tab = tab->appendix2;
	int gene1_no = ArrayListGet(arr_genes, 0)-NULL;
	int gene2_no = ArrayListGet(arr_genes, 1)-NULL;

	// the count table is keyed by gene number plus one
	int supp1 = HashTableGet(HashTableGet(gene_to_bc_umi_p1_to_reads_tab, NULL+gene1_no+1), key_bc_umi_p1)-NULL;
	int supp2 = HashTableGet(HashTableGet(gene_to_bc_umi_p1_to_reads_tab, NULL+gene2_no+1), key_bc_umi_p1)-NULL;

	ArrayList * to_del_genes = HashTableGet(gene_bc_umi_to_deleted_genes_tab, key_bc_umi_p1);
	if(NULL==to_del_genes){
		to_del_genes = ArrayListCreate(3);
		HashTablePut(gene_bc_umi_to_deleted_genes_tab, key_bc_umi_p1, to_del_genes);
	}

	// a tie between the two best genes means the first one is dropped as well
	if(supp1 == supp2) ArrayListPush(to_del_genes, NULL+gene1_no);
	int x1;
	for(x1=1; x1< arr_genes->numOfElements; x1++) ArrayListPush(to_del_genes, ArrayListGet(arr_genes, x1));
}
5948 
// Finds barcode+UMI keys that are claimed by more than one gene and decides which genes
// must give the key up.
//   global_context                 : stored in gene_to_bc_umi_p1_tab->appendix2 for the iteration.
//   gene_to_bc_umi_p1_tab          : "gene number plus one => ArrayList of barcode+UMI keys".
//   gene_to_bc_umi_p1_to_reads_tab : "gene number plus one => (barcode+UMI key => count)".
// Returns a newly created table "barcode+UMI key => ArrayList of deleted genes"; the caller
// owns it. Note that appendix1/appendix2 of gene_to_bc_umi_p1_tab are overwritten.
HashTable * scRNA_find_gene_to_umi_merger(fc_thread_global_context_t * global_context, HashTable * gene_to_bc_umi_p1_tab, HashTable * gene_to_bc_umi_p1_to_reads_tab){
	// Step 1: invert the input into "barcode+UMI key => list of genes".
	// The temporary index owns its gene lists and frees them when destroyed.
	HashTable * key_to_gene_lists = HashTableCreate(1000000);
	HashTableSetDeallocationFunctions(key_to_gene_lists, NULL, (void (*) (void *))ArrayListDestroy);

	gene_to_bc_umi_p1_tab -> appendix1 = key_to_gene_lists;
	gene_to_bc_umi_p1_tab -> appendix2 = global_context;
	HashTableIteration(gene_to_bc_umi_p1_tab, scRNA_find_gene_to_umi_do_merger);

	// Step 2: order every gene list by its supporting count.
	key_to_gene_lists -> appendix1 = gene_to_bc_umi_p1_to_reads_tab;
	HashTableIteration(key_to_gene_lists, scRNA_find_gene_to_umi_sortByReads);

	// Step 3: collect, per key, the genes that lose the key.
	HashTable * key_to_deleted_genes = HashTableCreate(1000000);
	key_to_gene_lists -> appendix1 = key_to_deleted_genes;
	key_to_gene_lists -> appendix2 = gene_to_bc_umi_p1_to_reads_tab;
	HashTableIteration(key_to_gene_lists, scRNA_find_gene_to_umi_mark_deletee);

	HashTableDestroy(key_to_gene_lists);
	return key_to_deleted_genes;
}
5966 
5967 struct scRNA_merge_batches_worker_task{
5968 	int sample_id;
5969 	int inbin_len;
5970 	srInt_64 block_number;
5971 	char inbin[MERGER_WORKER_BINSIZE];
5972 };
5973 
5974 struct scRNA_merge_batches_worker_current{
5975 	struct scRNA_merge_batches_worker_task * task;
5976 	char outbin[MERGER_WORKER_BINSIZE];
5977 	int outbin_len;
5978 	unsigned int crc32;
5979 
5980 	z_stream strm;
5981 };
5982 
scRNA_merge_batches_worker(void * vp)5983 void * scRNA_merge_batches_worker(void * vp){
5984 	void **vpp = vp;
5985 	fc_thread_global_context_t * global_context = vpp[0];
5986 	worker_master_mutex_t * worker_mut  = vpp[1];
5987 	int my_worker_id = vpp[2] - NULL;
5988 	struct scRNA_merge_batches_worker_current * my_current_job = vpp[3];
5989 	free(vp);
5990 
5991 	int Z_DEFAULT_MEM_LEVEL = 8;
5992 	worker_thread_start(worker_mut, my_worker_id);
5993 	while(1){
5994 		if(worker_wait_for_job(worker_mut, my_worker_id)) break;
5995 		if(!global_context -> is_scRNA_BAM_FQ_out_generated) continue;
5996 
5997 		deflateInit2(&my_current_job -> strm , SAMBAM_COMPRESS_LEVEL_NORMAL, Z_DEFLATED, SAMBAM_GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
5998 
5999 		struct scRNA_merge_batches_worker_task * current_input = my_current_job -> task;
6000 		my_current_job -> strm.avail_in = current_input -> inbin_len;
6001 		my_current_job -> strm.next_in = (unsigned char*)current_input -> inbin;
6002 		my_current_job -> strm.avail_out = MERGER_WORKER_BINSIZE;
6003 		my_current_job -> strm.next_out = (unsigned char*)my_current_job -> outbin;
6004 
6005 		deflate(&my_current_job -> strm, Z_FINISH);
6006 		my_current_job -> outbin_len = MERGER_WORKER_BINSIZE-my_current_job -> strm.avail_out;
6007 		my_current_job -> crc32 = FC_CRC32(current_input -> inbin, current_input -> inbin_len);
6008 		deflateEnd(&my_current_job -> strm);
6009 	}
6010 	return NULL;
6011 }
6012 
// Flushes one finished compression job to the BAM writer of its sample and frees the slot.
//   global_context : supplies is_scRNA_BAM_FQ_out_generated and scRNA_sample_BAM_writers.
//   finished_job   : worker slot; nothing happens when its task is NULL.
// When BAM output is enabled, every length-prefixed record of the uncompressed input is
// passed to the index updater, then the compressed block is written. The slot's task is
// reset to NULL in all cases so the same result is never saved twice.
void scRNA_save_BAM_result(fc_thread_global_context_t * global_context, struct scRNA_merge_batches_worker_current * finished_job){
	struct scRNA_merge_batches_worker_task * done_task = finished_job -> task;
	if(NULL == done_task) return;

	if(global_context -> is_scRNA_BAM_FQ_out_generated){
		// writers are looked up by the sample number encoded as a pointer; slot 0 is the BAM writer
		void ** writer_slots = HashTableGet(global_context -> scRNA_sample_BAM_writers, NULL + done_task -> sample_id);
		simple_bam_writer * bam_writer = writer_slots[0];

		int offset = 0;
		while(offset < done_task -> inbin_len){
			char * record = done_task -> inbin + offset;
			int record_len = 0;
			memcpy(&record_len, record, 4);	// each record starts with its own 4-byte length
			simple_bam_writer_update_index(bam_writer, record, record_len, done_task -> block_number, offset);
			offset += 4 + record_len;
		}

		simple_bam_write_compressed_block(bam_writer, finished_job -> outbin, finished_job -> outbin_len, done_task -> inbin_len, finished_job -> crc32, done_task -> block_number);
	}

	finished_job -> task = NULL;
}
6030 
6031 // return the number of RG result sets
fc_thread_merge_results(fc_thread_global_context_t * global_context,read_count_type_t * nreads,srInt_64 * nreads_mapped_to_exon,fc_read_counters * my_read_counter,HashTable * junction_global_table,HashTable * splicing_global_table,HashTable * RGmerged_table,fc_feature_info_t * loaded_features,srInt_64 nexons)6032 int fc_thread_merge_results(fc_thread_global_context_t * global_context, read_count_type_t * nreads , srInt_64 *nreads_mapped_to_exon, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * RGmerged_table, fc_feature_info_t * loaded_features, srInt_64 nexons)
6033 {
6034 	int xk1, xk2, ret = 0, sample_i;
6035 
6036 	srInt_64 total_input_reads = 0 ;
6037 	(*nreads_mapped_to_exon)=0;
6038 	SAM_pairer_destroy(&global_context -> read_pairer);
6039 
6040 	if(global_context -> do_scRNA_table){
6041 		int compress_workers = max(1,global_context-> thread_number-1);
6042 		HashTable * cellnoP1_to_genenoP1_to_UMIs[global_context -> scRNA_sample_sheet_table -> numOfElements];
6043 		struct scRNA_merge_batches_worker_task * task_buffers = malloc(sizeof(struct scRNA_merge_batches_worker_task) * (1+compress_workers)* global_context->scRNA_sample_sheet_table -> numOfElements);
6044 		int current_filling_worker_per_sample [global_context->scRNA_sample_sheet_table -> numOfElements];
6045 		struct scRNA_merge_batches_worker_current * worker_current_jobs = calloc(sizeof(struct scRNA_merge_batches_worker_current), compress_workers);
6046 
6047 		ArrayList * file_size_list = ArrayListCreate(global_context-> scRNA_barcode_batched_bin_no +1);
6048 		for(xk1=0; xk1<global_context-> scRNA_barcode_batched_bin_no +2; xk1++){
6049 			if(xk1<global_context-> scRNA_barcode_batched_bin_no +1){
6050 				srInt_64 batchsize = ftello(global_context -> scRNA_barcode_batched_bins[xk1]);
6051 				ArrayListPush(file_size_list, NULL+( batchsize<<20 | xk1));
6052 			}
6053 			fclose(global_context -> scRNA_barcode_batched_bins[xk1]);
6054 		}
6055 		ArrayListSort(file_size_list, NULL);
6056 
6057 		srInt_64 block_numbers_current [global_context->scRNA_sample_sheet_table -> numOfElements];
6058 		for(xk1=0; xk1<global_context->scRNA_sample_sheet_table -> numOfElements; xk1++){
6059 	 		cellnoP1_to_genenoP1_to_UMIs[xk1] = HashTableCreate(10000);
6060 			HashTableSetDeallocationFunctions(cellnoP1_to_genenoP1_to_UMIs[xk1], NULL,(void (*) (void*))HashTableDestroy);
6061 			current_filling_worker_per_sample[xk1] = 0;
6062 			task_buffers[xk1].inbin_len = 0;
6063 			block_numbers_current[xk1] = 0;
6064 		}
6065 
6066 		pthread_t *threads = malloc(sizeof(pthread_t)*global_context-> thread_number);
6067 		for(xk1=0; xk1<compress_workers+1; xk1++)for(xk2 = 0; xk2 < global_context->scRNA_sample_sheet_table -> numOfElements; xk2++) task_buffers[xk1*global_context->scRNA_sample_sheet_table -> numOfElements + xk2].sample_id = xk2+1;
6068 
6069 		for(xk1=0; xk1<global_context-> thread_number; xk1++){
6070 			void ** vpp = malloc(sizeof(void*)*3);
6071 			vpp[0] = global_context;
6072 			vpp[1] = global_context -> thread_contexts+xk1;
6073 			vpp[2] = file_size_list;
6074 			pthread_create(threads + xk1, NULL, scRNA_do_one_batch, vpp);
6075 		}
6076 
6077 		for(xk1=0; xk1<global_context-> thread_number; xk1++)
6078 			pthread_join(threads[xk1],NULL);
6079 		ArrayListDestroy(file_size_list);
6080 
6081 		worker_master_mutex_t worker_mut;
6082 		worker_master_mutex_init(&worker_mut, max(1,global_context-> thread_number-1));
6083 
6084 		for(xk1=0; xk1<max(1,global_context-> thread_number-1); xk1++){
6085 			void ** vpp = malloc(sizeof(void*)*4);
6086 			vpp[0] = global_context;
6087 			vpp[1] = &worker_mut;
6088 			vpp[2] = NULL + xk1;
6089 			vpp[3] = worker_current_jobs + xk1;
6090 			pthread_create(threads + xk1, NULL, scRNA_merge_batches_worker, vpp);
6091 		}
6092 
6093 		FILE * input_fps[global_context -> scRNA_barcode_batched_bin_no+2];
6094 		char * last_rbin_buffer[global_context -> scRNA_barcode_batched_bin_no+1];
6095 		srInt_64 current_sorting_key[global_context -> scRNA_barcode_batched_bin_no+1];
6096 
6097 		for(xk1=0; xk1<global_context -> scRNA_barcode_batched_bin_no+2; xk1++){
6098 			char tmp_fname[MAX_FILE_NAME_LENGTH+80];
6099 			sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", global_context -> temp_file_dir, getpid(), xk1);
6100 			input_fps[xk1] = fopen(tmp_fname,"rb");
6101 			if(xk1 == global_context -> scRNA_barcode_batched_bin_no+1)break;
6102 
6103 			srInt_64 section1_items=0;
6104 			for(sample_i = 0; sample_i < global_context -> scRNA_sample_sheet_table -> numOfElements; sample_i++){
6105 				fread(&section1_items,1, 8, input_fps[xk1]);
6106 				for(xk2 = 0; xk2 < section1_items; xk2++){
6107 					srInt_64 cellbcP0_geneno0B=0, umis=0;
6108 					fread(&cellbcP0_geneno0B,1,8,input_fps[xk1]);
6109 					fread(&umis,1,8,input_fps[xk1]);
6110 
6111 					int cellbc_no = cellbcP0_geneno0B>>32;
6112 					int gene_no0B = (int)(cellbcP0_geneno0B&0xffffffffu);
6113 					HashTable *gene_tab = HashTableGet(cellnoP1_to_genenoP1_to_UMIs[sample_i], NULL+cellbc_no+1);
6114 					if(gene_tab==NULL){
6115 						gene_tab = HashTableCreate(300);
6116 						HashTablePut(cellnoP1_to_genenoP1_to_UMIs[sample_i], NULL+cellbc_no+1, gene_tab);
6117 					}
6118 					HashTablePut(gene_tab, NULL+gene_no0B+1 , NULL+umis);
6119 				}
6120 			}
6121 			last_rbin_buffer[xk1] = malloc( global_context -> scRNA_barcode_batched_max_genes *8 + global_context -> scRNA_barcode_batched_max_Rbin_len + 4 + MAX_UMI_LEN + 16 + 10000);
6122 			int rlen = fread(last_rbin_buffer[xk1], 1, 16, input_fps[xk1]);
6123 			if(rlen >0){
6124 				int binlen = 0;
6125 				srInt_64 genes = 0;
6126 				memcpy(&genes, last_rbin_buffer[xk1]+8, 8);
6127 				if(genes & (1LLU<<63))genes = genes & 0x7fffffff;
6128 				else genes= 0;
6129 
6130 				fread(last_rbin_buffer[xk1]+16, 1, 8*genes+ global_context -> scRNA_UMI_length + 4, input_fps[xk1]);
6131 				memcpy(&binlen, last_rbin_buffer[xk1] +16 +8*genes+ global_context -> scRNA_UMI_length  , 4);
6132 				fread(last_rbin_buffer[xk1] + 16+ 8*genes+ global_context -> scRNA_UMI_length + 4, 1, binlen, input_fps[xk1]);
6133 
6134 				srInt_64 sorting_key = *(int*)(last_rbin_buffer[xk1] + 16 +8*genes+global_context -> scRNA_UMI_length +4);
6135 				sorting_key = sorting_key << 32;
6136 				sorting_key |= *(int*)(last_rbin_buffer[xk1] + 16+ 8*genes+global_context -> scRNA_UMI_length +8);
6137 				current_sorting_key[xk1] = sorting_key;
6138 			}else current_sorting_key[xk1] = 0x7fffffffffffffffLLU;
6139 		}
6140 
6141 		int current_worker = 0;
6142 		while(1){
6143 			int selected_fp_no = 0;
6144 			srInt_64 selected_fp_key = current_sorting_key[0];
6145 			for(xk1=1; xk1<global_context -> scRNA_barcode_batched_bin_no+1; xk1++){
6146 				if(current_sorting_key[xk1] < selected_fp_key){
6147 					selected_fp_key = current_sorting_key[xk1] ;
6148 					selected_fp_no = xk1;
6149 				}
6150 			}
6151 			if(selected_fp_key == 0x7fffffffffffffffLLU) break;
6152 
6153 			int sample_id = 0, binlen = 0;
6154 			srInt_64 genes = 0;
6155 			memcpy(&sample_id, last_rbin_buffer[selected_fp_no], 4);
6156 			memcpy(&genes, last_rbin_buffer[selected_fp_no]+8, 8);
6157 			if(genes & (1LLU<<63)) genes = genes & 0x7fffffff;
6158 			else genes = 0;
6159 			memcpy(&binlen,last_rbin_buffer[selected_fp_no]+16+8*genes+global_context -> scRNA_UMI_length,4);
6160 
6161 			struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6162 			memcpy(tofill->inbin + tofill-> inbin_len, last_rbin_buffer[selected_fp_no]+16+8*genes+global_context -> scRNA_UMI_length, binlen + 4);
6163 			tofill -> inbin_len += (binlen + 4);
6164 			//SUBREADprintf("ADDING BLOCKKK = %d  WKR = %d  IT THINK IT'S %d ; GENES=%d\n", tofill -> inbin_len, current_worker, tofill -> sample_id, genes);
6165 			if(tofill-> inbin_len > 60000){
6166 				master_wait_for_job_done(&worker_mut, current_worker);
6167 				struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6168 				scRNA_save_BAM_result(global_context, my_finished_job);
6169 				my_finished_job -> task = tofill;
6170 				my_finished_job -> task -> block_number = (block_numbers_current[sample_id-1]++);
6171 				my_finished_job -> outbin_len = 0;
6172 				master_notify_worker(&worker_mut, current_worker);
6173 
6174 				current_filling_worker_per_sample[sample_id-1] ++;
6175 				if(current_filling_worker_per_sample[sample_id-1] == compress_workers +1) current_filling_worker_per_sample[sample_id-1] = 0;
6176 				tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6177 				tofill -> inbin_len = 0;
6178 
6179 				current_worker ++;
6180 				if(current_worker == compress_workers) current_worker=0;
6181 			}
6182 
6183 			int rlen = fread(last_rbin_buffer[selected_fp_no], 1, 16, input_fps[selected_fp_no]);
6184 			if(rlen >0){
6185 				int binlen = 0;
6186 				srInt_64 genes = 0;
6187 				memcpy(&genes, last_rbin_buffer[selected_fp_no]+8, 8);
6188 				if(genes & (1LLU<<63))genes = genes & 0x7fffffff;
6189 				else genes= 0;
6190 				fread(last_rbin_buffer[selected_fp_no]+16, 1, 8*genes+ global_context -> scRNA_UMI_length + 4, input_fps[selected_fp_no]);
6191 				memcpy(&binlen, last_rbin_buffer[selected_fp_no] +16 +8*genes+ global_context -> scRNA_UMI_length  , 4);
6192 
6193 				fread(last_rbin_buffer[selected_fp_no] + 16+ 8*genes+ global_context -> scRNA_UMI_length + 4, 1, binlen, input_fps[selected_fp_no]);
6194 				srInt_64 sorting_key = *(int*)(last_rbin_buffer[selected_fp_no] + 16+8*genes +global_context -> scRNA_UMI_length +4);
6195 				sorting_key = sorting_key << 32;
6196 				sorting_key |= *(int*)(last_rbin_buffer[selected_fp_no] + 16 +8*genes+global_context -> scRNA_UMI_length +8);
6197 				current_sorting_key[selected_fp_no] = sorting_key;
6198 			} else current_sorting_key[selected_fp_no] = 0x7fffffffffffffffLLU;
6199 		}
6200 
6201 		for(xk1=0; xk1<global_context -> scRNA_barcode_batched_bin_no+1; xk1++){
6202 			fclose(input_fps[xk1]);
6203 			free(last_rbin_buffer[xk1]);
6204 		}
6205 
6206 		for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6207 			struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[xk1] * global_context->scRNA_sample_sheet_table -> numOfElements +xk1);
6208 			if(tofill->inbin_len<1) continue;
6209 
6210 			master_wait_for_job_done(&worker_mut, current_worker);
6211 			struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6212 			scRNA_save_BAM_result(global_context, my_finished_job);
6213 			my_finished_job -> task = tofill;
6214 			my_finished_job -> task -> block_number = (block_numbers_current[xk1]++);
6215 			my_finished_job -> outbin_len = 0;
6216 			master_notify_worker(&worker_mut, current_worker);
6217 			current_worker ++;
6218 			if(current_worker == compress_workers) current_worker=0;
6219 		}
6220 		for(xk1=0; xk1<compress_workers; xk1++){
6221 			struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6222 			if(my_finished_job -> task)master_wait_for_job_done(&worker_mut, current_worker);
6223 			scRNA_save_BAM_result(global_context, my_finished_job);
6224 
6225 			current_worker ++;
6226 			if(current_worker == compress_workers) current_worker=0;
6227 		}
6228 
6229 		//TODO: add "scRNA_barcode_batched_bin_no+1" bin into "all unmapped"
6230 
6231 		for(xk1 = 0; xk1 < 1+compress_workers; xk1++) for(xk2 = 0; xk2 < global_context->scRNA_sample_sheet_table -> numOfElements;xk2++)
6232 			task_buffers[ xk1 * global_context->scRNA_sample_sheet_table -> numOfElements  + xk2 ].inbin_len = 0;
6233 		current_worker = 0;
6234 		FILE * notmapped_fp = input_fps[global_context -> scRNA_barcode_batched_bin_no+1];
6235 		while(1){
6236 			int sample_id = 0, binlen = 0;
6237 			int rlen = fread(&sample_id, 1, 4, notmapped_fp);
6238 			if(rlen < 4) break;
6239 			struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[sample_id -1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6240 			fread(&binlen, 1, 4, notmapped_fp);
6241 			memcpy(tofill -> inbin + tofill -> inbin_len, &binlen, 4);
6242 			tofill -> inbin_len += 4;
6243 			fread(tofill -> inbin + tofill -> inbin_len, 1, binlen, notmapped_fp);
6244 			tofill -> inbin_len += binlen;
6245 			if(tofill-> inbin_len > 60000){
6246 				struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6247 				if(my_finished_job -> task)master_wait_for_job_done(&worker_mut, current_worker);
6248 				scRNA_save_BAM_result(global_context, my_finished_job);
6249 				my_finished_job -> task = tofill;
6250 				my_finished_job -> task -> block_number = (block_numbers_current[sample_id-1]++);
6251 				my_finished_job -> outbin_len = 0;
6252 				master_notify_worker(&worker_mut, current_worker);
6253 
6254 				current_filling_worker_per_sample[sample_id-1] ++;
6255 				if(current_filling_worker_per_sample[sample_id-1] == compress_workers +1) current_filling_worker_per_sample[sample_id-1] = 0;
6256 				tofill = task_buffers+(current_filling_worker_per_sample[sample_id-1] * global_context->scRNA_sample_sheet_table -> numOfElements +sample_id-1);
6257 				tofill -> inbin_len = 0;
6258 
6259 				current_worker ++;
6260 				if(current_worker == compress_workers) current_worker=0;
6261 			}
6262 		}
6263 
6264 		for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6265 			struct scRNA_merge_batches_worker_task * tofill = task_buffers+(current_filling_worker_per_sample[xk1] * global_context->scRNA_sample_sheet_table -> numOfElements +xk1);
6266 			if(tofill->inbin_len<1) continue;
6267 
6268 			master_wait_for_job_done(&worker_mut, current_worker);
6269 			struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6270 			scRNA_save_BAM_result(global_context, my_finished_job);
6271 			my_finished_job -> task = tofill;
6272 			my_finished_job -> task -> block_number = (block_numbers_current[xk1]++);
6273 			my_finished_job -> outbin_len = 0;
6274 			master_notify_worker(&worker_mut, current_worker);
6275 			current_worker ++;
6276 			if(current_worker == compress_workers) current_worker=0;
6277 		}
6278 
6279 		for(xk1=0; xk1<compress_workers; xk1++){
6280 			master_wait_for_job_done(&worker_mut, current_worker);
6281 			struct scRNA_merge_batches_worker_current * my_finished_job = worker_current_jobs+current_worker;
6282 			scRNA_save_BAM_result(global_context, my_finished_job);
6283 
6284 			current_worker ++;
6285 			if(current_worker == compress_workers) current_worker=0;
6286 		}
6287 
6288 		fclose(notmapped_fp);
6289 		terminate_workers(&worker_mut);
6290 		free(task_buffers);
6291 		free(worker_current_jobs);
6292 
6293 
6294 		for(xk1=0; xk1< compress_workers; xk1++){
6295 			pthread_join(threads[xk1],NULL);
6296 		}
6297 
6298 		worker_master_mutex_destroy(&worker_mut);
6299 		global_context -> scRNA_applied_umi_cut = calloc(sizeof(int), global_context -> scRNA_sample_sheet_table -> numOfElements);
6300 		scRNA_merged_to_tables_write(global_context , cellnoP1_to_genenoP1_to_UMIs , loaded_features, nexons);
6301 
6302 		for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++)
6303 			HashTableDestroy(cellnoP1_to_genenoP1_to_UMIs[xk1]);
6304 
6305 
6306 		HashTable ** used_cell_no_tables = malloc(sizeof(HashTable*) * global_context -> scRNA_sample_sheet_table -> numOfElements);
6307 		for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6308 			used_cell_no_tables[xk1] = HashTableCreate(30000);
6309 			used_cell_no_tables[xk1] -> appendix1 = malloc(sizeof(pthread_spinlock_t));
6310 			pthread_spin_init((pthread_spinlock_t*)used_cell_no_tables[xk1] -> appendix1,1);
6311 		}
6312 
6313 
6314 
6315 		for(xk1=0; xk1<global_context -> scRNA_sample_sheet_table -> numOfElements; xk1++){
6316 			pthread_spin_destroy((pthread_spinlock_t*)used_cell_no_tables[xk1] -> appendix1);
6317 			HashTableDestroy(used_cell_no_tables[xk1]);
6318 		}
6319 
6320 		free(used_cell_no_tables);
6321 	}
6322 
6323 	for(xk1=0; xk1<global_context-> thread_number; xk1++)
6324 	{
6325 		if(global_context -> assign_reads_to_RG){
6326 			HashTable * thread_rg_tab = global_context -> thread_contexts[xk1].RG_table;
6327 			int buck_i;
6328 			for(buck_i = 0; buck_i < thread_rg_tab -> numOfBuckets; buck_i++){
6329 				KeyValuePair *cursor = thread_rg_tab -> bucketArray[buck_i];
6330 				while(cursor){
6331 					char * rg_name = (char *)cursor -> key;
6332 					void ** rg_thread_tabs = cursor -> value;
6333 					void ** rg_old_tabs = HashTableGet(RGmerged_table, rg_name);
6334 					if(!rg_old_tabs){
6335 						rg_old_tabs = malloc(sizeof(char *)*4); // all_counts, sum_counts , junc_table, split_table
6336 						rg_old_tabs[0] = calloc(global_context -> thread_contexts[xk1].count_table_size, sizeof(srInt_64));
6337 						rg_old_tabs[1] = calloc(1, sizeof(fc_read_counters));
6338 						if(global_context -> do_junction_counting){
6339 							HashTable * junction_counting_table = HashTableCreate(131317);
6340 							HashTableSetHashFunction(junction_counting_table,HashTableStringHashFunction);
6341 							HashTableSetDeallocationFunctions(junction_counting_table, free, NULL);
6342 							HashTableSetKeyComparisonFunction(junction_counting_table, fc_strcmp_chro);
6343 
6344 							HashTable * splicing_point_table = HashTableCreate(131317);
6345 							HashTableSetHashFunction(splicing_point_table,HashTableStringHashFunction);
6346 							HashTableSetDeallocationFunctions(splicing_point_table, free, NULL);
6347 							HashTableSetKeyComparisonFunction(splicing_point_table, fc_strcmp_chro);
6348 
6349 							rg_old_tabs[2] = junction_counting_table;
6350 							rg_old_tabs[3] = splicing_point_table;
6351 						}else rg_old_tabs[2] = NULL;
6352 
6353 						HashTablePut(RGmerged_table, memstrcpy(rg_name), rg_old_tabs);
6354 					}
6355 					srInt_64 * rg_counts = rg_old_tabs[0];
6356 					fc_read_counters * rg_sum_reads = rg_old_tabs[1];
6357 					HashTable * rg_junc_tab = rg_old_tabs[2];
6358 					HashTable * rg_split_tab = rg_old_tabs[3];
6359 
6360 					srInt_64 * rg_thread_counts = rg_thread_tabs[0];
6361 					fc_read_counters * rg_thread_sum_reads = rg_thread_tabs[1];
6362 					HashTable * rg_thread_junc_table = rg_thread_tabs[2];
6363 					HashTable * rg_thread_split_table = rg_thread_tabs[3];
6364 
6365 					for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
6366 						rg_counts[xk2] += rg_thread_counts[xk2];
6367 
6368 					rg_sum_reads->unassigned_ambiguous += rg_thread_sum_reads->unassigned_ambiguous;
6369 					rg_sum_reads->unassigned_nofeatures += rg_thread_sum_reads->unassigned_nofeatures;
6370 					rg_sum_reads->unassigned_overlapping_length += rg_thread_sum_reads->unassigned_overlapping_length;
6371 					rg_sum_reads->unassigned_unmapped += rg_thread_sum_reads->unassigned_unmapped;
6372 					rg_sum_reads->unassigned_singleton += rg_thread_sum_reads->unassigned_singleton;
6373 					rg_sum_reads->unassigned_read_type += rg_thread_sum_reads->unassigned_read_type;
6374 					rg_sum_reads->unassigned_mappingquality += rg_thread_sum_reads->unassigned_mappingquality;
6375 					rg_sum_reads->unassigned_fragmentlength += rg_thread_sum_reads->unassigned_fragmentlength;
6376 					rg_sum_reads->unassigned_chimericreads += rg_thread_sum_reads->unassigned_chimericreads;
6377 					rg_sum_reads->unassigned_multimapping += rg_thread_sum_reads->unassigned_multimapping;
6378 					rg_sum_reads->unassigned_secondary += rg_thread_sum_reads->unassigned_secondary;
6379 					rg_sum_reads->unassigned_junction_condition += rg_thread_sum_reads->unassigned_junction_condition;
6380 					rg_sum_reads->unassigned_duplicate += rg_thread_sum_reads->unassigned_duplicate;
6381 					rg_sum_reads->assigned_reads += rg_thread_sum_reads->assigned_reads;
6382 
6383 					if(global_context -> do_junction_counting){
6384 						int bucket_i;
6385 						for(bucket_i = 0 ; bucket_i < rg_thread_junc_table -> numOfBuckets; bucket_i++){
6386 							KeyValuePair * cursor;
6387 							cursor = rg_thread_junc_table -> bucketArray[bucket_i];
6388 							while(cursor){
6389 								char * junckey = (char *) cursor -> key;
6390 								void * globval = HashTableGet(rg_junc_tab, junckey);
6391 								char * new_key = memstrcpy(junckey);
6392 
6393 								globval += (cursor -> value - NULL);
6394 								HashTablePut(rg_junc_tab, new_key, globval);
6395 									// new_key will be freed when it is replaced next time or when the global table is destroyed.
6396 
6397 								cursor = cursor->next;
6398 							}
6399 						}
6400 
6401 						for(bucket_i = 0 ; bucket_i < rg_thread_split_table -> numOfBuckets; bucket_i++){
6402 							KeyValuePair * cursor;
6403 							cursor = rg_thread_split_table -> bucketArray[bucket_i];
6404 							while(cursor){
6405 								char * junckey = (char *) cursor -> key;
6406 								void * globval = HashTableGet(rg_split_tab, junckey);
6407 								char * new_key = memstrcpy(junckey);
6408 
6409 								//if(xk1>0)
6410 								//SUBREADprintf("MERGE THREAD-%d : %s    VAL=%u, ADD=%u\n", xk1, junckey, globval - NULL, cursor -> value - NULL);
6411 								globval += (cursor -> value - NULL);
6412 								HashTablePut(rg_split_tab, new_key, globval);
6413 								cursor = cursor->next;
6414 							}
6415 						}
6416 					} // end : merge junc tables
6417 					ret++;
6418 					cursor = cursor -> next;
6419 				}
6420 			}
6421 		}
6422 
6423 		for(xk2=0; xk2<global_context -> exontable_exons; xk2++)
6424 			nreads[xk2]+=global_context -> thread_contexts[xk1].count_table[xk2];
6425 
6426 		total_input_reads += global_context -> thread_contexts[xk1].all_reads;
6427 		(*nreads_mapped_to_exon) += global_context -> thread_contexts[xk1].nreads_mapped_to_exon;
6428 
6429 		global_context -> read_counters.unassigned_ambiguous += global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous;
6430 		global_context -> read_counters.unassigned_nofeatures += global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures;
6431 		global_context -> read_counters.unassigned_overlapping_length += global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length;
6432 		global_context -> read_counters.unassigned_unmapped += global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped;
6433 		global_context -> read_counters.unassigned_singleton += global_context -> thread_contexts[xk1].read_counters.unassigned_singleton;
6434 		global_context -> read_counters.unassigned_read_type += global_context -> thread_contexts[xk1].read_counters.unassigned_read_type;
6435 		global_context -> read_counters.unassigned_mappingquality += global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality;
6436 		global_context -> read_counters.unassigned_fragmentlength += global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength;
6437 		global_context -> read_counters.unassigned_chimericreads += global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads;
6438 		global_context -> read_counters.unassigned_multimapping += global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping;
6439 		global_context -> read_counters.unassigned_secondary += global_context -> thread_contexts[xk1].read_counters.unassigned_secondary;
6440 		global_context -> read_counters.unassigned_junction_condition += global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition;
6441 		global_context -> read_counters.unassigned_duplicate += global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate;
6442 		global_context -> read_counters.assigned_reads += global_context -> thread_contexts[xk1].read_counters.assigned_reads;
6443 
6444 		my_read_counter->unassigned_ambiguous += global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous;
6445 		my_read_counter->unassigned_nofeatures += global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures;
6446 		my_read_counter->unassigned_overlapping_length += global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length;
6447 		my_read_counter->unassigned_unmapped += global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped;
6448 		my_read_counter->unassigned_singleton += global_context -> thread_contexts[xk1].read_counters.unassigned_singleton;
6449 		my_read_counter->unassigned_read_type += global_context -> thread_contexts[xk1].read_counters.unassigned_read_type;
6450 		my_read_counter->unassigned_mappingquality += global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality;
6451 		my_read_counter->unassigned_fragmentlength += global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength;
6452 		my_read_counter->unassigned_chimericreads += global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads;
6453 		my_read_counter->unassigned_multimapping += global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping;
6454 		my_read_counter->unassigned_secondary += global_context -> thread_contexts[xk1].read_counters.unassigned_secondary;
6455 		my_read_counter->unassigned_junction_condition += global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition;
6456 		my_read_counter->unassigned_duplicate += global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate;
6457 		my_read_counter->assigned_reads += global_context -> thread_contexts[xk1].read_counters.assigned_reads;
6458 
6459 		if(global_context -> do_junction_counting){
6460 			int bucket_i;
6461 			for(bucket_i = 0 ; bucket_i < global_context -> thread_contexts[xk1].junction_counting_table -> numOfBuckets; bucket_i++){
6462 				KeyValuePair * cursor;
6463 				cursor = global_context -> thread_contexts[xk1].junction_counting_table -> bucketArray[bucket_i];
6464 				while(cursor){
6465 					char * junckey = (char *) cursor -> key;
6466 
6467 					void * globval = HashTableGet(junction_global_table, junckey);
6468 					char * new_key = malloc(strlen(junckey)+1);
6469 					strcpy(new_key, junckey);
6470 					globval += (cursor -> value - NULL);
6471 					HashTablePut(junction_global_table, new_key, globval);
6472 						// new_key will be freed when it is replaced next time or when the global table is destroyed.
6473 
6474 					cursor = cursor->next;
6475 				}
6476 			}
6477 
6478 			for(bucket_i = 0 ; bucket_i < global_context -> thread_contexts[xk1].splicing_point_table -> numOfBuckets; bucket_i++){
6479 				KeyValuePair * cursor;
6480 				cursor = global_context -> thread_contexts[xk1].splicing_point_table -> bucketArray[bucket_i];
6481 				while(cursor){
6482 					char * junckey = (char *) cursor -> key;
6483 					void * globval = HashTableGet(splicing_global_table, junckey);
6484 					char * new_key = malloc(strlen(junckey)+1);
6485 					strcpy(new_key, junckey);
6486 
6487 					//if(xk1>0)
6488 					//SUBREADprintf("MERGE THREAD-%d : %s    VAL=%u, ADD=%u\n", xk1, junckey, globval - NULL, cursor -> value - NULL);
6489 
6490 					globval += (cursor -> value - NULL);
6491 					HashTablePut(splicing_global_table, new_key, globval);
6492 					cursor = cursor->next;
6493 				}
6494 			}
6495 		}
6496 	}
6497 
6498 
6499 
6500 	if(0 == global_context -> is_input_bad_format){
6501 
6502 		if(global_context -> is_paired_end_reads_expected){
6503 			if(global_context -> is_mixed_PE_SE)
6504 					print_in_box(80,0,0,"   WARNING: Single-end reads were found%s.", global_context -> is_strand_checked?" and excluded":"");
6505 			else print_in_box(80,0,0,"   Paired-end reads are included.");
6506 			if(!global_context -> is_paired_end_mode_assign)
6507 				print_in_box(80,0,0, "   The reads are assigned on the single-end mode.");
6508 		}else{
6509 			// paired-end reads in a single-end lib will result in error.
6510 			print_in_box(80,0,0,"   Single-end reads are included.");
6511 		}
6512 
6513 		char pct_str[10];
6514 		if(total_input_reads>0)
6515 			sprintf(pct_str,"(%.1f%%%%)", (*nreads_mapped_to_exon)*100./total_input_reads);
6516 		else	pct_str[0]=0;
6517 
6518 		int show_summary = 1;
6519 		if(global_context -> assign_reads_to_RG){
6520 			if(RGmerged_table -> numOfElements)
6521 				print_in_box(80,0,0,"   Total read groups : %ld", RGmerged_table -> numOfElements);
6522 			else{
6523 				print_in_box(80,0,0,"   No read groups are found; no output is generated.");
6524 				show_summary = 0;
6525 			}
6526 		}
6527 		if(show_summary){
6528 			print_in_box(80,0,0,"   Total alignments : %llu", total_input_reads);
6529 			print_in_box(pct_str[0]?81:80,0,0,"   Successfully assigned alignments : %llu %s", *nreads_mapped_to_exon,pct_str);
6530 		}
6531 		print_in_box(80,0,0,"   Running time : %.2f minutes", (miltime() - global_context -> start_time)/60);
6532 		print_in_box(80,0,0,"");
6533 	}
6534 	return ret;
6535 }
6536 
/*
 * Write the directory part of `out` into `tmp`.
 *
 *   - `out` contains no '/' : `tmp` becomes "./".
 *   - otherwise             : `tmp` receives every character of `out` that
 *                             precedes the LAST '/', NUL-terminated. An `out`
 *                             whose only '/' is its first character therefore
 *                             yields an empty string.
 *
 * `tmp` must be large enough for the result; no bound is checked here.
 */
void get_temp_dir_from_out(char * tmp, char * out){
	char * last_sep = strrchr(out, '/');

	if(last_sep != NULL){
		size_t dir_len = (size_t)(last_sep - out);
		memcpy(tmp, out, dir_len);
		tmp[dir_len] = '\0';
		return;
	}

	strcpy(tmp, "./");
}
6546 
/*
 * Decide which file will be read as the input and store a heap-allocated copy
 * of its name in *out_ptr.
 *
 *   - global_context->use_stdin_file is set (MAKE_STANDALONE builds only):
 *     everything available on stdin is copied into a temporary ".sam" file
 *     located in global_context->temp_file_dir, and *out_ptr is set to the
 *     name of that file. In builds without MAKE_STANDALONE this branch is
 *     compiled out and *out_ptr is left untouched.
 *   - otherwise: *out_ptr is a copy of `in_fnames`.
 *
 * On allocation failure *out_ptr is NULL. When the temporary file cannot be
 * created or fully written, an error message is printed and *out_ptr still
 * holds the intended file name.
 */
void fc_thread_init_input_files(fc_thread_global_context_t * global_context, char * in_fnames, char ** out_ptr ){
	if(global_context -> use_stdin_file){
		#ifdef MAKE_STANDALONE

		char MAC_or_random[13];

		(*out_ptr) = malloc(MAX_FILE_NAME_LENGTH);
		if(NULL == (*out_ptr)){
			SUBREADprintf("ERROR: out of memory when preparing the temporary file for <STDIN>.\n");
			return;
		}
		mac_or_rand_str(MAC_or_random);
		// Bounded by the size of the buffer that was just allocated.
		snprintf(*out_ptr, MAX_FILE_NAME_LENGTH, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, (unsigned int)getpid(), MAC_or_random);

		SUBREADprintf("\nReading data from <STDIN> for featureCounts ...\n\n");

		FILE * ifp = fopen(*out_ptr,"w");
		if(NULL == ifp){
			SUBREADprintf("ERROR: unable to create the temporary file '%s' for the data from <STDIN>.\n", *out_ptr);
			return;
		}

		while(1){
			char nchar[100];
			size_t rlen = fread(nchar, 1, sizeof nchar, stdin);
			if(rlen == 0) break;	// end of input (or read error)
			if(fwrite(nchar, 1, rlen, ifp) != rlen){
				SUBREADprintf("ERROR: unable to write to the temporary file '%s'.\n", *out_ptr);
				break;
			}
		}
		fclose(ifp);

		#endif
	}else{
		(*out_ptr) = malloc(strlen(in_fnames)+1);
		if(*out_ptr) strcpy((*out_ptr), in_fnames);
	}

}
6576 
/*
 * Release a NULL-terminated array of heap-allocated strings: every element up
 * to (not including) the terminating NULL entry is freed, then the array
 * itself. A NULL `vv` is accepted and ignored, mirroring free().
 */
void fc_NCfree(void * vv){
	char ** cc = vv;
	char ** item;

	if(NULL == cc) return;

	for(item = cc; *item; item++) free(*item);
	free(cc);
}
6583 
/*
 * HashTableIteration callback for the parsed sample sheet (it is registered
 * in fc_thread_init_global_context).
 *
 *   key        : pushed onto global_context->scRNA_sample_id_to_name; the list
 *                length after the push is used as this sample's one-based number.
 *   hashed_obj : ArrayList whose items are read as char*[3] triples:
 *                  [0] an integer carried in a pointer (called "lane" in the
 *                      original debug output),
 *                  [1] the sample barcode string,
 *                  [2] a line number in the sheet, carried in a pointer.
 *   tab        : the table being iterated; tab->appendix1 is the global context.
 *
 * For every triple a new char*[3] is pushed onto scRNA_sample_barcode_list
 * holding { [0] as-is, sample number, barcode }, and the line number is mapped
 * to the sample number in scRNA_lineno1B_to_sampleno1B_tab.
 * Integers are stored in pointers using the file's `NULL + n` idiom.
 */
void scRNA_convert_ss_to_arr( void * key, void * hashed_obj, HashTable * tab ){
	fc_thread_global_context_t * global_context = tab->appendix1;
	ArrayList * sheet_rows = hashed_obj;

	ArrayListPush(global_context -> scRNA_sample_id_to_name, key);

	// One-based sample number; the name list does not grow again in this call.
	void * sample_no_1B = NULL + global_context -> scRNA_sample_id_to_name -> numOfElements;
	sheet_rows -> appendix1 = sample_no_1B;

	srInt_64 row_i;
	for(row_i = 0; row_i < sheet_rows -> numOfElements; row_i++){
		char ** sheet_row = ArrayListGet(sheet_rows, row_i);
		srInt_64 lane_sample_int = sheet_row[0] - (char*)NULL;
		int line_no_in_sheet = sheet_row[2] - (char*)NULL;

		char ** barcode_entry = malloc(sizeof(char*)*3);
		ArrayListPush(global_context -> scRNA_sample_barcode_list, barcode_entry);
		barcode_entry[0] = NULL + lane_sample_int;
		barcode_entry[1] = sample_no_1B;
		barcode_entry[2] = sheet_row[1]; // Sample Barcode

		HashTablePut(global_context -> scRNA_lineno1B_to_sampleno1B_tab, NULL + line_no_in_sheet, sample_no_1B);
	}
}
6607 
6608 
scRNA_cell_barcode_tabel_destroy(void * a)6609 void scRNA_cell_barcode_tabel_destroy(void *a){
6610 	if(((a-NULL) & 0xfffffffff0000000llu ) ==IMPOSSIBLE_MEMORY_SPACE )return;
6611 	ArrayListDestroy((ArrayList*)a);
6612 }
6613 
/*
 * Build global_context->scRNA_cell_barcode_head_tail_table from
 * global_context->scRNA_cell_barcodes_array.
 *
 * For the barcode at index i the table receives:
 *   - the full barcode string  -> i + IMPOSSIBLE_MEMORY_SPACE, stored in the
 *     pointer value (not a real allocation);
 *   - "F" + characters at even positions -> ArrayList that gets i appended;
 *   - "S" + characters at odd positions  -> ArrayList that gets i appended.
 * Keys are strdup'ed copies owned by the table (freed with free()); values are
 * released by scRNA_cell_barcode_tabel_destroy.
 *
 * The first barcode length seen is recorded in known_cell_barcode_length
 * (when that field is still 0); all barcodes are asserted to share it.
 *
 * The scratch key is heap-allocated from the actual barcode length, so long
 * barcodes can no longer overrun a fixed-size stack buffer.
 */
void scRNA_make_barcode_HT_table( fc_thread_global_context_t * global_context ){
	int xx1,xx2;
	global_context -> scRNA_cell_barcode_head_tail_table = StringTableCreate(600000);
	HashTableSetDeallocationFunctions( global_context -> scRNA_cell_barcode_head_tail_table, free, scRNA_cell_barcode_tabel_destroy);

	for(xx1=0;xx1 < global_context-> scRNA_cell_barcodes_array -> numOfElements; xx1++){
		char * bc = ArrayListGet(global_context-> scRNA_cell_barcodes_array, xx1);
		int bcl =strlen(bc);
		if(global_context -> known_cell_barcode_length==0) global_context -> known_cell_barcode_length=bcl;
		// Variable-length barcodes are not supported.
		assert(bcl==global_context -> known_cell_barcode_length);

		// One prefix letter + half of the barcode + terminating NUL.
		int half_len = bcl/2;
		char * bctmp = malloc(half_len + 2);
		if(NULL == bctmp){
			SUBREADprintf("ERROR: out of memory when building the cell barcode table.\n");
			return;
		}

		HashTablePut(global_context -> scRNA_cell_barcode_head_tail_table, strdup(bc), NULL+xx1+IMPOSSIBLE_MEMORY_SPACE);
		for(xx2=0; xx2<2; xx2++){
			int xx3;
			bctmp[0] = xx2?'S':'F';
			// xx2 == 0 takes positions 0,2,4,...; xx2 == 1 takes positions 1,3,5,...
			for(xx3 = 0; xx3 < half_len; xx3++)
				bctmp[xx3+1] = bc[ xx3*2+xx2 ];
			bctmp[half_len+1]=0;

			ArrayList * array_of_codes = HashTableGet(global_context -> scRNA_cell_barcode_head_tail_table, bctmp);
			if(!array_of_codes){
				array_of_codes = ArrayListCreate(4);
				HashTablePut(global_context -> scRNA_cell_barcode_head_tail_table, strdup(bctmp), array_of_codes);
			}
			ArrayListPush(array_of_codes, NULL+xx1);
		}
		free(bctmp);
	}
}
6645 
6646 
scRNA_close_sample_SamBam_writers(void * v)6647 void scRNA_close_sample_SamBam_writers(void *v){
6648 	void ** vv = v;
6649 	simple_bam_writer * wtr = vv[0];
6650 	simple_bam_close(wtr);
6651 
6652 	if(vv[1]){
6653 		parallel_gzip_writer_t* gzfp = vv[1];
6654 		parallel_gzip_writer_close(gzfp);
6655 
6656 		gzfp = vv[2];
6657 		parallel_gzip_writer_close(gzfp);
6658 
6659 		gzfp = vv[3];
6660 		parallel_gzip_writer_close(gzfp);
6661 	}
6662 
6663 	pthread_spinlock_t * gz_lock = vv[4];
6664 	pthread_spin_destroy(gz_lock);
6665 	free(gz_lock);
6666 
6667 	free(vv);
6668 }
6669 
6670 #define SORT_BAM_FROM_SCRNA 1
/*
 * Hash-table iteration callback that creates the output writers of one sample.
 *
 *   k   : sample name (char*); the output files are "<name>.bam" and, in
 *         GENE_INPUT_BCL / GENE_INPUT_SCRNA_BAM input modes,
 *         "<name>_R1.fastq.gz", "<name>_I1.fastq.gz" and "<name>_R2.fastq.gz".
 *   v   : not used.
 *   tab : appendix1 = table that receives the writer bundle,
 *         appendix2 = global context,
 *         appendix3 = list of sample names (position + 1 is the table key).
 *
 * The bundle is a void*[6]: { BAM writer, R1 gzip, I1 gzip, R2 gzip, spinlock,
 * NULL }; the gzip slots are NULL in the other input modes.
 *
 * File names are formatted with snprintf so an over-long sample name is
 * truncated instead of overrunning `fname`. The spinlock is only created once
 * the sample has been located in the name list, so it no longer leaks when the
 * name is absent.
 * NOTE(review): when the name is absent the writers created above are still
 * left open and unreferenced, as before - confirm that this cannot happen.
 */
void scRNA_sample_SamBam_writers_new_files(void *k, void *v, HashTable * tab){
	HashTable * fp_tab = tab -> appendix1;
	fc_thread_global_context_t * global_context = tab -> appendix2;
	ArrayList * scRNA_sample_id_to_name = tab -> appendix3;

	char * samplename = k;
	char fname [MAX_FILE_NAME_LENGTH+20];
	snprintf(fname, sizeof fname, "%s.bam", samplename);
	simple_bam_writer * wtr = simple_bam_create(fname);
	parallel_gzip_writer_t * gzipR1fq=NULL, * gzipI1fq=NULL, * gzipR2fq=NULL;

	if(global_context -> scRNA_input_mode == GENE_INPUT_BCL || global_context -> scRNA_input_mode == GENE_INPUT_SCRNA_BAM){
		gzipR1fq = calloc(1, sizeof(parallel_gzip_writer_t));
		gzipI1fq = calloc(1, sizeof(parallel_gzip_writer_t));
		gzipR2fq = calloc(1, sizeof(parallel_gzip_writer_t));
		snprintf(fname, sizeof fname, "%s_R1.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipR1fq, fname, global_context -> thread_number);
		snprintf(fname, sizeof fname, "%s_I1.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipI1fq, fname, global_context -> thread_number);
		snprintf(fname, sizeof fname, "%s_R2.fastq.gz", samplename);
		parallel_gzip_writer_init(gzipR2fq, fname, global_context -> thread_number);
	}

	int x1;
	for(x1=0; x1<scRNA_sample_id_to_name -> numOfElements; x1++){
		char * sample_name = ArrayListGet( scRNA_sample_id_to_name, x1 );
		if(strcmp(sample_name, samplename)!=0) continue;

		pthread_spinlock_t * gzfp_lock = malloc(sizeof(pthread_spinlock_t));
		pthread_spin_init(gzfp_lock, PTHREAD_PROCESS_PRIVATE);

		void ** wtrptr = malloc(sizeof(void*)*6);
		wtrptr[0]=wtr;
		wtrptr[1]=gzipR1fq;
		wtrptr[2]=gzipI1fq;
		wtrptr[3]=gzipR2fq;
		wtrptr[4]=gzfp_lock;
		wtrptr[5]=NULL;
		// The key is the one-based sample number stored in a pointer.
		HashTablePut(fp_tab, NULL+x1+1 , wtrptr);
		break;
	}
}
6713 
/*
 * Reset `global_context` (memset to zero) and fill it from the arguments.
 *
 * Most parameters are copied verbatim into the context field of the matching
 * name. The points that are not a plain copy:
 *   - buffer_size is stored as max_BAM_header_size; is_sam_out as
 *     is_read_details_out; is_not_sort as do_not_sort.
 *   - String arguments strand_check_mode, cmd_rebuilt, debug_command and
 *     extra_column_names are stored by pointer (not copied); the file names and
 *     column names are copied into the context with strcpy.
 *   - need_calculate_fragment_len / need_calculate_overlap_len are derived from
 *     the overlap thresholds.
 *   - output_file_path is the part of output_fname before its last '/', or "."
 *     when that part is empty or there is no '/'.
 *   - temp_file_dir is temp_dir, or the directory of output_fname when temp_dir
 *     is NULL. It is resolved BEFORE the scRNA temporary files are created, so
 *     those files are placed in a valid directory even when temp_dir is NULL.
 *   - alias_file_name, when non-empty, is loaded into BAM_chros_to_anno_table.
 *   - When scRNA_sample_sheet is non-NULL, the sample sheet (and the cell
 *     barcode list, if given) is parsed, and scRNA_barcode_batched_bin_no + 2
 *     temporary ".bin" files plus one spinlock per file are created.
 *   - is_input_file_resort_needed and pair_orientations are not referenced here.
 *
 * The random generator is seeded from the current time as a side effect.
 */
void fc_thread_init_global_context(fc_thread_global_context_t * global_context, unsigned int buffer_size, unsigned short threads, int line_length, int min_pe_dist, int max_pe_dist, int is_gene_level, int is_overlap_allowed, char * strand_check_mode, char * output_fname, int is_sam_out, int is_both_end_required, int is_chimertc_disallowed, int is_PE_distance_checked, char *feature_name_column, char * gene_id_column, int min_map_qual_score, int is_multi_mapping_allowed, int is_SAM, char * alias_file_name, char * cmd_rebuilt, int 
is_input_file_resort_needed, int feature_block_size, int isCVersion, int fiveEndExtension,  int threeEndExtension, int minFragmentOverlap, int is_split_or_exonic_only, int reduce_5_3_ends_to_one, char * debug_command, int is_duplicate_ignored, int is_not_sort, int use_fraction_multimapping, int useOverlappingBreakTie, char * pair_orientations, int do_junction_cnt, int max_M, int isRestrictlyNoOvelrapping, float fracOverlap, char * temp_dir, int use_stdin_file, int assign_reads_to_RG, int long_read_minimum_length, int is_verbose, float frac_feature_overlap, int do_detection_call, int max_missing_bases_in_read, int max_missing_bases_in_feature, int is_primary_alignment_only, char * Rpath, char * extra_column_names , char * annotation_file_screen_output, int read_shift_type, int read_shift_size, char * scRNA_sample_sheet, char * scRNA_cell_barcode_list, int is_scRNA_BAM_FQ_out_generated, int scRNA_input_mode, int scRNA_rerun_on_persample_BAM, float scRNA_umi_cutoff) {
	int x1;
	myrand_srand(time(NULL));

	memset(global_context, 0, sizeof(fc_thread_global_context_t));
	global_context -> max_BAM_header_size = buffer_size;
	global_context -> all_reads = 0;
	global_context -> redo = 0;
	global_context -> read_details_out_FP = NULL;

	// The temporary directory must be known before any temporary file is
	// created below; temp_dir itself may be NULL.
	if(temp_dir == NULL)get_temp_dir_from_out(global_context -> temp_file_dir, output_fname);
	else strcpy(global_context -> temp_file_dir, temp_dir);

	global_context -> reported_extra_columns = extra_column_names;
	global_context -> isCVersion = isCVersion;
	global_context -> is_read_details_out = is_sam_out;
	global_context -> is_multi_overlap_allowed = is_overlap_allowed;
	global_context -> restricted_no_multi_overlap = isRestrictlyNoOvelrapping;
	global_context -> is_gene_level = is_gene_level;
	global_context -> strand_check_mode = strand_check_mode;
	global_context -> is_both_end_required = is_both_end_required;
	global_context -> is_chimertc_disallowed = is_chimertc_disallowed;
	global_context -> is_PE_distance_checked = is_PE_distance_checked;
	global_context -> is_multi_mapping_allowed = is_multi_mapping_allowed;
	global_context -> is_primary_alignment_only = is_primary_alignment_only;
	global_context -> is_split_or_exonic_only = is_split_or_exonic_only;
	global_context -> is_duplicate_ignored = is_duplicate_ignored;
	global_context -> use_stdin_file = use_stdin_file;
	global_context -> assign_reads_to_RG = assign_reads_to_RG;
	global_context -> long_read_minimum_length = long_read_minimum_length;
	global_context -> is_verbose = is_verbose;
	global_context -> do_detection_call = do_detection_call;

	global_context -> reduce_5_3_ends_to_one = reduce_5_3_ends_to_one;
	global_context -> do_not_sort = is_not_sort;
	global_context -> is_SAM_file = is_SAM;
	global_context -> use_fraction_multi_mapping = use_fraction_multimapping;
	global_context -> do_junction_counting = do_junction_cnt;

	global_context -> thread_number = threads;
	global_context -> min_mapping_quality_score = min_map_qual_score;
	global_context -> unistr_buffer_size = 1024*1024*2;
	global_context -> unistr_buffer_used = 0;
	global_context -> unistr_buffer_space = malloc(global_context -> unistr_buffer_size);
	global_context -> BAM_chros_to_anno_table = NULL;
	global_context -> cmd_rebuilt = cmd_rebuilt;
	global_context -> feature_block_size = feature_block_size;
	global_context -> five_end_extension = fiveEndExtension;
	global_context -> three_end_extension = threeEndExtension;
	global_context -> read_shift_type = read_shift_type;
	global_context -> read_shift_size = read_shift_size;
	global_context -> fragment_minimum_overlapping = minFragmentOverlap;
	global_context -> fractional_minimum_overlapping = fracOverlap;
	global_context -> fractional_minimum_feature_overlapping = frac_feature_overlap;
	global_context -> max_missing_bases_in_read = max_missing_bases_in_read;
	global_context -> max_missing_bases_in_feature = max_missing_bases_in_feature;
	global_context -> use_overlapping_break_tie = useOverlappingBreakTie;
	global_context -> need_calculate_fragment_len = ( global_context -> fractional_minimum_overlapping > 1E-10 ) || (global_context -> fractional_minimum_feature_overlapping > 1E-10) || ( global_context -> max_missing_bases_in_read >= 0 ) || ( global_context -> max_missing_bases_in_feature >= 0 );
	global_context -> need_calculate_overlap_len = (global_context -> fractional_minimum_overlapping > 1E-10) || (global_context -> fragment_minimum_overlapping > 1) || global_context -> use_overlapping_break_tie || (global_context -> fractional_minimum_feature_overlapping > 1E-10) || ( global_context -> max_missing_bases_in_read >= 0 ) || ( global_context -> max_missing_bases_in_feature >= 0 );
	global_context -> debug_command = debug_command;
	global_context -> max_M = max_M;
	if(scRNA_sample_sheet){
		global_context -> scRNA_umi_cutoff = scRNA_umi_cutoff;
		global_context -> do_scRNA_table = 1;
		global_context -> scRNA_sample_id_to_name = ArrayListCreate(64);
		global_context -> scRNA_lineno1B_to_sampleno1B_tab = HashTableCreate(10);
		strcpy(global_context->scRNA_sample_sheet,scRNA_sample_sheet);
		global_context-> scRNA_sample_sheet_table = input_BLC_parse_SampleSheet( global_context->scRNA_sample_sheet );
		// The iteration callback reaches the context through appendix1.
		global_context-> scRNA_sample_sheet_table -> appendix1 = global_context;
		global_context -> scRNA_sample_barcode_list = ArrayListCreate(64);
		ArrayListSetDeallocationFunction(global_context -> scRNA_sample_barcode_list, free);
		HashTableIteration(global_context-> scRNA_sample_sheet_table, scRNA_convert_ss_to_arr);

		if(scRNA_cell_barcode_list){
			strcpy(global_context->scRNA_cell_barcode_list,scRNA_cell_barcode_list);
			global_context-> scRNA_cell_barcodes_array = input_BLC_parse_CellBarcodes( global_context->scRNA_cell_barcode_list );
			scRNA_make_barcode_HT_table( global_context );
		}
		global_context -> is_scRNA_BAM_FQ_out_generated = is_scRNA_BAM_FQ_out_generated;
		global_context -> scRNA_input_mode = scRNA_input_mode;
		global_context -> scRNA_rerun_on_persample_BAM = scRNA_rerun_on_persample_BAM;
		global_context -> scRNA_barcode_batched_bin_no = 149;
		pthread_spin_init(&global_context -> scRNA_do_one_batch_runner_lock, PTHREAD_PROCESS_PRIVATE);
		// Two more bins (and locks) than scRNA_barcode_batched_bin_no are created.
		global_context -> scRNA_barcode_batched_locks = malloc(sizeof(pthread_spinlock_t)*(global_context -> scRNA_barcode_batched_bin_no+2));
		global_context -> scRNA_barcode_batched_bins = malloc(sizeof(FILE*)*(global_context -> scRNA_barcode_batched_bin_no+2));
		for(x1=0; x1<global_context -> scRNA_barcode_batched_bin_no+2; x1++){
			char tmp_fname[MAX_FILE_NAME_LENGTH+20];
			// Uses the resolved directory: the raw temp_dir argument may be NULL.
			snprintf(tmp_fname, sizeof tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", global_context -> temp_file_dir, getpid(), x1);
			global_context -> scRNA_barcode_batched_bins[x1]=fopen(tmp_fname, "wb");
			if(NULL == global_context -> scRNA_barcode_batched_bins[x1])
				SUBREADprintf("ERROR: unable to create the temporary file '%s'.\n", tmp_fname);
			pthread_spin_init(global_context -> scRNA_barcode_batched_locks+x1, PTHREAD_PROCESS_PRIVATE);
		}
	}else{
		global_context -> do_scRNA_table = 0;
		global_context-> scRNA_cell_barcodes_array = NULL;
		global_context-> scRNA_sample_sheet_table = NULL;
	}

	global_context -> read_counters.unassigned_ambiguous=0;
	global_context -> read_counters.unassigned_nofeatures=0;
	global_context -> read_counters.unassigned_overlapping_length=0;
	global_context -> read_counters.unassigned_unmapped=0;
	global_context -> read_counters.unassigned_read_type=0;
	global_context -> read_counters.unassigned_singleton=0;
	global_context -> read_counters.unassigned_mappingquality=0;
	global_context -> read_counters.unassigned_fragmentlength=0;
	global_context -> read_counters.unassigned_chimericreads=0;
	global_context -> read_counters.unassigned_multimapping=0;
	global_context -> read_counters.unassigned_secondary=0;
	global_context -> read_counters.unassigned_junction_condition=0;
	global_context -> read_counters.unassigned_duplicate=0;
	global_context -> read_counters.assigned_reads=0;

	global_context -> GCcontent_table = HashTableCreate(20000);
	HashTableSetHashFunction(global_context -> GCcontent_table, HashTableStringHashFunction);
	HashTableSetDeallocationFunctions(global_context -> GCcontent_table, free, free);
	HashTableSetKeyComparisonFunction(global_context -> GCcontent_table, fc_strcmp_chro);

	if(annotation_file_screen_output) strcpy(global_context -> annotation_file_screen_output, annotation_file_screen_output);
	else global_context ->annotation_file_screen_output[0]=0;

	if(alias_file_name && alias_file_name[0])
	{
		strcpy(global_context -> alias_file_name,alias_file_name);
		global_context -> BAM_chros_to_anno_table = load_alias_table(alias_file_name);
	}
	else	global_context -> alias_file_name[0]=0;

	global_context -> read_details_path[0]=0;
	if(Rpath)strcpy(global_context -> read_details_path, Rpath);

	strcpy(global_context -> feature_name_column,feature_name_column);
	strcpy(global_context -> gene_id_column,gene_id_column);
	strcpy(global_context -> output_file_name, output_fname);

	// output_file_path = everything before the last '/' of output_fname.
	global_context -> output_file_path[0]=0;
	for( x1 = strlen(output_fname)-1; x1 >= 0; x1 --){
		if(output_fname[x1]=='/'){
			memcpy(global_context -> output_file_path, output_fname, x1);
			global_context -> output_file_path[x1]=0;
			break;
		}
	}
	if(0 == global_context -> output_file_path[0]){
		strcpy(global_context -> output_file_path, ".");
	}

	global_context -> min_paired_end_distance = min_pe_dist;
	global_context -> max_paired_end_distance = max_pe_dist;
	global_context -> line_length = line_length;
}
6870 
6871 
6872 
fc_thread_start_threads(fc_thread_global_context_t * global_context,int et_exons,int * et_geneid,char ** et_chr,srInt_64 * et_start,srInt_64 * et_stop,unsigned char * et_strand,char * et_anno_chr_2ch,char ** et_anno_chrs,srInt_64 * et_anno_chr_heads,srInt_64 * et_bk_end_index,srInt_64 * et_bk_min_start,srInt_64 * et_bk_max_end,int read_length)6873 int fc_thread_start_threads(fc_thread_global_context_t * global_context, int et_exons, int * et_geneid, char ** et_chr, srInt_64 * et_start, srInt_64 * et_stop, unsigned char * et_strand, char * et_anno_chr_2ch, char ** et_anno_chrs, srInt_64 * et_anno_chr_heads, srInt_64 * et_bk_end_index, srInt_64 * et_bk_min_start, srInt_64 * et_bk_max_end, int read_length)
6874 {
6875 	int xk1;
6876 
6877 	global_context -> read_length = read_length;
6878 	global_context -> is_unpaired_warning_shown = 0;
6879 	global_context -> is_stake_warning_shown = 0;
6880 	global_context -> is_read_too_long_to_SAM_BAM_shown = 0;
6881 
6882 	if(global_context -> is_read_details_out)
6883 	{
6884 		char tmp_fname[MAX_FILE_NAME_LENGTH+20], *modified_fname;
6885 		int i=0;
6886 		char * applied_detail_path = global_context -> output_file_path;
6887 		if(global_context -> read_details_path[0]) applied_detail_path = global_context -> read_details_path;
6888 
6889 		if( global_context -> input_file_unique ){
6890 			sprintf(tmp_fname, "%s/%s.featureCounts%s", applied_detail_path, global_context -> input_file_short_name, global_context -> is_read_details_out == FILE_TYPE_BAM?".bam":(global_context -> is_read_details_out == FILE_TYPE_SAM?".sam":""));
6891 			global_context -> read_details_out_FP = f_subr_open(tmp_fname, "w");
6892 			//SUBREADprintf("FCSSF=%s\n", tmp_fname);
6893 		} else {
6894 			sprintf(tmp_fname, "%s.featureCounts%s", global_context -> raw_input_file_name, global_context -> is_read_details_out == FILE_TYPE_BAM?".bam":(global_context -> is_read_details_out == FILE_TYPE_SAM?".sam":""));
6895 			modified_fname = tmp_fname;
6896 			while(modified_fname[0]=='/' || modified_fname[0]=='.' || modified_fname[0]=='\\'){
6897 				modified_fname ++;
6898 			}
6899 			while(modified_fname[i]){
6900 				if(modified_fname[i]=='\\' || modified_fname[i]=='/'||modified_fname[i]==' ')modified_fname[i]='.';
6901 				i++;
6902 			}
6903 			char tmp_fname2[MAX_FILE_NAME_LENGTH*2+100];
6904 			sprintf(tmp_fname2, "%s/%s", applied_detail_path, modified_fname);
6905 			global_context -> read_details_out_FP = f_subr_open(tmp_fname2, "w");
6906 			//SUBREADprintf("FCSSF=%s\n", tmp_fname2);
6907 		}
6908 		if(global_context -> read_details_out_FP){
6909 			pthread_spin_init(&global_context -> read_details_out_lock, 1);
6910 		}else{
6911 			SUBREADprintf("Unable to create file '%s'; the read assignment details are not written.\n", tmp_fname);
6912 		}
6913 	}
6914 	else
6915 		global_context -> read_details_out_FP = NULL;
6916 
6917 	global_context -> redo = 0;
6918 	global_context -> exontable_geneid = et_geneid;
6919 	global_context -> exontable_chr = et_chr;
6920 	global_context -> exontable_start = et_start;
6921 	global_context -> exontable_stop = et_stop;
6922 	global_context -> exontable_strand = (char *)et_strand;
6923 	global_context -> exontable_anno_chr_2ch = et_anno_chr_2ch;
6924 	global_context -> exontable_anno_chrs = et_anno_chrs;
6925 	global_context -> exontable_anno_chr_heads = et_anno_chr_heads;
6926 	global_context -> exontable_block_end_index = et_bk_end_index;
6927 	global_context -> exontable_block_max_end = et_bk_max_end;
6928 	global_context -> exontable_block_min_start = et_bk_min_start;
6929 	global_context -> sambam_chro_table_items = 0;
6930 	global_context -> sambam_chro_table = NULL;
6931 
6932 	global_context -> thread_contexts = malloc(sizeof(fc_thread_thread_context_t) * global_context -> thread_number);
6933 	for(xk1=0; xk1<global_context -> thread_number; xk1++)
6934 	{
6935 	//	printf("CHRR_MALLOC\n");
6936 		global_context -> thread_contexts[xk1].thread_id = xk1;
6937 		global_context -> thread_contexts[xk1].chunk_read_ptr = 0;
6938 		global_context -> thread_contexts[xk1].count_table = calloc(sizeof(read_count_type_t), et_exons);
6939 		global_context -> thread_contexts[xk1].count_table_size = et_exons;
6940 		global_context -> thread_contexts[xk1].nreads_mapped_to_exon = 0;
6941 		global_context -> thread_contexts[xk1].all_reads = 0;
6942 		global_context -> thread_contexts[xk1].chro_name_buff = malloc(CHROMOSOME_NAME_LENGTH);
6943 
6944 		global_context -> thread_contexts[xk1].read_counters.assigned_reads = 0;
6945 		global_context -> thread_contexts[xk1].read_counters.unassigned_ambiguous = 0;
6946 		global_context -> thread_contexts[xk1].read_counters.unassigned_nofeatures = 0;
6947 		global_context -> thread_contexts[xk1].read_counters.unassigned_unmapped = 0;
6948 		global_context -> thread_contexts[xk1].read_counters.unassigned_singleton = 0;
6949 		global_context -> thread_contexts[xk1].read_counters.unassigned_read_type = 0;
6950 		global_context -> thread_contexts[xk1].read_counters.unassigned_mappingquality = 0;
6951 		global_context -> thread_contexts[xk1].read_counters.unassigned_fragmentlength = 0;
6952 		global_context -> thread_contexts[xk1].read_counters.unassigned_chimericreads = 0;
6953 		global_context -> thread_contexts[xk1].read_counters.unassigned_multimapping = 0;
6954 		global_context -> thread_contexts[xk1].read_counters.unassigned_secondary = 0;
6955 		global_context -> thread_contexts[xk1].read_counters.unassigned_junction_condition = 0;
6956 		global_context -> thread_contexts[xk1].read_counters.unassigned_overlapping_length = 0;
6957 		global_context -> thread_contexts[xk1].read_counters.unassigned_duplicate = 0;
6958 		global_context -> thread_contexts[xk1].read_details_buff_used = 0;
6959 		global_context -> thread_contexts[xk1].hits_number_capacity = 300 ;
6960 
6961 		global_context -> thread_contexts[xk1].hits_start_pos1 = malloc(sizeof(int)* global_context -> thread_contexts[xk1].hits_number_capacity);
6962 		global_context -> thread_contexts[xk1].hits_start_pos2 = malloc(sizeof(int)* global_context -> thread_contexts[xk1].hits_number_capacity);
6963 		global_context -> thread_contexts[xk1].hits_length1 = malloc(sizeof(short)* global_context -> thread_contexts[xk1].hits_number_capacity);
6964 		global_context -> thread_contexts[xk1].hits_length2 = malloc(sizeof(short)* global_context -> thread_contexts[xk1].hits_number_capacity);
6965 		global_context -> thread_contexts[xk1].hits_chro1 = malloc(sizeof(char*)* global_context -> thread_contexts[xk1].hits_number_capacity);
6966 		global_context -> thread_contexts[xk1].hits_chro2 = malloc(sizeof(char*)* global_context -> thread_contexts[xk1].hits_number_capacity);
6967 		global_context -> thread_contexts[xk1].hits_indices1 = malloc(sizeof(srInt_64)* global_context -> thread_contexts[xk1].hits_number_capacity);
6968 		global_context -> thread_contexts[xk1].hits_indices2 = malloc(sizeof(srInt_64)* global_context -> thread_contexts[xk1].hits_number_capacity);
6969 
6970 		global_context -> thread_contexts[xk1].scoring_buff_numbers = malloc(sizeof(int)* global_context -> thread_contexts[xk1].hits_number_capacity * 2);
6971 		global_context -> thread_contexts[xk1].scoring_buff_flags = malloc(sizeof(int)* global_context -> thread_contexts[xk1].hits_number_capacity * 2);
6972 		global_context -> thread_contexts[xk1].scoring_buff_overlappings = malloc(sizeof(int)* global_context -> thread_contexts[xk1].hits_number_capacity * 2);
6973 		global_context -> thread_contexts[xk1].scoring_buff_exon_ids =malloc(sizeof(srInt_64)* global_context -> thread_contexts[xk1].hits_number_capacity * 2);
6974 
6975 		if(global_context -> read_details_out_FP){
6976 			global_context -> thread_contexts[xk1].read_details_buff = malloc(70000 + 2 * MAX_FC_READ_LENGTH * 3);
6977 			global_context -> thread_contexts[xk1].bam_compressed_buff = malloc(70000 + 2 * MAX_FC_READ_LENGTH * 3);
6978 		}
6979 
6980 		if(global_context -> need_calculate_overlap_len){
6981 			global_context -> thread_contexts[xk1].scoring_buff_gap_chros = malloc( sizeof(char *) * global_context -> thread_contexts[xk1].hits_number_capacity * 2 * global_context -> max_M *2);
6982 			global_context -> thread_contexts[xk1].scoring_buff_gap_starts = malloc( sizeof(unsigned int ) * global_context -> thread_contexts[xk1].hits_number_capacity * 2 * global_context -> max_M *2);
6983 			global_context -> thread_contexts[xk1].scoring_buff_gap_lengths = malloc( sizeof(unsigned short) * global_context -> thread_contexts[xk1].hits_number_capacity * 2 * global_context -> max_M *2);
6984 		} else global_context -> thread_contexts[xk1].scoring_buff_gap_chros = NULL;
6985 
6986 		if(global_context -> do_junction_counting)
6987 		{
6988 			global_context -> thread_contexts[xk1].junction_counting_table = HashTableCreate(131317);
6989 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].junction_counting_table,HashTableStringHashFunction);
6990 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].junction_counting_table, free, NULL);
6991 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].junction_counting_table, fc_strcmp_chro);
6992 
6993 			global_context -> thread_contexts[xk1].splicing_point_table = HashTableCreate(131317);
6994 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].splicing_point_table,HashTableStringHashFunction);
6995 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].splicing_point_table, free, NULL);
6996 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].splicing_point_table, fc_strcmp_chro);
6997 		}
6998 
6999 		if(global_context -> assign_reads_to_RG){
7000 			global_context -> thread_contexts[xk1].RG_table = HashTableCreate(97);
7001 			HashTableSetHashFunction(global_context -> thread_contexts[xk1].RG_table,HashTableStringHashFunction);
7002 			HashTableSetDeallocationFunctions(global_context -> thread_contexts[xk1].RG_table, free, disallocate_RG_tables);
7003 			HashTableSetKeyComparisonFunction(global_context -> thread_contexts[xk1].RG_table, fc_strcmp_chro);
7004 		}
7005 
7006 		if(global_context -> do_scRNA_table){
7007 			global_context -> thread_contexts[xk1].scRNA_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
7008 			global_context -> thread_contexts[xk1].scRNA_mapped_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
7009 			global_context -> thread_contexts[xk1].scRNA_assigned_reads_per_sample = calloc(sizeof(srInt_64),global_context-> scRNA_sample_sheet_table ->numOfElements);
7010 			global_context -> thread_contexts[xk1].scRNA_sample_bc_tables = malloc(sizeof(HashTable*) * global_context -> scRNA_sample_id_to_name -> numOfElements);
7011 			global_context -> thread_contexts[xk1].scRNA_registered_UMI_table = StringTableCreate(100000);
7012 			HashTableSetDeallocationFunctions(global_context  -> thread_contexts[xk1].scRNA_registered_UMI_table, free, NULL);
7013 			int xk2;
7014 			for(xk2 = 0; xk2 < global_context -> scRNA_sample_id_to_name -> numOfElements; xk2++){
7015 				HashTable *al = HashTableCreate(2000);
7016 				HashTableSetDeallocationFunctions(al, NULL, (void (*)(void*))HashTableDestroy);
7017 				//SUBREADprintf("PUSH ARR for THR %d XK2 %d\n", xk1, xk2);
7018 				global_context -> thread_contexts[xk1].scRNA_sample_bc_tables[xk2] = al;
7019 			}
7020 			global_context -> thread_contexts[xk1].scRNA_pooled_reads=0;
7021 			global_context -> thread_contexts[xk1].scRNA_has_valid_sample_index  =0;
7022 			global_context -> thread_contexts[xk1].scRNA_has_valid_cell_barcode  =0;
7023 		}
7024 
7025 		if(!global_context ->  thread_contexts[xk1].count_table) return 1;
7026 	}
7027 
7028 	char new_fn[MAX_FILE_NAME_LENGTH+10];
7029 	char MAC_or_random[13];
7030 	mac_or_rand_str(MAC_or_random);
7031 	char rand_prefix[MAX_FILE_NAME_LENGTH+100];
7032 	sprintf(rand_prefix, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, getpid(), MAC_or_random);
7033 	if(global_context -> use_stdin_file) sprintf(new_fn, "<%s",  global_context -> input_file_name );
7034 	else sprintf(new_fn, "%s",  global_context -> input_file_name );
7035 
7036 	//#warning " ===================== REMOVE ' 0 && ' FROM NEXT LINE !!!!!! =================="
7037 	SAM_pairer_create(&global_context -> read_pairer, global_context -> thread_number , global_context -> max_BAM_header_size/1024/1024+2, !global_context-> is_SAM_file, !( global_context -> is_read_details_out == FILE_TYPE_BAM ||global_context -> is_read_details_out == FILE_TYPE_SAM ) , !global_context -> is_paired_end_mode_assign, global_context ->is_paired_end_mode_assign && global_context -> do_not_sort, global_context -> assign_reads_to_RG ,0, new_fn, process_pairer_reset, process_pairer_header, process_pairer_output, rand_prefix, global_context,  global_context -> long_read_minimum_length);
7038 
7039 	return 0;
7040 }
7041 
/*
 * Tears down what was set up for the worker threads: flushes and closes the
 * per-read details output (when enabled), releases the buffers and hash
 * tables owned by every entry of global_context->thread_contexts, and finally
 * frees the thread_contexts array itself.
 */
void fc_thread_destroy_thread_context(fc_thread_global_context_t * global_context)
{
	int thr;

	/* Let every thread hand over what is still pending for the read-details output. */
	if(global_context->is_read_details_out){
		for(thr = 0; thr < global_context->thread_number; thr++)
			write_read_detailed_remainder(global_context, global_context->thread_contexts + thr);
	}

	if(global_context->is_read_details_out){
		if(FILE_TYPE_BAM == global_context->is_read_details_out){
			/* compress_read_detail_BAM() is called with zeroed arguments and its
			 * output is appended as the last block of the file (presumably the
			 * BAM end-of-file block -- confirm against compress_read_detail_BAM). */
			char bam_tail_block[1000];
			int tail_size = compress_read_detail_BAM(global_context, global_context->thread_contexts, 0, 0, bam_tail_block);
			assert(tail_size > 0);
			fwrite(bam_tail_block, 1, tail_size, global_context->read_details_out_FP);
		}
		fclose(global_context->read_details_out_FP);
		global_context->read_details_out_FP = NULL;
		pthread_spin_destroy(&global_context->read_details_out_lock);
	}

	for(thr = 0; thr < global_context->thread_number; thr++){
		/* Count table, chromosome-name buffer and the per-read hit arrays. */
		free(global_context->thread_contexts[thr].count_table);
		free(global_context->thread_contexts[thr].chro_name_buff);
		free(global_context->thread_contexts[thr].hits_start_pos1);
		free(global_context->thread_contexts[thr].hits_start_pos2);
		free(global_context->thread_contexts[thr].hits_length1);
		free(global_context->thread_contexts[thr].hits_length2);
		free(global_context->thread_contexts[thr].hits_chro1);
		free(global_context->thread_contexts[thr].hits_chro2);
		free(global_context->thread_contexts[thr].hits_indices1);
		free(global_context->thread_contexts[thr].hits_indices2);

		/* Scoring scratch buffers. */
		free(global_context->thread_contexts[thr].scoring_buff_numbers);
		free(global_context->thread_contexts[thr].scoring_buff_flags);
		free(global_context->thread_contexts[thr].scoring_buff_overlappings);
		free(global_context->thread_contexts[thr].scoring_buff_exon_ids);

		/* The three gap buffers are released together; the chros pointer
		 * decides whether they are treated as allocated. */
		if(global_context->thread_contexts[thr].scoring_buff_gap_chros){
			free(global_context->thread_contexts[thr].scoring_buff_gap_chros);
			free(global_context->thread_contexts[thr].scoring_buff_gap_starts);
			free(global_context->thread_contexts[thr].scoring_buff_gap_lengths);
		}

		if(global_context->do_junction_counting){
			HashTableDestroy(global_context->thread_contexts[thr].junction_counting_table);
			HashTableDestroy(global_context->thread_contexts[thr].splicing_point_table);
		}

		if(global_context->assign_reads_to_RG)
			HashTableDestroy(global_context->thread_contexts[thr].RG_table);

		if(global_context->is_read_details_out){
			free(global_context->thread_contexts[thr].read_details_buff);
			free(global_context->thread_contexts[thr].bam_compressed_buff);
		}

		if(global_context->do_scRNA_table){
			int smp;
			/* One barcode table per sample, then the per-sample counters. */
			for(smp = 0; smp < global_context->scRNA_sample_id_to_name->numOfElements; smp++)
				HashTableDestroy(global_context->thread_contexts[thr].scRNA_sample_bc_tables[smp]);

			free(global_context->thread_contexts[thr].scRNA_reads_per_sample);
			free(global_context->thread_contexts[thr].scRNA_mapped_reads_per_sample);
			free(global_context->thread_contexts[thr].scRNA_assigned_reads_per_sample);
			free(global_context->thread_contexts[thr].scRNA_sample_bc_tables);
			HashTableDestroy(global_context->thread_contexts[thr].scRNA_registered_UMI_table);
		}
	}

	free(global_context->thread_contexts);
}
/*
 * Runs the read pairer stored in global_context->read_pairer and folds its
 * return code into global_context->is_input_bad_format (bitwise OR, so a
 * flag that is already set stays set).
 *
 * The former "format error found in this file" message was guarded by a
 * constant-false condition (0 && ...) and could never be printed; the
 * unreachable branch has been removed.
 */
void fc_thread_wait_threads(fc_thread_global_context_t * global_context)
{
	int pairer_ret = SAM_pairer_run(&global_context -> read_pairer);
	global_context -> is_input_bad_format |= pairer_ret;
}
7120 
/*
 * Collapses a ';'-prefixed list of identical fields into a single field.
 *
 * cols is expected to look like ";v1;v2;...;vn". When every field is equal to
 * the first one, and that field is not empty, the string is truncated in
 * place to ";v1". In every other case -- including a string that does not
 * start with ';' -- cols is left untouched.
 */
void merge_repeated_extra_columns(char * cols){
	if(cols[0] != ';') return;

	/* Measure the first field, which starts right after the leading ';'. */
	const char * first_field = cols + 1;
	int first_len = 0;
	while(first_field[first_len] != ';' && first_field[first_len] != 0)
		first_len++;

	/* An empty first field never leads to a change. */
	if(first_len == 0) return;

	/* Walk the remaining fields; stop at the first one that differs. */
	const char * cursor = first_field + first_len;
	while(*cursor == ';'){
		const char * field = cursor + 1;
		int field_len = 0;
		while(field[field_len] != ';' && field[field_len] != 0)
			field_len++;

		/* Compare lengths first so memcmp never reads past a shorter field. */
		if(field_len != first_len) return;
		if(memcmp(field, first_field, first_len) != 0) return;

		cursor = field + field_len;
	}

	/* All fields matched: keep only ";<first field>". */
	cols[first_len + 1] = 0;
}
7141 
/*
 * Appends src to a string under construction using a cached write position,
 * so the target does not have to be rescanned on every append.
 *
 * targ : start of the destination buffer; only used when *buf is NULL.
 * src  : NUL-terminated string to append.
 * buf  : in/out cursor. NULL on entry means "start writing at targ"; on
 *        return it points at the terminating NUL that was just written.
 *
 * No bounds checking is done: the caller must guarantee room for
 * strlen(src)+1 bytes at the write position.
 */
void BUFstrcat(char * targ, char * src, char ** buf){
	char * write_at = (*buf != NULL) ? *buf : targ;
	int src_len = strlen(src);

	memcpy(write_at, src, src_len);
	write_at[src_len] = 0;
	*buf = write_at + src_len;
}
7151 
/*
 * Writes the gene (meta-feature) level count table to out_file.
 *
 * The features are regrouped by gene id (et_geneid), then one row is written
 * per gene containing:
 *   - the gene name, and the GC-fraction column when do_detection_call is set;
 *   - ';'-joined lists of chromosome, start, end and strand of its features,
 *     grouped by (chromosome, strand);
 *   - a length, summed over the intervals returned by mergeIntervals() for
 *     each (chromosome, strand) group;
 *   - the extra annotation columns (identical repeated values are collapsed
 *     by merge_repeated_extra_columns());
 *   - one count per entry of column_names, summed over the gene's features.
 *
 * Parameters
 *   et_geneid, et_chr, et_start, et_stop, et_strand, et_extra_columns:
 *       per-feature arrays with `features` entries. The strings in
 *       et_extra_columns are tokenised in place with strtok_r(), i.e. their
 *       tab separators are overwritten.
 *   column_numbers: one srInt_64 array (indexed by feature) per input file.
 *   column_names:   the names written in the header, one per input file.
 *   loaded_features: not referenced by this function.
 *   header_out:     non-zero to emit the "# Program:" comment line.
 *
 * On a write failure, or when an extra-column buffer cannot be grown, an
 * error is reported through SUBREADprintf() and out_file is removed.
 */
void fc_write_final_gene_results(fc_thread_global_context_t * global_context, int * et_geneid, char ** et_chr, srInt_64 * et_start, srInt_64 * et_stop, unsigned char * et_strand, char ** et_extra_columns, const char * out_file, int features, ArrayList * column_numbers, ArrayList * column_names, fc_feature_info_t * loaded_features, int header_out)
{
	int xk1,xk4;
	int genes = global_context -> gene_name_table -> numOfElements;
	read_count_type_t *gene_columns;

	FILE * fp_out = f_subr_open(out_file,"w");
	if(!fp_out){
		SUBREADprintf("Failed to create file %s\n", out_file);
		return;
	}

	if(header_out)
	{
		fprintf(fp_out, "# Program:featureCounts v%s", SUBREAD_VERSION);
		if(global_context->cmd_rebuilt)
			fprintf(fp_out, "; Command:%s", global_context->cmd_rebuilt);
		fprintf(fp_out, "\n");
	}

	int i_files;
	fprintf(fp_out,"Geneid\t%sChr\tStart\tEnd\tStrand\tLength%s%s", global_context->do_detection_call?"GCfraction\t":"", global_context -> reported_extra_columns?"\t":"", global_context -> reported_extra_columns?global_context -> reported_extra_columns:"");
	for(i_files=0; i_files<column_names->numOfElements; i_files++)
	{
		char * next_fn = ArrayListGet(column_names, i_files);
		fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}

	fprintf(fp_out,"\n");

	gene_columns = calloc(sizeof(read_count_type_t) , genes * column_names->numOfElements);
	unsigned int * gene_exons_number = calloc(sizeof(unsigned int) , genes);
	unsigned int * gene_exons_pointer = calloc(sizeof(unsigned int) , genes);
	unsigned int * gene_exons_start = malloc(sizeof(unsigned int) * features);
	unsigned int * gene_exons_end = malloc(sizeof(unsigned int) * features);
	char ** gene_exons_chr = malloc(sizeof(char *) * features);
	char ** gene_exons_extra_columns = malloc(sizeof(char *) * features);
	char * gene_exons_strand = malloc(features);

	/* Pass 1: count the features of every gene. */
	for(xk1 = 0; xk1 < features; xk1++)
	{
		int gene_id = et_geneid[xk1];
		gene_exons_number[gene_id]++;
	}

	/* Turn the counts into start offsets (exclusive prefix sum) and remember
	 * the largest gene so the scratch buffers can be sized once. */
	unsigned int accumulative_no = 0;
	unsigned longest_gene_exons = 0;
	for(xk1 = 0 ; xk1 < genes; xk1++)
	{
		unsigned int this_gene_exons = gene_exons_number[xk1];
		longest_gene_exons = max(longest_gene_exons, this_gene_exons);
		gene_exons_number[xk1] = accumulative_no;
		accumulative_no += this_gene_exons;
	}

	/* Pass 2: scatter the features into per-gene contiguous ranges.
	 * gene_exons_pointer[g] ends up holding the feature count of gene g. */
	for(xk1 = 0; xk1 < features; xk1++)
	{
		int gene_id = et_geneid[xk1];
		int gene_write_ptr = gene_exons_number[gene_id] + gene_exons_pointer[gene_id];

		gene_exons_chr[gene_write_ptr] = et_chr[xk1];
		gene_exons_start[gene_write_ptr] = et_start[xk1];
		gene_exons_end[gene_write_ptr] = et_stop[xk1];
		gene_exons_strand[gene_write_ptr] = et_strand[xk1];
		if(global_context -> reported_extra_columns!=NULL)gene_exons_extra_columns[gene_write_ptr] = et_extra_columns[xk1];

		gene_exons_pointer[gene_id]++;
	}

	/* Sum the per-feature counts of every input file into per-gene counts. */
	for(xk1 = 0; xk1 < features; xk1++)
	{
		int gene_id = et_geneid[xk1];
		for(i_files=0;i_files < column_names->numOfElements; i_files++)
		{
			srInt_64 * this_col = ArrayListGet(column_numbers, i_files);
			gene_columns[gene_id * column_names->numOfElements + i_files ] += this_col[xk1];
		}
	}


	char *is_occupied = malloc(longest_gene_exons);
	unsigned int * input_start_stop_list = malloc(longest_gene_exons * sizeof(int) * 2);
	unsigned int * output_start_stop_list = malloc(longest_gene_exons * sizeof(int) * 2);
	int disk_is_full = 0;
	int out_of_memory = 0;

	/* Worst-case sizes: up to 10 digits plus ';' per coordinate, one
	 * character plus ';' per strand, name plus ';' per chromosome. */
	char * out_chr_list = malloc(longest_gene_exons * (1+global_context -> longest_chro_name) + 1), * tmp_chr_list = NULL;
	char * out_start_list = malloc(11 * longest_gene_exons + 1), * tmp_start_list = NULL;
	char * out_end_list = malloc(11 * longest_gene_exons + 1), * tmp_end_list = NULL;
	char * out_strand_list = malloc(2 * longest_gene_exons + 1), * tmp_strand_list = NULL;

	char * out_extra_columns[MAX_EXTRA_COLS];
	int out_extra_column_size[MAX_EXTRA_COLS];
	int total_extra_cols = 0;
	if(global_context -> reported_extra_columns){
		/* The number of extra columns is one more than the number of tabs
		 * found after the first character of the header text. */
		char * tnamep = global_context -> reported_extra_columns;
		total_extra_cols =1;
		while(*(tnamep++))
			total_extra_cols += '\t' ==(*tnamep);
		for(xk1=0; xk1<total_extra_cols; xk1++){
			out_extra_columns[xk1] = malloc(220);
			out_extra_column_size[xk1] = 220;
		}
	}


	for(xk1 = 0 ; xk1 < genes; xk1++)
	{
		int xk2;

		memset(is_occupied,0,gene_exons_pointer[xk1]);
		tmp_chr_list = NULL;
		tmp_start_list = NULL;
		tmp_end_list = NULL;
		tmp_strand_list = NULL;
		out_chr_list[0]=0;
		out_start_list[0]=0;
		out_end_list[0]=0;
		out_strand_list[0]=0;
		for(xk4=0; xk4<total_extra_cols; xk4++)
			out_extra_columns[xk4][0]=0;
		int gene_nonoverlap_len =0;

		unsigned char * gene_symbol = global_context -> gene_name_array [xk1];
		for(xk2=0; xk2<gene_exons_pointer[xk1]; xk2++)
		{
			if(!is_occupied[xk2])
			{
				/* Feature xk2 opens a new (chromosome, strand) group; every
				 * later unclaimed feature of the same group is collected. */
				int xk3;
				char * matched_chr = gene_exons_chr[xk2 + gene_exons_number[xk1]];
				char matched_strand = gene_exons_strand[xk2 + gene_exons_number[xk1]];

				memset(input_start_stop_list, 0, gene_exons_pointer[xk1] * sizeof(int) * 2);
				int gap_merge_ptr = 1;
				input_start_stop_list[0] = gene_exons_start[xk2 + gene_exons_number[xk1]];
				input_start_stop_list[1] = gene_exons_end[xk2 + gene_exons_number[xk1]] + 1;

				for(xk3 = xk2; xk3 < gene_exons_pointer[xk1]; xk3++)
				{
					if( global_context -> reported_extra_columns &&  (xk3==xk2 || (0 == is_occupied[xk3] && strcmp(matched_chr, gene_exons_chr[xk3+gene_exons_number[xk1]])==0 && matched_strand == gene_exons_strand[xk3 + gene_exons_number[xk1]] ))){
						char * this_col_ptr = NULL;
						char * this_col = strtok_r(gene_exons_extra_columns[xk3+gene_exons_number[xk1]], "\t", &this_col_ptr);
						for(xk4 = 0; xk4 < total_extra_cols; xk4++){
							/* strtok_r() returns NULL once the tokens run out;
							 * treat a missing column as an empty value rather
							 * than passing NULL to strlen()/sprintf(). */
							if(!this_col) this_col = "";

							int exlen = strlen( this_col), ollen = strlen(out_extra_columns[xk4]);
							if(ollen + exlen +2 > out_extra_column_size[xk4]){
								/* Grow to exactly what ";<value>\0" needs, and
								 * keep the old buffer reachable on failure. */
								int grown_size = ollen + exlen + 2;
								char * grown = realloc(out_extra_columns[xk4], grown_size);
								if(!grown){
									out_of_memory = 1;
									goto finish;
								}
								out_extra_columns[xk4] = grown;
								out_extra_column_size[xk4] = grown_size;
							}
							sprintf(out_extra_columns[xk4]+ollen,";%s", this_col);
							this_col = strtok_r(NULL, "\t", &this_col_ptr);
						}
					}

					if(xk3==xk2)continue;

					if((!is_occupied[xk3]) && strcmp(matched_chr, gene_exons_chr[xk3+gene_exons_number[xk1]])==0 && matched_strand == gene_exons_strand[xk3 + gene_exons_number[xk1]])
					{
						is_occupied[xk3]=1;
						input_start_stop_list[gap_merge_ptr*2] = gene_exons_start[xk3+gene_exons_number[xk1]];
						input_start_stop_list[gap_merge_ptr*2+1] = gene_exons_end[xk3+gene_exons_number[xk1]]+1;

						gap_merge_ptr++;
					}
				}

				{
						int merged_gaps = mergeIntervals(input_start_stop_list, output_start_stop_list, gap_merge_ptr);

						for(xk3=0; xk3<gap_merge_ptr; xk3++)
						{
							char numbbuf[12];
							BUFstrcat(out_chr_list, matched_chr, &tmp_chr_list);
							BUFstrcat(out_chr_list, ";", &tmp_chr_list);

							snprintf(numbbuf, sizeof(numbbuf), "%u;", input_start_stop_list[xk3 * 2]);
							BUFstrcat(out_start_list, numbbuf, &tmp_start_list);
							/* Ends were stored +1 (half-open) for merging. */
							snprintf(numbbuf, sizeof(numbbuf), "%u;", input_start_stop_list[xk3 * 2 + 1] - 1);
							BUFstrcat(out_end_list, numbbuf, &tmp_end_list);
							snprintf(numbbuf, sizeof(numbbuf), "%c;", (matched_strand==1)?'-':( ( matched_strand==0 )? '+':'.'));
							BUFstrcat(out_strand_list, numbbuf, &tmp_strand_list);

						}
						for(xk3=0; xk3<merged_gaps; xk3++)
							gene_nonoverlap_len += output_start_stop_list[xk3 * 2 + 1] - output_start_stop_list[xk3 * 2];
				}
			}
		}
		/* Kept as a macro with its original definition because it stays
		 * defined for the remainder of the translation unit. */
		#define _cut_tail(x) (x)[strlen(x)-1]=0

		/* Drop the trailing ';'. A gene without any feature leaves the lists
		 * empty, in which case there is nothing to cut (and strlen()-1 would
		 * index before the buffer). */
		if(out_chr_list[0]) _cut_tail(out_chr_list);
		if(out_start_list[0]) _cut_tail(out_start_list);
		if(out_end_list[0]) _cut_tail(out_end_list);
		if(out_strand_list[0]) _cut_tail(out_strand_list);

		char * QCcontent = "";
		char * QCtab = "";
		if(global_context -> do_detection_call){
			QCcontent = HashTableGet(global_context -> GCcontent_table, gene_symbol);
			QCtab = "\t";
			if(!QCcontent)QCcontent="nan";
		}

		int wlen = fprintf(fp_out, "%s\t%s%s%s\t%s\t%s\t%s\t%d", gene_symbol, QCcontent, QCtab, out_chr_list, out_start_list, out_end_list, out_strand_list, gene_nonoverlap_len);
		for(xk4 = 0; xk4<total_extra_cols; xk4++){
			merge_repeated_extra_columns(out_extra_columns[xk4]);
			/* +1 skips the leading ';' of the accumulated list. */
			fprintf(fp_out, "\t%s", out_extra_columns[xk4]+1);
		}

		for(i_files=0; i_files< column_names->numOfElements; i_files++)
		{
			read_count_type_t longlong_res = 0;
			double double_res = 0;
			int is_double_number = calc_float_fraction(gene_columns[i_files + column_names->numOfElements*xk1], &longlong_res, &double_res);
			if(is_double_number){
				fprintf(fp_out,"\t%.2f", double_res);
			}else{
				#ifdef __MINGW32__
				fprintf(fp_out,"\t%I64d", (srInt_64)longlong_res);
				#else
				fprintf(fp_out,"\t%lld", (srInt_64)longlong_res);
				#endif
			}
		}
		fprintf(fp_out,"\n");
		if(wlen < 6)disk_is_full = 1;
	}

finish:
	for(xk1=0; xk1<total_extra_cols; xk1++) free(out_extra_columns[xk1]);
	free(is_occupied);
	free(input_start_stop_list);
	free(output_start_stop_list);
	free(out_chr_list);
	free(out_strand_list);
	free(out_start_list);
	free(out_end_list);

	free(gene_exons_number);
	free(gene_exons_pointer);
	free(gene_columns);
	free(gene_exons_chr);
	free(gene_exons_extra_columns);
	free(gene_exons_start);
	free(gene_exons_end);
	free(gene_exons_strand);

	/* The stream is buffered, so a full disk usually only shows up in the
	 * stream error flag or when fclose() flushes the remaining data. */
	if(ferror(fp_out)) disk_is_full = 1;
	if(fclose(fp_out) != 0) disk_is_full = 1;

	if(out_of_memory){
		SUBREADprintf("ERROR: out of memory; the count file cannot be generated.\n");
		unlink(out_file);
	}else if(disk_is_full){
		SUBREADprintf("ERROR: disk is full; the count file cannot be generated.\n");
		unlink(out_file);
	}
}
7403 
/*
 * Writes the assignment summary to "<out_file>.summary".
 *
 * The file holds a header row ("Status" followed by one column per entry of
 * column_names, or "STDIN" when input is read from stdin) and one row per
 * status key. For input file i, row r prints element r of the srInt_64 array
 * stored at read_counters[i], so those arrays must follow the order of the
 * `keys` table below.
 *
 * isCVersion is not referenced by this function.
 *
 * On a write failure an error is reported and out_file is removed.
 */
void fc_write_final_counts(fc_thread_global_context_t * global_context, const char * out_file, ArrayList * column_names, ArrayList * read_counters, int isCVersion)
{
	char fname[MAX_FILE_NAME_LENGTH];
	int i_files, xk1, disk_is_full = 0;

	/* Bounded formatting: out_file plus ".summary" may not fit in fname. */
	int fname_len = snprintf(fname, sizeof(fname), "%s.summary", out_file);
	if(fname_len < 0 || (size_t)fname_len >= sizeof(fname)){
		SUBREADprintf("Unable to create summary file '%s.summary'\n", out_file);
		return;
	}

	FILE * fp_out = f_subr_open(fname,"w");

	if(!fp_out){
		SUBREADprintf("Unable to create summary file '%s'\n", fname);
		return;
	}

	fprintf(fp_out,"Status");

	for(i_files=0; i_files<column_names->numOfElements; i_files++)
	{
		char * next_fn = ArrayListGet(column_names, i_files);
		fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}

	fprintf(fp_out,"\n");
	char * keys [] ={ "Assigned" ,  "Unassigned_Unmapped", "Unassigned_Read_Type", "Unassigned_Singleton", "Unassigned_MappingQuality", "Unassigned_Chimera", "Unassigned_FragmentLength", "Unassigned_Duplicate", "Unassigned_MultiMapping" , "Unassigned_Secondary",  (global_context->is_split_or_exonic_only == 2)?"Unassigned_Split":"Unassigned_NonSplit", "Unassigned_NoFeatures", "Unassigned_Overlapping_Length", "Unassigned_Ambiguity"};
	/* Derive the row count from the table so the two cannot drift apart. */
	const int n_keys = (int)(sizeof(keys) / sizeof(keys[0]));

	for(xk1=0; xk1<n_keys; xk1++)
	{
		fprintf(fp_out,"%s", keys[xk1]);
		for(i_files = 0; i_files < column_names->numOfElements; i_files ++)
		{
			srInt_64 * array_0 = ArrayListGet(read_counters,i_files);
			srInt_64 * cntr = array_0 + xk1;
			#ifdef __MINGW32__
			fprintf(fp_out,"\t%I64d", (srInt_64)*cntr);
			#else
			fprintf(fp_out,"\t%lld", (srInt_64)*cntr);
			#endif
		}
		int wlen = fprintf(fp_out,"\n");
		if(wlen < 1)disk_is_full = 1;
	}

	/* The stream is buffered, so a full disk usually only shows up in the
	 * stream error flag or when fclose() flushes the remaining data. */
	if(ferror(fp_out)) disk_is_full = 1;
	if(fclose(fp_out) != 0) disk_is_full = 1;

	if(disk_is_full){
		SUBREADprintf("ERROR: disk is full; the count file cannot be generated.\n");
		/* NOTE(review): this removes the count file (out_file), not the
		 * summary file written here -- kept as in the original; confirm intent. */
		unlink(out_file);
	}

}
/*
 * Writes the feature-level count table to out_file.
 *
 * Output layout:
 *   - optional "# Program:" comment line (when header_out is non-zero);
 *   - a header row: Geneid, Chr, Start, End, Strand, Length, the extra
 *     annotation columns (if any) and one column per entry of column_names;
 *   - one row per entry of loaded_features, in array order. The count for
 *     input file f is read from column_numbers[f] at the feature's
 *     sorted_order index.
 *
 * On a write failure an error is reported and out_file is removed.
 */
void fc_write_final_results(fc_thread_global_context_t * global_context, const char * out_file, int features, ArrayList* column_numbers, ArrayList * column_names,fc_feature_info_t * loaded_features, int header_out)
{
	/* save the results */
	FILE * fp_out;
	int i, i_files = 0, disk_is_full =0;
	fp_out = f_subr_open(out_file,"w");
	if(!fp_out){
		SUBREADprintf("Failed to create file %s\n", out_file);
		return;
	}

	if(header_out)
	{
		fprintf(fp_out, "# Program:featureCounts v%s", SUBREAD_VERSION);
		if(global_context->cmd_rebuilt)
			fprintf(fp_out, "; Command:%s", global_context->cmd_rebuilt);
		fprintf(fp_out, "\n");
	}

	char * next_fn;
	fprintf(fp_out,"Geneid\tChr\tStart\tEnd\tStrand\tLength");
	if(global_context -> reported_extra_columns)fprintf(fp_out,"\t%s", global_context -> reported_extra_columns);

	for(i_files = 0; i_files < column_names -> numOfElements; i_files++){
		next_fn = ArrayListGet(column_names, i_files);
		fprintf(fp_out,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}
	fprintf(fp_out,"\n");
	for(i=0;i<features;i++)
	{
		/* Feature and chromosome names are stored in unistr_buffer_space;
		 * the chromosome name is addressed relative to the feature name. */
		fprintf(fp_out,"%s\t%s\t%u\t%u\t%c\t%d%s%s", global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos,
 							   global_context -> unistr_buffer_space + loaded_features[i].feature_name_pos + loaded_features[i].chro_name_pos_delta,
						   	   loaded_features[i].start, loaded_features[i].end, loaded_features[i].is_negative_strand == 1?'-':(  loaded_features[i].is_negative_strand ==  0? '+':'.'),
							loaded_features[i].end-loaded_features[i].start+1, global_context -> reported_extra_columns ?"\t":"", global_context -> reported_extra_columns ?loaded_features[i].extra_columns:"");
		for(i_files=0; i_files < column_names -> numOfElements; i_files++)
		{
			srInt_64 * this_list = ArrayListGet(column_numbers, i_files);
			int sorted_exon_no = loaded_features[i].sorted_order;
			srInt_64 count_frac_raw = this_list[sorted_exon_no], longlong_res = 0;

			double double_res = 0;
			int is_double_number = calc_float_fraction(count_frac_raw, &longlong_res, &double_res);
			if(is_double_number){
				fprintf(fp_out,"\t%.2f", double_res);
			}else{
				#ifdef __MINGW32__
				fprintf(fp_out,"\t%I64d", (srInt_64)longlong_res);
				#else
				fprintf(fp_out,"\t%lld", (srInt_64)longlong_res);
				#endif
			}
		}
		int wlen = fprintf(fp_out,"\n");
		if(wlen < 1)disk_is_full = 1;
	}

	/* The stream is buffered, so a full disk usually only shows up in the
	 * stream error flag or when fclose() flushes the remaining data. */
	if(ferror(fp_out)) disk_is_full = 1;
	if(fclose(fp_out) != 0) disk_is_full = 1;

	if(disk_is_full){
		SUBREADprintf("ERROR: disk is full; unable to write into the output file.\n");
		unlink(out_file);
	}
}
7518 
/*
 * Table of long command-line options, in the struct option layout
 * { name, has_arg, flag, val }.
 *
 * Every entry has a NULL flag. Entries with val 0 can only be told apart
 * through the option index/name, so the order of the rows must not change.
 * "order" and "genome" carry the values 'S' and 'G'.
 *
 * NOTE(review): "readExtension5" is listed twice. The duplicate is kept on
 * purpose here because removing it would shift the index of every later
 * entry; confirm how the parser dispatches before dropping it.
 */
static struct option long_options[] =
{
	{ "primary",               no_argument,       NULL, 0   },
	{ "readShiftSize",         required_argument, NULL, 0   },
	{ "readShiftType",         required_argument, NULL, 0   },
	{ "readExtension5",        required_argument, NULL, 0   },
	{ "readExtension5",        required_argument, NULL, 0   },
	{ "readExtension3",        required_argument, NULL, 0   },
	{ "read2pos",              required_argument, NULL, 0   },
	{ "minOverlap",            required_argument, NULL, 0   },
	{ "fracOverlap",           required_argument, NULL, 0   },
	{ "nonOverlap",            required_argument, NULL, 0   },
	{ "nonOverlapFeature",     required_argument, NULL, 0   },
	{ "fracOverlapFeature",    required_argument, NULL, 0   },
	{ "splitOnly",             no_argument,       NULL, 0   },
	{ "nonSplitOnly",          no_argument,       NULL, 0   },
	{ "debugCommand",          required_argument, NULL, 0   },
	{ "ignoreDup",             no_argument,       NULL, 0   },
	{ "donotsort",             no_argument,       NULL, 0   },
	{ "restrictedlyNoOverlap", no_argument,       NULL, 0   },
	{ "fraction",              no_argument,       NULL, 0   },
	{ "order",                 required_argument, NULL, 'S' },
	{ "genome",                required_argument, NULL, 'G' },
	{ "maxMOp",                required_argument, NULL, 0   },
	{ "tmpDir",                required_argument, NULL, 0   },
	{ "extraAttributes",       required_argument, NULL, 0   },
	{ "largestOverlap",        no_argument,       NULL, 0   },
	{ "countReadPairs",        no_argument,       NULL, 0   },
	{ "byReadGroup",           no_argument,       NULL, 0   },
	{ "verbose",               no_argument,       NULL, 0   },
	{ "detectionCall",         no_argument,       NULL, 0   },
	{ "Rpath",                 required_argument, NULL, 0   },
	{ "scSampleSheet",         required_argument, NULL, 0   },
	{ "scInputMode",           required_argument, NULL, 0   },
	{ "scCellBarcodeFile",     required_argument, NULL, 0   },
	{ NULL, 0, NULL, 0 }
};
7556 
print_usage()7557 void print_usage()
7558 {
7559 	SUBREADprintf("\nVersion %s\n\n", SUBREAD_VERSION);
7560 
7561 	SUBREADputs("Usage: featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... \n");
7562 	SUBREADputs("## Mandatory arguments:");
7563 	SUBREADputs("");
7564 	SUBREADputs("  -a <string>         Name of an annotation file. GTF/GFF format by default. See");
7565 	SUBREADputs("                      -F option for more format information. Inbuilt annotations");
7566 	SUBREADputs("                      (SAF format) is available in 'annotation' directory of the");
7567 	SUBREADputs("                      package. Gzipped file is also accepted.");
7568 	SUBREADputs("");
7569 	SUBREADputs("  -o <string>         Name of output file including read counts. A separate file");
7570 	SUBREADputs("                      including summary statistics of counting results is also");
7571 	SUBREADputs("                      included in the output ('<string>.summary'). Both files");
7572 	SUBREADputs("                      are in tab delimited format.");
7573 	SUBREADputs("");
7574 	SUBREADputs("  input_file1 [input_file2] ...   A list of SAM or BAM format files. They can be");
7575 	SUBREADputs("                      either name or location sorted. If no files provided,");
7576 	SUBREADputs("                      <stdin> input is expected. Location-sorted paired-end reads");
7577 	SUBREADputs("                      are automatically sorted by read names.");
7578 	SUBREADputs("");
7579 
7580 	SUBREADputs("## Optional arguments:");
7581 	SUBREADputs("# Annotation");
7582 	SUBREADputs("");
7583 	SUBREADputs("  -F <string>         Specify format of the provided annotation file. Acceptable");
7584 	SUBREADputs("                      formats include 'GTF' (or compatible GFF format) and");
7585 	SUBREADputs("                      'SAF'. 'GTF' by default.  For SAF format, please refer to");
7586 	SUBREADputs("                      Users Guide.");
7587 	SUBREADputs("");
7588 	SUBREADputs("  -t <string>         Specify feature type(s) in a GTF annotation. If multiple");
7589 	SUBREADputs("                      types are provided, they should be separated by ',' with");
7590 	SUBREADputs("                      no space in between. 'exon' by default. Rows in the");
7591 	SUBREADputs("                      annotation with a matched feature will be extracted and");
7592 	SUBREADputs("                      used for read mapping. ");
7593 	SUBREADputs("");
7594 	SUBREADputs("  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by ");
7595 	SUBREADputs("                      default. Meta-features used for read counting will be ");
7596 	SUBREADputs("                      extracted from annotation using the provided value.");
7597 	SUBREADputs("");
7598 	SUBREADputs("  --extraAttributes   Extract extra attribute types from the provided GTF");
7599 	SUBREADputs("                      annotation and include them in the counting output. These");
7600 	SUBREADputs("                      attribute types will not be used to group features. If");
7601 	SUBREADputs("                      more than one attribute type is provided they should be");
7602 	SUBREADputs("                      separated by comma.");
7603 	SUBREADputs("");
7604 	SUBREADputs("  -A <string>         Provide a chromosome name alias file to match chr names in");
7605 	SUBREADputs("                      annotation with those in the reads. This should be a two-");
7606 	SUBREADputs("                      column comma-delimited text file. Its first column should");
7607 	SUBREADputs("                      include chr names in the annotation and its second column");
7608 	SUBREADputs("                      should include chr names in the reads. Chr names are case");
7609 	SUBREADputs("                      sensitive. No column header should be included in the");
7610 	SUBREADputs("                      file.");
7611 	SUBREADputs("");
7612 
7613 	SUBREADputs("# Level of summarization");
7614 	SUBREADputs("");
7615 	SUBREADputs("  -f                  Perform read counting at feature level (eg. counting ");
7616 	SUBREADputs("                      reads for exons rather than genes).");
7617 	SUBREADputs("");
7618 
7619 	SUBREADputs("# Overlap between reads and features");
7620 	SUBREADputs("");
7621 	SUBREADputs("  -O                  Assign reads to all their overlapping meta-features (or ");
7622 	SUBREADputs("                      features if -f is specified).");
7623 	SUBREADputs("");
7624 	SUBREADputs("  --minOverlap <int>  Minimum number of overlapping bases in a read that is");
7625 	SUBREADputs("                      required for read assignment. 1 by default. Number of");
7626 	SUBREADputs("                      overlapping bases is counted from both reads if paired");
7627 	SUBREADputs("                      end. If a negative value is provided, then a gap of up");
7628 	SUBREADputs("                      to specified size will be allowed between read and the");
7629 	SUBREADputs("                      feature that the read is assigned to.");
7630 	SUBREADputs("");
7631 	SUBREADputs("  --fracOverlap <float> Minimum fraction of overlapping bases in a read that is");
7632 	SUBREADputs("                      required for read assignment. Value should be within range");
7633 	SUBREADputs("                      [0,1]. 0 by default. Number of overlapping bases is");
7634 	SUBREADputs("                      counted from both reads if paired end. Both this option");
7635 	SUBREADputs("                      and '--minOverlap' option need to be satisfied for read");
7636 	SUBREADputs("                      assignment.");
7637 	SUBREADputs("");
7638 	SUBREADputs("  --fracOverlapFeature <float> Minimum fraction of overlapping bases in a");
7639 	SUBREADputs("                      feature that is required for read assignment. Value");
7640 	SUBREADputs("                      should be within range [0,1]. 0 by default.");
7641 	SUBREADputs("");
7642 	SUBREADputs("  --largestOverlap    Assign reads to a meta-feature/feature that has the ");
7643 	SUBREADputs("                      largest number of overlapping bases.");
7644 	SUBREADputs("");
7645 	SUBREADputs("  --nonOverlap <int>  Maximum number of non-overlapping bases in a read (or a");
7646 	SUBREADputs("                      read pair) that is allowed when being assigned to a");
7647 	SUBREADputs("                      feature. No limit is set by default.");
7648 	SUBREADputs("");
7649 	SUBREADputs("  --nonOverlapFeature <int> Maximum number of non-overlapping bases in a feature");
7650 	SUBREADputs("                      that is allowed in read assignment. No limit is set by");
7651 	SUBREADputs("                      default.");
7652 	SUBREADputs("");
7653 	SUBREADputs("  --readExtension5 <int> Reads are extended upstream by <int> bases from their");
7654 	SUBREADputs("                      5' end.");
7655 	SUBREADputs("");
7656 	SUBREADputs("  --readExtension3 <int> Reads are extended upstream by <int> bases from their");
7657 	SUBREADputs("                      3' end.");
7658 	SUBREADputs("");
7659 	SUBREADputs("  --read2pos <5:3>    Reduce reads to their 5' most base or 3' most base. Read");
7660 	SUBREADputs("                      counting is then performed based on the single base the ");
7661 	SUBREADputs("                      read is reduced to.");
7662 	SUBREADputs("");
7663 
7664 	SUBREADputs("# Multi-mapping reads");
7665 	SUBREADputs("");
7666 	SUBREADputs("  -M                  Multi-mapping reads will also be counted. For a multi-");
7667 	SUBREADputs("                      mapping read, all its reported alignments will be ");
7668 	SUBREADputs("                      counted. The 'NH' tag in BAM/SAM input is used to detect ");
7669 	SUBREADputs("                      multi-mapping reads.");
7670 	SUBREADputs("");
7671 	SUBREADputs("# Fractional counting");
7672 	SUBREADputs("");
7673 	SUBREADputs("  --fraction          Assign fractional counts to features. This option must");
7674 	SUBREADputs("                      be used together with '-M' or '-O' or both. When '-M' is");
7675 	SUBREADputs("                      specified, each reported alignment from a multi-mapping");
7676 	SUBREADputs("                      read (identified via 'NH' tag) will carry a fractional");
7677 	SUBREADputs("                      count of 1/x, instead of 1 (one), where x is the total");
7678 	SUBREADputs("                      number of alignments reported for the same read. When '-O'");
7679 	SUBREADputs("                      is specified, each overlapping feature will receive a");
7680 	SUBREADputs("                      fractional count of 1/y, where y is the total number of");
7681 	SUBREADputs("                      features overlapping with the read. When both '-M' and");
7682 	SUBREADputs("                      '-O' are specified, each alignment will carry a fractional");
7683 	SUBREADputs("                      count of 1/(x*y).");
7684 	SUBREADputs("");
7685 
7686 
7687 	SUBREADputs("# Read filtering");
7688 	SUBREADputs("");
7689 	SUBREADputs("  -Q <int>            The minimum mapping quality score a read must satisfy in");
7690 	SUBREADputs("                      order to be counted. For paired-end reads, at least one");
7691 	SUBREADputs("                      end should satisfy this criteria. 0 by default.");
7692 	SUBREADputs("");
7693 	SUBREADputs("  --splitOnly         Count split alignments only (ie. alignments with CIGAR");
7694 	SUBREADputs("                      string containing 'N'). An example of split alignments is");
7695 	SUBREADputs("                      exon-spanning reads in RNA-seq data.");
7696 	SUBREADputs("");
7697 	SUBREADputs("  --nonSplitOnly      If specified, only non-split alignments (CIGAR strings do");
7698 	SUBREADputs("                      not contain letter 'N') will be counted. All the other");
7699 	SUBREADputs("                      alignments will be ignored.");
7700 	SUBREADputs("");
7701 	SUBREADputs("  --primary           Count primary alignments only. Primary alignments are ");
7702 	SUBREADputs("                      identified using bit 0x100 in SAM/BAM FLAG field.");
7703 	SUBREADputs("");
7704 	SUBREADputs("  --ignoreDup         Ignore duplicate reads in read counting. Duplicate reads ");
7705 	SUBREADputs("                      are identified using bit Ox400 in BAM/SAM FLAG field. The ");
7706 	SUBREADputs("                      whole read pair is ignored if one of the reads is a ");
7707 	SUBREADputs("                      duplicate read for paired end data.");
7708 	SUBREADputs("");
7709 
7710 	SUBREADputs("# Strandness");
7711 	SUBREADputs("");
7712 	SUBREADputs("  -s <int or string>  Perform strand-specific read counting. A single integer");
7713 	SUBREADputs("                      value (applied to all input files) or a string of comma-");
7714 	SUBREADputs("                      separated values (applied to each corresponding input");
7715 	SUBREADputs("                      file) should be provided. Possible values include:");
7716 	SUBREADputs("                      0 (unstranded), 1 (stranded) and 2 (reversely stranded).");
7717 	SUBREADputs("                      Default value is 0 (ie. unstranded read counting carried");
7718 	SUBREADputs("                      out for all input files).");
7719 	SUBREADputs("");
7720 
7721 	SUBREADputs("# Exon-exon junctions");
7722 	SUBREADputs("");
7723 	SUBREADputs("  -J                  Count number of reads supporting each exon-exon junction.");
7724 	SUBREADputs("                      Junctions were identified from those exon-spanning reads");
7725 	SUBREADputs("                      in the input (containing 'N' in CIGAR string). Counting");
7726 	SUBREADputs("                      results are saved to a file named '<output_file>.jcounts'");
7727 	SUBREADputs("");
7728 	SUBREADputs("  -G <string>         Provide the name of a FASTA-format file that contains the");
7729 	SUBREADputs("                      reference sequences used in read mapping that produced the");
7730 	SUBREADputs("                      provided SAM/BAM files. This optional argument can be used");
7731 	SUBREADputs("                      with '-J' option to improve read counting for junctions.");
7732 	SUBREADputs("");
7733 
7734 	SUBREADputs("# Parameters specific to paired end reads");
7735 	SUBREADputs("");
7736 	SUBREADputs("  -p                  If specified, libraries are assumed to contain paired-end");
7737 	SUBREADputs("                      reads. For any library that contains paired-end reads, the");
7738 	SUBREADputs("                      'countReadPairs' parameter controls if read pairs or reads");
7739 	SUBREADputs("                      should be counted.");
7740 	SUBREADputs("");
7741 	SUBREADputs("  --countReadPairs    If specified, fragments (or templates) will be counted");
7742 	SUBREADputs("                      instead of reads. This option is only applicable for");
7743 	SUBREADputs("                      paired-end reads. For single-end data, it is ignored.");
7744 	SUBREADputs("");
7745 	SUBREADputs("  -B                  Only count read pairs that have both ends aligned.");
7746 	SUBREADputs("");
7747 	SUBREADputs("  -P                  Check validity of paired-end distance when counting read ");
7748 	SUBREADputs("                      pairs. Use -d and -D to set thresholds.");
7749 	SUBREADputs("");
7750 	SUBREADputs("  -d <int>            Minimum fragment/template length, 50 by default.");
7751 	SUBREADputs("");
7752 	SUBREADputs("  -D <int>            Maximum fragment/template length, 600 by default.");
7753 	SUBREADputs("");
7754 	SUBREADputs("  -C                  Do not count read pairs that have their two ends mapping ");
7755 	SUBREADputs("                      to different chromosomes or mapping to same chromosome ");
7756 	SUBREADputs("                      but on different strands.");
7757 	SUBREADputs("");
7758 	SUBREADputs("  --donotsort         Do not sort reads in BAM/SAM input. Note that reads from ");
7759 	SUBREADputs("                      the same pair are required to be located next to each ");
7760 	SUBREADputs("                      other in the input.");
7761 	SUBREADputs("");
7762 
7763 	SUBREADputs("# Number of CPU threads");
7764 	SUBREADputs("");
7765 	SUBREADputs("  -T <int>            Number of the threads. 1 by default.");
7766 	SUBREADputs("");
7767 
7768 	SUBREADputs("# Read groups");
7769 	SUBREADputs("");
7770 	SUBREADputs("  --byReadGroup       Assign reads by read group. \"RG\" tag is required to be");
7771 	SUBREADputs("                      present in the input BAM/SAM files.");
7772 	SUBREADputs("                      ");
7773 	SUBREADputs("");
7774 
7775 	SUBREADputs("# Long reads");
7776 	SUBREADputs("");
7777 	SUBREADputs("  -L                  Count long reads such as Nanopore and PacBio reads. Long");
7778 	SUBREADputs("                      read counting can only run in one thread and only reads");
7779 	SUBREADputs("                      (not read-pairs) can be counted. There is no limitation on");
7780 	SUBREADputs("                      the number of 'M' operations allowed in a CIGAR string in");
7781 	SUBREADputs("                      long read counting.");
7782 	SUBREADputs("");
7783 
7784 	SUBREADputs("# Assignment results for each read");
7785 	SUBREADputs("");
7786 	SUBREADputs("  -R <format>         Output detailed assignment results for each read or read-");
7787 	SUBREADputs("                      pair. Results are saved to a file that is in one of the");
7788 	SUBREADputs("                      following formats: CORE, SAM and BAM. See Users Guide for");
7789 	SUBREADputs("                      more info about these formats.");
7790 	SUBREADputs("");
7791 	SUBREADputs("  --Rpath <string>    Specify a directory to save the detailed assignment");
7792 	SUBREADputs("                      results. If unspecified, the directory where counting");
7793 	SUBREADputs("                      results are saved is used.");
7794 	SUBREADputs("");
7795 
7796 	SUBREADputs("# Miscellaneous");
7797 	SUBREADputs("");
7798 	SUBREADputs("  --tmpDir <string>   Directory under which intermediate files are saved (later");
7799 	SUBREADputs("                      removed). By default, intermediate files will be saved to");
7800 	SUBREADputs("                      the directory specified in '-o' argument.");
7801 	SUBREADputs("");
7802 	SUBREADputs("  --maxMOp <int>      Maximum number of 'M' operations allowed in a CIGAR");
7803 	SUBREADputs("                      string. 10 by default. Both 'X' and '=' are treated as 'M'");
7804 	SUBREADputs("                      and adjacent 'M' operations are merged in the CIGAR");
7805 	SUBREADputs("                      string.");
7806 	SUBREADputs("");
7807 	SUBREADputs("  --verbose           Output verbose information for debugging, such as un-");
7808 	SUBREADputs("                      matched chromosome/contig names.");
7809 	SUBREADputs("");
7810 	SUBREADputs("  -v                  Output version of the program.");
7811 	SUBREADputs("");
7812 
7813 }
7814 
junckey_sort_compare(void * inptr,int i,int j)7815 int junckey_sort_compare(void * inptr, int i, int j){
7816 	char ** inp = (char **) inptr;
7817 	int x1;
7818 
7819 	int chrI=-1, chrJ=-1;
7820 
7821 	if(atoi(inp[i])>0) chrI = atoi(inp[i]);
7822 	if(atoi(inp[j])>0) chrJ = atoi(inp[j]);
7823 
7824 	if(inp[i][0]=='X' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 90;
7825 	if(inp[i][0]=='Y' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 91;
7826 	if(inp[i][0]=='M' && !isdigit(inp[i][1])&& !isalpha(inp[i][1])) chrI = 99;
7827 	if(inp[j][0]=='X' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 90;
7828 	if(inp[j][0]=='Y' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 91;
7829 	if(inp[j][0]=='M' && !isdigit(inp[j][1])&& !isalpha(inp[j][1])) chrJ = 99;
7830 
7831 
7832 
7833 	if(memcmp(inp[i], "chr", 3)==0){
7834 		chrI=atoi(inp[i]+3);
7835 		if(0 == chrI && inp[i][3] == 'X') chrI = 90;
7836 		if(0 == chrI && inp[i][3] == 'Y') chrI = 91;
7837 		if(0 == chrI && inp[i][3] == 'M') chrI = 99;
7838 	}
7839 	if(memcmp(inp[j], "chr", 3)==0){
7840 		chrJ=atoi(inp[j]+3);
7841 		if(0 == chrJ && inp[j][3] == 'X') chrJ = 90;
7842 		if(0 == chrJ && inp[j][3] == 'Y') chrJ = 91;
7843 		if(0 == chrJ && inp[j][3] == 'M') chrJ = 99;
7844 	}
7845 
7846 	int len_I_long = 9;
7847 	for(x1 = 0 ; x1 < FEATURE_NAME_LENGTH + 15 ; x1++){
7848 		int c1 = inp[i][x1];
7849 		int c2 = inp[j][x1];
7850 		if(c1 == '\t' && c2 != '\t')
7851 			len_I_long = -1;
7852 		else if(c1 != '\t' && c2 == '\t')
7853 			len_I_long = 1;
7854 		else if(c1 == '\t' && c2 == '\t')
7855 			len_I_long = 0;
7856 
7857 		if(len_I_long != 9) break;
7858 	}
7859 
7860 	if(chrI != chrJ || len_I_long != 0){
7861 		return (chrI * 100 + len_I_long) - (chrJ * 100);
7862 	}
7863 
7864 	for(x1 = 0 ; x1 < FEATURE_NAME_LENGTH + 15 ; x1++){
7865 		int c1 = inp[i][x1];
7866 		int c2 = inp[j][x1];
7867 		if(c1 != c2){
7868 			return c1 - c2;
7869 		}else if(c1 == '\t' && c1 == c2){
7870 			int pos1 = atoi(inp[i]+x1+1);
7871 			int pos2 = atoi(inp[j]+x1+1);
7872 			if( pos1 == pos2)
7873 				return strcmp(inp[i], inp[j]);
7874 			else
7875 				return pos1 - pos2;
7876 		}
7877 
7878 		if(c1 == 0 || c2 == 0)return c1 - c2;
7879 	}
7880 	return 0;
7881 }
7882 
/*
 * Exchange callback for sorting junction keys: swaps the key pointers held at
 * positions i and j of the key array passed as inptr (a char **). Only the
 * pointers move; the key strings themselves are untouched.
 */
void junckey_sort_exchange(void * inptr, int i, int j){
	char ** keys = (char **) inptr;
	char * held_i = keys[i];

	keys[i] = keys[j];
	keys[j] = held_i;
}
7890 
/*
 * Merge callback for sorting junction keys.
 *
 * inptr is the key array (char **). The two adjacent runs
 * [start, start+items1) and [start+items1, start+items1+items2) are merged
 * into a temporary array using junckey_sort_compare and then copied back in
 * place. When the two candidates compare as equal (or the first one is
 * smaller) the key from the first run is taken.
 */
void junckey_sort_merge(void * inptr, int start, int items1, int items2){
	char ** keys = (char **) inptr;
	int first_end = start + items1;
	int second_end = start + items1 + items2;
	int first_cur = start, second_cur = first_end, filled = 0;
	char ** merged = malloc(sizeof(char *) * (items1+items2));

	while(first_cur != first_end || second_cur != second_end){
		int take_second;

		if(first_cur == first_end)
			take_second = 1;	// the first run is used up
		else if(second_cur < second_end)
			take_second = junckey_sort_compare(inptr, first_cur, second_cur) > 0;
		else
			take_second = 0;	// the second run is used up

		if(take_second)
			merged[filled++] = keys[second_cur++];
		else
			merged[filled++] = keys[first_cur++];
	}

	memcpy(keys + start, merged, sizeof(char *)*(items1+items2));
	free(merged);
}
7908 
/*
 * Tell whether two junction-gene records refer to the same gene.
 * Returns 0 when both gene_name strings are equal and 1 otherwise; this is an
 * equality test only and gives no ordering.
 */
int junccmp(fc_junction_gene_t * j1, fc_junction_gene_t * j2){
	int names_differ = strcmp( j1 -> gene_name, j2 -> gene_name ) != 0;
	return names_differ ? 1 : 0;
}
7914 
7915 
/*
 * Write the junction counting table '<output_file_name>.jcounts'.
 *
 * The junction keys of all the per-file junction tables are merged, sorted
 * with junckey_sort_compare and written one per row. A row holds the primary
 * gene, the remaining ("secondary") genes, the two junction sites (chromosome,
 * location, strand) and the count of the junction in every input file.
 *
 * global_context: read for use_stdin_file and fasta_contigs and passed on to
 *                 locate_junc_features; is_junction_no_chro_shown is set once
 *                 the missing-contig warning has been printed.
 * output_file_name: name of the main output; ".jcounts" is appended to it.
 * column_names: one name per input file; its length also decides how many
 *               entries of the two table lists are read.
 * junction_global_table_list: one HashTable per input file, mapping a junction
 *               key to its count (stored as a pointer offset from NULL).
 * splicing_global_table_list: one HashTable per input file, mapping a
 *               "chr<TAB>pos" site to its support (same encoding).
 *
 * Nothing is returned. When the output file cannot be created an error is
 * printed and nothing is written. When the write of a row terminator fails the
 * output file is removed and an error is printed.
 */
void fc_write_final_junctions(fc_thread_global_context_t * global_context,  char * output_file_name, ArrayList * column_names, ArrayList * junction_global_table_list, ArrayList * splicing_global_table_list){
	int infile_i, disk_is_full = 0;

	// Both merged tables borrow their key strings from the per-file tables,
	// hence no deallocation functions are installed.
	HashTable * merged_junction_table = HashTableCreate(156679);

	HashTableSetHashFunction(merged_junction_table,HashTableStringHashFunction);
	HashTableSetDeallocationFunctions(merged_junction_table, NULL, NULL);
	HashTableSetKeyComparisonFunction(merged_junction_table, fc_strcmp_chro);

	HashTable * merged_splicing_table = HashTableCreate(156679);

	HashTableSetHashFunction(merged_splicing_table,HashTableStringHashFunction);
	HashTableSetDeallocationFunctions(merged_splicing_table, NULL, NULL);
	HashTableSetKeyComparisonFunction(merged_splicing_table, fc_strcmp_chro);


	// Add up the support of every splicing site over all the input files.
	for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
		KeyValuePair * cursor;
		int bucket;
		HashTable * spl_table = ArrayListGet(splicing_global_table_list, infile_i);
		for(bucket=0; bucket < spl_table -> numOfBuckets; bucket++)
		{
			cursor = spl_table -> bucketArray[bucket];
			while (cursor)
			{
				char * ky = (char *)cursor -> key;
				unsigned int old_supp = HashTableGet(merged_splicing_table, ky) - NULL;
				old_supp += (cursor -> value - NULL);
				HashTablePut(merged_splicing_table, ky, NULL+old_supp);
				cursor = cursor -> next;
			}
		}
	}

	// Collect the set of distinct junction keys; the stored value is only a marker.
	for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
		KeyValuePair * cursor;
		int bucket;
		HashTable *  junc_table = ArrayListGet(junction_global_table_list, infile_i);
		for(bucket=0; bucket < junc_table -> numOfBuckets; bucket++)
		{
			cursor = junc_table -> bucketArray[bucket];
			while (cursor)
			{
				char * ky = (char *)cursor -> key;

				if(HashTableGet(merged_junction_table, ky)==NULL)
					HashTablePut(merged_junction_table, ky, NULL+1);
				cursor = cursor -> next;
			}
		}
	}

	// Flatten the distinct keys into an array and sort it.
	char ** key_list;
	key_list = malloc(sizeof(char *) * merged_junction_table -> numOfElements);

	KeyValuePair * cursor;
	int bucket, ky_i = 0;
	for(bucket=0; bucket < merged_junction_table -> numOfBuckets; bucket++){
		cursor = merged_junction_table -> bucketArray[bucket];
		while (cursor){
			char * ky = (char *)cursor -> key;

			key_list[ky_i ++] = ky;
			cursor = cursor -> next;
		}
	}

	merge_sort(key_list,  merged_junction_table -> numOfElements , junckey_sort_compare, junckey_sort_exchange, junckey_sort_merge);

	// Bounded formatting: an over-long output name is truncated instead of
	// overrunning the buffer.
	char outfname[MAX_FILE_NAME_LENGTH];
	snprintf(outfname, sizeof(outfname), "%s.jcounts", output_file_name);

	int max_junction_genes = 3000;
	char * gene_names = malloc(max_junction_genes * FEATURE_NAME_LENGTH), * gene_name_tail;
	fc_junction_gene_t ** ret_juncs_small = malloc(sizeof(fc_junction_gene_t *) * max_junction_genes);
	fc_junction_gene_t ** ret_juncs_large = malloc(sizeof(fc_junction_gene_t *) * max_junction_genes);
	fc_junction_gene_t ** junction_key_list = malloc(sizeof(fc_junction_gene_t *)* max_junction_genes * 2);
	unsigned int * junction_support_list = malloc(sizeof(int)* max_junction_genes * 2);
	unsigned char * junction_source_list = malloc(sizeof(char)* max_junction_genes * 2 );

	int ky_i1, ky_i2;
	char * tmpp = NULL;
	FILE * ofp = fopen(outfname, "w");
	if(ofp == NULL){
		// Without this check every fprintf below would be handed a NULL stream.
		SUBREADprintf("ERROR: unable to create the junction counting table '%s'.\n", outfname);
		goto cleanup;
	}

	fprintf(ofp, "PrimaryGene\tSecondaryGenes\tSite1_chr\tSite1_location\tSite1_strand\tSite2_chr\tSite2_location\tSite2_strand");

	for(infile_i=0; infile_i < column_names -> numOfElements; infile_i++)
	{
		char * next_fn = ArrayListGet(column_names, infile_i);
		fprintf(ofp,"\t%s", global_context -> use_stdin_file?"STDIN":next_fn);
	}
	fprintf(ofp, "\n");

	for(ky_i = 0; ky_i < merged_junction_table -> numOfElements ; ky_i ++){

		//SUBREADprintf("KY=%s\n", key_list[ky_i]);

		// The key is split in place on tabs into chr / pos / chr / pos. The
		// tabs are put back further down because the very same string is the
		// lookup key of the per-file tables.
		int unique_junctions = 0;
		char * chro_small = strtok_r( key_list[ky_i] , "\t", &tmpp);
		char * pos_small_str = strtok_r( NULL, "\t", &tmpp);
		char * chro_large = strtok_r( NULL, "\t", &tmpp);
		char * pos_large_str = strtok_r( NULL, "\t", &tmpp);

		unsigned int pos_small = atoi(pos_small_str);
		unsigned int pos_large = atoi(pos_large_str);

		int found_features_small = locate_junc_features(global_context, chro_small, pos_small, ret_juncs_small , max_junction_genes);
		int found_features_large = locate_junc_features(global_context, chro_large, pos_large, ret_juncs_large , max_junction_genes);

		// The strand is inferred from the two bases fetched at each site:
		// GT..AG gives "+", CT..AC gives "-", anything else stays "NA".
		char * strand = "NA";
		if(global_context -> fasta_contigs){
			char donor[3], receptor[3];
			donor[2]=receptor[2]=0;
			int has = !get_contig_fasta(global_context -> fasta_contigs, chro_small, pos_small, 2, donor);
			has = has && !get_contig_fasta(global_context -> fasta_contigs, chro_large, pos_large-3, 2, receptor);
			if(has){
				if(donor[0]=='G' && donor[1]=='T' && receptor[0]=='A' && receptor[1]=='G') strand = "+";
				else if(donor[0]=='C' && donor[1]=='T' && receptor[0]=='A' && receptor[1]=='C') strand = "-";
			}else if(!global_context ->is_junction_no_chro_shown){
				// Shown once only; the flag lives in the global context.
				global_context ->is_junction_no_chro_shown = 1;
				print_in_box(80,0,0, "   WARNING contig '%s' is not found in the", chro_small);
				print_in_box(80,0,0, "   provided genome file.");
				print_in_box(80,0,0,"");

			}
		}

		//SUBREADprintf("FOUND=%d, %d\n", found_features_small, found_features_large);

		gene_name_tail = gene_names;
		gene_names[0]=0;

		// rules to choose the primary gene:
		// (1) if some genes have one support but the other have multiple supporting reads: remove the lowly supported genes
		// (2) if all genes have only one support but from different ends of the fragment, then remove the genes that are assigned to the end having lower supporting fragments
		// (3) choose the gene that have the smallest coordinate.

		// De-duplicate the genes found at both sites by gene name. For every
		// distinct gene keep how many times it was found (support) and at
		// which site(s): bit 1 = small site, bit 2 = large site.
		int max_supp = 0;
		for(ky_i1 = 0; ky_i1 < found_features_small + found_features_large; ky_i1++){
			int is_duplicate = 0;
			fc_junction_gene_t * tested_key = (ky_i1 < found_features_small)?ret_juncs_small[ky_i1] :ret_juncs_large[ky_i1 - found_features_small];
			for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
				if(junccmp( tested_key, junction_key_list[ky_i2]  )==0){
					junction_support_list[ ky_i2 ] ++;
					junction_source_list[ky_i2] |= ( (ky_i1 < found_features_small)? 1 : 2 );
					is_duplicate = 1;

					max_supp = max(junction_support_list[ky_i2], max_supp);
					break;
				}
			}

			if(!is_duplicate){
				junction_key_list[unique_junctions] = tested_key;
				junction_support_list[unique_junctions] = 1;
				junction_source_list[unique_junctions] = ( (ky_i1 < found_features_small)? 1 : 2 );
				max_supp = max(junction_support_list[unique_junctions], max_supp);
				unique_junctions++;
			}
		}

		// Rule (2): no gene was found more than once and both sites returned genes.
		if(1 == max_supp){
			if(found_features_small > 0 && found_features_large > 0){
				char junc_key [FEATURE_NAME_LENGTH + 15];
				snprintf(junc_key, sizeof(junc_key), "%s\t%u", chro_small, pos_small);
				unsigned int supp_small = HashTableGet(merged_splicing_table, junc_key) - NULL;
				snprintf(junc_key, sizeof(junc_key), "%s\t%u", chro_large, pos_large);
				unsigned int supp_large = HashTableGet(merged_splicing_table, junc_key) - NULL;

				// NOTE(review): the genes dropped here are the ones found only
				// at the site with the HIGHER support, which looks like the
				// opposite of rule (2) as worded above -- confirm the intent.
				if(supp_small !=supp_large){
					for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
						if(supp_small > supp_large && junction_source_list[ky_i2] == 1) junction_key_list[ky_i2] = NULL;
						else if(supp_small < supp_large && junction_source_list[ky_i2] == 2) junction_key_list[ky_i2] = NULL;
					}
				}
			}
		}

		// Rules (1) and (3): among the remaining genes with the highest
		// support, take the one with the smallest pos_first_base.
		int smallest_coordinate_gene = 0x7fffffff;
		fc_junction_gene_t * primary_gene = NULL;

		for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
			fc_junction_gene_t * tested_key = junction_key_list[ky_i2];
			if(tested_key != NULL && junction_support_list[ky_i2] == max_supp && tested_key -> pos_first_base < smallest_coordinate_gene){
				primary_gene = tested_key;
				smallest_coordinate_gene = tested_key -> pos_first_base;
			}
		}

		if(primary_gene == NULL){
			strcpy(gene_names, "NA");
		}else{
			strcpy(gene_names, primary_gene -> gene_name);
		}

		// Put back the tab in front of each position, so that chro_small and
		// chro_large now read "chr<TAB>pos" and each fills two columns below.
		*(pos_small_str-1)='\t';
		*(pos_large_str-1)='\t';

		fprintf(ofp, "%s", gene_names);

		// Secondary genes: comma-separated list of every remaining gene other
		// than the primary one, or "NA" when there is none.
		gene_name_tail = gene_names;
		gene_names[0]=0;
		for(ky_i2 = 0; ky_i2 < unique_junctions; ky_i2 ++){
			fc_junction_gene_t * tested_key = junction_key_list[ky_i2];
			if(tested_key && tested_key != primary_gene)
				gene_name_tail += sprintf(gene_name_tail, "%s,", tested_key -> gene_name);
		}
		if( gene_names[0] ) gene_name_tail[-1]=0;	// drop the trailing comma
		else strcpy(gene_names, "NA");
		fprintf(ofp, "\t%s", gene_names);

		fprintf(ofp, "\t%s\t%s\t%s\t%s", chro_small, strand, chro_large, strand);

		// Last tab restored: key_list[ky_i] is the complete key again and can
		// be used for the lookups below.
		chro_large[-1]='\t';

		for(infile_i = 0 ; infile_i < column_names -> numOfElements ; infile_i ++){
			HashTable * junc_table = ArrayListGet(junction_global_table_list, infile_i);
			srInt_64 count = HashTableGet(junc_table, key_list[ky_i]) - NULL;
			#ifdef __MINGW32__
			fprintf(ofp,"\t%I64d", count);
			#else
			fprintf(ofp,"\t%lld", count);
			#endif
		}
		// A failed write of the row terminator is taken as "disk is full".
		int wlen = fprintf(ofp, "\n");
		if(wlen < 1) disk_is_full = 1;
	}
	fclose(ofp);

cleanup:
	free(junction_key_list);
	free(gene_names);
	free(ret_juncs_small);
	free(ret_juncs_large);
	free(junction_support_list);
	free(key_list);
	free(junction_source_list);

	//print_in_box(80,0,PRINT_BOX_CENTER,"Found %llu junctions in all the input files.", merged_junction_table -> numOfElements);
	//print_in_box(80,0,0,"");

	HashTableDestroy(merged_junction_table);
	HashTableDestroy(merged_splicing_table);
	if(disk_is_full){
		unlink(outfname);
		SUBREADprintf("ERROR: disk is full; no junction counting table is generated.\n");
	}
}
8162 
scRNA_copy_loaded_features(srInt_64 nexons,fc_feature_info_t * loaded_features)8163 HashTable * scRNA_copy_loaded_features(srInt_64 nexons, fc_feature_info_t* loaded_features){
8164 	HashTable * ret = HashTableCreate(50000);
8165 	srInt_64 x1;
8166 	for(x1 =0; x1<nexons; x1++)
8167 		HashTablePut(ret , NULL +1 +loaded_features[x1].sorted_order, NULL +1 +x1);
8168 	return ret;
8169 }
8170 
8171 int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junc_glob_tab, HashTable * splicing_glob_tab, HashTable * merged_RG_table, fc_feature_info_t * loaded_features);
8172 
Input_Files_And_Strand_Mode_Pair(char * fnames,char * smodes)8173 int Input_Files_And_Strand_Mode_Pair(char * fnames, char * smodes){
8174 	int ret = 0, ch, bad_fmt = 0, numbs = 0;
8175 	//SUBREADputs(fnames);
8176 	//SUBREADputs(smodes);
8177 	if(strstr(smodes, ".")==NULL){
8178 		bad_fmt = smodes[0]<'0' || smodes[0]>'2';
8179 	}else{
8180 		while('\0'!=(ch=*(fnames++)))if(ch == FC_FLIST_SPLITOR[0])ret++;
8181 		while('\0'!=(ch=*(smodes++))){
8182 			if(ch == '.'){
8183 				if(numbs != 1) bad_fmt = 1;
8184 				numbs = 0;
8185 				ret--;
8186 			}else if(ch >= '0' && ch <= '2') numbs++;
8187 		}
8188 		if(numbs != 1) bad_fmt = 1;
8189 	}
8190 	if(bad_fmt) SUBREADputs("Error: The strand mode list has a wrong format.");
8191 	if(ret) SUBREADputs("Error: The length of strand mode list differs from the length of input file list");
8192 	ret |= bad_fmt;
8193 	return ret;
8194 }
8195 
readSummary(int argc,char * argv[])8196 int readSummary(int argc,char *argv[]){
8197 
8198 	/*
8199 	   This function counts the number of reads falling into each exon region.
8200 	   The order of exons in the output is the same as that of exons included in the annotation.
8201 	   The annotation, if provided as a file, should be sorted by chromosome name.
8202 
8203 	   Parameters passed from the featureCounts R function:
8204 	0: "readSummary"
8205 	1: ann
8206 	2: files[i]
8207 	3: fout
8208 	4: as.numeric(isPairedEnd)
8209 	5: min.distance
8210 	6: max.distance
8211 	7: as.numeric(tolower(file.type)=="sam")
8212 	8: as.numeric(allowMultiOverlap)
8213 	9: as.numeric(isGeneLevel)
8214 	10: as.numeric(nthreads)
8215 	11: as.numeric(isGTFannotation)
8216 	12: isStrandChecked
8217 	13: as.numeric(isReadSummaryReported)
8218 	14: as.numeric(isBothEndMapped)
8219 	15: as.numeric(isChimericDisallowed)
8220 	16: as.numeric(isPEDistChecked)
8221 	17: nameFeatureTypeColumn
8222 	18: nameGeneIDColumn
8223 	19: min.MappingQualityScore
	20: as.numeric(isMultiMappingAllowed) # "1" : NH tag > 1 is allowed  ; "0" : not allowed (by default)
8225 	21: Annotation Chromosome Alias Name File. If the file is not specified, set this value to NULL or a zero-length string.
8226 	22: Command line for CfeatureCounts header output; RfeatureCounts should set this value to NULL or a zero-length string or a space (' ').
8227 	23: as.numeric(isInputFileResortNeeded)
8228 	24: NOT IN USE: as.numeric(feature_block_size) # This parameter is no longer used. Give "14" for safe.
8229 	25: as.numeric(Five_End_Extension_Length)  # 5' end extension
8230 	26: as.numeric(Three_End_Extension_Length)  # 3' end extension
8231 	27: as.numeric(Minimum_Overlap_Between_Read_And_Feature) # 1 by default
8232 	28: as.numeric(is_Split_or_Exonic_Only) # 0 by default; 0: all reads are counted ; 1: only split (Cigar has "N") reads are counted ; 2: only exonic (no "N" in Cigar) are counted.
8233 	29: as.numeric(reduce_5_3_ends_to_one) # 0= no reduction; 1= reduce to 5' end; 2= reduce to 3' end
8234 	30: debug_command # This is for debug only; RfeatureCounts should pass a space (" ") to this parameter, disabling the debug command.
8235 	31: as.numeric(is_duplicate_ignored) # 0 = INCLUDE DUPLICATE READS; 1 = IGNORE DUPLICATE READS (0x400 FLAG IS SET) ; "0" by default.
8236 	32: as.numeric(do_not_sort)   # 1 = NEVER SORT THE PE BAM/SAM FILES; 0 = SORT THE BAM/SAM FILE IF IT IS FOUND NOT SORTED.
	33: as.numeric(fractionMultiMapping) # 1 = calculate fraction numbers if a read overlaps with multiple features or meta-features. "-M" must be specified when fractions are calculated.
8238 	34: as.numeric(useOverlappingBreakTie) # 1 = Select features or meta-features with a longer overlapping length; 0 = just use read-voting strategy: one overlapping read = 1 vote
8239 	35: Pair_Orientations # FF, FR, RF or RR. This parameter matters only if "-s" option is 1 or 2.
	36: as.numeric(doJunctionCounting)  # 1 = count the number of junction reads spanning each exon-exon pair;  0 = do not.
8241 	37: file name of genome fasta (for determine the strandness of junctions by looking for GT/AG or CT/AC).
8242 	38: as.numeric(max_M_Ops) # maximum "M" sections allowed in the CIGAR string. This parameter is needed in parse_BIN()
8243 	39: as.numeric(is_Restrictly_No_Overlapping) # when "1", disable the voting-based tie breaking (e.g., when the reads are paired-end and one gene receives two votes but the other gene only has one.). "0" by default.
	40: as.numeric(min_Fractional_Overlap) # A fractional number.  0.00 : at least 1 bp overlapping
8245 	41: temp_directory # the directory to put temp files. "<use output directory>" by default, namely find it from the output file dir.
8246 	42: as.numeric(use_stdin_stdout) # only for CfeatureCounts. When use_stdin_stdout & 0x01 > 0, the input file is from stdin (stored in a temporary file); when use_stdin_stdout & 0x02 > 0, the output should be written to STDOUT instead of a file.
	43: as.numeric(assign_reads_to_RG) # 1: reads with "RG" tags will be assigned to read groups; 0: default setting
8248 	44: as.numeric(long_read_minimum_length) # "1": treat the input BAM or SAM files as containing long reads. No multi-threading. "0": classic behaviour.
	45: as.numeric(is_verbose) # 1: show the mismatched chromosome names on screen; 0: don't do so
8250 	46: as.numeric(frac_feature_overlap) # fraction of the feature to be overlapped with a read
8251 	47: as.numeric(do_detection_call) # do detectionCalls : put the GC fraction into the 2nd column.
	48: as.numeric(max_missing_bases_in_read) # maximum # of bases in a read or fragment not overlapping with an exon ; default value: "-1" means no limit
8253 	49: as.numeric(max_missing_bases_in_feature) # maximum # of bases in an exon not overlapping with a read or fragment ; default value: "-1" means no limit
	50: as.numeric(is_Primary_Alignment_only) # "1" : only count the primary alignment (FLAG doesn't have 0x100 bit); "0" : count alignments no matter the 0x100 bit (by default)
8255 	51: Rpath : the path where the assignment details per read are stored.
	52: AdditionalColumnList: the names of additional columns written after "Length". Comma delimited.
8257 	53: annotation_file_screen_output : just for displaying the annotation file name or inbuilt (mm10/hg39/...) or R data.frame.
8258 	54: read_shift_type : how to shift reads? "upstream" : to the 5' end; "downstream" : to the 3' end; "left" : to the smaller coordinates in chromosome ; "right" : to the larger coordinates in chromosome.
	55: as.numeric(read_shift_size) : how many bases to shift. Must be a positive number or zero.
8260 	 */
8261 
8262 	int isCVersion, isChimericDisallowed, isPEDistChecked, minMappingQualityScore=0, isInputFileResortNeeded, feature_block_size = 20, reduce_5_3_ends_to_one, useStdinFile, assignReadsToRG, long_read_minimum_length, is_verbose, do_detectionCall, max_missing_bases_in_feature, max_missing_bases_in_read, is_Primary_Alignment_only, read_shift_size, read_shift_type, scRNA_input_mode;
8263 	float fracOverlap, fracOverlapFeature, umi_cutoff;
8264 	char **chr;
8265 	srInt_64 *start, *stop;
8266 	int *geneid;
8267 
8268 	char *nameFeatureTypeColumn, *nameGeneIDColumn,*debug_command, *pair_orientations="fr", *temp_dir, *file_name_ptr =NULL, *strand_check_mode = NULL, *extra_column_names = NULL, *scRNA_sample_sheet = NULL, *scRNA_cell_barcode_list = NULL ;
8269 	srInt_64 nexons;
8270 
8271 
8272 	srInt_64 * anno_chr_head, * block_min_start, *block_max_end, *block_end_index;
8273 	char ** anno_chrs, * anno_chr_2ch;
8274 	char * fasta_contigs_fname, *annotation_file_screen_output;
8275 	unsigned char * sorted_strand;
8276 
8277 	int minPEDistance, maxPEDistance, isReadSummaryReport, isBothEndRequired, isMultiMappingAllowed, fiveEndExtension, threeEndExtension, minFragmentOverlap, isSplitOrExonicOnly, is_duplicate_ignored, doNotSort, fractionMultiMapping, useOverlappingBreakTie, doJuncCounting, max_M, isRestrictlyNoOvelrapping ,is_scRNA_BAM_FQ_out_generated, scRNA_rerun_on_persample_BAM;
8278 	char * isPEassign,  *is_paired_end_reads_expected;
8279 
8280 	int  isGTF, n_input_files=0;
8281 	char * alias_file_name = NULL, * cmd_rebuilt = NULL, * Rpath = NULL;
8282 
8283 	int isMultiOverlapAllowed, isGeneLevel;
8284 
8285 	isCVersion = ((argv[0][0]=='C')?1:0);
8286 
8287 	isPEassign = argv[4];
8288 	minPEDistance = atoi(argv[5]);
8289 	maxPEDistance = atoi(argv[6]);
8290 
8291 	//  isSAM = atoi(argv[7]);
8292 	isMultiOverlapAllowed = atoi(argv[8]);
8293 	isGeneLevel = atoi(argv[9]);
8294 	unsigned short thread_number;
8295 	if(argc > 10)
8296 		thread_number = atoi(argv[10]);
8297 	else	thread_number = 4;
8298 	if(argc > 11)
8299 		isGTF = atoi(argv[11]);
8300 	else	isGTF = 0;
8301 	if(argc > 12)
8302 		strand_check_mode = argv[12];
8303 	else	strand_check_mode = NULL;
8304 	if(argc > 13)
8305 		isReadSummaryReport = atoi(argv[13]);
8306 	else	isReadSummaryReport = 0;
8307 	if(argc > 14)
8308 		isBothEndRequired = atoi(argv[14]);
8309 	else	isBothEndRequired = 0;
8310 	if(argc > 15)
8311 		isChimericDisallowed = atoi(argv[15]);
8312 	else	isChimericDisallowed = 0;
8313 	if(argc > 16)
8314 		isPEDistChecked = atoi(argv[16]);
8315 	else	isPEDistChecked = 0;
8316 
8317 
8318 	if(isPEDistChecked && 0==isBothEndRequired){
8319 		#ifdef MAKE_STANDALONE
8320 		SUBREADprintf("ERROR: when the '-P' option is specified for checking fragment lengths, the '-B' option must also be specified to require both ends mapped.\n");
8321 		#else
8322 		SUBREADprintf("ERROR: when parameter checkFragLength is set to TRUE, parameter requireBothEndMapped also needs to be set to TRUE.\n");
8323 		#endif
8324 		return -1;
8325 	}
8326 
8327 	if(argc > 17)
8328 		nameFeatureTypeColumn = argv[17];
8329 	else	nameFeatureTypeColumn = "exon";
8330 	if(argc > 18)
8331 		nameGeneIDColumn = argv[18];
8332 	else	nameGeneIDColumn = "gene_id";
8333 	if(argc > 19)
8334 		minMappingQualityScore = atoi(argv[19]);
8335 	else	minMappingQualityScore = 0;
8336 	if(argc > 20)
8337 		isMultiMappingAllowed = atoi(argv[20]);
8338 	else	isMultiMappingAllowed = 0;
8339 	if(argc > 21)
8340 	{
8341 		alias_file_name = argv[21];
8342 		if(alias_file_name == NULL || alias_file_name[0]==' ' || alias_file_name[0]==0)
8343 			alias_file_name = NULL;
8344 	}
8345 	else	alias_file_name = NULL;
8346 	if(argc > 22)
8347 	{
8348 		cmd_rebuilt = argv[22];
8349 		if(cmd_rebuilt == NULL || cmd_rebuilt[0]==' '||cmd_rebuilt[0]==0)
8350 			cmd_rebuilt=NULL;
8351 	}
8352 	else	cmd_rebuilt = NULL;
8353 	if(argc>23)
8354 		isInputFileResortNeeded = atoi(argv[23]);
8355 	else	isInputFileResortNeeded = 0;
8356 	if(thread_number<1) thread_number=1;
8357 	if(thread_number>FC_MAX_THREADS)thread_number=FC_MAX_THREADS;
8358 
8359 	int Param_fiveEndExtension, Param_threeEndExtension;
8360 	if(argc>25)
8361 		Param_fiveEndExtension = atoi(argv[25]);
8362 	else    Param_fiveEndExtension = 0;
8363 
8364 	if(argc>26)
8365 		Param_threeEndExtension = atoi(argv[26]);
8366 	else    Param_threeEndExtension = 0;
8367 
8368 	if(argc>27)
8369 		minFragmentOverlap = atoi(argv[27]);
8370 	else    minFragmentOverlap = 1;
8371 
8372 	if(minFragmentOverlap <1){
8373 		fiveEndExtension = 1 - minFragmentOverlap;
8374 		threeEndExtension = 1 - minFragmentOverlap;
8375 		minFragmentOverlap = 1;
8376 	}else{
8377 		fiveEndExtension = Param_fiveEndExtension;
8378 		threeEndExtension = Param_threeEndExtension;
8379 	}
8380 
8381 	if(argc>28)
8382 		isSplitOrExonicOnly = atoi(argv[28]);
8383 	else	isSplitOrExonicOnly = 0;
8384 
8385 	if(argc>29)
8386 		reduce_5_3_ends_to_one = atoi(argv[29]);	// 0 : no reduce; 1: reduce to 5' end; 2: reduce to 3' end.
8387 	else	reduce_5_3_ends_to_one = 0;
8388 
8389 
8390 	if(argc>30 && strlen(argv[30])>0 && argv[30][0]!=' ')
8391 		debug_command = argv[30];
8392 	else
8393 		debug_command = " ";
8394 
8395 	if(argc>31)
8396 		is_duplicate_ignored = atoi(argv[31]);
8397 	else
8398 		is_duplicate_ignored = 0;
8399 
8400 	if(argc>32)
8401 		doNotSort = atoi(argv[32]);
8402 	else
8403 		doNotSort = 0;
8404 
8405 	if(argc>33)
8406 		fractionMultiMapping = atoi(argv[33]);
8407 	else
8408 		fractionMultiMapping = 0;
8409 
8410 	if(argc>34)
8411 		useOverlappingBreakTie = atoi(argv[34]);
8412 	else	useOverlappingBreakTie = 0;
8413 
8414 
	/*if(argc>35) "-S" is deprecated.
8416 		pair_orientations = argv[35];
8417 	else	pair_orientations = "FR";
8418 	*/
8419 
8420 	if(argc>36)
8421 		doJuncCounting = atoi(argv[36]);
8422 	else	doJuncCounting = 0;
8423 
8424 	fasta_contigs_fname = NULL;
8425 	if(argc>37)
8426 		if(argv[37][0] != 0 && argv[37][0]!=' ')
8427 			fasta_contigs_fname = argv[37];
8428 
8429 	if(argc>38)
8430 		max_M = atoi(argv[38]);
8431 	else	max_M = 10;
8432 
8433 	if(argc>39)
8434 		isRestrictlyNoOvelrapping = atoi(argv[39]);
8435 	else	isRestrictlyNoOvelrapping = 0;
8436 
8437 	if(argc>40)
8438 		fracOverlap = atof(argv[40]);
8439 	else	fracOverlap= 0.0;
8440 
8441 	if(argc>41){
8442 		if(strcmp("<use output directory>", argv[41])!=0)temp_dir = argv[41];
8443 		else temp_dir = NULL;
8444 	}
8445 	else	temp_dir = NULL;//	get_temp_dir_from_out(temp_dir, (char *)argv[3]);
8446 
8447 	if(argc>42){
8448 		useStdinFile = (atoi(argv[42]) & 1)!=0;
8449 	}else	useStdinFile = 0;
8450 
8451 	if(argc>43)
8452 		assignReadsToRG = (argv[43][0]=='1');
8453 	else  assignReadsToRG = 0;
8454 
8455 	if(argc>44)
8456 		long_read_minimum_length = atoi(argv[44])?1:1999999999;
8457 	else  long_read_minimum_length = 1999999999;
8458 
8459 	if(long_read_minimum_length < 2 && isPEassign[0]=='1'){
8460 		SUBREADputs("ERROR: long read assignment can only be done on single-end mode");
8461 		return -1;
8462 	}
8463 
8464 	if(argc>45)
8465 		is_verbose = (argv[45][0]=='1');
8466 	else  is_verbose = 0;
8467 
8468 	if(argc>46)
8469 		fracOverlapFeature = atof(argv[46]);
8470 	else	fracOverlapFeature = 0.0;
8471 
8472 	if(argc>47)
8473 		do_detectionCall = (argv[47][0]=='1');
8474 	else  do_detectionCall = 0;
8475 
8476 	if(argc>48) max_missing_bases_in_read = atoi(argv[48]);
8477 	else  max_missing_bases_in_read = -1;
8478 
8479 	if(argc>49) max_missing_bases_in_feature = atoi(argv[49]);
8480 	else  max_missing_bases_in_feature = -1;
8481 
8482 	if(argc>50) is_Primary_Alignment_only = atoi(argv[50]);
8483 	else is_Primary_Alignment_only = 0;
8484 
8485 	if(argc>51 && argv[51]!=NULL && argv[51][0]!=0 && argv[51][0]!=' ') Rpath = argv[51];
8486 	else Rpath = NULL;
8487 
8488 	if(argc>52 && argv[52]!=NULL && argv[52][0]!=0 && argv[52][0]!=' ') extra_column_names = argv[52];
8489 	else extra_column_names = NULL;
8490 
8491 	annotation_file_screen_output = NULL;
8492 #ifndef MAKE_STANDALONE
8493 	if(argc>53) annotation_file_screen_output = argv[53];
8494 #endif
8495 
8496 	if(argc>54){
8497 		read_shift_type = -1;
8498 		if(strcmp(argv[54], "upstream")==0)read_shift_type = READ_SHIFT_UPSTREAM;
8499 		if(strcmp(argv[54], "downstream")==0) read_shift_type = READ_SHIFT_DOWNSTREAM;
8500 		if(strcmp(argv[54], "left")==0) read_shift_type = READ_SHIFT_LEFT;
8501 		if(strcmp(argv[54], "right")==0) read_shift_type = READ_SHIFT_RIGHT;
8502 	} else read_shift_type = READ_SHIFT_UPSTREAM;
8503 
8504 	if(argc>55) read_shift_size = atoi(argv[55]);
8505 	else read_shift_size = 0;
8506 
8507 	if(argc>56 && strlen(argv[56])>0 && argv[56][0]!=' ') scRNA_sample_sheet = argv[56];
8508 	else scRNA_sample_sheet = NULL;
8509 
8510 	if(argc>57 && strlen(argv[57])>0 && argv[57][0]!=' ') scRNA_cell_barcode_list = argv[57];
8511 	else scRNA_cell_barcode_list = NULL;
8512 
8513 	if(argc>58 && strlen(argv[58])>0 && argv[58][0]!=' ') is_paired_end_reads_expected = argv[58];
8514 	else is_paired_end_reads_expected = "0";
8515 
8516 	if(argc>59 && strlen(argv[59])>0 && argv[59][0]!=' ') is_scRNA_BAM_FQ_out_generated = atoi(argv[59]);
8517 	else is_scRNA_BAM_FQ_out_generated = 1;
8518 
8519 	if(argc>60) scRNA_input_mode = (argv[60][0]-'0');
8520 	else scRNA_input_mode = GENE_INPUT_BCL;
8521 
8522 	if(argc>61) scRNA_rerun_on_persample_BAM = (argv[61][0]-'0');
8523 	else scRNA_rerun_on_persample_BAM = 0;
8524 
8525 	if(argc>62) umi_cutoff = atof(argv[62]);
8526 	else umi_cutoff = -1;
8527 
8528 	if(read_shift_size<0){
8529 		SUBREADprintf("ERROR: why the value for read_shift_size is negative?\n");
8530 		return -1;
8531 	}
8532 
8533 	if(read_shift_type<0){
8534 		SUBREADprintf("ERROR: why the value for read_shift_type is %s?\n", argv[54]);
8535 		return -1;
8536 	}
8537 
8538 	if(SAM_pairer_warning_file_open_limit()) return -1;
8539 	if(strand_check_mode != NULL && Input_Files_And_Strand_Mode_Pair(argv[2],strand_check_mode)) return -1;
8540 	if(extra_column_names){
8541 		if(!isGTF){
8542 			SUBREADputs("ERROR: only GTF files contain additional attributes");
8543 			return -1;
8544 		}
8545 		int xk1, total_cols =1;
8546 		for(xk1=0; extra_column_names[xk1]; xk1++)
8547 			if(extra_column_names[xk1] == ';' || extra_column_names[xk1]==',' || extra_column_names[xk1]=='\t'){
8548 				extra_column_names[xk1]='\t';
8549 				total_cols ++;
8550 			}
8551 		if(total_cols>MAX_EXTRA_COLS){
8552 			SUBREADprintf("ERROR: there are more than %d additional attributes required\n", MAX_EXTRA_COLS);
8553 			return -1;
8554 		}
8555 	}
8556 
8557 	fc_thread_global_context_t global_context;
8558 
8559 	fc_thread_init_global_context(& global_context, FEATURECOUNTS_BUFFER_SIZE, thread_number, MAX_LINE_LENGTH, minPEDistance, maxPEDistance,isGeneLevel, isMultiOverlapAllowed, strand_check_mode, (char *)argv[3] , isReadSummaryReport, isBothEndRequired, isChimericDisallowed, isPEDistChecked, nameFeatureTypeColumn, nameGeneIDColumn, minMappingQualityScore,isMultiMappingAllowed, 0, alias_file_name, cmd_rebuilt, isInputFileResortNeeded, feature_block_size, isCVersion, fiveEndExtension, threeEndExtension , minFragmentOverlap, isSplitOrExonicOnly, reduce_5_3_ends_to_one, debug_command, is_duplicate_ignored, doNotSort, fractionMultiMapping, useOverlappingBreakTie, pair_orientations, doJuncCounting, max_M, isRestrictlyNoOvelrapping, fracOverlap, temp_dir, useStdinFile, assignReadsToRG, long_read_minimum_length, is_verbose, fracOverlapFeature, do_detectionCall, max_missing_bases_in_read, max_missing_bases_in_feature, is_Primary_Alignment_only, Rpath, extra_column_names, annotation_file_screen_output, read_shift_type, read_shift_size, scRNA_sample_sheet, scRNA_cell_barcode_list, is_scRNA_BAM_FQ_out_generated, scRNA_input_mode, scRNA_rerun_on_persample_BAM, umi_cutoff);
8560 
8561 	fc_thread_init_input_files( & global_context, argv[2], &file_name_ptr );
8562 
8563 	if( print_FC_configuration(&global_context, argv[1], file_name_ptr, argv[3], global_context.is_SAM_file, isGTF, & n_input_files, isReadSummaryReport, is_paired_end_reads_expected, isPEassign) )
8564 		return -1;
8565 	// Loading the annotations.
8566 	// Nothing is done if the annotation does not exist.
8567 	fc_feature_info_t * loaded_features;
8568 	print_in_box(84,0,0,"Load annotation file %s %c[0m...", get_short_fname(argv[1]), CHAR_ESC);
8569 	nexons = load_feature_info(&global_context,argv[1], isGTF?FILE_TYPE_GTF:FILE_TYPE_RSUBREAD, &loaded_features);
8570 	if(nexons<1){
8571 		if(nexons >= -1) SUBREADprintf("Failed to open the annotation file %s, or its format is incorrect, or it contains no '%s' features.\n",argv[1], nameFeatureTypeColumn);
8572 		return -1;
8573 	}
8574 
8575 	sort_feature_info(&global_context, nexons, loaded_features, &chr, &geneid, &start, &stop, &sorted_strand, &anno_chr_2ch, &anno_chrs, &anno_chr_head, & block_end_index, & block_min_start, & block_max_end);
8576 	if((!global_context.do_scRNA_table) || global_context.is_gene_level) global_context.lineno_2_sortedno_tab = NULL;
8577 	else global_context.lineno_2_sortedno_tab = scRNA_copy_loaded_features(nexons, loaded_features);
8578 	if(global_context.do_junction_counting){
8579 		sort_bucket_table(&global_context);
8580 	}
8581 	print_in_box(80,0,0,"   Meta-features : %d", global_context . gene_name_table -> numOfElements);
8582 	print_in_box(80,0,0,"   Chromosomes/contigs : %d", global_context . exontable_nchrs);
8583 
8584 	print_in_box(80,0,0,"");
8585 
8586 	if(global_context.do_scRNA_table){
8587 		print_in_box(80,0,0,"Load scRNA-related files...");
8588 		print_in_box(80,0,0,"   scRNA samples : %d", global_context.scRNA_sample_sheet_table->numOfElements);
8589 		print_in_box(80,0,0,"   scRNA cell barcodes : %d", global_context.scRNA_cell_barcodes_array -> numOfElements);
8590 		print_in_box(80,0,0,"");
8591 	}
8592 
8593 	if(fasta_contigs_fname){
8594 		print_in_box(80,0,0,"Load FASTA contigs from %s...", get_short_fname(fasta_contigs_fname));
8595 		global_context.fasta_contigs = malloc(sizeof(fasta_contigs_t));
8596 		int ret_fq = read_contig_fasta(global_context.fasta_contigs, fasta_contigs_fname);
8597 		if(ret_fq){
8598 			print_in_box(80,0,0,"   WARNING unable to open the FASTA file.");
8599 			print_in_box(80,0,0,"");
8600 			free(global_context.fasta_contigs);
8601 			global_context.fasta_contigs = NULL;
8602 		}else{
8603 			print_in_box(80,0,0,"   %lu contigs were loaded", global_context.fasta_contigs -> contig_table -> numOfElements);
8604 			print_in_box(80,0,0,"");
8605 		}
8606 	}else	global_context.fasta_contigs = NULL;
8607 
8608 
8609 	global_context.exontable_exons = nexons;
8610 	unsigned int x1, total_written_coulmns=0;
8611 
8612 
8613 
8614 
8615 	char * tmp_pntr = NULL, *tmp_smode_ptr = NULL;
8616 	char * strand_mode_list = strdup(global_context.strand_check_mode);
8617 	char * file_list_used = malloc(strlen(file_name_ptr)+1);
8618 	char * file_list_used2 = malloc(strlen(file_name_ptr)+1);
8619 	char * is_unique = malloc(strlen(file_name_ptr)+1);
8620 	strcpy(file_list_used, file_name_ptr);
8621 	for(x1 = 0;;x1++){
8622 		char * test_fn = strtok_r(x1?NULL:file_list_used, FC_FLIST_SPLITOR, &tmp_pntr);
8623 		if(NULL == test_fn) break;
8624 		char * short_fname = get_short_fname(test_fn);
8625 		strcpy(file_list_used2, file_name_ptr);
8626 
8627 		is_unique[x1]=1;
8628 		char * loop_ptr = NULL;
8629 		int x2;
8630 		for(x2 = 0;;x2++){
8631 			char * test_loopfn = strtok_r(x2?NULL:file_list_used2, FC_FLIST_SPLITOR, &loop_ptr);
8632 			if(NULL == test_loopfn) break;
8633 			if(x1==x2)continue;
8634 
8635 			char * short_loop_fname = get_short_fname(test_loopfn);
8636 
8637 			if(strcmp(short_loop_fname, short_fname)==0) {
8638 				is_unique[x1] = 0;
8639 				break;
8640 			}
8641 		}
8642 	}
8643 	free(file_list_used2);
8644 
8645 	tmp_pntr = NULL;
8646 	strcpy(file_list_used, file_name_ptr);
8647 	char * next_fn = strtok_r(file_list_used, FC_FLIST_SPLITOR, &tmp_pntr);
8648 	char * next_strand_mode = strtok_r(strand_mode_list, ".", &tmp_smode_ptr);
8649 	int one_single_strand_mode = -1;
8650 	if(NULL == strstr( global_context.strand_check_mode, "." )){
8651 		one_single_strand_mode = next_strand_mode[0] - '0';
8652 		assert(one_single_strand_mode >= 0 && one_single_strand_mode < 3);
8653 	}
8654 
8655 	ArrayList * table_columns = ArrayListCreate(n_input_files+1);
8656 	ArrayList * table_column_names = ArrayListCreate(n_input_files+1);
8657 	ArrayList * read_counters = ArrayListCreate(n_input_files+1);
8658 	ArrayListSetDeallocationFunction(table_columns, free);
8659 	ArrayListSetDeallocationFunction(table_column_names, free);
8660 	ArrayListSetDeallocationFunction(read_counters, free);
8661 
8662 	ArrayList * junction_global_table_list = NULL;
8663 	ArrayList * splicing_global_table_list = NULL;
8664 
8665 	if(global_context.do_junction_counting){
8666 		junction_global_table_list = ArrayListCreate(n_input_files+1);
8667 		splicing_global_table_list = ArrayListCreate(n_input_files+1);
8668 		ArrayListSetDeallocationFunction(junction_global_table_list, (void (*)(void *))HashTableDestroy);
8669 		ArrayListSetDeallocationFunction(splicing_global_table_list, (void (*)(void *))HashTableDestroy);
8670 	}
8671 
8672 	int ret_int = 0;
8673 
8674 #ifdef MAKE_STANDALONE
8675 	#define NO_SORT_OPTION_NAME "donotsort"
8676 #else
8677 	#define NO_SORT_OPTION_NAME "autosort"
8678 #endif
8679 
8680 	for(x1 = 0;;x1++){
8681 		int orininal_isPE = global_context.is_paired_end_mode_assign;
8682 		if(next_fn==NULL || strlen(next_fn)<1 || global_context.disk_is_full) break;
8683 		int this_file_isPEassign = isPEassign[1]?isPEassign[x1] == '1' :(isPEassign[0]=='1');
8684 		int this_file_isPEexpected = is_paired_end_reads_expected[1]?is_paired_end_reads_expected[x1]=='1' :(is_paired_end_reads_expected[0]=='1');
8685 		global_context.is_paired_end_reads_expected = this_file_isPEexpected;
8686 		global_context.is_paired_end_mode_assign = this_file_isPEassign;
8687 		if(global_context.do_not_sort && 0==this_file_isPEassign){
8688 			print_in_box(80,0,0,"      WARNING the %s option is ignored when single-end reads", NO_SORT_OPTION_NAME);
8689 			print_in_box(80,0,0,"              are being counted.");
8690 		}
8691 
8692 		read_count_type_t * column_numbers = calloc(nexons, sizeof(read_count_type_t));
8693 		HashTable * junction_global_table = NULL;
8694 		HashTable * splicing_global_table = NULL;
8695 
8696 		strcpy(global_context.input_file_name, next_fn);
8697 		strcpy(global_context.raw_input_file_name, next_fn);
8698 		global_context.this_input_number = x1;
8699 		global_context.input_file_unique = is_unique[x1];
8700 		global_context.input_file_short_name = get_short_fname(next_fn);
8701 		if(strstr( global_context.strand_check_mode, "." )){
8702 			global_context.is_strand_checked = next_strand_mode[0]-'0';
8703 			assert(global_context.is_strand_checked >=0 && global_context.is_strand_checked <=2);
8704 		}else global_context.is_strand_checked = one_single_strand_mode;
8705 		global_context.redo=0;
8706 
8707 		if(global_context.is_scRNA_BAM_FQ_out_generated && global_context.scRNA_sample_sheet_table){
8708 			global_context.scRNA_sample_BAM_writers = HashTableCreate(global_context.scRNA_sample_sheet_table -> numOfElements);
8709 			HashTableSetDeallocationFunctions(global_context.scRNA_sample_BAM_writers, NULL, scRNA_close_sample_SamBam_writers);
8710 			global_context.scRNA_sample_sheet_table ->appendix1 = global_context.scRNA_sample_BAM_writers;
8711 			global_context.scRNA_sample_sheet_table ->appendix2 = &global_context;
8712 			global_context.scRNA_sample_sheet_table ->appendix3 = global_context.scRNA_sample_id_to_name;
8713 			HashTableIteration( global_context.scRNA_sample_sheet_table, scRNA_sample_SamBam_writers_new_files);
8714 		}
8715 
8716 		if(global_context.do_junction_counting){
8717 			junction_global_table = HashTableCreate(156679);
8718 			splicing_global_table = HashTableCreate(156679);
8719 
8720 			HashTableSetHashFunction(junction_global_table,HashTableStringHashFunction);
8721 			HashTableSetDeallocationFunctions(junction_global_table, free, NULL);
8722 			HashTableSetKeyComparisonFunction(junction_global_table, fc_strcmp_chro);
8723 
8724 			HashTableSetHashFunction(splicing_global_table,HashTableStringHashFunction);
8725 			HashTableSetDeallocationFunctions(splicing_global_table, free, NULL);
8726 			HashTableSetKeyComparisonFunction(splicing_global_table, fc_strcmp_chro);
8727 		}
8728 
8729 		HashTable * merged_RG_table = NULL;
8730 		if(global_context.assign_reads_to_RG){
8731 			merged_RG_table = HashTableCreate(97);
8732 			HashTableSetHashFunction(merged_RG_table,HashTableStringHashFunction);
8733 			HashTableSetDeallocationFunctions(merged_RG_table, NULL, free); // the names are put into the column_names table, but the 4-pointer arrays are not used anymore.
8734 			HashTableSetKeyComparisonFunction(merged_RG_table, fc_strcmp_chro);
8735 		}
8736 
8737 		fc_read_counters * my_read_counter = calloc(1, sizeof(fc_read_counters));
8738 		global_context.is_read_details_out = isReadSummaryReport;
8739 		global_context.max_M = max_M;
8740 
8741 		ret_int = ret_int || readSummary_single_file(& global_context, column_numbers, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start, block_max_end, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features);
8742 		if(global_context.disk_is_full){
8743 			SUBREADprintf("ERROR: disk is full. Please check the free space in the output directory.\n");
8744 		}
8745 		if(ret_int!=0){
8746 			// give up this file.
8747 			if(global_context.do_junction_counting){
8748 				HashTableDestroy(junction_global_table);
8749 				HashTableDestroy(splicing_global_table);
8750 			}
8751 			free(column_numbers);
8752 		} else {
8753 			// finished
8754 
8755 			char * mem_file_name = memstrcpy(next_fn);
8756 			if(!global_context.assign_reads_to_RG){
8757 				ArrayListPush(table_columns, column_numbers);
8758 				ArrayListPush(table_column_names, mem_file_name);
8759 				ArrayListPush(read_counters, my_read_counter);
8760 				if(global_context.do_junction_counting){
8761 					ArrayListPush(junction_global_table_list,junction_global_table);
8762 					ArrayListPush(splicing_global_table_list,splicing_global_table);
8763 				}
8764 			}
8765 
8766 			if(global_context.assign_reads_to_RG){
8767 				int rgcur;
8768 				char * rg_name = global_context.RGnames_set;
8769 				for(rgcur = 0; rgcur < global_context.RGnames_ptr+1;  rgcur ++){
8770 					if(global_context.RGnames_set[rgcur] == '\t'||global_context.RGnames_set[rgcur] == '\0'){
8771 						global_context.RGnames_set[rgcur] = 0;
8772 						int rg_name_len = strlen(rg_name);
8773 						if(rg_name_len > 0){
8774 	//						SUBREADprintf("GET 4Tab:'%s'\n", rg_name);
8775 							void ** tab4 = HashTableGet(merged_RG_table, rg_name);
8776 							int file_len = strlen(mem_file_name);
8777 
8778 							char * rg_file_name = malloc(rg_name_len + 3 + file_len);
8779 							sprintf(rg_file_name, "%s:%s", mem_file_name, rg_name);
8780 
8781 							ArrayListPush(table_column_names, rg_file_name);
8782 							ArrayListPush(table_columns, tab4[0]);
8783 							ArrayListPush(read_counters, tab4[1]);
8784 							if(global_context.do_junction_counting){
8785 								ArrayListPush(junction_global_table_list,tab4[2]);
8786 								ArrayListPush(splicing_global_table_list,tab4[3]);
8787 							}
8788 							rg_name = global_context.RGnames_set + rgcur + 1;
8789 						}
8790 					}
8791 				}
8792 				free(mem_file_name);
8793 			}
8794 			total_written_coulmns ++;
8795 		}
8796 		global_context.is_paired_end_mode_assign = orininal_isPE;
8797 		next_fn = strtok_r(NULL, FC_FLIST_SPLITOR, &tmp_pntr);
8798 
8799 		if(strstr( global_context.strand_check_mode, "." )) next_strand_mode = strtok_r(NULL, ".", &tmp_smode_ptr);
8800 		if(global_context.assign_reads_to_RG) free(global_context.RGnames_set);
8801 		if(merged_RG_table) HashTableDestroy(merged_RG_table);
8802 	}
8803 
8804 	free(file_list_used);
8805 	free(is_unique);
8806 
8807 	if(global_context.is_input_bad_format){
8808 	//	SUBREADprintf("\nEEROR: The program has to terminate and no counting file is generated.\n\n");
8809 	}else if(!global_context.disk_is_full){
8810 		print_in_box(80,0,0,"Write the final count table.");
8811 		if(isGeneLevel){
8812 			char ** sorted_extra_columns = NULL;
8813 			if(global_context.reported_extra_columns != NULL){
8814 				sorted_extra_columns = malloc(sizeof(char**) * nexons);
8815 				int ii;
8816 				for(ii = 0; ii < nexons; ii++){
8817 					sorted_extra_columns[loaded_features[ii].sorted_order] = loaded_features[ii].extra_columns;
8818 					//SUBREADprintf("SSMQ: %d = %s\n", loaded_features[ii].sorted_order, loaded_features[ii].extra_columns);
8819 				}
8820 			}
8821 
8822 			fc_write_final_gene_results(&global_context, geneid, chr, start, stop, sorted_strand, sorted_extra_columns, argv[3], nexons,  table_columns, table_column_names, loaded_features, isCVersion);
8823 
8824 			if(sorted_extra_columns) free(sorted_extra_columns);
8825 		} else
8826 			fc_write_final_results(&global_context, argv[3], nexons, table_columns, table_column_names, loaded_features, isCVersion);
8827 	}
8828 	if(global_context.do_junction_counting && global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
8829 		print_in_box(80,0,0,"Write the junction count table.");
8830 		fc_write_final_junctions(&global_context, argv[3], table_column_names, junction_global_table_list, splicing_global_table_list);
8831 	}
8832 
8833 	if(global_context.is_input_bad_format == 0 && !global_context.disk_is_full){
8834 		print_in_box(80,0,0,"Write the read assignment summary.");
8835 		fc_write_final_counts(&global_context, argv[3], table_column_names,  read_counters, isCVersion);
8836 	}
8837 
8838 	ArrayListDestroy(table_columns);
8839 	ArrayListDestroy(table_column_names);
8840 	ArrayListDestroy(read_counters);
8841 	if(global_context.do_junction_counting){
8842 		ArrayListDestroy(junction_global_table_list);
8843 		ArrayListDestroy(splicing_global_table_list);
8844 	}
8845 	free(file_name_ptr);
8846 
8847 	if(global_context.is_input_bad_format == 0) print_FC_results(&global_context, (char *)argv[3]/*out file name*/);
8848 	KeyValuePair * cursor;
8849 	int bucket;
8850 	for(bucket=0; bucket < global_context.exontable_chro_table  -> numOfBuckets; bucket++)
8851 	{
8852 		cursor = global_context.exontable_chro_table -> bucketArray[bucket];
8853 		while (1)
8854 		{
8855 			if (!cursor) break;
8856 			fc_chromosome_index_info * del_chro_info = cursor->value;
8857 			free(del_chro_info->reverse_table_start_index);
8858 			//free(del_chro_info->reverse_table_end_index);
8859 			free((void *)cursor -> key);
8860 			free(del_chro_info);
8861 			cursor = cursor->next;
8862 		}
8863 	}
8864 
8865 	if(global_context.read_details_out_FP) fclose(global_context. read_details_out_FP);
8866 	HashTableDestroy(global_context.gene_name_table);
8867 	HashTableDestroy(global_context.GCcontent_table);
8868 	if(global_context.scRNA_sample_sheet_table){
8869 		HashTableDestroy(global_context.scRNA_sample_sheet_table);
8870 		ArrayListDestroy(global_context.scRNA_sample_barcode_list);
8871 		ArrayListDestroy(global_context.scRNA_sample_id_to_name);
8872 		HashTableDestroy(global_context.scRNA_lineno1B_to_sampleno1B_tab);
8873 
8874 		for(x1=0; x1<global_context.scRNA_barcode_batched_bin_no +2; x1++){
8875 			char tmp_fname[MAX_FILE_NAME_LENGTH+20];
8876 			sprintf(tmp_fname, "%s/cellCounts-Splitted-Reads-%05d-%05d.bin", temp_dir, getpid(), x1);
8877 			unlink(tmp_fname);
8878 			pthread_spin_destroy(global_context.scRNA_barcode_batched_locks+x1);
8879 		}
8880 		pthread_spin_destroy(&global_context.scRNA_do_one_batch_runner_lock);
8881 
8882 		if(global_context.is_scRNA_BAM_FQ_out_generated){
8883 			HashTableDestroy(global_context.scRNA_sample_BAM_writers);
8884 		}
8885 	}
8886 	if(global_context.scRNA_cell_barcodes_array){
8887 		SUBREADprintf("DESTROYING global_context.scRNA_cell_barcodes_array : %p and %p, having %lld\n", global_context.scRNA_cell_barcodes_array, global_context.scRNA_cell_barcodes_array->elemDeallocator, global_context.scRNA_cell_barcodes_array-> numOfElements);
8888 		ArrayListDestroy(global_context.scRNA_cell_barcodes_array);
8889 		HashTableDestroy(global_context.scRNA_cell_barcode_head_tail_table);
8890 	}
8891 	free(global_context.gene_name_array);
8892 
8893 	HashTableDestroy(global_context.exontable_chro_table);
8894 	if(global_context.fasta_contigs){
8895 		destroy_contig_fasta(global_context.fasta_contigs);
8896 		free(global_context.fasta_contigs);
8897 	}
8898 	if(global_context.BAM_chros_to_anno_table)
8899 		HashTableDestroy(global_context.BAM_chros_to_anno_table);
8900 	if(global_context.do_junction_counting){
8901 		HashTableDestroy(global_context.junction_bucket_table);
8902 		HashTableDestroy(global_context.junction_features_table);
8903 	}
8904 
8905 
8906 	free(global_context.unistr_buffer_space);
8907 
8908 	if(global_context.reported_extra_columns){
8909 		for(bucket = 0; bucket < nexons; bucket++)
8910 			free(loaded_features[bucket].extra_columns);
8911 	}
8912 	if(global_context.lineno_2_sortedno_tab)HashTableDestroy(global_context.lineno_2_sortedno_tab);
8913 
8914 	free(loaded_features);
8915 	free(geneid);
8916 	free(chr);
8917 	free(start);
8918 	free(sorted_strand);
8919 	free(anno_chr_2ch);
8920 	free(anno_chrs);
8921 	free(anno_chr_head);
8922 	free(block_min_start);
8923 	free(block_max_end);
8924 	free(block_end_index);
8925 	free(stop);
8926 	free(strand_mode_list);
8927 
8928 	return total_written_coulmns?0:-1;
8929 }
8930 
/*
 * Adds every gene stored in `gene_feature_table` to the junction bucket
 * table of `global_context`.
 *
 * A gene is registered once for each JUNCTION_BUCKET_STEP-aligned position
 * from the aligned-down value of its pos_first_base up to its pos_last_base.
 * The key of a bucket is the text "<chro_name>:<aligned position>".  A bucket
 * that does not exist yet is created with room for 3 genes, and a heap copy
 * of its key is handed to the hash table together with the new list.
 *
 *   global_context     : context owning junction_bucket_table.
 *   gene_feature_table : hash table whose values are fc_junction_gene_t *.
 *   chro_name          : chromosome name used as the key prefix.
 */
void register_buckets(fc_thread_global_context_t * global_context , HashTable * gene_feature_table, char * chro_name){
	int slot_no;

	for(slot_no = 0; slot_no < gene_feature_table -> numOfBuckets; slot_no++){
		KeyValuePair * entry;

		for(entry = gene_feature_table -> bucketArray[slot_no]; entry != NULL; entry = entry -> next){
			fc_junction_gene_t * gene = (fc_junction_gene_t *) entry -> value;
			// Start from the bucket boundary at or below the first base of the gene.
			unsigned int step_pos = gene -> pos_first_base - gene -> pos_first_base % JUNCTION_BUCKET_STEP;

			while(step_pos <= gene -> pos_last_base){
				char key_text[CHROMOSOME_NAME_LENGTH + 20];
				gene_info_list_t * members;

				sprintf(key_text, "%s:%u", chro_name, step_pos);
				members = HashTableGet(global_context -> junction_bucket_table, key_text);

				if(!members){
					// First gene in this bucket: create an empty list and a key copy for the table.
					char * stored_key;

					members = malloc(sizeof(gene_info_list_t));
					members -> space = 3;
					members -> used = 0;
					members -> genes = malloc(sizeof(void *) * members -> space);

					stored_key = malloc(strlen(key_text) + 1);
					strcpy(stored_key, key_text);
					HashTablePut(global_context -> junction_bucket_table, stored_key, members);
				}

				if(members -> space == members -> used){
					// The list is full: enlarge it by 3 slots or by 30%, whichever is larger.
					members -> space = max(members -> space + 3, members -> space * 1.3);
					members -> genes = realloc(members -> genes , members -> space * sizeof(void *));
				}

				members -> genes[members -> used] = gene;
				members -> used ++;

				step_pos += JUNCTION_BUCKET_STEP;
			}
		}
	}
}
8965 
/*
 * Fills the junction bucket table from the junction feature table.
 *
 * junction_features_table maps a chromosome name (the key) to a per-chromosome
 * gene table (the value).  Each such pair is passed to register_buckets().
 * No ordering of any kind is performed here, despite the function name.
 */
void sort_bucket_table(fc_thread_global_context_t * global_context){
	HashTable * per_chro_tables = global_context -> junction_features_table;
	int slot_no;

	for(slot_no = 0; slot_no < per_chro_tables -> numOfBuckets; slot_no++){
		KeyValuePair * entry = per_chro_tables -> bucketArray[slot_no];

		for(; entry != NULL; entry = entry -> next)
			register_buckets(global_context, (HashTable *) entry -> value, (char *) entry -> key);
	}
}
8980 
8981 
/*
 * Runs a SAM pairer over global_context -> input_file_name with
 * process_pairer_scRNAr2_output as the output callback, then destroys the
 * pairer.
 *
 * The pairer is given a temporary-file prefix of the form
 * "<temp_file_dir>/temp-core-<pid>-<tag>.sam", where <tag> is the string
 * produced by mac_or_rand_str().
 */
void scRNA_generate_BAM_FASTQ(fc_thread_global_context_t * global_context){
	char unique_tag[13];
	char pairer_tmp_prefix[MAX_FILE_NAME_LENGTH+100];

	mac_or_rand_str(unique_tag);
	sprintf(pairer_tmp_prefix, "%s/temp-core-%06u-%s.sam", global_context -> temp_file_dir, getpid(), unique_tag);

	SAM_pairer_create(
		&global_context -> scRNA_read_pairer,
		global_context -> thread_number,
		global_context -> max_BAM_header_size/1024/1024+2,
		1,	/* is bam */
		0,	/* do not drop seq/qual */
		1,	/* single end */
		0,	/* do not sort */
		0,	/* no RG */
		0,
		global_context -> input_file_name,
		NULL,
		NULL,
		process_pairer_scRNAr2_output,
		pairer_tmp_prefix,
		global_context,
		9999);

	SAM_pairer_run(&global_context -> scRNA_read_pairer);
	SAM_pairer_destroy(&global_context -> scRNA_read_pairer);
}
8992 
readSummary_single_file(fc_thread_global_context_t * global_context,read_count_type_t * column_numbers,srInt_64 nexons,int * geneid,char ** chr,srInt_64 * start,srInt_64 * stop,unsigned char * sorted_strand,char * anno_chr_2ch,char ** anno_chrs,srInt_64 * anno_chr_head,srInt_64 * block_end_index,srInt_64 * block_min_start,srInt_64 * block_max_end,fc_read_counters * my_read_counter,HashTable * junction_global_table,HashTable * splicing_global_table,HashTable * merged_RG_table,fc_feature_info_t * loaded_features)8993 int readSummary_single_file(fc_thread_global_context_t * global_context, read_count_type_t * column_numbers, srInt_64 nexons,  int * geneid, char ** chr, srInt_64 * start, srInt_64 * stop, unsigned char * sorted_strand, char * anno_chr_2ch, char ** anno_chrs, srInt_64 * anno_chr_head, srInt_64 * block_end_index, srInt_64 * block_min_start , srInt_64 * block_max_end, fc_read_counters * my_read_counter, HashTable * junction_global_table, HashTable * splicing_global_table, HashTable * merged_RG_table, fc_feature_info_t * loaded_features)
8994 {
8995 	int read_length = 0;
8996 	int is_first_read_PE=0;
8997 	char * line = (char*)calloc(MAX_LINE_LENGTH, 1);
8998 	char * file_str = "";
8999 
9000 	int file_probe = is_certainly_bam_file(global_context->input_file_name, &is_first_read_PE, NULL);
9001 
9002 		// a Singel-end SAM/BAM file cannot be assigned as a PE SAM/BAM file;
9003 		// but a PE SAM/BAM file may be assigned as a SE file if the user wishes to do so.
9004 
9005 	global_context->is_SAM_file = 1;
9006 	if(file_probe == 1) global_context->is_SAM_file = 0;
9007 	global_context->is_mixed_PE_SE = 0;
9008 	global_context->any_reads_are_PE = 0;
9009 	global_context -> start_time = miltime();
9010 
9011 	file_str = "SAM";
9012 	if(file_probe == 1) file_str = "BAM" ;
9013 	if(file_probe == -1) file_str = "Unknown";
9014 
9015 	if(!global_context->redo)
9016 	{
9017 		print_in_box(80,0,0,"Process %s file %s...", file_str, global_context -> use_stdin_file? "<STDIN>":get_short_fname(global_context->input_file_name));
9018 		if(global_context->is_strand_checked)
9019 			print_in_box(80,0,0,"   Strand specific : %s", global_context->is_strand_checked==1?"stranded":"reversely stranded");
9020 	}
9021 
9022 	// Open the SAM/BAM file
9023 	// Nothing is done if the file does not exist.
9024 
9025 	fc_thread_start_threads(global_context, nexons, geneid, chr, start, stop, sorted_strand, anno_chr_2ch, anno_chrs, anno_chr_head, block_end_index, block_min_start , block_max_end, read_length);
9026 	fc_thread_wait_threads(global_context);
9027 	if(global_context -> is_paired_end_reads_expected && !global_context -> any_reads_are_PE){
9028 		SUBREADprintf("ERROR: No paired-end reads were detected in paired-end read library : %s\n", global_context -> input_file_name);
9029 		global_context -> is_input_bad_format=1;
9030 		return -1;
9031 	}
9032 
9033 	srInt_64 nreads_mapped_to_exon = 0;
9034 	fc_thread_merge_results(global_context, column_numbers , &nreads_mapped_to_exon, my_read_counter, junction_global_table, splicing_global_table, merged_RG_table, loaded_features, nexons);
9035 	if(global_context -> do_scRNA_table){
9036 		scRNA_generate_BAM_FASTQ(global_context);
9037 		free(global_context -> scRNA_applied_umi_cut);
9038 	}
9039 	fc_thread_destroy_thread_context(global_context);
9040 
9041 	if(global_context -> sambam_chro_table) free(global_context -> sambam_chro_table);
9042 	global_context -> sambam_chro_table = NULL;
9043 
9044 	free(line);
9045 	if(global_context -> is_input_bad_format) return -1;
9046 	return 0;
9047 }
9048 
9049 
9050 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)9051 int main(int argc, char ** argv)
9052 #else
9053 int feature_count_main(int argc, char ** argv)
9054 #endif
9055 {
9056 	char * Rargv[61];
9057 	char annot_name[MAX_FILE_NAME_LENGTH];
9058 	char temp_dir[MAX_FILE_NAME_LENGTH];
9059 	char * out_name = malloc(MAX_FILE_NAME_LENGTH);
9060 	char * fasta_contigs_name = malloc(MAX_FILE_NAME_LENGTH);
9061 	char * alias_file_name = malloc(MAX_FILE_NAME_LENGTH);
9062 	char * Rpath = malloc(MAX_FILE_NAME_LENGTH);
9063 	char * scRNA_sample_sheet = malloc(MAX_FILE_NAME_LENGTH);
9064 	char * scRNA_cell_barcode_list = malloc(MAX_FILE_NAME_LENGTH);
9065 
9066 	int cmd_rebuilt_size = 2000;
9067 	char * cmd_rebuilt = malloc(cmd_rebuilt_size);
9068 	char max_M_str[8];
9069 	char nameFeatureTypeColumn[2000];
9070 	char nameGeneIDColumn[66];
9071 	int min_qual_score = 0;
9072 	int min_dist = 50;
9073 	int max_dist = 600;
9074 	int read_shift_size = 0;
9075 	char debug_command[15];
9076 	char max_missing_bases_in_read_str[15];
9077 	char max_missing_bases_in_feature_str[15];
9078 	char min_dist_str[15];
9079 	char max_dist_str[15];
9080 	char read_shift_size_str[15];
9081 	char read_shift_type[15];
9082 	char min_qual_score_str[15];
9083 	char feature_block_size_str[15];
9084 	char * Strand_Sensitive_Str = "0";
9085 	char * old_zero_smode = Strand_Sensitive_Str;
9086 	char strFeatureFracOverlap[15];
9087 	char Pair_Orientations[3];
9088 	char * extra_column_names = NULL;
9089 	char * very_long_file_names;
9090 	char is_paired_end_reads_expected[2];
9091 	int is_Input_Need_Reorder = 0;
9092 	int is_PE = 0;
9093 	int is_SAM = 1;
9094 	int is_primary_alignment_only = 0;
9095 	int is_GeneLevel = 1;
9096 	int is_Overlap = 0;
9097 	int is_Both_End_Mapped = 0;
9098 	int is_Restrictedly_No_Overlap = 0;
9099 	int feature_block_size = 14;
9100 	int is_ReadSummary_Report = 0;
9101 	int is_Chimeric_Disallowed = 0;
9102 	int is_PE_Dist_Checked = 0;
9103 	int is_Multi_Mapping_Allowed = 0;
9104 	int is_Split_or_Exonic_Only = 0;
9105 	int is_duplicate_ignored = 0;
9106 	int assign_reads_to_RG = 0;
9107 	int do_not_sort = 0;
9108 	int do_junction_cnt = 0;
9109 	int do_detection_call = 0;
9110 	int reduce_5_3_ends_to_one = 0;
9111 	int use_fraction_multimapping = 0;
9112 	int threads = 1;
9113 	int isGTF = 1;
9114 	int use_overlapping_length_break_tie = 0;
9115 	char nthread_str[4];
9116 	int option_index = 0;
9117 	int max_missing_bases_in_feature = -1;
9118 	int max_missing_bases_in_read = -1;
9119 	int scRNA_input_mode = GENE_INPUT_BCL;
9120 	int c;
9121 	int very_long_file_names_size = 200;
9122 	int fiveEndExtension = 0, threeEndExtension = 0, minFragmentOverlap = 1;
9123 	float fracOverlap = 0.0, fracOverlapFeature = 0.0;
9124 	int std_input_output_mode = 0, long_read_mode = 0, is_verbose = 0;
9125 	int is_scRNA_BAM_FQ_out_generated = 1;
9126 	char strFiveEndExtension[11], strThreeEndExtension[11], strMinFragmentOverlap[11], fracOverlapStr[20], std_input_output_mode_str[16], long_read_mode_str[16];
9127 	very_long_file_names = malloc(very_long_file_names_size);
9128 	very_long_file_names [0] = 0;
9129 	fasta_contigs_name[0]=0;
9130 	scRNA_cell_barcode_list[0]=0;
9131 	scRNA_sample_sheet[0]=0;
9132 	is_paired_end_reads_expected[0]='0';
9133 	is_paired_end_reads_expected[1]='\0';
9134 
9135 	alias_file_name[0]=0;
9136 	debug_command[0] = 0;
9137 
9138 	strcpy(read_shift_type,"upstream");
9139 	strcpy(nameFeatureTypeColumn,"exon");
9140 	strcpy(nameGeneIDColumn,"gene_id");
9141 	strcpy(temp_dir, "<use output directory>");
9142 	annot_name[0]=0;out_name[0]=0;Rpath[0]=0;
9143 
9144 
9145 	cmd_rebuilt[0]=0;
9146 	for(c = 0; c<argc;c++)
9147 	{
9148 		if(strlen(cmd_rebuilt) + 1000 > cmd_rebuilt_size)
9149 		{
9150 			cmd_rebuilt_size*=2;
9151 			cmd_rebuilt = realloc(cmd_rebuilt, cmd_rebuilt_size);
9152 		}
9153 		sprintf(cmd_rebuilt+strlen(cmd_rebuilt), "\"%s\" ", argv[c]);
9154 	}
9155 
9156 	optind=0;
9157 	opterr=1;
9158 	optopt=63;
9159 	strcpy(max_M_str, "10");
9160 	strcpy(Pair_Orientations,"fr");
9161 
9162 	while ((c = getopt_long (argc, argv, "G:A:g:t:T:o:a:d:D:LQ:pbF:fs:S:CBJPMOR:v?", long_options, &option_index)) != -1)
9163 		switch(c)
9164 		{
9165 			case 'S':
9166 				/*
9167 				if(strlen(optarg)!=2 || (strcmp(optarg, "ff")!=0 && strcmp(optarg, "rf")!=0 && strcmp(optarg, "fr")!=0)){
9168 					SUBREADprintf("The order parameter can only be ff, fr or rf.\n");
9169 					print_usage();
9170 					return -1;
9171 				}
9172 				Pair_Orientations[0]=(optarg[0]=='r'?'r':'f');
9173 				Pair_Orientations[1]=(optarg[1]=='f'?'f':'r');
9174 				Pair_Orientations[2]=0;
9175 				*/
9176 				SUBREADprintf("The \"-S\" option has been depreciated.\n");
9177 
9178 				break;
9179 			case 'G':
9180 				strcpy(fasta_contigs_name , optarg);
9181 				break;
9182 			case 'J':
9183 				do_junction_cnt = 1;
9184 				break;
9185 			case 'A':
9186 				strcpy(alias_file_name, optarg);
9187 				break;
9188 			case 'M':
9189 				is_Multi_Mapping_Allowed = 1;
9190 				break;
9191 			case 'v':
9192 				core_version_number("featureCounts");
9193 				return 0;
9194 			case 'Q':
9195 				if(!is_valid_digit_range(optarg, "Q", 0 , 255))
9196 					STANDALONE_exit(-1);
9197 
9198 				min_qual_score = atoi(optarg);
9199 				break;
9200 			case 't':
9201 				strcpy(nameFeatureTypeColumn, optarg);
9202 				break;
9203 			case 'g':
9204 				while((*optarg) == ' ') optarg++;
9205 				strcpy(nameGeneIDColumn, optarg);
9206 				break;
9207 			case 'T':
9208 				if(!is_valid_digit_range(optarg, "T", 1, FC_MAX_THREADS))
9209 					STANDALONE_exit(-1);
9210 
9211 				threads = atoi(optarg);
9212 				break;
9213 			case 'd':
9214 				if(!is_valid_digit(optarg, "d"))
9215 					STANDALONE_exit(-1);
9216 
9217 				min_dist = atoi(optarg);
9218 				break;
9219 			case 'D':
9220 				if(!is_valid_digit(optarg, "D"))
9221 					STANDALONE_exit(-1);
9222 
9223 				max_dist = atoi(optarg);
9224 				break;
9225 			case 'p':
9226 				is_paired_end_reads_expected[0]='1';
9227 				break;
9228 			case 'C':
9229 				is_Chimeric_Disallowed = 1;
9230 				break;
9231 			case 'P':
9232 				is_PE_Dist_Checked = 1;
9233 				break;
9234 			case 'B':
9235 				is_Both_End_Mapped = 1;
9236 				break;
9237 			case 'f':
9238 				is_GeneLevel = 0;
9239 				break;
9240 			case 'F':
9241 				isGTF = 1;
9242 				if(strcmp("SAF", optarg)==0) isGTF=0;
9243 				else if(strcmp("GTF", optarg)==0) isGTF=1;
9244 				else SUBREADprintf("\nWarning: Unknown annotation format: %s. GTF format is used.\n\n", optarg);
9245 				break;
9246 			case 'O':
9247 				is_Overlap = 1;
9248 				break;
9249 			case 'R':
9250 				if(strcmp(optarg, "SAM")==0) is_ReadSummary_Report = FILE_TYPE_SAM;
9251 				else if(strcmp(optarg, "BAM")==0) is_ReadSummary_Report = FILE_TYPE_BAM;
9252 				else if(strcmp(optarg, "CORE")==0) is_ReadSummary_Report = FILE_TYPE_RSUBREAD;
9253 				else{
9254 					SUBREADprintf("\nERROR: unknown output format: '%s'\n\n", optarg);
9255 					STANDALONE_exit(-1);
9256 				}
9257 				break;
9258 			case 's':
9259 				Strand_Sensitive_Str = strdup(optarg);
9260 				int xx;
9261 				for(xx =0; Strand_Sensitive_Str[xx]!='\0'; xx++) if(Strand_Sensitive_Str[xx]==',') Strand_Sensitive_Str[xx]='.';
9262 				break;
9263 //			case 'i':
9264 //				term_strncpy(sam_name, optarg,299);
9265 //				break;
9266 			case 'o':
9267 				term_strncpy(out_name, optarg,MAX_FILE_NAME_LENGTH-1);
9268 				break;
9269 			case 'a':
9270 				term_strncpy(annot_name, optarg,MAX_FILE_NAME_LENGTH-1);
9271 				break;
9272 			case 'L':
9273 				long_read_mode = 1;
9274 				break;
9275 			case 0 :	// long options
9276 
9277 				if(strcmp("countReadPairs", long_options[option_index].name)==0){
9278 					is_PE=1;
9279 				}
9280 
9281 				if(strcmp("primary", long_options[option_index].name)==0)
9282 				{
9283 					is_primary_alignment_only = 1;
9284 				}
9285 
9286 				if(strcmp("readExtension5", long_options[option_index].name)==0)
9287 				{
9288 					if(!is_valid_digit_range(optarg, "readExtension5", 0, 0x7fffffff))
9289 						STANDALONE_exit(-1);
9290 					fiveEndExtension = atoi(optarg);
9291 					fiveEndExtension = max(0, fiveEndExtension);
9292 				}
9293 
9294 				if(strcmp("readExtension3", long_options[option_index].name)==0)
9295 				{
9296 					if(!is_valid_digit_range(optarg, "readExtension3", 0, 0x7fffffff))
9297 						STANDALONE_exit(-1);
9298 					threeEndExtension = atoi(optarg);
9299 					threeEndExtension = max(0, threeEndExtension);
9300 				}
9301 
9302 				if(strcmp("fracOverlap", long_options[option_index].name)==0)
9303 				{
9304 					if(!is_valid_float(optarg, "fracOverlap"))
9305 						STANDALONE_exit(-1);
9306 					fracOverlap = atof(optarg);
9307 				}
9308 
9309 
9310 				if(strcmp("fracOverlapFeature", long_options[option_index].name)==0)
9311 				{
9312 					if(!is_valid_float(optarg, "fracOverlapFeature"))
9313 						STANDALONE_exit(-1);
9314 					fracOverlapFeature = atof(optarg);
9315 				}
9316 
9317 				if(strcmp("nonOverlapFeature", long_options[option_index].name)==0){
9318 					if(!is_valid_digit_range(optarg, "nonOverlapFeature", 0, 0x7fffffff))
9319 						STANDALONE_exit(-1);
9320 					max_missing_bases_in_feature = atoi(optarg);
9321 				}
9322 
9323 				if(strcmp("nonOverlap", long_options[option_index].name)==0){
9324 					if(!is_valid_digit_range(optarg, "nonOverlap", 0, 0x7fffffff))
9325 						STANDALONE_exit(-1);
9326 					max_missing_bases_in_read = atoi(optarg);
9327 				}
9328 
9329 				if(strcmp("scCellBarcodeFile", long_options[option_index].name)==0)
9330 				{
9331 					 strcpy(scRNA_cell_barcode_list,optarg);
9332 				}
9333 
9334 				if(strcmp("scSampleSheet", long_options[option_index].name)==0)
9335 				{
9336 					 strcpy(scRNA_sample_sheet,optarg);
9337 				}
9338 
9339 				if(strcmp("scInputMode", long_options[option_index].name)==0)
9340 				{
9341 					if(strcmp("FASTQ", optarg)==0)
9342 						scRNA_input_mode=GENE_INPUT_SCRNA_FASTQ;
9343 					if(strcmp("BAM", optarg)==0)
9344 						scRNA_input_mode=GENE_INPUT_SCRNA_BAM;
9345 				}
9346 
9347 
9348 				if(strcmp("extraAttributes", long_options[option_index].name)==0)
9349 				{
9350 					extra_column_names = strdup(optarg);
9351 				}
9352 
9353 				if(strcmp("Rpath", long_options[option_index].name)==0)
9354 				{
9355 					strcpy(Rpath, optarg);
9356 				}
9357 
9358 				if(strcmp("minOverlap", long_options[option_index].name)==0)
9359 				{
9360 					if(!is_valid_digit(optarg, "minOverlap"))
9361 						STANDALONE_exit(-1);
9362 					minFragmentOverlap = atoi(optarg);
9363 				}
9364 
9365 				if(strcmp("debugCommand", long_options[option_index].name)==0)
9366 				{
9367 					strcpy(debug_command, optarg);
9368 				}
9369 
9370 
9371 				if(strcmp("ignoreDup", long_options[option_index].name)==0)
9372 				{
9373 					is_duplicate_ignored = 1 ;
9374 				}
9375 
9376 				if(strcmp("fraction", long_options[option_index].name)==0)
9377 				{
9378 					use_fraction_multimapping = 1;
9379 				}
9380 				if(strcmp("tmpDir", long_options[option_index].name)==0){
9381 					strcpy(temp_dir, optarg);
9382 				}
9383 				if(strcmp("maxMOp", long_options[option_index].name)==0){
9384 					if(!is_valid_digit_range(optarg, "maxMOp", 1 , 65555))
9385 						STANDALONE_exit(-1);
9386 					strcpy(max_M_str, optarg);
9387 				}
9388 				if(strcmp("read2pos", long_options[option_index].name)==0)
9389 				{
9390 					if(optarg[0]=='3')
9391 						reduce_5_3_ends_to_one = REDUCE_TO_3_PRIME_END;
9392 					else if(optarg[0]=='5')
9393 						reduce_5_3_ends_to_one = REDUCE_TO_5_PRIME_END;
9394 					else{
9395 						SUBREADprintf("Invalide parameter to the --read2pos option: %s\n", optarg);
9396 						STANDALONE_exit(-1);
9397 					}
9398 				}
9399 
9400 				if(strcmp("largestOverlap", long_options[option_index].name)==0)
9401 				{
9402 					use_overlapping_length_break_tie = 1;
9403 				}
9404 
9405 				if(strcmp("detectionCall", long_options[option_index].name)==0)
9406 				{
9407 					do_detection_call = 1;
9408 				}
9409 
9410 				if(strcmp("donotsort", long_options[option_index].name)==0)
9411 				{
9412 					do_not_sort = 1;
9413 				}
9414 
9415 				if(strcmp("readShiftSize", long_options[option_index].name)==0)
9416 				{
9417 					if(!is_valid_digit_range(optarg, "readShiftSize", 1 , 0x7fffffff))
9418 						STANDALONE_exit(-1);
9419 					read_shift_size = atoi(optarg);
9420 				}
9421 
9422 				if(strcmp("readShiftType", long_options[option_index].name)==0)
9423 				{
9424 					if(strcmp(optarg,"upstream")!=0 && strcmp(optarg,"downstream")!=0 && strcmp(optarg,"left")!=0 && strcmp(optarg,"right")!=0){
9425 						SUBREADprintf("Error: the readShiftType parameter can only be 'upstream', 'downstream', 'left' or 'right'\n");
9426 						STANDALONE_exit(-1);
9427 					}
9428 					strcpy(read_shift_type, optarg);
9429 				}
9430 
9431 				if(strcmp("splitOnly", long_options[option_index].name)==0)
9432 				{
9433 					if(is_Split_or_Exonic_Only == 2) {
9434 						SUBREADprintf("Error: You can not specify both splitOnly and nonSplitOnly\n");
9435 						return -1;
9436 					}
9437 					is_Split_or_Exonic_Only = 1;
9438 				}
9439 
9440 				if(strcmp("restrictedlyNoOverlap", long_options[option_index].name)==0)
9441 				{
9442 					is_Restrictedly_No_Overlap = 1;
9443 				}
9444 				if(strcmp("nonSplitOnly", long_options[option_index].name)==0)
9445 				{
9446 					if(is_Split_or_Exonic_Only == 1) {
9447 						SUBREADprintf("Error: You can not specify both splitOnly and nonSplitOnly\n");
9448 						return -1;
9449 					}
9450 					is_Split_or_Exonic_Only = 2;
9451 				}
9452 
9453 				if(strcmp("verbose", long_options[option_index].name)==0){
9454 					is_verbose = 1;
9455 				}
9456 
9457 				if(strcmp("byReadGroup", long_options[option_index].name)==0){
9458 					assign_reads_to_RG = 1;
9459 				}
9460 				break;
9461 			case '?':
9462 			default :
9463 				print_usage();
9464 				return -1;
9465 				break;
9466 		}
9467 
9468 
9469 	if(minFragmentOverlap<1)
9470 	{
9471 		fiveEndExtension = - minFragmentOverlap + 1;
9472 		threeEndExtension =  - minFragmentOverlap + 1;
9473 		minFragmentOverlap = 1;
9474 	}
9475 
9476 	if(out_name[0]==0 || annot_name[0]==0)
9477 	{
9478 		print_usage();
9479 		return -1;
9480 	}
9481 
9482 	for(; optind < argc; optind++)
9483 	{
9484 		int curr_strlen = strlen(very_long_file_names);
9485 		if( very_long_file_names_size - curr_strlen < MAX_FILE_NAME_LENGTH+1)
9486 		{
9487 			very_long_file_names_size *=2;
9488 			//printf("CL=%d ; NS=%d\n", curr_strlen , very_long_file_names_size);
9489 			very_long_file_names=realloc(very_long_file_names , very_long_file_names_size);
9490 		}
9491 
9492 		strcat(very_long_file_names, argv[optind]);
9493 		strcat(very_long_file_names, FC_FLIST_SPLITOR);
9494 	}
9495 
9496 	very_long_file_names[strlen(very_long_file_names)-1]=0;
9497 	std_input_output_mode = (strcmp(very_long_file_names, "") == 0?1:0);
9498 
9499 	sprintf(strFiveEndExtension, "%d", fiveEndExtension);
9500 	sprintf(strThreeEndExtension, "%d", threeEndExtension);
9501 	sprintf(strMinFragmentOverlap, "%d", minFragmentOverlap);
9502 	sprintf(nthread_str,"%d", threads);
9503 	sprintf(min_dist_str,"%d",min_dist);
9504 	sprintf(max_dist_str,"%d",max_dist);
9505 	sprintf(min_qual_score_str,"%d", min_qual_score);
9506 	sprintf(feature_block_size_str,"%d", feature_block_size);
9507 	sprintf(fracOverlapStr, "%g", fracOverlap);
9508 	sprintf(std_input_output_mode_str,"%d",std_input_output_mode);
9509 	sprintf(long_read_mode_str, "%d", long_read_mode);
9510 	sprintf(strFeatureFracOverlap, "%g", fracOverlapFeature);
9511 	sprintf(max_missing_bases_in_feature_str, "%d", max_missing_bases_in_feature);
9512 	sprintf(max_missing_bases_in_read_str, "%d", max_missing_bases_in_read);
9513 	sprintf(read_shift_size_str, "%d", read_shift_size);
9514 
9515 	Rargv[0] = "CreadSummary";
9516 	Rargv[1] = annot_name;
9517 	Rargv[2] = very_long_file_names;
9518 	Rargv[3] = out_name;
9519 	Rargv[4] = is_PE?"1":"0";
9520 	Rargv[5] = min_dist_str;
9521 	Rargv[6] = max_dist_str;
9522 	Rargv[7] = is_SAM?"1":"0";
9523 	Rargv[8] = is_Overlap?"1":"0";
9524 	Rargv[9] = is_GeneLevel?"1":"0";
9525 	Rargv[10] = nthread_str;
9526 	Rargv[11] = isGTF?"1":"0";
9527 	Rargv[12] = Strand_Sensitive_Str;
9528 	Rargv[13] = is_ReadSummary_Report == 0 ? "0":(is_ReadSummary_Report == FILE_TYPE_RSUBREAD?"10":(is_ReadSummary_Report == FILE_TYPE_BAM?"500":"50"));
9529 	Rargv[14] = is_Both_End_Mapped?"1":"0";
9530 	Rargv[15] = is_Chimeric_Disallowed?"1":"0";
9531 	Rargv[16] = is_PE_Dist_Checked?"1":"0";
9532 	Rargv[17] = nameFeatureTypeColumn;
9533 	Rargv[18] = nameGeneIDColumn;
9534 	Rargv[19] = min_qual_score_str;
9535 	Rargv[20] = is_Multi_Mapping_Allowed?"1":"0";
9536 	Rargv[21] = alias_file_name;
9537 	Rargv[22] = cmd_rebuilt;
9538 	Rargv[23] = is_Input_Need_Reorder?"1":"0";
9539 	Rargv[24] = feature_block_size_str;
9540 	Rargv[25] = strFiveEndExtension;
9541 	Rargv[26] = strThreeEndExtension;
9542 	Rargv[27] = strMinFragmentOverlap;
9543 	Rargv[28] = is_Split_or_Exonic_Only == 1?"1":(is_Split_or_Exonic_Only ==  2 ? "2":"0");
9544 	Rargv[29] = (reduce_5_3_ends_to_one == 0?"0":(reduce_5_3_ends_to_one==REDUCE_TO_3_PRIME_END?"3":"5"));
9545 	Rargv[30] = debug_command;
9546 	Rargv[31] = is_duplicate_ignored?"1":"0";
9547 	Rargv[32] = do_not_sort?"1":"0";
9548 	Rargv[33] = use_fraction_multimapping?"1":"0";
9549 	Rargv[34] = use_overlapping_length_break_tie?"1":"0";
9550 	Rargv[35] = Pair_Orientations;
9551 	Rargv[36] = do_junction_cnt?"1":"0";
9552 	Rargv[37] = fasta_contigs_name;
9553 	Rargv[38] = max_M_str;
9554 	Rargv[39] = is_Restrictedly_No_Overlap?"1":"0";
9555 	Rargv[40] = fracOverlapStr;
9556 	Rargv[41] = temp_dir;
9557 	Rargv[42] = std_input_output_mode_str;
9558 	Rargv[43] = assign_reads_to_RG?"1":"0";
9559 	Rargv[44] = long_read_mode_str;
9560 	Rargv[45] = is_verbose?"1":"0";
9561 	Rargv[46] = strFeatureFracOverlap;
9562 	Rargv[47] = do_detection_call?"1":"0";
9563 	Rargv[48] = max_missing_bases_in_read_str;
9564 	Rargv[49] = max_missing_bases_in_feature_str;
9565 	Rargv[50] = is_primary_alignment_only?"1":"0";
9566 	Rargv[51] = Rpath;
9567 	Rargv[52] = extra_column_names;
9568 	Rargv[54] = "NA"; // C featureCounts dosn't need the display_annotation_name.
9569 	Rargv[54] = read_shift_type;
9570 	Rargv[55] = read_shift_size_str;
9571 	Rargv[56] = scRNA_sample_sheet;
9572 	Rargv[57] = scRNA_cell_barcode_list;
9573 	Rargv[58] = is_paired_end_reads_expected;
9574 	Rargv[59] = is_scRNA_BAM_FQ_out_generated?"1":"0";
9575 
9576 	Rargv[60] = "3";
9577 	if(scRNA_input_mode == GENE_INPUT_SCRNA_FASTQ) Rargv[60] = "4";
9578 	if(scRNA_input_mode == GENE_INPUT_SCRNA_BAM) Rargv[60] = "5";
9579 
9580 	int retvalue = -1;
9581 	if(is_ReadSummary_Report && (std_input_output_mode & 1)==1) SUBREADprintf("ERROR: no detailed assignment results can be written when the input is from STDIN. Please remove the '-R' option.\n");
9582 	else retvalue = readSummary(61, Rargv);
9583 
9584 	free(very_long_file_names);
9585 	free(out_name);
9586 	free(alias_file_name);
9587 	free(fasta_contigs_name);
9588 	if(old_zero_smode != Strand_Sensitive_Str)free(Strand_Sensitive_Str);
9589 	free(cmd_rebuilt);
9590 	free(Rpath);
9591 	free(scRNA_sample_sheet);
9592 	free(scRNA_cell_barcode_list);
9593 	if(extra_column_names)free(extra_column_names);
9594 
9595 	return retvalue;
9596 
9597 }
9598 
9599 
9600