1 /***************************************************************
2 
3    The Subread and Rsubread software packages are free
4    software packages:
5 
6    you can redistribute it and/or modify it under the terms
7    of the GNU General Public License as published by the
8    Free Software Foundation, either version 3 of the License,
9    or (at your option) any later version.
10 
11    Subread is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty
13    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 
15    See the GNU General Public License for more details.
16 
17    Authors: Drs Yang Liao and Wei Shi
18 
19   ***************************************************************/
20 
21 
22 #define _GNU_SOURCE
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdarg.h>
26 #include <assert.h>
27 #include <string.h>
28 #include <unistd.h>
29 #include <ctype.h>
30 
31 
32 
33 #ifndef MAKE_STANDALONE
34   #include <R.h>
35 #endif
36 
37 #include <zlib.h>
38 #include <math.h>
39 #include <pthread.h>
40 #include <getopt.h>
41 #include "subread.h"
42 #include "interval_merge.h"
43 #include "core.h"
44 #include "gene-algorithms.h"
45 #include "sambam-file.h"
46 #include "input-files.h"
47 #include "hashtable.h"
48 #include "seek-zlib.h"
49 #include "HelperFunctions.h"
50 
51 #define MERGING_MODE_CHOPPING 100   // keep all the edges : [100,199] + [150,249] = [100,149], [150,199], [200,249]
52 #define MERGING_MODE_MERGING 0    // just merge into one: [100,199] + [150,249] = [100, 249]
53 
54 typedef struct{
55 	char GTF_gene_id_column[MAX_READ_NAME_LEN];
56 	char GTF_wanted_feature_type[MAX_READ_NAME_LEN];
57 	char GTF_file_name[MAX_FILE_NAME_LENGTH];
58 	char output_file_name[MAX_FILE_NAME_LENGTH];
59 	FILE * output_FP;
60 	int merging_mode;
61 
62 	HashTable * gene_to_chro_strand_table;
63 	HashTable * gene_chro_strand_to_exons_table;
64 } flatAnno_context_t;
65 
66 static struct option long_options[] =
67 {
68 	{0, 0, 0, 0}
69 };
70 
71 
flatAnno_print_usage()72 void flatAnno_print_usage(){
73 	SUBREADprintf("flattenGTF Version %s\n\n", SUBREAD_VERSION);
74 	SUBREADputs("  Flatten features included in a GTF annotation and save the modified annotation");
75 	SUBREADputs("  to a SAF format file.");
76 	SUBREADputs("");
77 	SUBREADputs("Usage:");
78 	SUBREADputs("  ./flattenGTF [options] -a <input_file> -o <output_file>");
79 	SUBREADputs("");
80 	SUBREADputs("## Mandatory arguments: ");
81 	SUBREADputs("");
82 	SUBREADputs("  -a <file>      Name of an annotation file in GTF/GFF format.");
83 	SUBREADputs("");
84 	SUBREADputs("  -o <file>      Name of output file.");
85 	SUBREADputs("");
86 	SUBREADputs("## Optional arguments: ");
87 	SUBREADputs("");
88 	SUBREADputs("  -t <string>    Specify feature type in a GTF annotation. 'exon' by default.");
89 	SUBREADputs("                 Features with the specified feature type are extracted from the");
90 	SUBREADputs("                 annotation for processing.");
91 	SUBREADputs("");
92 	SUBREADputs("  -g <string>    Specify attribute type in GTF annotation. 'gene_id' by default.");
93 	SUBREADputs("                 This attribute type is used to group features into meta-");
94 	SUBREADputs("                 features.");
95 	SUBREADputs("");
96 	SUBREADputs("  -C             Merging overlapping exons into multiple non-overlapping exons but");
97 	SUBREADputs("                 all the edges are kept.");
98 	SUBREADputs("");
99 }
100 
flatAnno_finalise(flatAnno_context_t * context)101 int flatAnno_finalise(flatAnno_context_t * context){
102 	HashTableDestroy(context -> gene_to_chro_strand_table);
103 	HashTableDestroy(context -> gene_chro_strand_to_exons_table);
104 	fclose(context -> output_FP );
105 	SUBREADputs("Finished.\n");
106 	return 0;
107 }
108 
flatAnno_do_anno_1R(char * gene_name,char * transcript_name,char * chro_name,unsigned int start,unsigned int end,int is_negative_strand,void * Vcontext)109 int flatAnno_do_anno_1R(char * gene_name, char * transcript_name, char * chro_name, unsigned int start, unsigned int end, int is_negative_strand, void * Vcontext){
110 	flatAnno_context_t * context = Vcontext;
111 	ArrayList * chro_strand_list_for_gene = HashTableGet(context -> gene_to_chro_strand_table, gene_name);
112 	if(NULL == chro_strand_list_for_gene){
113 		char * mem_gene = malloc(strlen(gene_name)+1);
114 		strcpy(mem_gene, gene_name);
115 
116 		chro_strand_list_for_gene = ArrayListCreate(3);
117 		ArrayListSetDeallocationFunction(chro_strand_list_for_gene, free);
118 		HashTablePut(context -> gene_to_chro_strand_table, mem_gene, chro_strand_list_for_gene);
119 	}
120 	int i, found = 0;
121 	char chro_strand[MAX_CHROMOSOME_NAME_LEN+10+FEATURE_NAME_LENGTH];
122 	sprintf( chro_strand, "%s\t%s\t%c", gene_name,chro_name,is_negative_strand?'-':'+');
123 	for(i=0; i<chro_strand_list_for_gene->numOfElements; i++){
124 		char * old_ch_st = ArrayListGet(chro_strand_list_for_gene,i);
125 		if(strcmp(old_ch_st, chro_strand) == 0){
126 			found=1;
127 			break;
128 		}
129 	}
130 	if(!found){
131 		char * mem_ch_st = strdup(chro_strand);
132 		ArrayListPush(chro_strand_list_for_gene, mem_ch_st);
133 	}
134 
135 	ArrayList * ge_ch_st_exon_list = HashTableGet(context -> gene_chro_strand_to_exons_table, chro_strand);
136 	if(NULL == ge_ch_st_exon_list){
137 		ge_ch_st_exon_list =  ArrayListCreate(3);
138 		ArrayListSetDeallocationFunction(ge_ch_st_exon_list,free);
139 		HashTablePut(context -> gene_chro_strand_to_exons_table, strdup(chro_strand), ge_ch_st_exon_list);
140 	}
141 	int * mem_start_end = malloc(sizeof(int)*2);
142 	mem_start_end[0] = (int)start;
143 	mem_start_end[1] = (int)end;
144 	ArrayListPush(ge_ch_st_exon_list, mem_start_end);
145 	return 0;
146 }
147 
flatAnno_do_anno_merge_one_array_compare(void * vL,void * vR,ArrayList * me)148 int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR, ArrayList *me){
149 	int * iL = vL, *iR = vR;
150 	if((*iL)>(*iR))return 1;
151 	if((*iL)<(*iR))return -1;
152 	return 0;
153 }
154 
flatAnno_do_anno_chop_one_array(void * key,void * hashed_obj,HashTable * tab)155 void flatAnno_do_anno_chop_one_array(void * key, void * hashed_obj, HashTable * tab){
156 	ArrayList * this_list = hashed_obj; // each element is a pair of integers defining start and end of an exon.
157 	ArrayList * edge_before_me_List = ArrayListCreate( this_list->numOfElements * 2 );
158 	int i;
159 	for(i=0; i<this_list -> numOfElements; i++){
160 		int * curr_2i = this_list -> elementList[ i ];
161 		int findi, endi;
162 		for(endi = 0; endi < 2; endi++){
163 			assert(curr_2i[endi]>0);
164 			int search_tag = curr_2i[endi] + endi, found=0;
165 
166 			for(findi = 0; findi < edge_before_me_List->numOfElements; findi++){
167 				if(ArrayListGet(edge_before_me_List,findi)-NULL == search_tag){
168 					found = 1;
169 					break;
170 				}
171 			}
172 			if(0==found) ArrayListPush(edge_before_me_List, NULL+search_tag);
173 		}
174 	}
175 	ArrayListSort(edge_before_me_List, NULL);
176 	char * continue_after_an_edge = malloc(edge_before_me_List -> numOfElements -1);
177 	memset(continue_after_an_edge,0, edge_before_me_List -> numOfElements -1);
178 	int missed_gaps = edge_before_me_List->numOfElements -1;
179     for(i=0; i<edge_before_me_List->numOfElements -1; i++){
180 		int an_edge_before_me = edge_before_me_List-> elementList[i]-NULL;
181 		int j;
182 		for(j=0; j<this_list->numOfElements; j++){
183 			int * curr_2i = this_list -> elementList[ j ];
184 			if( curr_2i[0] <= an_edge_before_me && curr_2i[1] >= an_edge_before_me ) {
185 				continue_after_an_edge[i]=1;
186 				missed_gaps --;
187 				break;
188 			}
189 		}
190 	}
191 
192 	long old_capacity = this_list -> capacityOfElements, old_items = this_list -> numOfElements;
193 	if(edge_before_me_List->numOfElements -1 - missed_gaps > old_capacity){
194 		this_list -> elementList = realloc( this_list -> elementList, sizeof(void*) * edge_before_me_List->numOfElements -1 -missed_gaps );
195 		this_list -> capacityOfElements = edge_before_me_List->numOfElements -1 -missed_gaps ;
196 	}
197 
198 	int written_i = 0;
199     for(i=0; i<edge_before_me_List->numOfElements -1; i++){
200 		int * curr_2i = NULL;
201 		//if(edge_before_me_List->elementList[i] - NULL == 108395906)SUBREADprintf("CV_I %d = %ld ~ %ld ; CTN=%d\n", i, edge_before_me_List->elementList[i] - NULL,  edge_before_me_List -> elementList[i+1] -NULL-1, continue_after_an_edge[i]);
202 		if(0==continue_after_an_edge[i]){
203 		//	SUBREADprintf("NC_I %d ~ [ 0 , %ld ]\n", i, edge_before_me_List->numOfElements -2);
204 			assert(i>0 && i<edge_before_me_List->numOfElements -2);
205 			assert(continue_after_an_edge[i+1] && continue_after_an_edge[i-1] ); // otherwise it is a singleton edge.
206 			continue;
207 		}
208 
209 		if(written_i < old_items){
210 			curr_2i = this_list -> elementList[ written_i ];
211 		}else{
212 			curr_2i = malloc(sizeof(int)*2);
213 			this_list -> elementList[ written_i ] = curr_2i;
214 		}
215 		curr_2i[0] = edge_before_me_List -> elementList[i] -NULL;
216 		curr_2i[1] = edge_before_me_List -> elementList[i+1] -NULL-1;
217 		written_i++;
218 	}
219 	assert(written_i == edge_before_me_List->numOfElements -1 - missed_gaps );
220 	for(i=written_i; i<old_items; i++) free( this_list -> elementList[i]);
221 	this_list -> numOfElements = written_i;
222 	ArrayListDestroy(edge_before_me_List);
223 	free(continue_after_an_edge);
224 }
225 
flatAnno_do_anno_merge_one_array(void * key,void * hashed_obj,HashTable * tab)226 void flatAnno_do_anno_merge_one_array(void * key, void * hashed_obj, HashTable * tab){
227 	ArrayList * this_list = hashed_obj;
228 	ArrayListSort(this_list, flatAnno_do_anno_merge_one_array_compare);
229 	int i, n1_items = 0;
230 	for(i=1; i<this_list -> numOfElements; i++){
231 		int * last_2i = this_list -> elementList[ n1_items ];
232 		int * curr_2i = this_list -> elementList[ i ];
233 		if(last_2i[1] >= curr_2i[1]) continue;
234 		if(last_2i[1] >= curr_2i[0] -1){
235 			last_2i[1]=curr_2i[1];
236 			continue;
237 		}
238 		n1_items++;
239 		if(n1_items< i){
240 			last_2i = this_list -> elementList[ n1_items ];
241 			last_2i[0] = curr_2i[0];
242 			last_2i[1] = curr_2i[1];
243 		}
244 	}
245 	for(i=n1_items+1; i<this_list -> numOfElements; i++)free(this_list -> elementList[i]);
246 	this_list -> numOfElements = n1_items+1;
247 }
248 
flatme_strcmp(void * L,void * R,ArrayList * me)249 int flatme_strcmp(void * L, void * R, ArrayList * me){
250 	return strcmp(L, R);
251 }
252 
flatAnno_do_anno_merge_and_write(flatAnno_context_t * context)253 int flatAnno_do_anno_merge_and_write(flatAnno_context_t * context){
254 	context -> gene_chro_strand_to_exons_table -> appendix1 = context;
255 
256 	HashTableIteration(context -> gene_chro_strand_to_exons_table, context -> merging_mode == MERGING_MODE_CHOPPING? flatAnno_do_anno_chop_one_array :flatAnno_do_anno_merge_one_array);
257 	ArrayList * all_chro_st_list = HashTableKeyArray(context -> gene_chro_strand_to_exons_table);
258 	ArrayListSort(all_chro_st_list, flatme_strcmp);
259 
260 	fprintf( context -> output_FP , "GeneID\tChr\tStart\tEnd\tStrand\n");
261 	int i,j;
262 	for(i = 0; i< all_chro_st_list -> numOfElements; i++){
263 		char * ge_chro_strand = ArrayListGet(all_chro_st_list,i);
264 		char * local_ge_chro_strand = strdup(ge_chro_strand);
265 		char * strand_ptr = local_ge_chro_strand;
266 
267 		for(j=0; j<2; strand_ptr++)
268 			if(*strand_ptr=='\t')j++;
269 		strand_ptr[-1] = 0;
270 
271 		ArrayList * exon_in_chro_strand = HashTableGet(context -> gene_chro_strand_to_exons_table, ge_chro_strand);
272 
273 		for(j=0; j< exon_in_chro_strand  -> numOfElements; j++){
274 			int * start_end_2i = ArrayListGet(exon_in_chro_strand,j);
275 			fprintf( context -> output_FP ,"%s\t%d\t%d\t%s\n", local_ge_chro_strand, start_end_2i[0], start_end_2i[1], strand_ptr);
276 		}
277 		free(local_ge_chro_strand);
278 	}
279 	ArrayListDestroy(all_chro_st_list );
280 	return 0;
281 }
282 
flatAnno_do_anno(flatAnno_context_t * context)283 int flatAnno_do_anno(flatAnno_context_t * context){
284 	int loaded_features = load_features_annotation(context -> GTF_file_name, FILE_TYPE_GTF, context -> GTF_gene_id_column, NULL, context ->  GTF_wanted_feature_type, context,  flatAnno_do_anno_1R);
285 	if(loaded_features<0)SUBREADputs("ERROR: Unable to open the GTF file.");
286 	if(loaded_features==0)SUBREADprintf("ERROR: No '%s' feature was found in the GTF file. (the '%s' attribute is required)\n", context ->  GTF_wanted_feature_type,  context -> GTF_gene_id_column);
287 	if(loaded_features<=0) return -1;
288 
289 	return flatAnno_do_anno_merge_and_write(context);
290 }
291 
292 
flatAnno_start(flatAnno_context_t * context)293 int flatAnno_start(flatAnno_context_t * context){
294 	SUBREADputs("");
295 	if(!context -> GTF_file_name[0]){
296 		flatAnno_print_usage();
297 
298 		if(context -> output_file_name[0]) SUBREADprintf("Error: no input file is specified.\n");
299 		return -1;
300 	}
301 
302 	if(!context -> output_file_name[0]){
303 		flatAnno_print_usage();
304 		SUBREADprintf("Error: no output file is specified.\n");
305 		return -1;
306 	}
307 	SUBREADprintf("Flattening GTF file: %s\n", context -> GTF_file_name);
308 	SUBREADprintf("Output SAF file: %s\n", context -> output_file_name);
309 	context -> output_FP = fopen(context -> output_file_name, "w");
310 	if(NULL == context -> output_FP ){
311 		SUBREADprintf("Error: unable to open the output file.\n");
312 		return -1;
313 	}
314 
315 	SUBREADprintf("\nLooking for '%s' features... (grouped by '%s')\n\n", context ->  GTF_wanted_feature_type, context -> GTF_gene_id_column);
316 	context -> gene_to_chro_strand_table = StringTableCreate(30000);
317 	HashTableSetDeallocationFunctions(context -> gene_to_chro_strand_table, free, (void(*)(void *))ArrayListDestroy);
318 	context -> gene_chro_strand_to_exons_table =  StringTableCreate(30000);
319 	HashTableSetDeallocationFunctions(context -> gene_chro_strand_to_exons_table, free, (void(*)(void *))ArrayListDestroy);
320 	return 0;
321 }
322 
323 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)324 int main(int argc, char ** argv)
325 #else
326 int R_flattenAnnotations(int argc, char ** argv)
327 #endif
328 {
329 	flatAnno_context_t context;
330 	memset(&context, 0, sizeof(context));
331 	strcpy(context.GTF_gene_id_column, "gene_id");
332 	strcpy(context.GTF_wanted_feature_type, "exon");
333 
334 	int option_index = 0,c;
335 	optind=0;
336 	opterr=1;
337 	optopt=63;
338 	while ((c = getopt_long (argc, argv, "Ct:g:a:o:v?", long_options, &option_index)) != -1)
339 		switch(c) {
340 			case 'C':
341 				context.merging_mode = MERGING_MODE_CHOPPING;
342 				break;
343 			case 'o':
344 				strcpy(context.output_file_name,  optarg);
345 				break;
346 			case 'a':
347 				strcpy(context.GTF_file_name,  optarg);
348 				break;
349 			case 'g':
350 				strcpy(context.GTF_gene_id_column, optarg);
351 				break;
352 			case 't':
353 				strcpy(context.GTF_wanted_feature_type, optarg);
354 				break;
355 
356 			case '?':
357 			default :
358 				flatAnno_print_usage();
359 				return -1;
360 		}
361 
362 	int ret = 0;
363 	ret = ret || flatAnno_start(&context);
364 	ret = ret || flatAnno_do_anno(&context);
365 	ret = ret || flatAnno_finalise(&context);
366 	return ret;
367 }
368