1 /***************************************************************
2
3 The Subread and Rsubread software packages are free
4 software packages:
5
6 you can redistribute it and/or modify it under the terms
7 of the GNU General Public License as published by the
8 Free Software Foundation, either version 3 of the License,
9 or (at your option) any later version.
10
11 Subread is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty
13 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14
15 See the GNU General Public License for more details.
16
17 Authors: Drs Yang Liao and Wei Shi
18
19 ***************************************************************/
20
21
22 #define _GNU_SOURCE
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <stdarg.h>
26 #include <assert.h>
27 #include <string.h>
28 #include <unistd.h>
29 #include <ctype.h>
30
31
32
33 #ifndef MAKE_STANDALONE
34 #include <R.h>
35 #endif
36
37 #include <zlib.h>
38 #include <math.h>
39 #include <pthread.h>
40 #include <getopt.h>
41 #include "subread.h"
42 #include "interval_merge.h"
43 #include "core.h"
44 #include "gene-algorithms.h"
45 #include "sambam-file.h"
46 #include "input-files.h"
47 #include "hashtable.h"
48 #include "seek-zlib.h"
49 #include "HelperFunctions.h"
50
51 #define MERGING_MODE_CHOPPING 100 // keep all the edges : [100,199] + [150,249] = [100,149], [150,199], [200,249]
52 #define MERGING_MODE_MERGING 0 // just merge into one: [100,199] + [150,249] = [100, 249]
53
54 typedef struct{
55 char GTF_gene_id_column[MAX_READ_NAME_LEN];
56 char GTF_wanted_feature_type[MAX_READ_NAME_LEN];
57 char GTF_file_name[MAX_FILE_NAME_LENGTH];
58 char output_file_name[MAX_FILE_NAME_LENGTH];
59 FILE * output_FP;
60 int merging_mode;
61
62 HashTable * gene_to_chro_strand_table;
63 HashTable * gene_chro_strand_to_exons_table;
64 } flatAnno_context_t;
65
66 static struct option long_options[] =
67 {
68 {0, 0, 0, 0}
69 };
70
71
flatAnno_print_usage()72 void flatAnno_print_usage(){
73 SUBREADprintf("flattenGTF Version %s\n\n", SUBREAD_VERSION);
74 SUBREADputs(" Flatten features included in a GTF annotation and save the modified annotation");
75 SUBREADputs(" to a SAF format file.");
76 SUBREADputs("");
77 SUBREADputs("Usage:");
78 SUBREADputs(" ./flattenGTF [options] -a <input_file> -o <output_file>");
79 SUBREADputs("");
80 SUBREADputs("## Mandatory arguments: ");
81 SUBREADputs("");
82 SUBREADputs(" -a <file> Name of an annotation file in GTF/GFF format.");
83 SUBREADputs("");
84 SUBREADputs(" -o <file> Name of output file.");
85 SUBREADputs("");
86 SUBREADputs("## Optional arguments: ");
87 SUBREADputs("");
88 SUBREADputs(" -t <string> Specify feature type in a GTF annotation. 'exon' by default.");
89 SUBREADputs(" Features with the specified feature type are extracted from the");
90 SUBREADputs(" annotation for processing.");
91 SUBREADputs("");
92 SUBREADputs(" -g <string> Specify attribute type in GTF annotation. 'gene_id' by default.");
93 SUBREADputs(" This attribute type is used to group features into meta-");
94 SUBREADputs(" features.");
95 SUBREADputs("");
96 SUBREADputs(" -C Merging overlapping exons into multiple non-overlapping exons but");
97 SUBREADputs(" all the edges are kept.");
98 SUBREADputs("");
99 }
100
flatAnno_finalise(flatAnno_context_t * context)101 int flatAnno_finalise(flatAnno_context_t * context){
102 HashTableDestroy(context -> gene_to_chro_strand_table);
103 HashTableDestroy(context -> gene_chro_strand_to_exons_table);
104 fclose(context -> output_FP );
105 SUBREADputs("Finished.\n");
106 return 0;
107 }
108
flatAnno_do_anno_1R(char * gene_name,char * transcript_name,char * chro_name,unsigned int start,unsigned int end,int is_negative_strand,void * Vcontext)109 int flatAnno_do_anno_1R(char * gene_name, char * transcript_name, char * chro_name, unsigned int start, unsigned int end, int is_negative_strand, void * Vcontext){
110 flatAnno_context_t * context = Vcontext;
111 ArrayList * chro_strand_list_for_gene = HashTableGet(context -> gene_to_chro_strand_table, gene_name);
112 if(NULL == chro_strand_list_for_gene){
113 char * mem_gene = malloc(strlen(gene_name)+1);
114 strcpy(mem_gene, gene_name);
115
116 chro_strand_list_for_gene = ArrayListCreate(3);
117 ArrayListSetDeallocationFunction(chro_strand_list_for_gene, free);
118 HashTablePut(context -> gene_to_chro_strand_table, mem_gene, chro_strand_list_for_gene);
119 }
120 int i, found = 0;
121 char chro_strand[MAX_CHROMOSOME_NAME_LEN+10+FEATURE_NAME_LENGTH];
122 sprintf( chro_strand, "%s\t%s\t%c", gene_name,chro_name,is_negative_strand?'-':'+');
123 for(i=0; i<chro_strand_list_for_gene->numOfElements; i++){
124 char * old_ch_st = ArrayListGet(chro_strand_list_for_gene,i);
125 if(strcmp(old_ch_st, chro_strand) == 0){
126 found=1;
127 break;
128 }
129 }
130 if(!found){
131 char * mem_ch_st = strdup(chro_strand);
132 ArrayListPush(chro_strand_list_for_gene, mem_ch_st);
133 }
134
135 ArrayList * ge_ch_st_exon_list = HashTableGet(context -> gene_chro_strand_to_exons_table, chro_strand);
136 if(NULL == ge_ch_st_exon_list){
137 ge_ch_st_exon_list = ArrayListCreate(3);
138 ArrayListSetDeallocationFunction(ge_ch_st_exon_list,free);
139 HashTablePut(context -> gene_chro_strand_to_exons_table, strdup(chro_strand), ge_ch_st_exon_list);
140 }
141 int * mem_start_end = malloc(sizeof(int)*2);
142 mem_start_end[0] = (int)start;
143 mem_start_end[1] = (int)end;
144 ArrayListPush(ge_ch_st_exon_list, mem_start_end);
145 return 0;
146 }
147
flatAnno_do_anno_merge_one_array_compare(void * vL,void * vR,ArrayList * me)148 int flatAnno_do_anno_merge_one_array_compare(void * vL, void * vR, ArrayList *me){
149 int * iL = vL, *iR = vR;
150 if((*iL)>(*iR))return 1;
151 if((*iL)<(*iR))return -1;
152 return 0;
153 }
154
flatAnno_do_anno_chop_one_array(void * key,void * hashed_obj,HashTable * tab)155 void flatAnno_do_anno_chop_one_array(void * key, void * hashed_obj, HashTable * tab){
156 ArrayList * this_list = hashed_obj; // each element is a pair of integers defining start and end of an exon.
157 ArrayList * edge_before_me_List = ArrayListCreate( this_list->numOfElements * 2 );
158 int i;
159 for(i=0; i<this_list -> numOfElements; i++){
160 int * curr_2i = this_list -> elementList[ i ];
161 int findi, endi;
162 for(endi = 0; endi < 2; endi++){
163 assert(curr_2i[endi]>0);
164 int search_tag = curr_2i[endi] + endi, found=0;
165
166 for(findi = 0; findi < edge_before_me_List->numOfElements; findi++){
167 if(ArrayListGet(edge_before_me_List,findi)-NULL == search_tag){
168 found = 1;
169 break;
170 }
171 }
172 if(0==found) ArrayListPush(edge_before_me_List, NULL+search_tag);
173 }
174 }
175 ArrayListSort(edge_before_me_List, NULL);
176 char * continue_after_an_edge = malloc(edge_before_me_List -> numOfElements -1);
177 memset(continue_after_an_edge,0, edge_before_me_List -> numOfElements -1);
178 int missed_gaps = edge_before_me_List->numOfElements -1;
179 for(i=0; i<edge_before_me_List->numOfElements -1; i++){
180 int an_edge_before_me = edge_before_me_List-> elementList[i]-NULL;
181 int j;
182 for(j=0; j<this_list->numOfElements; j++){
183 int * curr_2i = this_list -> elementList[ j ];
184 if( curr_2i[0] <= an_edge_before_me && curr_2i[1] >= an_edge_before_me ) {
185 continue_after_an_edge[i]=1;
186 missed_gaps --;
187 break;
188 }
189 }
190 }
191
192 long old_capacity = this_list -> capacityOfElements, old_items = this_list -> numOfElements;
193 if(edge_before_me_List->numOfElements -1 - missed_gaps > old_capacity){
194 this_list -> elementList = realloc( this_list -> elementList, sizeof(void*) * edge_before_me_List->numOfElements -1 -missed_gaps );
195 this_list -> capacityOfElements = edge_before_me_List->numOfElements -1 -missed_gaps ;
196 }
197
198 int written_i = 0;
199 for(i=0; i<edge_before_me_List->numOfElements -1; i++){
200 int * curr_2i = NULL;
201 //if(edge_before_me_List->elementList[i] - NULL == 108395906)SUBREADprintf("CV_I %d = %ld ~ %ld ; CTN=%d\n", i, edge_before_me_List->elementList[i] - NULL, edge_before_me_List -> elementList[i+1] -NULL-1, continue_after_an_edge[i]);
202 if(0==continue_after_an_edge[i]){
203 // SUBREADprintf("NC_I %d ~ [ 0 , %ld ]\n", i, edge_before_me_List->numOfElements -2);
204 assert(i>0 && i<edge_before_me_List->numOfElements -2);
205 assert(continue_after_an_edge[i+1] && continue_after_an_edge[i-1] ); // otherwise it is a singleton edge.
206 continue;
207 }
208
209 if(written_i < old_items){
210 curr_2i = this_list -> elementList[ written_i ];
211 }else{
212 curr_2i = malloc(sizeof(int)*2);
213 this_list -> elementList[ written_i ] = curr_2i;
214 }
215 curr_2i[0] = edge_before_me_List -> elementList[i] -NULL;
216 curr_2i[1] = edge_before_me_List -> elementList[i+1] -NULL-1;
217 written_i++;
218 }
219 assert(written_i == edge_before_me_List->numOfElements -1 - missed_gaps );
220 for(i=written_i; i<old_items; i++) free( this_list -> elementList[i]);
221 this_list -> numOfElements = written_i;
222 ArrayListDestroy(edge_before_me_List);
223 free(continue_after_an_edge);
224 }
225
flatAnno_do_anno_merge_one_array(void * key,void * hashed_obj,HashTable * tab)226 void flatAnno_do_anno_merge_one_array(void * key, void * hashed_obj, HashTable * tab){
227 ArrayList * this_list = hashed_obj;
228 ArrayListSort(this_list, flatAnno_do_anno_merge_one_array_compare);
229 int i, n1_items = 0;
230 for(i=1; i<this_list -> numOfElements; i++){
231 int * last_2i = this_list -> elementList[ n1_items ];
232 int * curr_2i = this_list -> elementList[ i ];
233 if(last_2i[1] >= curr_2i[1]) continue;
234 if(last_2i[1] >= curr_2i[0] -1){
235 last_2i[1]=curr_2i[1];
236 continue;
237 }
238 n1_items++;
239 if(n1_items< i){
240 last_2i = this_list -> elementList[ n1_items ];
241 last_2i[0] = curr_2i[0];
242 last_2i[1] = curr_2i[1];
243 }
244 }
245 for(i=n1_items+1; i<this_list -> numOfElements; i++)free(this_list -> elementList[i]);
246 this_list -> numOfElements = n1_items+1;
247 }
248
flatme_strcmp(void * L,void * R,ArrayList * me)249 int flatme_strcmp(void * L, void * R, ArrayList * me){
250 return strcmp(L, R);
251 }
252
flatAnno_do_anno_merge_and_write(flatAnno_context_t * context)253 int flatAnno_do_anno_merge_and_write(flatAnno_context_t * context){
254 context -> gene_chro_strand_to_exons_table -> appendix1 = context;
255
256 HashTableIteration(context -> gene_chro_strand_to_exons_table, context -> merging_mode == MERGING_MODE_CHOPPING? flatAnno_do_anno_chop_one_array :flatAnno_do_anno_merge_one_array);
257 ArrayList * all_chro_st_list = HashTableKeyArray(context -> gene_chro_strand_to_exons_table);
258 ArrayListSort(all_chro_st_list, flatme_strcmp);
259
260 fprintf( context -> output_FP , "GeneID\tChr\tStart\tEnd\tStrand\n");
261 int i,j;
262 for(i = 0; i< all_chro_st_list -> numOfElements; i++){
263 char * ge_chro_strand = ArrayListGet(all_chro_st_list,i);
264 char * local_ge_chro_strand = strdup(ge_chro_strand);
265 char * strand_ptr = local_ge_chro_strand;
266
267 for(j=0; j<2; strand_ptr++)
268 if(*strand_ptr=='\t')j++;
269 strand_ptr[-1] = 0;
270
271 ArrayList * exon_in_chro_strand = HashTableGet(context -> gene_chro_strand_to_exons_table, ge_chro_strand);
272
273 for(j=0; j< exon_in_chro_strand -> numOfElements; j++){
274 int * start_end_2i = ArrayListGet(exon_in_chro_strand,j);
275 fprintf( context -> output_FP ,"%s\t%d\t%d\t%s\n", local_ge_chro_strand, start_end_2i[0], start_end_2i[1], strand_ptr);
276 }
277 free(local_ge_chro_strand);
278 }
279 ArrayListDestroy(all_chro_st_list );
280 return 0;
281 }
282
flatAnno_do_anno(flatAnno_context_t * context)283 int flatAnno_do_anno(flatAnno_context_t * context){
284 int loaded_features = load_features_annotation(context -> GTF_file_name, FILE_TYPE_GTF, context -> GTF_gene_id_column, NULL, context -> GTF_wanted_feature_type, context, flatAnno_do_anno_1R);
285 if(loaded_features<0)SUBREADputs("ERROR: Unable to open the GTF file.");
286 if(loaded_features==0)SUBREADprintf("ERROR: No '%s' feature was found in the GTF file. (the '%s' attribute is required)\n", context -> GTF_wanted_feature_type, context -> GTF_gene_id_column);
287 if(loaded_features<=0) return -1;
288
289 return flatAnno_do_anno_merge_and_write(context);
290 }
291
292
flatAnno_start(flatAnno_context_t * context)293 int flatAnno_start(flatAnno_context_t * context){
294 SUBREADputs("");
295 if(!context -> GTF_file_name[0]){
296 flatAnno_print_usage();
297
298 if(context -> output_file_name[0]) SUBREADprintf("Error: no input file is specified.\n");
299 return -1;
300 }
301
302 if(!context -> output_file_name[0]){
303 flatAnno_print_usage();
304 SUBREADprintf("Error: no output file is specified.\n");
305 return -1;
306 }
307 SUBREADprintf("Flattening GTF file: %s\n", context -> GTF_file_name);
308 SUBREADprintf("Output SAF file: %s\n", context -> output_file_name);
309 context -> output_FP = fopen(context -> output_file_name, "w");
310 if(NULL == context -> output_FP ){
311 SUBREADprintf("Error: unable to open the output file.\n");
312 return -1;
313 }
314
315 SUBREADprintf("\nLooking for '%s' features... (grouped by '%s')\n\n", context -> GTF_wanted_feature_type, context -> GTF_gene_id_column);
316 context -> gene_to_chro_strand_table = StringTableCreate(30000);
317 HashTableSetDeallocationFunctions(context -> gene_to_chro_strand_table, free, (void(*)(void *))ArrayListDestroy);
318 context -> gene_chro_strand_to_exons_table = StringTableCreate(30000);
319 HashTableSetDeallocationFunctions(context -> gene_chro_strand_to_exons_table, free, (void(*)(void *))ArrayListDestroy);
320 return 0;
321 }
322
323 #ifdef MAKE_STANDALONE
main(int argc,char ** argv)324 int main(int argc, char ** argv)
325 #else
326 int R_flattenAnnotations(int argc, char ** argv)
327 #endif
328 {
329 flatAnno_context_t context;
330 memset(&context, 0, sizeof(context));
331 strcpy(context.GTF_gene_id_column, "gene_id");
332 strcpy(context.GTF_wanted_feature_type, "exon");
333
334 int option_index = 0,c;
335 optind=0;
336 opterr=1;
337 optopt=63;
338 while ((c = getopt_long (argc, argv, "Ct:g:a:o:v?", long_options, &option_index)) != -1)
339 switch(c) {
340 case 'C':
341 context.merging_mode = MERGING_MODE_CHOPPING;
342 break;
343 case 'o':
344 strcpy(context.output_file_name, optarg);
345 break;
346 case 'a':
347 strcpy(context.GTF_file_name, optarg);
348 break;
349 case 'g':
350 strcpy(context.GTF_gene_id_column, optarg);
351 break;
352 case 't':
353 strcpy(context.GTF_wanted_feature_type, optarg);
354 break;
355
356 case '?':
357 default :
358 flatAnno_print_usage();
359 return -1;
360 }
361
362 int ret = 0;
363 ret = ret || flatAnno_start(&context);
364 ret = ret || flatAnno_do_anno(&context);
365 ret = ret || flatAnno_finalise(&context);
366 return ret;
367 }
368