1 /****************************************************************\
2 * *
3 * fastanrdb : create non-redundant versions of fasta database *
4 * *
5 * Guy St.C. Slater.. mailto:guy@ebi.ac.uk *
6 * Copyright (C) 2000-2009. All Rights Reserved. *
7 * *
8 * This source code is distributed under the terms of the *
9 * GNU General Public License, version 3. See the file COPYING *
10 * or http://www.gnu.org/licenses/gpl.txt for details *
11 * *
12 * If you use this code, please keep this notice intact. *
13 * *
14 \****************************************************************/
15
16 #include <stdlib.h>
17 #include <string.h> /* For strcmp() */
18 #include <ctype.h> /* For toupper() */
19
20 #include "argument.h"
21 #include "fastadb.h"
22
23 /**/
24
25 typedef struct {
26 gint checksum;
27 gint length;
28 FastaDB_Key *key;
29 FastaDB_Seq *seq;
30 } NRDB_Data;
31
NRDB_Data_create(FastaDB_Seq * fdbs)32 static NRDB_Data *NRDB_Data_create(FastaDB_Seq *fdbs){
33 register NRDB_Data *nd = g_new(NRDB_Data, 1);
34 nd->length = fdbs->seq->len;
35 nd->checksum = Sequence_checksum(fdbs->seq);
36 nd->key = FastaDB_Seq_get_key(fdbs);
37 nd->seq = NULL;
38 return nd;
39 }
40
NRDB_Data_destroy(NRDB_Data * nd)41 static void NRDB_Data_destroy(NRDB_Data *nd){
42 if(nd->key)
43 FastaDB_Key_destroy(nd->key);
44 if(nd->seq)
45 FastaDB_Seq_destroy(nd->seq);
46 g_free(nd);
47 return;
48 }
49
NRDB_Data_sort_checksum_function(const void * a,const void * b)50 static int NRDB_Data_sort_checksum_function(const void *a,
51 const void *b){
52 register NRDB_Data **nd_a = (NRDB_Data**)a,
53 **nd_b = (NRDB_Data**)b;
54 return (*nd_a)->checksum-(*nd_b)->checksum;
55 }
56
57 /**/
58
59 typedef int (*NRDB_CompFunc)(const char *s1, const char *s2);
60
61 typedef struct {
62 GPtrArray *nrdb_data_list;
63 gboolean ignore_case;
64 gboolean check_revcomp;
65 NRDB_CompFunc comp;
66 } NRDB_Info;
67
fasta_nrdb_traverse_func(FastaDB_Seq * fdbs,gpointer user_data)68 static gboolean fasta_nrdb_traverse_func(FastaDB_Seq *fdbs,
69 gpointer user_data){
70 register NRDB_Info *ni = user_data;
71 register NRDB_Data *nd = NRDB_Data_create(fdbs);
72 register FastaDB_Seq *revcomp_fdbs;
73 register NRDB_Data *revcomp_nd;
74 register gchar *seq, *rseq;
75 g_ptr_array_add(ni->nrdb_data_list, nd);
76 if(ni->check_revcomp){
77 revcomp_fdbs = FastaDB_Seq_revcomp(fdbs);
78 /* Check against palindromes */
79 seq = Sequence_get_str(fdbs->seq);
80 rseq = Sequence_get_str(revcomp_fdbs->seq);
81 if(ni->comp(seq, rseq)){
82 revcomp_nd = NRDB_Data_create(revcomp_fdbs);
83 g_ptr_array_add(ni->nrdb_data_list, revcomp_nd);
84 }
85 FastaDB_Seq_destroy(revcomp_fdbs);
86 g_free(seq);
87 g_free(rseq);
88 }
89 return FALSE;
90 }
91
92 /**/
93
NRDB_Data_report_redundant_set(GPtrArray * redundant_set)94 static void NRDB_Data_report_redundant_set(GPtrArray *redundant_set){
95 register gint i;
96 register GPtrArray *forward_list = g_ptr_array_new(),
97 *reverse_list = g_ptr_array_new();
98 register FastaDB_Seq *fdbs, *first_forward = NULL;
99 register GString *merge_def;
100 register gchar *curr_def;
101 register FastaDB_Mask mask = FastaDB_Mask_ID
102 | FastaDB_Mask_DEF
103 | FastaDB_Mask_SEQ;
104 for(i = 0; i < redundant_set->len; i++){
105 fdbs = redundant_set->pdata[i];
106 if(fdbs->seq->strand == Sequence_Strand_REVCOMP){
107 g_ptr_array_add(reverse_list, fdbs);
108 } else {
109 if(first_forward){
110 g_ptr_array_add(forward_list, fdbs);
111 } else {
112 first_forward = fdbs;
113 }
114 }
115 }
116 if(first_forward && ((forward_list->len+1) >= reverse_list->len)){
117 curr_def = first_forward->seq->def;
118 merge_def = g_string_sized_new(64);
119 for(i = 0; i < forward_list->len; i++){
120 fdbs = forward_list->pdata[i];
121 g_string_append_c(merge_def, ' ');
122 g_string_append(merge_def, fdbs->seq->id);
123 }
124 for(i = 0; i < reverse_list->len; i++){
125 fdbs = reverse_list->pdata[i];
126 g_string_append_c(merge_def, ' ');
127 g_string_append(merge_def, fdbs->seq->id);
128 g_string_append(merge_def, ".revcomp");
129 }
130 first_forward->seq->def = merge_def->str;
131 FastaDB_Seq_print(first_forward, stdout, mask);
132 g_string_free(merge_def, TRUE);
133 first_forward->seq->def = curr_def;
134 }
135 g_ptr_array_free(forward_list, TRUE);
136 g_ptr_array_free(reverse_list, TRUE);
137 return;
138 }
139
NRDB_Data_merge_and_print(NRDB_Info * ni,FastaDB * fdb)140 static void NRDB_Data_merge_and_print(NRDB_Info *ni, FastaDB *fdb){
141 register gint i, j;
142 register NRDB_Data *nd_a, *nd_b;
143 register FastaDB_Seq *fdbs_a, *fdbs_b;
144 register GPtrArray *redundant_set = g_ptr_array_new();
145 register gchar *seq_a, *seq_b;
146 for(i = 0; i < ni->nrdb_data_list->len; i++){
147 nd_a = ni->nrdb_data_list->pdata[i];
148 if(!nd_a) /* Already used */
149 continue;
150 fdbs_a = FastaDB_Key_get_seq(nd_a->key, FastaDB_Mask_ALL);
151 g_ptr_array_add(redundant_set, fdbs_a);
152 for(j = i+1; j < ni->nrdb_data_list->len; j++){
153 nd_b = ni->nrdb_data_list->pdata[j];
154 if(!nd_b) /* Already used */
155 continue;
156 if(nd_a->checksum != nd_b->checksum)
157 break;
158 if(nd_a->length != nd_b->length)
159 continue;
160 if(!nd_b->seq){
161 /* Not available from previous checksum collision */
162 nd_b->seq = FastaDB_Key_get_seq(nd_b->key,
163 FastaDB_Mask_ALL);
164 }
165 fdbs_b = nd_b->seq;
166 seq_a = Sequence_get_str(fdbs_a->seq);
167 seq_b = Sequence_get_str(fdbs_b->seq);
168 if(!ni->comp(seq_a, seq_b)){
169 g_ptr_array_add(redundant_set, FastaDB_Seq_share(fdbs_b));
170 NRDB_Data_destroy(nd_b);
171 ni->nrdb_data_list->pdata[j] = NULL;
172 }
173 g_free(seq_a);
174 g_free(seq_b);
175 }
176 NRDB_Data_destroy(nd_a);
177 ni->nrdb_data_list->pdata[i] = NULL;
178 NRDB_Data_report_redundant_set(redundant_set);
179 for(j = 0; j < redundant_set->len; j++)
180 FastaDB_Seq_destroy(redundant_set->pdata[j]);
181 g_ptr_array_set_size(redundant_set, 0);
182 }
183 return;
184 }
185
186 /**/
187
Argument_main(Argument * arg)188 int Argument_main(Argument *arg){
189 register FastaDB *fdb;
190 register ArgumentSet *as
191 = ArgumentSet_create("Sequence Input Options");
192 register Alphabet *alphabet;
193 NRDB_Info ni;
194 gchar *query_path;
195 ArgumentSet_add_option(as, 'f', "fasta", "path",
196 "Fasta input file", NULL,
197 Argument_parse_string, &query_path);
198 ArgumentSet_add_option(as, 'i', "ignorecase", NULL,
199 "Ignore sequence case", "FALSE",
200 Argument_parse_boolean, &ni.ignore_case);
201 ArgumentSet_add_option(as, 'r', "revcomp", NULL,
202 "Check for revcomp duplicates", "FALSE",
203 Argument_parse_boolean, &ni.check_revcomp);
204 Argument_absorb_ArgumentSet(arg, as);
205 Argument_process(arg, "fastanrdb",
206 "A utility create non-redundant fasta sequence database\n"
207 "Guy St.C. Slater. guy@ebi.ac.uk. 2000-2003.\n", NULL);
208 ni.nrdb_data_list = g_ptr_array_new();
209 if(ni.ignore_case)
210 ni.comp = strcasecmp;
211 else
212 ni.comp = strcmp;
213 if(ni.check_revcomp)
214 alphabet = Alphabet_create(Alphabet_Type_DNA, FALSE);
215 else
216 alphabet = Alphabet_create(Alphabet_Type_UNKNOWN, FALSE);
217 fdb = FastaDB_open(query_path, alphabet);
218 FastaDB_traverse(fdb, FastaDB_Mask_ALL,
219 fasta_nrdb_traverse_func, &ni);
220 qsort(ni.nrdb_data_list->pdata, ni.nrdb_data_list->len,
221 sizeof(gpointer), NRDB_Data_sort_checksum_function);
222 NRDB_Data_merge_and_print(&ni, fdb);
223 /* nrdb_data_list will be already cleared */
224 g_ptr_array_free(ni.nrdb_data_list, TRUE);
225 FastaDB_close(fdb);
226 Alphabet_destroy(alphabet);
227 return 0;
228 }
229
230