1 /****************************************************************\
2 *                                                                *
3 *  fastanrdb : create non-redundant versions of fasta database   *
4 *                                                                *
5 *  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
6 *  Copyright (C) 2000-2009.  All Rights Reserved.                *
7 *                                                                *
8 *  This source code is distributed under the terms of the        *
9 *  GNU General Public License, version 3. See the file COPYING   *
10 *  or http://www.gnu.org/licenses/gpl.txt for details            *
11 *                                                                *
12 *  If you use this code, please keep this notice intact.         *
13 *                                                                *
14 \****************************************************************/
15 
16 #include <stdlib.h>
17 #include <string.h> /* For strcmp() */
18 #include <ctype.h> /* For toupper() */
19 
20 #include "argument.h"
21 #include "fastadb.h"
22 
23 /**/
24 
25 typedef struct {
26            gint  checksum;
27            gint  length;
28     FastaDB_Key *key;
29     FastaDB_Seq *seq;
30 } NRDB_Data;
31 
NRDB_Data_create(FastaDB_Seq * fdbs)32 static NRDB_Data *NRDB_Data_create(FastaDB_Seq *fdbs){
33     register NRDB_Data *nd = g_new(NRDB_Data, 1);
34     nd->length = fdbs->seq->len;
35     nd->checksum = Sequence_checksum(fdbs->seq);
36     nd->key = FastaDB_Seq_get_key(fdbs);
37     nd->seq = NULL;
38     return nd;
39     }
40 
NRDB_Data_destroy(NRDB_Data * nd)41 static void NRDB_Data_destroy(NRDB_Data *nd){
42     if(nd->key)
43         FastaDB_Key_destroy(nd->key);
44     if(nd->seq)
45         FastaDB_Seq_destroy(nd->seq);
46     g_free(nd);
47     return;
48     }
49 
NRDB_Data_sort_checksum_function(const void * a,const void * b)50 static int NRDB_Data_sort_checksum_function(const void *a,
51                                             const void *b){
52     register NRDB_Data **nd_a = (NRDB_Data**)a,
53                        **nd_b = (NRDB_Data**)b;
54     return (*nd_a)->checksum-(*nd_b)->checksum;
55     }
56 
57 /**/
58 
59 typedef int (*NRDB_CompFunc)(const char *s1, const char *s2);
60 
61 typedef struct {
62          GPtrArray *nrdb_data_list;
63           gboolean  ignore_case;
64           gboolean  check_revcomp;
65      NRDB_CompFunc  comp;
66 } NRDB_Info;
67 
fasta_nrdb_traverse_func(FastaDB_Seq * fdbs,gpointer user_data)68 static gboolean fasta_nrdb_traverse_func(FastaDB_Seq *fdbs,
69                                          gpointer user_data){
70     register NRDB_Info *ni = user_data;
71     register NRDB_Data *nd = NRDB_Data_create(fdbs);
72     register FastaDB_Seq *revcomp_fdbs;
73     register NRDB_Data *revcomp_nd;
74     register gchar *seq, *rseq;
75     g_ptr_array_add(ni->nrdb_data_list, nd);
76     if(ni->check_revcomp){
77         revcomp_fdbs = FastaDB_Seq_revcomp(fdbs);
78         /* Check against palindromes */
79         seq = Sequence_get_str(fdbs->seq);
80         rseq = Sequence_get_str(revcomp_fdbs->seq);
81         if(ni->comp(seq, rseq)){
82             revcomp_nd = NRDB_Data_create(revcomp_fdbs);
83             g_ptr_array_add(ni->nrdb_data_list, revcomp_nd);
84             }
85         FastaDB_Seq_destroy(revcomp_fdbs);
86         g_free(seq);
87         g_free(rseq);
88         }
89     return FALSE;
90     }
91 
92 /**/
93 
NRDB_Data_report_redundant_set(GPtrArray * redundant_set)94 static void NRDB_Data_report_redundant_set(GPtrArray *redundant_set){
95     register gint i;
96     register GPtrArray *forward_list = g_ptr_array_new(),
97                        *reverse_list = g_ptr_array_new();
98     register FastaDB_Seq *fdbs, *first_forward = NULL;
99     register GString *merge_def;
100     register gchar *curr_def;
101     register FastaDB_Mask mask = FastaDB_Mask_ID
102                                | FastaDB_Mask_DEF
103                                | FastaDB_Mask_SEQ;
104     for(i = 0; i < redundant_set->len; i++){
105         fdbs = redundant_set->pdata[i];
106         if(fdbs->seq->strand == Sequence_Strand_REVCOMP){
107             g_ptr_array_add(reverse_list, fdbs);
108         } else {
109             if(first_forward){
110                 g_ptr_array_add(forward_list, fdbs);
111             } else {
112                 first_forward = fdbs;
113                 }
114             }
115         }
116     if(first_forward && ((forward_list->len+1) >= reverse_list->len)){
117         curr_def = first_forward->seq->def;
118         merge_def = g_string_sized_new(64);
119         for(i = 0; i < forward_list->len; i++){
120             fdbs = forward_list->pdata[i];
121             g_string_append_c(merge_def, ' ');
122             g_string_append(merge_def, fdbs->seq->id);
123             }
124         for(i = 0; i < reverse_list->len; i++){
125             fdbs = reverse_list->pdata[i];
126             g_string_append_c(merge_def, ' ');
127             g_string_append(merge_def, fdbs->seq->id);
128             g_string_append(merge_def, ".revcomp");
129             }
130         first_forward->seq->def = merge_def->str;
131         FastaDB_Seq_print(first_forward, stdout, mask);
132         g_string_free(merge_def, TRUE);
133         first_forward->seq->def = curr_def;
134         }
135     g_ptr_array_free(forward_list, TRUE);
136     g_ptr_array_free(reverse_list, TRUE);
137     return;
138     }
139 
NRDB_Data_merge_and_print(NRDB_Info * ni,FastaDB * fdb)140 static void NRDB_Data_merge_and_print(NRDB_Info *ni, FastaDB *fdb){
141     register gint i, j;
142     register NRDB_Data *nd_a, *nd_b;
143     register FastaDB_Seq *fdbs_a, *fdbs_b;
144     register GPtrArray *redundant_set = g_ptr_array_new();
145     register gchar *seq_a, *seq_b;
146     for(i = 0; i < ni->nrdb_data_list->len; i++){
147         nd_a = ni->nrdb_data_list->pdata[i];
148         if(!nd_a) /* Already used */
149             continue;
150         fdbs_a = FastaDB_Key_get_seq(nd_a->key, FastaDB_Mask_ALL);
151         g_ptr_array_add(redundant_set, fdbs_a);
152         for(j = i+1; j < ni->nrdb_data_list->len; j++){
153             nd_b = ni->nrdb_data_list->pdata[j];
154             if(!nd_b) /* Already used */
155                 continue;
156             if(nd_a->checksum != nd_b->checksum)
157                 break;
158             if(nd_a->length != nd_b->length)
159                 continue;
160             if(!nd_b->seq){
161                 /* Not available from previous checksum collision */
162                 nd_b->seq = FastaDB_Key_get_seq(nd_b->key,
163                                                 FastaDB_Mask_ALL);
164                 }
165             fdbs_b = nd_b->seq;
166             seq_a = Sequence_get_str(fdbs_a->seq);
167             seq_b = Sequence_get_str(fdbs_b->seq);
168             if(!ni->comp(seq_a, seq_b)){
169                 g_ptr_array_add(redundant_set, FastaDB_Seq_share(fdbs_b));
170                 NRDB_Data_destroy(nd_b);
171                 ni->nrdb_data_list->pdata[j] = NULL;
172                 }
173             g_free(seq_a);
174             g_free(seq_b);
175             }
176         NRDB_Data_destroy(nd_a);
177         ni->nrdb_data_list->pdata[i] = NULL;
178         NRDB_Data_report_redundant_set(redundant_set);
179         for(j = 0; j < redundant_set->len; j++)
180             FastaDB_Seq_destroy(redundant_set->pdata[j]);
181         g_ptr_array_set_size(redundant_set, 0);
182         }
183     return;
184     }
185 
186 /**/
187 
Argument_main(Argument * arg)188 int Argument_main(Argument *arg){
189     register FastaDB *fdb;
190     register ArgumentSet *as
191            = ArgumentSet_create("Sequence Input Options");
192     register Alphabet *alphabet;
193     NRDB_Info ni;
194     gchar *query_path;
195     ArgumentSet_add_option(as, 'f', "fasta", "path",
196         "Fasta input file", NULL,
197         Argument_parse_string, &query_path);
198     ArgumentSet_add_option(as, 'i', "ignorecase", NULL,
199         "Ignore sequence case", "FALSE",
200         Argument_parse_boolean, &ni.ignore_case);
201     ArgumentSet_add_option(as, 'r', "revcomp", NULL,
202         "Check for revcomp duplicates", "FALSE",
203         Argument_parse_boolean, &ni.check_revcomp);
204     Argument_absorb_ArgumentSet(arg, as);
205     Argument_process(arg, "fastanrdb",
206         "A utility create non-redundant fasta sequence database\n"
207         "Guy St.C. Slater. guy@ebi.ac.uk. 2000-2003.\n", NULL);
208     ni.nrdb_data_list = g_ptr_array_new();
209     if(ni.ignore_case)
210         ni.comp = strcasecmp;
211     else
212         ni.comp = strcmp;
213     if(ni.check_revcomp)
214         alphabet = Alphabet_create(Alphabet_Type_DNA, FALSE);
215     else
216         alphabet = Alphabet_create(Alphabet_Type_UNKNOWN, FALSE);
217     fdb = FastaDB_open(query_path, alphabet);
218     FastaDB_traverse(fdb, FastaDB_Mask_ALL,
219         fasta_nrdb_traverse_func, &ni);
220     qsort(ni.nrdb_data_list->pdata, ni.nrdb_data_list->len,
221           sizeof(gpointer), NRDB_Data_sort_checksum_function);
222     NRDB_Data_merge_and_print(&ni, fdb);
223     /* nrdb_data_list will be already cleared */
224     g_ptr_array_free(ni.nrdb_data_list, TRUE);
225     FastaDB_close(fdb);
226     Alphabet_destroy(alphabet);
227     return 0;
228     }
229 
230