1 /****************************************************************\
2 *                                                                *
3 *  Library for manipulation of FASTA format databases            *
4 *                                                                *
5 *  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
6 *  Copyright (C) 2000-2009.  All Rights Reserved.                *
7 *                                                                *
8 *  This source code is distributed under the terms of the        *
9 *  GNU General Public License, version 3. See the file COPYING   *
10 *  or http://www.gnu.org/licenses/gpl.txt for details            *
11 *                                                                *
12 *  If you use this code, please keep this notice intact.         *
13 *                                                                *
14 \****************************************************************/
15 
16 #include <stdio.h> /* For BUFSIZ */
17 #include <errno.h>
18 #include <string.h> /* For strerror() */
19 #include <ctype.h>  /* For isspace(),isprint() */
20 
21 #include <sys/types.h> /* For stat() */
22 #include <sys/stat.h>  /* For stat() */
23 #include <unistd.h>    /* For stat() */
24 #include <dirent.h>    /* For readdir() */
25 
26 #include "fastadb.h"
27 
28 #ifndef ALPHABET_SIZE
29 #define ALPHABET_SIZE (sizeof(gchar)<<CHAR_BIT)
30 #endif /* ALPHABET_SIZE */
31 
32 #define FASTADB_OUT_BUFFER_CHUNK_SIZE (BUFSIZ * 64)
33 
FastaDB_ArgumentSet_create(Argument * arg)34 FastaDB_ArgumentSet *FastaDB_ArgumentSet_create(Argument *arg){
35     register ArgumentSet *as;
36     static FastaDB_ArgumentSet fas = {NULL};
37     if(arg){
38         as = ArgumentSet_create("Fasta Database Options");
39         ArgumentSet_add_option(as, '\0', "fastasuffix", "suffix",
40            "Fasta file suffix filter (in subdirectories)", ".fa",
41            Argument_parse_string, &fas.suffix_filter);
42         Argument_absorb_ArgumentSet(arg, as);
43         }
44     return &fas;
45     }
46 
FastaDB_file_is_directory(gchar * path)47 static gboolean FastaDB_file_is_directory(gchar *path){
48     struct stat buf;
49     if(stat(path, &buf))
50         g_error("Cannot read file [%s]", path);
51     return S_ISDIR(buf.st_mode);
52     }
53 
FastaDB_get_dir_contents(gchar * path)54 static GPtrArray *FastaDB_get_dir_contents(gchar *path){
55     register GPtrArray *list = g_ptr_array_new();
56     register DIR *dir = opendir(path);
57     register struct dirent *entry;
58     if(!dir)
59         g_error("Could not open directory [%s]", path);
60     readdir(dir); /* Skip '.' */
61     readdir(dir); /* Skip '..' */
62     while((entry = readdir(dir)))
63         g_ptr_array_add(list, g_strdup(entry->d_name));
64     closedir(dir);
65     return list;
66     }
67 
FastaDB_expand_path_list_recur(GPtrArray * path_list,gchar * path,gchar * suffix_filter,gboolean is_subdir)68 static void FastaDB_expand_path_list_recur(GPtrArray *path_list,
69                gchar *path, gchar *suffix_filter, gboolean is_subdir){
70     register GPtrArray *dir_content;
71     register gchar *name, *full_path;
72     register gint i;
73     if(FastaDB_file_is_directory(path)){
74         dir_content = FastaDB_get_dir_contents(path);
75         for(i = 0; i < dir_content->len; i++){
76             name = dir_content->pdata[i];
77             full_path = g_strdup_printf("%s%c%s",
78                                         path, G_DIR_SEPARATOR, name);
79             FastaDB_expand_path_list_recur(path_list, full_path,
80                                            suffix_filter, TRUE);
81             g_free(full_path);
82             g_free(name);
83             }
84         g_ptr_array_free(dir_content, TRUE);
85     } else {
86         if(is_subdir && suffix_filter){
87             for(i = strlen(path)-1; i >= 0; i--)
88                 if(path[i] == '.')
89                     break;
90             if((!i) || strcmp(path+i+1, suffix_filter))
91                 return;
92             }
93         g_ptr_array_add(path_list, g_strdup(path));
94         }
95     return;
96     }
97 
FastaDB_expand_path_list(GPtrArray * path_list,gchar * suffix_filter)98 static GPtrArray *FastaDB_expand_path_list(GPtrArray *path_list,
99                                            gchar *suffix_filter){
100     register GPtrArray *expanded_path_list = g_ptr_array_new();
101     register gchar *path;
102     register gint i;
103     if(suffix_filter && (suffix_filter[0] == '.'))
104         suffix_filter++; /* Skip any leading dot on suffix_filter */
105     for(i = 0; i < path_list->len; i++){
106         path = path_list->pdata[i];
107         FastaDB_expand_path_list_recur(expanded_path_list,
108                                        path, suffix_filter, FALSE);
109         }
110     if(!expanded_path_list->len){
111         for(i = 0; i < path_list->len; i++)
112             g_message("Empty path: [%s]", (gchar*)path_list->pdata[i]);
113         g_error("No valid file paths found");
114         }
115     return expanded_path_list;
116     }
117 
FastaDB_open_list(GPtrArray * path_list,Alphabet * alphabet)118 FastaDB *FastaDB_open_list(GPtrArray *path_list, Alphabet *alphabet){
119     register FastaDB *fdb = g_new(FastaDB, 1);
120     register FastaDB_ArgumentSet *fas
121            = FastaDB_ArgumentSet_create(NULL);
122     register gint i;
123     register gchar *path;
124     register GPtrArray *full_path_list;
125     g_assert(path_list);
126     fdb->ref_count = 1;
127     if(alphabet)
128         fdb->alphabet = Alphabet_share(alphabet);
129     else
130         fdb->alphabet = Alphabet_create(Alphabet_Type_UNKNOWN, FALSE);
131     full_path_list = FastaDB_expand_path_list(path_list, fas->suffix_filter);
132     fdb->cf = CompoundFile_create(full_path_list, TRUE);
133     for(i = 0; i < full_path_list->len; i++){
134         path = full_path_list->pdata[i];
135         g_free(path);
136         }
137     g_ptr_array_free(full_path_list, TRUE);
138     FastaDB_rewind(fdb);
139     fdb->out_buffer_alloc = FASTADB_OUT_BUFFER_CHUNK_SIZE;
140     fdb->out_buffer = g_malloc(sizeof(gchar)*fdb->out_buffer_alloc);
141     fdb->line_length = -1;
142     return fdb;
143     }
144 
FastaDB_open_list_with_limit(GPtrArray * path_list,Alphabet * alphabet,gint chunk_id,gint chunk_total)145 FastaDB *FastaDB_open_list_with_limit(GPtrArray *path_list,
146              Alphabet *alphabet, gint chunk_id, gint chunk_total){
147     register FastaDB *fdb = FastaDB_open_list(path_list, alphabet);
148     register CompoundFile_Pos start, stop, total_length, chunk_size;
149     register CompoundFile_Location *start_cfl, *stop_cfl;
150     if(chunk_total){
151         g_assert(chunk_id);
152         if(chunk_total < 1)
153             g_error("Chunk total [%d] is too small", chunk_total);
154         if((chunk_id < 1) || (chunk_id > chunk_total))
155             g_error("Chunk id should be between 1 and %d", chunk_total);
156         total_length = CompoundFile_get_length(fdb->cf);
157         chunk_size = total_length / chunk_total;
158         start = (chunk_id-1) * chunk_size;
159         start = FastaDB_find_next_start(fdb, start);
160         if(chunk_id == chunk_total){
161             stop = total_length - 1;
162         } else {
163             stop = chunk_id * chunk_size;
164             stop = FastaDB_find_next_start(fdb, stop);
165             }
166         start_cfl = CompoundFile_Location_from_pos(fdb->cf, start);
167         stop_cfl = CompoundFile_Location_from_pos(fdb->cf, stop);
168         CompoundFile_set_limits(fdb->cf, start_cfl, stop_cfl);
169         CompoundFile_Location_destroy(start_cfl);
170         CompoundFile_Location_destroy(stop_cfl);
171         FastaDB_rewind(fdb);
172         }
173     return fdb;
174     }
175 
FastaDB_open(gchar * path,Alphabet * alphabet)176 FastaDB *FastaDB_open(gchar *path, Alphabet *alphabet){
177     register FastaDB *fdb;
178     register GPtrArray *path_list = g_ptr_array_new();
179     g_ptr_array_add(path_list, path);
180     fdb = FastaDB_open_list(path_list, alphabet);
181     g_ptr_array_free(path_list, TRUE);
182     return fdb;
183     }
184 
185 #define FastaDB_putc(fdb, ch) \
186     if(fdb->out_buffer_pos == fdb->out_buffer_alloc) \
187         FastaDB_putc_realloc(fdb); \
188     fdb->out_buffer[fdb->out_buffer_pos++] = (ch)
189 
FastaDB_putc_realloc(FastaDB * fdb)190 static void FastaDB_putc_realloc(FastaDB *fdb){
191     fdb->out_buffer_alloc += FASTADB_OUT_BUFFER_CHUNK_SIZE;
192     fdb->out_buffer = g_realloc(fdb->out_buffer,
193                                 fdb->out_buffer_alloc);
194     return;
195     }
196 
197 /**/
198 
FastaDB_share(FastaDB * fdb)199 FastaDB *FastaDB_share(FastaDB *fdb){
200     g_assert(fdb);
201     fdb->ref_count++;
202     return fdb;
203     }
204 
FastaDB_dup(FastaDB * fdb)205 FastaDB *FastaDB_dup(FastaDB *fdb){
206     register FastaDB *nfdb = g_new(FastaDB, 1);
207     nfdb->ref_count = 1;
208     nfdb->alphabet = Alphabet_create(fdb->alphabet->type,
209                                      fdb->alphabet->is_soft_masked);
210     nfdb->cf = CompoundFile_dup(fdb->cf);
211     nfdb->out_buffer_alloc = FASTADB_OUT_BUFFER_CHUNK_SIZE;
212     nfdb->out_buffer = g_malloc(sizeof(gchar)*nfdb->out_buffer_alloc);
213     nfdb->line_length = -1;
214     FastaDB_rewind(nfdb);
215     return nfdb;
216     }
217 
FastaDB_close(FastaDB * fdb)218 void FastaDB_close(FastaDB *fdb){
219     g_assert(fdb);
220     if(--fdb->ref_count)
221         return;
222     CompoundFile_destroy(fdb->cf);
223     Alphabet_destroy(fdb->alphabet);
224     g_free(fdb->out_buffer);
225     g_free(fdb);
226     return;
227     }
228 
FastaDB_rewind(FastaDB * fdb)229 void FastaDB_rewind(FastaDB *fdb){
230     register gint ch, prev = '\n';
231     g_assert(fdb);
232     CompoundFile_rewind(fdb->cf);
233     while((ch = CompoundFile_getc(fdb->cf)) != EOF){
234         if((ch == '>') && (prev == '\n'))
235             break;
236         prev = ch;
237         }
238     return;
239     }
240 
FastaDB_find_next_start(FastaDB * fdb,CompoundFile_Pos pos)241 CompoundFile_Pos FastaDB_find_next_start(FastaDB *fdb,
242                                          CompoundFile_Pos pos){
243     register gint ch, prev = '\n';
244     g_assert(fdb->cf->element_list->len == 1);
245     CompoundFile_seek(fdb->cf, pos);
246     while((ch = CompoundFile_getc(fdb->cf)) != EOF){
247         if((ch == '>') && (prev == '\n'))
248             break;
249         prev = ch;
250         }
251     return CompoundFile_ftell(fdb->cf)-1;
252     }
253 
FastaDB_file_is_fasta(gchar * path)254 gboolean FastaDB_file_is_fasta(gchar *path){
255     register FILE *fp = fopen(path, "r");
256     register gint ch;
257     register gboolean result = FALSE;
258     if(!fp)
259         g_error("Could not open file [%s]", path);
260     while(((ch = getc(fp)) != EOF)){
261         if(isspace(ch))
262             continue;
263         else
264             if(ch == '>'){ /* Assume is fasta format */
265                 result = TRUE;
266                 break;
267                 }
268         else
269             break;
270         }
271     fclose(fp);
272     return result;
273     }
274 /* Assumes a file is fasta format
275  * if the first non-whitespace character is '>'
276  */
277 
FastaDB_is_finished(FastaDB * fdb)278 gboolean FastaDB_is_finished(FastaDB *fdb){
279     g_assert(fdb);
280     return CompoundFile_is_finished(fdb->cf);
281     }
282 
FastaDB_traverse(FastaDB * fdb,FastaDB_Mask mask,FastaDB_TraverseFunc fdtf,gpointer user_data)283 void FastaDB_traverse(FastaDB *fdb, FastaDB_Mask mask,
284                       FastaDB_TraverseFunc fdtf, gpointer user_data){
285     register FastaDB_Seq *fdbs;
286     g_assert(fdb);
287     g_assert(fdtf);
288     while((fdbs = FastaDB_next(fdb, mask))){
289         if(fdtf(fdbs, user_data)){
290             FastaDB_Seq_destroy(fdbs);
291             break;
292             }
293         FastaDB_Seq_destroy(fdbs);
294         }
295     return;
296     }
297 
FastaDB_memory_usage(FastaDB * fdb)298 gsize FastaDB_memory_usage(FastaDB *fdb){
299     return sizeof(FastaDB)
300          + sizeof(Alphabet)
301          + sizeof(CompoundFile)
302          + (sizeof(gchar)*fdb->out_buffer_alloc);
303     }
304 
305 /**/
306 
FastaDB_Seq_create(FastaDB * fdb,Sequence * seq,CompoundFile_Location * cfl)307 static FastaDB_Seq *FastaDB_Seq_create(FastaDB *fdb, Sequence *seq,
308                                        CompoundFile_Location *cfl){
309     register FastaDB_Seq *fdbs = g_new0(FastaDB_Seq, 1);
310     fdbs->ref_count = 1;
311     fdbs->source = FastaDB_share(fdb);
312     fdbs->location = CompoundFile_Location_share(cfl);
313     fdbs->seq = Sequence_share(seq);
314     return fdbs;
315     }
316 
FastaDB_Seq_share(FastaDB_Seq * fdbs)317 FastaDB_Seq *FastaDB_Seq_share(FastaDB_Seq *fdbs){
318     g_assert(fdbs);
319     fdbs->ref_count++;
320     return fdbs;
321     }
322 
FastaDB_Seq_destroy(FastaDB_Seq * fdbs)323 void FastaDB_Seq_destroy(FastaDB_Seq *fdbs){
324     g_assert(fdbs);
325     if(--fdbs->ref_count)
326         return;
327     CompoundFile_Location_destroy(fdbs->location);
328     FastaDB_close(fdbs->source);
329     Sequence_destroy(fdbs->seq);
330     g_free(fdbs);
331     return;
332     }
333 
FastaDB_Seq_revcomp(FastaDB_Seq * fdbs)334 FastaDB_Seq *FastaDB_Seq_revcomp(FastaDB_Seq *fdbs){
335     register FastaDB_Seq *revcomp_fdbs;
336     g_assert(fdbs);
337     revcomp_fdbs = FastaDB_Seq_create(fdbs->source,
338                                       fdbs->seq, fdbs->location);
339     Sequence_destroy(revcomp_fdbs->seq);
340     revcomp_fdbs->seq = Sequence_revcomp(fdbs->seq);
341     return revcomp_fdbs;
342     }
343 
344 /**/
345 
FastaDB_next(FastaDB * fdb,FastaDB_Mask mask)346 FastaDB_Seq *FastaDB_next(FastaDB *fdb, FastaDB_Mask mask){
347     register FastaDB_Seq *fdbs;
348     register gint ch;
349     register gint id_pos = -1, def_pos = -1, seq_pos = -1,
350                   seq_len = -1;
351     register gint prev_len = -1, curr_length, line_start_seq_pos;
352     register Sequence *seq;
353     register CompoundFile_Location *location
354            = CompoundFile_Location_tell(fdb->cf);
355     fdb->out_buffer_pos = 0; /* Clear output buffer */
356     if(mask & FastaDB_Mask_ID){ /* Record ID */
357         id_pos = 0;
358         while((ch = CompoundFile_getc(fdb->cf)) != EOF){
359             /* if((ch == '\n') || (ch == ' ') || (ch == '\t')) */
360             if(isspace(ch))
361                 break;
362             FastaDB_putc(fdb, ch);
363             }
364         FastaDB_putc(fdb, '\0');
365     } else { /* Skip ID */
366         while((ch = CompoundFile_getc(fdb->cf)) != EOF){
367             /* if((ch == '\n') || (ch == ' ') || (ch == '\t')) */
368             if(isspace(ch))
369                 break;
370             }
371         }
372     if(ch == EOF){
373         CompoundFile_Location_destroy(location);
374         return NULL;
375         }
376     if(ch != '\n'){ /* If DEF is present */
377         if(mask & FastaDB_Mask_DEF){ /* Record DEF */
378             def_pos = fdb->out_buffer_pos;
379             while((ch = CompoundFile_getc(fdb->cf)) != EOF){
380                 if(ch == '\n')
381                     break;
382                 FastaDB_putc(fdb, ch);
383                 }
384             FastaDB_putc(fdb, '\0');
385         } else { /* Skip DEF */
386             while((ch = CompoundFile_getc(fdb->cf)) != EOF){
387                 if(ch == '\n')
388                     break;
389                 }
390             }
391         }
392     if(ch == EOF){
393         CompoundFile_Location_destroy(location);
394         return NULL;
395         }
396     if(mask & FastaDB_Mask_SEQ){ /* Record SEQ and LEN */
397         seq_pos = fdb->out_buffer_pos;
398         line_start_seq_pos = seq_pos;
399         while(((ch = CompoundFile_getc(fdb->cf)) != EOF)
400             && (ch != '>')){
401             if(Alphabet_symbol_is_valid(fdb->alphabet, ch)){
402                 FastaDB_putc(fdb, ch);
403             } else if(isspace(ch)){
404                 if(ch == '\n'){
405                     curr_length = fdb->out_buffer_pos
406                                 - line_start_seq_pos;
407                     line_start_seq_pos = fdb->out_buffer_pos;
408                     if(prev_len != -1){
409                         if(fdb->line_length == -1){
410                             fdb->line_length = prev_len;
411                         } else {
412                             if(prev_len != fdb->line_length)
413                                 fdb->line_length = 0;
414                             }
415                         }
416                     prev_len = curr_length;
417                     /**/
418                     }
419             } else {
420                 g_error("Unrecognised symbol \'%c\' (ascii:%d)"
421                         " file:[%s] seq:[%s] pos:[%d]",
422                      isprint(ch)?ch:' ', ch,
423                      CompoundFile_current_path(fdb->cf),
424                      (id_pos == -1)?"unknown":&fdb->out_buffer[id_pos],
425                      (fdb->out_buffer_pos - seq_pos - 1));
426                 }
427             }
428         FastaDB_putc(fdb, '\0');
429         seq_len = fdb->out_buffer_pos - seq_pos - 1;
430         if(fdb->line_length >= 0)
431             if(fdb->line_length < prev_len)
432                 fdb->line_length = 0;
433     } else if(mask & FastaDB_Mask_LEN){ /* Skip SEQ and record LEN */
434         seq_len = 0;
435         while(((ch = CompoundFile_getc(fdb->cf)) != EOF)
436             && (ch != '>')){
437             if( ((ch >= 'A') && (ch <= 'Z'))
438              || ((ch >= 'a') && (ch <= 'z'))){
439                 seq_len++;
440                 }
441            }
442     } else { /* Just skip SEQ */
443         while((ch = CompoundFile_getc(fdb->cf)) != EOF){
444             if(ch == '>')
445                 break;
446             }
447         }
448     seq = Sequence_create(
449             ( id_pos == -1)?NULL:&fdb->out_buffer[id_pos],
450             (def_pos == -1)?NULL:&fdb->out_buffer[def_pos],
451             (seq_pos == -1)?NULL:&fdb->out_buffer[seq_pos],
452             (seq_len == -1)?0:seq_len,
453             (fdb->alphabet->type == Alphabet_Type_DNA)
454                 ? Sequence_Strand_FORWARD
455                 : Sequence_Strand_UNKNOWN,
456             fdb->alphabet);
457     fdbs = FastaDB_Seq_create(fdb, seq, location);
458     CompoundFile_Location_destroy(location);
459     Sequence_destroy(seq);
460     if((mask & FastaDB_Mask_LEN) &&(!seq->len))
461         g_warning("Warning zero length sequence [%s]", seq->id);
462     return fdbs;
463     }
464 
465 /**/
466 
FastaDB_Key_create(FastaDB * source,CompoundFile_Location * location,Sequence_Strand strand,gint seq_offset,gint length)467 FastaDB_Key *FastaDB_Key_create(FastaDB *source,
468                                 CompoundFile_Location *location,
469                                 Sequence_Strand strand,
470                                 gint seq_offset, gint length){
471     register FastaDB_Key *fdbk = g_new(FastaDB_Key, 1);
472     g_assert(source);
473     fdbk->source = FastaDB_share(source);
474     fdbk->location = CompoundFile_Location_share(location);
475     fdbk->strand = strand;
476     fdbk->seq_offset = seq_offset;
477     fdbk->length = length;
478     return fdbk;
479     }
480 
FastaDB_Key_destroy(FastaDB_Key * fdbk)481 void FastaDB_Key_destroy(FastaDB_Key *fdbk){
482     g_assert(fdbk);
483     FastaDB_close(fdbk->source);
484     CompoundFile_Location_destroy(fdbk->location);
485     g_free(fdbk);
486     return;
487     }
488 
FastaDB_Seq_get_key(FastaDB_Seq * fdbs)489 FastaDB_Key *FastaDB_Seq_get_key(FastaDB_Seq *fdbs){
490     register FastaDB_Key *fdbk;
491     register gint seq_offset = strlen(fdbs->seq->id) + 2
492                              + (fdbs->seq->def ? (strlen(fdbs->seq->def)+1) : 0);
493     g_assert(fdbs);
494     fdbk = FastaDB_Key_create(fdbs->source, fdbs->location,
495                               fdbs->seq->strand, seq_offset,
496                               fdbs->seq->len);
497     return fdbk;
498     }
499 
FastaDB_fetch(FastaDB * fdb,FastaDB_Mask mask,CompoundFile_Pos pos)500 FastaDB_Seq *FastaDB_fetch(FastaDB *fdb, FastaDB_Mask mask,
501                            CompoundFile_Pos pos){
502     register FastaDB_Seq *fdbs = NULL;
503     register CompoundFile_Pos orig_pos = CompoundFile_ftell(fdb->cf);
504     g_assert(fdb);
505     g_assert(fdb->cf->element_list->len == 1);
506     CompoundFile_seek(fdb->cf, pos);
507     fdbs = FastaDB_next(fdb, mask);
508     CompoundFile_seek(fdb->cf, orig_pos);
509     return fdbs;
510     }
511 
FastaDB_Key_get_seq(FastaDB_Key * fdbk,FastaDB_Mask mask)512 FastaDB_Seq *FastaDB_Key_get_seq(FastaDB_Key *fdbk, FastaDB_Mask mask){
513     register FastaDB_Seq *fdbs;
514     register CompoundFile_Location *orig;
515     register Sequence *revcomp_sequence;
516     g_assert(fdbk);
517     orig = CompoundFile_Location_tell(fdbk->source->cf);
518     CompoundFile_Location_seek(fdbk->location);
519     fdbs = FastaDB_next(fdbk->source, mask);
520     g_assert(fdbs);
521     CompoundFile_Location_seek(orig);
522     CompoundFile_Location_destroy(orig);
523     if(fdbk->strand == Sequence_Strand_REVCOMP){
524         g_assert(fdbs->seq->strand == Sequence_Strand_FORWARD);
525         fdbs->seq->strand = Sequence_Strand_REVCOMP;
526         revcomp_sequence = Sequence_revcomp(fdbs->seq);
527         Sequence_destroy(fdbs->seq);
528         fdbs->seq = revcomp_sequence;
529         }
530     return fdbs;
531     }
532 
FastaDB_Key_get_def(FastaDB_Key * fdbk)533 gchar *FastaDB_Key_get_def(FastaDB_Key *fdbk){
534     register gint ch;
535     register gchar *def = NULL;
536     register GString *s;
537 #ifdef USE_PTHREADS
538     pthread_mutex_lock(&fdbk->source->cf->compoundfile_mutex);
539 #endif /* USE_PTHREADS */
540     CompoundFile_Location_seek(fdbk->location);
541     while((ch = CompoundFile_getc(fdbk->source->cf)) != EOF)
542         if((ch == '\n') || (ch == ' ') || (ch == '\t'))
543             break;
544     if(ch == '\n'){
545 #ifdef USE_PTHREADS
546         pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
547 #endif /* USE_PTHREADS */
548         return NULL;
549         }
550     s = g_string_sized_new(64);
551     while((ch = CompoundFile_getc(fdbk->source->cf)) != EOF){
552         if(ch == '\n')
553             break;
554         s = g_string_append_c(s, ch);
555         }
556 #ifdef USE_PTHREADS
557     pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
558 #endif /* USE_PTHREADS */
559     def = s->str;
560     g_string_free(s, FALSE);
561     return def;
562     }
563 
564 /**/
565 
FastaDB_SparseCache_get_func_8_BIT(gint pos,gpointer page_data,gpointer user_data)566 static gpointer FastaDB_SparseCache_get_func_8_BIT(gint pos, gpointer page_data,
567                                              gpointer user_data){
568     return GINT_TO_POINTER((gint)((gchar*)page_data)[pos]);
569     }
570 
FastaDB_SparseCache_copy_func_8_BIT(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)571 static gint FastaDB_SparseCache_copy_func_8_BIT(gint start, gint length,
572                                                 gchar *dst, gpointer page_data,
573                                                 gpointer user_data){
574     register gint i;
575     register gchar *str = page_data;
576     for(i = 0; i < length; i++)
577         dst[i] = str[start+i];
578     return length;
579     }
580 
FastaDB_SparseCache_get_func_0_BIT_LC(gint pos,gpointer page_data,gpointer user_data)581 static gpointer FastaDB_SparseCache_get_func_0_BIT_LC(gint pos,
582                                                       gpointer page_data,
583                                                       gpointer user_data){
584     return GINT_TO_POINTER((gint)'n');
585     }
586 
FastaDB_SparseCache_copy_func_0_BIT_LC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)587 static gint FastaDB_SparseCache_copy_func_0_BIT_LC(gint start, gint length,
588                                                    gchar *dst, gpointer page_data,
589                                                    gpointer user_data){
590     register gint i;
591     for(i = 0; i < length; i++)
592         dst[i] = 'n';
593     return length;
594     }
595 
FastaDB_SparseCache_get_func_0_BIT_UC(gint pos,gpointer page_data,gpointer user_data)596 static gpointer FastaDB_SparseCache_get_func_0_BIT_UC(gint pos,
597                                                       gpointer page_data,
598                                                       gpointer user_data){
599     return GINT_TO_POINTER((gint)'N');
600     }
601 
FastaDB_SparseCache_copy_func_0_BIT_UC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)602 static gint FastaDB_SparseCache_copy_func_0_BIT_UC(gint start, gint length,
603                                                    gchar *dst, gpointer page_data,
604                                                    gpointer user_data){
605     register gint i;
606     for(i = 0; i < length; i++)
607         dst[i] = 'N';
608     return length;
609     }
610 
611 typedef enum {
612     FastaDB_SparseCache_Mask_0_BIT_LC, /* [n] */
613     FastaDB_SparseCache_Mask_0_BIT_UC, /* [N] */
614     FastaDB_SparseCache_Mask_2_BIT_LC, /* [acgt] */
615     FastaDB_SparseCache_Mask_2_BIT_UC, /* [ACGT] */
616     FastaDB_SparseCache_Mask_4_BIT,    /* [ACGTNacgtn] */
617     FastaDB_SparseCache_Mask_8_BIT
618 } FastaDB_SparseCache_Mask;
619 
FastaDB_SparseCache_fill_mask(FastaDB_SparseCache_Mask * mask_index)620 static void FastaDB_SparseCache_fill_mask(FastaDB_SparseCache_Mask *mask_index){
621     register gint i;
622     for(i = 0; i < ALPHABET_SIZE; i++)
623         mask_index[i] = (1 << FastaDB_SparseCache_Mask_8_BIT);
624     mask_index['N'] = (1 << FastaDB_SparseCache_Mask_0_BIT_UC);
625     mask_index['n'] = (1 << FastaDB_SparseCache_Mask_0_BIT_LC);
626     mask_index['A'] =
627     mask_index['C'] =
628     mask_index['G'] =
629     mask_index['T'] = (1 << FastaDB_SparseCache_Mask_2_BIT_UC);
630     mask_index['a'] =
631     mask_index['c'] =
632     mask_index['g'] =
633     mask_index['t'] = (1 << FastaDB_SparseCache_Mask_2_BIT_LC);
634     return;
635     }
636 
637 /**/
638 
FastaDB_SparseCache_fill_index(gint * index,gchar * alphabet)639 static void FastaDB_SparseCache_fill_index(gint *index, gchar *alphabet){
640     register gint i;
641     for(i = 0; i < ALPHABET_SIZE; i++)
642         index[i] = -1;
643     for(i = 0; alphabet[i]; i++)
644         index[(guchar)alphabet[i]] = i;
645     return;
646     }
647 
FastaDB_SparseCache_get_func_4_BIT(gint pos,gpointer page_data,gpointer user_data)648 static gpointer FastaDB_SparseCache_get_func_4_BIT(gint pos,
649                                                    gpointer page_data,
650                                                    gpointer user_data){
651     register gint ch = ((gchar*)page_data)[(pos >> 1)],
652                   num = (ch >> ((pos & 1) << 2)) & 15;
653     static gchar *alphabet = "ACGTNacgtn------";
654     return GINT_TO_POINTER((gint)alphabet[num]);
655     }
656 
FastaDB_SparseCache_copy_func_4_BIT(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)657 static gint FastaDB_SparseCache_copy_func_4_BIT(gint start, gint length,
658                                                 gchar *dst, gpointer page_data,
659                                                 gpointer user_data){
660 #ifndef G_DISABLE_ASSERT
661     register gint i;
662 #endif /* G_DISABLE_ASSERT */
663     register gint pos = 0, end = length;
664     register gchar *str = page_data, *word;
665     static gchar *table[256] = {
666         "AA", "CA", "GA", "TA", "NA", "aA", "cA", "gA",
667         "tA", "nA", "-A", "-A", "-A", "-A", "-A", "-A",
668         "AC", "CC", "GC", "TC", "NC", "aC", "cC", "gC",
669         "tC", "nC", "-C", "-C", "-C", "-C", "-C", "-C",
670         "AG", "CG", "GG", "TG", "NG", "aG", "cG", "gG",
671         "tG", "nG", "-G", "-G", "-G", "-G", "-G", "-G",
672         "AT", "CT", "GT", "TT", "NT", "aT", "cT", "gT",
673         "tT", "nT", "-T", "-T", "-T", "-T", "-T", "-T",
674         "AN", "CN", "GN", "TN", "NN", "aN", "cN", "gN",
675         "tN", "nN", "-N", "-N", "-N", "-N", "-N", "-N",
676         "Aa", "Ca", "Ga", "Ta", "Na", "aa", "ca", "ga",
677         "ta", "na", "-a", "-a", "-a", "-a", "-a", "-a",
678         "Ac", "Cc", "Gc", "Tc", "Nc", "ac", "cc", "gc",
679         "tc", "nc", "-c", "-c", "-c", "-c", "-c", "-c",
680         "Ag", "Cg", "Gg", "Tg", "Ng", "ag", "cg", "gg",
681         "tg", "ng", "-g", "-g", "-g", "-g", "-g", "-g",
682         "At", "Ct", "Gt", "Tt", "Nt", "at", "ct", "gt",
683         "tt", "nt", "-t", "-t", "-t", "-t", "-t", "-t",
684         "An", "Cn", "Gn", "Tn", "Nn", "an", "cn", "gn",
685         "tn", "nn", "-n", "-n", "-n", "-n", "-n", "-n",
686         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
687         "t-", "n-", "--", "--", "--", "--", "--", "--",
688         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
689         "t-", "n-", "--", "--", "--", "--", "--", "--",
690         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
691         "t-", "n-", "--", "--", "--", "--", "--", "--",
692         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
693         "t-", "n-", "--", "--", "--", "--", "--", "--",
694         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
695         "t-", "n-", "--", "--", "--", "--", "--", "--",
696         "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
697         "t-", "n-", "--", "--", "--", "--", "--", "--"
698         };
699     g_assert(length);
700     if(start&1){ /* If odd start, get first base */
701         dst[0] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(start,
702                                                      page_data, user_data));
703         pos = 1;
704         }
705     if((length > 2) && ((start+length)&1)){ /* If odd end, get last base */
706         dst[length-1] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(
707                                        start+length-1, page_data, user_data));
708         end--;
709         }
710     while(pos < end){
711         word = table[(guchar)str[(start+pos)>>1]];
712         g_assert(word[0] != '-');
713         g_assert(word[1] != '-');
714         dst[pos++] = word[0];
715         dst[pos++] = word[1];
716         }
717 #ifndef G_DISABLE_ASSERT
718     for(i = 0; i < length; i++){
719         g_assert(dst[i]
720                  == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(
721                                     start+i, page_data, user_data)));
722         }
723 #endif /* G_DISABLE_ASSERT */
724     return length;
725     }
726 
FastaDB_SparseCache_compress_4bit(SparseCache_Page * page,gint len)727 static void FastaDB_SparseCache_compress_4bit(SparseCache_Page *page,
728                                               gint len){
729     register gint i;
730     register gchar *seq =  page->data;
731     static gint index[ALPHABET_SIZE];
732     static gboolean index_is_filled = FALSE;
733     if(!index_is_filled){
734         FastaDB_SparseCache_fill_index(index, "ACGTNacgtn");
735         index_is_filled = TRUE;
736         }
737     page->data = g_new0(gchar, (len >> 1)+1);
738     page->data_size = sizeof(gchar)*((len >> 1)+1);
739     for(i = 0; i < len; i++)
740         ((gchar*)page->data)[(i >> 1)]
741         |= (index[(guchar)seq[i]] << ((i&1) << 2));
742     g_free(seq);
743     page->get_func = FastaDB_SparseCache_get_func_4_BIT;
744     page->copy_func = FastaDB_SparseCache_copy_func_4_BIT;
745     return;
746     }
747 
748 /**/
749 
FastaDB_SparseCache_get_func_2_BIT_LC(gint pos,gpointer page_data,gpointer user_data)750 static gpointer FastaDB_SparseCache_get_func_2_BIT_LC(gint pos,
751                                                       gpointer page_data,
752                                                       gpointer user_data){
753     register gint ch = ((gchar*)page_data)[(pos >> 2)],
754                   num = (ch >> ((pos & 3) << 1)) & 3;
755     static gchar *alphabet = "acgt";
756     return GINT_TO_POINTER((gint)alphabet[num]);
757     }
758 
FastaDB_SparseCache_copy_func_2_BIT_LC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)759 static gint FastaDB_SparseCache_copy_func_2_BIT_LC(gint start, gint length,
760                                                    gchar *dst, gpointer page_data,
761                                                    gpointer user_data){
762 #ifndef G_DISABLE_ASSERT
763     register gint i;
764 #endif /* G_DISABLE_ASSERT */
765     register gint pos = 0, end = length;
766     register gchar *str = page_data, *word;
767     gchar *table[256] = {
768         "aaaa", "caaa", "gaaa", "taaa", "acaa", "ccaa", "gcaa", "tcaa",
769         "agaa", "cgaa", "ggaa", "tgaa", "ataa", "ctaa", "gtaa", "ttaa",
770         "aaca", "caca", "gaca", "taca", "acca", "ccca", "gcca", "tcca",
771         "agca", "cgca", "ggca", "tgca", "atca", "ctca", "gtca", "ttca",
772         "aaga", "caga", "gaga", "taga", "acga", "ccga", "gcga", "tcga",
773         "agga", "cgga", "ggga", "tgga", "atga", "ctga", "gtga", "ttga",
774         "aata", "cata", "gata", "tata", "acta", "ccta", "gcta", "tcta",
775         "agta", "cgta", "ggta", "tgta", "atta", "ctta", "gtta", "ttta",
776         "aaac", "caac", "gaac", "taac", "acac", "ccac", "gcac", "tcac",
777         "agac", "cgac", "ggac", "tgac", "atac", "ctac", "gtac", "ttac",
778         "aacc", "cacc", "gacc", "tacc", "accc", "cccc", "gccc", "tccc",
779         "agcc", "cgcc", "ggcc", "tgcc", "atcc", "ctcc", "gtcc", "ttcc",
780         "aagc", "cagc", "gagc", "tagc", "acgc", "ccgc", "gcgc", "tcgc",
781         "aggc", "cggc", "gggc", "tggc", "atgc", "ctgc", "gtgc", "ttgc",
782         "aatc", "catc", "gatc", "tatc", "actc", "cctc", "gctc", "tctc",
783         "agtc", "cgtc", "ggtc", "tgtc", "attc", "cttc", "gttc", "tttc",
784         "aaag", "caag", "gaag", "taag", "acag", "ccag", "gcag", "tcag",
785         "agag", "cgag", "ggag", "tgag", "atag", "ctag", "gtag", "ttag",
786         "aacg", "cacg", "gacg", "tacg", "accg", "cccg", "gccg", "tccg",
787         "agcg", "cgcg", "ggcg", "tgcg", "atcg", "ctcg", "gtcg", "ttcg",
788         "aagg", "cagg", "gagg", "tagg", "acgg", "ccgg", "gcgg", "tcgg",
789         "aggg", "cggg", "gggg", "tggg", "atgg", "ctgg", "gtgg", "ttgg",
790         "aatg", "catg", "gatg", "tatg", "actg", "cctg", "gctg", "tctg",
791         "agtg", "cgtg", "ggtg", "tgtg", "attg", "cttg", "gttg", "tttg",
792         "aaat", "caat", "gaat", "taat", "acat", "ccat", "gcat", "tcat",
793         "agat", "cgat", "ggat", "tgat", "atat", "ctat", "gtat", "ttat",
794         "aact", "cact", "gact", "tact", "acct", "ccct", "gcct", "tcct",
795         "agct", "cgct", "ggct", "tgct", "atct", "ctct", "gtct", "ttct",
796         "aagt", "cagt", "gagt", "tagt", "acgt", "ccgt", "gcgt", "tcgt",
797         "aggt", "cggt", "gggt", "tggt", "atgt", "ctgt", "gtgt", "ttgt",
798         "aatt", "catt", "gatt", "tatt", "actt", "cctt", "gctt", "tctt",
799         "agtt", "cgtt", "ggtt", "tgtt", "attt", "cttt", "gttt", "tttt",
800         };
801     while((start+pos)&3){
802         dst[pos] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
803                                    start+pos, page_data, user_data));
804         pos++;
805         }
806     if(length > 4){
807         while(end&3){
808             dst[end-start-1]
809                 = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
810                                   end-1, page_data, user_data));
811             end--;
812             }
813         }
814     while(pos < end){
815         word = table[(guchar)str[(start+pos)>>2]];
816         dst[pos++] = word[0];
817         dst[pos++] = word[1];
818         dst[pos++] = word[2];
819         dst[pos++] = word[3];
820         }
821 #ifndef G_DISABLE_ASSERT
822     for(i = 0; i < length; i++){
823         g_assert(dst[i]
824           == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
825                              start+i, page_data, user_data)));
826         }
827 #endif /* G_DISABLE_ASSERT */
828     return length;
829     }
830 
FastaDB_SparseCache_compress_2bit_lc(SparseCache_Page * page,gint len)831 static void FastaDB_SparseCache_compress_2bit_lc(SparseCache_Page *page,
832                                                  gint len){
833     register gint i;
834     register gchar *seq =  page->data;
835     static gint index[ALPHABET_SIZE];
836     static gboolean index_is_filled = FALSE;
837     if(!index_is_filled){
838         FastaDB_SparseCache_fill_index(index, "acgt");
839         index_is_filled = TRUE;
840         }
841     page->data = g_new0(gchar, (len >> 2)+1);
842     page->data_size = sizeof(gchar)*((len >> 2)+1);
843     for(i = 0; i < len; i++)
844         ((gchar*)page->data)[(i >> 2)] |= (index[(guchar)seq[i]] << ((i&3) << 1));
845     g_free(seq);
846     page->get_func = FastaDB_SparseCache_get_func_2_BIT_LC;
847     page->copy_func = FastaDB_SparseCache_copy_func_2_BIT_LC;
848     return;
849     }
850 
FastaDB_SparseCache_get_func_2_BIT_UC(gint pos,gpointer page_data,gpointer user_data)851 static gpointer FastaDB_SparseCache_get_func_2_BIT_UC(gint pos,
852                                                       gpointer page_data,
853                                                       gpointer user_data){
854     register gint ch = ((gchar*)page_data)[(pos >> 2)],
855                   num = (ch >> ((pos & 3) << 1)) & 3;
856     static gchar *alphabet = "ACGT";
857     return GINT_TO_POINTER((gint)alphabet[num]);
858     }
859 
FastaDB_SparseCache_copy_func_2_BIT_UC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)860 static gint FastaDB_SparseCache_copy_func_2_BIT_UC(gint start, gint length,
861                                                    gchar *dst, gpointer page_data,
862                                                    gpointer user_data){
863 #ifndef G_DISABLE_ASSERT
864     register gint i;
865 #endif /* G_DISABLE_ASSERT */
866     register gint pos = 0, end = length;
867     register gchar *str = page_data, *word;
868     gchar *table[256] = {
869         "AAAA", "CAAA", "GAAA", "TAAA", "ACAA", "CCAA", "GCAA", "TCAA",
870         "AGAA", "CGAA", "GGAA", "TGAA", "ATAA", "CTAA", "GTAA", "TTAA",
871         "AACA", "CACA", "GACA", "TACA", "ACCA", "CCCA", "GCCA", "TCCA",
872         "AGCA", "CGCA", "GGCA", "TGCA", "ATCA", "CTCA", "GTCA", "TTCA",
873         "AAGA", "CAGA", "GAGA", "TAGA", "ACGA", "CCGA", "GCGA", "TCGA",
874         "AGGA", "CGGA", "GGGA", "TGGA", "ATGA", "CTGA", "GTGA", "TTGA",
875         "AATA", "CATA", "GATA", "TATA", "ACTA", "CCTA", "GCTA", "TCTA",
876         "AGTA", "CGTA", "GGTA", "TGTA", "ATTA", "CTTA", "GTTA", "TTTA",
877         "AAAC", "CAAC", "GAAC", "TAAC", "ACAC", "CCAC", "GCAC", "TCAC",
878         "AGAC", "CGAC", "GGAC", "TGAC", "ATAC", "CTAC", "GTAC", "TTAC",
879         "AACC", "CACC", "GACC", "TACC", "ACCC", "CCCC", "GCCC", "TCCC",
880         "AGCC", "CGCC", "GGCC", "TGCC", "ATCC", "CTCC", "GTCC", "TTCC",
881         "AAGC", "CAGC", "GAGC", "TAGC", "ACGC", "CCGC", "GCGC", "TCGC",
882         "AGGC", "CGGC", "GGGC", "TGGC", "ATGC", "CTGC", "GTGC", "TTGC",
883         "AATC", "CATC", "GATC", "TATC", "ACTC", "CCTC", "GCTC", "TCTC",
884         "AGTC", "CGTC", "GGTC", "TGTC", "ATTC", "CTTC", "GTTC", "TTTC",
885         "AAAG", "CAAG", "GAAG", "TAAG", "ACAG", "CCAG", "GCAG", "TCAG",
886         "AGAG", "CGAG", "GGAG", "TGAG", "ATAG", "CTAG", "GTAG", "TTAG",
887         "AACG", "CACG", "GACG", "TACG", "ACCG", "CCCG", "GCCG", "TCCG",
888         "AGCG", "CGCG", "GGCG", "TGCG", "ATCG", "CTCG", "GTCG", "TTCG",
889         "AAGG", "CAGG", "GAGG", "TAGG", "ACGG", "CCGG", "GCGG", "TCGG",
890         "AGGG", "CGGG", "GGGG", "TGGG", "ATGG", "CTGG", "GTGG", "TTGG",
891         "AATG", "CATG", "GATG", "TATG", "ACTG", "CCTG", "GCTG", "TCTG",
892         "AGTG", "CGTG", "GGTG", "TGTG", "ATTG", "CTTG", "GTTG", "TTTG",
893         "AAAT", "CAAT", "GAAT", "TAAT", "ACAT", "CCAT", "GCAT", "TCAT",
894         "AGAT", "CGAT", "GGAT", "TGAT", "ATAT", "CTAT", "GTAT", "TTAT",
895         "AACT", "CACT", "GACT", "TACT", "ACCT", "CCCT", "GCCT", "TCCT",
896         "AGCT", "CGCT", "GGCT", "TGCT", "ATCT", "CTCT", "GTCT", "TTCT",
897         "AAGT", "CAGT", "GAGT", "TAGT", "ACGT", "CCGT", "GCGT", "TCGT",
898         "AGGT", "CGGT", "GGGT", "TGGT", "ATGT", "CTGT", "GTGT", "TTGT",
899         "AATT", "CATT", "GATT", "TATT", "ACTT", "CCTT", "GCTT", "TCTT",
900         "AGTT", "CGTT", "GGTT", "TGTT", "ATTT", "CTTT", "GTTT", "TTTT",
901         };
902     while((start+pos)&3){
903         dst[pos] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(
904                                    start+pos, page_data, user_data));
905         pos++;
906         }
907     if(length > 4){
908         while(end&3){
909             dst[end-start-1]
910                 = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(end-1,
911                                               page_data, user_data));
912             end--;
913             }
914         }
915     while(pos < end){
916         word = table[(guchar)str[(start+pos)>>2]];
917         dst[pos++] = word[0];
918         dst[pos++] = word[1];
919         dst[pos++] = word[2];
920         dst[pos++] = word[3];
921         }
922 #ifndef G_DISABLE_ASSERT
923     for(i = 0; i < length; i++){
924         g_assert(dst[i]
925           == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(
926                              start+i, page_data, user_data)));
927         }
928 #endif /* G_DISABLE_ASSERT */
929     return length;
930     }
931 
FastaDB_SparseCache_compress_2bit_uc(SparseCache_Page * page,gint len)932 static void FastaDB_SparseCache_compress_2bit_uc(SparseCache_Page *page,
933                                                  gint len){
934     register gint i;
935     register gchar *seq =  page->data;
936     static gint index[ALPHABET_SIZE];
937     static gboolean index_is_filled = FALSE;
938     if(!index_is_filled){
939         FastaDB_SparseCache_fill_index(index, "ACGT");
940         index_is_filled = TRUE;
941         }
942     page->data = g_new0(gchar, (len >> 2)+1);
943     page->data_size = sizeof(gchar)*((len >> 2)+1);
944     for(i = 0; i < len; i++)
945         ((gchar*)page->data)[(i >> 2)] |= (index[(guchar)seq[i]] << ((i&3) << 1));
946     g_free(seq);
947     page->get_func = FastaDB_SparseCache_get_func_2_BIT_UC;
948     page->copy_func = FastaDB_SparseCache_copy_func_2_BIT_UC;
949     return;
950     }
951 
952 /**/
953 
FastaDB_SparseCache_compress(SparseCache_Page * page,gint len)954 void FastaDB_SparseCache_compress(SparseCache_Page *page, gint len){
955     register gint i, mask = 0;
956     register gchar *seq =  page->data;
957     static FastaDB_SparseCache_Mask mask_index[ALPHABET_SIZE];
958     static gboolean mask_is_filled = FALSE;
959     if(!mask_is_filled){
960         FastaDB_SparseCache_fill_mask(mask_index);
961         mask_is_filled = TRUE;
962         }
963     for(i = 0; i < len; i++)
964         mask |= mask_index[(guchar)seq[i]];
965     if(mask & (1 << FastaDB_SparseCache_Mask_8_BIT)){
966         page->copy_func = FastaDB_SparseCache_copy_func_8_BIT;
967         return; /* Keep current 8 bit encoding */
968         }
969     if(mask & (mask-1)){ /* more than one mask bit is set */
970         FastaDB_SparseCache_compress_4bit(page, len);
971         return;
972         }
973     if(mask & (1 << FastaDB_SparseCache_Mask_2_BIT_LC)){
974         FastaDB_SparseCache_compress_2bit_lc(page, len);
975         return;
976         }
977     if(mask & (1 << FastaDB_SparseCache_Mask_2_BIT_UC)){
978         FastaDB_SparseCache_compress_2bit_uc(page, len);
979         return;
980         }
981     if(mask & (1 << FastaDB_SparseCache_Mask_0_BIT_LC)){
982         g_free(page->data);
983         page->data = NULL;
984         page->data_size = 0;
985         page->get_func = FastaDB_SparseCache_get_func_0_BIT_LC;
986         page->copy_func = FastaDB_SparseCache_copy_func_0_BIT_LC;
987         return;
988         }
989     if(mask & (1 << FastaDB_SparseCache_Mask_0_BIT_UC)){
990         g_free(page->data);
991         page->data = NULL;
992         page->data_size = 0;
993         page->get_func = FastaDB_SparseCache_get_func_0_BIT_UC;
994         page->copy_func = FastaDB_SparseCache_copy_func_0_BIT_UC;
995         return;
996         }
997     g_error("Bad FastaDB_SparseCache compression mask");
998     return;
999     }
1000 
FastaDB_SparseCache_fill_func(gint start,gpointer user_data)1001 static SparseCache_Page *FastaDB_SparseCache_fill_func(gint start,
1002                                                        gpointer user_data){
1003     register SparseCache_Page *page = g_new(SparseCache_Page, 1);
1004     register FastaDB_Key *fdbk = user_data;
1005     register gint ch, pos = 0, len;
1006     register gint db_start, db_end;
1007     if(fdbk->source->line_length < 1)
1008         g_error("Unknown or irregular fasta line length");
1009     db_start = fdbk->seq_offset
1010              + start
1011              + (start / (fdbk->source->line_length));
1012     len = MIN(SparseCache_PAGE_SIZE, fdbk->length-start);
1013     db_end = db_start
1014            + len
1015            + (len / (fdbk->source->line_length));
1016 #ifdef USE_PTHREADS
1017     pthread_mutex_lock(&fdbk->source->cf->compoundfile_mutex);
1018 #endif /* USE_PTHREADS */
1019     fdbk->location->pos += db_start;
1020     CompoundFile_Location_seek(fdbk->location);
1021     fdbk->location->pos -= db_start;
1022     page->data = g_new(gchar, len);
1023     page->get_func = FastaDB_SparseCache_get_func_8_BIT;
1024     page->copy_func = NULL; /* FIXME: temp */
1025     page->data_size = sizeof(gchar)*len;
1026     do {
1027         ch = CompoundFile_getc(fdbk->source->cf);
1028         g_assert(ch != EOF);
1029         g_assert(pos < len);
1030         if(!isspace(ch))
1031             ((gchar*)page->data)[pos++] = ch;
1032     } while(pos < len);
1033 #ifdef USE_PTHREADS
1034     pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
1035 #endif /* USE_PTHREADS */
1036     FastaDB_SparseCache_compress(page, len);
1037     return page;
1038     }
1039 /* FIXME: this is really inefficient: should use unbuffered read/write
1040  *        do this by adding CompoundFile_read()
1041  */
1042 /* FIXME add suppport for compressed pages etc */
1043 
FastaDB_Key_get_SparseCache(FastaDB_Key * fdbk)1044 SparseCache *FastaDB_Key_get_SparseCache(FastaDB_Key *fdbk){
1045     register SparseCache *cache = SparseCache_create(fdbk->length,
1046                   FastaDB_SparseCache_fill_func, NULL, NULL, fdbk);
1047     return cache;
1048     }
1049 
1050 /**/
1051 
FastaDB_all(gchar * path,Alphabet * alphabet,FastaDB_Mask mask,guint * total)1052 FastaDB_Seq **FastaDB_all(gchar *path, Alphabet *alphabet,
1053                           FastaDB_Mask mask, guint *total){
1054     register FastaDB *fdb = FastaDB_open(path, alphabet);
1055     register GPtrArray *ptra = g_ptr_array_new();
1056     register FastaDB_Seq *fdbs, **fdbsa;
1057     register guint numseqs = 0;
1058     while((fdbs = FastaDB_next(fdb, mask))){
1059         g_ptr_array_add(ptra, fdbs);
1060         numseqs++;
1061         }
1062     g_ptr_array_add(ptra, NULL); /* NULL delimit */
1063     if(total)
1064         *total = numseqs; /* Record number of sequences */
1065     fdbsa = (FastaDB_Seq**)ptra->pdata;
1066     g_ptr_array_free(ptra, FALSE); /* Leave data intact */
1067     FastaDB_close(fdb);
1068     return fdbsa;
1069     }
1070 
FastaDB_Seq_all_destroy(FastaDB_Seq ** fdbs)1071 void FastaDB_Seq_all_destroy(FastaDB_Seq **fdbs){
1072     register gint i;
1073     g_assert(fdbs);
1074     for(i = 0; fdbs[i]; i++)
1075         FastaDB_Seq_destroy(fdbs[i]);
1076     g_free(fdbs);
1077     return;
1078     }
1079 
FastaDB_Seq_print(FastaDB_Seq * fdbs,FILE * fp,FastaDB_Mask mask)1080 gint FastaDB_Seq_print(FastaDB_Seq *fdbs, FILE *fp, FastaDB_Mask mask){
1081     register gint written = 0;
1082     if(mask & (FastaDB_Mask_ID|FastaDB_Mask_DEF|FastaDB_Mask_LEN)){
1083         written += fprintf(fp, ">%s", (mask & FastaDB_Mask_ID)
1084                              ?fdbs->seq->id:"[unknown]");
1085         if(fdbs->seq->def && (mask & FastaDB_Mask_DEF))
1086             written += fprintf(fp, " %s", fdbs->seq->def);
1087         if(mask & FastaDB_Mask_LEN)
1088             written += fprintf(fp, " [len:%d]", fdbs->seq->len);
1089         written += fprintf(fp, "\n");
1090         }
1091     if(mask & FastaDB_Mask_SEQ) /* to print seq, length must be set */
1092         written += Sequence_print_fasta_block(fdbs->seq, fp);
1093     return written;
1094     }
1095 
FastaDB_Seq_all_print(FastaDB_Seq ** fdbs,FILE * fp,FastaDB_Mask mask)1096 gint FastaDB_Seq_all_print(FastaDB_Seq **fdbs, FILE *fp,
1097                           FastaDB_Mask mask){
1098     register int i, written = 0;
1099     for(i = 0; fdbs[i]; i++)
1100         written += FastaDB_Seq_print(fdbs[i], fp, mask);
1101     return written;
1102     }
1103 
FastaDB_get_single(gchar * path,Alphabet * alphabet)1104 FastaDB_Seq *FastaDB_get_single(gchar *path, Alphabet *alphabet){
1105     register FastaDB *fdb = FastaDB_open(path, alphabet);
1106     register FastaDB_Seq *fdbs = FastaDB_next(fdb, FastaDB_Mask_ALL);
1107     if(!fdbs)
1108         g_error("No sequences found in [%s]\n", path);
1109     FastaDB_close(fdb);
1110     return fdbs;
1111     }
1112 
FastaDB_guess_type(gchar * path)1113 Alphabet_Type FastaDB_guess_type(gchar *path){
1114     register Alphabet *alphabet = Alphabet_create(Alphabet_Type_UNKNOWN,
1115                                                   FALSE);
1116     register FastaDB_Seq *fdbs = FastaDB_get_single(path, alphabet);
1117     register gchar *str = Sequence_get_str(fdbs->seq);
1118     register Alphabet_Type type = Alphabet_Type_guess(str);
1119     g_free(str);
1120     FastaDB_Seq_destroy(fdbs);
1121     Alphabet_destroy(alphabet);
1122     return type;
1123     }
1124 
1125