1 /****************************************************************\
2 * *
3 * Library for manipulation of FASTA format databases *
4 * *
5 * Guy St.C. Slater.. mailto:guy@ebi.ac.uk *
6 * Copyright (C) 2000-2009. All Rights Reserved. *
7 * *
8 * This source code is distributed under the terms of the *
9 * GNU General Public License, version 3. See the file COPYING *
10 * or http://www.gnu.org/licenses/gpl.txt for details *
11 * *
12 * If you use this code, please keep this notice intact. *
13 * *
14 \****************************************************************/
15
16 #include <stdio.h> /* For BUFSIZ */
17 #include <errno.h>
18 #include <string.h> /* For strerror() */
19 #include <ctype.h> /* For isspace(),isprint() */
20
21 #include <sys/types.h> /* For stat() */
22 #include <sys/stat.h> /* For stat() */
23 #include <unistd.h> /* For stat() */
24 #include <dirent.h> /* For readdir() */
25
26 #include "fastadb.h"
27
28 #ifndef ALPHABET_SIZE
29 #define ALPHABET_SIZE (sizeof(gchar)<<CHAR_BIT)
30 #endif /* ALPHABET_SIZE */
31
32 #define FASTADB_OUT_BUFFER_CHUNK_SIZE (BUFSIZ * 64)
33
FastaDB_ArgumentSet_create(Argument * arg)34 FastaDB_ArgumentSet *FastaDB_ArgumentSet_create(Argument *arg){
35 register ArgumentSet *as;
36 static FastaDB_ArgumentSet fas = {NULL};
37 if(arg){
38 as = ArgumentSet_create("Fasta Database Options");
39 ArgumentSet_add_option(as, '\0', "fastasuffix", "suffix",
40 "Fasta file suffix filter (in subdirectories)", ".fa",
41 Argument_parse_string, &fas.suffix_filter);
42 Argument_absorb_ArgumentSet(arg, as);
43 }
44 return &fas;
45 }
46
FastaDB_file_is_directory(gchar * path)47 static gboolean FastaDB_file_is_directory(gchar *path){
48 struct stat buf;
49 if(stat(path, &buf))
50 g_error("Cannot read file [%s]", path);
51 return S_ISDIR(buf.st_mode);
52 }
53
FastaDB_get_dir_contents(gchar * path)54 static GPtrArray *FastaDB_get_dir_contents(gchar *path){
55 register GPtrArray *list = g_ptr_array_new();
56 register DIR *dir = opendir(path);
57 register struct dirent *entry;
58 if(!dir)
59 g_error("Could not open directory [%s]", path);
60 readdir(dir); /* Skip '.' */
61 readdir(dir); /* Skip '..' */
62 while((entry = readdir(dir)))
63 g_ptr_array_add(list, g_strdup(entry->d_name));
64 closedir(dir);
65 return list;
66 }
67
FastaDB_expand_path_list_recur(GPtrArray * path_list,gchar * path,gchar * suffix_filter,gboolean is_subdir)68 static void FastaDB_expand_path_list_recur(GPtrArray *path_list,
69 gchar *path, gchar *suffix_filter, gboolean is_subdir){
70 register GPtrArray *dir_content;
71 register gchar *name, *full_path;
72 register gint i;
73 if(FastaDB_file_is_directory(path)){
74 dir_content = FastaDB_get_dir_contents(path);
75 for(i = 0; i < dir_content->len; i++){
76 name = dir_content->pdata[i];
77 full_path = g_strdup_printf("%s%c%s",
78 path, G_DIR_SEPARATOR, name);
79 FastaDB_expand_path_list_recur(path_list, full_path,
80 suffix_filter, TRUE);
81 g_free(full_path);
82 g_free(name);
83 }
84 g_ptr_array_free(dir_content, TRUE);
85 } else {
86 if(is_subdir && suffix_filter){
87 for(i = strlen(path)-1; i >= 0; i--)
88 if(path[i] == '.')
89 break;
90 if((!i) || strcmp(path+i+1, suffix_filter))
91 return;
92 }
93 g_ptr_array_add(path_list, g_strdup(path));
94 }
95 return;
96 }
97
FastaDB_expand_path_list(GPtrArray * path_list,gchar * suffix_filter)98 static GPtrArray *FastaDB_expand_path_list(GPtrArray *path_list,
99 gchar *suffix_filter){
100 register GPtrArray *expanded_path_list = g_ptr_array_new();
101 register gchar *path;
102 register gint i;
103 if(suffix_filter && (suffix_filter[0] == '.'))
104 suffix_filter++; /* Skip any leading dot on suffix_filter */
105 for(i = 0; i < path_list->len; i++){
106 path = path_list->pdata[i];
107 FastaDB_expand_path_list_recur(expanded_path_list,
108 path, suffix_filter, FALSE);
109 }
110 if(!expanded_path_list->len){
111 for(i = 0; i < path_list->len; i++)
112 g_message("Empty path: [%s]", (gchar*)path_list->pdata[i]);
113 g_error("No valid file paths found");
114 }
115 return expanded_path_list;
116 }
117
FastaDB_open_list(GPtrArray * path_list,Alphabet * alphabet)118 FastaDB *FastaDB_open_list(GPtrArray *path_list, Alphabet *alphabet){
119 register FastaDB *fdb = g_new(FastaDB, 1);
120 register FastaDB_ArgumentSet *fas
121 = FastaDB_ArgumentSet_create(NULL);
122 register gint i;
123 register gchar *path;
124 register GPtrArray *full_path_list;
125 g_assert(path_list);
126 fdb->ref_count = 1;
127 if(alphabet)
128 fdb->alphabet = Alphabet_share(alphabet);
129 else
130 fdb->alphabet = Alphabet_create(Alphabet_Type_UNKNOWN, FALSE);
131 full_path_list = FastaDB_expand_path_list(path_list, fas->suffix_filter);
132 fdb->cf = CompoundFile_create(full_path_list, TRUE);
133 for(i = 0; i < full_path_list->len; i++){
134 path = full_path_list->pdata[i];
135 g_free(path);
136 }
137 g_ptr_array_free(full_path_list, TRUE);
138 FastaDB_rewind(fdb);
139 fdb->out_buffer_alloc = FASTADB_OUT_BUFFER_CHUNK_SIZE;
140 fdb->out_buffer = g_malloc(sizeof(gchar)*fdb->out_buffer_alloc);
141 fdb->line_length = -1;
142 return fdb;
143 }
144
FastaDB_open_list_with_limit(GPtrArray * path_list,Alphabet * alphabet,gint chunk_id,gint chunk_total)145 FastaDB *FastaDB_open_list_with_limit(GPtrArray *path_list,
146 Alphabet *alphabet, gint chunk_id, gint chunk_total){
147 register FastaDB *fdb = FastaDB_open_list(path_list, alphabet);
148 register CompoundFile_Pos start, stop, total_length, chunk_size;
149 register CompoundFile_Location *start_cfl, *stop_cfl;
150 if(chunk_total){
151 g_assert(chunk_id);
152 if(chunk_total < 1)
153 g_error("Chunk total [%d] is too small", chunk_total);
154 if((chunk_id < 1) || (chunk_id > chunk_total))
155 g_error("Chunk id should be between 1 and %d", chunk_total);
156 total_length = CompoundFile_get_length(fdb->cf);
157 chunk_size = total_length / chunk_total;
158 start = (chunk_id-1) * chunk_size;
159 start = FastaDB_find_next_start(fdb, start);
160 if(chunk_id == chunk_total){
161 stop = total_length - 1;
162 } else {
163 stop = chunk_id * chunk_size;
164 stop = FastaDB_find_next_start(fdb, stop);
165 }
166 start_cfl = CompoundFile_Location_from_pos(fdb->cf, start);
167 stop_cfl = CompoundFile_Location_from_pos(fdb->cf, stop);
168 CompoundFile_set_limits(fdb->cf, start_cfl, stop_cfl);
169 CompoundFile_Location_destroy(start_cfl);
170 CompoundFile_Location_destroy(stop_cfl);
171 FastaDB_rewind(fdb);
172 }
173 return fdb;
174 }
175
FastaDB_open(gchar * path,Alphabet * alphabet)176 FastaDB *FastaDB_open(gchar *path, Alphabet *alphabet){
177 register FastaDB *fdb;
178 register GPtrArray *path_list = g_ptr_array_new();
179 g_ptr_array_add(path_list, path);
180 fdb = FastaDB_open_list(path_list, alphabet);
181 g_ptr_array_free(path_list, TRUE);
182 return fdb;
183 }
184
185 #define FastaDB_putc(fdb, ch) \
186 if(fdb->out_buffer_pos == fdb->out_buffer_alloc) \
187 FastaDB_putc_realloc(fdb); \
188 fdb->out_buffer[fdb->out_buffer_pos++] = (ch)
189
FastaDB_putc_realloc(FastaDB * fdb)190 static void FastaDB_putc_realloc(FastaDB *fdb){
191 fdb->out_buffer_alloc += FASTADB_OUT_BUFFER_CHUNK_SIZE;
192 fdb->out_buffer = g_realloc(fdb->out_buffer,
193 fdb->out_buffer_alloc);
194 return;
195 }
196
197 /**/
198
FastaDB_share(FastaDB * fdb)199 FastaDB *FastaDB_share(FastaDB *fdb){
200 g_assert(fdb);
201 fdb->ref_count++;
202 return fdb;
203 }
204
FastaDB_dup(FastaDB * fdb)205 FastaDB *FastaDB_dup(FastaDB *fdb){
206 register FastaDB *nfdb = g_new(FastaDB, 1);
207 nfdb->ref_count = 1;
208 nfdb->alphabet = Alphabet_create(fdb->alphabet->type,
209 fdb->alphabet->is_soft_masked);
210 nfdb->cf = CompoundFile_dup(fdb->cf);
211 nfdb->out_buffer_alloc = FASTADB_OUT_BUFFER_CHUNK_SIZE;
212 nfdb->out_buffer = g_malloc(sizeof(gchar)*nfdb->out_buffer_alloc);
213 nfdb->line_length = -1;
214 FastaDB_rewind(nfdb);
215 return nfdb;
216 }
217
FastaDB_close(FastaDB * fdb)218 void FastaDB_close(FastaDB *fdb){
219 g_assert(fdb);
220 if(--fdb->ref_count)
221 return;
222 CompoundFile_destroy(fdb->cf);
223 Alphabet_destroy(fdb->alphabet);
224 g_free(fdb->out_buffer);
225 g_free(fdb);
226 return;
227 }
228
FastaDB_rewind(FastaDB * fdb)229 void FastaDB_rewind(FastaDB *fdb){
230 register gint ch, prev = '\n';
231 g_assert(fdb);
232 CompoundFile_rewind(fdb->cf);
233 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
234 if((ch == '>') && (prev == '\n'))
235 break;
236 prev = ch;
237 }
238 return;
239 }
240
FastaDB_find_next_start(FastaDB * fdb,CompoundFile_Pos pos)241 CompoundFile_Pos FastaDB_find_next_start(FastaDB *fdb,
242 CompoundFile_Pos pos){
243 register gint ch, prev = '\n';
244 g_assert(fdb->cf->element_list->len == 1);
245 CompoundFile_seek(fdb->cf, pos);
246 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
247 if((ch == '>') && (prev == '\n'))
248 break;
249 prev = ch;
250 }
251 return CompoundFile_ftell(fdb->cf)-1;
252 }
253
FastaDB_file_is_fasta(gchar * path)254 gboolean FastaDB_file_is_fasta(gchar *path){
255 register FILE *fp = fopen(path, "r");
256 register gint ch;
257 register gboolean result = FALSE;
258 if(!fp)
259 g_error("Could not open file [%s]", path);
260 while(((ch = getc(fp)) != EOF)){
261 if(isspace(ch))
262 continue;
263 else
264 if(ch == '>'){ /* Assume is fasta format */
265 result = TRUE;
266 break;
267 }
268 else
269 break;
270 }
271 fclose(fp);
272 return result;
273 }
274 /* Assumes a file is fasta format
275 * if the first non-whitespace character is '>'
276 */
277
FastaDB_is_finished(FastaDB * fdb)278 gboolean FastaDB_is_finished(FastaDB *fdb){
279 g_assert(fdb);
280 return CompoundFile_is_finished(fdb->cf);
281 }
282
FastaDB_traverse(FastaDB * fdb,FastaDB_Mask mask,FastaDB_TraverseFunc fdtf,gpointer user_data)283 void FastaDB_traverse(FastaDB *fdb, FastaDB_Mask mask,
284 FastaDB_TraverseFunc fdtf, gpointer user_data){
285 register FastaDB_Seq *fdbs;
286 g_assert(fdb);
287 g_assert(fdtf);
288 while((fdbs = FastaDB_next(fdb, mask))){
289 if(fdtf(fdbs, user_data)){
290 FastaDB_Seq_destroy(fdbs);
291 break;
292 }
293 FastaDB_Seq_destroy(fdbs);
294 }
295 return;
296 }
297
FastaDB_memory_usage(FastaDB * fdb)298 gsize FastaDB_memory_usage(FastaDB *fdb){
299 return sizeof(FastaDB)
300 + sizeof(Alphabet)
301 + sizeof(CompoundFile)
302 + (sizeof(gchar)*fdb->out_buffer_alloc);
303 }
304
305 /**/
306
FastaDB_Seq_create(FastaDB * fdb,Sequence * seq,CompoundFile_Location * cfl)307 static FastaDB_Seq *FastaDB_Seq_create(FastaDB *fdb, Sequence *seq,
308 CompoundFile_Location *cfl){
309 register FastaDB_Seq *fdbs = g_new0(FastaDB_Seq, 1);
310 fdbs->ref_count = 1;
311 fdbs->source = FastaDB_share(fdb);
312 fdbs->location = CompoundFile_Location_share(cfl);
313 fdbs->seq = Sequence_share(seq);
314 return fdbs;
315 }
316
FastaDB_Seq_share(FastaDB_Seq * fdbs)317 FastaDB_Seq *FastaDB_Seq_share(FastaDB_Seq *fdbs){
318 g_assert(fdbs);
319 fdbs->ref_count++;
320 return fdbs;
321 }
322
FastaDB_Seq_destroy(FastaDB_Seq * fdbs)323 void FastaDB_Seq_destroy(FastaDB_Seq *fdbs){
324 g_assert(fdbs);
325 if(--fdbs->ref_count)
326 return;
327 CompoundFile_Location_destroy(fdbs->location);
328 FastaDB_close(fdbs->source);
329 Sequence_destroy(fdbs->seq);
330 g_free(fdbs);
331 return;
332 }
333
FastaDB_Seq_revcomp(FastaDB_Seq * fdbs)334 FastaDB_Seq *FastaDB_Seq_revcomp(FastaDB_Seq *fdbs){
335 register FastaDB_Seq *revcomp_fdbs;
336 g_assert(fdbs);
337 revcomp_fdbs = FastaDB_Seq_create(fdbs->source,
338 fdbs->seq, fdbs->location);
339 Sequence_destroy(revcomp_fdbs->seq);
340 revcomp_fdbs->seq = Sequence_revcomp(fdbs->seq);
341 return revcomp_fdbs;
342 }
343
344 /**/
345
FastaDB_next(FastaDB * fdb,FastaDB_Mask mask)346 FastaDB_Seq *FastaDB_next(FastaDB *fdb, FastaDB_Mask mask){
347 register FastaDB_Seq *fdbs;
348 register gint ch;
349 register gint id_pos = -1, def_pos = -1, seq_pos = -1,
350 seq_len = -1;
351 register gint prev_len = -1, curr_length, line_start_seq_pos;
352 register Sequence *seq;
353 register CompoundFile_Location *location
354 = CompoundFile_Location_tell(fdb->cf);
355 fdb->out_buffer_pos = 0; /* Clear output buffer */
356 if(mask & FastaDB_Mask_ID){ /* Record ID */
357 id_pos = 0;
358 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
359 /* if((ch == '\n') || (ch == ' ') || (ch == '\t')) */
360 if(isspace(ch))
361 break;
362 FastaDB_putc(fdb, ch);
363 }
364 FastaDB_putc(fdb, '\0');
365 } else { /* Skip ID */
366 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
367 /* if((ch == '\n') || (ch == ' ') || (ch == '\t')) */
368 if(isspace(ch))
369 break;
370 }
371 }
372 if(ch == EOF){
373 CompoundFile_Location_destroy(location);
374 return NULL;
375 }
376 if(ch != '\n'){ /* If DEF is present */
377 if(mask & FastaDB_Mask_DEF){ /* Record DEF */
378 def_pos = fdb->out_buffer_pos;
379 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
380 if(ch == '\n')
381 break;
382 FastaDB_putc(fdb, ch);
383 }
384 FastaDB_putc(fdb, '\0');
385 } else { /* Skip DEF */
386 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
387 if(ch == '\n')
388 break;
389 }
390 }
391 }
392 if(ch == EOF){
393 CompoundFile_Location_destroy(location);
394 return NULL;
395 }
396 if(mask & FastaDB_Mask_SEQ){ /* Record SEQ and LEN */
397 seq_pos = fdb->out_buffer_pos;
398 line_start_seq_pos = seq_pos;
399 while(((ch = CompoundFile_getc(fdb->cf)) != EOF)
400 && (ch != '>')){
401 if(Alphabet_symbol_is_valid(fdb->alphabet, ch)){
402 FastaDB_putc(fdb, ch);
403 } else if(isspace(ch)){
404 if(ch == '\n'){
405 curr_length = fdb->out_buffer_pos
406 - line_start_seq_pos;
407 line_start_seq_pos = fdb->out_buffer_pos;
408 if(prev_len != -1){
409 if(fdb->line_length == -1){
410 fdb->line_length = prev_len;
411 } else {
412 if(prev_len != fdb->line_length)
413 fdb->line_length = 0;
414 }
415 }
416 prev_len = curr_length;
417 /**/
418 }
419 } else {
420 g_error("Unrecognised symbol \'%c\' (ascii:%d)"
421 " file:[%s] seq:[%s] pos:[%d]",
422 isprint(ch)?ch:' ', ch,
423 CompoundFile_current_path(fdb->cf),
424 (id_pos == -1)?"unknown":&fdb->out_buffer[id_pos],
425 (fdb->out_buffer_pos - seq_pos - 1));
426 }
427 }
428 FastaDB_putc(fdb, '\0');
429 seq_len = fdb->out_buffer_pos - seq_pos - 1;
430 if(fdb->line_length >= 0)
431 if(fdb->line_length < prev_len)
432 fdb->line_length = 0;
433 } else if(mask & FastaDB_Mask_LEN){ /* Skip SEQ and record LEN */
434 seq_len = 0;
435 while(((ch = CompoundFile_getc(fdb->cf)) != EOF)
436 && (ch != '>')){
437 if( ((ch >= 'A') && (ch <= 'Z'))
438 || ((ch >= 'a') && (ch <= 'z'))){
439 seq_len++;
440 }
441 }
442 } else { /* Just skip SEQ */
443 while((ch = CompoundFile_getc(fdb->cf)) != EOF){
444 if(ch == '>')
445 break;
446 }
447 }
448 seq = Sequence_create(
449 ( id_pos == -1)?NULL:&fdb->out_buffer[id_pos],
450 (def_pos == -1)?NULL:&fdb->out_buffer[def_pos],
451 (seq_pos == -1)?NULL:&fdb->out_buffer[seq_pos],
452 (seq_len == -1)?0:seq_len,
453 (fdb->alphabet->type == Alphabet_Type_DNA)
454 ? Sequence_Strand_FORWARD
455 : Sequence_Strand_UNKNOWN,
456 fdb->alphabet);
457 fdbs = FastaDB_Seq_create(fdb, seq, location);
458 CompoundFile_Location_destroy(location);
459 Sequence_destroy(seq);
460 if((mask & FastaDB_Mask_LEN) &&(!seq->len))
461 g_warning("Warning zero length sequence [%s]", seq->id);
462 return fdbs;
463 }
464
465 /**/
466
FastaDB_Key_create(FastaDB * source,CompoundFile_Location * location,Sequence_Strand strand,gint seq_offset,gint length)467 FastaDB_Key *FastaDB_Key_create(FastaDB *source,
468 CompoundFile_Location *location,
469 Sequence_Strand strand,
470 gint seq_offset, gint length){
471 register FastaDB_Key *fdbk = g_new(FastaDB_Key, 1);
472 g_assert(source);
473 fdbk->source = FastaDB_share(source);
474 fdbk->location = CompoundFile_Location_share(location);
475 fdbk->strand = strand;
476 fdbk->seq_offset = seq_offset;
477 fdbk->length = length;
478 return fdbk;
479 }
480
FastaDB_Key_destroy(FastaDB_Key * fdbk)481 void FastaDB_Key_destroy(FastaDB_Key *fdbk){
482 g_assert(fdbk);
483 FastaDB_close(fdbk->source);
484 CompoundFile_Location_destroy(fdbk->location);
485 g_free(fdbk);
486 return;
487 }
488
FastaDB_Seq_get_key(FastaDB_Seq * fdbs)489 FastaDB_Key *FastaDB_Seq_get_key(FastaDB_Seq *fdbs){
490 register FastaDB_Key *fdbk;
491 register gint seq_offset = strlen(fdbs->seq->id) + 2
492 + (fdbs->seq->def ? (strlen(fdbs->seq->def)+1) : 0);
493 g_assert(fdbs);
494 fdbk = FastaDB_Key_create(fdbs->source, fdbs->location,
495 fdbs->seq->strand, seq_offset,
496 fdbs->seq->len);
497 return fdbk;
498 }
499
FastaDB_fetch(FastaDB * fdb,FastaDB_Mask mask,CompoundFile_Pos pos)500 FastaDB_Seq *FastaDB_fetch(FastaDB *fdb, FastaDB_Mask mask,
501 CompoundFile_Pos pos){
502 register FastaDB_Seq *fdbs = NULL;
503 register CompoundFile_Pos orig_pos = CompoundFile_ftell(fdb->cf);
504 g_assert(fdb);
505 g_assert(fdb->cf->element_list->len == 1);
506 CompoundFile_seek(fdb->cf, pos);
507 fdbs = FastaDB_next(fdb, mask);
508 CompoundFile_seek(fdb->cf, orig_pos);
509 return fdbs;
510 }
511
FastaDB_Key_get_seq(FastaDB_Key * fdbk,FastaDB_Mask mask)512 FastaDB_Seq *FastaDB_Key_get_seq(FastaDB_Key *fdbk, FastaDB_Mask mask){
513 register FastaDB_Seq *fdbs;
514 register CompoundFile_Location *orig;
515 register Sequence *revcomp_sequence;
516 g_assert(fdbk);
517 orig = CompoundFile_Location_tell(fdbk->source->cf);
518 CompoundFile_Location_seek(fdbk->location);
519 fdbs = FastaDB_next(fdbk->source, mask);
520 g_assert(fdbs);
521 CompoundFile_Location_seek(orig);
522 CompoundFile_Location_destroy(orig);
523 if(fdbk->strand == Sequence_Strand_REVCOMP){
524 g_assert(fdbs->seq->strand == Sequence_Strand_FORWARD);
525 fdbs->seq->strand = Sequence_Strand_REVCOMP;
526 revcomp_sequence = Sequence_revcomp(fdbs->seq);
527 Sequence_destroy(fdbs->seq);
528 fdbs->seq = revcomp_sequence;
529 }
530 return fdbs;
531 }
532
FastaDB_Key_get_def(FastaDB_Key * fdbk)533 gchar *FastaDB_Key_get_def(FastaDB_Key *fdbk){
534 register gint ch;
535 register gchar *def = NULL;
536 register GString *s;
537 #ifdef USE_PTHREADS
538 pthread_mutex_lock(&fdbk->source->cf->compoundfile_mutex);
539 #endif /* USE_PTHREADS */
540 CompoundFile_Location_seek(fdbk->location);
541 while((ch = CompoundFile_getc(fdbk->source->cf)) != EOF)
542 if((ch == '\n') || (ch == ' ') || (ch == '\t'))
543 break;
544 if(ch == '\n'){
545 #ifdef USE_PTHREADS
546 pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
547 #endif /* USE_PTHREADS */
548 return NULL;
549 }
550 s = g_string_sized_new(64);
551 while((ch = CompoundFile_getc(fdbk->source->cf)) != EOF){
552 if(ch == '\n')
553 break;
554 s = g_string_append_c(s, ch);
555 }
556 #ifdef USE_PTHREADS
557 pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
558 #endif /* USE_PTHREADS */
559 def = s->str;
560 g_string_free(s, FALSE);
561 return def;
562 }
563
564 /**/
565
FastaDB_SparseCache_get_func_8_BIT(gint pos,gpointer page_data,gpointer user_data)566 static gpointer FastaDB_SparseCache_get_func_8_BIT(gint pos, gpointer page_data,
567 gpointer user_data){
568 return GINT_TO_POINTER((gint)((gchar*)page_data)[pos]);
569 }
570
FastaDB_SparseCache_copy_func_8_BIT(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)571 static gint FastaDB_SparseCache_copy_func_8_BIT(gint start, gint length,
572 gchar *dst, gpointer page_data,
573 gpointer user_data){
574 register gint i;
575 register gchar *str = page_data;
576 for(i = 0; i < length; i++)
577 dst[i] = str[start+i];
578 return length;
579 }
580
FastaDB_SparseCache_get_func_0_BIT_LC(gint pos,gpointer page_data,gpointer user_data)581 static gpointer FastaDB_SparseCache_get_func_0_BIT_LC(gint pos,
582 gpointer page_data,
583 gpointer user_data){
584 return GINT_TO_POINTER((gint)'n');
585 }
586
FastaDB_SparseCache_copy_func_0_BIT_LC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)587 static gint FastaDB_SparseCache_copy_func_0_BIT_LC(gint start, gint length,
588 gchar *dst, gpointer page_data,
589 gpointer user_data){
590 register gint i;
591 for(i = 0; i < length; i++)
592 dst[i] = 'n';
593 return length;
594 }
595
FastaDB_SparseCache_get_func_0_BIT_UC(gint pos,gpointer page_data,gpointer user_data)596 static gpointer FastaDB_SparseCache_get_func_0_BIT_UC(gint pos,
597 gpointer page_data,
598 gpointer user_data){
599 return GINT_TO_POINTER((gint)'N');
600 }
601
FastaDB_SparseCache_copy_func_0_BIT_UC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)602 static gint FastaDB_SparseCache_copy_func_0_BIT_UC(gint start, gint length,
603 gchar *dst, gpointer page_data,
604 gpointer user_data){
605 register gint i;
606 for(i = 0; i < length; i++)
607 dst[i] = 'N';
608 return length;
609 }
610
611 typedef enum {
612 FastaDB_SparseCache_Mask_0_BIT_LC, /* [n] */
613 FastaDB_SparseCache_Mask_0_BIT_UC, /* [N] */
614 FastaDB_SparseCache_Mask_2_BIT_LC, /* [acgt] */
615 FastaDB_SparseCache_Mask_2_BIT_UC, /* [ACGT] */
616 FastaDB_SparseCache_Mask_4_BIT, /* [ACGTNacgtn] */
617 FastaDB_SparseCache_Mask_8_BIT
618 } FastaDB_SparseCache_Mask;
619
FastaDB_SparseCache_fill_mask(FastaDB_SparseCache_Mask * mask_index)620 static void FastaDB_SparseCache_fill_mask(FastaDB_SparseCache_Mask *mask_index){
621 register gint i;
622 for(i = 0; i < ALPHABET_SIZE; i++)
623 mask_index[i] = (1 << FastaDB_SparseCache_Mask_8_BIT);
624 mask_index['N'] = (1 << FastaDB_SparseCache_Mask_0_BIT_UC);
625 mask_index['n'] = (1 << FastaDB_SparseCache_Mask_0_BIT_LC);
626 mask_index['A'] =
627 mask_index['C'] =
628 mask_index['G'] =
629 mask_index['T'] = (1 << FastaDB_SparseCache_Mask_2_BIT_UC);
630 mask_index['a'] =
631 mask_index['c'] =
632 mask_index['g'] =
633 mask_index['t'] = (1 << FastaDB_SparseCache_Mask_2_BIT_LC);
634 return;
635 }
636
637 /**/
638
FastaDB_SparseCache_fill_index(gint * index,gchar * alphabet)639 static void FastaDB_SparseCache_fill_index(gint *index, gchar *alphabet){
640 register gint i;
641 for(i = 0; i < ALPHABET_SIZE; i++)
642 index[i] = -1;
643 for(i = 0; alphabet[i]; i++)
644 index[(guchar)alphabet[i]] = i;
645 return;
646 }
647
FastaDB_SparseCache_get_func_4_BIT(gint pos,gpointer page_data,gpointer user_data)648 static gpointer FastaDB_SparseCache_get_func_4_BIT(gint pos,
649 gpointer page_data,
650 gpointer user_data){
651 register gint ch = ((gchar*)page_data)[(pos >> 1)],
652 num = (ch >> ((pos & 1) << 2)) & 15;
653 static gchar *alphabet = "ACGTNacgtn------";
654 return GINT_TO_POINTER((gint)alphabet[num]);
655 }
656
FastaDB_SparseCache_copy_func_4_BIT(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)657 static gint FastaDB_SparseCache_copy_func_4_BIT(gint start, gint length,
658 gchar *dst, gpointer page_data,
659 gpointer user_data){
660 #ifndef G_DISABLE_ASSERT
661 register gint i;
662 #endif /* G_DISABLE_ASSERT */
663 register gint pos = 0, end = length;
664 register gchar *str = page_data, *word;
665 static gchar *table[256] = {
666 "AA", "CA", "GA", "TA", "NA", "aA", "cA", "gA",
667 "tA", "nA", "-A", "-A", "-A", "-A", "-A", "-A",
668 "AC", "CC", "GC", "TC", "NC", "aC", "cC", "gC",
669 "tC", "nC", "-C", "-C", "-C", "-C", "-C", "-C",
670 "AG", "CG", "GG", "TG", "NG", "aG", "cG", "gG",
671 "tG", "nG", "-G", "-G", "-G", "-G", "-G", "-G",
672 "AT", "CT", "GT", "TT", "NT", "aT", "cT", "gT",
673 "tT", "nT", "-T", "-T", "-T", "-T", "-T", "-T",
674 "AN", "CN", "GN", "TN", "NN", "aN", "cN", "gN",
675 "tN", "nN", "-N", "-N", "-N", "-N", "-N", "-N",
676 "Aa", "Ca", "Ga", "Ta", "Na", "aa", "ca", "ga",
677 "ta", "na", "-a", "-a", "-a", "-a", "-a", "-a",
678 "Ac", "Cc", "Gc", "Tc", "Nc", "ac", "cc", "gc",
679 "tc", "nc", "-c", "-c", "-c", "-c", "-c", "-c",
680 "Ag", "Cg", "Gg", "Tg", "Ng", "ag", "cg", "gg",
681 "tg", "ng", "-g", "-g", "-g", "-g", "-g", "-g",
682 "At", "Ct", "Gt", "Tt", "Nt", "at", "ct", "gt",
683 "tt", "nt", "-t", "-t", "-t", "-t", "-t", "-t",
684 "An", "Cn", "Gn", "Tn", "Nn", "an", "cn", "gn",
685 "tn", "nn", "-n", "-n", "-n", "-n", "-n", "-n",
686 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
687 "t-", "n-", "--", "--", "--", "--", "--", "--",
688 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
689 "t-", "n-", "--", "--", "--", "--", "--", "--",
690 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
691 "t-", "n-", "--", "--", "--", "--", "--", "--",
692 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
693 "t-", "n-", "--", "--", "--", "--", "--", "--",
694 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
695 "t-", "n-", "--", "--", "--", "--", "--", "--",
696 "A-", "C-", "G-", "T-", "N-", "a-", "c-", "g-",
697 "t-", "n-", "--", "--", "--", "--", "--", "--"
698 };
699 g_assert(length);
700 if(start&1){ /* If odd start, get first base */
701 dst[0] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(start,
702 page_data, user_data));
703 pos = 1;
704 }
705 if((length > 2) && ((start+length)&1)){ /* If odd end, get last base */
706 dst[length-1] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(
707 start+length-1, page_data, user_data));
708 end--;
709 }
710 while(pos < end){
711 word = table[(guchar)str[(start+pos)>>1]];
712 g_assert(word[0] != '-');
713 g_assert(word[1] != '-');
714 dst[pos++] = word[0];
715 dst[pos++] = word[1];
716 }
717 #ifndef G_DISABLE_ASSERT
718 for(i = 0; i < length; i++){
719 g_assert(dst[i]
720 == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_4_BIT(
721 start+i, page_data, user_data)));
722 }
723 #endif /* G_DISABLE_ASSERT */
724 return length;
725 }
726
FastaDB_SparseCache_compress_4bit(SparseCache_Page * page,gint len)727 static void FastaDB_SparseCache_compress_4bit(SparseCache_Page *page,
728 gint len){
729 register gint i;
730 register gchar *seq = page->data;
731 static gint index[ALPHABET_SIZE];
732 static gboolean index_is_filled = FALSE;
733 if(!index_is_filled){
734 FastaDB_SparseCache_fill_index(index, "ACGTNacgtn");
735 index_is_filled = TRUE;
736 }
737 page->data = g_new0(gchar, (len >> 1)+1);
738 page->data_size = sizeof(gchar)*((len >> 1)+1);
739 for(i = 0; i < len; i++)
740 ((gchar*)page->data)[(i >> 1)]
741 |= (index[(guchar)seq[i]] << ((i&1) << 2));
742 g_free(seq);
743 page->get_func = FastaDB_SparseCache_get_func_4_BIT;
744 page->copy_func = FastaDB_SparseCache_copy_func_4_BIT;
745 return;
746 }
747
748 /**/
749
FastaDB_SparseCache_get_func_2_BIT_LC(gint pos,gpointer page_data,gpointer user_data)750 static gpointer FastaDB_SparseCache_get_func_2_BIT_LC(gint pos,
751 gpointer page_data,
752 gpointer user_data){
753 register gint ch = ((gchar*)page_data)[(pos >> 2)],
754 num = (ch >> ((pos & 3) << 1)) & 3;
755 static gchar *alphabet = "acgt";
756 return GINT_TO_POINTER((gint)alphabet[num]);
757 }
758
FastaDB_SparseCache_copy_func_2_BIT_LC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)759 static gint FastaDB_SparseCache_copy_func_2_BIT_LC(gint start, gint length,
760 gchar *dst, gpointer page_data,
761 gpointer user_data){
762 #ifndef G_DISABLE_ASSERT
763 register gint i;
764 #endif /* G_DISABLE_ASSERT */
765 register gint pos = 0, end = length;
766 register gchar *str = page_data, *word;
767 gchar *table[256] = {
768 "aaaa", "caaa", "gaaa", "taaa", "acaa", "ccaa", "gcaa", "tcaa",
769 "agaa", "cgaa", "ggaa", "tgaa", "ataa", "ctaa", "gtaa", "ttaa",
770 "aaca", "caca", "gaca", "taca", "acca", "ccca", "gcca", "tcca",
771 "agca", "cgca", "ggca", "tgca", "atca", "ctca", "gtca", "ttca",
772 "aaga", "caga", "gaga", "taga", "acga", "ccga", "gcga", "tcga",
773 "agga", "cgga", "ggga", "tgga", "atga", "ctga", "gtga", "ttga",
774 "aata", "cata", "gata", "tata", "acta", "ccta", "gcta", "tcta",
775 "agta", "cgta", "ggta", "tgta", "atta", "ctta", "gtta", "ttta",
776 "aaac", "caac", "gaac", "taac", "acac", "ccac", "gcac", "tcac",
777 "agac", "cgac", "ggac", "tgac", "atac", "ctac", "gtac", "ttac",
778 "aacc", "cacc", "gacc", "tacc", "accc", "cccc", "gccc", "tccc",
779 "agcc", "cgcc", "ggcc", "tgcc", "atcc", "ctcc", "gtcc", "ttcc",
780 "aagc", "cagc", "gagc", "tagc", "acgc", "ccgc", "gcgc", "tcgc",
781 "aggc", "cggc", "gggc", "tggc", "atgc", "ctgc", "gtgc", "ttgc",
782 "aatc", "catc", "gatc", "tatc", "actc", "cctc", "gctc", "tctc",
783 "agtc", "cgtc", "ggtc", "tgtc", "attc", "cttc", "gttc", "tttc",
784 "aaag", "caag", "gaag", "taag", "acag", "ccag", "gcag", "tcag",
785 "agag", "cgag", "ggag", "tgag", "atag", "ctag", "gtag", "ttag",
786 "aacg", "cacg", "gacg", "tacg", "accg", "cccg", "gccg", "tccg",
787 "agcg", "cgcg", "ggcg", "tgcg", "atcg", "ctcg", "gtcg", "ttcg",
788 "aagg", "cagg", "gagg", "tagg", "acgg", "ccgg", "gcgg", "tcgg",
789 "aggg", "cggg", "gggg", "tggg", "atgg", "ctgg", "gtgg", "ttgg",
790 "aatg", "catg", "gatg", "tatg", "actg", "cctg", "gctg", "tctg",
791 "agtg", "cgtg", "ggtg", "tgtg", "attg", "cttg", "gttg", "tttg",
792 "aaat", "caat", "gaat", "taat", "acat", "ccat", "gcat", "tcat",
793 "agat", "cgat", "ggat", "tgat", "atat", "ctat", "gtat", "ttat",
794 "aact", "cact", "gact", "tact", "acct", "ccct", "gcct", "tcct",
795 "agct", "cgct", "ggct", "tgct", "atct", "ctct", "gtct", "ttct",
796 "aagt", "cagt", "gagt", "tagt", "acgt", "ccgt", "gcgt", "tcgt",
797 "aggt", "cggt", "gggt", "tggt", "atgt", "ctgt", "gtgt", "ttgt",
798 "aatt", "catt", "gatt", "tatt", "actt", "cctt", "gctt", "tctt",
799 "agtt", "cgtt", "ggtt", "tgtt", "attt", "cttt", "gttt", "tttt",
800 };
801 while((start+pos)&3){
802 dst[pos] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
803 start+pos, page_data, user_data));
804 pos++;
805 }
806 if(length > 4){
807 while(end&3){
808 dst[end-start-1]
809 = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
810 end-1, page_data, user_data));
811 end--;
812 }
813 }
814 while(pos < end){
815 word = table[(guchar)str[(start+pos)>>2]];
816 dst[pos++] = word[0];
817 dst[pos++] = word[1];
818 dst[pos++] = word[2];
819 dst[pos++] = word[3];
820 }
821 #ifndef G_DISABLE_ASSERT
822 for(i = 0; i < length; i++){
823 g_assert(dst[i]
824 == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_LC(
825 start+i, page_data, user_data)));
826 }
827 #endif /* G_DISABLE_ASSERT */
828 return length;
829 }
830
FastaDB_SparseCache_compress_2bit_lc(SparseCache_Page * page,gint len)831 static void FastaDB_SparseCache_compress_2bit_lc(SparseCache_Page *page,
832 gint len){
833 register gint i;
834 register gchar *seq = page->data;
835 static gint index[ALPHABET_SIZE];
836 static gboolean index_is_filled = FALSE;
837 if(!index_is_filled){
838 FastaDB_SparseCache_fill_index(index, "acgt");
839 index_is_filled = TRUE;
840 }
841 page->data = g_new0(gchar, (len >> 2)+1);
842 page->data_size = sizeof(gchar)*((len >> 2)+1);
843 for(i = 0; i < len; i++)
844 ((gchar*)page->data)[(i >> 2)] |= (index[(guchar)seq[i]] << ((i&3) << 1));
845 g_free(seq);
846 page->get_func = FastaDB_SparseCache_get_func_2_BIT_LC;
847 page->copy_func = FastaDB_SparseCache_copy_func_2_BIT_LC;
848 return;
849 }
850
FastaDB_SparseCache_get_func_2_BIT_UC(gint pos,gpointer page_data,gpointer user_data)851 static gpointer FastaDB_SparseCache_get_func_2_BIT_UC(gint pos,
852 gpointer page_data,
853 gpointer user_data){
854 register gint ch = ((gchar*)page_data)[(pos >> 2)],
855 num = (ch >> ((pos & 3) << 1)) & 3;
856 static gchar *alphabet = "ACGT";
857 return GINT_TO_POINTER((gint)alphabet[num]);
858 }
859
FastaDB_SparseCache_copy_func_2_BIT_UC(gint start,gint length,gchar * dst,gpointer page_data,gpointer user_data)860 static gint FastaDB_SparseCache_copy_func_2_BIT_UC(gint start, gint length,
861 gchar *dst, gpointer page_data,
862 gpointer user_data){
863 #ifndef G_DISABLE_ASSERT
864 register gint i;
865 #endif /* G_DISABLE_ASSERT */
866 register gint pos = 0, end = length;
867 register gchar *str = page_data, *word;
868 gchar *table[256] = {
869 "AAAA", "CAAA", "GAAA", "TAAA", "ACAA", "CCAA", "GCAA", "TCAA",
870 "AGAA", "CGAA", "GGAA", "TGAA", "ATAA", "CTAA", "GTAA", "TTAA",
871 "AACA", "CACA", "GACA", "TACA", "ACCA", "CCCA", "GCCA", "TCCA",
872 "AGCA", "CGCA", "GGCA", "TGCA", "ATCA", "CTCA", "GTCA", "TTCA",
873 "AAGA", "CAGA", "GAGA", "TAGA", "ACGA", "CCGA", "GCGA", "TCGA",
874 "AGGA", "CGGA", "GGGA", "TGGA", "ATGA", "CTGA", "GTGA", "TTGA",
875 "AATA", "CATA", "GATA", "TATA", "ACTA", "CCTA", "GCTA", "TCTA",
876 "AGTA", "CGTA", "GGTA", "TGTA", "ATTA", "CTTA", "GTTA", "TTTA",
877 "AAAC", "CAAC", "GAAC", "TAAC", "ACAC", "CCAC", "GCAC", "TCAC",
878 "AGAC", "CGAC", "GGAC", "TGAC", "ATAC", "CTAC", "GTAC", "TTAC",
879 "AACC", "CACC", "GACC", "TACC", "ACCC", "CCCC", "GCCC", "TCCC",
880 "AGCC", "CGCC", "GGCC", "TGCC", "ATCC", "CTCC", "GTCC", "TTCC",
881 "AAGC", "CAGC", "GAGC", "TAGC", "ACGC", "CCGC", "GCGC", "TCGC",
882 "AGGC", "CGGC", "GGGC", "TGGC", "ATGC", "CTGC", "GTGC", "TTGC",
883 "AATC", "CATC", "GATC", "TATC", "ACTC", "CCTC", "GCTC", "TCTC",
884 "AGTC", "CGTC", "GGTC", "TGTC", "ATTC", "CTTC", "GTTC", "TTTC",
885 "AAAG", "CAAG", "GAAG", "TAAG", "ACAG", "CCAG", "GCAG", "TCAG",
886 "AGAG", "CGAG", "GGAG", "TGAG", "ATAG", "CTAG", "GTAG", "TTAG",
887 "AACG", "CACG", "GACG", "TACG", "ACCG", "CCCG", "GCCG", "TCCG",
888 "AGCG", "CGCG", "GGCG", "TGCG", "ATCG", "CTCG", "GTCG", "TTCG",
889 "AAGG", "CAGG", "GAGG", "TAGG", "ACGG", "CCGG", "GCGG", "TCGG",
890 "AGGG", "CGGG", "GGGG", "TGGG", "ATGG", "CTGG", "GTGG", "TTGG",
891 "AATG", "CATG", "GATG", "TATG", "ACTG", "CCTG", "GCTG", "TCTG",
892 "AGTG", "CGTG", "GGTG", "TGTG", "ATTG", "CTTG", "GTTG", "TTTG",
893 "AAAT", "CAAT", "GAAT", "TAAT", "ACAT", "CCAT", "GCAT", "TCAT",
894 "AGAT", "CGAT", "GGAT", "TGAT", "ATAT", "CTAT", "GTAT", "TTAT",
895 "AACT", "CACT", "GACT", "TACT", "ACCT", "CCCT", "GCCT", "TCCT",
896 "AGCT", "CGCT", "GGCT", "TGCT", "ATCT", "CTCT", "GTCT", "TTCT",
897 "AAGT", "CAGT", "GAGT", "TAGT", "ACGT", "CCGT", "GCGT", "TCGT",
898 "AGGT", "CGGT", "GGGT", "TGGT", "ATGT", "CTGT", "GTGT", "TTGT",
899 "AATT", "CATT", "GATT", "TATT", "ACTT", "CCTT", "GCTT", "TCTT",
900 "AGTT", "CGTT", "GGTT", "TGTT", "ATTT", "CTTT", "GTTT", "TTTT",
901 };
902 while((start+pos)&3){
903 dst[pos] = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(
904 start+pos, page_data, user_data));
905 pos++;
906 }
907 if(length > 4){
908 while(end&3){
909 dst[end-start-1]
910 = GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(end-1,
911 page_data, user_data));
912 end--;
913 }
914 }
915 while(pos < end){
916 word = table[(guchar)str[(start+pos)>>2]];
917 dst[pos++] = word[0];
918 dst[pos++] = word[1];
919 dst[pos++] = word[2];
920 dst[pos++] = word[3];
921 }
922 #ifndef G_DISABLE_ASSERT
923 for(i = 0; i < length; i++){
924 g_assert(dst[i]
925 == GPOINTER_TO_INT(FastaDB_SparseCache_get_func_2_BIT_UC(
926 start+i, page_data, user_data)));
927 }
928 #endif /* G_DISABLE_ASSERT */
929 return length;
930 }
931
FastaDB_SparseCache_compress_2bit_uc(SparseCache_Page * page,gint len)932 static void FastaDB_SparseCache_compress_2bit_uc(SparseCache_Page *page,
933 gint len){
934 register gint i;
935 register gchar *seq = page->data;
936 static gint index[ALPHABET_SIZE];
937 static gboolean index_is_filled = FALSE;
938 if(!index_is_filled){
939 FastaDB_SparseCache_fill_index(index, "ACGT");
940 index_is_filled = TRUE;
941 }
942 page->data = g_new0(gchar, (len >> 2)+1);
943 page->data_size = sizeof(gchar)*((len >> 2)+1);
944 for(i = 0; i < len; i++)
945 ((gchar*)page->data)[(i >> 2)] |= (index[(guchar)seq[i]] << ((i&3) << 1));
946 g_free(seq);
947 page->get_func = FastaDB_SparseCache_get_func_2_BIT_UC;
948 page->copy_func = FastaDB_SparseCache_copy_func_2_BIT_UC;
949 return;
950 }
951
952 /**/
953
FastaDB_SparseCache_compress(SparseCache_Page * page,gint len)954 void FastaDB_SparseCache_compress(SparseCache_Page *page, gint len){
955 register gint i, mask = 0;
956 register gchar *seq = page->data;
957 static FastaDB_SparseCache_Mask mask_index[ALPHABET_SIZE];
958 static gboolean mask_is_filled = FALSE;
959 if(!mask_is_filled){
960 FastaDB_SparseCache_fill_mask(mask_index);
961 mask_is_filled = TRUE;
962 }
963 for(i = 0; i < len; i++)
964 mask |= mask_index[(guchar)seq[i]];
965 if(mask & (1 << FastaDB_SparseCache_Mask_8_BIT)){
966 page->copy_func = FastaDB_SparseCache_copy_func_8_BIT;
967 return; /* Keep current 8 bit encoding */
968 }
969 if(mask & (mask-1)){ /* more than one mask bit is set */
970 FastaDB_SparseCache_compress_4bit(page, len);
971 return;
972 }
973 if(mask & (1 << FastaDB_SparseCache_Mask_2_BIT_LC)){
974 FastaDB_SparseCache_compress_2bit_lc(page, len);
975 return;
976 }
977 if(mask & (1 << FastaDB_SparseCache_Mask_2_BIT_UC)){
978 FastaDB_SparseCache_compress_2bit_uc(page, len);
979 return;
980 }
981 if(mask & (1 << FastaDB_SparseCache_Mask_0_BIT_LC)){
982 g_free(page->data);
983 page->data = NULL;
984 page->data_size = 0;
985 page->get_func = FastaDB_SparseCache_get_func_0_BIT_LC;
986 page->copy_func = FastaDB_SparseCache_copy_func_0_BIT_LC;
987 return;
988 }
989 if(mask & (1 << FastaDB_SparseCache_Mask_0_BIT_UC)){
990 g_free(page->data);
991 page->data = NULL;
992 page->data_size = 0;
993 page->get_func = FastaDB_SparseCache_get_func_0_BIT_UC;
994 page->copy_func = FastaDB_SparseCache_copy_func_0_BIT_UC;
995 return;
996 }
997 g_error("Bad FastaDB_SparseCache compression mask");
998 return;
999 }
1000
FastaDB_SparseCache_fill_func(gint start,gpointer user_data)1001 static SparseCache_Page *FastaDB_SparseCache_fill_func(gint start,
1002 gpointer user_data){
1003 register SparseCache_Page *page = g_new(SparseCache_Page, 1);
1004 register FastaDB_Key *fdbk = user_data;
1005 register gint ch, pos = 0, len;
1006 register gint db_start, db_end;
1007 if(fdbk->source->line_length < 1)
1008 g_error("Unknown or irregular fasta line length");
1009 db_start = fdbk->seq_offset
1010 + start
1011 + (start / (fdbk->source->line_length));
1012 len = MIN(SparseCache_PAGE_SIZE, fdbk->length-start);
1013 db_end = db_start
1014 + len
1015 + (len / (fdbk->source->line_length));
1016 #ifdef USE_PTHREADS
1017 pthread_mutex_lock(&fdbk->source->cf->compoundfile_mutex);
1018 #endif /* USE_PTHREADS */
1019 fdbk->location->pos += db_start;
1020 CompoundFile_Location_seek(fdbk->location);
1021 fdbk->location->pos -= db_start;
1022 page->data = g_new(gchar, len);
1023 page->get_func = FastaDB_SparseCache_get_func_8_BIT;
1024 page->copy_func = NULL; /* FIXME: temp */
1025 page->data_size = sizeof(gchar)*len;
1026 do {
1027 ch = CompoundFile_getc(fdbk->source->cf);
1028 g_assert(ch != EOF);
1029 g_assert(pos < len);
1030 if(!isspace(ch))
1031 ((gchar*)page->data)[pos++] = ch;
1032 } while(pos < len);
1033 #ifdef USE_PTHREADS
1034 pthread_mutex_unlock(&fdbk->source->cf->compoundfile_mutex);
1035 #endif /* USE_PTHREADS */
1036 FastaDB_SparseCache_compress(page, len);
1037 return page;
1038 }
1039 /* FIXME: this is really inefficient: should use unbuffered read/write
1040 * do this by adding CompoundFile_read()
1041 */
1042 /* FIXME add suppport for compressed pages etc */
1043
FastaDB_Key_get_SparseCache(FastaDB_Key * fdbk)1044 SparseCache *FastaDB_Key_get_SparseCache(FastaDB_Key *fdbk){
1045 register SparseCache *cache = SparseCache_create(fdbk->length,
1046 FastaDB_SparseCache_fill_func, NULL, NULL, fdbk);
1047 return cache;
1048 }
1049
1050 /**/
1051
FastaDB_all(gchar * path,Alphabet * alphabet,FastaDB_Mask mask,guint * total)1052 FastaDB_Seq **FastaDB_all(gchar *path, Alphabet *alphabet,
1053 FastaDB_Mask mask, guint *total){
1054 register FastaDB *fdb = FastaDB_open(path, alphabet);
1055 register GPtrArray *ptra = g_ptr_array_new();
1056 register FastaDB_Seq *fdbs, **fdbsa;
1057 register guint numseqs = 0;
1058 while((fdbs = FastaDB_next(fdb, mask))){
1059 g_ptr_array_add(ptra, fdbs);
1060 numseqs++;
1061 }
1062 g_ptr_array_add(ptra, NULL); /* NULL delimit */
1063 if(total)
1064 *total = numseqs; /* Record number of sequences */
1065 fdbsa = (FastaDB_Seq**)ptra->pdata;
1066 g_ptr_array_free(ptra, FALSE); /* Leave data intact */
1067 FastaDB_close(fdb);
1068 return fdbsa;
1069 }
1070
FastaDB_Seq_all_destroy(FastaDB_Seq ** fdbs)1071 void FastaDB_Seq_all_destroy(FastaDB_Seq **fdbs){
1072 register gint i;
1073 g_assert(fdbs);
1074 for(i = 0; fdbs[i]; i++)
1075 FastaDB_Seq_destroy(fdbs[i]);
1076 g_free(fdbs);
1077 return;
1078 }
1079
FastaDB_Seq_print(FastaDB_Seq * fdbs,FILE * fp,FastaDB_Mask mask)1080 gint FastaDB_Seq_print(FastaDB_Seq *fdbs, FILE *fp, FastaDB_Mask mask){
1081 register gint written = 0;
1082 if(mask & (FastaDB_Mask_ID|FastaDB_Mask_DEF|FastaDB_Mask_LEN)){
1083 written += fprintf(fp, ">%s", (mask & FastaDB_Mask_ID)
1084 ?fdbs->seq->id:"[unknown]");
1085 if(fdbs->seq->def && (mask & FastaDB_Mask_DEF))
1086 written += fprintf(fp, " %s", fdbs->seq->def);
1087 if(mask & FastaDB_Mask_LEN)
1088 written += fprintf(fp, " [len:%d]", fdbs->seq->len);
1089 written += fprintf(fp, "\n");
1090 }
1091 if(mask & FastaDB_Mask_SEQ) /* to print seq, length must be set */
1092 written += Sequence_print_fasta_block(fdbs->seq, fp);
1093 return written;
1094 }
1095
FastaDB_Seq_all_print(FastaDB_Seq ** fdbs,FILE * fp,FastaDB_Mask mask)1096 gint FastaDB_Seq_all_print(FastaDB_Seq **fdbs, FILE *fp,
1097 FastaDB_Mask mask){
1098 register int i, written = 0;
1099 for(i = 0; fdbs[i]; i++)
1100 written += FastaDB_Seq_print(fdbs[i], fp, mask);
1101 return written;
1102 }
1103
FastaDB_get_single(gchar * path,Alphabet * alphabet)1104 FastaDB_Seq *FastaDB_get_single(gchar *path, Alphabet *alphabet){
1105 register FastaDB *fdb = FastaDB_open(path, alphabet);
1106 register FastaDB_Seq *fdbs = FastaDB_next(fdb, FastaDB_Mask_ALL);
1107 if(!fdbs)
1108 g_error("No sequences found in [%s]\n", path);
1109 FastaDB_close(fdb);
1110 return fdbs;
1111 }
1112
FastaDB_guess_type(gchar * path)1113 Alphabet_Type FastaDB_guess_type(gchar *path){
1114 register Alphabet *alphabet = Alphabet_create(Alphabet_Type_UNKNOWN,
1115 FALSE);
1116 register FastaDB_Seq *fdbs = FastaDB_get_single(path, alphabet);
1117 register gchar *str = Sequence_get_str(fdbs->seq);
1118 register Alphabet_Type type = Alphabet_Type_guess(str);
1119 g_free(str);
1120 FastaDB_Seq_destroy(fdbs);
1121 Alphabet_destroy(alphabet);
1122 return type;
1123 }
1124
1125