1 /****************************************************************\
2 *                                                                *
3 *  Library for manipulation of FASTA format databases            *
4 *                                                                *
5 *  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
6 *  Copyright (C) 2000-2009.  All Rights Reserved.                *
7 *                                                                *
8 *  This source code is distributed under the terms of the        *
9 *  GNU General Public License, version 3. See the file COPYING   *
10 *  or http://www.gnu.org/licenses/gpl.txt for details            *
11 *                                                                *
12 *  If you use this code, please keep this notice intact.         *
13 *                                                                *
14 \****************************************************************/
15 
16 #ifndef INCLUDED_FASTADB_H
17 #define INCLUDED_FASTADB_H
18 
19 #ifdef __cplusplus
20 extern "C" {
21 #endif /* __cplusplus */
22 
23 #include <stdio.h>
24 #include <glib.h>
25 
26 #include "compoundfile.h"
27 #include "sequence.h"
28 #include "argument.h"
29 #include "sparsecache.h"
30 
/* Command-line argument set for the FastaDB module.
 * It carries a single string option, suffix_filter.
 * NOTE(review): presumably a filename-suffix filter applied when
 * paths are opened -- confirm against the implementation.
 */
31 typedef struct {
32     gchar *suffix_filter;
33 } FastaDB_ArgumentSet;
34 
/* Returns the FastaDB_ArgumentSet associated with the given Argument.
 * NOTE(review): ownership and lifetime of the returned pointer are not
 * visible in this header -- confirm before freeing it.
 */
35 FastaDB_ArgumentSet *FastaDB_ArgumentSet_create(Argument *arg);
36 
/* Bit flags passed to the functions below that take a FastaDB_Mask.
 * Each named flag occupies its own bit, so flags can be OR-ed together;
 * FastaDB_Mask_ALL has every bit set.  Bit 0 (1<<0) is not assigned.
 * NOTE(review): the flags presumably select which parts of a record
 * (identifier, definition line, sequence, length) are handled --
 * confirm against the implementation.
 */
37 typedef enum {
38     FastaDB_Mask_ID  = (1<<1),
39     FastaDB_Mask_DEF = (1<<2),
40     FastaDB_Mask_SEQ = (1<<3),
41     FastaDB_Mask_LEN = (1<<4),
42     FastaDB_Mask_ALL = (~0)
43 } FastaDB_Mask;
44 
/* Handle on a FASTA database backed by a CompoundFile.
 *   ref_count         reference counter
 *   alphabet          Alphabet associated with this database
 *   cf                underlying CompoundFile
 *   out_buffer        character buffer, with out_buffer_pos and
 *                     out_buffer_alloc as its position and allocation
 *                     counters (NOTE(review): exact use is not visible
 *                     in this header -- confirm in the implementation)
 *   line_length       sequence line length used for random access
 */
45 typedef struct FastaDB {
46             guint  ref_count;
47          Alphabet *alphabet;
48      CompoundFile *cf;
49             gchar *out_buffer;
50             guint  out_buffer_pos;
51             guint  out_buffer_alloc;
52              gint  line_length;
53 } FastaDB;
54 /* line_length is used for fasta file random-access
55  * it is set to zero for irregular line lengths
56  * it should not be used until the entire file has been parsed.
57  */
58 
/* A single sequence record tied to the FastaDB it came from.
 *   ref_count   reference counter
 *   source      FastaDB this record belongs to
 *   location    CompoundFile_Location of the record
 *   seq         the Sequence object
 * NOTE(review): which Sequence fields are populated presumably depends
 * on the FastaDB_Mask used when the record was read -- confirm.
 */
59 typedef struct {
60                     guint  ref_count;
61                   FastaDB *source;
62     CompoundFile_Location *location;
63                  Sequence *seq;
64 } FastaDB_Seq;
65 
/* Callback type accepted by FastaDB_traverse().
 * It receives a FastaDB_Seq and the caller's opaque user_data pointer,
 * and returns a gboolean.
 */
66 typedef gboolean (*FastaDB_TraverseFunc)(FastaDB_Seq *fdbs,
67                                          gpointer user_data);
68 /* Return TRUE to stop the traversal */
69 
/* Database lifecycle and iteration.
 * NOTE(review): the implementations are not visible from this header;
 * the notes below are inferred from names and signatures and should be
 * confirmed against fastadb.c.
 */
/* Open a database over the paths in path_list with the given alphabet. */
70     FastaDB *FastaDB_open_list(GPtrArray *path_list,
71                                Alphabet *alphabet);
/* As FastaDB_open_list(), with chunk_id / chunk_total presumably
 * restricting the handle to one chunk of the input -- confirm. */
72     FastaDB *FastaDB_open_list_with_limit(GPtrArray *path_list,
73              Alphabet *alphabet, gint chunk_id, gint chunk_total);
/* Open a database from a single path. */
74     FastaDB *FastaDB_open(gchar *path, Alphabet *alphabet);
/* Presumably takes another reference (see FastaDB.ref_count) -- confirm. */
75     FastaDB *FastaDB_share(FastaDB *fdb);
76     FastaDB *FastaDB_dup(FastaDB *fdb); /* For use in a separate thread */
/* Presumably releases the reference held by the caller -- confirm. */
77        void  FastaDB_close(FastaDB *fdb);
/* Presumably resets reading to the start of the database -- confirm. */
78        void  FastaDB_rewind(FastaDB *fdb);
/* Presumably TRUE once no more records remain to be read -- confirm. */
79    gboolean  FastaDB_is_finished(FastaDB *fdb);
/* Call fdtf with user_data on records of fdb; per the note on
 * FastaDB_TraverseFunc, a TRUE return from fdtf stops the traversal. */
80        void  FastaDB_traverse(FastaDB *fdb, FastaDB_Mask mask,
81                      FastaDB_TraverseFunc fdtf, gpointer user_data);
/* Returns a gsize; presumably the memory used by fdb in bytes -- confirm. */
82       gsize  FastaDB_memory_usage(FastaDB *fdb);
83 
/* Sequential access.
 * NOTE(review): behaviour inferred from names and signatures only.
 * FastaDB_next presumably returns the next record (NULL at the end?)
 * with the parts selected by mask -- confirm.
 */
84      FastaDB_Seq *FastaDB_next(FastaDB *fdb, FastaDB_Mask mask);
/* Presumably returns the position of the next record start relative
 * to pos -- confirm whether pos itself is included. */
85 CompoundFile_Pos  FastaDB_find_next_start(FastaDB *fdb,
86                                           CompoundFile_Pos pos);
87 
/* Quick format test on the file at path; the contract is stated in the
 * comment following the prototype. */
88     gboolean FastaDB_file_is_fasta(gchar *path);
89     /* Returns true if first non-whitespace character in file is '>' */
90 
/* Key identifying a sequence within a FastaDB.
 *   source      FastaDB the key refers to
 *   location    CompoundFile_Location of the record
 *   strand      Sequence_Strand for the keyed sequence
 *   seq_offset  used for random access
 *   length      used for random access
 * NOTE(review): units of seq_offset and length (bytes vs. residues) are
 * not visible in this header -- confirm.
 */
91 typedef struct {
92                 FastaDB *source;
93   CompoundFile_Location *location;
94         Sequence_Strand  strand;
95                    gint  seq_offset; /* for random access */
96                    gint  length;     /* for random access */
97 } FastaDB_Key;
98 
/* Random access: returns a FastaDB_Seq for position pos in fdb.
 * NOTE(review): presumably pos must be the start of a record and mask
 * selects which parts are loaded -- confirm in the implementation.
 */
99 FastaDB_Seq *FastaDB_fetch(FastaDB *fdb, FastaDB_Mask mask,
100                            CompoundFile_Pos pos);
101 
/* FastaDB_Key construction and lookup.
 * NOTE(review): behaviour inferred from names and signatures only;
 * confirm ownership rules (who frees returned pointers, whether source
 * and location are shared or copied) in the implementation.
 */
/* Build a key from its parts; parameters mirror the FastaDB_Key fields. */
102 FastaDB_Key *FastaDB_Key_create(FastaDB *source,
103                                 CompoundFile_Location *location,
104                                 Sequence_Strand strand,
105                                 gint seq_offset, gint length);
/* Returns a key for the given record. */
106 FastaDB_Key *FastaDB_Seq_get_key(FastaDB_Seq *fdbs);
/* Returns the record for a key; mask presumably selects the parts loaded. */
107 FastaDB_Seq *FastaDB_Key_get_seq(FastaDB_Key *fdbk, FastaDB_Mask mask);
108        void  FastaDB_Key_destroy(FastaDB_Key *fdbk);
/* Returns a gchar string; presumably the definition line -- confirm
 * whether the caller must free it. */
109       gchar *FastaDB_Key_get_def(FastaDB_Key *fdbk);
/* Returns a SparseCache for the keyed sequence. */
110 SparseCache *FastaDB_Key_get_SparseCache(FastaDB_Key *fdbk);
/* Operates on a SparseCache_Page of the given len; presumably compresses
 * the page data in place -- confirm. */
111        void  FastaDB_SparseCache_compress(SparseCache_Page *page, gint len);
112 
113 
/* Returns an array of FastaDB_Seq pointers for the file at path.
 * total is an output parameter.
 * NOTE(review): presumably *total receives the number of records, and
 * the array is NULL-terminated and released with
 * FastaDB_Seq_all_destroy() (which takes no count) -- confirm.
 */
114 FastaDB_Seq **FastaDB_all(gchar *path, Alphabet *alphabet,
115                           FastaDB_Mask mask, guint *total);
116 
/* FastaDB_Seq lifecycle.
 * NOTE(review): behaviour inferred from names and signatures only.
 * share/destroy presumably increment/decrement FastaDB_Seq.ref_count --
 * confirm in the implementation.
 */
117 FastaDB_Seq *FastaDB_Seq_share(FastaDB_Seq *fdbs);
118        void  FastaDB_Seq_destroy(FastaDB_Seq *fdbs);
/* Presumably returns a new record holding the reverse complement -- confirm. */
119 FastaDB_Seq *FastaDB_Seq_revcomp(FastaDB_Seq *fdbs);
/* Takes an array of records with no length argument; presumably the
 * array is NULL-terminated -- confirm. */
120        void  FastaDB_Seq_all_destroy(FastaDB_Seq **fdbs);
121 
/* Output of one record, or of an array of records, to the stream fp.
 * NOTE(review): the meaning of the gint return value is not visible in
 * this header (possibly a count of characters written) -- confirm.
 * mask presumably selects which parts of each record are printed.
 */
122        gint  FastaDB_Seq_print(FastaDB_Seq *fdbs, FILE *fp,
123                                FastaDB_Mask mask);
124        gint  FastaDB_Seq_all_print(FastaDB_Seq **fdbs, FILE *fp,
125                                    FastaDB_Mask mask);
126 
/* Convenience helpers that take a file path.
 * NOTE(review): behaviour inferred from names and signatures only.
 * FastaDB_get_single presumably expects the file to hold exactly one
 * sequence; FastaDB_guess_type presumably inspects file content to pick
 * an Alphabet_Type -- confirm both in the implementation.
 */
127 FastaDB_Seq *FastaDB_get_single(gchar *path, Alphabet *alphabet);
128 Alphabet_Type FastaDB_guess_type(gchar *path);
129 
130 #ifdef __cplusplus
131 }
132 #endif /* __cplusplus */
133 
134 #endif /* INCLUDED_FASTADB_H */
135 
136