1 #ifndef EDLIB_H
2 #define EDLIB_H
3 
4 /**
5  * @file
6  * @author Martin Sosic
7  * @brief Main header file, containing all public functions and structures.
8  */
9 
10 // Define EDLIB_API macro to properly export symbols
11 #ifdef EDLIB_SHARED
12 #    ifdef _WIN32
13 #        ifdef EDLIB_BUILD
14 #            define EDLIB_API __declspec(dllexport)
15 #        else
16 #            define EDLIB_API __declspec(dllimport)
17 #        endif
18 #    else
19 #        define EDLIB_API __attribute__ ((visibility ("default")))
20 #    endif
21 #else
22 #    define EDLIB_API
23 #endif
24 
25 #ifdef __cplusplus
26 extern "C" {
27 #endif
28 
29 // Status codes
30 #define EDLIB_STATUS_OK 0
31 #define EDLIB_STATUS_ERROR 1
32 
33     /**
34      * Alignment methods - how should Edlib treat gaps before and after query?
35      */
36     typedef enum {
37         /**
38          * Global method. This is the standard method.
39          * Useful when you want to find out how similar is first sequence to second sequence.
40          */
41         EDLIB_MODE_NW,
42         /**
43          * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44          * What that means is that deleting elements from the end of second sequence is "free"!
45          * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
46          * of second sequence is "free" and does not count into total edit distance. This method is appropriate
47          * when you want to find out how well first sequence fits at the beginning of second sequence.
48          */
49         EDLIB_MODE_SHW,
50         /**
51          * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are
52          * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
53          * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
54          * and GAC from the end of second sequence is "free" and does not count into total edit distance.
55          * This method is appropriate when you want to find out how well first sequence fits at any part of
56          * second sequence.
57          * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
58          * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
59          * that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
60          */
61         EDLIB_MODE_HW
62     } EdlibAlignMode;
63 
64     /**
65      * Alignment tasks - what do you want Edlib to do?
66      */
67     typedef enum {
68         EDLIB_TASK_DISTANCE,  //!< Find edit distance and end locations.
69         EDLIB_TASK_LOC,       //!< Find edit distance, end locations and start locations.
70         EDLIB_TASK_PATH       //!< Find edit distance, end locations and start locations and alignment path.
71     } EdlibAlignTask;
72 
73     /**
74      * Describes cigar format.
75      * @see http://samtools.github.io/hts-specs/SAMv1.pdf
76      * @see http://drive5.com/usearch/manual/cigar.html
77      */
78     typedef enum {
79         EDLIB_CIGAR_STANDARD,  //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
80         EDLIB_CIGAR_EXTENDED   //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
81     } EdlibCigarFormat;
82 
83 // Edit operations.
84 #define EDLIB_EDOP_MATCH 0    //!< Match.
85 #define EDLIB_EDOP_INSERT 1   //!< Insertion to target = deletion from query.
86 #define EDLIB_EDOP_DELETE 2   //!< Deletion from target = insertion to query.
87 #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
88 
89     /**
90      * @brief Defines two given characters as equal.
91      */
92     typedef struct {
93         char first;
94         char second;
95     } EdlibEqualityPair;
96 
97     /**
98      * @brief Configuration object for edlibAlign() function.
99      */
100     typedef struct {
101         /**
102          * Set k to non-negative value to tell edlib that edit distance is not larger than k.
103          * Smaller k can significantly improve speed of computation.
104          * If edit distance is larger than k, edlib will set edit distance to -1.
105          * Set k to negative value and edlib will internally auto-adjust k until score is found.
106          */
107         int k;
108 
109         /**
110          * Alignment method.
111          * EDLIB_MODE_NW: global (Needleman-Wunsch)
112          * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
113          * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
114          */
115         EdlibAlignMode mode;
116 
117         /**
118          * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is.
119          * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
120          * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
121          * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
122          */
123         EdlibAlignTask task;
124 
125         /**
126          * List of pairs of characters, where each pair defines two characters as equal.
127          * This way you can extend edlib's definition of equality (which is that each character is equal only
128          * to itself).
129          * This can be useful if you have some wildcard characters that should match multiple other characters,
130          * or e.g. if you want edlib to be case insensitive.
131          * Can be set to NULL if there are none.
132          */
133         const EdlibEqualityPair* additionalEqualities;
134 
135         /**
136          * Number of additional equalities, which is non-negative number.
137          * 0 if there are none.
138          */
139         int additionalEqualitiesLength;
140     } EdlibAlignConfig;
141 
142     /**
143      * Helper method for easy construction of configuration object.
144      * @return Configuration object filled with given parameters.
145      */
146     EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
147         int k, EdlibAlignMode mode, EdlibAlignTask task,
148         const EdlibEqualityPair* additionalEqualities,
149         int additionalEqualitiesLength
150     );
151 
152     /**
153      * @return Default configuration object, with following defaults:
154      *         k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
155      */
156     EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
157 
158 
159     /**
160      * Container for results of alignment done by edlibAlign() function.
161      */
162     typedef struct {
163         /**
164          * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
165          */
166         int status;
167 
168         /**
169          * -1 if k is non-negative and edit distance is larger than k.
170          */
171         int editDistance;
172 
173         /**
174          * Array of zero-based positions in target where optimal alignment paths end.
175          * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
176          * Set to NULL if edit distance is larger than k.
177          * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
178          */
179         int* endLocations;
180 
181         /**
182          * Array of zero-based positions in target where optimal alignment paths start,
183          * they correspond to endLocations.
184          * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
185          * Set to NULL if not calculated or if edit distance is larger than k.
186          * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
187          */
188         int* startLocations;
189 
190         /**
191          * Number of end (and start) locations.
192          */
193         int numLocations;
194 
195         /**
196          * Alignment is found for first pair of start and end locations.
197          * Set to NULL if not calculated.
198          * Alignment is sequence of numbers: 0, 1, 2, 3.
199          * 0 stands for match.
200          * 1 stands for insertion to target.
201          * 2 stands for insertion to query.
202          * 3 stands for mismatch.
203          * Alignment aligns query to target from begining of query till end of query.
204          * If gaps are not penalized, they are not in alignment.
205          * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
206          */
207         unsigned char* alignment;
208 
209         /**
210          * Length of alignment.
211          */
212         int alignmentLength;
213 
214         /**
215          * Number of different characters in query and target together.
216          */
217         int alphabetLength;
218     } EdlibAlignResult;
219 
220     /**
221      * Frees memory in EdlibAlignResult that was allocated by edlib.
222      * If you do not use it, make sure to free needed members manually using free().
223      */
224     EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
225 
226 
227     /**
228      * Aligns two sequences (query and target) using edit distance (levenshtein distance).
229      * Through config parameter, this function supports different alignment methods (global, prefix, infix),
230      * as well as different modes of search (tasks).
231      * It always returns edit distance and end locations of optimal alignment in target.
232      * It optionally returns start locations of optimal alignment in target and alignment path,
233      * if you choose appropriate tasks.
234      * @param [in] query  First sequence.
235      * @param [in] queryLength  Number of characters in first sequence.
236      * @param [in] target  Second sequence.
237      * @param [in] targetLength  Number of characters in second sequence.
238      * @param [in] config  Additional alignment parameters, like alignment method and wanted results.
239      * @return  Result of alignment, which can contain edit distance, start and end locations and alignment path.
240      *          Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
241      */
242     EDLIB_API EdlibAlignResult edlibAlign(
243         const char* query, int queryLength,
244         const char* target, int targetLength,
245         const EdlibAlignConfig config
246     );
247 
248 
249     /**
250      * Builds cigar string from given alignment sequence.
251      * @param [in] alignment  Alignment sequence.
252      *     0 stands for match.
253      *     1 stands for insertion to target.
254      *     2 stands for insertion to query.
255      *     3 stands for mismatch.
256      * @param [in] alignmentLength
257      * @param [in] cigarFormat  Cigar will be returned in specified format.
258      * @return Cigar string.
259      *     I stands for insertion.
260      *     D stands for deletion.
261      *     X stands for mismatch. (used only in extended format)
262      *     = stands for match. (used only in extended format)
263      *     M stands for (mis)match. (used only in standard format)
264      *     String is null terminated.
265      *     Needed memory is allocated and given pointer is set to it.
266      *     Do not forget to free it later using free()!
267      */
268     EDLIB_API char* edlibAlignmentToCigar(
269         const unsigned char* alignment, int alignmentLength,
270         EdlibCigarFormat cigarFormat
271     );
272 
273 #ifdef __cplusplus
274 }
275 #endif
276 
277 #endif // EDLIB_H
278