1 #ifndef EDLIB_H 2 #define EDLIB_H 3 4 /** 5 * @file 6 * @author Martin Sosic 7 * @brief Main header file, containing all public functions and structures. 8 */ 9 10 // Define EDLIB_API macro to properly export symbols 11 #ifdef EDLIB_SHARED 12 # ifdef _WIN32 13 # ifdef EDLIB_BUILD 14 # define EDLIB_API __declspec(dllexport) 15 # else 16 # define EDLIB_API __declspec(dllimport) 17 # endif 18 # else 19 # define EDLIB_API __attribute__ ((visibility ("default"))) 20 # endif 21 #else 22 # define EDLIB_API 23 #endif 24 25 #ifdef __cplusplus 26 extern "C" { 27 #endif 28 29 // Status codes 30 #define EDLIB_STATUS_OK 0 31 #define EDLIB_STATUS_ERROR 1 32 33 /** 34 * Alignment methods - how should Edlib treat gaps before and after query? 35 */ 36 typedef enum { 37 /** 38 * Global method. This is the standard method. 39 * Useful when you want to find out how similar is first sequence to second sequence. 40 */ 41 EDLIB_MODE_NW, 42 /** 43 * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized. 44 * What that means is that deleting elements from the end of second sequence is "free"! 45 * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end 46 * of second sequence is "free" and does not count into total edit distance. This method is appropriate 47 * when you want to find out how well first sequence fits at the beginning of second sequence. 48 */ 49 EDLIB_MODE_SHW, 50 /** 51 * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are 52 * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! 53 * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start 54 * and GAC from the end of second sequence is "free" and does not count into total edit distance. 55 * This method is appropriate when you want to find out how well first sequence fits at any part of 56 * second sequence. 57 * For example, if your second sequence was a long text and your first sequence was a sentence from that text, 58 * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in 59 * that text. In bioinformatics, this method is appropriate for aligning read to a sequence. 60 */ 61 EDLIB_MODE_HW 62 } EdlibAlignMode; 63 64 /** 65 * Alignment tasks - what do you want Edlib to do? 66 */ 67 typedef enum { 68 EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. 69 EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. 70 EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. 71 } EdlibAlignTask; 72 73 /** 74 * Describes cigar format. 75 * @see http://samtools.github.io/hts-specs/SAMv1.pdf 76 * @see http://drive5.com/usearch/manual/cigar.html 77 */ 78 typedef enum { 79 EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. 80 EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. 81 } EdlibCigarFormat; 82 83 // Edit operations. 84 #define EDLIB_EDOP_MATCH 0 //!< Match. 85 #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. 86 #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. 87 #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. 88 89 /** 90 * @brief Defines two given characters as equal. 91 */ 92 typedef struct { 93 char first; 94 char second; 95 } EdlibEqualityPair; 96 97 /** 98 * @brief Configuration object for edlibAlign() function. 99 */ 100 typedef struct { 101 /** 102 * Set k to non-negative value to tell edlib that edit distance is not larger than k. 103 * Smaller k can significantly improve speed of computation. 104 * If edit distance is larger than k, edlib will set edit distance to -1. 105 * Set k to negative value and edlib will internally auto-adjust k until score is found. 106 */ 107 int k; 108 109 /** 110 * Alignment method. 111 * EDLIB_MODE_NW: global (Needleman-Wunsch) 112 * EDLIB_MODE_SHW: prefix. Gap after query is not penalized. 113 * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. 114 */ 115 EdlibAlignMode mode; 116 117 /** 118 * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. 119 * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. 120 * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. 121 * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). 122 */ 123 EdlibAlignTask task; 124 125 /** 126 * List of pairs of characters, where each pair defines two characters as equal. 127 * This way you can extend edlib's definition of equality (which is that each character is equal only 128 * to itself). 129 * This can be useful if you have some wildcard characters that should match multiple other characters, 130 * or e.g. if you want edlib to be case insensitive. 131 * Can be set to NULL if there are none. 132 */ 133 const EdlibEqualityPair* additionalEqualities; 134 135 /** 136 * Number of additional equalities, which is non-negative number. 137 * 0 if there are none. 138 */ 139 int additionalEqualitiesLength; 140 } EdlibAlignConfig; 141 142 /** 143 * Helper method for easy construction of configuration object. 144 * @return Configuration object filled with given parameters. 145 */ 146 EDLIB_API EdlibAlignConfig edlibNewAlignConfig( 147 int k, EdlibAlignMode mode, EdlibAlignTask task, 148 const EdlibEqualityPair* additionalEqualities, 149 int additionalEqualitiesLength 150 ); 151 152 /** 153 * @return Default configuration object, with following defaults: 154 * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. 155 */ 156 EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); 157 158 159 /** 160 * Container for results of alignment done by edlibAlign() function. 161 */ 162 typedef struct { 163 /** 164 * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. 165 */ 166 int status; 167 168 /** 169 * -1 if k is non-negative and edit distance is larger than k. 170 */ 171 int editDistance; 172 173 /** 174 * Array of zero-based positions in target where optimal alignment paths end. 175 * If gap after query is penalized, gap counts as part of query (NW), otherwise not. 176 * Set to NULL if edit distance is larger than k. 177 * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 178 */ 179 int* endLocations; 180 181 /** 182 * Array of zero-based positions in target where optimal alignment paths start, 183 * they correspond to endLocations. 184 * If gap before query is penalized, gap counts as part of query (NW), otherwise not. 185 * Set to NULL if not calculated or if edit distance is larger than k. 186 * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 187 */ 188 int* startLocations; 189 190 /** 191 * Number of end (and start) locations. 192 */ 193 int numLocations; 194 195 /** 196 * Alignment is found for first pair of start and end locations. 197 * Set to NULL if not calculated. 198 * Alignment is sequence of numbers: 0, 1, 2, 3. 199 * 0 stands for match. 200 * 1 stands for insertion to target. 201 * 2 stands for insertion to query. 202 * 3 stands for mismatch. 203 * Alignment aligns query to target from begining of query till end of query. 204 * If gaps are not penalized, they are not in alignment. 205 * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). 206 */ 207 unsigned char* alignment; 208 209 /** 210 * Length of alignment. 211 */ 212 int alignmentLength; 213 214 /** 215 * Number of different characters in query and target together. 216 */ 217 int alphabetLength; 218 } EdlibAlignResult; 219 220 /** 221 * Frees memory in EdlibAlignResult that was allocated by edlib. 222 * If you do not use it, make sure to free needed members manually using free(). 223 */ 224 EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); 225 226 227 /** 228 * Aligns two sequences (query and target) using edit distance (levenshtein distance). 229 * Through config parameter, this function supports different alignment methods (global, prefix, infix), 230 * as well as different modes of search (tasks). 231 * It always returns edit distance and end locations of optimal alignment in target. 232 * It optionally returns start locations of optimal alignment in target and alignment path, 233 * if you choose appropriate tasks. 234 * @param [in] query First sequence. 235 * @param [in] queryLength Number of characters in first sequence. 236 * @param [in] target Second sequence. 237 * @param [in] targetLength Number of characters in second sequence. 238 * @param [in] config Additional alignment parameters, like alignment method and wanted results. 239 * @return Result of alignment, which can contain edit distance, start and end locations and alignment path. 240 * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. 241 */ 242 EDLIB_API EdlibAlignResult edlibAlign( 243 const char* query, int queryLength, 244 const char* target, int targetLength, 245 const EdlibAlignConfig config 246 ); 247 248 249 /** 250 * Builds cigar string from given alignment sequence. 251 * @param [in] alignment Alignment sequence. 252 * 0 stands for match. 253 * 1 stands for insertion to target. 254 * 2 stands for insertion to query. 255 * 3 stands for mismatch. 256 * @param [in] alignmentLength 257 * @param [in] cigarFormat Cigar will be returned in specified format. 258 * @return Cigar string. 259 * I stands for insertion. 260 * D stands for deletion. 261 * X stands for mismatch. (used only in extended format) 262 * = stands for match. (used only in extended format) 263 * M stands for (mis)match. (used only in standard format) 264 * String is null terminated. 265 * Needed memory is allocated and given pointer is set to it. 266 * Do not forget to free it later using free()! 267 */ 268 EDLIB_API char* edlibAlignmentToCigar( 269 const unsigned char* alignment, int alignmentLength, 270 EdlibCigarFormat cigarFormat 271 ); 272 273 #ifdef __cplusplus 274 } 275 #endif 276 277 #endif // EDLIB_H 278