1 /*************************************************
2 * Module: CombineMUMs.cc
3 * Author: Art Delcher
4 * Description:
5 * Take clusters of MUMs (from mgaps program) and do alignments
6 * to combine them into matches.
7 *
8 */
9
10
11 #include <mummer/tigrinc.hh>
12 #include <string>
13 using namespace std;
14
15
16 // Constants
17
18 #define BRANCH_PT_MATCH_VALUE 0.272
19 // Value to add for a match in finding branch points
20 // 1.20 was the calculated value for 6% vs 35% error discrimination
21 // Converting to integers didn't make it faster
22 #define BRANCH_PT_ERROR_VALUE -0.728
23 // Value to add for a mismatch in finding branch points
24 // -2.19 was the calculated value for 6% vs 35% error discrimination
25 // Converting to integers didn't make it faster
26 #define DEFAULT_ERROR_FILE_NAME "witherrors.gaps"
27 // Name of file produced that matches the input gaps file
28 // but with an extra column showing the number of errors in
29 // each gap
30 #define DEFAULT_PAD 10
31 // Default number of matching bases to show at beginning
32 // and end of alignments
33 #define EDIT_DIST_PROB_BOUND 1e-4
34 // Probability limit to "band" edit-distance calculation
35 // Determines NORMAL_DISTRIB_THOLD
36 #define ERRORS_FOR_FREE 1
37 // The number of errors that are ignored in setting probability
38 // bound for terminating alignment extensions in edit distance
39 // calculations
40 #define EXPANSION_FACTOR 1.4
41 // Factor by which to grow memory when realloc'ing
42 #define GIVE_UP_LEN 200
43 // Stop alignment when go this many bases past max-score value
44 #define INITIAL_BUFFER_SIZE 10000
45 // Initial number of bytes to use for sequence strings
46 #define MATCH_SCORE 1.0
47 // Score to add for match in finding length of alignment
48 #define MAX_ERROR_RATE 0.06
49 // The largest error allowed in overlaps
50 #define MAX_FRAG_LEN 1024
51 // The longest fragment allowed
52 #define MAX_ERRORS 500
53 // Most errors in any edit distance computation
54 #define MAX_EXTENSION 10000
55 // Maximum extension match out from a match
56 #define MAX_HDR_LEN 10000
57 // Longest allowable fasta header line
58 #define MAX_MEMORY_STORE 50000
59 // The most fragments allowed in memory store
60 #define MIN_BRANCH_END_DIST 20
61 // Branch points must be at least this many bases from the
62 // end of the fragment to be reported
63 #define MIN_BRANCH_TAIL_SLOPE 0.20
64 // Branch point tails must fall off from the max by at least
65 // this rate
66 #define MIN_MATCH_LEN 40
67 // Number of bases in match region in order to count it
68 #define MISMATCH_SCORE -3.0
69 // Score to add for non-match in finding length of alignment
70 #define NORMAL_DISTRIB_THOLD 3.62
71 // Determined by EDIT_DIST_PROB_BOUND
72 #define REALLY_VERBOSE 0
73 // If 1 prints tons of stuff
74 #define VERBOSE 0
75 // If 1 prints lots of stuff
76
77
78
79 // Type definitions
80
81 typedef struct s_Cover_t
82 {
83 long int lo, hi;
84 struct s_Cover_t * next;
85 } Cover_t;
86
87
88
89 // Globals
90
91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE;
92 // Score used for matches to locate branch points, i.e.,
93 // where alignment score peaks
94 float Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE;
95 // Score used for mismatches to locate branch points, i.e.,
96 int Consec_Non_ACGT = 0;
97 // Stop alignments when encounter at least this many non-acgt characters.
98 int * Edit_Array [MAX_ERRORS];
99 // Use for alignment calculation. Points into Edit_Space .
100 int Edit_Match_Limit [MAX_ERRORS] = {0};
101 // This array [e] is the minimum value of Edit_Array [e] [d]
102 // to be worth pursuing in edit-distance computations between guides
103 int Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS];
104 // Memory used by alignment calculation
105 int Error_Bound [MAX_FRAG_LEN + 1];
106 // This array [i] is the maximum number of errors allowed
107 // in a match between sequences of length i , which is
108 // i * MAXERROR_RATE .
109 const char * Error_File_Name = DEFAULT_ERROR_FILE_NAME;
110 // Name of file to write gaps listing with # errors in each gap
111 int Fill_Ct = 0;
112 // Number of non-acgt bases in ref sequence
113 char * Gaps_File_Path = NULL;
114 // Name of file produced by mgaps program
115 char * Match_File_Path = NULL;
116 // Name of multifasta file of sequences to compare against the reference
117 // sequence
118 bool UserScoring = false;
119 // If TRUE, then user specified a percent ID cutoff and scoring
120 // is adjusted to enforce this in extensions
121 bool Nucleotides_Only = false;
122 // If TRUE , then only acgt's can match
123 bool Only_Difference_Positions = false;
124 // If true , then output just positions of difference instead of
125 // alignments between exact matches
126 bool Output_Cover_Files = true;
127 // If TRUE , output files showing coverage of each genome.
128 double Percent_ID;
129 // Desired identity rate of matches to be found
130 // Expressed as a fraction, not really a percent
131 char * Query = NULL;
132 // The query sequence
133 long int Query_Len;
134 // The length of the query sequence
135 const char * Query_Suffix = "Query";
136 // Suffix for query tag
137 char * Ref = NULL;
138 // The reference sequence
139 char * Ref_File_Path = NULL;
140 // Name of (single) fasta file of reference sequence
141 long int Ref_Len;
142 // The length of the reference sequence
143 long int Ref_Size;
144 // The size of the reference sequence buffer
145 const char * Ref_Suffix = "Ref";
146 // Suffix for reference tag
147 bool Show_Differences = false;
148 // If TRUE then show differences in all alignments
149 bool Tag_From_Fasta_Line = false;
150 // If TRUE then use fasta tag from ref & query sequences as
151 // 1st & 2nd column, resp., on match line to identify matches
152 int Verbose = 0;
153 // Controls printing of extra debuggin information
154
155
156 bool Global_Debug_Flag = false;
157
158
159
160 // Functions
161
162 void Add_Coverage
163 (Cover_t * * list, long int lo, long int hi);
164 int Binomial_Bound
165 (int, double, int, double);
166 void Display_Alignment
167 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
168 void Display_Alignment_With_Pad
169 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
170 char * left_pad [3], char * right_pad [3], int pad_len);
171 void Display_Difference_Positions
172 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
173 int b_incr, int delta [], int delta_ct, const char * a_tag,
174 const char * b_tag);
175 int Extend_Backward
176 (long int * ref_lo, long int * query_lo);
177 int Extend_Forward
178 (long int * ref_hi, long int * query_hi);
179 void Initialize_Globals
180 (void);
181 FILE * Local_File_Open
182 (const char * filename, const char * mode);
183 int Max_int
184 (int a, int b);
185 int Min_int
186 (int a, int b);
187 void Parse_Command_Line
188 (int argc, char * argv []);
189 int Prefix_Edit_Dist
190 (char A [], int m, char T [], int n, int Error_Limit,
191 int * A_End, int * T_End, int * Match_To_End,
192 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
193 bool Read_String
194 (FILE * fp, char * * T, long int * Size, char header []);
195 void Rev_Complement
196 (char * s);
197 void Rev_Display_Alignment
198 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
199 int Rev_Prefix_Edit_Dist
200 (char A [], int m, char T [], int n, int Error_Limit,
201 int * A_End, int * T_End, int * Match_To_End,
202 int Delta [MAX_ERRORS], int * Delta_Len, int extending);
203 void Rev_Show_Diffs
204 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
205 long int a_ref, long int b_ref);
206 void Set_Deltas
207 (int delta [], int * delta_len, int row, int d, int e);
208 static void Set_Left_Pad
209 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len);
210 static void Set_Right_Pad
211 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len);
212 void Show_Coverage
213 (Cover_t * list, char * filename, char * tag, char * seq);
214 void Show_Diffs
215 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
216 long int a_ref, long int b_ref);
217 int Sign
218 (int a);
219 void Usage
220 (char * command);
221
222
223
main(int argc,char * argv[])224 int main
225 (int argc, char * argv [])
226
227 {
228 FILE * fp, * match_fp, * error_fp;
229 char ref_header [MAX_HDR_LEN];
230 char header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN];
231 char line [MAX_LINE], filename [MAX_LINE];
232 char * left_pad [3], * right_pad [3];
233 long int ref_pos, ref_lo, ref_hi;
234 long int query_pos, query_lo, query_hi, query_size;
235 long int match_len, prior_match_len = 0, max_len;
236 double error = 0.0;
237 long int total_errors = 0;
238 int match_ct = 0;
239 double match_total = 0.0;
240 bool is_forward = false;
241 int first_match = true;
242 Cover_t * ref_cover_list = NULL;
243 Cover_t * query_cover_list = NULL;
244 char * p;
245 int i;
246
247 Parse_Command_Line (argc, argv);
248
249 if(UserScoring){
250 // percentID = mismatch penalty / (match bonus + mismatch penalty)
251 // or, more compactly, p = mm / (m+mm)
252 // Since we require that m+mm = 1, m=1-mm
253 // and so we get the trivial mm = p
254 //
255 // This means that Branch_Pt_Error_Value = -p
256 // and then Branch_Pt_Match_Value = 1-p
257 //
258 // Make it so:
259 Branch_Pt_Error_Value = - Percent_ID;
260 Branch_Pt_Match_Value = 1.0 - Percent_ID;
261 }
262
263 Initialize_Globals ();
264
265 p = strrchr (Ref_File_Path, '/');
266 if (p == NULL)
267 strcpy (ref_tag, Ref_File_Path);
268 else
269 strcpy (ref_tag, p + 1);
270 p = strrchr (ref_tag, '.');
271 if (p != NULL)
272 (* p) = '\0';
273 strcat (ref_tag, Ref_Suffix);
274
275 p = strrchr (Match_File_Path, '/');
276 if (p == NULL)
277 strcpy (query_tag, Match_File_Path);
278 else
279 strcpy (query_tag, p + 1);
280 p = strrchr (query_tag, '.');
281 if (p != NULL)
282 (* p) = '\0';
283 strcat (query_tag, Query_Suffix);
284
285 fp = Local_File_Open (Ref_File_Path, "r");
286 Ref_Size = 100000;
287 Ref = (char *) Safe_malloc (Ref_Size);
288
289 assert (Read_String (fp, & Ref, & Ref_Size, ref_header));
290 if (Tag_From_Fasta_Line)
291 strcpy (ref_tag, strtok (ref_header, " \t\n>"));
292 fclose (fp);
293
294 Ref_Len = strlen (Ref + 1);
295 Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len);
296 for (i = 1; i <= Ref_Len; i ++)
297 switch (Ref [i])
298 {
299 case 'a' :
300 case 'c' :
301 case 'g' :
302 case 't' :
303 break;
304 default :
305 if (Nucleotides_Only)
306 Ref [i] = '2';
307 Fill_Ct ++;
308 }
309
310 for (i = 0; i < 3; i ++)
311 {
312 left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
313 right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
314 }
315
316 fp = Local_File_Open (Gaps_File_Path, "r");
317 match_fp = Local_File_Open (Match_File_Path, "r");
318 query_size = 100000;
319 Query = (char *) Safe_malloc (query_size);
320
321 error_fp = Local_File_Open (Error_File_Name, "w");
322
323 while (fgets (line, MAX_LINE, fp) != NULL)
324 {
325 int line_len = strlen (line);
326
327 if (line [0] == '>')
328 {
329 if (! first_match)
330 {
331 total_errors += Extend_Forward (& ref_hi, & query_hi);
332 max_len = 1 + ref_hi - ref_lo;
333 if (1 + abs (query_hi - query_lo) > max_len)
334 max_len = 1 + abs (query_hi - query_lo);
335 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
336 if (! Only_Difference_Positions)
337 printf
338 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
339 ref_lo, ref_hi,
340 is_forward ? query_lo : 1 + Query_Len - query_lo,
341 is_forward ? query_hi : 1 + Query_Len - query_hi,
342 total_errors, max_len, 100.0 * error);
343
344 match_ct ++;
345 match_total += 1 + ref_hi - ref_lo;
346 total_errors = 0;
347 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
348 if (is_forward)
349 Add_Coverage (& query_cover_list, query_lo, query_hi);
350 else
351 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
352 1 + Query_Len - query_lo);
353 }
354 if (Tag_From_Fasta_Line)
355 {
356 char buff [MAX_LINE];
357 char * p;
358
359 strcpy (buff, line + 1);
360 p = strtok (buff, " >\t\n");
361 if (strlen (p) > 0)
362 strcpy (query_tag, p);
363 else
364 {
365 fprintf (stderr, "No tag on line %s", line);
366 strcpy (query_tag, "???");
367 }
368 }
369 is_forward = ! is_forward;
370 if (is_forward)
371 {
372 #ifndef NDEBUG
373 bool read_header =
374 #endif
375 Read_String (match_fp, & Query, & query_size, header);
376 assert (read_header);
377 Query_Len = strlen (Query + 1);
378 if (Nucleotides_Only)
379 {
380 for (i = 1; i <= Query_Len; i ++)
381 switch (Query [i])
382 {
383 case 'a' :
384 case 'c' :
385 case 'g' :
386 case 't' :
387 break;
388 default :
389 Query [i] = '1';
390 }
391 }
392 }
393 else
394 Rev_Complement (Query + 1);
395 first_match = true;
396 printf ("%s", line);
397 fprintf (error_fp, "%s", line);
398 }
399 else if (line [0] == '#')
400 {
401 if (! first_match)
402 {
403 total_errors += Extend_Forward (& ref_hi, & query_hi);
404 max_len = 1 + ref_hi - ref_lo;
405 if (1 + abs (query_hi - query_lo) > max_len)
406 max_len = 1 + abs (query_hi - query_lo);
407 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
408 if (! Only_Difference_Positions)
409 printf
410 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
411 ref_lo, ref_hi,
412 is_forward ? query_lo : 1 + Query_Len - query_lo,
413 is_forward ? query_hi : 1 + Query_Len - query_hi,
414 total_errors, max_len, 100.0 * error);
415
416 match_ct ++;
417 match_total += 1 + ref_hi - ref_lo;
418 total_errors = 0;
419 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
420 if (is_forward)
421 Add_Coverage (& query_cover_list, query_lo, query_hi);
422 else
423 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
424 1 + Query_Len - query_lo);
425 }
426 first_match = true;
427 printf ("%s", line);
428 fprintf (error_fp, "%s", line);
429 }
430 else
431 {
432 #ifndef NDEBUG
433 int tokens =
434 #endif
435 sscanf (line, "%ld %ld %ld", & ref_pos, & query_pos, & match_len);
436 assert(tokens == 3);
437 if (first_match)
438 {
439 ref_lo = ref_pos;
440 query_lo = query_pos;
441 total_errors += Extend_Backward (& ref_lo, & query_lo);
442 if (! Only_Difference_Positions)
443 printf ("%s", line);
444 if (line_len > 0)
445 line [line_len - 1] = '\0';
446 fprintf (error_fp, "%s %7s\n", line, "-");
447 }
448 else
449 {
450 int errors;
451 int a_len, b_len;
452 int a_end, b_end, match_to_end;
453 int delta [MAX_ERRORS], delta_len;
454
455 a_len = ref_pos - ref_hi - 1;
456 b_len = query_pos - query_hi - 1;
457
458 errors = Prefix_Edit_Dist
459 (Ref + ref_hi + 1, a_len,
460 Query + query_hi + 1, b_len,
461 MAX_ERRORS - 1, & a_end, & b_end,
462 & match_to_end, delta, & delta_len, false);
463 if (Show_Differences)
464 Show_Diffs (Ref + ref_hi + 1, a_end,
465 Query + query_hi + 1, b_end,
466 delta, delta_len, ref_hi + 1, query_hi + 1);
467
468 if (! Only_Difference_Positions)
469 printf ("%s", line);
470
471 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len,
472 DEFAULT_PAD);
473 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len,
474 DEFAULT_PAD);
475
476 if (a_end == 0 && b_end == 0)
477 {
478 if (! Only_Difference_Positions)
479 printf (" No extension\n");
480
481 if (line_len > 0)
482 line [line_len - 1] = '\0';
483 fprintf (error_fp, "%s %7s\n", line, "-");
484 }
485 else if (Only_Difference_Positions)
486 {
487 int q_start, b_incr;
488
489 if (is_forward)
490 {
491 q_start = query_hi + 1;
492 b_incr = 1;
493 }
494 else
495 {
496 q_start = Query_Len - query_hi;
497 b_incr = -1;
498 }
499 Display_Difference_Positions
500 (Ref + ref_hi + 1, ref_hi + 1, a_end,
501 Query + query_hi + 1, q_start, b_end,
502 b_incr, delta, delta_len, ref_tag,
503 query_tag);
504 }
505 else
506 {
507 printf (" Errors = %d\n", errors);
508 if (! Only_Difference_Positions)
509 Display_Alignment_With_Pad
510 (Ref + ref_hi + 1, a_end,
511 Query + query_hi + 1, b_end,
512 delta, delta_len, left_pad, right_pad,
513 DEFAULT_PAD);
514 if (line_len > 0)
515 line [line_len - 1] = '\0';
516 fprintf (error_fp, "%s %7d\n", line, errors);
517 }
518
519 total_errors += errors;
520
521 if (! match_to_end)
522 {
523 ref_hi += a_end;
524 query_hi += b_end;
525 max_len = 1 + ref_hi - ref_lo;
526 if (1 + abs (query_hi - query_lo) > max_len)
527 max_len = 1 + abs (query_hi - query_lo);
528 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
529 if (! Only_Difference_Positions)
530 printf (
531 "Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
532 ref_lo, ref_hi,
533 is_forward ? query_lo : 1 + Query_Len - query_lo,
534 is_forward ? query_hi : 1 + Query_Len - query_hi,
535 total_errors, max_len, 100.0 * error);
536
537 match_ct ++;
538 match_total += 1 + ref_hi - ref_lo;
539 total_errors = 0;
540 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
541 if (is_forward)
542 Add_Coverage (& query_cover_list, query_lo, query_hi);
543 else
544 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
545 1 + Query_Len - query_lo);
546 ref_lo = ref_pos;
547 query_lo = query_pos;
548 total_errors += Extend_Backward (& ref_lo, & query_lo);
549 }
550 }
551
552 ref_hi = ref_pos + match_len - 1;
553 query_hi = query_pos + match_len - 1;
554 prior_match_len = match_len;
555 first_match = false;
556 }
557 }
558
559 if (! first_match)
560 {
561 total_errors += Extend_Forward (& ref_hi, & query_hi);
562 max_len = 1 + ref_hi - ref_lo;
563 if (1 + abs (query_hi - query_lo) > max_len)
564 max_len = 1 + abs (query_hi - query_lo);
565 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
566 if (! Only_Difference_Positions)
567 printf
568 ("Region: %9ld .. %-9ld %9ld .. %-9ld %7ld / %-9ld %5.2f%%\n",
569 ref_lo, ref_hi,
570 is_forward ? query_lo : 1 + Query_Len - query_lo,
571 is_forward ? query_hi : 1 + Query_Len - query_hi,
572 total_errors, max_len, 100.0 * error);
573
574 match_ct ++;
575 match_total += 1 + ref_hi - ref_lo;
576 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
577 if (is_forward)
578 Add_Coverage (& query_cover_list, query_lo, query_hi);
579 else
580 Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
581 1 + Query_Len - query_lo);
582 }
583
584 fprintf (stderr, " Ref len = %ld\n", Ref_Len);
585 fprintf (stderr, " acgt's = %ld\n", Ref_Len - Fill_Ct);
586 fprintf (stderr, " Non acgt's = %d\n", Fill_Ct);
587 fprintf (stderr, " Number of matches = %d\n", match_ct);
588 fprintf (stderr, "Sum of match bases = %.0f\n", match_total);
589 fprintf (stderr, " Avg match bases = %.0f\n",
590 match_ct == 0 ? 0.0 : match_total / match_ct);
591
592 fclose (error_fp);
593
594 if (Output_Cover_Files)
595 {
596 strcpy (filename, ref_tag);
597 strcat (filename, ".cover");
598 Show_Coverage (ref_cover_list, filename, ref_tag, Ref);
599
600 strcpy (filename, query_tag);
601 strcat (filename, ".cover");
602 Rev_Complement (Query + 1);
603 Show_Coverage (query_cover_list, filename, query_tag, Query);
604 }
605
606 return 0;
607 }
608
609
610
Add_Coverage(Cover_t ** list,long int lo,long int hi)611 void Add_Coverage
612 (Cover_t * * list, long int lo, long int hi)
613
614 // Add lo .. hi to list of regions covered in (* list) .
615 // Combine nodes when appropriate.
616
617 {
618 Cover_t * new_node, * p, * prev = NULL;
619
620 if ((* list) == NULL || hi + 1 < (* list) -> lo)
621 {
622 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
623 new_node -> lo = lo;
624 new_node -> hi = hi;
625 new_node -> next = (* list);
626 (* list) = new_node;
627 return;
628 }
629
630 for (p = (* list); p != NULL && lo - 1 > p -> hi; p = p -> next)
631 prev = p;
632
633 if (p == NULL || hi + 1 < p -> lo)
634 { // insert between or on end
635 assert (prev != NULL);
636 new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
637 new_node -> lo = lo;
638 new_node -> hi = hi;
639 new_node -> next = p;
640 prev -> next = new_node;
641 return;
642 }
643
644 if (lo < p -> lo)
645 p -> lo = lo;
646 while (p -> next != NULL && hi + 1 >= p -> next -> lo)
647 {
648 Cover_t * save;
649
650 p -> hi = p -> next -> hi;
651 save = p -> next;
652 p -> next = save -> next;
653 free (save);
654 }
655 if (hi > p -> hi)
656 p -> hi = hi;
657
658 return;
659 }
660
661
662
Binomial_Bound(int e,double p,int Start,double Limit)663 int Binomial_Bound
664 (int e, double p, int Start, double Limit)
665
666 // Return the smallest n >= Start s.t.
667 // prob [>= e errors in n binomial trials (p = error prob)]
668 // > Limit
669
670 {
671 double Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
672 double q, Sum, P_Power, Q_Power, X;
673 int k, n, Bin_Coeff, Ct;
674
675 q = 1.0 - p;
676 if (Start < e)
677 Start = e;
678
679 for (n = Start; n < MAX_FRAG_LEN; n ++)
680 {
681 if (n <= 35)
682 {
683 Sum = 0.0;
684 Bin_Coeff = 1;
685 Ct = 0;
686 P_Power = 1.0;
687 Q_Power = pow (q, (double) n);
688
689 for (k = 0; k < e && 1.0 - Sum > Limit; k ++)
690 {
691 X = Bin_Coeff * P_Power * Q_Power;
692 Sum += X;
693 Bin_Coeff *= n - Ct;
694 Bin_Coeff /= ++ Ct;
695 P_Power *= p;
696 Q_Power /= q;
697 }
698 if (1.0 - Sum > Limit)
699 return n;
700 }
701 else
702 {
703 Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q));
704 if (Normal_Z <= NORMAL_DISTRIB_THOLD)
705 return n;
706 Sum = 0.0;
707 Mu_Power = 1.0;
708 Factorial = 1.0;
709 Poisson_Coeff = exp (- (double) n * p);
710 for (k = 0; k < e; k ++)
711 {
712 Sum += Mu_Power * Poisson_Coeff / Factorial;
713 Mu_Power *= n * p;
714 Factorial *= k + 1;
715 }
716 if (1.0 - Sum > Limit)
717 return n;
718 }
719 }
720
721 return MAX_FRAG_LEN;
722 }
723
724
725
726 #define DISPLAY_WIDTH 60
727
Display_Alignment(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct)728 void Display_Alignment
729 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
730
731 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
732 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
733
734 {
735 int i, j, k, m, top_len, bottom_len;
736 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
737 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
738
739 i = j = top_len = bottom_len = 0;
740 for (k = 0; k < delta_ct; k ++)
741 {
742 for (m = 1; m < abs (delta [k]); m ++)
743 {
744 top [top_len ++] = a [i ++];
745 j ++;
746 }
747 if (delta [k] < 0)
748 {
749 top [top_len ++] = '-';
750 j ++;
751 }
752 else
753 {
754 top [top_len ++] = a [i ++];
755 }
756 }
757 while (i < a_len && j < b_len)
758 {
759 top [top_len ++] = a [i ++];
760 j ++;
761 }
762 top [top_len] = '\0';
763
764
765 i = j = 0;
766 for (k = 0; k < delta_ct; k ++)
767 {
768 for (m = 1; m < abs (delta [k]); m ++)
769 {
770 bottom [bottom_len ++] = b [j ++];
771 i ++;
772 }
773 if (delta [k] > 0)
774 {
775 bottom [bottom_len ++] = '-';
776 i ++;
777 }
778 else
779 {
780 bottom [bottom_len ++] = b [j ++];
781 }
782 }
783 while (j < b_len && i < a_len)
784 {
785 bottom [bottom_len ++] = b [j ++];
786 i ++;
787 }
788 bottom [bottom_len] = '\0';
789
790
791 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
792 {
793 printf ("A: ");
794 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
795 putchar (top [i + j]);
796 putchar ('\n');
797 printf ("B: ");
798 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
799 putchar (bottom [i + j]);
800 putchar ('\n');
801 printf (" ");
802 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
803 j ++)
804 if (top [i + j] != ' ' && bottom [i + j] != ' '
805 && top [i + j] != bottom [i + j])
806 putchar ('^');
807 else
808 putchar (' ');
809 putchar ('\n');
810 }
811
812 return;
813 }
814
815
816
Display_Alignment_With_Pad(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,char * left_pad[3],char * right_pad[3],int pad_len)817 void Display_Alignment_With_Pad
818 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
819 char * left_pad [3], char * right_pad [3], int pad_len)
820
821 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
822 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)] .
823 // Attach pad strings in left_pad to the beginning of the alignment
824 // and the strings in right_pad to the end of the alignment.
825 // pad_len is the width of the pad strings
826
827 {
828 int i, j, k, m, top_len, bottom_len;
829 #if 0
830 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
831 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
832 #else
833 string top, bottom;
834 #endif
835
836 for (k = 0; k < pad_len; k ++)
837 {
838 top . push_back (left_pad [0] [k]);
839 bottom . push_back (left_pad [1] [k]);
840 }
841
842 i = j = 0;
843 for (k = 0; k < delta_ct; k ++)
844 {
845 for (m = 1; m < abs (delta [k]); m ++)
846 {
847 top . push_back (a [i ++]);
848 j ++;
849 }
850 if (delta [k] < 0)
851 {
852 top . push_back ('-');
853 j ++;
854 }
855 else
856 {
857 top . push_back (a [i ++]);
858 }
859 }
860 while (i < a_len && j < b_len)
861 {
862 top . push_back (a [i ++]);
863 j ++;
864 }
865
866 i = j = 0;
867 for (k = 0; k < delta_ct; k ++)
868 {
869 for (m = 1; m < abs (delta [k]); m ++)
870 {
871 bottom . push_back (b [j ++]);
872 i ++;
873 }
874 if (delta [k] > 0)
875 {
876 bottom . push_back ('-');
877 i ++;
878 }
879 else
880 {
881 bottom . push_back (b [j ++]);
882 }
883 }
884 while (j < b_len && i < a_len)
885 {
886 bottom . push_back (b [j ++]);
887 i ++;
888 }
889
890 for (k = 0; k < pad_len; k ++)
891 {
892 top . push_back (right_pad [0] [k]);
893 bottom . push_back (right_pad [1] [k]);
894 }
895
896 top_len = top . length ();
897 bottom_len = bottom . length ();
898
899 if (top_len != bottom_len)
900 {
901 fprintf (stderr, "ERROR: Alignment length mismatch top = %d bottom = %d\n",
902 top_len, bottom_len);
903 exit (EXIT_FAILURE);
904 }
905
906 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
907 {
908 printf ("A: ");
909 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
910 putchar (top [i + j]);
911 putchar ('\n');
912 printf ("B: ");
913 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
914 putchar (bottom [i + j]);
915 putchar ('\n');
916 printf (" ");
917 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
918 j ++)
919 if (i + j < pad_len)
920 putchar (left_pad [2] [i + j]);
921 else if (top_len - i - j <= pad_len)
922 putchar (right_pad [2] [i + j - (top_len - pad_len)]);
923 else if (top [i + j] != ' ' && bottom [i + j] != ' '
924 && top [i + j] != bottom [i + j])
925 putchar ('^');
926 else
927 putchar (' ');
928 putchar ('\n');
929 }
930
931 return;
932 }
933
934
935
Display_Difference_Positions(char * a,int a_start,int a_len,char * b,int b_start,int b_len,int b_incr,int delta[],int delta_ct,const char * a_tag,const char * b_tag)936 void Display_Difference_Positions
937 (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
938 int b_incr, int delta [], int delta_ct, const char * a_tag,
939 const char * b_tag)
940
941 // Show (to stdout ) the positions indicated in delta [0 .. (delta_ct - 1)]
942 // between strings a [0 .. (a_len - 1)] and
943 // b [0 .. (b_len - 1))] . The positions of the starts of the strings
944 // are at a_start and b_start , respectively, and b_incr indicates
945 // if the b postions increase or decrease. b_incr must be 1 or -1.
946 // Use a_tag and b_tag to label the positions.
947
948 {
949 int i, j, k, m;
950
951 assert (b_incr == 1 || b_incr == -1);
952
953 i = j = 0;
954 for (k = 0; k < delta_ct; k ++)
955 {
956 for (m = 1; m < abs (delta [k]); m ++)
957 {
958 if (a [i] != b [j])
959 printf ("%-15s %-15s %8d %8d %c %c\n",
960 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
961 i ++;
962 j ++;
963 }
964 if (delta [k] < 0)
965 {
966 printf ("%-15s %-15s %8d %8d %c %c\n",
967 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]);
968 j ++;
969 }
970 else
971 {
972 if (b_incr > 0)
973 printf ("%-15s %-15s %8d %8d %c %c\n",
974 a_tag, b_tag, a_start + i, b_start + j, a [i], '-');
975 else
976 printf ("%-15s %-15s %8d %8d %c %c\n",
977 a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-');
978 i ++;
979 }
980 }
981 while (i < a_len && j < b_len)
982 {
983 if (a [i] != b [j])
984 printf ("%-15s %-15s %8d %8d %c %c\n",
985 a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
986 i ++;
987 j ++;
988 }
989
990 return;
991 }
992
993
994
Extend_Backward(long int * ref_lo,long int * query_lo)995 int Extend_Backward
996 (long int * ref_lo, long int * query_lo)
997
998 // Do edit distance off end of match beginning at (* ref_lo) and
999 // (* query_lo) in the reference and query sequences, resp.,
1000 // in the reverse direction.
1001 // Return the number of errors in the extension and set (* ref_hi)
1002 // and (* query_hi) to the end of the extension.
1003
1004 {
1005 int ct = 0, errors, sum = 0;
1006 int a_len, b_len;
1007 int a_end, b_end, match_to_end;
1008 int delta [MAX_ERRORS], delta_len;
1009
1010 do
1011 {
1012 a_len = (* ref_lo) - 1;
1013 if (a_len > MAX_EXTENSION)
1014 a_len = MAX_EXTENSION;
1015 b_len = (* query_lo) - 1;
1016 if (b_len > MAX_EXTENSION)
1017 b_len = MAX_EXTENSION;
1018
1019 errors = Rev_Prefix_Edit_Dist
1020 (Ref + (* ref_lo) - 1, a_len,
1021 Query + (* query_lo) - 1, b_len,
1022 MAX_ERRORS - 1, & a_end, & b_end,
1023 & match_to_end, delta, & delta_len, true);
1024 if (Show_Differences)
1025 Rev_Show_Diffs
1026 (Ref + (* ref_lo - 1), a_end,
1027 Query + (* query_lo - 1), b_end,
1028 delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1);
1029 if (Verbose > 0)
1030 {
1031 printf ("revextend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
1032 ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len,
1033 errors, match_to_end ? 'T' : 'F', a_end, b_end);
1034 Rev_Display_Alignment
1035 (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end,
1036 delta, delta_len);
1037 }
1038
1039 (* ref_lo) -= a_end;
1040 (* query_lo) -= b_end;
1041 sum += errors;
1042 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1043
1044 return sum;
1045 }
1046
1047
1048
Extend_Forward(long int * ref_hi,long int * query_hi)1049 int Extend_Forward
1050 (long int * ref_hi, long int * query_hi)
1051
1052 // Do edit distance off end of match ending at (* ref_hi) and
1053 // (* query_hi) in the reference and query sequences, resp.
1054 // Return the number of errors in the extension and set (* ref_hi)
1055 // and (* query_hi) to the end of the extension.
1056
1057 {
1058 int ct = 0, errors, sum = 0;
1059 int a_end, b_end, match_to_end;
1060 int a_len, b_len;
1061 int delta [MAX_ERRORS], delta_len;
1062
1063 do
1064 {
1065 a_len = Ref_Len - (* ref_hi);
1066 if (a_len > MAX_EXTENSION)
1067 a_len = MAX_EXTENSION;
1068 b_len = Query_Len - (* query_hi);
1069 if (b_len > MAX_EXTENSION)
1070 b_len = MAX_EXTENSION;
1071
1072 errors = Prefix_Edit_Dist
1073 (Ref + (* ref_hi) + 1, a_len,
1074 Query + (* query_hi) + 1, b_len,
1075 MAX_ERRORS - 1, & a_end, & b_end,
1076 & match_to_end, delta, & delta_len, true);
1077 if (Show_Differences)
1078 Show_Diffs (Ref + (* ref_hi) + 1, a_end,
1079 Query + (* query_hi) + 1, b_end,
1080 delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1);
1081 if (Verbose > 0)
1082 {
1083 printf ("extend#%d %9ld %9ld %5d %5d %3d %c %4d %4d\n",
1084 ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len,
1085 errors, match_to_end ? 'T' : 'F', a_end, b_end);
1086 Display_Alignment (Ref + (* ref_hi) + 1, a_end,
1087 Query + (* query_hi) + 1, b_end,
1088 delta, delta_len);
1089 }
1090
1091 (* ref_hi) += a_end;
1092 (* query_hi) += b_end;
1093 sum += errors;
1094 } while (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1095
1096 return sum;
1097 }
1098
1099
Initialize_Globals(void)1100 void Initialize_Globals
1101 (void)
1102
1103 // Initialize global variables used in this program
1104
1105 {
1106 int i, offset, del;
1107 int e, start;
1108
1109 offset = 2;
1110 del = 6;
1111 for (i = 0; i < MAX_ERRORS; i ++)
1112 {
1113 Edit_Array [i] = Edit_Space + offset;
1114 offset += del;
1115 del += 2;
1116 }
1117
1118
1119 for (i = 0; i <= ERRORS_FOR_FREE; i ++)
1120 Edit_Match_Limit [i] = 0;
1121
1122 start = 1;
1123 for (e = ERRORS_FOR_FREE + 1; e < MAX_ERRORS; e ++)
1124 {
1125 start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE,
1126 start, EDIT_DIST_PROB_BOUND);
1127 Edit_Match_Limit [e] = start - 1;
1128 assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]);
1129 }
1130
1131 for (i = 0; i <= MAX_FRAG_LEN; i ++)
1132 Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE));
1133
1134 return;
1135 }
1136
1137
1138
Local_File_Open(const char * filename,const char * mode)1139 FILE * Local_File_Open
1140 (const char * filename, const char * mode)
1141
1142 /* Open Filename in Mode and return a pointer to its control
1143 * block. If fail, print a message and exit. */
1144
1145 {
1146 FILE * fp;
1147
1148 fp = fopen (filename, mode);
1149 if (fp == NULL)
1150 {
1151 fprintf (stderr, "ERROR %d: Could not open file %s \n",
1152 errno, filename);
1153 perror (strerror (errno));
1154 exit (EXIT_FAILURE);
1155 }
1156
1157 return fp;
1158 }
1159
1160
1161
Max_int(int a,int b)1162 int Max_int
1163 (int a, int b)
1164
1165 // Return the larger of a and b .
1166
1167 {
1168 if (a < b)
1169 return b;
1170
1171 return a;
1172 }
1173
1174
1175
Min_int(int a,int b)1176 int Min_int
1177 (int a, int b)
1178
1179 // Return the smaller of a and b .
1180
1181 {
1182 if (a < b)
1183 return a;
1184
1185 return b;
1186 }
1187
1188
1189
Parse_Command_Line(int argc,char * argv[])1190 void Parse_Command_Line
1191 (int argc, char * argv [])
1192
1193 // Get options and parameters from command line with argc
1194 // arguments in argv [0 .. (argc - 1)] .
1195
1196 {
1197 int ch;
1198 bool errflg = false;
1199
1200 optarg = NULL;
1201
1202 while (! errflg
1203 && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF))
1204 switch (ch)
1205 {
1206 case 'D' :
1207 Only_Difference_Positions = true;
1208 break;
1209
1210 case 'e' :
1211 Percent_ID = 1.0 - atof (optarg);
1212 UserScoring = true;
1213 break;
1214
1215 case 'n' :
1216 Nucleotides_Only = true;
1217 break;
1218
1219 case 'N' :
1220 Consec_Non_ACGT = strtol (optarg, NULL, 10);
1221 break;
1222
1223 case 'q' :
1224 Query_Suffix = optarg;
1225 break;
1226
1227 case 'r' :
1228 Ref_Suffix = optarg;
1229 break;
1230
1231 case 'S' :
1232 Show_Differences = true;
1233 break;
1234
1235 case 't' :
1236 Tag_From_Fasta_Line = true;
1237 break;
1238
1239 case 'v' :
1240 Verbose = strtol (optarg, NULL, 10);
1241 break;
1242
1243 case 'x' :
1244 Output_Cover_Files = false;
1245 break;
1246
1247 case 'W' :
1248 Error_File_Name = optarg;
1249 break;
1250
1251 case '?' :
1252 fprintf (stderr, "Unrecognized option -%c\n", optopt);
1253
1254 default :
1255 errflg = true;
1256 }
1257
1258 if (errflg || optind != argc - 3)
1259 {
1260 Usage (argv [0]);
1261 exit (EXIT_FAILURE);
1262 }
1263
1264 Ref_File_Path = argv [optind ++];
1265
1266 Match_File_Path = argv [optind ++];
1267
1268 Gaps_File_Path = argv [optind ++];
1269
1270 return;
1271 }
1272
1273
1274
Prefix_Edit_Dist(char A[],int m,char T[],int n,int Error_Limit,int * A_End,int * T_End,int * Match_To_End,int Delta[MAX_ERRORS],int * Delta_Len,int extending)1275 int Prefix_Edit_Dist
1276 (char A [], int m, char T [], int n, int Error_Limit,
1277 int * A_End, int * T_End, int * Match_To_End,
1278 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1279
1280 // Return the minimum number of changes (inserts, deletes, replacements)
1281 // needed to match string A [0 .. (m-1)] with a prefix of string
1282 // T [0 .. (n-1)] if it's not more than Error_Limit .
1283 // Put delta description of alignment in Delta and set
1284 // (* Delta_Len) to the number of entries there if it's a complete
1285 // match.
1286 // Set A_End and T_End to the rightmost positions where the
1287 // alignment ended in A and T , respectively.
1288 // Set Match_To_End true if the match extended to the end
1289 // of at least one string; otherwise, set it false to indicate
1290 // a branch point.
1291 // extending indicates whether the match is for a fixed region (FALSE)
1292 // or is extending off the end of a match as far as possible (TRUE)
1293
1294 {
1295 double Score, Max_Score;
1296 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1297 int Best_d, Best_e, Longest, Row;
1298 int Left, Right;
1299 int d, e, i, j, shorter;
1300
1301 // assert (m <= n);
1302 Best_d = Best_e = Longest = 0;
1303 (* Delta_Len) = 0;
1304
1305 if (Consec_Non_ACGT > 0)
1306 {
1307 int ct;
1308
1309 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
1310 {
1311 switch (A [i])
1312 {
1313 case 'a' :
1314 case 'c' :
1315 case 'g' :
1316 case 't' :
1317 ct = 0;
1318 break;
1319 default :
1320 ct ++;
1321 }
1322 }
1323 if (ct >= Consec_Non_ACGT)
1324 {
1325 m = i - ct;
1326 extending = true;
1327 }
1328
1329 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
1330 {
1331 switch (T [i])
1332 {
1333 case 'a' :
1334 case 'c' :
1335 case 'g' :
1336 case 't' :
1337 ct = 0;
1338 break;
1339 default :
1340 ct ++;
1341 }
1342 }
1343 if (ct >= Consec_Non_ACGT)
1344 {
1345 n = i - ct;
1346 extending = true;
1347 }
1348 }
1349
1350 shorter = Min_int (m, n);
1351 for (Row = 0; Row < shorter && A [Row] == T [Row]; Row ++)
1352 ;
1353
1354 Edit_Array [0] [0] = Row;
1355
1356 if (Row == m && Row == n) // Exact match
1357 {
1358 (* Delta_Len) = 0;
1359 (* A_End) = (* T_End) = Row;
1360 (* Match_To_End) = ! extending;
1361 return 0;
1362 }
1363
1364 Left = Right = 0;
1365 Max_Score = Row * Branch_Pt_Match_Value;
1366 Max_Score_Len = Row;
1367 Max_Score_Best_d = 0;
1368 Max_Score_Best_e = 0;
1369
1370 for (e = 1; e <= Error_Limit; e ++)
1371 {
1372 int cutoff;
1373
1374 Left = Max_int (Left - 1, -e);
1375 Right = Min_int (Right + 1, e);
1376 Edit_Array [e - 1] [Left] = -2;
1377 Edit_Array [e - 1] [Left - 1] = -2;
1378 Edit_Array [e - 1] [Right] = -2;
1379 Edit_Array [e - 1] [Right + 1] = -2;
1380
1381 for (d = Left; d <= Right; d ++)
1382 {
1383 Row = 1 + Edit_Array [e - 1] [d];
1384 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
1385 Row = j;
1386 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1387 Row = j;
1388 while (Row < m && Row + d < n
1389 && A [Row] == T [Row + d])
1390 Row ++;
1391
1392 Edit_Array [e] [d] = Row;
1393
1394 if (Row == m && Row + d == n)
1395 {
1396 if (extending)
1397 {
1398 for (j = Left; j <= d; j ++)
1399 if (Edit_Array [e] [j] > Longest)
1400 {
1401 Best_d = j;
1402 Longest = Edit_Array [e] [j];
1403 }
1404 Score = Longest * Branch_Pt_Match_Value - e;
1405 if (Score < Max_Score)
1406 {
1407 (* A_End) = Max_Score_Len;
1408 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1409 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1410 Max_Score_Best_d, Max_Score_Best_e);
1411 (* Match_To_End) = false;
1412 return Max_Score_Best_e;
1413 }
1414 }
1415
1416 (* A_End) = Row; // One past last align position
1417 (* T_End) = Row + d;
1418 Set_Deltas (Delta, Delta_Len, Row, d, e);
1419 (* Match_To_End) = ! extending;
1420 return e;
1421 }
1422 }
1423
1424 cutoff = Longest - GIVE_UP_LEN;
1425 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
1426 Left ++;
1427 if (Left > Right)
1428 break;
1429 while (Right > Left && Edit_Array [e] [Right] < cutoff)
1430 Right --;
1431
1432 assert (Left <= Right);
1433
1434 for (d = Left; d <= Right; d ++)
1435 if (Edit_Array [e] [d] > Longest)
1436 {
1437 Best_d = d;
1438 Best_e = e;
1439 Longest = Edit_Array [e] [d];
1440 }
1441
1442 Score = Longest * Branch_Pt_Match_Value - e;
1443 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1444 if (Score > Max_Score)
1445 {
1446 Max_Score = Score;
1447 Max_Score_Len = Longest;
1448 Max_Score_Best_d = Best_d;
1449 Max_Score_Best_e = Best_e;
1450 }
1451
1452 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
1453 break;
1454 }
1455
1456 (* A_End) = Max_Score_Len;
1457 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1458 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1459 (* Match_To_End) = false;
1460
1461 return Max_Score_Best_e;
1462 }
1463
1464
1465
Read_String(FILE * fp,char ** T,long int * Size,char header[])1466 bool Read_String
1467 (FILE * fp, char * * T, long int * Size, char header [])
1468
1469 /* Read next string from fp (assuming FASTA format) into (* T) [1 ..]
1470 * which has Size characters. Allocate extra memory if needed
1471 * and adjust Size accordingly. Return TRUE if successful, FALSE
1472 * otherwise (e.g., EOF). Set header to the contents of the FASTA
1473 * header line. */
1474
1475 {
1476 char Line [MAX_LINE];
1477 long int Len;
1478 int Ch, Ct;
1479
1480 if (feof (fp))
1481 return false;
1482
1483 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
1484 ;
1485
1486 if (Ch != '>')
1487 return false;
1488
1489 if(!fgets (Line, MAX_LINE, fp)) return false;
1490 Len = strlen (Line);
1491 assert (Len > 0 && Line [Len - 1] == '\n');
1492 Line [Len - 1] = '\0';
1493 strcpy (header, Line);
1494
1495
1496 Ct = 0;
1497 Len = 0;
1498 while ((Ch = fgetc (fp)) != EOF && Ch != '>')
1499 {
1500 if (isspace (Ch))
1501 continue;
1502
1503 Ct ++;
1504
1505 if (Ct >= (* Size) - 1)
1506 {
1507 (* Size) = (long int) ((* Size) * EXPANSION_FACTOR);
1508 (* T) = (char *) Safe_realloc ((* T), (* Size));
1509 }
1510 Ch = tolower (Ch);
1511 if (! isalpha (Ch))
1512 {
1513 fprintf (stderr, "Unexpected character `%c\' in string %s\n",
1514 Ch, header);
1515 Ch = 'x';
1516 }
1517 (* T) [++ Len] = Ch;
1518 }
1519
1520 (* T) [Len + 1] = '\0';
1521 if (Ch == '>')
1522 ungetc (Ch, fp);
1523
1524 return true;
1525 }
1526
1527
1528
Rev_Complement(char * s)1529 void Rev_Complement
1530 (char * s)
1531
1532 /* Set string s to its DNA reverse complement. */
1533
1534 {
1535 char ch;
1536 int i, j, len;
1537
1538 len = strlen (s);
1539
1540 for (i = 0, j = len - 1; i < j; i ++, j --)
1541 {
1542 ch = Complement (s [i]);
1543 s [i] = Complement (s [j]);
1544 s [j] = ch;
1545 }
1546
1547 if (i == j)
1548 s [i] = Complement (s [i]);
1549
1550 return;
1551 }
1552
1553
1554
Rev_Display_Alignment(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct)1555 void Rev_Display_Alignment
1556 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
1557
1558 // Show (to stdout ) the alignment encoded in delta [0 .. (delta_ct - 1)]
1559 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
1560 // in the reverse direction.
1561
1562 {
1563 int i, j, k, m, top_len, bottom_len;
1564 char top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1565 char bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1566
1567 i = j = top_len = bottom_len = 0;
1568 for (k = 0; k < delta_ct; k ++)
1569 {
1570 for (m = 1; m < abs (delta [k]); m ++)
1571 {
1572 top [top_len ++] = a [- i ++];
1573 j ++;
1574 }
1575 if (delta [k] < 0)
1576 {
1577 top [top_len ++] = '-';
1578 j ++;
1579 }
1580 else
1581 {
1582 top [top_len ++] = a [- i ++];
1583 }
1584 }
1585 while (i < a_len && j < b_len)
1586 {
1587 top [top_len ++] = a [- i ++];
1588 j ++;
1589 }
1590 top [top_len] = '\0';
1591
1592
1593 i = j = 0;
1594 for (k = 0; k < delta_ct; k ++)
1595 {
1596 for (m = 1; m < abs (delta [k]); m ++)
1597 {
1598 bottom [bottom_len ++] = b [- j ++];
1599 i ++;
1600 }
1601 if (delta [k] > 0)
1602 {
1603 bottom [bottom_len ++] = '-';
1604 i ++;
1605 }
1606 else
1607 {
1608 bottom [bottom_len ++] = b [- j ++];
1609 }
1610 }
1611 while (j < b_len && i < a_len)
1612 {
1613 bottom [bottom_len ++] = b [- j ++];
1614 i ++;
1615 }
1616 bottom [bottom_len] = '\0';
1617
1618
1619 for (i = 0; i < top_len || i < bottom_len; i += DISPLAY_WIDTH)
1620 {
1621 putchar ('\n');
1622 printf ("A: ");
1623 for (j = 0; j < DISPLAY_WIDTH && i + j < top_len; j ++)
1624 putchar (top [i + j]);
1625 putchar ('\n');
1626 printf ("B: ");
1627 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len; j ++)
1628 putchar (bottom [i + j]);
1629 putchar ('\n');
1630 printf (" ");
1631 for (j = 0; j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
1632 j ++)
1633 if (top [i + j] != ' ' && bottom [i + j] != ' '
1634 && top [i + j] != bottom [i + j])
1635 putchar ('^');
1636 else
1637 putchar (' ');
1638 putchar ('\n');
1639 }
1640
1641 return;
1642 }
1643
1644
1645
Rev_Prefix_Edit_Dist(char A[],int m,char T[],int n,int Error_Limit,int * A_End,int * T_End,int * Match_To_End,int Delta[MAX_ERRORS],int * Delta_Len,int extending)1646 int Rev_Prefix_Edit_Dist
1647 (char A [], int m, char T [], int n, int Error_Limit,
1648 int * A_End, int * T_End, int * Match_To_End,
1649 int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1650
1651 // Return the minimum number of changes (inserts, deletes, replacements)
1652 // needed to match string A [0 .. -(m-1)] with a prefix of string
1653 // T [0 .. -(n-1)] if it's not more than Error_Limit .
1654 // Note: Match is reverse direction, right to left.
1655 // Put delta description of alignment in Delta and set
1656 // (* Delta_Len) to the number of entries there if it's a complete
1657 // match.
1658 // Set A_End and T_End to the rightmost positions where the
1659 // alignment ended in A and T , respectively.
1660 // Set Match_To_End true if the match extended to the end
1661 // of at least one string; otherwise, set it false to indicate
1662 // a branch point.
1663 // extending indicates whether the match is for a fixed region (FALSE)
1664 // or is extending off the end of a match as far as possible (TRUE)
1665
1666 {
1667 double Score, Max_Score;
1668 int Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1669 int Best_d, Best_e, Longest, Row;
1670 int Left, Right;
1671 int d, e, i, j, shorter;
1672
1673 // assert (m <= n);
1674 Best_d = Best_e = Longest = 0;
1675 (* Delta_Len) = 0;
1676
1677 if (Consec_Non_ACGT > 0)
1678 {
1679 int ct;
1680
1681 for (i = ct = 0; i < m && ct < Consec_Non_ACGT; i ++)
1682 {
1683 switch (A [- i])
1684 {
1685 case 'a' :
1686 case 'c' :
1687 case 'g' :
1688 case 't' :
1689 ct = 0;
1690 break;
1691 default :
1692 ct ++;
1693 }
1694 }
1695 if (ct >= Consec_Non_ACGT)
1696 {
1697 m = i - ct;
1698 extending = true;
1699 }
1700
1701 for (i = ct = 0; i < n && ct < Consec_Non_ACGT; i ++)
1702 {
1703 switch (T [- i])
1704 {
1705 case 'a' :
1706 case 'c' :
1707 case 'g' :
1708 case 't' :
1709 ct = 0;
1710 break;
1711 default :
1712 ct ++;
1713 }
1714 }
1715 if (ct >= Consec_Non_ACGT)
1716 {
1717 n = i - ct;
1718 extending = true;
1719 }
1720 }
1721
1722 shorter = Min_int (m, n);
1723 for (Row = 0; Row < shorter && A [- Row] == T [- Row]; Row ++)
1724 ;
1725
1726 Edit_Array [0] [0] = Row;
1727
1728 if (Row == m && Row == n) // Exact match
1729 {
1730 (* Delta_Len) = 0;
1731 (* A_End) = (* T_End) = Row;
1732 (* Match_To_End) = ! extending;
1733 return 0;
1734 }
1735
1736 Left = Right = 0;
1737 Max_Score = Row * Branch_Pt_Match_Value;
1738 Max_Score_Len = Row;
1739 Max_Score_Best_d = 0;
1740 Max_Score_Best_e = 0;
1741
1742 for (e = 1; e <= Error_Limit; e ++)
1743 {
1744 int cutoff;
1745
1746 Left = Max_int (Left - 1, -e);
1747 Right = Min_int (Right + 1, e);
1748 Edit_Array [e - 1] [Left] = -2;
1749 Edit_Array [e - 1] [Left - 1] = -2;
1750 Edit_Array [e - 1] [Right] = -2;
1751 Edit_Array [e - 1] [Right + 1] = -2;
1752
1753 for (d = Left; d <= Right; d ++)
1754 {
1755 Row = 1 + Edit_Array [e - 1] [d];
1756 if ((j = Edit_Array [e - 1] [d - 1]) > Row)
1757 Row = j;
1758 if ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1759 Row = j;
1760 while (Row < m && Row + d < n
1761 && A [- Row] == T [- Row - d])
1762 Row ++;
1763
1764 Edit_Array [e] [d] = Row;
1765
1766 if (Row == m && Row + d == n)
1767 {
1768 if (extending)
1769 {
1770 for (j = Left; j <= d; j ++)
1771 if (Edit_Array [e] [j] > Longest)
1772 {
1773 Best_d = j;
1774 Longest = Edit_Array [e] [j];
1775 }
1776 Score = Longest * Branch_Pt_Match_Value - e;
1777 if (Score < Max_Score)
1778 {
1779 (* A_End) = Max_Score_Len;
1780 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1781 Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1782 Max_Score_Best_d, Max_Score_Best_e);
1783 (* Match_To_End) = false;
1784 return Max_Score_Best_e;
1785 }
1786 }
1787
1788 (* A_End) = Row; // One past last align position
1789 (* T_End) = Row + d;
1790 Set_Deltas (Delta, Delta_Len, Row, d, e);
1791 (* Match_To_End) = ! extending;
1792 return e;
1793 }
1794 }
1795
1796 cutoff = Longest - GIVE_UP_LEN;
1797 while (Left <= Right && Edit_Array [e] [Left] < cutoff)
1798 Left ++;
1799 if (Left > Right)
1800 break;
1801 while (Right > Left && Edit_Array [e] [Right] < cutoff)
1802 Right --;
1803 assert (Left <= Right);
1804
1805 for (d = Left; d <= Right; d ++)
1806 if (Edit_Array [e] [d] > Longest)
1807 {
1808 Best_d = d;
1809 Best_e = e;
1810 Longest = Edit_Array [e] [d];
1811 }
1812
1813 Score = Longest * Branch_Pt_Match_Value - e;
1814 // Assumes Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1815 if (Score > Max_Score)
1816 {
1817 Max_Score = Score;
1818 Max_Score_Len = Longest;
1819 Max_Score_Best_d = Best_d;
1820 Max_Score_Best_e = Best_e;
1821 }
1822
1823
1824 if (Longest - Max_Score_Len >= GIVE_UP_LEN)
1825 break;
1826 }
1827
1828 (* A_End) = Max_Score_Len;
1829 (* T_End) = Max_Score_Len + Max_Score_Best_d;
1830 Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1831 (* Match_To_End) = false;
1832
1833 return Max_Score_Best_e;
1834 }
1835
1836
1837
Rev_Show_Diffs(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,long int a_ref,long int b_ref)1838 void Rev_Show_Diffs
1839 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
1840 long int a_ref, long int b_ref)
1841
1842 // Show (to stdout ) the differences
1843 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
1844 // encoded in delta [0 .. (delta_ct - 1)]
1845 // in the reverse direction. a_ref is the position of the beginning
1846 // of string a . b_ref is the offset to the start of
1847 // string b .
1848
1849 {
1850 int i, j, k, m;
1851
1852 i = j = 0;
1853 for (k = 0; k < delta_ct; k ++)
1854 {
1855 for (m = 1; m < abs (delta [k]); m ++)
1856 {
1857 if (a [- i] != b [- j])
1858 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1859 i ++;
1860 j ++;
1861 }
1862 if (delta [k] < 0)
1863 {
1864 printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]);
1865 j ++;
1866 }
1867 else
1868 {
1869 printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]);
1870 i ++;
1871 }
1872 }
1873 while (i < a_len && j < b_len)
1874 {
1875 if (a [- i] != b [- j])
1876 printf ("%8ld %8ld: %c %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1877 i ++;
1878 j ++;
1879 }
1880
1881 return;
1882 }
1883
1884
1885
Set_Deltas(int delta[],int * delta_len,int row,int d,int e)1886 void Set_Deltas
1887 (int delta [], int * delta_len, int row, int d, int e)
1888
1889 // Set delta to values that show the alignment recorded in
1890 // global Edit_Array . Set (* delta_len) to the number of
1891 // entries set. row is the length of the match in the first
1892 // string. d is the offset at the end of the match between
1893 // the first and second string. e is the number of errors.
1894
1895 {
1896 int delta_stack [MAX_ERRORS];
1897 int from, last, max;
1898 int i, j, k;
1899
1900 last = row;
1901 (* delta_len) = 0;
1902
1903 for (k = e; k > 0; k --)
1904 {
1905 from = d;
1906 max = 1 + Edit_Array [k - 1] [d];
1907 if ((j = Edit_Array [k - 1] [d - 1]) > max)
1908 {
1909 from = d - 1;
1910 max = j;
1911 }
1912 if ((j = 1 + Edit_Array [k - 1] [d + 1]) > max)
1913 {
1914 from = d + 1;
1915 max = j;
1916 }
1917 if (from == d - 1)
1918 {
1919 delta_stack [(* delta_len) ++] = max - last - 1;
1920 d --;
1921 last = Edit_Array [k - 1] [from];
1922 }
1923 else if (from == d + 1)
1924 {
1925 delta_stack [(* delta_len) ++] = last - (max - 1);
1926 d ++;
1927 last = Edit_Array [k - 1] [from];
1928 }
1929 }
1930 delta_stack [(* delta_len) ++] = last + 1;
1931
1932 k = 0;
1933 for (i = (* delta_len) - 1; i > 0; i --)
1934 delta [k ++]
1935 = abs (delta_stack [i]) * Sign (delta_stack [i - 1]);
1936 (* delta_len) --;
1937
1938 return;
1939 }
1940
1941
1942
Set_Left_Pad(char * pad[3],int r_hi,int q_hi,int mum_len,int pad_len)1943 static void Set_Left_Pad
1944 (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len)
1945
1946 // Set pad to a triple of strings, each of length pad_len ,
1947 // representing a MUM of length mum_len ending at r_hi in Ref
1948 // and q_hi in Query . Each string in pad must have been allocated
1949 // at least pad_len + 1 bytes of memory. The purpose is to serve as
1950 // the left-end padding for an alignment with this MUM as its left
1951 // boundary.
1952
1953 {
1954 int i;
1955
1956 for (i = 0; i < 3; i ++)
1957 pad [i] [pad_len] = '\0';
1958
1959 for (i = 0; i < pad_len; i ++)
1960 {
1961 pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.';
1962 pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.';
1963 pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' ';
1964 }
1965
1966 return;
1967 }
1968
1969
1970
Set_Right_Pad(char * pad[3],int r_lo,int q_lo,int mum_len,int pad_len)1971 static void Set_Right_Pad
1972 (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len)
1973
1974 // Set pad to a triple of strings, each of length pad_len ,
1975 // representing a MUM of length mum_len beginning at r_lo in Ref
1976 // and q_lo in Query . Each string in pad must have been allocated
1977 // at least pad_len + 1 bytes of memory. The purpose is to serve as
1978 // the right-end padding for an alignment with this MUM as its right
1979 // boundary.
1980
1981 {
1982 int i;
1983
1984 for (i = 0; i < 3; i ++)
1985 pad [i] [pad_len] = '\0';
1986
1987 for (i = 0; i < pad_len; i ++)
1988 {
1989 pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.';
1990 pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.';
1991 pad [2] [i] = i < mum_len ? '=' : ' ';
1992 }
1993
1994 return;
1995 }
1996
1997
1998
Show_Coverage(Cover_t * list,char * filename,char * tag,char * seq)1999 void Show_Coverage
2000 (Cover_t * list, char * filename, char * tag, char * seq)
2001
2002 // Print to stderr summary stats of regions in list .
2003 // Send to filename a list of region info suitable for celagram .
2004 // Use tag to print the headings. Regions refer to seq .
2005
2006 {
2007 FILE * fp;
2008 Cover_t * p;
2009 int ct = 0;
2010 long int i, prev_hi = 0, n_ct, len, seq_len;
2011 long int cov_ns = 0, gap_ns = 0, total_ns;
2012 double cov_total = 0.0, gap_total = 0.0;
2013
2014 fp = Local_File_Open (filename, "w");
2015 fprintf (fp, "%-9s %9s %9s %8s %8s %6s\n",
2016 "Region", "Start", "End", "Len", "N's", "%N's");
2017
2018 for (p = list; p != NULL; p = p -> next)
2019 {
2020 n_ct = 0;
2021 for (i = prev_hi + 1; i < p -> lo; i ++)
2022 if (strchr ("acgt", seq [i]) == NULL)
2023 n_ct ++;
2024 len = p -> lo - prev_hi - 1;
2025 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2026 ct, prev_hi + 1, p -> lo - 1, len, n_ct,
2027 len == 0 ? 0.0 : 100.0 * n_ct / len);
2028 gap_total += len;
2029 gap_ns += n_ct;
2030
2031 ct ++;
2032
2033 n_ct = 0;
2034 for (i = p -> lo; i <= p -> hi; i ++)
2035 if (strchr ("acgt", seq [i]) == NULL)
2036 n_ct ++;
2037 len = 1 + p -> hi - p -> lo;
2038 fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2039 ct, p -> lo, p -> hi, len, n_ct,
2040 len == 0 ? 0.0 : 100.0 * n_ct / len);
2041 cov_total += len;
2042 cov_ns += n_ct;
2043
2044 prev_hi = p -> hi;
2045 }
2046
2047 n_ct = 0;
2048 seq_len = strlen (seq + 1);
2049 for (i = prev_hi + 1; i <= seq_len; i ++)
2050 if (strchr ("acgt", seq [i]) == NULL)
2051 n_ct ++;
2052 len = seq_len - prev_hi;
2053 fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld %5.1f%%\n",
2054 ct, prev_hi + 1, seq_len, len, n_ct,
2055 len == 0 ? 0.0 : 100.0 * n_ct / len);
2056 gap_total += len;
2057 gap_ns += n_ct;
2058
2059 total_ns = cov_ns + gap_ns;
2060
2061 fclose (fp);
2062
2063 fprintf (stderr, "\n%s Sequence Coverage:\n", tag);
2064 fprintf (stderr, " Sequence length = %ld\n", seq_len);
2065 fprintf (stderr, " acgt's = %ld\n", seq_len - total_ns);
2066 fprintf (stderr, " Non acgt's = %ld\n", total_ns);
2067 fprintf (stderr, " Number of regions = %d\n", ct);
2068 fprintf (stderr, " Matched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
2069 cov_total,
2070 seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len,
2071 seq_len - total_ns == 0.0 ? 0.0 :
2072 100.0 * (cov_total - cov_ns) / (seq_len - total_ns));
2073 fprintf (stderr, " Avg match len = %.0f\n",
2074 ct == 0 ? 0.0 : cov_total / ct);
2075 fprintf (stderr, " N's in matches = %ld (%.1f%%)\n",
2076 cov_ns,
2077 cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total);
2078 fprintf (stderr, " Unmatched bases = %.0f (%.1f%%, %.1f%% of acgt's)\n",
2079 gap_total,
2080 seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len,
2081 seq_len - total_ns == 0.0 ? 0.0 :
2082 100.0 * (gap_total - gap_ns) / (seq_len - total_ns));
2083 fprintf (stderr, " Avg gap len = %.0f\n",
2084 gap_total / (1.0 + ct));
2085 fprintf (stderr, " N's in gaps = %ld (%.1f%%)\n",
2086 gap_ns,
2087 gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total);
2088
2089 return;
2090 }
2091
2092
2093
Show_Diffs(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,long int a_ref,long int b_ref)2094 void Show_Diffs
2095 (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
2096 long int a_ref, long int b_ref)
2097
2098 // Show (to stdout ) the differences
2099 // between strings a [0 .. (a_len - 1)] and b [0 .. (b_len - 1)]
2100 // encoded in delta [0 .. (delta_ct - 1)] . a_ref is the offset to
2101 // the start of string a . b_ref is the offset to the start of
2102 // string b .
2103
2104 {
2105 int i, j, k, m;
2106
2107 i = j = 0;
2108 for (k = 0; k < delta_ct; k ++)
2109 {
2110 for (m = 1; m < abs (delta [k]); m ++)
2111 {
2112 if (a [i] != b [j])
2113 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2114 i ++;
2115 j ++;
2116 }
2117 if (delta [k] < 0)
2118 {
2119 printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]);
2120 j ++;
2121 }
2122 else
2123 {
2124 printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]);
2125 i ++;
2126 }
2127 }
2128 while (i < a_len && j < b_len)
2129 {
2130 if (a [i] != b [j])
2131 printf ("%8ld %8ld: %c %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2132 i ++;
2133 j ++;
2134 }
2135
2136 return;
2137 }
2138
2139
2140
Sign(int a)2141 int Sign
2142 (int a)
2143
2144 // Return the algebraic sign of a .
2145
2146 {
2147 if (a > 0)
2148 return 1;
2149 else if (a < 0)
2150 return -1;
2151
2152 return 0;
2153 }
2154
2155
2156
Usage(char * command)2157 void Usage
2158 (char * command)
2159
2160 // Print to stderr description of options and command line for
2161 // this program. command is the command that was used to
2162 // invoke it.
2163
2164 {
2165 fprintf (stderr,
2166 "USAGE: %s <RefSequence> <MatchSequences> <GapsFile>\n"
2167 "\n"
2168 "Combines MUMs in <GapsFile> by extending matches off\n"
2169 "ends and between MUMs. <RefSequence> is a fasta file\n"
2170 "of the reference sequence. <MatchSequences> is a\n"
2171 "multi-fasta file of the sequences matched against the\n"
2172 "reference\n"
2173 "\n"
2174 "Options:\n"
2175 "-D Only output to stdout the difference positions\n"
2176 " and characters\n"
2177 "-n Allow matches only between nucleotides, i.e., ACGTs\n"
2178 "-N num Break matches at <num> or more consecutive non-ACGTs \n"
2179 "-q tag Used to label query match\n"
2180 "-r tag Used to label reference match\n"
2181 "-S Output all differences in strings\n"
2182 "-t Label query matches with query fasta header\n"
2183 "-v num Set verbose level for extra output\n"
2184 "-W file Reset the default output filename witherrors.gaps\n"
2185 "-x Don't output .cover files\n"
2186 "-e Set error-rate cutoff to e (e.g. 0.02 is two percent)\n",
2187
2188 command);
2189
2190 return;
2191 }
2192