1 /*************************************************
2 * Module:  CombineMUMs.cc
3 * Author: Art Delcher
4 * Description:
5 *   Take clusters of MUMs (from  mgaps  program) and do alignments
6 *   to combine them into matches.
7 *
8 */
9 
10 
11 #include <mummer/tigrinc.hh>
12 #include  <string>
13 using namespace std;
14 
15 
16 //  Constants
17 
18 #define  BRANCH_PT_MATCH_VALUE    0.272
19     //  Value to add for a match in finding branch points
20     //  1.20 was the calculated value for 6% vs 35% error discrimination
21     //  Converting to integers didn't make it faster
22 #define  BRANCH_PT_ERROR_VALUE    -0.728
23     //  Value to add for a mismatch in finding branch points
24     //   -2.19 was the calculated value for 6% vs 35% error discrimination
25     //  Converting to integers didn't make it faster
26 #define  DEFAULT_ERROR_FILE_NAME     "witherrors.gaps"
27     //  Name of file produced that matches the input gaps file
28     //  but with an extra column showing the number of errors in
29     //  each gap
30 #define  DEFAULT_PAD                 10
31     //  Default number of matching bases to show at beginning
32     //  and end of alignments
33 #define  EDIT_DIST_PROB_BOUND        1e-4
34     //  Probability limit to "band" edit-distance calculation
35     //  Determines  NORMAL_DISTRIB_THOLD
36 #define  ERRORS_FOR_FREE             1
37     //  The number of errors that are ignored in setting probability
38     //  bound for terminating alignment extensions in edit distance
39     //  calculations
40 #define  EXPANSION_FACTOR            1.4
41     //  Factor by which to grow memory when realloc'ing
42 #define  GIVE_UP_LEN                 200
43     //  Stop alignment when go this many bases past max-score value
44 #define  INITIAL_BUFFER_SIZE         10000
45     //  Initial number of bytes to use for sequence strings
46 #define  MATCH_SCORE                 1.0
47     //  Score to add for match in finding length of alignment
48 #define  MAX_ERROR_RATE              0.06
49     //  The largest error allowed in overlaps
50 #define  MAX_FRAG_LEN                1024
51     //  The longest fragment allowed
52 #define  MAX_ERRORS                  500
53     //  Most errors in any edit distance computation
54 #define  MAX_EXTENSION               10000
55     //  Maximum extension match out from a match
56 #define  MAX_HDR_LEN                 10000
57     //  Longest allowable fasta header line
58 #define  MAX_MEMORY_STORE            50000
59     //  The most fragments allowed in memory store
60 #define  MIN_BRANCH_END_DIST         20
61     //  Branch points must be at least this many bases from the
62     //  end of the fragment to be reported
63 #define  MIN_BRANCH_TAIL_SLOPE       0.20
64     //  Branch point tails must fall off from the max by at least
65     //  this rate
66 #define  MIN_MATCH_LEN               40
67     //  Number of bases in match region in order to count it
68 #define  MISMATCH_SCORE              -3.0
69     //  Score to add for non-match in finding length of alignment
70 #define  NORMAL_DISTRIB_THOLD        3.62
71     //  Determined by  EDIT_DIST_PROB_BOUND
72 #define  REALLY_VERBOSE              0
73     //  If  1  prints tons of stuff
74 #define  VERBOSE                     0
75     //  If  1  prints lots of stuff
76 
77 
78 
79 //  Type definitions
80 
81 typedef  struct s_Cover_t
82   {
83    long int  lo, hi;
84    struct s_Cover_t  * next;
85   }  Cover_t;
86 
87 
88 
89 //  Globals
90 
91 float Branch_Pt_Match_Value = BRANCH_PT_MATCH_VALUE;
92     // Score used for matches to locate branch points, i.e.,
93     // where alignment score peaks
94 float  Branch_Pt_Error_Value = BRANCH_PT_ERROR_VALUE;
95     // Score used for mismatches to locate branch points, i.e.,
96 int  Consec_Non_ACGT = 0;
97     // Stop alignments when encounter at least this many non-acgt characters.
98 int  * Edit_Array [MAX_ERRORS];
99     // Use for alignment calculation.  Points into  Edit_Space .
100 int  Edit_Match_Limit [MAX_ERRORS] = {0};
101     // This array [e] is the minimum value of  Edit_Array [e] [d]
102     // to be worth pursuing in edit-distance computations between guides
103 int  Edit_Space [(MAX_ERRORS + 4) * MAX_ERRORS];
104     // Memory used by alignment calculation
105 int  Error_Bound [MAX_FRAG_LEN + 1];
106     //  This array [i]  is the maximum number of errors allowed
107     //  in a match between sequences of length  i , which is
108     //  i * MAXERROR_RATE .
109 const char  * Error_File_Name = DEFAULT_ERROR_FILE_NAME;
110     // Name of file to write gaps listing with # errors in each gap
111 int  Fill_Ct = 0;
112     // Number of non-acgt bases in ref sequence
113 char  * Gaps_File_Path = NULL;
114     // Name of file produced by  mgaps  program
115 char  * Match_File_Path = NULL;
116     // Name of multifasta file of sequences to compare against the reference
117     // sequence
118 bool UserScoring = false;
119     // If TRUE, then user specified a percent ID cutoff and scoring
120     // is adjusted to enforce this in extensions
121 bool  Nucleotides_Only = false;
122     // If  TRUE , then only acgt's can match
123 bool  Only_Difference_Positions = false;
124     // If  true , then output just positions of difference instead of
125     // alignments between exact matches
126 bool  Output_Cover_Files = true;
127     // If  TRUE , output files showing coverage of each genome.
128 double  Percent_ID;
129     // Desired identity rate of matches to be found
130     // Expressed as a fraction, not really a percent
131 char  * Query = NULL;
132     // The query sequence
133 long int  Query_Len;
134     // The length of the query sequence
135 const char  * Query_Suffix = "Query";
136     // Suffix for query tag
137 char  * Ref = NULL;
138     // The reference sequence
139 char  * Ref_File_Path = NULL;
140     // Name of (single) fasta file of reference sequence
141 long int  Ref_Len;
142     // The length of the reference sequence
143 long int  Ref_Size;
144     // The size of the reference sequence buffer
145 const char  * Ref_Suffix = "Ref";
146     // Suffix for reference tag
147 bool  Show_Differences = false;
148     // If  TRUE  then show differences in all alignments
149 bool  Tag_From_Fasta_Line = false;
150     // If  TRUE  then use fasta tag from ref & query sequences as
151     // 1st & 2nd column, resp., on match line to identify matches
152 int  Verbose = 0;
153     // Controls printing of extra debuggin information
154 
155 
156 bool  Global_Debug_Flag = false;
157 
158 
159 
160 //  Functions
161 
162 void  Add_Coverage
163     (Cover_t * * list, long int lo, long int hi);
164 int  Binomial_Bound
165     (int, double, int, double);
166 void  Display_Alignment
167     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
168 void  Display_Alignment_With_Pad
169     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
170      char * left_pad [3], char * right_pad [3], int pad_len);
171 void  Display_Difference_Positions
172     (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
173      int b_incr, int delta [], int delta_ct, const char * a_tag,
174      const char * b_tag);
175 int  Extend_Backward
176     (long int * ref_lo, long int * query_lo);
177 int  Extend_Forward
178     (long int * ref_hi, long int * query_hi);
179 void  Initialize_Globals
180     (void);
181 FILE *  Local_File_Open
182     (const char * filename, const char * mode);
183 int  Max_int
184     (int a, int b);
185 int  Min_int
186     (int a, int b);
187 void  Parse_Command_Line
188     (int argc, char * argv []);
189 int  Prefix_Edit_Dist
190     (char A [], int m, char T [], int n, int Error_Limit,
191      int * A_End, int * T_End, int * Match_To_End,
192      int Delta [MAX_ERRORS], int * Delta_Len, int extending);
193 bool  Read_String
194     (FILE * fp, char * * T, long int * Size, char header []);
195 void  Rev_Complement
196     (char * s);
197 void  Rev_Display_Alignment
198     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct);
199 int  Rev_Prefix_Edit_Dist
200     (char A [], int m, char T [], int n, int Error_Limit,
201      int * A_End, int * T_End, int * Match_To_End,
202      int Delta [MAX_ERRORS], int * Delta_Len, int extending);
203 void  Rev_Show_Diffs
204     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
205      long int a_ref, long int b_ref);
206 void  Set_Deltas
207     (int delta [], int * delta_len, int row, int d, int e);
208 static void  Set_Left_Pad
209     (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len);
210 static void  Set_Right_Pad
211     (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len);
212 void  Show_Coverage
213     (Cover_t * list, char * filename, char * tag, char * seq);
214 void  Show_Diffs
215     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
216      long int a_ref, long int b_ref);
217 int  Sign
218     (int a);
219 void  Usage
220     (char * command);
221 
222 
223 
main(int argc,char * argv[])224 int  main
225     (int argc, char * argv [])
226 
227   {
228    FILE  * fp, * match_fp, * error_fp;
229    char  ref_header [MAX_HDR_LEN];
230    char  header [MAX_HDR_LEN], ref_tag [MAX_HDR_LEN], query_tag [MAX_HDR_LEN];
231    char  line [MAX_LINE], filename [MAX_LINE];
232    char  * left_pad [3], * right_pad [3];
233    long int  ref_pos, ref_lo, ref_hi;
234    long int  query_pos, query_lo, query_hi, query_size;
235    long int  match_len, prior_match_len = 0, max_len;
236    double  error = 0.0;
237    long int  total_errors = 0;
238    int  match_ct = 0;
239    double  match_total = 0.0;
240    bool  is_forward = false;
241    int  first_match = true;
242    Cover_t  * ref_cover_list = NULL;
243    Cover_t  * query_cover_list = NULL;
244    char  * p;
245    int  i;
246 
247    Parse_Command_Line  (argc, argv);
248 
249    if(UserScoring){
250      // percentID = mismatch penalty / (match bonus + mismatch penalty)
251      // or, more compactly, p = mm / (m+mm)
252      // Since we require that m+mm = 1, m=1-mm
253      // and so we get the trivial mm = p
254      //
255      // This means that Branch_Pt_Error_Value = -p
256      // and then Branch_Pt_Match_Value = 1-p
257      //
258      // Make it so:
259      Branch_Pt_Error_Value = - Percent_ID;
260      Branch_Pt_Match_Value = 1.0 - Percent_ID;
261    }
262 
263    Initialize_Globals ();
264 
265    p = strrchr (Ref_File_Path, '/');
266    if  (p == NULL)
267        strcpy (ref_tag, Ref_File_Path);
268      else
269        strcpy (ref_tag, p + 1);
270    p = strrchr (ref_tag, '.');
271    if  (p != NULL)
272        (* p) = '\0';
273    strcat (ref_tag, Ref_Suffix);
274 
275    p = strrchr (Match_File_Path, '/');
276    if  (p == NULL)
277        strcpy (query_tag, Match_File_Path);
278      else
279        strcpy (query_tag, p + 1);
280    p = strrchr (query_tag, '.');
281    if  (p != NULL)
282        (* p) = '\0';
283    strcat (query_tag, Query_Suffix);
284 
285    fp = Local_File_Open (Ref_File_Path, "r");
286    Ref_Size = 100000;
287    Ref = (char *) Safe_malloc (Ref_Size);
288 
289    assert (Read_String (fp, & Ref, & Ref_Size, ref_header));
290    if  (Tag_From_Fasta_Line)
291        strcpy (ref_tag, strtok (ref_header, " \t\n>"));
292    fclose (fp);
293 
294    Ref_Len = strlen (Ref + 1);
295    Ref = (char *) Safe_realloc (Ref, 1 + Ref_Len);
296    for  (i = 1;  i <= Ref_Len;  i ++)
297      switch  (Ref [i])
298        {
299         case  'a' :
300         case  'c' :
301         case  'g' :
302         case  't' :
303           break;
304         default :
305           if  (Nucleotides_Only)
306               Ref [i] = '2';
307           Fill_Ct ++;
308        }
309 
310    for  (i = 0;  i < 3;  i ++)
311      {
312       left_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
313       right_pad [i] = (char *) Safe_malloc (DEFAULT_PAD + 1);
314      }
315 
316    fp = Local_File_Open (Gaps_File_Path, "r");
317    match_fp = Local_File_Open (Match_File_Path, "r");
318    query_size = 100000;
319    Query = (char *) Safe_malloc (query_size);
320 
321    error_fp = Local_File_Open (Error_File_Name, "w");
322 
323    while  (fgets (line, MAX_LINE, fp) != NULL)
324      {
325       int  line_len = strlen (line);
326 
327       if  (line [0] == '>')
328           {
329            if  (! first_match)
330                {
331                 total_errors += Extend_Forward (& ref_hi, & query_hi);
332                 max_len = 1 + ref_hi - ref_lo;
333                 if  (1 + abs (query_hi - query_lo) > max_len)
334                     max_len = 1 + abs (query_hi - query_lo);
335                 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
336                 if  (! Only_Difference_Positions)
337                     printf
338                     ("Region:  %9ld .. %-9ld  %9ld .. %-9ld  %7ld / %-9ld  %5.2f%%\n",
339                         ref_lo, ref_hi,
340                         is_forward ? query_lo : 1 + Query_Len - query_lo,
341                         is_forward ? query_hi : 1 + Query_Len - query_hi,
342                         total_errors, max_len, 100.0 * error);
343 
344                 match_ct ++;
345                 match_total += 1 + ref_hi - ref_lo;
346                 total_errors = 0;
347                 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
348                 if  (is_forward)
349                     Add_Coverage (& query_cover_list, query_lo, query_hi);
350                   else
351                     Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
352                                   1 + Query_Len - query_lo);
353                }
354            if  (Tag_From_Fasta_Line)
355                {
356                 char  buff [MAX_LINE];
357                 char  * p;
358 
359                 strcpy (buff, line + 1);
360                 p = strtok (buff, " >\t\n");
361                 if  (strlen (p) > 0)
362                     strcpy (query_tag, p);
363                   else
364                     {
365                      fprintf (stderr, "No tag on line %s", line);
366                      strcpy (query_tag, "???");
367                     }
368                }
369            is_forward = ! is_forward;
370            if  (is_forward)
371                {
372 #ifndef NDEBUG
373                  bool read_header =
374 #endif
375                    Read_String (match_fp, & Query, & query_size, header);
376                 assert (read_header);
377                 Query_Len = strlen (Query + 1);
378                 if  (Nucleotides_Only)
379                     {
380                      for  (i = 1;  i <= Query_Len;  i ++)
381                        switch  (Query [i])
382                          {
383                           case  'a' :
384                           case  'c' :
385                           case  'g' :
386                           case  't' :
387                             break;
388                           default :
389                             Query [i] = '1';
390                          }
391                     }
392                }
393              else
394                Rev_Complement (Query + 1);
395            first_match = true;
396            printf ("%s", line);
397            fprintf (error_fp, "%s", line);
398           }
399       else if  (line [0] == '#')
400           {
401            if  (! first_match)
402                {
403                 total_errors += Extend_Forward (& ref_hi, & query_hi);
404                 max_len = 1 + ref_hi - ref_lo;
405                 if  (1 + abs (query_hi - query_lo) > max_len)
406                     max_len = 1 + abs (query_hi - query_lo);
407                 error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
408                 if  (! Only_Difference_Positions)
409                     printf
410                     ("Region:  %9ld .. %-9ld  %9ld .. %-9ld  %7ld / %-9ld  %5.2f%%\n",
411                         ref_lo, ref_hi,
412                         is_forward ? query_lo : 1 + Query_Len - query_lo,
413                         is_forward ? query_hi : 1 + Query_Len - query_hi,
414                         total_errors, max_len, 100.0 * error);
415 
416                 match_ct ++;
417                 match_total += 1 + ref_hi - ref_lo;
418                 total_errors = 0;
419                 Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
420                 if  (is_forward)
421                     Add_Coverage (& query_cover_list, query_lo, query_hi);
422                   else
423                     Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
424                                   1 + Query_Len - query_lo);
425                }
426            first_match = true;
427            printf ("%s", line);
428            fprintf (error_fp, "%s", line);
429           }
430         else
431           {
432 #ifndef NDEBUG
433             int tokens =
434 #endif
435               sscanf (line, "%ld %ld %ld", & ref_pos, & query_pos, & match_len);
436             assert(tokens == 3);
437            if  (first_match)
438                {
439                 ref_lo = ref_pos;
440                 query_lo = query_pos;
441                 total_errors += Extend_Backward (& ref_lo, & query_lo);
442                 if  (! Only_Difference_Positions)
443                     printf ("%s", line);
444                 if  (line_len > 0)
445                     line [line_len - 1] = '\0';
446                 fprintf (error_fp, "%s %7s\n", line, "-");
447                }
448              else
449                {
450                 int  errors;
451                 int  a_len, b_len;
452                 int  a_end, b_end, match_to_end;
453                 int  delta [MAX_ERRORS], delta_len;
454 
455                 a_len = ref_pos - ref_hi - 1;
456                 b_len = query_pos - query_hi - 1;
457 
458                 errors = Prefix_Edit_Dist
459                            (Ref + ref_hi + 1, a_len,
460                             Query + query_hi + 1, b_len,
461                             MAX_ERRORS - 1, & a_end, & b_end,
462                             & match_to_end, delta, & delta_len, false);
463                 if  (Show_Differences)
464                     Show_Diffs (Ref + ref_hi + 1, a_end,
465                                 Query + query_hi + 1, b_end,
466                                 delta, delta_len, ref_hi + 1, query_hi + 1);
467 
468                 if  (! Only_Difference_Positions)
469                     printf ("%s", line);
470 
471                 Set_Left_Pad (left_pad, ref_hi, query_hi, prior_match_len,
472                      DEFAULT_PAD);
473                 Set_Right_Pad (right_pad, ref_pos, query_pos, match_len,
474                      DEFAULT_PAD);
475 
476                 if  (a_end == 0 && b_end == 0)
477                     {
478                      if  (! Only_Difference_Positions)
479                          printf ("   No extension\n");
480 
481                      if  (line_len > 0)
482                          line [line_len - 1] = '\0';
483                      fprintf (error_fp, "%s %7s\n", line, "-");
484                     }
485                 else if  (Only_Difference_Positions)
486                     {
487                      int  q_start, b_incr;
488 
489                      if  (is_forward)
490                          {
491                           q_start = query_hi + 1;
492                           b_incr = 1;
493                          }
494                        else
495                          {
496                           q_start = Query_Len - query_hi;
497                           b_incr = -1;
498                          }
499                      Display_Difference_Positions
500                           (Ref + ref_hi + 1, ref_hi + 1, a_end,
501                            Query + query_hi + 1, q_start, b_end,
502                            b_incr, delta, delta_len, ref_tag,
503                            query_tag);
504                     }
505                   else
506                     {
507                      printf ("     Errors = %d\n", errors);
508                      if  (! Only_Difference_Positions)
509                          Display_Alignment_With_Pad
510                               (Ref + ref_hi + 1, a_end,
511                                Query + query_hi + 1, b_end,
512                                delta, delta_len, left_pad, right_pad,
513                                DEFAULT_PAD);
514                      if  (line_len > 0)
515                          line [line_len - 1] = '\0';
516                      fprintf (error_fp, "%s %7d\n", line, errors);
517                     }
518 
519                 total_errors += errors;
520 
521                 if  (! match_to_end)
522                     {
523                      ref_hi += a_end;
524                      query_hi += b_end;
525                      max_len = 1 + ref_hi - ref_lo;
526                      if  (1 + abs (query_hi - query_lo) > max_len)
527                          max_len = 1 + abs (query_hi - query_lo);
528                      error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
529                      if  (! Only_Difference_Positions)
530                          printf (
531                         "Region:  %9ld .. %-9ld  %9ld .. %-9ld  %7ld / %-9ld  %5.2f%%\n",
532                              ref_lo, ref_hi,
533                              is_forward ? query_lo : 1 + Query_Len - query_lo,
534                              is_forward ? query_hi : 1 + Query_Len - query_hi,
535                              total_errors, max_len, 100.0 * error);
536 
537                      match_ct ++;
538                      match_total += 1 + ref_hi - ref_lo;
539                      total_errors = 0;
540                      Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
541                      if  (is_forward)
542                          Add_Coverage (& query_cover_list, query_lo, query_hi);
543                        else
544                          Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
545                                        1 + Query_Len - query_lo);
546                      ref_lo = ref_pos;
547                      query_lo = query_pos;
548                      total_errors += Extend_Backward (& ref_lo, & query_lo);
549                     }
550                }
551 
552            ref_hi = ref_pos + match_len - 1;
553            query_hi = query_pos + match_len - 1;
554            prior_match_len = match_len;
555            first_match = false;
556           }
557      }
558 
559    if  (! first_match)
560        {
561         total_errors += Extend_Forward (& ref_hi, & query_hi);
562         max_len = 1 + ref_hi - ref_lo;
563         if  (1 + abs (query_hi - query_lo) > max_len)
564             max_len = 1 + abs (query_hi - query_lo);
565         error = (max_len == 0.0 ? 0.0 : (double) total_errors / max_len);
566         if  (! Only_Difference_Positions)
567             printf
568                  ("Region:  %9ld .. %-9ld  %9ld .. %-9ld  %7ld / %-9ld  %5.2f%%\n",
569                   ref_lo, ref_hi,
570                   is_forward ? query_lo : 1 + Query_Len - query_lo,
571                   is_forward ? query_hi : 1 + Query_Len - query_hi,
572                   total_errors, max_len, 100.0 * error);
573 
574         match_ct ++;
575         match_total += 1 + ref_hi - ref_lo;
576         Add_Coverage (& ref_cover_list, ref_lo, ref_hi);
577         if  (is_forward)
578             Add_Coverage (& query_cover_list, query_lo, query_hi);
579           else
580             Add_Coverage (& query_cover_list, 1 + Query_Len - query_hi,
581                           1 + Query_Len - query_lo);
582        }
583 
584    fprintf (stderr, "           Ref len = %ld\n", Ref_Len);
585    fprintf (stderr, "            acgt's = %ld\n", Ref_Len - Fill_Ct);
586    fprintf (stderr, "        Non acgt's = %d\n", Fill_Ct);
587    fprintf (stderr, " Number of matches = %d\n", match_ct);
588    fprintf (stderr, "Sum of match bases = %.0f\n", match_total);
589    fprintf (stderr, "   Avg match bases = %.0f\n",
590             match_ct == 0 ? 0.0 : match_total / match_ct);
591 
592    fclose (error_fp);
593 
594    if  (Output_Cover_Files)
595        {
596         strcpy (filename, ref_tag);
597         strcat (filename, ".cover");
598         Show_Coverage (ref_cover_list, filename, ref_tag, Ref);
599 
600         strcpy (filename, query_tag);
601         strcat (filename, ".cover");
602         Rev_Complement (Query + 1);
603         Show_Coverage (query_cover_list, filename, query_tag, Query);
604        }
605 
606    return  0;
607   }
608 
609 
610 
Add_Coverage(Cover_t ** list,long int lo,long int hi)611 void  Add_Coverage
612     (Cover_t * * list, long int lo, long int hi)
613 
614 //  Add  lo .. hi  to list of regions covered in  (* list) .
615 //  Combine nodes when appropriate.
616 
617   {
618    Cover_t  * new_node, * p, * prev = NULL;
619 
620    if  ((* list) == NULL || hi + 1 < (* list) -> lo)
621        {
622         new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
623         new_node -> lo = lo;
624         new_node -> hi = hi;
625         new_node -> next = (* list);
626         (* list) = new_node;
627         return;
628        }
629 
630    for  (p = (* list);  p != NULL && lo - 1 > p -> hi;  p = p -> next)
631      prev = p;
632 
633    if  (p == NULL || hi + 1 < p -> lo)
634        {  // insert between or on end
635         assert (prev != NULL);
636         new_node = (Cover_t *) Safe_malloc (sizeof (Cover_t));
637         new_node -> lo = lo;
638         new_node -> hi = hi;
639         new_node -> next = p;
640         prev -> next = new_node;
641         return;
642        }
643 
644    if  (lo < p -> lo)
645        p -> lo = lo;
646    while  (p -> next != NULL && hi + 1 >= p -> next -> lo)
647      {
648       Cover_t  * save;
649 
650       p -> hi = p -> next -> hi;
651       save = p -> next;
652       p -> next = save -> next;
653       free (save);
654      }
655    if  (hi > p -> hi)
656        p -> hi = hi;
657 
658    return;
659   }
660 
661 
662 
Binomial_Bound(int e,double p,int Start,double Limit)663 int  Binomial_Bound
664     (int e, double p, int Start, double Limit)
665 
666 //  Return the smallest  n >= Start  s.t.
667 //    prob [>= e  errors in  n  binomial trials (p = error prob)]
668 //          > Limit
669 
670   {
671    double  Normal_Z, Mu_Power, Factorial, Poisson_Coeff;
672    double  q, Sum, P_Power, Q_Power, X;
673    int  k, n, Bin_Coeff, Ct;
674 
675    q = 1.0 - p;
676    if  (Start < e)
677        Start = e;
678 
679    for  (n = Start;  n < MAX_FRAG_LEN;  n ++)
680      {
681       if  (n <= 35)
682           {
683            Sum = 0.0;
684            Bin_Coeff = 1;
685            Ct = 0;
686            P_Power = 1.0;
687            Q_Power = pow (q, (double) n);
688 
689            for  (k = 0;  k < e && 1.0 - Sum > Limit;  k ++)
690              {
691               X = Bin_Coeff * P_Power * Q_Power;
692               Sum += X;
693               Bin_Coeff *= n - Ct;
694               Bin_Coeff /= ++ Ct;
695               P_Power *= p;
696               Q_Power /= q;
697              }
698            if  (1.0 - Sum > Limit)
699                return  n;
700           }
701         else
702           {
703            Normal_Z = (e - 0.5 - n * p) / sqrt ((double)(n * p * q));
704            if  (Normal_Z <= NORMAL_DISTRIB_THOLD)
705                return  n;
706            Sum = 0.0;
707            Mu_Power = 1.0;
708            Factorial = 1.0;
709            Poisson_Coeff = exp (- (double) n * p);
710            for  (k = 0;  k < e;  k ++)
711              {
712               Sum += Mu_Power * Poisson_Coeff / Factorial;
713               Mu_Power *= n * p;
714               Factorial *= k + 1;
715              }
716            if  (1.0 - Sum > Limit)
717                return  n;
718           }
719      }
720 
721    return  MAX_FRAG_LEN;
722   }
723 
724 
725 
726 #define  DISPLAY_WIDTH   60
727 
Display_Alignment(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct)728 void  Display_Alignment
729     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
730 
731 //  Show (to  stdout ) the alignment encoded in  delta [0 .. (delta_ct - 1)]
732 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)] .
733 
734   {
735    int  i, j, k, m, top_len, bottom_len;
736    char  top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
737    char  bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
738 
739    i = j = top_len = bottom_len = 0;
740    for  (k = 0;  k < delta_ct;  k ++)
741      {
742       for  (m = 1;  m < abs (delta [k]);  m ++)
743         {
744          top [top_len ++] = a [i ++];
745          j ++;
746         }
747       if  (delta [k] < 0)
748           {
749            top [top_len ++] = '-';
750            j ++;
751           }
752         else
753           {
754            top [top_len ++] = a [i ++];
755           }
756      }
757    while  (i < a_len && j < b_len)
758      {
759       top [top_len ++] = a [i ++];
760       j ++;
761      }
762    top [top_len] = '\0';
763 
764 
765    i = j = 0;
766    for  (k = 0;  k < delta_ct;  k ++)
767      {
768       for  (m = 1;  m < abs (delta [k]);  m ++)
769         {
770          bottom [bottom_len ++] = b [j ++];
771          i ++;
772         }
773       if  (delta [k] > 0)
774           {
775            bottom [bottom_len ++] = '-';
776            i ++;
777           }
778         else
779           {
780            bottom [bottom_len ++] = b [j ++];
781           }
782      }
783    while  (j < b_len && i < a_len)
784      {
785       bottom [bottom_len ++] = b [j ++];
786       i ++;
787      }
788    bottom [bottom_len] = '\0';
789 
790 
791    for  (i = 0;  i < top_len || i < bottom_len;  i += DISPLAY_WIDTH)
792      {
793       printf ("A: ");
794       for  (j = 0;  j < DISPLAY_WIDTH && i + j < top_len;  j ++)
795         putchar (top [i + j]);
796       putchar ('\n');
797       printf ("B: ");
798       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len;  j ++)
799         putchar (bottom [i + j]);
800       putchar ('\n');
801       printf ("   ");
802       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
803                 j ++)
804         if  (top [i + j] != ' ' && bottom [i + j] != ' '
805                  && top [i + j] != bottom [i + j])
806             putchar ('^');
807           else
808             putchar (' ');
809       putchar ('\n');
810      }
811 
812    return;
813   }
814 
815 
816 
Display_Alignment_With_Pad(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,char * left_pad[3],char * right_pad[3],int pad_len)817 void  Display_Alignment_With_Pad
818     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
819      char * left_pad [3], char * right_pad [3], int pad_len)
820 
821 //  Show (to  stdout ) the alignment encoded in  delta [0 .. (delta_ct - 1)]
822 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)] .
823 //  Attach pad strings in  left_pad  to the beginning of the alignment
824 //  and the strings in  right_pad  to the end of the alignment.
825 //   pad_len  is the width of the pad strings
826 
827   {
828    int  i, j, k, m, top_len, bottom_len;
829 #if  0
830    char  top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
831    char  bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
832 #else
833    string  top, bottom;
834 #endif
835 
836    for  (k = 0;  k < pad_len;  k ++)
837      {
838       top . push_back (left_pad [0] [k]);
839       bottom . push_back (left_pad [1] [k]);
840      }
841 
842    i = j = 0;
843    for  (k = 0;  k < delta_ct;  k ++)
844      {
845       for  (m = 1;  m < abs (delta [k]);  m ++)
846         {
847          top . push_back (a [i ++]);
848          j ++;
849         }
850       if  (delta [k] < 0)
851           {
852            top . push_back ('-');
853            j ++;
854           }
855         else
856           {
857            top . push_back (a [i ++]);
858           }
859      }
860    while  (i < a_len && j < b_len)
861      {
862       top . push_back (a [i ++]);
863       j ++;
864      }
865 
866    i = j = 0;
867    for  (k = 0;  k < delta_ct;  k ++)
868      {
869       for  (m = 1;  m < abs (delta [k]);  m ++)
870         {
871          bottom . push_back (b [j ++]);
872          i ++;
873         }
874       if  (delta [k] > 0)
875           {
876            bottom . push_back ('-');
877            i ++;
878           }
879         else
880           {
881            bottom . push_back (b [j ++]);
882           }
883      }
884    while  (j < b_len && i < a_len)
885      {
886       bottom . push_back (b [j ++]);
887       i ++;
888      }
889 
890    for  (k = 0;  k < pad_len;  k ++)
891      {
892       top . push_back (right_pad [0] [k]);
893       bottom . push_back (right_pad [1] [k]);
894      }
895 
896    top_len = top . length ();
897    bottom_len = bottom . length ();
898 
899    if  (top_len != bottom_len)
900        {
901         fprintf (stderr, "ERROR:  Alignment length mismatch  top = %d  bottom = %d\n",
902              top_len, bottom_len);
903         exit (EXIT_FAILURE);
904        }
905 
906    for  (i = 0;  i < top_len || i < bottom_len;  i += DISPLAY_WIDTH)
907      {
908       printf ("A: ");
909       for  (j = 0;  j < DISPLAY_WIDTH && i + j < top_len;  j ++)
910         putchar (top [i + j]);
911       putchar ('\n');
912       printf ("B: ");
913       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len;  j ++)
914         putchar (bottom [i + j]);
915       putchar ('\n');
916       printf ("   ");
917       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
918                 j ++)
919         if  (i + j < pad_len)
920             putchar (left_pad [2] [i + j]);
921         else if  (top_len - i - j <= pad_len)
922             putchar (right_pad [2] [i + j - (top_len - pad_len)]);
923         else if  (top [i + j] != ' ' && bottom [i + j] != ' '
924                  && top [i + j] != bottom [i + j])
925             putchar ('^');
926           else
927             putchar (' ');
928       putchar ('\n');
929      }
930 
931    return;
932   }
933 
934 
935 
Display_Difference_Positions(char * a,int a_start,int a_len,char * b,int b_start,int b_len,int b_incr,int delta[],int delta_ct,const char * a_tag,const char * b_tag)936 void  Display_Difference_Positions
937     (char * a, int a_start, int a_len, char * b, int b_start, int b_len,
938      int b_incr, int delta [], int delta_ct, const char * a_tag,
939      const char * b_tag)
940 
941 //  Show (to  stdout ) the positions indicated in  delta [0 .. (delta_ct - 1)]
942 //  between strings  a [0 .. (a_len - 1)]  and
943 //   b [0 .. (b_len - 1))] .  The positions of the starts of the strings
944 //  are at  a_start  and  b_start , respectively, and  b_incr  indicates
945 //  if the b postions increase or decrease.   b_incr  must be 1 or -1.
946 //  Use  a_tag  and  b_tag  to label the positions.
947 
948   {
949    int  i, j, k, m;
950 
951    assert (b_incr == 1 || b_incr == -1);
952 
953    i = j = 0;
954    for  (k = 0;  k < delta_ct;  k ++)
955      {
956       for  (m = 1;  m < abs (delta [k]);  m ++)
957         {
958          if  (a [i] != b [j])
959              printf ("%-15s %-15s %8d %8d  %c  %c\n",
960                   a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
961          i ++;
962          j ++;
963         }
964       if  (delta [k] < 0)
965           {
966            printf ("%-15s %-15s %8d %8d  %c  %c\n",
967                 a_tag, b_tag, a_start + i, b_start + j * b_incr, '-', b [j]);
968            j ++;
969           }
970         else
971           {
972            if  (b_incr > 0)
973                printf ("%-15s %-15s %8d %8d  %c  %c\n",
974                     a_tag, b_tag, a_start + i, b_start + j, a [i], '-');
975              else
976                printf ("%-15s %-15s %8d %8d  %c  %c\n",
977                     a_tag, b_tag, a_start + i, b_start - j + 1, a [i], '-');
978            i ++;
979           }
980      }
981    while  (i < a_len && j < b_len)
982      {
983       if  (a [i] != b [j])
984           printf ("%-15s %-15s %8d %8d  %c  %c\n",
985                a_tag, b_tag, a_start + i, b_start + j * b_incr, a [i], b [j]);
986       i ++;
987       j ++;
988      }
989 
990    return;
991   }
992 
993 
994 
Extend_Backward(long int * ref_lo,long int * query_lo)995 int  Extend_Backward
996     (long int * ref_lo, long int * query_lo)
997 
998 //  Do edit distance off end of match beginning at  (* ref_lo)  and
999 //  (* query_lo)  in the reference and query sequences, resp.,
1000 //  in the reverse direction.
1001 //  Return the number of errors in the extension and set  (* ref_hi)
1002 //  and  (* query_hi)  to the end of the extension.
1003 
1004   {
1005    int  ct = 0, errors, sum = 0;
1006    int  a_len, b_len;
1007    int  a_end, b_end, match_to_end;
1008    int  delta [MAX_ERRORS], delta_len;
1009 
1010    do
1011      {
1012       a_len = (* ref_lo) - 1;
1013       if  (a_len > MAX_EXTENSION)
1014           a_len = MAX_EXTENSION;
1015       b_len = (* query_lo) - 1;
1016       if  (b_len > MAX_EXTENSION)
1017           b_len = MAX_EXTENSION;
1018 
1019       errors = Rev_Prefix_Edit_Dist
1020                  (Ref + (* ref_lo) - 1, a_len,
1021                   Query + (* query_lo) - 1, b_len,
1022                   MAX_ERRORS - 1, & a_end, & b_end,
1023                   & match_to_end, delta, & delta_len, true);
1024       if  (Show_Differences)
1025           Rev_Show_Diffs
1026               (Ref + (* ref_lo - 1), a_end,
1027                Query + (* query_lo - 1), b_end,
1028                delta, delta_len, (* ref_lo) - 1, (* query_lo) - 1);
1029       if  (Verbose > 0)
1030           {
1031            printf ("revextend#%d  %9ld %9ld  %5d %5d  %3d  %c  %4d %4d\n",
1032                    ++ ct, (* ref_lo) - 1, (* query_lo) - 1, a_len, b_len,
1033                    errors, match_to_end ? 'T' : 'F', a_end, b_end);
1034            Rev_Display_Alignment
1035                (Ref + (* ref_lo) - 1, a_end, Query + (* query_lo) - 1, b_end,
1036                 delta, delta_len);
1037           }
1038 
1039       (* ref_lo) -= a_end;
1040       (* query_lo) -= b_end;
1041       sum += errors;
1042      }  while  (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1043 
1044    return  sum;
1045   }
1046 
1047 
1048 
Extend_Forward(long int * ref_hi,long int * query_hi)1049 int  Extend_Forward
1050     (long int * ref_hi, long int * query_hi)
1051 
1052 //  Do edit distance off end of match ending at  (* ref_hi)  and
1053 //  (* query_hi)  in the reference and query sequences, resp.
1054 //  Return the number of errors in the extension and set  (* ref_hi)
1055 //  and  (* query_hi)  to the end of the extension.
1056 
1057   {
1058    int  ct = 0, errors, sum = 0;
1059    int  a_end, b_end, match_to_end;
1060    int  a_len, b_len;
1061    int  delta [MAX_ERRORS], delta_len;
1062 
1063    do
1064      {
1065       a_len = Ref_Len - (* ref_hi);
1066       if  (a_len > MAX_EXTENSION)
1067           a_len = MAX_EXTENSION;
1068       b_len = Query_Len - (* query_hi);
1069       if  (b_len > MAX_EXTENSION)
1070           b_len = MAX_EXTENSION;
1071 
1072       errors = Prefix_Edit_Dist
1073                  (Ref + (* ref_hi) + 1, a_len,
1074                   Query + (* query_hi) + 1, b_len,
1075                   MAX_ERRORS - 1, & a_end, & b_end,
1076                   & match_to_end, delta, & delta_len, true);
1077       if  (Show_Differences)
1078           Show_Diffs (Ref + (* ref_hi) + 1, a_end,
1079                       Query + (* query_hi) + 1, b_end,
1080                       delta, delta_len, (* ref_hi) + 1, (* query_hi) + 1);
1081       if  (Verbose > 0)
1082           {
1083            printf ("extend#%d  %9ld %9ld  %5d %5d  %3d  %c  %4d %4d\n",
1084                    ++ ct, (* ref_hi) + 1, (* query_hi) + 1, a_len, b_len,
1085                    errors, match_to_end ? 'T' : 'F', a_end, b_end);
1086            Display_Alignment (Ref + (* ref_hi) + 1, a_end,
1087                               Query + (* query_hi) + 1, b_end,
1088                               delta, delta_len);
1089           }
1090 
1091       (* ref_hi) += a_end;
1092       (* query_hi) += b_end;
1093       sum += errors;
1094      }  while  (a_end > 0.9 * MAX_EXTENSION || b_end > MAX_EXTENSION);
1095 
1096    return  sum;
1097   }
1098 
1099 
Initialize_Globals(void)1100 void  Initialize_Globals
1101     (void)
1102 
1103 //  Initialize global variables used in this program
1104 
1105   {
1106    int  i, offset, del;
1107    int  e, start;
1108 
1109    offset = 2;
1110    del = 6;
1111    for  (i = 0;  i < MAX_ERRORS;  i ++)
1112      {
1113        Edit_Array [i] = Edit_Space + offset;
1114        offset += del;
1115        del += 2;
1116      }
1117 
1118 
1119    for  (i = 0;  i <= ERRORS_FOR_FREE;  i ++)
1120      Edit_Match_Limit [i] = 0;
1121 
1122    start = 1;
1123    for  (e = ERRORS_FOR_FREE + 1;  e < MAX_ERRORS;  e ++)
1124      {
1125       start = Binomial_Bound (e - ERRORS_FOR_FREE, MAX_ERROR_RATE,
1126                   start, EDIT_DIST_PROB_BOUND);
1127       Edit_Match_Limit [e] = start - 1;
1128       assert (Edit_Match_Limit [e] >= Edit_Match_Limit [e - 1]);
1129      }
1130 
1131    for  (i = 0;  i <= MAX_FRAG_LEN;  i ++)
1132      Error_Bound [i] = Max_int (10, 1 + (int) (2 * i * MAX_ERROR_RATE));
1133 
1134    return;
1135   }
1136 
1137 
1138 
Local_File_Open(const char * filename,const char * mode)1139 FILE *  Local_File_Open
1140     (const char * filename, const char * mode)
1141 
1142 /* Open  Filename  in  Mode  and return a pointer to its control
1143 *  block.  If fail, print a message and exit. */
1144 
1145   {
1146    FILE  *  fp;
1147 
1148    fp = fopen (filename, mode);
1149    if  (fp == NULL)
1150        {
1151         fprintf (stderr, "ERROR %d:  Could not open file  %s \n",
1152                  errno, filename);
1153         perror (strerror (errno));
1154         exit (EXIT_FAILURE);
1155        }
1156 
1157    return  fp;
1158   }
1159 
1160 
1161 
Max_int(int a,int b)1162 int  Max_int
1163     (int a, int b)
1164 
1165 //  Return the larger of  a  and  b .
1166 
1167   {
1168    if  (a < b)
1169        return  b;
1170 
1171    return  a;
1172   }
1173 
1174 
1175 
Min_int(int a,int b)1176 int  Min_int
1177     (int a, int b)
1178 
1179 //  Return the smaller of  a  and  b .
1180 
1181   {
1182    if  (a < b)
1183        return  a;
1184 
1185    return  b;
1186   }
1187 
1188 
1189 
Parse_Command_Line(int argc,char * argv[])1190 void  Parse_Command_Line
1191     (int argc, char * argv [])
1192 
1193 //  Get options and parameters from command line with  argc
1194 //  arguments in  argv [0 .. (argc - 1)] .
1195 
1196   {
1197     int  ch;
1198     bool errflg = false;
1199 
1200    optarg = NULL;
1201 
1202    while  (! errflg
1203              && ((ch = getopt (argc, argv, "De:nN:q:r:Stv:xW:")) != EOF))
1204      switch  (ch)
1205        {
1206         case  'D' :
1207           Only_Difference_Positions = true;
1208           break;
1209 
1210         case 'e' :
1211 	  Percent_ID = 1.0 - atof (optarg);
1212 	  UserScoring = true;
1213 	  break;
1214 
1215         case  'n' :
1216           Nucleotides_Only = true;
1217           break;
1218 
1219         case  'N' :
1220           Consec_Non_ACGT = strtol (optarg, NULL, 10);
1221           break;
1222 
1223         case  'q' :
1224           Query_Suffix = optarg;
1225           break;
1226 
1227         case  'r' :
1228           Ref_Suffix = optarg;
1229           break;
1230 
1231         case  'S' :
1232           Show_Differences = true;
1233           break;
1234 
1235         case  't' :
1236           Tag_From_Fasta_Line = true;
1237           break;
1238 
1239         case  'v' :
1240           Verbose = strtol (optarg, NULL, 10);
1241           break;
1242 
1243         case  'x' :
1244           Output_Cover_Files = false;
1245           break;
1246 
1247         case  'W' :
1248           Error_File_Name = optarg;
1249           break;
1250 
1251         case  '?' :
1252           fprintf (stderr, "Unrecognized option -%c\n", optopt);
1253 
1254         default :
1255           errflg = true;
1256        }
1257 
1258    if  (errflg || optind != argc - 3)
1259        {
1260         Usage (argv [0]);
1261         exit (EXIT_FAILURE);
1262        }
1263 
1264    Ref_File_Path = argv [optind ++];
1265 
1266    Match_File_Path = argv [optind ++];
1267 
1268    Gaps_File_Path = argv [optind ++];
1269 
1270    return;
1271   }
1272 
1273 
1274 
Prefix_Edit_Dist(char A[],int m,char T[],int n,int Error_Limit,int * A_End,int * T_End,int * Match_To_End,int Delta[MAX_ERRORS],int * Delta_Len,int extending)1275 int  Prefix_Edit_Dist
1276     (char A [], int m, char T [], int n, int Error_Limit,
1277      int * A_End, int * T_End, int * Match_To_End,
1278      int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1279 
1280 //  Return the minimum number of changes (inserts, deletes, replacements)
1281 //  needed to match string  A [0 .. (m-1)]  with a prefix of string
1282 //   T [0 .. (n-1)]  if it's not more than  Error_Limit .
1283 //  Put delta description of alignment in  Delta  and set
1284 //  (* Delta_Len)  to the number of entries there if it's a complete
1285 //  match.
1286 //  Set  A_End  and  T_End  to the rightmost positions where the
1287 //  alignment ended in  A  and  T , respectively.
1288 //  Set  Match_To_End  true if the match extended to the end
1289 //  of at least one string; otherwise, set it false to indicate
1290 //  a branch point.
1291 //  extending  indicates whether the match is for a fixed region (FALSE)
1292 //  or is extending off the end of a match as far as possible (TRUE)
1293 
1294   {
1295    double  Score, Max_Score;
1296    int  Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1297    int  Best_d, Best_e, Longest, Row;
1298    int  Left, Right;
1299    int  d, e, i, j, shorter;
1300 
1301 //   assert (m <= n);
1302    Best_d = Best_e = Longest = 0;
1303    (* Delta_Len) = 0;
1304 
1305    if  (Consec_Non_ACGT > 0)
1306        {
1307         int  ct;
1308 
1309         for  (i = ct = 0;  i < m && ct < Consec_Non_ACGT;  i ++)
1310           {
1311            switch  (A [i])
1312              {
1313               case  'a' :
1314               case  'c' :
1315               case  'g' :
1316               case  't' :
1317                 ct = 0;
1318                 break;
1319               default :
1320                 ct ++;
1321              }
1322           }
1323         if  (ct >= Consec_Non_ACGT)
1324             {
1325              m = i - ct;
1326              extending = true;
1327             }
1328 
1329         for  (i = ct = 0;  i < n && ct < Consec_Non_ACGT;  i ++)
1330           {
1331            switch  (T [i])
1332              {
1333               case  'a' :
1334               case  'c' :
1335               case  'g' :
1336               case  't' :
1337                 ct = 0;
1338                 break;
1339               default :
1340                 ct ++;
1341              }
1342           }
1343         if  (ct >= Consec_Non_ACGT)
1344             {
1345              n = i - ct;
1346              extending = true;
1347             }
1348        }
1349 
1350    shorter = Min_int (m, n);
1351    for  (Row = 0;  Row < shorter && A [Row] == T [Row];  Row ++)
1352      ;
1353 
1354    Edit_Array [0] [0] = Row;
1355 
1356    if  (Row == m && Row == n)      // Exact match
1357        {
1358         (* Delta_Len) = 0;
1359         (* A_End) = (* T_End) = Row;
1360         (* Match_To_End) = ! extending;
1361         return  0;
1362        }
1363 
1364    Left = Right = 0;
1365    Max_Score = Row * Branch_Pt_Match_Value;
1366    Max_Score_Len = Row;
1367    Max_Score_Best_d = 0;
1368    Max_Score_Best_e = 0;
1369 
1370    for  (e = 1;  e <= Error_Limit;  e ++)
1371      {
1372       int  cutoff;
1373 
1374       Left = Max_int (Left - 1, -e);
1375       Right = Min_int (Right + 1, e);
1376       Edit_Array [e - 1] [Left] = -2;
1377       Edit_Array [e - 1] [Left - 1] = -2;
1378       Edit_Array [e - 1] [Right] = -2;
1379       Edit_Array [e - 1] [Right + 1] = -2;
1380 
1381       for  (d = Left;  d <= Right;  d ++)
1382         {
1383          Row = 1 + Edit_Array [e - 1] [d];
1384          if  ((j = Edit_Array [e - 1] [d - 1]) > Row)
1385              Row = j;
1386          if  ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1387              Row = j;
1388          while  (Row < m && Row + d < n
1389                   && A [Row] == T [Row + d])
1390            Row ++;
1391 
1392          Edit_Array [e] [d] = Row;
1393 
1394          if  (Row == m && Row + d == n)
1395              {
1396               if  (extending)
1397                   {
1398                    for  (j = Left;  j <= d;  j ++)
1399                      if  (Edit_Array [e] [j] > Longest)
1400                          {
1401                           Best_d = j;
1402                           Longest = Edit_Array [e] [j];
1403                          }
1404                    Score = Longest * Branch_Pt_Match_Value - e;
1405                    if  (Score < Max_Score)
1406                        {
1407                         (* A_End) = Max_Score_Len;
1408                         (* T_End) = Max_Score_Len + Max_Score_Best_d;
1409                         Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1410                                     Max_Score_Best_d, Max_Score_Best_e);
1411                         (* Match_To_End) = false;
1412                         return  Max_Score_Best_e;
1413                        }
1414                   }
1415 
1416               (* A_End) = Row;           // One past last align position
1417               (* T_End) = Row + d;
1418               Set_Deltas (Delta, Delta_Len, Row, d, e);
1419               (* Match_To_End) = ! extending;
1420               return  e;
1421              }
1422         }
1423 
1424       cutoff = Longest - GIVE_UP_LEN;
1425       while  (Left <= Right && Edit_Array [e] [Left] < cutoff)
1426         Left ++;
1427       if  (Left > Right)
1428           break;
1429       while  (Right > Left && Edit_Array [e] [Right] < cutoff)
1430         Right --;
1431 
1432       assert (Left <= Right);
1433 
1434       for  (d = Left;  d <= Right;  d ++)
1435         if  (Edit_Array [e] [d] > Longest)
1436             {
1437              Best_d = d;
1438              Best_e = e;
1439              Longest = Edit_Array [e] [d];
1440             }
1441 
1442       Score = Longest * Branch_Pt_Match_Value - e;
1443                // Assumes  Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1444       if  (Score > Max_Score)
1445           {
1446            Max_Score = Score;
1447            Max_Score_Len = Longest;
1448            Max_Score_Best_d = Best_d;
1449            Max_Score_Best_e = Best_e;
1450           }
1451 
1452       if  (Longest - Max_Score_Len >= GIVE_UP_LEN)
1453           break;
1454      }
1455 
1456    (* A_End) = Max_Score_Len;
1457    (* T_End) = Max_Score_Len + Max_Score_Best_d;
1458    Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1459    (* Match_To_End) = false;
1460 
1461    return  Max_Score_Best_e;
1462   }
1463 
1464 
1465 
Read_String(FILE * fp,char ** T,long int * Size,char header[])1466 bool  Read_String
1467     (FILE * fp, char * * T, long int * Size, char header [])
1468 
1469 /* Read next string from  fp  (assuming FASTA format) into  (* T) [1 ..]
1470 *  which has  Size  characters.  Allocate extra memory if needed
1471 *  and adjust  Size  accordingly.  Return  TRUE  if successful,  FALSE
1472 *  otherwise (e.g., EOF).  Set  header  to the contents of the FASTA
1473 *  header line. */
1474 
1475   {
1476    char  Line [MAX_LINE];
1477    long int  Len;
1478    int  Ch, Ct;
1479 
1480    if  (feof (fp))
1481        return  false;
1482 
1483    while  ((Ch = fgetc (fp)) != EOF && Ch != '>')
1484      ;
1485 
1486    if  (Ch != '>')
1487        return  false;
1488 
1489    if(!fgets (Line, MAX_LINE, fp)) return false;
1490    Len = strlen (Line);
1491    assert (Len > 0 && Line [Len - 1] == '\n');
1492    Line [Len - 1] = '\0';
1493    strcpy (header, Line);
1494 
1495 
1496    Ct = 0;
1497    Len = 0;
1498    while  ((Ch = fgetc (fp)) != EOF && Ch != '>')
1499      {
1500       if  (isspace (Ch))
1501           continue;
1502 
1503       Ct ++;
1504 
1505       if  (Ct >= (* Size) - 1)
1506           {
1507            (* Size) = (long int) ((* Size) * EXPANSION_FACTOR);
1508            (* T) = (char *) Safe_realloc ((* T), (* Size));
1509           }
1510       Ch = tolower (Ch);
1511       if  (! isalpha (Ch))
1512           {
1513            fprintf (stderr, "Unexpected character `%c\' in string %s\n",
1514                                  Ch, header);
1515            Ch = 'x';
1516           }
1517       (* T) [++ Len] = Ch;
1518      }
1519 
1520    (* T) [Len + 1] = '\0';
1521    if  (Ch == '>')
1522        ungetc (Ch, fp);
1523 
1524    return  true;
1525   }
1526 
1527 
1528 
Rev_Complement(char * s)1529 void  Rev_Complement
1530     (char * s)
1531 
1532 /* Set string  s  to its DNA reverse complement. */
1533 
1534   {
1535    char  ch;
1536    int  i, j, len;
1537 
1538    len = strlen (s);
1539 
1540    for  (i = 0, j = len - 1;  i < j;  i ++, j --)
1541      {
1542       ch = Complement (s [i]);
1543       s [i] = Complement (s [j]);
1544       s [j] = ch;
1545      }
1546 
1547    if  (i == j)
1548        s [i] = Complement (s [i]);
1549 
1550    return;
1551   }
1552 
1553 
1554 
Rev_Display_Alignment(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct)1555 void  Rev_Display_Alignment
1556     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct)
1557 
1558 //  Show (to  stdout ) the alignment encoded in  delta [0 .. (delta_ct - 1)]
1559 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)]
1560 //  in the reverse direction.
1561 
1562   {
1563    int  i, j, k, m, top_len, bottom_len;
1564    char  top [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1565    char  bottom [MAX_EXTENSION + 2 * MAX_ERRORS + 1];
1566 
1567    i = j = top_len = bottom_len = 0;
1568    for  (k = 0;  k < delta_ct;  k ++)
1569      {
1570       for  (m = 1;  m < abs (delta [k]);  m ++)
1571         {
1572          top [top_len ++] = a [- i ++];
1573          j ++;
1574         }
1575       if  (delta [k] < 0)
1576           {
1577            top [top_len ++] = '-';
1578            j ++;
1579           }
1580         else
1581           {
1582            top [top_len ++] = a [- i ++];
1583           }
1584      }
1585    while  (i < a_len && j < b_len)
1586      {
1587       top [top_len ++] = a [- i ++];
1588       j ++;
1589      }
1590    top [top_len] = '\0';
1591 
1592 
1593    i = j = 0;
1594    for  (k = 0;  k < delta_ct;  k ++)
1595      {
1596       for  (m = 1;  m < abs (delta [k]);  m ++)
1597         {
1598          bottom [bottom_len ++] = b [- j ++];
1599          i ++;
1600         }
1601       if  (delta [k] > 0)
1602           {
1603            bottom [bottom_len ++] = '-';
1604            i ++;
1605           }
1606         else
1607           {
1608            bottom [bottom_len ++] = b [- j ++];
1609           }
1610      }
1611    while  (j < b_len && i < a_len)
1612      {
1613       bottom [bottom_len ++] = b [- j ++];
1614       i ++;
1615      }
1616    bottom [bottom_len] = '\0';
1617 
1618 
1619    for  (i = 0;  i < top_len || i < bottom_len;  i += DISPLAY_WIDTH)
1620      {
1621       putchar ('\n');
1622       printf ("A: ");
1623       for  (j = 0;  j < DISPLAY_WIDTH && i + j < top_len;  j ++)
1624         putchar (top [i + j]);
1625       putchar ('\n');
1626       printf ("B: ");
1627       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len;  j ++)
1628         putchar (bottom [i + j]);
1629       putchar ('\n');
1630       printf ("   ");
1631       for  (j = 0;  j < DISPLAY_WIDTH && i + j < bottom_len && i + j < top_len;
1632                 j ++)
1633         if  (top [i + j] != ' ' && bottom [i + j] != ' '
1634                  && top [i + j] != bottom [i + j])
1635             putchar ('^');
1636           else
1637             putchar (' ');
1638       putchar ('\n');
1639      }
1640 
1641    return;
1642   }
1643 
1644 
1645 
Rev_Prefix_Edit_Dist(char A[],int m,char T[],int n,int Error_Limit,int * A_End,int * T_End,int * Match_To_End,int Delta[MAX_ERRORS],int * Delta_Len,int extending)1646 int  Rev_Prefix_Edit_Dist
1647     (char A [], int m, char T [], int n, int Error_Limit,
1648      int * A_End, int * T_End, int * Match_To_End,
1649      int Delta [MAX_ERRORS], int * Delta_Len, int extending)
1650 
1651 //  Return the minimum number of changes (inserts, deletes, replacements)
1652 //  needed to match string  A [0 .. -(m-1)]  with a prefix of string
1653 //   T [0 .. -(n-1)]  if it's not more than  Error_Limit .
1654 //  Note:  Match is reverse direction, right to left.
1655 //  Put delta description of alignment in  Delta  and set
1656 //  (* Delta_Len)  to the number of entries there if it's a complete
1657 //  match.
1658 //  Set  A_End  and  T_End  to the rightmost positions where the
1659 //  alignment ended in  A  and  T , respectively.
1660 //  Set  Match_To_End  true if the match extended to the end
1661 //  of at least one string; otherwise, set it false to indicate
1662 //  a branch point.
1663 //  extending  indicates whether the match is for a fixed region (FALSE)
1664 //  or is extending off the end of a match as far as possible (TRUE)
1665 
1666   {
1667    double  Score, Max_Score;
1668    int  Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
1669    int  Best_d, Best_e, Longest, Row;
1670    int  Left, Right;
1671    int  d, e, i, j, shorter;
1672 
1673 //   assert (m <= n);
1674    Best_d = Best_e = Longest = 0;
1675    (* Delta_Len) = 0;
1676 
1677    if  (Consec_Non_ACGT > 0)
1678        {
1679         int  ct;
1680 
1681         for  (i = ct = 0;  i < m && ct < Consec_Non_ACGT;  i ++)
1682           {
1683            switch  (A [- i])
1684              {
1685               case  'a' :
1686               case  'c' :
1687               case  'g' :
1688               case  't' :
1689                 ct = 0;
1690                 break;
1691               default :
1692                 ct ++;
1693              }
1694           }
1695         if  (ct >= Consec_Non_ACGT)
1696             {
1697              m = i - ct;
1698              extending = true;
1699             }
1700 
1701         for  (i = ct = 0;  i < n && ct < Consec_Non_ACGT;  i ++)
1702           {
1703            switch  (T [- i])
1704              {
1705               case  'a' :
1706               case  'c' :
1707               case  'g' :
1708               case  't' :
1709                 ct = 0;
1710                 break;
1711               default :
1712                 ct ++;
1713              }
1714           }
1715         if  (ct >= Consec_Non_ACGT)
1716             {
1717              n = i - ct;
1718              extending = true;
1719             }
1720        }
1721 
1722    shorter = Min_int (m, n);
1723    for  (Row = 0;  Row < shorter && A [- Row] == T [- Row];  Row ++)
1724      ;
1725 
1726    Edit_Array [0] [0] = Row;
1727 
1728    if  (Row == m && Row == n)      // Exact match
1729        {
1730         (* Delta_Len) = 0;
1731         (* A_End) = (* T_End) = Row;
1732         (* Match_To_End) = ! extending;
1733         return  0;
1734        }
1735 
1736    Left = Right = 0;
1737    Max_Score = Row * Branch_Pt_Match_Value;
1738    Max_Score_Len = Row;
1739    Max_Score_Best_d = 0;
1740    Max_Score_Best_e = 0;
1741 
1742    for  (e = 1;  e <= Error_Limit;  e ++)
1743      {
1744       int  cutoff;
1745 
1746       Left = Max_int (Left - 1, -e);
1747       Right = Min_int (Right + 1, e);
1748       Edit_Array [e - 1] [Left] = -2;
1749       Edit_Array [e - 1] [Left - 1] = -2;
1750       Edit_Array [e - 1] [Right] = -2;
1751       Edit_Array [e - 1] [Right + 1] = -2;
1752 
1753       for  (d = Left;  d <= Right;  d ++)
1754         {
1755          Row = 1 + Edit_Array [e - 1] [d];
1756          if  ((j = Edit_Array [e - 1] [d - 1]) > Row)
1757              Row = j;
1758          if  ((j = 1 + Edit_Array [e - 1] [d + 1]) > Row)
1759              Row = j;
1760          while  (Row < m && Row + d < n
1761                   && A [- Row] == T [- Row - d])
1762            Row ++;
1763 
1764          Edit_Array [e] [d] = Row;
1765 
1766          if  (Row == m && Row + d == n)
1767              {
1768               if  (extending)
1769                   {
1770                    for  (j = Left;  j <= d;  j ++)
1771                      if  (Edit_Array [e] [j] > Longest)
1772                          {
1773                           Best_d = j;
1774                           Longest = Edit_Array [e] [j];
1775                          }
1776                    Score = Longest * Branch_Pt_Match_Value - e;
1777                    if  (Score < Max_Score)
1778                        {
1779                         (* A_End) = Max_Score_Len;
1780                         (* T_End) = Max_Score_Len + Max_Score_Best_d;
1781                         Set_Deltas (Delta, Delta_Len, Max_Score_Len,
1782                                     Max_Score_Best_d, Max_Score_Best_e);
1783                         (* Match_To_End) = false;
1784                         return  Max_Score_Best_e;
1785                        }
1786                   }
1787 
1788               (* A_End) = Row;           // One past last align position
1789               (* T_End) = Row + d;
1790               Set_Deltas (Delta, Delta_Len, Row, d, e);
1791               (* Match_To_End) = ! extending;
1792               return  e;
1793              }
1794         }
1795 
1796       cutoff = Longest - GIVE_UP_LEN;
1797       while  (Left <= Right && Edit_Array [e] [Left] < cutoff)
1798         Left ++;
1799       if  (Left > Right)
1800           break;
1801       while  (Right > Left && Edit_Array [e] [Right] < cutoff)
1802         Right --;
1803       assert (Left <= Right);
1804 
1805       for  (d = Left;  d <= Right;  d ++)
1806         if  (Edit_Array [e] [d] > Longest)
1807             {
1808              Best_d = d;
1809              Best_e = e;
1810              Longest = Edit_Array [e] [d];
1811             }
1812 
1813       Score = Longest * Branch_Pt_Match_Value - e;
1814                // Assumes  Branch_Pt_Match_Value - Branch_Pt_Error_Value == 1.0
1815       if  (Score > Max_Score)
1816           {
1817            Max_Score = Score;
1818            Max_Score_Len = Longest;
1819            Max_Score_Best_d = Best_d;
1820            Max_Score_Best_e = Best_e;
1821           }
1822 
1823 
1824       if  (Longest - Max_Score_Len >= GIVE_UP_LEN)
1825           break;
1826      }
1827 
1828    (* A_End) = Max_Score_Len;
1829    (* T_End) = Max_Score_Len + Max_Score_Best_d;
1830    Set_Deltas (Delta, Delta_Len, Max_Score_Len, Max_Score_Best_d, Max_Score_Best_e);
1831    (* Match_To_End) = false;
1832 
1833    return  Max_Score_Best_e;
1834   }
1835 
1836 
1837 
Rev_Show_Diffs(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,long int a_ref,long int b_ref)1838 void  Rev_Show_Diffs
1839     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
1840      long int a_ref, long int b_ref)
1841 
1842 //  Show (to  stdout ) the differences
1843 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)]
1844 //  encoded in  delta [0 .. (delta_ct - 1)]
1845 //  in the reverse direction.   a_ref  is the position of the beginning
1846 //  of string  a .   b_ref  is the offset to the start of
1847 //  string  b .
1848 
1849   {
1850    int  i, j, k, m;
1851 
1852    i = j = 0;
1853    for  (k = 0;  k < delta_ct;  k ++)
1854      {
1855       for  (m = 1;  m < abs (delta [k]);  m ++)
1856         {
1857          if  (a [- i] != b [- j])
1858              printf ("%8ld %8ld:  %c  %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1859          i ++;
1860          j ++;
1861         }
1862       if  (delta [k] < 0)
1863           {
1864            printf ("%8ld %8ld: ins %c\n", a_ref - i, b_ref - j, b [- j]);
1865            j ++;
1866           }
1867         else
1868           {
1869            printf ("%8ld %8ld: del %c\n", a_ref - i, b_ref - j, a [- i]);
1870            i ++;
1871           }
1872      }
1873    while  (i < a_len && j < b_len)
1874      {
1875       if  (a [- i] != b [- j])
1876           printf ("%8ld %8ld:  %c  %c\n", a_ref - i, b_ref - j, a [- i], b [- j]);
1877       i ++;
1878       j ++;
1879      }
1880 
1881    return;
1882   }
1883 
1884 
1885 
Set_Deltas(int delta[],int * delta_len,int row,int d,int e)1886 void  Set_Deltas
1887     (int delta [], int * delta_len, int row, int d, int e)
1888 
1889 //  Set  delta  to values that show the alignment recorded in
1890 //  global  Edit_Array .  Set  (* delta_len)  to the number of
1891 //  entries set.   row  is the length of the match in the first
1892 //  string.   d  is the offset at the end of the match between
1893 //  the first and second string.   e  is the number of errors.
1894 
1895   {
1896    int  delta_stack [MAX_ERRORS];
1897    int  from, last, max;
1898    int  i, j, k;
1899 
1900    last = row;
1901    (* delta_len) = 0;
1902 
1903    for  (k = e;  k > 0;  k --)
1904      {
1905       from = d;
1906       max = 1 + Edit_Array [k - 1] [d];
1907       if  ((j = Edit_Array [k - 1] [d - 1]) > max)
1908           {
1909            from = d - 1;
1910            max = j;
1911           }
1912       if  ((j = 1 + Edit_Array [k - 1] [d + 1]) > max)
1913           {
1914            from = d + 1;
1915            max = j;
1916           }
1917       if  (from == d - 1)
1918           {
1919            delta_stack [(* delta_len) ++] = max - last - 1;
1920            d --;
1921            last = Edit_Array [k - 1] [from];
1922           }
1923       else if  (from == d + 1)
1924           {
1925            delta_stack [(* delta_len) ++] = last - (max - 1);
1926            d ++;
1927            last = Edit_Array [k - 1] [from];
1928           }
1929      }
1930    delta_stack [(* delta_len) ++] = last + 1;
1931 
1932    k = 0;
1933    for  (i = (* delta_len) - 1;  i > 0;  i --)
1934      delta [k ++]
1935          = abs (delta_stack [i]) * Sign (delta_stack [i - 1]);
1936    (* delta_len) --;
1937 
1938    return;
1939   }
1940 
1941 
1942 
Set_Left_Pad(char * pad[3],int r_hi,int q_hi,int mum_len,int pad_len)1943 static void  Set_Left_Pad
1944     (char * pad [3], int r_hi, int q_hi, int mum_len, int pad_len)
1945 
1946 //  Set  pad  to a triple of strings, each of length  pad_len ,
1947 //  representing a MUM of length  mum_len  ending at  r_hi in  Ref
1948 //  and  q_hi  in  Query .   Each string in  pad  must have been allocated
1949 //  at least  pad_len + 1  bytes of memory.  The purpose is to serve as
1950 //  the left-end padding for an alignment with this MUM as its left
1951 //  boundary.
1952 
1953   {
1954    int  i;
1955 
1956    for  (i = 0;  i < 3;  i ++)
1957      pad [i] [pad_len] = '\0';
1958 
1959    for  (i = 0;  i < pad_len;  i ++)
1960      {
1961       pad [0] [pad_len - i - 1] = r_hi - i > 0 ? Ref [r_hi - i] : '.';
1962       pad [1] [pad_len - i - 1] = q_hi - i > 0 ? Query [q_hi - i] : '.';
1963       pad [2] [pad_len - i - 1] = i < mum_len ? '=' : ' ';
1964      }
1965 
1966    return;
1967   }
1968 
1969 
1970 
Set_Right_Pad(char * pad[3],int r_lo,int q_lo,int mum_len,int pad_len)1971 static void  Set_Right_Pad
1972     (char * pad [3], int r_lo, int q_lo, int mum_len, int pad_len)
1973 
1974 //  Set  pad  to a triple of strings, each of length  pad_len ,
1975 //  representing a MUM of length  mum_len  beginning at  r_lo in  Ref
1976 //  and  q_lo  in  Query .   Each string in  pad  must have been allocated
1977 //  at least  pad_len + 1  bytes of memory.  The purpose is to serve as
1978 //  the right-end padding for an alignment with this MUM as its right
1979 //  boundary.
1980 
1981   {
1982    int  i;
1983 
1984    for  (i = 0;  i < 3;  i ++)
1985      pad [i] [pad_len] = '\0';
1986 
1987    for  (i = 0;  i < pad_len;  i ++)
1988      {
1989       pad [0] [i] = r_lo + i <= Ref_Len ? Ref [r_lo + i] : '.';
1990       pad [1] [i] = q_lo + i <= Query_Len ? Query [q_lo + i] : '.';
1991       pad [2] [i] = i < mum_len ? '=' : ' ';
1992      }
1993 
1994    return;
1995   }
1996 
1997 
1998 
Show_Coverage(Cover_t * list,char * filename,char * tag,char * seq)1999 void  Show_Coverage
2000     (Cover_t * list, char * filename, char * tag, char * seq)
2001 
2002 //  Print to stderr summary stats of regions in  list .
2003 //  Send to  filename  a list of region info suitable for  celagram .
2004 //  Use  tag  to print the headings.  Regions refer to  seq .
2005 
2006   {
2007    FILE  * fp;
2008    Cover_t  * p;
2009    int  ct = 0;
2010    long int  i, prev_hi = 0, n_ct, len, seq_len;
2011    long int  cov_ns = 0, gap_ns = 0, total_ns;
2012    double  cov_total = 0.0, gap_total = 0.0;
2013 
2014    fp = Local_File_Open (filename, "w");
2015    fprintf (fp, "%-9s %9s %9s %8s %8s   %6s\n",
2016             "Region", "Start", "End", "Len", "N's", "%N's");
2017 
2018    for  (p = list;  p != NULL;  p = p -> next)
2019      {
2020       n_ct = 0;
2021       for  (i = prev_hi + 1;  i < p -> lo;  i ++)
2022         if  (strchr ("acgt", seq [i]) == NULL)
2023             n_ct ++;
2024       len = p -> lo - prev_hi - 1;
2025       fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld  %5.1f%%\n",
2026                ct, prev_hi + 1, p -> lo - 1, len, n_ct,
2027                len == 0 ? 0.0 : 100.0 * n_ct / len);
2028       gap_total += len;
2029       gap_ns += n_ct;
2030 
2031       ct ++;
2032 
2033       n_ct = 0;
2034       for  (i = p -> lo;  i <= p -> hi;  i ++)
2035         if  (strchr ("acgt", seq [i]) == NULL)
2036             n_ct ++;
2037       len = 1 + p -> hi - p -> lo;
2038       fprintf (fp, "cov%-6d %9ld %9ld %8ld %8ld  %5.1f%%\n",
2039                ct, p -> lo, p -> hi, len, n_ct,
2040                len == 0 ? 0.0 : 100.0 * n_ct / len);
2041       cov_total += len;
2042       cov_ns += n_ct;
2043 
2044       prev_hi = p -> hi;
2045      }
2046 
2047    n_ct = 0;
2048    seq_len = strlen (seq + 1);
2049    for  (i = prev_hi + 1;  i <= seq_len;  i ++)
2050      if  (strchr ("acgt", seq [i]) == NULL)
2051          n_ct ++;
2052    len = seq_len - prev_hi;
2053    fprintf (fp, "gap%-6d %9ld %9ld %8ld %8ld  %5.1f%%\n",
2054             ct, prev_hi + 1, seq_len, len, n_ct,
2055             len == 0 ? 0.0 : 100.0 * n_ct / len);
2056    gap_total += len;
2057    gap_ns += n_ct;
2058 
2059    total_ns = cov_ns + gap_ns;
2060 
2061    fclose (fp);
2062 
2063    fprintf (stderr, "\n%s Sequence Coverage:\n", tag);
2064    fprintf (stderr, "   Sequence length = %ld\n", seq_len);
2065    fprintf (stderr, "            acgt's = %ld\n", seq_len - total_ns);
2066    fprintf (stderr, "        Non acgt's = %ld\n", total_ns);
2067    fprintf (stderr, " Number of regions = %d\n", ct);
2068    fprintf (stderr, "     Matched bases = %.0f  (%.1f%%, %.1f%% of acgt's)\n",
2069             cov_total,
2070             seq_len == 0.0 ? 0.0 : 100.0 * cov_total / seq_len,
2071             seq_len - total_ns == 0.0 ? 0.0 :
2072                 100.0 * (cov_total - cov_ns) / (seq_len - total_ns));
2073    fprintf (stderr, "     Avg match len = %.0f\n",
2074             ct == 0 ? 0.0 : cov_total / ct);
2075    fprintf (stderr, "    N's in matches = %ld  (%.1f%%)\n",
2076             cov_ns,
2077             cov_total == 0.0 ? 0.0 : 100.0 * cov_ns / cov_total);
2078    fprintf (stderr, "   Unmatched bases = %.0f  (%.1f%%, %.1f%% of acgt's)\n",
2079             gap_total,
2080             seq_len == 0.0 ? 0.0 : 100.0 * gap_total / seq_len,
2081             seq_len - total_ns == 0.0 ? 0.0 :
2082                 100.0 * (gap_total - gap_ns) / (seq_len - total_ns));
2083    fprintf (stderr, "       Avg gap len = %.0f\n",
2084             gap_total / (1.0 + ct));
2085    fprintf (stderr, "       N's in gaps = %ld  (%.1f%%)\n",
2086             gap_ns,
2087             gap_total == 0.0 ? 0.0 : 100.0 * gap_ns / gap_total);
2088 
2089    return;
2090   }
2091 
2092 
2093 
Show_Diffs(char * a,int a_len,char * b,int b_len,int delta[],int delta_ct,long int a_ref,long int b_ref)2094 void  Show_Diffs
2095     (char * a, int a_len, char * b, int b_len, int delta [], int delta_ct,
2096      long int a_ref, long int b_ref)
2097 
2098 //  Show (to  stdout ) the differences
2099 //  between strings  a [0 .. (a_len - 1)]  and  b [0 .. (b_len - 1)]
2100 //  encoded in  delta [0 .. (delta_ct - 1)] .  a_ref  is the offset to
2101 //  the start of string  a .   b_ref  is the offset to the start of
2102 //  string  b .
2103 
2104   {
2105    int  i, j, k, m;
2106 
2107    i = j = 0;
2108    for  (k = 0;  k < delta_ct;  k ++)
2109      {
2110       for  (m = 1;  m < abs (delta [k]);  m ++)
2111         {
2112          if  (a [i] != b [j])
2113              printf ("%8ld %8ld:  %c  %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2114          i ++;
2115          j ++;
2116         }
2117       if  (delta [k] < 0)
2118           {
2119            printf ("%8ld %8ld: ins %c\n", a_ref + i - 1, b_ref + j, b [j]);
2120            j ++;
2121           }
2122         else
2123           {
2124            printf ("%8ld %8ld: del %c\n", a_ref + i, b_ref + j - 1, a [i]);
2125            i ++;
2126           }
2127      }
2128    while  (i < a_len && j < b_len)
2129      {
2130       if  (a [i] != b [j])
2131           printf ("%8ld %8ld:  %c  %c\n", a_ref + i, b_ref + j, a [i], b [j]);
2132       i ++;
2133       j ++;
2134      }
2135 
2136    return;
2137   }
2138 
2139 
2140 
Sign(int a)2141 int  Sign
2142     (int a)
2143 
2144 //  Return the algebraic sign of  a .
2145 
2146   {
2147    if  (a > 0)
2148        return  1;
2149    else if  (a < 0)
2150        return  -1;
2151 
2152    return  0;
2153   }
2154 
2155 
2156 
Usage(char * command)2157 void  Usage
2158     (char * command)
2159 
2160 //  Print to stderr description of options and command line for
2161 //  this program.   command  is the command that was used to
2162 //  invoke it.
2163 
2164   {
2165    fprintf (stderr,
2166            "USAGE:  %s <RefSequence> <MatchSequences> <GapsFile>\n"
2167            "\n"
2168            "Combines MUMs in <GapsFile> by extending matches off\n"
2169            "ends and between MUMs.  <RefSequence> is a fasta file\n"
2170            "of the reference sequence.  <MatchSequences> is a\n"
2171            "multi-fasta file of the sequences matched against the\n"
2172            "reference\n"
2173            "\n"
2174            "Options:\n"
2175            "-D      Only output to stdout the difference positions\n"
2176            "          and characters\n"
2177            "-n      Allow matches only between nucleotides, i.e., ACGTs\n"
2178            "-N num  Break matches at <num> or more consecutive non-ACGTs \n"
2179            "-q tag  Used to label query match\n"
2180            "-r tag  Used to label reference match\n"
2181            "-S      Output all differences in strings\n"
2182            "-t      Label query matches with query fasta header\n"
2183            "-v num  Set verbose level for extra output\n"
2184 	   "-W file Reset the default output filename witherrors.gaps\n"
2185            "-x      Don't output .cover files\n"
2186 	   "-e      Set error-rate cutoff to e (e.g. 0.02 is two percent)\n",
2187 
2188            command);
2189 
2190    return;
2191   }
2192