1 /* Programmer:  A. Delcher
2 *     Revised:  12 December 2002
3 *        File:  MUMmer/repeat-match.cc
4 *
5 *  This program identifies maximal exact repeat regions (longer than
6 *  MIN_MATCH_LEN threshold) in input genome.  Uses both forward and
7 *  reverse strand.
8 */
9 
10 
11 #include <mummer/tigrinc.hh>
12 
13 
14 #define  SHOW_TREE 0
15 #define  USE_EXTRA_FIELDS  0
16 #define  DEBUG  0
17 
18 
19 int  Global_Trace = 0;
20 int  Global_Skip_Ct = 0;
21 int  Global_Non_Skip_Ct = 0;
22 
23 
24 #define  NIL  0         // Remove if convert to pointers
25 
26 const int  DEFAULT_MIN_MATCH_LEN = 20;
27 const char  DOLLAR_CHAR = '$';
28 // const char  DONT_KNOW_CHAR = 'N';
29 const int  MAX_NAME_LEN = 500;
30 const char  START_CHAR = '%';
31 
32 
33 
34 typedef  struct node
35   {
36    int  Lo;
37    unsigned  Child : 31;
38    unsigned  Child_Is_Leaf : 1;
39    int  Sibling : 31;
40    unsigned  Sibling_Is_Leaf : 1;
41    unsigned  Link;
42 #if  USE_EXTRA_FIELDS
43    int  Depth, ID;
44    unsigned  Parent;
45 #endif
46    int  Len : 31;
47    unsigned  Should_Skip : 1;
48    int  Subtree_Size;
49   }  Node_Type, * Node_Ptr;
50 
51 typedef  struct leaf
52   {
53    int  Lo;
54    int  Sibling : 31;
55    unsigned  Sibling_Is_Leaf : 1;
56 #if  USE_EXTRA_FIELDS
57    int  Depth, ID;
58    unsigned  Parent;
59 #endif
60    int  Len : 31;
61    unsigned  Is_Duplicate : 1;
62   }  Leaf_Type, * Leaf_Ptr;
63 
64 
65 char  * Data;
66 Leaf_Ptr  Leaf_Array;
67 Node_Ptr  Node_Array;
68 int  * Next_Leaf;
69 
70 int  Curr_ID;
71 int  Curr_String_ID;
72 int  Data_Len = 2;
73 static bool  Exhaustive_Matches = false;
74   // Set by -E option; if true then matches are found by exhaustive search
75   // For testing purposes
76 static bool  Forward_Only = false;
77   // Set by -f option; if true then matches to reverse complement string
78   // are not considered
79 long int  Genome_Len;
80 static char  * Input_File_Name = NULL;
81   // Name of input fasta file on command line
82 int  Longest_String = 0;
83 int  Max_Depth = 0;
84 int  Min_Match_Len = DEFAULT_MIN_MATCH_LEN;
85 int  Next_Avail_Node = 1;
86 int  Num_Strings = 2;
87 int  String_Separator;
88 static bool  Tandem_Only = false;
89   // Set by -t option to output only tandem repeats
90 int  Verbose = 0;
91   // Set by -V option to do extra tests and/or print debugging output
92 
93 
94 int  Add_Duplicates
95     (int, int, int, int);
96 int  Add_String
97     (int, int);
98 int  Build_Suffix_Tree
99     (int);
100 static int  New_Find_Child
101     (int, char, int &, int &, int &, int);
102 static int  New_Jump_Down
103     (int, int, int, int, int &, int &);
104 void  List_Matches
105     (int, int, int);
106 int  List_Tree
107     (int root, int is_leaf, int parent, int parent_depth);
108 void  List_Maximal_Matches
109     (int, int, int, int);
110 int  Longest_Prefix_Match
111     (char * p, char * q);
112 void  Mark_Skipable_Nodes
113     (int, int, int, int);
114 int  New_Node
115     (void);
116 static void  Parse_Command_Line
117     (int argc, char * argv []);
118 void  Set_Subtree_Size
119     (int, int, int, int);
120 static int  New_Step_Down
121     (int, int, int, int, int &, int, int &, int &);
122 static void  Usage
123     (char * command);
124 void  Verify_Match
125     (int a, int b, int n, int reverse);
126 
127 
128 
main(int argc,char * argv[])129 int main  (int argc, char * argv [])
130 
131   {
132    FILE  * fp;
133    long int  Input_Size;
134    int  Root;
135    char  Name [MAX_NAME_LEN];
136 
137    Parse_Command_Line (argc, argv);
138 
139    if  (Verbose > 0)
140        {
141         fprintf (stderr, "Verbose = %d\n", Verbose);
142         printf ("Node size = %d\n", (int) sizeof (Node_Type));
143         printf ("Leaf size = %d\n", (int) sizeof (Leaf_Type));
144         printf ("sizeof (int) = %d\n", (int) sizeof (int));
145         printf ("sizeof (size_t) = %d\n", (int) sizeof (size_t));
146        }
147 
148    fp = File_Open (Input_File_Name, "r");
149 
150    Data = (char *) Safe_malloc (INIT_SIZE);
151    Input_Size = INIT_SIZE;
152 
153    Read_String (fp, Data, Input_Size, Name, false);
154    fclose (fp);
155 
156    Genome_Len = strlen (Data + 1);
157    Data [0] = START_CHAR;
158    Data = (char *) Safe_realloc (Data, 2 * Genome_Len + 4);
159    strcpy (Data + Genome_Len + 2, Data + 1);
160    Reverse_Complement (Data, Genome_Len + 2, 2 * Genome_Len + 1);
161    Data [1 + Genome_Len] = DOLLAR_CHAR;
162    Data [2 + 2 * Genome_Len] = DOLLAR_CHAR;
163    Data_Len = 3 + 2 * Genome_Len;
164    Data [Data_Len] = '\0';
165 
166    String_Separator = 1 + Genome_Len;
167 
168    Leaf_Array = (Leaf_Ptr) Safe_calloc (Data_Len, sizeof (Leaf_Type));
169 
170    Node_Array = (Node_Ptr) Safe_calloc (Data_Len, sizeof (Node_Type));
171    Next_Leaf = (int *) Safe_calloc (Data_Len, sizeof (int));
172 
173    Curr_String_ID = 0;
174    Root = Build_Suffix_Tree (1);
175 
176    if  (! Forward_Only)
177        {
178         Curr_String_ID = 1;
179         if  (Add_String (2 + Genome_Len, Root) != 0)
180             return  -1;
181        }
182 
183 #if  SHOW_TREE
184    printf ("\n\nNodes in Entire Suffix Tree:\n\n");
185    List_Tree (Root, false, 0, 0);
186 #endif
187 
188    if(Verbose > 0)
189      fprintf (stderr, "Genome Length = %ld   Used %d internal nodes\n",
190               Genome_Len, Next_Avail_Node);
191 
192    Set_Subtree_Size (Root, false, NIL, 0);
193 
194    Mark_Skipable_Nodes (Root, false, NIL, 0);
195 
196    if  (Exhaustive_Matches)
197        {  // Exhaustive search for matches
198         int  i, j, match;
199 
200         Data [Genome_Len + 1] = '#';
201 
202         printf ("Exhaustive Exact Matches:\n");
203         printf ("%9s %10s  %8s\n", "Start1", "Start2", "Length");
204 
205         for  (i = 1;  i <= Genome_Len - Min_Match_Len;  i ++)
206           for  (j = i + 1;  j <= 1 + Genome_Len - Min_Match_Len;  j ++)
207             {
208              if  (Data [i - 1] == Data [j - 1])
209                  continue;
210              match = Longest_Prefix_Match (Data + i, Data + j);
211              if  (match >= Min_Match_Len)
212                  printf ("%9d %10d%c %8d\n", i, j, ' ', match);
213             }
214 
215         for  (i = 1;  i <= Genome_Len + 1 - Min_Match_Len;  i ++)
216           for  (j = Genome_Len + 2;  j <= 2 * Genome_Len - i;  j ++)
217             {
218              if  (Data [i - 1] == Data [j - 1])
219                  continue;
220              match = Longest_Prefix_Match (Data + i, Data + j);
221              if  (match >= Min_Match_Len)
222                  printf ("%9d %10d%c %8d\n", i, int (2 * Genome_Len + 2 - j), 'r', match);
223             }
224 
225         Data [Genome_Len + 1] = DOLLAR_CHAR;
226        }
227      else
228        {
229         printf ("Long Exact Matches:\n");
230         printf ("%9s %10s  %8s\n", "Start1", "Start2", "Length");
231 
232         List_Maximal_Matches (Root, false, NIL, 0);
233        }
234 
235    if  (Verbose > 1)
236        {
237         fprintf (stderr, "Global_Skip_Ct = %d\n", Global_Skip_Ct);
238         fprintf (stderr, "Global_Non_Skip_Ct = %d\n", Global_Non_Skip_Ct);
239        }
240 
241    return  0;
242   }
243 
244 
245 
Add_Duplicates(int Start,int End,int Leaf,int Leaf_Depth)246 int  Add_Duplicates  (int Start, int End, int Leaf, int Leaf_Depth)
247 
248 /* Mark all duplicate occurrences of suffixes in  Data [Start .. End]
249 *  in suffix tree where the first duplicate is at  Leaf  whose
250 *  depth is  Leaf_Depth . */
251 
252   {
253    int  i, j;
254 
255    j = Leaf;
256    for  (i = Start;  i <= End;  i ++, j++)
257      Leaf_Array [j] . Is_Duplicate = true;
258 
259    return  0;
260   }
261 
262 
263 
Add_String(int Start,int Root)264 int  Add_String  (int Start, int Root)
265 
266 /* Add all suffixes of string  Data [Start ...]  ending at the
267 *  first DOLLAR_CHAR encountered to the suffix tree rooted at
268 *  Root . */
269 
270   {
271    int  Leaf, End, Last_Parent, Grandparent;
272    int  Segment_Len, Segment_Start, Leaf_Len;
273    int  Last_Parent_Depth, Link_Depth, Matched, Offset;
274    int  New_Place, New_Depth, Made_New_Node, New_Place_Is_Leaf;
275 
276    for  (End = Start;  Data [End] != DOLLAR_CHAR;  End ++)
277      ;
278 
279    Curr_ID = Start;
280    Segment_Start = Start;
281    Segment_Len = 1 + End - Start;
282    New_Place = New_Step_Down (Root, 0, Segment_Start, Segment_Len,
283                           Matched, true, New_Place_Is_Leaf, Grandparent);
284 
285    if  (Segment_Len == Matched)
286        {
287         printf ("*** Genome is exact palindrome ***\n");
288         return  -1;
289        }
290 
291    if  (1 + Matched == Segment_Len)
292        {
293         Offset = 0;
294         if  (New_Place_Is_Leaf)
295             fprintf (stderr, "ERROR:  Unexpected leaf\n");
296           else
297             {
298              while  (! Node_Array [New_Place] . Child_Is_Leaf)
299                {
300                 New_Place = Node_Array [New_Place] . Child;
301                 Offset += abs (Node_Array [New_Place] . Len);
302                }
303              New_Place = Node_Array [New_Place] . Child;
304              Offset += abs (Leaf_Array [New_Place] . Len) - 1;
305             }
306         printf ("String %d is a substring of previous string.\n",
307                      Curr_String_ID);
308         return  0;
309        }
310 
311    Leaf = Start;
312    Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
313    Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
314    Leaf_Array [Leaf] . Sibling_Is_Leaf
315           = Node_Array [New_Place] . Child_Is_Leaf;
316    Node_Array [New_Place] . Child = Leaf;
317    Node_Array [New_Place] . Child_Is_Leaf = true;
318    Leaf_Array [Leaf] . Len = Segment_Len - Matched;
319 #if  USE_EXTRA_FIELDS
320    Leaf_Array [Leaf] . Depth = 1 + End - Start;
321    Leaf_Array [Leaf] . ID = Start;
322    Leaf_Array [Leaf] . Parent = New_Place;
323 #endif
324 
325    Last_Parent = New_Place;
326    Last_Parent_Depth = Matched;
327 
328    while  (++ Start <= End)
329      {
330       Curr_ID ++;
331       if  (Node_Array [Last_Parent] . Link != NIL)
332           {
333            if  (Last_Parent == Root)
334                {
335                 Segment_Start = Start;
336                 Segment_Len = 1 + End - Start;
337                 Link_Depth = Last_Parent_Depth;
338                }
339              else
340                {
341                 Segment_Start = Start + Last_Parent_Depth - 1;
342                 Segment_Len = 2 + End - Start - Last_Parent_Depth;
343                 Link_Depth = Last_Parent_Depth - 1;
344                }
345            New_Place = New_Step_Down (Node_Array [Last_Parent] . Link,
346                                   Link_Depth,
347                                   Segment_Start, Segment_Len,
348                                   Matched, false,
349                                   New_Place_Is_Leaf, Grandparent);
350            if  (Matched == Segment_Len)
351                {
352                 New_Depth = Link_Depth + Matched;
353                 return  Add_Duplicates (Start, End, New_Place, New_Depth);
354                }
355 
356            Leaf = Start;
357            Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
358            Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
359            Leaf_Array [Leaf] . Sibling_Is_Leaf
360                   = Node_Array [New_Place] . Child_Is_Leaf;
361            Node_Array [New_Place] . Child = Leaf;
362            Node_Array [New_Place] . Child_Is_Leaf = true;
363            Leaf_Array [Leaf] . Len = Segment_Len - Matched;
364 #if  USE_EXTRA_FIELDS
365            Leaf_Array [Leaf] . Depth = 1 + End - Start;
366            Leaf_Array [Leaf] . ID = Start;
367            Leaf_Array [Leaf] . Parent = New_Place;
368 #endif
369 
370            Last_Parent = New_Place;
371            Last_Parent_Depth = Link_Depth + Matched;
372           }
373         else
374           {
375            Leaf_Len = Leaf_Array [Leaf] . Len;
376            if  (Grandparent == Root)
377                {
378                 Segment_Start = Start;
379                 Segment_Len = Node_Array [Last_Parent] . Len - 1;
380                 Link_Depth = 0;
381                }
382              else
383                {
384                 Segment_Start = Start + Last_Parent_Depth - 1
385                                     - Node_Array [Last_Parent] . Len;
386                 Segment_Len = Node_Array [Last_Parent] . Len;
387                 Link_Depth = Last_Parent_Depth - 1
388                                     - Node_Array [Last_Parent] . Len;
389                }
390            New_Place = New_Jump_Down (Node_Array [Grandparent] . Link,
391                                   Link_Depth,
392                                   Segment_Start, Segment_Len,
393                                   Made_New_Node, Grandparent);
394 
395            if  (Made_New_Node)
396                {
397                 Leaf = Start;
398                 Leaf_Array [Leaf] . Lo = Segment_Start + Segment_Len;
399                 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
400                 Leaf_Array [Leaf] . Sibling_Is_Leaf
401                        = Node_Array [New_Place] . Child_Is_Leaf;
402                 Node_Array [New_Place] . Child = Leaf;
403                 Node_Array [New_Place] . Child_Is_Leaf = true;
404                 Leaf_Array [Leaf] . Len = Leaf_Len;
405 #if  USE_EXTRA_FIELDS
406                 Leaf_Array [Leaf] . Depth = 1 + End - Start;
407                 Leaf_Array [Leaf] . ID = Start;
408                 Leaf_Array [Leaf] . Parent = New_Place;
409 #endif
410                 Node_Array [Last_Parent] . Link = New_Place;
411                 Last_Parent = New_Place;
412                 Last_Parent_Depth = Link_Depth + Segment_Len;
413                }
414              else
415                {
416                 Node_Array [Last_Parent] . Link = New_Place;
417                 Segment_Start += Segment_Len;
418                 Link_Depth += Segment_Len;
419                 Segment_Len = Leaf_Len;
420                 New_Place = New_Step_Down (New_Place, Link_Depth,
421                                        Segment_Start, Segment_Len,
422                                        Matched, false,
423                                        New_Place_Is_Leaf, Grandparent);
424                 if  (Matched >= Segment_Len)
425                     {
426                      New_Depth = Link_Depth + Matched;
427                      return  Add_Duplicates (Start, End, New_Place, New_Depth);
428                     }
429 
430                 Leaf = Start;
431                 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
432                 Leaf_Array [Leaf] . Sibling
433                         = Node_Array [New_Place] . Child;
434                 Leaf_Array [Leaf] . Sibling_Is_Leaf
435                        = Node_Array [New_Place] . Child_Is_Leaf;
436                 Node_Array [New_Place] . Child = Leaf;
437                 Node_Array [New_Place] . Child_Is_Leaf = true;
438                 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
439 #if  USE_EXTRA_FIELDS
440                 Leaf_Array [Leaf] . Depth = 1 + End - Start;
441                 Leaf_Array [Leaf] . ID = Start;
442                 Leaf_Array [Leaf] . Parent = New_Place;
443 #endif
444 
445                 Last_Parent = New_Place;
446                 Last_Parent_Depth = Link_Depth + Matched;
447                }
448           }
449      }
450 
451    return  0;
452   }
453 
454 
455 
Build_Suffix_Tree(int Start)456 int  Build_Suffix_Tree  (int Start)
457 
458 /* Build a suffix tree for the string  Data [Start ...]  ending at
459 *  the first DOLLAR_CHAR encountered.  Return the subscript of the
460 *  root of the resulting tree. */
461 
462   {
463    int  Root, Leaf, End, Last_Parent, Grandparent;
464    int  Segment_Len, Segment_Start, Leaf_Len;
465    int  Last_Parent_Depth, Link_Depth, Matched;
466    int  New_Place, Made_New_Node, New_Place_Is_Leaf;
467 
468    for  (End = Start;  Data [End] != DOLLAR_CHAR;  End ++)
469      ;
470 
471    Curr_ID = Start;
472    Root = New_Node ();
473    Leaf = Start;
474 
475    Node_Array [Root] . Lo = 0;
476    Node_Array [Root] . Child = Leaf;
477    Node_Array [Root] . Sibling = NIL;
478    Node_Array [Root] . Link = Root;
479    Node_Array [Root] . Len = 0;
480    Node_Array [Root] . Sibling_Is_Leaf = false;
481    Node_Array [Root] . Child_Is_Leaf = true;
482 #if  USE_EXTRA_FIELDS
483    Node_Array [Root] . Depth = 0;
484    Node_Array [Root] . ID = Curr_ID;
485    Node_Array [Root] . Parent = NIL;
486 #endif
487 
488    Leaf_Array [Leaf] . Lo = Start;
489    Leaf_Array [Leaf] . Sibling = NIL;
490    Leaf_Array [Leaf] . Len = 1 + End - Start;
491    Leaf_Array [Leaf] . Sibling_Is_Leaf = false;
492 #if  USE_EXTRA_FIELDS
493    Leaf_Array [Leaf] . Depth = 1 + End - Start;
494    Leaf_Array [Leaf] . ID = Start;
495    Leaf_Array [Leaf] . Parent = Root;
496 #endif
497 
498    Last_Parent = Root;
499    Last_Parent_Depth = 0;
500 
501    while  (++ Start <= End)
502      {
503       if  (Verbose > 1)
504           printf ("Build_Suffix_Tree:  Start = %d\n", Start);
505 
506       Curr_ID ++;
507       if  (Node_Array [Last_Parent] . Link != NIL)
508           {
509            if  (Last_Parent == Root)
510                {
511                 Segment_Start = Start;
512                 Segment_Len = 1 + End - Start;
513                 Link_Depth = Last_Parent_Depth;
514                }
515              else
516                {
517                 Segment_Start = Start + Last_Parent_Depth - 1;
518                 Segment_Len = 2 + End - Start - Last_Parent_Depth;
519                 Link_Depth = Last_Parent_Depth - 1;
520                }
521            New_Place = New_Step_Down (Node_Array [Last_Parent] . Link,
522                                   Link_Depth,
523                                   Segment_Start, Segment_Len,
524                                   Matched, false,
525                                   New_Place_Is_Leaf, Grandparent);
526            if  (Matched >= Segment_Len)
527                {
528                 printf ("Ooops:  Suffix can't appear twice.\n");
529                 exit  (EXIT_FAILURE);
530                }
531              else
532                {
533                 Leaf = Start;
534                 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
535                 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
536                 Leaf_Array [Leaf] . Sibling_Is_Leaf
537                        = Node_Array [New_Place] . Child_Is_Leaf;
538                 Node_Array [New_Place] . Child = Leaf;
539                 Node_Array [New_Place] . Child_Is_Leaf = true;
540                 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
541 #if  USE_EXTRA_FIELDS
542                 Leaf_Array [Leaf] . Depth = 1 + End - Start;
543                 Leaf_Array [Leaf] . ID = Start;
544                 Leaf_Array [Leaf] . Parent = New_Place;
545 #endif
546                }
547            Last_Parent = New_Place;
548            Last_Parent_Depth = Link_Depth + Matched;
549           }
550         else
551           {
552            Leaf_Len = Leaf_Array [Leaf] . Len;
553            if  (Grandparent == Root)
554                {
555                 Segment_Start = Start;
556                 Segment_Len = Node_Array [Last_Parent] . Len - 1;
557                 Link_Depth = 0;
558                }
559              else
560                {
561                 Segment_Start = Start + Last_Parent_Depth - 1
562                                     - Node_Array [Last_Parent] . Len;
563                 Segment_Len = Node_Array [Last_Parent] . Len;
564                 Link_Depth = Last_Parent_Depth - 1
565                                     - Node_Array [Last_Parent] . Len;
566                }
567 
568            New_Place = New_Jump_Down (Node_Array [Grandparent] . Link,
569                                   Link_Depth,
570                                   Segment_Start, Segment_Len,
571                                   Made_New_Node, Grandparent);
572            if  (Made_New_Node)
573                {
574                 Leaf = Start;
575                 Leaf_Array [Leaf] . Lo = Segment_Start + Segment_Len;
576                 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
577                 Leaf_Array [Leaf] . Sibling_Is_Leaf
578                        = Node_Array [New_Place] . Child_Is_Leaf;
579                 Node_Array [New_Place] . Child = Leaf;
580                 Node_Array [New_Place] . Child_Is_Leaf = true;
581                 Leaf_Array [Leaf] . Len = Leaf_Len;
582 #if  USE_EXTRA_FIELDS
583                 Leaf_Array [Leaf] . Depth = 1 + End - Start;
584                 Leaf_Array [Leaf] . ID = Start;
585                 Leaf_Array [Leaf] . Parent = New_Place;
586 #endif
587                 Node_Array [Last_Parent] . Link = New_Place;
588                 Last_Parent = New_Place;
589                 Last_Parent_Depth = Link_Depth + Segment_Len;
590                }
591              else
592                {
593                 Node_Array [Last_Parent] . Link = New_Place;
594                 Segment_Start += Segment_Len;
595                 Link_Depth += Segment_Len;
596                 Segment_Len = Leaf_Len;
597                 New_Place = New_Step_Down (New_Place, Link_Depth,
598                                        Segment_Start, Segment_Len,
599                                        Matched, false,
600                                        New_Place_Is_Leaf, Grandparent);
601                 if  (Matched >= Segment_Len)
602                     {
603                      printf ("Ooops:  Suffix can't appear twice.\n");
604                      exit  (EXIT_FAILURE);
605                     }
606                   else
607                     {
608                      Leaf = Start;
609                      Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
610                      Leaf_Array [Leaf] . Sibling
611                              = Node_Array [New_Place] . Child;
612                      Leaf_Array [Leaf] . Sibling_Is_Leaf
613                             = Node_Array [New_Place] . Child_Is_Leaf;
614                      Node_Array [New_Place] . Child = Leaf;
615                      Node_Array [New_Place] . Child_Is_Leaf = true;
616                      Leaf_Array [Leaf] . Len = Segment_Len - Matched;
617 #if  USE_EXTRA_FIELDS
618                      Leaf_Array [Leaf] . Depth = 1 + End - Start;
619                      Leaf_Array [Leaf] . ID = Start;
620                      Leaf_Array [Leaf] . Parent = New_Place;
621 #endif
622                     }
623                 Last_Parent = New_Place;
624                 Last_Parent_Depth = Link_Depth + Matched;
625                }
626           }
627      }
628 
629    return  Root;
630   }
631 
632 
633 
New_Find_Child(int Node,char Ch,int & Is_Leaf,int & Pred,int & Pred_Is_Leaf,int Node_Depth)634 static int  New_Find_Child
635     (int Node, char Ch, int & Is_Leaf, int & Pred, int & Pred_Is_Leaf,
636      int Node_Depth)
637 
638 /* Return the subscript of child of  Node  whose string starts
639 *  with  Ch  and set  Is_Leaf  to indicate what kind of node
640 *  that child is.  Set  Pred  to the subscript of the prior
641 *  sibling of the chosen child and  Pred_Is_Leaf  to indicate
642 *  whether  Pred  is a leaf or not.   Node_Depth  is the depth of  Node
643 *  in the tree.
644 */
645 
646   {
647    int  Leaf_Lo, Leaf_Len;
648    int  i;
649    char  Start_Ch;
650 
651    Pred = NIL;
652    Pred_Is_Leaf = false;
653    i = Node_Array [Node] . Child;
654    Is_Leaf = Node_Array [Node] . Child_Is_Leaf;
655    if  (Is_Leaf)
656        {
657         Leaf_Lo = Leaf_Array [i] . Lo;
658         Leaf_Len = Leaf_Array [i] . Len;
659         if  (Leaf_Len > 0)
660             Start_Ch = Data [Leaf_Lo];
661           else
662             Start_Ch = Complement (Data [Leaf_Lo]);
663        }
664      else
665        {
666         if  (Node_Array [i] . Len > 0)
667             Start_Ch = Data [Node_Array [i] . Lo];
668           else
669             Start_Ch = Complement (Data [Node_Array [i] . Lo]);
670        }
671 
672    while  (i != NIL && Start_Ch != Ch)
673      {
674       Pred = i;
675       Pred_Is_Leaf = Is_Leaf;
676       if  (Is_Leaf)
677           {
678            Is_Leaf = Leaf_Array [i] . Sibling_Is_Leaf;
679            i = Leaf_Array [i] . Sibling;
680           }
681         else
682           {
683            Is_Leaf = Node_Array [i] . Sibling_Is_Leaf;
684            i = Node_Array [i] . Sibling;
685           }
686       if  (Is_Leaf)
687           {
688            Leaf_Lo = Leaf_Array [i] . Lo;
689            Leaf_Len = Leaf_Array [i] . Len;
690            if  (Leaf_Len > 0)
691                Start_Ch = Data [Leaf_Lo];
692              else
693                Start_Ch = Complement (Data [Leaf_Lo]);
694           }
695         else
696           {
697            if  (Node_Array [i] . Len > 0)
698                Start_Ch = Data [Node_Array [i] . Lo];
699              else
700                Start_Ch = Complement (Data [Node_Array [i] . Lo]);
701           }
702      }
703 
704    return  i;
705   }
706 
707 
708 
New_Jump_Down(int Node,int Node_Depth,int Lo,int Len,int & Made_New_Node,int & Par_New_Node)709 static int  New_Jump_Down
710     (int Node, int Node_Depth, int Lo, int Len, int & Made_New_Node,
711      int & Par_New_Node)
712 
713 /* Return the subscript of the node that represents the
714 *  descendant of  Node  that matches  Data [Lo .. Lo + Len - 1] .
715 *  Check only the first character of each child substring--the
716 *  rest are assumed to match automatically.
717 *  Node_Depth  is the depth of  Node  in the suffix tree.
718 *  If necessary, allocate a new node and return its subscript.
719 *  Set  Made_New_Node  to TRUE iff a new node was allocated and
720 *  set  Par_New_Node  to the parent of that new node.
721 */
722 
723   {
724    int  P, Q, P_Is_Leaf, D, Pred, Pred_Is_Leaf;
725    int  P_Sib, P_Sib_Is_Leaf;
726 
727    Made_New_Node = false;
728    if  (Len == 0)
729        return  Node;
730 
731    if  (Verbose > 1)
732        printf ("Jump_Down:  Node = %d  Depth = %d  Lo = %d  Len = %d\n",
733                Node, Node_Depth, Lo, Len);
734 
735    P = New_Find_Child (Node, Data [Lo], P_Is_Leaf, Pred, Pred_Is_Leaf,
736                    Node_Depth);
737 
738    while  (P != NIL)
739      {
740       if  (P_Is_Leaf)
741           {
742            D = Leaf_Array [P] . Len;
743            P_Sib = Leaf_Array [P] . Sibling;
744            P_Sib_Is_Leaf = Leaf_Array [P] . Sibling_Is_Leaf;
745           }
746         else
747           {
748            D = Node_Array [P] . Len;
749            P_Sib = Node_Array [P] . Sibling;
750            P_Sib_Is_Leaf = Node_Array [P] . Sibling_Is_Leaf;
751           }
752 
753       if  (Len == D)
754           return  P;
755 
756       if  (D < Len)
757           {
758            Lo += D;
759            Len -= D;
760            Node = P;
761            Node_Depth += D;
762            P = New_Find_Child (Node, Data [Lo], P_Is_Leaf,
763                            Pred, Pred_Is_Leaf, Node_Depth);
764           }
765         else
766           {
767            Q = New_Node ();
768            Node_Array [Q] . Lo = Lo;
769            Node_Array [Q] . Len = Len;
770            Node_Array [Q] . Child = P;
771            Node_Array [Q] . Sibling = P_Sib;
772            Node_Array [Q] . Sibling_Is_Leaf = P_Sib_Is_Leaf;
773            Par_New_Node = Node;
774            Node_Array [Q] . Link = NIL;
775            Node_Array [Q] . Child_Is_Leaf = P_Is_Leaf;
776 #if  USE_EXTRA_FIELDS
777            Node_Array [Q] . Parent = Node;
778            Node_Array [Q] . Depth = Node_Array [Node] . Depth + Len;
779            Node_Array [Q] . ID = Curr_ID;
780 #endif
781            if  (Pred == NIL)
782                {
783                 Node_Array [Node] . Child = Q;
784                 Node_Array [Node] . Child_Is_Leaf = false;
785                }
786            else if  (Pred_Is_Leaf)
787                {
788                 Leaf_Array [Pred] . Sibling = Q;
789                 Leaf_Array [Pred] . Sibling_Is_Leaf = false;
790                }
791              else
792                {
793                 Node_Array [Pred] . Sibling = Q;
794                 Node_Array [Pred] . Sibling_Is_Leaf = false;
795                }
796 
797            if  (! P_Is_Leaf)
798                {
799                 Node_Array [P] . Lo += Len;
800                 Node_Array [P] . Len -= Len;
801 #if  USE_EXTRA_FIELDS
802                 Node_Array [P] . Parent = Q;
803 #endif
804                 Node_Array [P] . Sibling = NIL;
805                 Node_Array [P] . Sibling_Is_Leaf = false;
806                }
807              else
808                {
809                 Leaf_Array [P] . Lo += Len;
810                 Leaf_Array [P] . Len -= Len;
811 #if  USE_EXTRA_FIELDS
812                 Leaf_Array [P] . Parent = Q;
813 #endif
814                 Leaf_Array [P] . Sibling = NIL;
815                 Leaf_Array [P] . Sibling_Is_Leaf = false;
816                }
817 
818            Made_New_Node = true;
819            return  Q;
820           }
821 
822      }
823 
824    printf ("Ooops:  Couldn't find appropriate child node\n");
825    exit  (EXIT_FAILURE);
826 
827    return  0;
828   }
829 
830 
831 
List_Matches(int A,int B,int n)832 void  List_Matches
833     (int A, int B, int n)
834 
835 /* List all maximal matches of length  n   between entries in
836 *  the list starting at
837 *  subscript  A  in array  Next_Leaf  and the list starting
838 *  at subscript  B  in the same array.  Don't list a match
839 *  if it's already been listed in a different order (because
840 *  of the reverse complement strand. */
841 
842   {
843    int  i, j, k, L, R, Reversed;
844 
845    for  (i = A;  i != NIL;  i = Next_Leaf [i])
846      {
847       for  (j = B;  j != NIL;  j = Next_Leaf [j])
848         {
849          if  (Data [i - 1] == Data [j - 1]
850                   || Data [i + n] == Data [j + n]
851                   || (i > String_Separator && j > String_Separator))
852              continue;
853          Reversed = false;
854          if  (j > String_Separator)
855              {
856               k = Genome_Len - (j - String_Separator) - n + 2;
857               if  (k < i)
858                   continue;
859               L = i;
860               R = k + n - 1;
861               Reversed = true;
862              }
863          else if  (i > String_Separator)
864              {
865               k = Genome_Len - (i - String_Separator) - n + 2;
866               if  (k < j)
867                   continue;
868               L = j;
869               R = k + n - 1;
870               Reversed = true;
871              }
872          else if  (i < j)
873              {
874               L = i;
875               R = j;
876              }
877            else
878              {
879               L = j;
880               R = i;
881              }
882          if  (Verbose > 1)
883              Verify_Match (L, R, n, Reversed);
884          if  (Tandem_Only && L + n < R)
885              continue;
886          printf ("%9d %10d%c %8d\n", L, R, Reversed ? 'r' : ' ', n);
887         }
888      }
889   }
890 
891 
892 
List_Tree(int Root,int Is_Leaf,int Parent,int Parent_Depth)893 int  List_Tree  (int Root, int Is_Leaf, int Parent, int Parent_Depth)
894 
895 //  Show contents of suffix tree rooted at  Root  whose parent's
896 //  string depth in the suffix tree is  Depth .   Is_Leaf
897 //  indicates whether  Root  is a leaf node or not.
898 //   Parent  is the subscript of this node's parent.
899 
900   {
901    int  i, j;
902 
903    while  (Root != NIL)
904      if  (Is_Leaf)
905          {
906           if  (Leaf_Array [Root] . Len > 0)
907               {
908                j = Leaf_Array [Root] . Lo - Parent_Depth;
909                printf ("Leaf %d:  ", j);
910                for  (i = 0;  i < Leaf_Array [Root] . Len;  i ++)
911                  putchar (Data [Leaf_Array [Root] . Lo + i]);
912               }
913             else
914               {
915                j = - (Leaf_Array [Root] . Lo + Parent_Depth);
916                printf ("Leaf %d:  ", j);
917                for  (i = 0;  i > Leaf_Array [Root] . Len;  i --)
918                  putchar (Complement (Data [Leaf_Array [Root] . Lo + i]));
919               }
920           printf ("  Par = %d", Parent);
921 #if  USE_EXTRA_FIELDS
922           printf ("  Depth = %d", Leaf_Array [Root] . Depth);
923 #endif
924           if  (Leaf_Array [Root] . Is_Duplicate)
925               printf ("  Duplicate = %d", int (Root + Genome_Len + 1));
926           putchar ('\n');
927           Is_Leaf = Leaf_Array [Root] . Sibling_Is_Leaf;
928           Root = Leaf_Array [Root] . Sibling;
929          }
930 
931        else
932          {
933 #if  USE_EXTRA_FIELDS
934           printf ("Node %d:  ID %d  ", Root, Node_Array [Root] . ID);
935 #else
936           printf ("Node %d:  ", Root);
937 #endif
938           if  (Node_Array [Root] . Len > 0)
939               for  (i = 0;  i < Node_Array [Root] . Len;  i ++)
940                 putchar (Data [Node_Array [Root] . Lo + i]);
941             else
942               for  (i = 0;  i > Node_Array [Root] . Len;  i --)
943                 putchar (Complement (Data [Node_Array [Root] . Lo + i]));
944           printf ("  Par = %d", Parent);
945 #if  USE_EXTRA_FIELDS
946           printf ("  Depth = %d", Node_Array [Root] . Depth);
947           if  (Node_Array [Root] . Link != NIL)
948               printf ("   Link to node %d (ID %d)", Node_Array [Root] . Link,
949                       Node_Array [Node_Array [Root] . Link] . ID);
950 #else
951           if  (Node_Array [Root] . Link != NIL)
952               printf ("   Link to node %d", Node_Array [Root] . Link);
953 #endif
954           putchar ('\n');
955           List_Tree (Node_Array [Root] . Child,
956                             Node_Array [Root] . Child_Is_Leaf, Root,
957                             Parent_Depth + abs (Node_Array [Root] . Len));
958           Is_Leaf = Node_Array [Root] . Sibling_Is_Leaf;
959           Root = Node_Array [Root] . Sibling;
960          }
961 
962    return  0;
963   }
964 
965 
966 
List_Maximal_Matches(int Root,int Is_Leaf,int Parent,int Parent_Depth)967 void  List_Maximal_Matches  (int Root, int Is_Leaf, int Parent, int Parent_Depth)
968 
969 /* List substring pairs that occur more than once where the match
970 *  does not extend either to the left or to the right for
971 *  the subtree rooted at  Root .  Is_Leaf  indicates
972 *  whether  Root  is a leaf or internal node.   Parent  is the
973 *  parent of  Root  and  Parent_Depth  is its string-depth in the
974 *  suffix tree. */
975 
976   {
977    int  j, k, Depth, List1, Start, End;
978    int  k_Is_Leaf;
979 
980    if  (Root == NIL)
981        return;
982 
983    if  (Is_Leaf)
984        {
985         // Skip any match here, it's other version at the start of
986         // the string will be output instead
987 
988         List_Maximal_Matches (Leaf_Array [Root] . Sibling,
989                Leaf_Array [Root] . Sibling_Is_Leaf,
990                Parent, Parent_Depth);
991 
992         return;
993        }
994 
995    Depth = Parent_Depth + Node_Array [Root] . Len;
996 
997    List_Maximal_Matches (Node_Array [Root] . Child,
998           Node_Array [Root] . Child_Is_Leaf, Root, Depth);
999 
1000    List_Maximal_Matches (Node_Array [Root] . Sibling,
1001           Node_Array [Root] . Sibling_Is_Leaf, Parent, Parent_Depth);
1002 
1003    if  (Verbose > 1)
1004        printf ("List_Maximal_Matches:  Root = %d  Parent = %d  Parent_Depth = %d\n"
1005                "   Depth = %d  Subtree_Size = %d\n",
1006                Root, Parent, Parent_Depth, Depth,
1007                Node_Array [Root] . Subtree_Size);
1008 
1009 if  (Depth >= Min_Match_Len)
1010     {
1011      if  (Node_Array [Root] . Should_Skip)
1012          Global_Skip_Ct ++;
1013        else
1014          Global_Non_Skip_Ct ++;
1015     }
1016    if  (Depth >= Min_Match_Len
1017           && ! Node_Array [Root] . Should_Skip)
1018        {
1019         Is_Leaf = Node_Array [Root] . Child_Is_Leaf;
1020         for  (j = Node_Array [Root] . Child;  j != NIL; )
1021           {
1022            if  (Verbose > 1)
1023                printf ("  Child = %d %s\n", j, Is_Leaf ? "leaf" : "node");
1024            if  (Is_Leaf)
1025                {
1026                 List1 = j;
1027                 k = Leaf_Array [j] . Sibling;
1028                 k_Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1029                 Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1030                 j = Leaf_Array [j] . Sibling;
1031                }
1032              else
1033                {
1034                 List1 = Node_Array [j] . Link;
1035                 k = Node_Array [j] . Sibling;
1036                 k_Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1037                 Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1038                 j = Node_Array [j] . Sibling;
1039                }
1040            while  (k != NIL)
1041              {
1042               if  (k_Is_Leaf)
1043                   {
1044                    List_Matches (List1, k, Depth);
1045                    k_Is_Leaf = Leaf_Array [k] . Sibling_Is_Leaf;
1046                    k = Leaf_Array [k] . Sibling;
1047                   }
1048                 else
1049                   {
1050                    List_Matches (List1, Node_Array [k] . Link, Depth);
1051                    k_Is_Leaf = Node_Array [k] . Sibling_Is_Leaf;
1052                    k = Node_Array [k] . Sibling;
1053                   }
1054              }
1055 #if  0
1056            if  (Is_Leaf)
1057                {
1058                 printf ("  Lf%7d\n", j);
1059                 Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1060                 j = Leaf_Array [j] . Sibling;
1061                }
1062              else
1063                {
1064                 printf ("  Nd%7d\n", j);
1065                 Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1066                 j = Node_Array [j] . Sibling;
1067                }
1068 #endif
1069           }
1070        }
1071 
1072    Start = End = 0;
1073    Is_Leaf = Node_Array [Root] . Child_Is_Leaf;
1074    for  (j = Node_Array [Root] . Child;  j != NIL; )
1075      {
1076       if  (Is_Leaf)
1077           {
1078            if  (Start == 0)
1079                Start = End = j;
1080              else
1081                {
1082                 Next_Leaf [End] = j;
1083                 End = j;
1084                }
1085            Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1086            j = Leaf_Array [j] . Sibling;
1087           }
1088         else
1089           {
1090            if  (Start == 0)
1091                Start = End = Node_Array [j] . Link;
1092              else
1093                Next_Leaf [End] = Node_Array [j] . Link;
1094            while  (Next_Leaf [End] != NIL)
1095              End = Next_Leaf [End];
1096            Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1097            j = Node_Array [j] . Sibling;
1098           }
1099      }
1100    Node_Array [Root] . Link = Start;
1101 #if  0
1102    printf  ("  Leaves: ");
1103    for  (j = Node_Array [Root] . Link;  j != NIL;  j = Next_Leaf [j])
1104      printf (" %3d", j);
1105    printf ("\n");
1106 #endif
1107 
1108    return;
1109   }
1110 
1111 
1112 
Longest_Prefix_Match(char * p,char * q)1113 int  Longest_Prefix_Match
1114     (char * p, char * q)
1115 
1116 //  Return the length of the longest common prefix of strings  p
1117 //  and  q .  Assumes they will mismatch before running off the
1118 //  end of either string.
1119 
1120   {
1121    int  i;
1122 
1123    for  (i = 0;  p [i] == q [i];  i ++)
1124      ;
1125 
1126    return  i;
1127   }
1128 
1129 
1130 
Mark_Skipable_Nodes(int Root,int Is_Leaf,int Parent,int Parent_Depth)1131 void  Mark_Skipable_Nodes
1132     (int Root, int Is_Leaf, int Parent, int Parent_Depth)
1133 
1134 //  Set the  Should_Skip  field of the node that  Root  links to
1135 //  if its  Subtree_Size  is the same as  Root 's,
1136 //  and do the same recursively in  Root 's subtree.
1137 //   Is_Leaf  indicates whether  Root  is a leaf or internal node.
1138 //   Parent  is the parent of  Root  and  Parent_Depth  is its
1139 //  string-depth in the suffix tree.
1140 
1141   {
1142    int  Link, Depth;
1143 
1144    if  (Root == NIL)
1145        return;
1146 
1147    if  (Is_Leaf)
1148        {
1149         Mark_Skipable_Nodes (Leaf_Array [Root] . Sibling,
1150                Leaf_Array [Root] . Sibling_Is_Leaf,
1151                Parent, Parent_Depth);
1152 
1153         return;
1154        }
1155 
1156    Depth = Parent_Depth + Node_Array [Root] . Len;
1157 
1158    Mark_Skipable_Nodes (Node_Array [Root] . Child, Node_Array [Root] . Child_Is_Leaf,
1159           Root, Depth);
1160 
1161    Mark_Skipable_Nodes (Node_Array [Root] . Sibling, Node_Array [Root] . Sibling_Is_Leaf,
1162           Parent, Parent_Depth);
1163 
1164    if  (Verbose > 1)
1165        printf ("Mark_Skipable_Nodes:  Root = %d  Parent = %d  Parent_Depth = %d\n",
1166                Root, Parent, Parent_Depth);
1167 
1168    Link = Node_Array [Root] . Link;
1169    if  (Link != NIL
1170           && Node_Array [Link] . Subtree_Size == Node_Array [Root] . Subtree_Size)
1171         Node_Array [Link] . Should_Skip = true;
1172 
1173    // Will mark the root of the entire tree as skippable, but that's
1174    // what we want
1175 
1176    return;
1177   }
1178 
1179 
1180 
New_Node(void)1181 int  New_Node  (void)
1182 
1183 /* Allocate and return the subscript of a new node. */
1184 
1185   {
1186    return  Next_Avail_Node ++;
1187   }
1188 
1189 
1190 
Parse_Command_Line(int argc,char * argv[])1191 static void  Parse_Command_Line
1192     (int argc, char * argv [])
1193 
1194 //  Get options and parameters from command line with  argc
1195 //  arguments in  argv [0 .. (argc - 1)] .
1196 
1197   {
1198    bool  errflg = false;
1199    char  * p;
1200    int  ch;
1201 
1202    optarg = NULL;
1203 
1204    while  (! errflg && ((ch = getopt (argc, argv, "Efn:tV:")) != EOF))
1205      switch  (ch)
1206        {
1207         case  'E' :
1208           Exhaustive_Matches = true;
1209           break;
1210 
1211         case  'f' :
1212           Forward_Only = true;
1213           break;
1214 
1215         case  'n' :
1216           Min_Match_Len = (int) strtol (optarg, & p, 10);
1217           if  (p == optarg || Min_Match_Len < 1)
1218               {
1219                fprintf (stderr, "ERROR:  Illegal min match length \"%s\"\n",
1220                         optarg);
1221                exit (-1);
1222               }
1223           break;
1224 
1225         case  't' :
1226           Forward_Only = true;
1227           Tandem_Only = true;
1228           break;
1229 
1230         case  'V' :
1231           Verbose = (int) strtol (optarg, & p, 10);
1232           if  (p == optarg)
1233               {
1234                fprintf (stderr, "ERROR:  Illegal verbose number \"%s\"\n",
1235                         optarg);
1236                exit (-1);
1237               }
1238           break;
1239 
1240         case  '?' :
1241           fprintf (stderr, "Unrecognized option -%c\n", optopt);
1242 
1243         default :
1244           errflg = true;
1245        }
1246 
1247    if  (errflg || optind != argc - 1)
1248        {
1249         Usage (argv [0]);
1250         exit (EXIT_FAILURE);
1251        }
1252 
1253    Input_File_Name = argv [optind ++];
1254 
1255    return;
1256   }
1257 
1258 
1259 
Set_Subtree_Size(int Root,int Is_Leaf,int Parent,int Parent_Depth)1260 void  Set_Subtree_Size
1261     (int Root, int Is_Leaf, int Parent, int Parent_Depth)
1262 
1263 //  Set the  Subtree_Size  field of  Root  and all its descendants
1264 //  to the number of leaves in its subtree.
1265 //   Is_Leaf  indicates whether  Root  is a leaf or internal node.
1266 //   Parent  is the parent of  Root  and  Parent_Depth  is its
1267 //  string-depth in the suffix tree.
1268 
1269   {
1270    int Depth;
1271    //   int Start;
1272 
1273    if  (Root == NIL)
1274        return;
1275 
1276    if  (Is_Leaf)
1277        {
1278          //        Start = Leaf_Array [Root] . Lo - Parent_Depth;
1279         if  (Leaf_Array [Root] . Is_Duplicate)
1280             Node_Array [Parent] . Subtree_Size ++;
1281           else
1282             Node_Array [Parent] . Subtree_Size += 2;
1283 
1284         Set_Subtree_Size (Leaf_Array [Root] . Sibling,
1285                Leaf_Array [Root] . Sibling_Is_Leaf,
1286                Parent, Parent_Depth);
1287 
1288         return;
1289        }
1290 
1291    Depth = Parent_Depth + Node_Array [Root] . Len;
1292 
1293    Set_Subtree_Size (Node_Array [Root] . Child, Node_Array [Root] . Child_Is_Leaf,
1294           Root, Depth);
1295 
1296    Set_Subtree_Size (Node_Array [Root] . Sibling, Node_Array [Root] . Sibling_Is_Leaf,
1297           Parent, Parent_Depth);
1298 
1299    if  (Verbose > 1)
1300        printf ("Set_Subtree_Size:  Root = %d  Parent = %d  Parent_Depth = %d\n",
1301                Root, Parent, Parent_Depth);
1302 
1303    if  (Parent != NIL)
1304        Node_Array [Parent] . Subtree_Size += Node_Array [Root] . Subtree_Size;
1305 
1306    return;
1307   }
1308 
1309 
1310 
New_Step_Down(int Node,int Node_Depth,int Lo,int Len,int & Depth,int Doing_Prefix,int & Return_Is_Leaf,int & Par_New_Node)1311 static int  New_Step_Down
1312     (int Node, int Node_Depth, int Lo, int Len, int & Depth, int Doing_Prefix,
1313      int & Return_Is_Leaf, int & Par_New_Node)
1314 
1315 /* Return the subscript of the node that represents the lowest
1316 *  descendant of  Node  that matches  Data [Lo .. Lo + Len - 1] .
1317 *  If necessary, allocate a new node and return its subscript,
1318 *  unless  Doing_Prefix .  In that case do not allocate a new node
1319 *  if  1 + Depth == Len  but return the child of where the new
1320 *  node would be allocated.  Set  Depth  to the number of characters
1321 *  successfully matched from  Node  down to the returned  Node .
1322 *  Node_Depth  is the depth of  Node  in the suffix tree.
1323 *  Set  Return_Is_Leaf  to indicate the status of the returned
1324 *  subscript.  If a new node is allocated, set  Par_New_Node
1325 *  to its parent.
1326 */
1327 
1328   {
1329    int  P, Q, P_Is_Leaf, i, j, D, Pred, Pred_Is_Leaf;
1330    int  P_Sib, P_Sib_Is_Leaf;
1331 
1332    Depth = 0;
1333 
1334    if  (Len == 0)
1335        return  NIL;
1336 
1337    P = New_Find_Child (Node, Data [Lo], P_Is_Leaf, Pred, Pred_Is_Leaf, Node_Depth);
1338 
1339    while  (P != NIL)
1340      {
1341       if  (P_Is_Leaf)
1342           {
1343            i = Leaf_Array [P] . Lo;
1344            D = Leaf_Array [P] . Len;
1345            P_Sib = Leaf_Array [P] . Sibling;
1346            P_Sib_Is_Leaf = Leaf_Array [P] . Sibling_Is_Leaf;
1347           }
1348         else
1349           {
1350            i = Node_Array [P] . Lo;
1351            D = Node_Array [P] . Len;
1352            P_Sib = Node_Array [P] . Sibling;
1353            P_Sib_Is_Leaf = Node_Array [P] . Sibling_Is_Leaf;
1354           }
1355       for  (j = 1;  j < Len && j < D
1356                       && Data [i + j] == Data [Lo + j];
1357                  j ++)
1358         ;
1359       Depth += j;
1360 
1361       if  (j < D)
1362           {
1363            if  (Doing_Prefix && 1 + j == Len)
1364                {
1365                 Return_Is_Leaf = P_Is_Leaf;
1366                 return  P;
1367                }
1368            Q = New_Node ();
1369            Node_Array [Q] . Lo = Lo;
1370            Node_Array [Q] . Len = j;
1371            Node_Array [Q] . Child = P;
1372            Node_Array [Q] . Sibling = P_Sib;
1373            Node_Array [Q] . Sibling_Is_Leaf = P_Sib_Is_Leaf;
1374            Par_New_Node = Node;
1375            Node_Array [Q] . Link = NIL;
1376            Node_Array [Q] . Child_Is_Leaf = P_Is_Leaf;
1377 #if  USE_EXTRA_FIELDS
1378            Node_Array [Q] . Parent = Node;
1379            Node_Array [Q] . Depth = Node_Array [Node] . Depth + j;
1380            Node_Array [Q] . ID = Curr_ID;
1381 #endif
1382            if  (Pred == NIL)
1383                {
1384                 Node_Array [Node] . Child = Q;
1385                 Node_Array [Node] . Child_Is_Leaf = false;
1386                }
1387            else if  (Pred_Is_Leaf)
1388                {
1389                 Leaf_Array [Pred] . Sibling = Q;
1390                 Leaf_Array [Pred] . Sibling_Is_Leaf = false;
1391                }
1392              else
1393                {
1394                 Node_Array [Pred] . Sibling = Q;
1395                 Node_Array [Pred] . Sibling_Is_Leaf = false;
1396                }
1397 
1398            if  (! P_Is_Leaf)
1399                {
1400                 Node_Array [P] . Lo += j;
1401                 Node_Array [P] . Len -= j;
1402 #if  USE_EXTRA_FIELDS
1403                 Node_Array [P] . Parent = Q;
1404 #endif
1405                 Node_Array [P] . Sibling = NIL;
1406                 Node_Array [P] . Sibling_Is_Leaf = false;
1407                }
1408              else
1409                {
1410                 Leaf_Array [P] . Lo += j;
1411                 Leaf_Array [P] . Len -= j;
1412 #if  USE_EXTRA_FIELDS
1413                 Leaf_Array [P] . Parent = Q;
1414 #endif
1415                 Leaf_Array [P] . Sibling = NIL;
1416                 Leaf_Array [P] . Sibling_Is_Leaf = false;
1417                }
1418 
1419            Return_Is_Leaf = false;
1420            return  Q;
1421           }
1422 
1423       if  (P_Is_Leaf || j == Len)
1424           {
1425 //           Return_Is_Leaf = true;
1426            Return_Is_Leaf = P_Is_Leaf;
1427            return  P;
1428           }
1429 
1430       Lo += j;
1431       Len -= j;
1432       Node = P;
1433       Node_Depth += j;
1434       P = New_Find_Child (Node, Data [Lo], P_Is_Leaf,
1435                       Pred, Pred_Is_Leaf, Node_Depth);
1436      }
1437 
1438    Return_Is_Leaf = false;
1439    return  Node;
1440   }
1441 
1442 
1443 
Usage(char * command)1444 static void  Usage
1445     (char * command)
1446 
1447 //  Print to stderr description of options and command line for
1448 //  this program.   command  is the command that was used to
1449 //  invoke it.
1450 
1451   {
1452    fprintf (stderr,
1453            "USAGE:  %s  [options]  <genome-file>\n"
1454            "\n"
1455            "Find all maximal exact matches in <genome-file>\n"
1456            "\n"
1457            "Options:\n"
1458            " -E    Use exhaustive (slow) search to find matches\n"
1459            " -f    Forward strand only, don't use reverse complement\n"
1460            " -n #  Set minimum exact match length to #\n"
1461            " -t    Only output tandem repeats\n"
1462            " -V #  Set level of verbose (debugging) printing to #\n"
1463            "\n",
1464            command);
1465 
1466    return;
1467   }
1468 
1469 
1470 
Verify_Match(int a,int b,int n,int reverse)1471 void  Verify_Match
1472     (int a, int b, int n, int reverse)
1473 
1474 //  Verify that the match of length  n  starting at  Data [a]  and
1475 //  Data [b]  is a maximal match.  If  reverse  is true then the
1476 //  string at  Data [b]  goes in the reverse direction.
1477 //  Assume  Data  is padded at the ends to prevent errors
1478 //  caused by falling off the ends.
1479 
1480   {
1481    int  i;
1482 
1483    if  (reverse)
1484        {
1485         if  (Data [a - 1] == Complement (Data [b + 1]))
1486             printf ("Verify_Match:  a=%d  b=%d  rev=%c  preceding chars match\n",
1487                     a, b, 't');
1488         if  (Data [a + n] == Complement (Data [b - n]))
1489             printf ("Verify_Match:  a=%d  b=%d  rev=%c  following chars match\n",
1490                     a, b, 't');
1491         for  (i = 0;  i < n;  i ++)
1492           if  (Data [a + i] != Complement (Data [b - i]))
1493               {
1494                printf ("Verify_Match:  a=%d  b=%d  rev=%c  mismatch at %d[%c]:%d[%c]\n",
1495                        a, b, 't', a + i, Data [a + i], b - i, Complement (Data [b - i]));
1496                exit (EXIT_FAILURE);
1497               }
1498        }
1499      else
1500        {
1501         if  (Data [a - 1] == Data [b - 1])
1502             printf ("Verify_Match:  a=%d  b=%d  rev=%c  preceding chars match\n",
1503                     a, b, 'f');
1504         if  (Data [a + n] == Data [b + n])
1505             printf ("Verify_Match:  a=%d  b=%d  rev=%c  following chars match\n",
1506                     a, b, 'f');
1507         for  (i = 0;  i < n;  i ++)
1508           if  (Data [a + i] != Data [b + i])
1509               {
1510                printf ("Verify_Match:  a=%d  b=%d  rev=%c  mismatch at %d[%c]:%d[%c]\n",
1511                        a, b, 'f', a + i, Data [a + i], b + i, Data [b + i]);
1512                exit (EXIT_FAILURE);
1513               }
1514        }
1515 
1516    return;
1517   }
1518