1 /* Programmer: A. Delcher
2 * Revised: 12 December 2002
3 * File: MUMmer/repeat-match.cc
4 *
5 * This program identifies maximal exact repeat regions (longer than
6 * MIN_MATCH_LEN threshold) in input genome. Uses both forward and
7 * reverse strand.
8 */
9
10
11 #include <mummer/tigrinc.hh>
12
13
14 #define SHOW_TREE 0
15 #define USE_EXTRA_FIELDS 0
16 #define DEBUG 0
17
18
19 int Global_Trace = 0;
20 int Global_Skip_Ct = 0;
21 int Global_Non_Skip_Ct = 0;
22
23
24 #define NIL 0 // Remove if convert to pointers
25
26 const int DEFAULT_MIN_MATCH_LEN = 20;
27 const char DOLLAR_CHAR = '$';
28 // const char DONT_KNOW_CHAR = 'N';
29 const int MAX_NAME_LEN = 500;
30 const char START_CHAR = '%';
31
32
33
34 typedef struct node
35 {
36 int Lo;
37 unsigned Child : 31;
38 unsigned Child_Is_Leaf : 1;
39 int Sibling : 31;
40 unsigned Sibling_Is_Leaf : 1;
41 unsigned Link;
42 #if USE_EXTRA_FIELDS
43 int Depth, ID;
44 unsigned Parent;
45 #endif
46 int Len : 31;
47 unsigned Should_Skip : 1;
48 int Subtree_Size;
49 } Node_Type, * Node_Ptr;
50
51 typedef struct leaf
52 {
53 int Lo;
54 int Sibling : 31;
55 unsigned Sibling_Is_Leaf : 1;
56 #if USE_EXTRA_FIELDS
57 int Depth, ID;
58 unsigned Parent;
59 #endif
60 int Len : 31;
61 unsigned Is_Duplicate : 1;
62 } Leaf_Type, * Leaf_Ptr;
63
64
65 char * Data;
66 Leaf_Ptr Leaf_Array;
67 Node_Ptr Node_Array;
68 int * Next_Leaf;
69
70 int Curr_ID;
71 int Curr_String_ID;
72 int Data_Len = 2;
73 static bool Exhaustive_Matches = false;
74 // Set by -E option; if true then matches are found by exhaustive search
75 // For testing purposes
76 static bool Forward_Only = false;
77 // Set by -f option; if true then matches to reverse complement string
78 // are not considered
79 long int Genome_Len;
80 static char * Input_File_Name = NULL;
81 // Name of input fasta file on command line
82 int Longest_String = 0;
83 int Max_Depth = 0;
84 int Min_Match_Len = DEFAULT_MIN_MATCH_LEN;
85 int Next_Avail_Node = 1;
86 int Num_Strings = 2;
87 int String_Separator;
88 static bool Tandem_Only = false;
89 // Set by -t option to output only tandem repeats
90 int Verbose = 0;
91 // Set by -V option to do extra tests and/or print debugging output
92
93
94 int Add_Duplicates
95 (int, int, int, int);
96 int Add_String
97 (int, int);
98 int Build_Suffix_Tree
99 (int);
100 static int New_Find_Child
101 (int, char, int &, int &, int &, int);
102 static int New_Jump_Down
103 (int, int, int, int, int &, int &);
104 void List_Matches
105 (int, int, int);
106 int List_Tree
107 (int root, int is_leaf, int parent, int parent_depth);
108 void List_Maximal_Matches
109 (int, int, int, int);
110 int Longest_Prefix_Match
111 (char * p, char * q);
112 void Mark_Skipable_Nodes
113 (int, int, int, int);
114 int New_Node
115 (void);
116 static void Parse_Command_Line
117 (int argc, char * argv []);
118 void Set_Subtree_Size
119 (int, int, int, int);
120 static int New_Step_Down
121 (int, int, int, int, int &, int, int &, int &);
122 static void Usage
123 (char * command);
124 void Verify_Match
125 (int a, int b, int n, int reverse);
126
127
128
main(int argc,char * argv[])129 int main (int argc, char * argv [])
130
131 {
132 FILE * fp;
133 long int Input_Size;
134 int Root;
135 char Name [MAX_NAME_LEN];
136
137 Parse_Command_Line (argc, argv);
138
139 if (Verbose > 0)
140 {
141 fprintf (stderr, "Verbose = %d\n", Verbose);
142 printf ("Node size = %d\n", (int) sizeof (Node_Type));
143 printf ("Leaf size = %d\n", (int) sizeof (Leaf_Type));
144 printf ("sizeof (int) = %d\n", (int) sizeof (int));
145 printf ("sizeof (size_t) = %d\n", (int) sizeof (size_t));
146 }
147
148 fp = File_Open (Input_File_Name, "r");
149
150 Data = (char *) Safe_malloc (INIT_SIZE);
151 Input_Size = INIT_SIZE;
152
153 Read_String (fp, Data, Input_Size, Name, false);
154 fclose (fp);
155
156 Genome_Len = strlen (Data + 1);
157 Data [0] = START_CHAR;
158 Data = (char *) Safe_realloc (Data, 2 * Genome_Len + 4);
159 strcpy (Data + Genome_Len + 2, Data + 1);
160 Reverse_Complement (Data, Genome_Len + 2, 2 * Genome_Len + 1);
161 Data [1 + Genome_Len] = DOLLAR_CHAR;
162 Data [2 + 2 * Genome_Len] = DOLLAR_CHAR;
163 Data_Len = 3 + 2 * Genome_Len;
164 Data [Data_Len] = '\0';
165
166 String_Separator = 1 + Genome_Len;
167
168 Leaf_Array = (Leaf_Ptr) Safe_calloc (Data_Len, sizeof (Leaf_Type));
169
170 Node_Array = (Node_Ptr) Safe_calloc (Data_Len, sizeof (Node_Type));
171 Next_Leaf = (int *) Safe_calloc (Data_Len, sizeof (int));
172
173 Curr_String_ID = 0;
174 Root = Build_Suffix_Tree (1);
175
176 if (! Forward_Only)
177 {
178 Curr_String_ID = 1;
179 if (Add_String (2 + Genome_Len, Root) != 0)
180 return -1;
181 }
182
183 #if SHOW_TREE
184 printf ("\n\nNodes in Entire Suffix Tree:\n\n");
185 List_Tree (Root, false, 0, 0);
186 #endif
187
188 if(Verbose > 0)
189 fprintf (stderr, "Genome Length = %ld Used %d internal nodes\n",
190 Genome_Len, Next_Avail_Node);
191
192 Set_Subtree_Size (Root, false, NIL, 0);
193
194 Mark_Skipable_Nodes (Root, false, NIL, 0);
195
196 if (Exhaustive_Matches)
197 { // Exhaustive search for matches
198 int i, j, match;
199
200 Data [Genome_Len + 1] = '#';
201
202 printf ("Exhaustive Exact Matches:\n");
203 printf ("%9s %10s %8s\n", "Start1", "Start2", "Length");
204
205 for (i = 1; i <= Genome_Len - Min_Match_Len; i ++)
206 for (j = i + 1; j <= 1 + Genome_Len - Min_Match_Len; j ++)
207 {
208 if (Data [i - 1] == Data [j - 1])
209 continue;
210 match = Longest_Prefix_Match (Data + i, Data + j);
211 if (match >= Min_Match_Len)
212 printf ("%9d %10d%c %8d\n", i, j, ' ', match);
213 }
214
215 for (i = 1; i <= Genome_Len + 1 - Min_Match_Len; i ++)
216 for (j = Genome_Len + 2; j <= 2 * Genome_Len - i; j ++)
217 {
218 if (Data [i - 1] == Data [j - 1])
219 continue;
220 match = Longest_Prefix_Match (Data + i, Data + j);
221 if (match >= Min_Match_Len)
222 printf ("%9d %10d%c %8d\n", i, int (2 * Genome_Len + 2 - j), 'r', match);
223 }
224
225 Data [Genome_Len + 1] = DOLLAR_CHAR;
226 }
227 else
228 {
229 printf ("Long Exact Matches:\n");
230 printf ("%9s %10s %8s\n", "Start1", "Start2", "Length");
231
232 List_Maximal_Matches (Root, false, NIL, 0);
233 }
234
235 if (Verbose > 1)
236 {
237 fprintf (stderr, "Global_Skip_Ct = %d\n", Global_Skip_Ct);
238 fprintf (stderr, "Global_Non_Skip_Ct = %d\n", Global_Non_Skip_Ct);
239 }
240
241 return 0;
242 }
243
244
245
Add_Duplicates(int Start,int End,int Leaf,int Leaf_Depth)246 int Add_Duplicates (int Start, int End, int Leaf, int Leaf_Depth)
247
248 /* Mark all duplicate occurrences of suffixes in Data [Start .. End]
249 * in suffix tree where the first duplicate is at Leaf whose
250 * depth is Leaf_Depth . */
251
252 {
253 int i, j;
254
255 j = Leaf;
256 for (i = Start; i <= End; i ++, j++)
257 Leaf_Array [j] . Is_Duplicate = true;
258
259 return 0;
260 }
261
262
263
Add_String(int Start,int Root)264 int Add_String (int Start, int Root)
265
266 /* Add all suffixes of string Data [Start ...] ending at the
267 * first DOLLAR_CHAR encountered to the suffix tree rooted at
268 * Root . */
269
270 {
271 int Leaf, End, Last_Parent, Grandparent;
272 int Segment_Len, Segment_Start, Leaf_Len;
273 int Last_Parent_Depth, Link_Depth, Matched, Offset;
274 int New_Place, New_Depth, Made_New_Node, New_Place_Is_Leaf;
275
276 for (End = Start; Data [End] != DOLLAR_CHAR; End ++)
277 ;
278
279 Curr_ID = Start;
280 Segment_Start = Start;
281 Segment_Len = 1 + End - Start;
282 New_Place = New_Step_Down (Root, 0, Segment_Start, Segment_Len,
283 Matched, true, New_Place_Is_Leaf, Grandparent);
284
285 if (Segment_Len == Matched)
286 {
287 printf ("*** Genome is exact palindrome ***\n");
288 return -1;
289 }
290
291 if (1 + Matched == Segment_Len)
292 {
293 Offset = 0;
294 if (New_Place_Is_Leaf)
295 fprintf (stderr, "ERROR: Unexpected leaf\n");
296 else
297 {
298 while (! Node_Array [New_Place] . Child_Is_Leaf)
299 {
300 New_Place = Node_Array [New_Place] . Child;
301 Offset += abs (Node_Array [New_Place] . Len);
302 }
303 New_Place = Node_Array [New_Place] . Child;
304 Offset += abs (Leaf_Array [New_Place] . Len) - 1;
305 }
306 printf ("String %d is a substring of previous string.\n",
307 Curr_String_ID);
308 return 0;
309 }
310
311 Leaf = Start;
312 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
313 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
314 Leaf_Array [Leaf] . Sibling_Is_Leaf
315 = Node_Array [New_Place] . Child_Is_Leaf;
316 Node_Array [New_Place] . Child = Leaf;
317 Node_Array [New_Place] . Child_Is_Leaf = true;
318 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
319 #if USE_EXTRA_FIELDS
320 Leaf_Array [Leaf] . Depth = 1 + End - Start;
321 Leaf_Array [Leaf] . ID = Start;
322 Leaf_Array [Leaf] . Parent = New_Place;
323 #endif
324
325 Last_Parent = New_Place;
326 Last_Parent_Depth = Matched;
327
328 while (++ Start <= End)
329 {
330 Curr_ID ++;
331 if (Node_Array [Last_Parent] . Link != NIL)
332 {
333 if (Last_Parent == Root)
334 {
335 Segment_Start = Start;
336 Segment_Len = 1 + End - Start;
337 Link_Depth = Last_Parent_Depth;
338 }
339 else
340 {
341 Segment_Start = Start + Last_Parent_Depth - 1;
342 Segment_Len = 2 + End - Start - Last_Parent_Depth;
343 Link_Depth = Last_Parent_Depth - 1;
344 }
345 New_Place = New_Step_Down (Node_Array [Last_Parent] . Link,
346 Link_Depth,
347 Segment_Start, Segment_Len,
348 Matched, false,
349 New_Place_Is_Leaf, Grandparent);
350 if (Matched == Segment_Len)
351 {
352 New_Depth = Link_Depth + Matched;
353 return Add_Duplicates (Start, End, New_Place, New_Depth);
354 }
355
356 Leaf = Start;
357 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
358 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
359 Leaf_Array [Leaf] . Sibling_Is_Leaf
360 = Node_Array [New_Place] . Child_Is_Leaf;
361 Node_Array [New_Place] . Child = Leaf;
362 Node_Array [New_Place] . Child_Is_Leaf = true;
363 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
364 #if USE_EXTRA_FIELDS
365 Leaf_Array [Leaf] . Depth = 1 + End - Start;
366 Leaf_Array [Leaf] . ID = Start;
367 Leaf_Array [Leaf] . Parent = New_Place;
368 #endif
369
370 Last_Parent = New_Place;
371 Last_Parent_Depth = Link_Depth + Matched;
372 }
373 else
374 {
375 Leaf_Len = Leaf_Array [Leaf] . Len;
376 if (Grandparent == Root)
377 {
378 Segment_Start = Start;
379 Segment_Len = Node_Array [Last_Parent] . Len - 1;
380 Link_Depth = 0;
381 }
382 else
383 {
384 Segment_Start = Start + Last_Parent_Depth - 1
385 - Node_Array [Last_Parent] . Len;
386 Segment_Len = Node_Array [Last_Parent] . Len;
387 Link_Depth = Last_Parent_Depth - 1
388 - Node_Array [Last_Parent] . Len;
389 }
390 New_Place = New_Jump_Down (Node_Array [Grandparent] . Link,
391 Link_Depth,
392 Segment_Start, Segment_Len,
393 Made_New_Node, Grandparent);
394
395 if (Made_New_Node)
396 {
397 Leaf = Start;
398 Leaf_Array [Leaf] . Lo = Segment_Start + Segment_Len;
399 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
400 Leaf_Array [Leaf] . Sibling_Is_Leaf
401 = Node_Array [New_Place] . Child_Is_Leaf;
402 Node_Array [New_Place] . Child = Leaf;
403 Node_Array [New_Place] . Child_Is_Leaf = true;
404 Leaf_Array [Leaf] . Len = Leaf_Len;
405 #if USE_EXTRA_FIELDS
406 Leaf_Array [Leaf] . Depth = 1 + End - Start;
407 Leaf_Array [Leaf] . ID = Start;
408 Leaf_Array [Leaf] . Parent = New_Place;
409 #endif
410 Node_Array [Last_Parent] . Link = New_Place;
411 Last_Parent = New_Place;
412 Last_Parent_Depth = Link_Depth + Segment_Len;
413 }
414 else
415 {
416 Node_Array [Last_Parent] . Link = New_Place;
417 Segment_Start += Segment_Len;
418 Link_Depth += Segment_Len;
419 Segment_Len = Leaf_Len;
420 New_Place = New_Step_Down (New_Place, Link_Depth,
421 Segment_Start, Segment_Len,
422 Matched, false,
423 New_Place_Is_Leaf, Grandparent);
424 if (Matched >= Segment_Len)
425 {
426 New_Depth = Link_Depth + Matched;
427 return Add_Duplicates (Start, End, New_Place, New_Depth);
428 }
429
430 Leaf = Start;
431 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
432 Leaf_Array [Leaf] . Sibling
433 = Node_Array [New_Place] . Child;
434 Leaf_Array [Leaf] . Sibling_Is_Leaf
435 = Node_Array [New_Place] . Child_Is_Leaf;
436 Node_Array [New_Place] . Child = Leaf;
437 Node_Array [New_Place] . Child_Is_Leaf = true;
438 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
439 #if USE_EXTRA_FIELDS
440 Leaf_Array [Leaf] . Depth = 1 + End - Start;
441 Leaf_Array [Leaf] . ID = Start;
442 Leaf_Array [Leaf] . Parent = New_Place;
443 #endif
444
445 Last_Parent = New_Place;
446 Last_Parent_Depth = Link_Depth + Matched;
447 }
448 }
449 }
450
451 return 0;
452 }
453
454
455
Build_Suffix_Tree(int Start)456 int Build_Suffix_Tree (int Start)
457
458 /* Build a suffix tree for the string Data [Start ...] ending at
459 * the first DOLLAR_CHAR encountered. Return the subscript of the
460 * root of the resulting tree. */
461
462 {
463 int Root, Leaf, End, Last_Parent, Grandparent;
464 int Segment_Len, Segment_Start, Leaf_Len;
465 int Last_Parent_Depth, Link_Depth, Matched;
466 int New_Place, Made_New_Node, New_Place_Is_Leaf;
467
468 for (End = Start; Data [End] != DOLLAR_CHAR; End ++)
469 ;
470
471 Curr_ID = Start;
472 Root = New_Node ();
473 Leaf = Start;
474
475 Node_Array [Root] . Lo = 0;
476 Node_Array [Root] . Child = Leaf;
477 Node_Array [Root] . Sibling = NIL;
478 Node_Array [Root] . Link = Root;
479 Node_Array [Root] . Len = 0;
480 Node_Array [Root] . Sibling_Is_Leaf = false;
481 Node_Array [Root] . Child_Is_Leaf = true;
482 #if USE_EXTRA_FIELDS
483 Node_Array [Root] . Depth = 0;
484 Node_Array [Root] . ID = Curr_ID;
485 Node_Array [Root] . Parent = NIL;
486 #endif
487
488 Leaf_Array [Leaf] . Lo = Start;
489 Leaf_Array [Leaf] . Sibling = NIL;
490 Leaf_Array [Leaf] . Len = 1 + End - Start;
491 Leaf_Array [Leaf] . Sibling_Is_Leaf = false;
492 #if USE_EXTRA_FIELDS
493 Leaf_Array [Leaf] . Depth = 1 + End - Start;
494 Leaf_Array [Leaf] . ID = Start;
495 Leaf_Array [Leaf] . Parent = Root;
496 #endif
497
498 Last_Parent = Root;
499 Last_Parent_Depth = 0;
500
501 while (++ Start <= End)
502 {
503 if (Verbose > 1)
504 printf ("Build_Suffix_Tree: Start = %d\n", Start);
505
506 Curr_ID ++;
507 if (Node_Array [Last_Parent] . Link != NIL)
508 {
509 if (Last_Parent == Root)
510 {
511 Segment_Start = Start;
512 Segment_Len = 1 + End - Start;
513 Link_Depth = Last_Parent_Depth;
514 }
515 else
516 {
517 Segment_Start = Start + Last_Parent_Depth - 1;
518 Segment_Len = 2 + End - Start - Last_Parent_Depth;
519 Link_Depth = Last_Parent_Depth - 1;
520 }
521 New_Place = New_Step_Down (Node_Array [Last_Parent] . Link,
522 Link_Depth,
523 Segment_Start, Segment_Len,
524 Matched, false,
525 New_Place_Is_Leaf, Grandparent);
526 if (Matched >= Segment_Len)
527 {
528 printf ("Ooops: Suffix can't appear twice.\n");
529 exit (EXIT_FAILURE);
530 }
531 else
532 {
533 Leaf = Start;
534 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
535 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
536 Leaf_Array [Leaf] . Sibling_Is_Leaf
537 = Node_Array [New_Place] . Child_Is_Leaf;
538 Node_Array [New_Place] . Child = Leaf;
539 Node_Array [New_Place] . Child_Is_Leaf = true;
540 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
541 #if USE_EXTRA_FIELDS
542 Leaf_Array [Leaf] . Depth = 1 + End - Start;
543 Leaf_Array [Leaf] . ID = Start;
544 Leaf_Array [Leaf] . Parent = New_Place;
545 #endif
546 }
547 Last_Parent = New_Place;
548 Last_Parent_Depth = Link_Depth + Matched;
549 }
550 else
551 {
552 Leaf_Len = Leaf_Array [Leaf] . Len;
553 if (Grandparent == Root)
554 {
555 Segment_Start = Start;
556 Segment_Len = Node_Array [Last_Parent] . Len - 1;
557 Link_Depth = 0;
558 }
559 else
560 {
561 Segment_Start = Start + Last_Parent_Depth - 1
562 - Node_Array [Last_Parent] . Len;
563 Segment_Len = Node_Array [Last_Parent] . Len;
564 Link_Depth = Last_Parent_Depth - 1
565 - Node_Array [Last_Parent] . Len;
566 }
567
568 New_Place = New_Jump_Down (Node_Array [Grandparent] . Link,
569 Link_Depth,
570 Segment_Start, Segment_Len,
571 Made_New_Node, Grandparent);
572 if (Made_New_Node)
573 {
574 Leaf = Start;
575 Leaf_Array [Leaf] . Lo = Segment_Start + Segment_Len;
576 Leaf_Array [Leaf] . Sibling = Node_Array [New_Place] . Child;
577 Leaf_Array [Leaf] . Sibling_Is_Leaf
578 = Node_Array [New_Place] . Child_Is_Leaf;
579 Node_Array [New_Place] . Child = Leaf;
580 Node_Array [New_Place] . Child_Is_Leaf = true;
581 Leaf_Array [Leaf] . Len = Leaf_Len;
582 #if USE_EXTRA_FIELDS
583 Leaf_Array [Leaf] . Depth = 1 + End - Start;
584 Leaf_Array [Leaf] . ID = Start;
585 Leaf_Array [Leaf] . Parent = New_Place;
586 #endif
587 Node_Array [Last_Parent] . Link = New_Place;
588 Last_Parent = New_Place;
589 Last_Parent_Depth = Link_Depth + Segment_Len;
590 }
591 else
592 {
593 Node_Array [Last_Parent] . Link = New_Place;
594 Segment_Start += Segment_Len;
595 Link_Depth += Segment_Len;
596 Segment_Len = Leaf_Len;
597 New_Place = New_Step_Down (New_Place, Link_Depth,
598 Segment_Start, Segment_Len,
599 Matched, false,
600 New_Place_Is_Leaf, Grandparent);
601 if (Matched >= Segment_Len)
602 {
603 printf ("Ooops: Suffix can't appear twice.\n");
604 exit (EXIT_FAILURE);
605 }
606 else
607 {
608 Leaf = Start;
609 Leaf_Array [Leaf] . Lo = Segment_Start + Matched;
610 Leaf_Array [Leaf] . Sibling
611 = Node_Array [New_Place] . Child;
612 Leaf_Array [Leaf] . Sibling_Is_Leaf
613 = Node_Array [New_Place] . Child_Is_Leaf;
614 Node_Array [New_Place] . Child = Leaf;
615 Node_Array [New_Place] . Child_Is_Leaf = true;
616 Leaf_Array [Leaf] . Len = Segment_Len - Matched;
617 #if USE_EXTRA_FIELDS
618 Leaf_Array [Leaf] . Depth = 1 + End - Start;
619 Leaf_Array [Leaf] . ID = Start;
620 Leaf_Array [Leaf] . Parent = New_Place;
621 #endif
622 }
623 Last_Parent = New_Place;
624 Last_Parent_Depth = Link_Depth + Matched;
625 }
626 }
627 }
628
629 return Root;
630 }
631
632
633
New_Find_Child(int Node,char Ch,int & Is_Leaf,int & Pred,int & Pred_Is_Leaf,int Node_Depth)634 static int New_Find_Child
635 (int Node, char Ch, int & Is_Leaf, int & Pred, int & Pred_Is_Leaf,
636 int Node_Depth)
637
638 /* Return the subscript of child of Node whose string starts
639 * with Ch and set Is_Leaf to indicate what kind of node
640 * that child is. Set Pred to the subscript of the prior
641 * sibling of the chosen child and Pred_Is_Leaf to indicate
642 * whether Pred is a leaf or not. Node_Depth is the depth of Node
643 * in the tree.
644 */
645
646 {
647 int Leaf_Lo, Leaf_Len;
648 int i;
649 char Start_Ch;
650
651 Pred = NIL;
652 Pred_Is_Leaf = false;
653 i = Node_Array [Node] . Child;
654 Is_Leaf = Node_Array [Node] . Child_Is_Leaf;
655 if (Is_Leaf)
656 {
657 Leaf_Lo = Leaf_Array [i] . Lo;
658 Leaf_Len = Leaf_Array [i] . Len;
659 if (Leaf_Len > 0)
660 Start_Ch = Data [Leaf_Lo];
661 else
662 Start_Ch = Complement (Data [Leaf_Lo]);
663 }
664 else
665 {
666 if (Node_Array [i] . Len > 0)
667 Start_Ch = Data [Node_Array [i] . Lo];
668 else
669 Start_Ch = Complement (Data [Node_Array [i] . Lo]);
670 }
671
672 while (i != NIL && Start_Ch != Ch)
673 {
674 Pred = i;
675 Pred_Is_Leaf = Is_Leaf;
676 if (Is_Leaf)
677 {
678 Is_Leaf = Leaf_Array [i] . Sibling_Is_Leaf;
679 i = Leaf_Array [i] . Sibling;
680 }
681 else
682 {
683 Is_Leaf = Node_Array [i] . Sibling_Is_Leaf;
684 i = Node_Array [i] . Sibling;
685 }
686 if (Is_Leaf)
687 {
688 Leaf_Lo = Leaf_Array [i] . Lo;
689 Leaf_Len = Leaf_Array [i] . Len;
690 if (Leaf_Len > 0)
691 Start_Ch = Data [Leaf_Lo];
692 else
693 Start_Ch = Complement (Data [Leaf_Lo]);
694 }
695 else
696 {
697 if (Node_Array [i] . Len > 0)
698 Start_Ch = Data [Node_Array [i] . Lo];
699 else
700 Start_Ch = Complement (Data [Node_Array [i] . Lo]);
701 }
702 }
703
704 return i;
705 }
706
707
708
New_Jump_Down(int Node,int Node_Depth,int Lo,int Len,int & Made_New_Node,int & Par_New_Node)709 static int New_Jump_Down
710 (int Node, int Node_Depth, int Lo, int Len, int & Made_New_Node,
711 int & Par_New_Node)
712
713 /* Return the subscript of the node that represents the
714 * descendant of Node that matches Data [Lo .. Lo + Len - 1] .
715 * Check only the first character of each child substring--the
716 * rest are assumed to match automatically.
717 * Node_Depth is the depth of Node in the suffix tree.
718 * If necessary, allocate a new node and return its subscript.
719 * Set Made_New_Node to TRUE iff a new node was allocated and
720 * set Par_New_Node to the parent of that new node.
721 */
722
723 {
724 int P, Q, P_Is_Leaf, D, Pred, Pred_Is_Leaf;
725 int P_Sib, P_Sib_Is_Leaf;
726
727 Made_New_Node = false;
728 if (Len == 0)
729 return Node;
730
731 if (Verbose > 1)
732 printf ("Jump_Down: Node = %d Depth = %d Lo = %d Len = %d\n",
733 Node, Node_Depth, Lo, Len);
734
735 P = New_Find_Child (Node, Data [Lo], P_Is_Leaf, Pred, Pred_Is_Leaf,
736 Node_Depth);
737
738 while (P != NIL)
739 {
740 if (P_Is_Leaf)
741 {
742 D = Leaf_Array [P] . Len;
743 P_Sib = Leaf_Array [P] . Sibling;
744 P_Sib_Is_Leaf = Leaf_Array [P] . Sibling_Is_Leaf;
745 }
746 else
747 {
748 D = Node_Array [P] . Len;
749 P_Sib = Node_Array [P] . Sibling;
750 P_Sib_Is_Leaf = Node_Array [P] . Sibling_Is_Leaf;
751 }
752
753 if (Len == D)
754 return P;
755
756 if (D < Len)
757 {
758 Lo += D;
759 Len -= D;
760 Node = P;
761 Node_Depth += D;
762 P = New_Find_Child (Node, Data [Lo], P_Is_Leaf,
763 Pred, Pred_Is_Leaf, Node_Depth);
764 }
765 else
766 {
767 Q = New_Node ();
768 Node_Array [Q] . Lo = Lo;
769 Node_Array [Q] . Len = Len;
770 Node_Array [Q] . Child = P;
771 Node_Array [Q] . Sibling = P_Sib;
772 Node_Array [Q] . Sibling_Is_Leaf = P_Sib_Is_Leaf;
773 Par_New_Node = Node;
774 Node_Array [Q] . Link = NIL;
775 Node_Array [Q] . Child_Is_Leaf = P_Is_Leaf;
776 #if USE_EXTRA_FIELDS
777 Node_Array [Q] . Parent = Node;
778 Node_Array [Q] . Depth = Node_Array [Node] . Depth + Len;
779 Node_Array [Q] . ID = Curr_ID;
780 #endif
781 if (Pred == NIL)
782 {
783 Node_Array [Node] . Child = Q;
784 Node_Array [Node] . Child_Is_Leaf = false;
785 }
786 else if (Pred_Is_Leaf)
787 {
788 Leaf_Array [Pred] . Sibling = Q;
789 Leaf_Array [Pred] . Sibling_Is_Leaf = false;
790 }
791 else
792 {
793 Node_Array [Pred] . Sibling = Q;
794 Node_Array [Pred] . Sibling_Is_Leaf = false;
795 }
796
797 if (! P_Is_Leaf)
798 {
799 Node_Array [P] . Lo += Len;
800 Node_Array [P] . Len -= Len;
801 #if USE_EXTRA_FIELDS
802 Node_Array [P] . Parent = Q;
803 #endif
804 Node_Array [P] . Sibling = NIL;
805 Node_Array [P] . Sibling_Is_Leaf = false;
806 }
807 else
808 {
809 Leaf_Array [P] . Lo += Len;
810 Leaf_Array [P] . Len -= Len;
811 #if USE_EXTRA_FIELDS
812 Leaf_Array [P] . Parent = Q;
813 #endif
814 Leaf_Array [P] . Sibling = NIL;
815 Leaf_Array [P] . Sibling_Is_Leaf = false;
816 }
817
818 Made_New_Node = true;
819 return Q;
820 }
821
822 }
823
824 printf ("Ooops: Couldn't find appropriate child node\n");
825 exit (EXIT_FAILURE);
826
827 return 0;
828 }
829
830
831
List_Matches(int A,int B,int n)832 void List_Matches
833 (int A, int B, int n)
834
835 /* List all maximal matches of length n between entries in
836 * the list starting at
837 * subscript A in array Next_Leaf and the list starting
838 * at subscript B in the same array. Don't list a match
839 * if it's already been listed in a different order (because
840 * of the reverse complement strand. */
841
842 {
843 int i, j, k, L, R, Reversed;
844
845 for (i = A; i != NIL; i = Next_Leaf [i])
846 {
847 for (j = B; j != NIL; j = Next_Leaf [j])
848 {
849 if (Data [i - 1] == Data [j - 1]
850 || Data [i + n] == Data [j + n]
851 || (i > String_Separator && j > String_Separator))
852 continue;
853 Reversed = false;
854 if (j > String_Separator)
855 {
856 k = Genome_Len - (j - String_Separator) - n + 2;
857 if (k < i)
858 continue;
859 L = i;
860 R = k + n - 1;
861 Reversed = true;
862 }
863 else if (i > String_Separator)
864 {
865 k = Genome_Len - (i - String_Separator) - n + 2;
866 if (k < j)
867 continue;
868 L = j;
869 R = k + n - 1;
870 Reversed = true;
871 }
872 else if (i < j)
873 {
874 L = i;
875 R = j;
876 }
877 else
878 {
879 L = j;
880 R = i;
881 }
882 if (Verbose > 1)
883 Verify_Match (L, R, n, Reversed);
884 if (Tandem_Only && L + n < R)
885 continue;
886 printf ("%9d %10d%c %8d\n", L, R, Reversed ? 'r' : ' ', n);
887 }
888 }
889 }
890
891
892
List_Tree(int Root,int Is_Leaf,int Parent,int Parent_Depth)893 int List_Tree (int Root, int Is_Leaf, int Parent, int Parent_Depth)
894
895 // Show contents of suffix tree rooted at Root whose parent's
896 // string depth in the suffix tree is Depth . Is_Leaf
897 // indicates whether Root is a leaf node or not.
898 // Parent is the subscript of this node's parent.
899
900 {
901 int i, j;
902
903 while (Root != NIL)
904 if (Is_Leaf)
905 {
906 if (Leaf_Array [Root] . Len > 0)
907 {
908 j = Leaf_Array [Root] . Lo - Parent_Depth;
909 printf ("Leaf %d: ", j);
910 for (i = 0; i < Leaf_Array [Root] . Len; i ++)
911 putchar (Data [Leaf_Array [Root] . Lo + i]);
912 }
913 else
914 {
915 j = - (Leaf_Array [Root] . Lo + Parent_Depth);
916 printf ("Leaf %d: ", j);
917 for (i = 0; i > Leaf_Array [Root] . Len; i --)
918 putchar (Complement (Data [Leaf_Array [Root] . Lo + i]));
919 }
920 printf (" Par = %d", Parent);
921 #if USE_EXTRA_FIELDS
922 printf (" Depth = %d", Leaf_Array [Root] . Depth);
923 #endif
924 if (Leaf_Array [Root] . Is_Duplicate)
925 printf (" Duplicate = %d", int (Root + Genome_Len + 1));
926 putchar ('\n');
927 Is_Leaf = Leaf_Array [Root] . Sibling_Is_Leaf;
928 Root = Leaf_Array [Root] . Sibling;
929 }
930
931 else
932 {
933 #if USE_EXTRA_FIELDS
934 printf ("Node %d: ID %d ", Root, Node_Array [Root] . ID);
935 #else
936 printf ("Node %d: ", Root);
937 #endif
938 if (Node_Array [Root] . Len > 0)
939 for (i = 0; i < Node_Array [Root] . Len; i ++)
940 putchar (Data [Node_Array [Root] . Lo + i]);
941 else
942 for (i = 0; i > Node_Array [Root] . Len; i --)
943 putchar (Complement (Data [Node_Array [Root] . Lo + i]));
944 printf (" Par = %d", Parent);
945 #if USE_EXTRA_FIELDS
946 printf (" Depth = %d", Node_Array [Root] . Depth);
947 if (Node_Array [Root] . Link != NIL)
948 printf (" Link to node %d (ID %d)", Node_Array [Root] . Link,
949 Node_Array [Node_Array [Root] . Link] . ID);
950 #else
951 if (Node_Array [Root] . Link != NIL)
952 printf (" Link to node %d", Node_Array [Root] . Link);
953 #endif
954 putchar ('\n');
955 List_Tree (Node_Array [Root] . Child,
956 Node_Array [Root] . Child_Is_Leaf, Root,
957 Parent_Depth + abs (Node_Array [Root] . Len));
958 Is_Leaf = Node_Array [Root] . Sibling_Is_Leaf;
959 Root = Node_Array [Root] . Sibling;
960 }
961
962 return 0;
963 }
964
965
966
List_Maximal_Matches(int Root,int Is_Leaf,int Parent,int Parent_Depth)967 void List_Maximal_Matches (int Root, int Is_Leaf, int Parent, int Parent_Depth)
968
969 /* List substring pairs that occur more than once where the match
970 * does not extend either to the left or to the right for
971 * the subtree rooted at Root . Is_Leaf indicates
972 * whether Root is a leaf or internal node. Parent is the
973 * parent of Root and Parent_Depth is its string-depth in the
974 * suffix tree. */
975
976 {
977 int j, k, Depth, List1, Start, End;
978 int k_Is_Leaf;
979
980 if (Root == NIL)
981 return;
982
983 if (Is_Leaf)
984 {
985 // Skip any match here, it's other version at the start of
986 // the string will be output instead
987
988 List_Maximal_Matches (Leaf_Array [Root] . Sibling,
989 Leaf_Array [Root] . Sibling_Is_Leaf,
990 Parent, Parent_Depth);
991
992 return;
993 }
994
995 Depth = Parent_Depth + Node_Array [Root] . Len;
996
997 List_Maximal_Matches (Node_Array [Root] . Child,
998 Node_Array [Root] . Child_Is_Leaf, Root, Depth);
999
1000 List_Maximal_Matches (Node_Array [Root] . Sibling,
1001 Node_Array [Root] . Sibling_Is_Leaf, Parent, Parent_Depth);
1002
1003 if (Verbose > 1)
1004 printf ("List_Maximal_Matches: Root = %d Parent = %d Parent_Depth = %d\n"
1005 " Depth = %d Subtree_Size = %d\n",
1006 Root, Parent, Parent_Depth, Depth,
1007 Node_Array [Root] . Subtree_Size);
1008
1009 if (Depth >= Min_Match_Len)
1010 {
1011 if (Node_Array [Root] . Should_Skip)
1012 Global_Skip_Ct ++;
1013 else
1014 Global_Non_Skip_Ct ++;
1015 }
1016 if (Depth >= Min_Match_Len
1017 && ! Node_Array [Root] . Should_Skip)
1018 {
1019 Is_Leaf = Node_Array [Root] . Child_Is_Leaf;
1020 for (j = Node_Array [Root] . Child; j != NIL; )
1021 {
1022 if (Verbose > 1)
1023 printf (" Child = %d %s\n", j, Is_Leaf ? "leaf" : "node");
1024 if (Is_Leaf)
1025 {
1026 List1 = j;
1027 k = Leaf_Array [j] . Sibling;
1028 k_Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1029 Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1030 j = Leaf_Array [j] . Sibling;
1031 }
1032 else
1033 {
1034 List1 = Node_Array [j] . Link;
1035 k = Node_Array [j] . Sibling;
1036 k_Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1037 Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1038 j = Node_Array [j] . Sibling;
1039 }
1040 while (k != NIL)
1041 {
1042 if (k_Is_Leaf)
1043 {
1044 List_Matches (List1, k, Depth);
1045 k_Is_Leaf = Leaf_Array [k] . Sibling_Is_Leaf;
1046 k = Leaf_Array [k] . Sibling;
1047 }
1048 else
1049 {
1050 List_Matches (List1, Node_Array [k] . Link, Depth);
1051 k_Is_Leaf = Node_Array [k] . Sibling_Is_Leaf;
1052 k = Node_Array [k] . Sibling;
1053 }
1054 }
1055 #if 0
1056 if (Is_Leaf)
1057 {
1058 printf (" Lf%7d\n", j);
1059 Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1060 j = Leaf_Array [j] . Sibling;
1061 }
1062 else
1063 {
1064 printf (" Nd%7d\n", j);
1065 Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1066 j = Node_Array [j] . Sibling;
1067 }
1068 #endif
1069 }
1070 }
1071
1072 Start = End = 0;
1073 Is_Leaf = Node_Array [Root] . Child_Is_Leaf;
1074 for (j = Node_Array [Root] . Child; j != NIL; )
1075 {
1076 if (Is_Leaf)
1077 {
1078 if (Start == 0)
1079 Start = End = j;
1080 else
1081 {
1082 Next_Leaf [End] = j;
1083 End = j;
1084 }
1085 Is_Leaf = Leaf_Array [j] . Sibling_Is_Leaf;
1086 j = Leaf_Array [j] . Sibling;
1087 }
1088 else
1089 {
1090 if (Start == 0)
1091 Start = End = Node_Array [j] . Link;
1092 else
1093 Next_Leaf [End] = Node_Array [j] . Link;
1094 while (Next_Leaf [End] != NIL)
1095 End = Next_Leaf [End];
1096 Is_Leaf = Node_Array [j] . Sibling_Is_Leaf;
1097 j = Node_Array [j] . Sibling;
1098 }
1099 }
1100 Node_Array [Root] . Link = Start;
1101 #if 0
1102 printf (" Leaves: ");
1103 for (j = Node_Array [Root] . Link; j != NIL; j = Next_Leaf [j])
1104 printf (" %3d", j);
1105 printf ("\n");
1106 #endif
1107
1108 return;
1109 }
1110
1111
1112
Longest_Prefix_Match(char * p,char * q)1113 int Longest_Prefix_Match
1114 (char * p, char * q)
1115
1116 // Return the length of the longest common prefix of strings p
1117 // and q . Assumes they will mismatch before running off the
1118 // end of either string.
1119
1120 {
1121 int i;
1122
1123 for (i = 0; p [i] == q [i]; i ++)
1124 ;
1125
1126 return i;
1127 }
1128
1129
1130
Mark_Skipable_Nodes(int Root,int Is_Leaf,int Parent,int Parent_Depth)1131 void Mark_Skipable_Nodes
1132 (int Root, int Is_Leaf, int Parent, int Parent_Depth)
1133
1134 // Set the Should_Skip field of the node that Root links to
1135 // if its Subtree_Size is the same as Root 's,
1136 // and do the same recursively in Root 's subtree.
1137 // Is_Leaf indicates whether Root is a leaf or internal node.
1138 // Parent is the parent of Root and Parent_Depth is its
1139 // string-depth in the suffix tree.
1140
1141 {
1142 int Link, Depth;
1143
1144 if (Root == NIL)
1145 return;
1146
1147 if (Is_Leaf)
1148 {
1149 Mark_Skipable_Nodes (Leaf_Array [Root] . Sibling,
1150 Leaf_Array [Root] . Sibling_Is_Leaf,
1151 Parent, Parent_Depth);
1152
1153 return;
1154 }
1155
1156 Depth = Parent_Depth + Node_Array [Root] . Len;
1157
1158 Mark_Skipable_Nodes (Node_Array [Root] . Child, Node_Array [Root] . Child_Is_Leaf,
1159 Root, Depth);
1160
1161 Mark_Skipable_Nodes (Node_Array [Root] . Sibling, Node_Array [Root] . Sibling_Is_Leaf,
1162 Parent, Parent_Depth);
1163
1164 if (Verbose > 1)
1165 printf ("Mark_Skipable_Nodes: Root = %d Parent = %d Parent_Depth = %d\n",
1166 Root, Parent, Parent_Depth);
1167
1168 Link = Node_Array [Root] . Link;
1169 if (Link != NIL
1170 && Node_Array [Link] . Subtree_Size == Node_Array [Root] . Subtree_Size)
1171 Node_Array [Link] . Should_Skip = true;
1172
1173 // Will mark the root of the entire tree as skippable, but that's
1174 // what we want
1175
1176 return;
1177 }
1178
1179
1180
New_Node(void)1181 int New_Node (void)
1182
1183 /* Allocate and return the subscript of a new node. */
1184
1185 {
1186 return Next_Avail_Node ++;
1187 }
1188
1189
1190
Parse_Command_Line(int argc,char * argv[])1191 static void Parse_Command_Line
1192 (int argc, char * argv [])
1193
1194 // Get options and parameters from command line with argc
1195 // arguments in argv [0 .. (argc - 1)] .
1196
1197 {
1198 bool errflg = false;
1199 char * p;
1200 int ch;
1201
1202 optarg = NULL;
1203
1204 while (! errflg && ((ch = getopt (argc, argv, "Efn:tV:")) != EOF))
1205 switch (ch)
1206 {
1207 case 'E' :
1208 Exhaustive_Matches = true;
1209 break;
1210
1211 case 'f' :
1212 Forward_Only = true;
1213 break;
1214
1215 case 'n' :
1216 Min_Match_Len = (int) strtol (optarg, & p, 10);
1217 if (p == optarg || Min_Match_Len < 1)
1218 {
1219 fprintf (stderr, "ERROR: Illegal min match length \"%s\"\n",
1220 optarg);
1221 exit (-1);
1222 }
1223 break;
1224
1225 case 't' :
1226 Forward_Only = true;
1227 Tandem_Only = true;
1228 break;
1229
1230 case 'V' :
1231 Verbose = (int) strtol (optarg, & p, 10);
1232 if (p == optarg)
1233 {
1234 fprintf (stderr, "ERROR: Illegal verbose number \"%s\"\n",
1235 optarg);
1236 exit (-1);
1237 }
1238 break;
1239
1240 case '?' :
1241 fprintf (stderr, "Unrecognized option -%c\n", optopt);
1242
1243 default :
1244 errflg = true;
1245 }
1246
1247 if (errflg || optind != argc - 1)
1248 {
1249 Usage (argv [0]);
1250 exit (EXIT_FAILURE);
1251 }
1252
1253 Input_File_Name = argv [optind ++];
1254
1255 return;
1256 }
1257
1258
1259
Set_Subtree_Size(int Root,int Is_Leaf,int Parent,int Parent_Depth)1260 void Set_Subtree_Size
1261 (int Root, int Is_Leaf, int Parent, int Parent_Depth)
1262
1263 // Set the Subtree_Size field of Root and all its descendants
1264 // to the number of leaves in its subtree.
1265 // Is_Leaf indicates whether Root is a leaf or internal node.
1266 // Parent is the parent of Root and Parent_Depth is its
1267 // string-depth in the suffix tree.
1268
1269 {
1270 int Depth;
1271 // int Start;
1272
1273 if (Root == NIL)
1274 return;
1275
1276 if (Is_Leaf)
1277 {
1278 // Start = Leaf_Array [Root] . Lo - Parent_Depth;
1279 if (Leaf_Array [Root] . Is_Duplicate)
1280 Node_Array [Parent] . Subtree_Size ++;
1281 else
1282 Node_Array [Parent] . Subtree_Size += 2;
1283
1284 Set_Subtree_Size (Leaf_Array [Root] . Sibling,
1285 Leaf_Array [Root] . Sibling_Is_Leaf,
1286 Parent, Parent_Depth);
1287
1288 return;
1289 }
1290
1291 Depth = Parent_Depth + Node_Array [Root] . Len;
1292
1293 Set_Subtree_Size (Node_Array [Root] . Child, Node_Array [Root] . Child_Is_Leaf,
1294 Root, Depth);
1295
1296 Set_Subtree_Size (Node_Array [Root] . Sibling, Node_Array [Root] . Sibling_Is_Leaf,
1297 Parent, Parent_Depth);
1298
1299 if (Verbose > 1)
1300 printf ("Set_Subtree_Size: Root = %d Parent = %d Parent_Depth = %d\n",
1301 Root, Parent, Parent_Depth);
1302
1303 if (Parent != NIL)
1304 Node_Array [Parent] . Subtree_Size += Node_Array [Root] . Subtree_Size;
1305
1306 return;
1307 }
1308
1309
1310
New_Step_Down(int Node,int Node_Depth,int Lo,int Len,int & Depth,int Doing_Prefix,int & Return_Is_Leaf,int & Par_New_Node)1311 static int New_Step_Down
1312 (int Node, int Node_Depth, int Lo, int Len, int & Depth, int Doing_Prefix,
1313 int & Return_Is_Leaf, int & Par_New_Node)
1314
1315 /* Return the subscript of the node that represents the lowest
1316 * descendant of Node that matches Data [Lo .. Lo + Len - 1] .
1317 * If necessary, allocate a new node and return its subscript,
1318 * unless Doing_Prefix . In that case do not allocate a new node
1319 * if 1 + Depth == Len but return the child of where the new
1320 * node would be allocated. Set Depth to the number of characters
1321 * successfully matched from Node down to the returned Node .
1322 * Node_Depth is the depth of Node in the suffix tree.
1323 * Set Return_Is_Leaf to indicate the status of the returned
1324 * subscript. If a new node is allocated, set Par_New_Node
1325 * to its parent.
1326 */
1327
1328 {
1329 int P, Q, P_Is_Leaf, i, j, D, Pred, Pred_Is_Leaf;
1330 int P_Sib, P_Sib_Is_Leaf;
1331
1332 Depth = 0;
1333
1334 if (Len == 0)
1335 return NIL;
1336
1337 P = New_Find_Child (Node, Data [Lo], P_Is_Leaf, Pred, Pred_Is_Leaf, Node_Depth);
1338
1339 while (P != NIL)
1340 {
1341 if (P_Is_Leaf)
1342 {
1343 i = Leaf_Array [P] . Lo;
1344 D = Leaf_Array [P] . Len;
1345 P_Sib = Leaf_Array [P] . Sibling;
1346 P_Sib_Is_Leaf = Leaf_Array [P] . Sibling_Is_Leaf;
1347 }
1348 else
1349 {
1350 i = Node_Array [P] . Lo;
1351 D = Node_Array [P] . Len;
1352 P_Sib = Node_Array [P] . Sibling;
1353 P_Sib_Is_Leaf = Node_Array [P] . Sibling_Is_Leaf;
1354 }
1355 for (j = 1; j < Len && j < D
1356 && Data [i + j] == Data [Lo + j];
1357 j ++)
1358 ;
1359 Depth += j;
1360
1361 if (j < D)
1362 {
1363 if (Doing_Prefix && 1 + j == Len)
1364 {
1365 Return_Is_Leaf = P_Is_Leaf;
1366 return P;
1367 }
1368 Q = New_Node ();
1369 Node_Array [Q] . Lo = Lo;
1370 Node_Array [Q] . Len = j;
1371 Node_Array [Q] . Child = P;
1372 Node_Array [Q] . Sibling = P_Sib;
1373 Node_Array [Q] . Sibling_Is_Leaf = P_Sib_Is_Leaf;
1374 Par_New_Node = Node;
1375 Node_Array [Q] . Link = NIL;
1376 Node_Array [Q] . Child_Is_Leaf = P_Is_Leaf;
1377 #if USE_EXTRA_FIELDS
1378 Node_Array [Q] . Parent = Node;
1379 Node_Array [Q] . Depth = Node_Array [Node] . Depth + j;
1380 Node_Array [Q] . ID = Curr_ID;
1381 #endif
1382 if (Pred == NIL)
1383 {
1384 Node_Array [Node] . Child = Q;
1385 Node_Array [Node] . Child_Is_Leaf = false;
1386 }
1387 else if (Pred_Is_Leaf)
1388 {
1389 Leaf_Array [Pred] . Sibling = Q;
1390 Leaf_Array [Pred] . Sibling_Is_Leaf = false;
1391 }
1392 else
1393 {
1394 Node_Array [Pred] . Sibling = Q;
1395 Node_Array [Pred] . Sibling_Is_Leaf = false;
1396 }
1397
1398 if (! P_Is_Leaf)
1399 {
1400 Node_Array [P] . Lo += j;
1401 Node_Array [P] . Len -= j;
1402 #if USE_EXTRA_FIELDS
1403 Node_Array [P] . Parent = Q;
1404 #endif
1405 Node_Array [P] . Sibling = NIL;
1406 Node_Array [P] . Sibling_Is_Leaf = false;
1407 }
1408 else
1409 {
1410 Leaf_Array [P] . Lo += j;
1411 Leaf_Array [P] . Len -= j;
1412 #if USE_EXTRA_FIELDS
1413 Leaf_Array [P] . Parent = Q;
1414 #endif
1415 Leaf_Array [P] . Sibling = NIL;
1416 Leaf_Array [P] . Sibling_Is_Leaf = false;
1417 }
1418
1419 Return_Is_Leaf = false;
1420 return Q;
1421 }
1422
1423 if (P_Is_Leaf || j == Len)
1424 {
1425 // Return_Is_Leaf = true;
1426 Return_Is_Leaf = P_Is_Leaf;
1427 return P;
1428 }
1429
1430 Lo += j;
1431 Len -= j;
1432 Node = P;
1433 Node_Depth += j;
1434 P = New_Find_Child (Node, Data [Lo], P_Is_Leaf,
1435 Pred, Pred_Is_Leaf, Node_Depth);
1436 }
1437
1438 Return_Is_Leaf = false;
1439 return Node;
1440 }
1441
1442
1443
Usage(char * command)1444 static void Usage
1445 (char * command)
1446
1447 // Print to stderr description of options and command line for
1448 // this program. command is the command that was used to
1449 // invoke it.
1450
1451 {
1452 fprintf (stderr,
1453 "USAGE: %s [options] <genome-file>\n"
1454 "\n"
1455 "Find all maximal exact matches in <genome-file>\n"
1456 "\n"
1457 "Options:\n"
1458 " -E Use exhaustive (slow) search to find matches\n"
1459 " -f Forward strand only, don't use reverse complement\n"
1460 " -n # Set minimum exact match length to #\n"
1461 " -t Only output tandem repeats\n"
1462 " -V # Set level of verbose (debugging) printing to #\n"
1463 "\n",
1464 command);
1465
1466 return;
1467 }
1468
1469
1470
Verify_Match(int a,int b,int n,int reverse)1471 void Verify_Match
1472 (int a, int b, int n, int reverse)
1473
1474 // Verify that the match of length n starting at Data [a] and
1475 // Data [b] is a maximal match. If reverse is true then the
1476 // string at Data [b] goes in the reverse direction.
1477 // Assume Data is padded at the ends to prevent errors
1478 // caused by falling off the ends.
1479
1480 {
1481 int i;
1482
1483 if (reverse)
1484 {
1485 if (Data [a - 1] == Complement (Data [b + 1]))
1486 printf ("Verify_Match: a=%d b=%d rev=%c preceding chars match\n",
1487 a, b, 't');
1488 if (Data [a + n] == Complement (Data [b - n]))
1489 printf ("Verify_Match: a=%d b=%d rev=%c following chars match\n",
1490 a, b, 't');
1491 for (i = 0; i < n; i ++)
1492 if (Data [a + i] != Complement (Data [b - i]))
1493 {
1494 printf ("Verify_Match: a=%d b=%d rev=%c mismatch at %d[%c]:%d[%c]\n",
1495 a, b, 't', a + i, Data [a + i], b - i, Complement (Data [b - i]));
1496 exit (EXIT_FAILURE);
1497 }
1498 }
1499 else
1500 {
1501 if (Data [a - 1] == Data [b - 1])
1502 printf ("Verify_Match: a=%d b=%d rev=%c preceding chars match\n",
1503 a, b, 'f');
1504 if (Data [a + n] == Data [b + n])
1505 printf ("Verify_Match: a=%d b=%d rev=%c following chars match\n",
1506 a, b, 'f');
1507 for (i = 0; i < n; i ++)
1508 if (Data [a + i] != Data [b + i])
1509 {
1510 printf ("Verify_Match: a=%d b=%d rev=%c mismatch at %d[%c]:%d[%c]\n",
1511 a, b, 'f', a + i, Data [a + i], b + i, Data [b + i]);
1512 exit (EXIT_FAILURE);
1513 }
1514 }
1515
1516 return;
1517 }
1518