1 /* @source embword ************************************************************
2 **
3 ** Wordmatch routines
4 **
5 ** @author Copyright (c) 1999 Gary Williams
6 ** @version $Revision: 1.65 $
7 ** @modified $Date: 2012/07/14 14:52:41 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 **
25 ******************************************************************************/
26 
27 #include "ajlib.h"
28 
29 #include "embword.h"
30 #include "ajassert.h"
31 #include "ajseq.h"
32 #include "ajfeat.h"
33 #include "ajfile.h"
34 #include "ajlist.h"
35 #include "ajtable.h"
36 #include "ajutil.h"
37 
38 #include <math.h>
39 
40 
41 
42 
43 /*
44 ** current wordlength - this is an easily accessible copy of the value
45 ** in the first node of wordLengthList
46 */
47 static ajuint wordLength = 0;
48 
49 /* list of wordlengths with current one at top of list */
50 static AjPList wordLengthList = NULL;
51 
52 static AjPList wordCurList = NULL;
53 
54 /*
55 ** Rabin-Karp multi-pattern search parameters.
56 ** Modulus (a large prime) and radix are used in calculating hash values
57 ** efficiently.
58 **
59 ** Note: we should be able to replace binary search in Rabin-Karp search
60 ** algorithm with direct search by selecting a small q, however in this case
61 ** we should always check that a hit is a correct hit (as we currently do).
62 ** Radix has a relation with alphabet size, selecting its value depending
63 ** on input might be awarding in this context.
64 **
65 */
66 
67 #define RK_MODULUS 1073741789UL
68 #define RK_RADIX 256UL
69 
70 
71 
72 static ajint    wordCmpStr(const void *x, const void *y);
73 static ajint    wordCompare(const void *x, const void *y);
74 static void     wordCurIterTrace(const AjIList curiter);
75 static void     wordCurListTrace(const AjPList curlist);
76 static ajint    wordDeadZone(EmbPWordMatch match,
77 			     ajint deadx1, ajint deady1,
78 			     int deadx2, ajint deady2, ajint minlength);
79 static void     wordListInsertNodeOld(AjPListNode* pnode, void* x);
80 static void     wordListInsertOld(AjIList iter, void* x);
81 static ajint    wordMatchCmp(const void* v1, const void* v2);
82 static ajint    wordMatchCmpPos(const void* v1, const void* v2);
83 static void     wordNewListTrace(ajint i, const AjPList newlist);
84 static void     wordOrderPosMatchTable(AjPList unorderedList);
85 
86 static ajulong  wordStrHash(const void *key, ajulong hashsize);
87 
88 static void     wordVFreeLocs(void **value);
89 static void     wordVFreeSeqlocs(void **value);
90 
91 static ajint    wordRabinKarpCmp(const void *trgseq, const void *qryseq);
92 static ajulong  wordRabinKarpConstant(ajuint m);
93 
94 
95 
96 
97 /* @funcstatic wordCmpStr *****************************************************
98 **
99 ** Compare two words for first n chars. n set by embWordLength.
100 **
101 ** @param [r] x [const void *] First word
102 ** @param [r] y [const void *] Second word
103 ** @return [ajint] difference
104 **
105 ** @release 1.0.0
106 ** @@
107 ******************************************************************************/
108 
wordCmpStr(const void * x,const void * y)109 static ajint wordCmpStr(const void *x, const void *y)
110 {
111     return ajCharCmpCaseLen((const char *)x, (const char *)y, wordLength);
112 }
113 
114 
115 
116 
117 /* @funcstatic wordStrHash ****************************************************
118 **
119 ** Create hash value from key.
120 **
121 ** @param [r] key [const void *] key.
122 ** @param [r] hashsize [ajulong] Hash size
123 ** @return [ajulong] hash value
124 **
125 ** @release 1.0.0
126 ** @@
127 ******************************************************************************/
128 
wordStrHash(const void * key,ajulong hashsize)129 static ajulong wordStrHash(const void *key, ajulong hashsize)
130 {
131     ajulong hashval;
132     const char *s;
133 
134     ajuint i;
135 
136     s = (const char *) key;
137 
138     for(i=0, hashval = 0; i < wordLength; i++, s++)
139 	hashval = toupper((ajint)*s) + 31 *hashval;
140 
141     return hashval % hashsize;
142 }
143 
144 
145 
146 
147 /* @funcstatic wordCompare ****************************************************
148 **
149 ** Compare two words in descending order.
150 **
151 ** @param [r] x [const void *] First word
152 ** @param [r] y [const void *] Second word
153 ** @return [ajint] count difference for words.
154 **
155 ** @release 2.1.0
156 ** @@
157 ******************************************************************************/
158 
wordCompare(const void * x,const void * y)159 static ajint wordCompare(const void *x, const void *y)
160 {
161 /*
162     const EmbPWord xw;
163     const EmbPWord yw;
164 
165     xw = ((const EmbPWord2)x)->fword;
166     yw = ((const EmbPWord2)y)->fword;
167 
168     return (yw->count - xw->count);
169 */
170 
171     return ((*(EmbPWord const *)y)->count -
172 	    (*(EmbPWord const *)x)->count);
173 }
174 
175 
176 
177 
178 /* @func embWordLength ********************************************************
179 **
180 ** Sets the word length for all functions. Must be called first.
181 ** Creates the word length list if not yet done.
182 ** Pushes the latest word length value on the list.
183 **
184 ** @param [r] wordlen [ajint] Word length
185 ** @return [void]
186 **
187 ** @release 1.0.0
188 ** @@
189 ******************************************************************************/
190 
embWordLength(ajint wordlen)191 void embWordLength(ajint wordlen)
192 {
193     ajint *pint;
194 
195     if(!wordLengthList)
196 	wordLengthList = ajListNew();
197 
198     /* store the wordlength in case we do recursive word stuff */
199     AJNEW0(pint);
200     *pint = wordlen;
201     ajListPush(wordLengthList, pint);
202 
203     /* set the current wordlength as an easily accessible static ajint */
204     wordLength = wordlen;
205 
206     return;
207 }
208 
209 
210 
211 
212 /* @func embWordClear *********************************************************
213 **
214 ** Clears the word length for all functions. To be called when all is done.
215 ** Pops the last word length from the list and frees it.
216 ** If there is nothing else on the list, it frees the list.
217 **
218 ** @return [void]
219 **
220 ** @release 1.0.0
221 ** @@
222 ******************************************************************************/
223 
embWordClear(void)224 void embWordClear(void)
225 {
226     ajint *pint;
227 
228     /*
229     ** pop the previous word length from the list in case there's
230     ** recursive word stuff
231     */
232     if(ajListGetLength(wordLengthList))
233     {
234 	ajListPop(wordLengthList, (void **)&pint);
235 	AJFREE(pint);
236     }
237 
238     if(!ajListGetLength(wordLengthList))
239     {
240 	ajListFree(&wordLengthList);
241 	wordLengthList = NULL;		/* no valid word length set */
242 	wordLength = 0;			/* no valid word length set */
243     }
244     else
245     {
246 	/* set the current wordlength as an easily accessible static ajint */
247 	ajListPeekFirst(wordLengthList, (void **)&pint);
248 	wordLength = *pint;
249     }
250 
251     return;
252 }
253 
254 
255 
256 
257 /* @func embWordPrintTable ****************************************************
258 **
259 ** Print the words found with their frequencies.
260 **
261 ** @param [r] table [const AjPTable] table to be printed
262 ** @return [void]
263 **
264 ** @release 1.0.0
265 ** @@
266 ******************************************************************************/
267 
embWordPrintTable(const AjPTable table)268 void embWordPrintTable(const AjPTable table)
269 {
270     void **valarray = NULL;
271     EmbPWord ajnew;
272     ajint i;
273 
274     ajTableToarrayValues(table, &valarray);
275 
276     qsort(valarray, (size_t) ajTableGetLength(table), sizeof (*valarray), wordCompare);
277 
278     for(i = 0; valarray[i]; i++)
279     {
280 	ajnew = (EmbPWord) valarray[i];
281 	ajUser("%.*s\t%d", wordLength, ajnew->fword,ajnew->count);
282     }
283 
284     AJFREE(valarray);
285 
286     return;
287 }
288 
289 
290 
291 
292 /* @func embWordPrintTableFI **************************************************
293 **
294 ** Print the words found with their frequencies.
295 **
296 ** @param [r] table [const AjPTable] table to be printed
297 ** @param [r] mincount [ajint] Minimum frequency to report
298 ** @param [u] outf [AjPFile] Output file.
299 ** @return [void]
300 **
301 ** @release 4.0.0
302 ** @@
303 ******************************************************************************/
304 
embWordPrintTableFI(const AjPTable table,ajint mincount,AjPFile outf)305 void embWordPrintTableFI(const AjPTable table, ajint mincount, AjPFile outf)
306 {
307     void **valarray = NULL;
308     EmbPWord ajnew;
309     ajint i;
310 
311     if(!ajTableGetLength(table)) return;
312 
313     i = (ajuint) ajTableToarrayValues(table, &valarray);
314 
315     ajDebug("embWordPrintTableFI size %d mincount:%d\n", i, mincount);
316 
317     for(i = 0; valarray[i]; i++)
318     {
319 	ajnew = (EmbPWord) valarray[i];
320 	ajDebug("embWordPrintTableFI unsorted [%d] %.*s %d\n",
321 		i, wordLength, ajnew->fword,ajnew->count);
322     }
323 
324     qsort(valarray, (size_t) ajTableGetLength(table), sizeof (*valarray), wordCompare);
325 
326     for(i = 0; valarray[i]; i++)
327     {
328 	ajnew = (EmbPWord) valarray[i];
329 	ajDebug("embWordPrintTableFI sorted [%d] %.*s %d\n",
330 		i, wordLength, ajnew->fword,ajnew->count);
331 
332 	if(ajnew->count < mincount)
333             break;
334 
335 	ajFmtPrintF(outf, "%.*s\t%d\n",
336 			   wordLength, ajnew->fword,ajnew->count);
337     }
338 
339     AJFREE(valarray);
340 
341     return;
342 }
343 
344 
345 
346 
347 /* @func embWordPrintTableF ***************************************************
348 **
349 ** Print the words found with their frequencies.
350 **
351 ** @param [r] table [const AjPTable] table to be printed
352 ** @param [u] outf [AjPFile] Output file.
353 ** @return [void]
354 **
355 ** @release 1.0.0
356 ** @@
357 ******************************************************************************/
358 
embWordPrintTableF(const AjPTable table,AjPFile outf)359 void embWordPrintTableF(const AjPTable table, AjPFile outf)
360 {
361     embWordPrintTableFI(table, 1, outf);
362 
363     return;
364 }
365 
366 
367 
368 
369 /* @funcstatic wordVFreeLocs **************************************************
370 **
371 ** Free the elements in a EmbPWord locations table.
372 **
373 ** @param [d] value [void**] Data value for a table item
374 ** @return [void]
375 **
376 ** @release 6.4.0
377 ** @@
378 ******************************************************************************/
379 
wordVFreeLocs(void ** value)380 static void wordVFreeLocs(void **value)
381 {
382     AjPTable table = ((EmbPWord)*value)->seqlocs;
383 
384     ajTableDelValdel(&table, &wordVFreeSeqlocs);
385 
386     /* free the locations structure */
387     AJFREE(*value);
388 
389     return;
390 }
391 
392 
393 
394 
395 /* @funcstatic wordVFreeSeqlocs ***********************************************
396 **
397 ** Free the elements in a EmbPWord sequence locations list.
398 **
399 ** @param [d] value [void**] Data values as void**
400 ** @return [void]
401 **
402 ** @release 6.4.0
403 ** @@
404 ******************************************************************************/
405 
wordVFreeSeqlocs(void ** value)406 static void wordVFreeSeqlocs(void **value)
407 {
408     AjPList list = ((EmbPWordSeqLocs)*value)->locs;
409 
410     /* free the elements in the list of the positions */
411     ajListFreeData(&list);
412 
413     /* free the locations structure */
414     AJFREE(*value);
415 
416     return;
417 }
418 
419 
420 
421 
422 /* @func embWordFreeTable *****************************************************
423 **
424 ** delete the word table and free the memory.
425 **
426 ** @param [d] table [AjPTable*] table to be deleted
427 ** @return [void]
428 **
429 ** @release 1.0.0
430 ** @@
431 ******************************************************************************/
432 
embWordFreeTable(AjPTable * table)433 void embWordFreeTable(AjPTable *table)
434 {
435     ajTableDel(table);
436 
437     return;
438 }
439 
440 
441 
442 
443 /* @funcstatic wordMatchListDelete ********************************************
444 **
445 ** deletes entries in a list of matches.
446 **
447 ** @param [d] x [void**] Data values as void**
448 ** @param [r] cl [void*] Ignored user data, usually NULL.
449 ** @return [void]
450 **
451 ** @release 2.1.0
452 ** @@
453 ******************************************************************************/
454 
wordMatchListDelete(void ** x,void * cl)455 static void wordMatchListDelete(void **x,void *cl)
456 {
457     EmbPWordMatch p;
458 
459     p = (EmbPWordMatch)*x;
460 
461     AJFREE(p);
462 
463     if(!cl)
464         return;
465 
466     return;
467 }
468 
469 
470 
471 
472 /* @func embWordMatchListDelete ***********************************************
473 **
474 ** delete the word table.
475 **
476 ** @param [u] plist [AjPList*] list to be deleted.
477 ** @return [void]
478 **
479 ** @release 1.0.0
480 ** @@
481 ******************************************************************************/
482 
embWordMatchListDelete(AjPList * plist)483 void embWordMatchListDelete(AjPList* plist)
484 {
485     if(!*plist)
486 	return;
487 
488     ajListMap(*plist, &wordMatchListDelete, NULL);
489     ajListFree(plist);
490 
491     return;
492 }
493 
494 
495 
496 
497 /* @funcstatic wordMatchListPrint *********************************************
498 **
499 ** print the word table.
500 **
501 ** @param [r] x [void*] List item (EmbPWordMatch*)
502 ** @param [r] cl [void*] Output file AjPFile
503 ** @return [void]
504 **
505 ** @release 2.1.0
506 ** @@
507 ******************************************************************************/
508 
wordMatchListPrint(void * x,void * cl)509 static void wordMatchListPrint(void *x,void *cl)
510 {
511     EmbPWordMatch p;
512     AjPFile file;
513 
514     p    = (EmbPWordMatch)x;
515     file = (AjPFile) cl;
516 
517     ajFmtPrintF(file, "%10d  %10d %10d\n",
518 		       p->seq1start+1,
519 		       p->seq2start+1,
520 		       p->length);
521 
522     return;
523 }
524 
525 
526 
527 
528 /* @func embWordMatchListPrint ************************************************
529 **
530 ** print the word table.
531 **
532 ** @param [u] file [AjPFile] Output file
533 ** @param [r] list [const AjPList] list to be printed.
534 ** @return [void]
535 **
536 ** @release 1.0.0
537 ** @@
538 ******************************************************************************/
539 
embWordMatchListPrint(AjPFile file,const AjPList list)540 void embWordMatchListPrint(AjPFile file, const AjPList list)
541 {
542     ajListMapread(list, &wordMatchListPrint, file);
543 
544     return;
545 }
546 
547 
548 
549 
550 /* @func embWordMatchListConvToFeat *******************************************
551 **
552 ** convert the word table to feature tables.
553 **
554 ** @param [r] list [const AjPList] list to be printed.
555 ** @param [u] tab1 [AjPFeattable*] feature table for sequence 1
556 ** @param [u] tab2 [AjPFeattable*] feature table for sequence 2
557 ** @param [r] seq1 [const AjPSeq] sequence
558 ** @param [r] seq2 [const AjPSeq] second sequence
559 ** @return [void]
560 **
561 ** @release 1.0.0
562 ** @@
563 ******************************************************************************/
564 
embWordMatchListConvToFeat(const AjPList list,AjPFeattable * tab1,AjPFeattable * tab2,const AjPSeq seq1,const AjPSeq seq2)565 void embWordMatchListConvToFeat(const AjPList list,
566 				AjPFeattable *tab1, AjPFeattable *tab2,
567 				const AjPSeq seq1, const AjPSeq seq2)
568 {
569     char strand = '+';
570     ajint frame = 0;
571     AjPStr source = NULL;
572     AjPStr type   = NULL;
573     AjPStr tag    = NULL;
574     AjPFeature feature;
575     AjIList iter  = NULL;
576     float score   = 1.0;
577 
578     if(!*tab1)
579 	*tab1 = ajFeattableNewSeq(seq1);
580 
581     if(!*tab2)
582 	*tab2 = ajFeattableNewSeq(seq2);
583 
584     ajStrAssignC(&source,"wordmatch");
585     ajStrAssignC(&type,"misc_feature");
586     score = 1.0;
587     ajStrAssignC(&tag,"note");
588 
589     iter = ajListIterNewread(list);
590 
591     while(!ajListIterDone(iter))
592     {
593 	EmbPWordMatch p = (EmbPWordMatch) ajListIterGet(iter);
594 	feature = ajFeatNew(*tab1, source, type,
595 			    p->seq1start+1,p->seq1start+p->length , score,
596 			    strand, frame) ;
597 
598 	ajFeatTagSet(feature, tag, ajSeqGetNameS(seq2));
599 
600 	feature = ajFeatNew(*tab2, source, type,
601 			    p->seq2start+1,p->seq2start+p->length , score,
602 			    strand, frame) ;
603 
604 	ajFeatTagSet(feature, tag, ajSeqGetNameS(seq1));
605     }
606 
607     ajListIterDel(&iter);
608     ajStrDel(&source);
609     ajStrDel(&type);
610     ajStrDel(&tag);
611 
612     return;
613 }
614 
615 
616 
617 
618 /* @func embWordGetTable ******************************************************
619 **
620 ** Builds a table of all words in a sequence.
621 **
622 ** The word length must be defined by a call to embWordLength.
623 **
624 ** @param [u] table [AjPTable*] table to be created or updated.
625 ** @param [r] seq [const AjPSeq] Sequence to be "worded"
626 ** @return [AjBool] ajTrue if successful
627 **
628 ** @release 1.0.0
629 ** @@
630 ******************************************************************************/
631 
embWordGetTable(AjPTable * table,const AjPSeq seq)632 AjBool embWordGetTable(AjPTable *table, const AjPSeq seq)
633 {
634     const char * startptr;
635     ajuint i;
636     ajuint j;
637     ajuint ilast;
638     ajuint *k;
639     EmbPWord rec;
640     EmbPWordSeqLocs seqlocs;
641     const AjPStr seqname;
642     char* key;
643 
644     ajuint wordsize;
645     char skipchar;
646 
647     wordsize = wordLength+1;
648 
649     skipchar = 'X';
650 
651     if(ajSeqIsNuc(seq))
652 	skipchar = 'N';
653 
654     assert(wordLength > 0);
655 
656     ajDebug("embWordGetTable seq.len %d wordlength %d skipchar '%c'\n",
657 	     ajSeqGetLen(seq), wordLength, skipchar);
658 
659     if(ajSeqGetLen(seq) < wordLength)
660     {
661 	ajDebug("sequence too short: wordsize = %d, sequence length = %d",
662 	       wordLength, ajSeqGetLen(seq));
663 
664 	return ajFalse;
665     }
666 
667     if(!*table)
668     {
669 	*table = ajTableNewFunctionLen(ajSeqGetLen(seq),
670 				       &wordCmpStr, &wordStrHash,
671                                        &ajMemFree, &wordVFreeLocs);
672 	ajDebug("make new table\n");
673     }
674 
675     /* initialise ptr to start of seq string */
676     startptr = ajSeqGetSeqC(seq);
677 
678     i = 0;
679     ilast = ajSeqGetLen(seq) - wordLength;
680 
681     j=0;
682 
683     while(j<wordLength)
684     {
685 	if((char)toupper((ajint)startptr[j]) == skipchar)
686 	{
687 	    ajDebug("Skip '%c' at start from %d",
688 		    skipchar, i+j+1);
689 
690 	    while((char)toupper((ajint)startptr[j]) == skipchar)
691 	    {
692 		i++;
693 		startptr++;
694 	    }
695 
696 	    ajDebug(" to %d\n",
697 		    i+j);
698 	    j = 0;
699 
700 	    if(i > ilast)
701             {
702 		ajDebug("sequence has no word without ambiguity code '%c'\n",
703 			skipchar);
704 
705 		return ajFalse;
706 	    }
707 	}
708 	else
709 	    j++;
710     }
711 
712     j = wordLength - 1;
713 
714     while(i <= ilast)
715     {
716 	if((char)toupper((ajint)startptr[j]) == skipchar)
717 	{
718 	    ajDebug("Skip '%c' from %d", skipchar, j);
719 
720 	    while((char)toupper((ajint)startptr[j]) == skipchar)
721 	    {
722 		i++;
723 		startptr++;
724 	    }
725 
726 	    i += j;
727 	    startptr += j;
728 	    ajDebug(" to %d\n", i);
729 	    continue;
730 	}
731 
732 	rec = (EmbPWord) ajTableFetchmodV(*table, startptr);
733 
734 	/* does it exist already */
735 	if(rec)
736 	{
737 	    /* if yes increment count */
738 	    rec->count++;
739 	}
740 	else
741 	{
742 	    /* else create a new word */
743 	    AJNEW0(rec);
744 	    rec->count = 1;
745 	    key = ajCharNewResLenC(startptr, wordsize, wordLength);
746 	    rec->fword = key;
747 	    rec->seqlocs = ajTablestrNew(1000);
748 	    ajTablePut(*table, key, rec);
749 	}
750 
751 	AJNEW0(k);
752 	*k = i;
753 	seqname = ajSeqGetNameS(seq);
754 	seqlocs = (EmbPWordSeqLocs) ajTableFetchmodS(rec->seqlocs, seqname);
755 
756 	if (seqlocs == NULL)
757 	{
758 	    AJNEW0(seqlocs);
759 	    seqlocs->seq = seq;
760 	    seqlocs->locs = ajListNew();
761 	    ajTablePut(rec->seqlocs, ajStrNewS(seqname), seqlocs);
762 	}
763 
764 	ajListPushAppend(seqlocs->locs, k);
765 
766 	startptr++;
767 	i++;
768 
769     }
770 
771     ajDebug("table done, size %Lu\n", ajTableGetLength(*table));
772 
773     return ajTrue;
774 }
775 
776 
777 
778 
779 /* @funcstatic wordOrderMatchTable ********************************************
780 **
781 ** Sort the hits by length then seq1 start then by seq2 start
782 **
783 ** @param [u] unorderedList [AjPList] Unsorted list
784 ** @return [void]
785 **
786 ** @release 2.1.0
787 ** @@
788 ******************************************************************************/
789 
wordOrderMatchTable(AjPList unorderedList)790 static void wordOrderMatchTable(AjPList unorderedList)
791 {
792     ajDebug("wordOrderMatchTable size %d\n", ajListGetLength(unorderedList));
793     ajListSort(unorderedList, &wordMatchCmp);
794 
795     return;
796 }
797 
798 
799 
800 
801 /* @funcstatic wordMatchCmp ***************************************************
802 **
803 ** Compares two sequence matches so the result can be used in sorting.
804 ** The comparison is done by size and if the size is equal, by seq1
805 ** start position.  If the seq1 start positions are equal they are
806 ** sorted by seq2 start position.
807 **
808 ** @param [r] v1 [const void*] First word
809 ** @param [r] v2 [const void*] Comparison word
810 ** @return [ajint] Comparison value. 0 if equal, -1 if first is lower,
811 **               +1 if first is higher.
812 **
813 ** @release 1.0.0
814 ** @@
815 ******************************************************************************/
816 
wordMatchCmp(const void * v1,const void * v2)817 static ajint wordMatchCmp(const void* v1, const void* v2)
818 {
819     const EmbPWordMatch m1;
820     const EmbPWordMatch m2;
821 
822     m1 = *(EmbPWordMatch const *) v1;
823     m2 = *(EmbPWordMatch const *) v2;
824 
825     /*
826        ajDebug("m1 %x %5d %5d %5d\n",
827        m1, m1->length, m1->seq2start, m1->seq1start);
828        ajDebug("m2 %x %5d %5d %5d\n",
829        m2, m2->length, m2->seq2start, m2->seq1start);
830        */
831 
832     if(m1->length != m2->length)
833     {
834 	if(m1->length < m2->length)
835 	{
836 	    /*ajDebug("return 1\n");*/
837 	    return 1;
838 	}
839 	else
840 	    return -1;
841     }
842 
843     if(m1->seq1start != m2->seq1start)
844     {
845 	if(m1->seq1start > m2->seq1start)
846 	    return 1;
847 	else
848 	    return -1;
849     }
850 
851     if(m1->seq2start != m2->seq2start)
852     {
853 	if(m1->seq2start > m2->seq2start)
854 	    return 1;
855 	else
856 	    return -1;
857     }
858 
859     return 0;
860 }
861 
862 
863 
864 
865 /* @funcstatic wordOrderPosMatchTable *****************************************
866 **
867 ** Sort the hits by seq1 start then by seq2 start
868 **
869 ** @param [u] unorderedList [AjPList] Unsorted list
870 ** @return [void]
871 **
872 ** @release 2.1.0
873 ** @@
874 ******************************************************************************/
875 
wordOrderPosMatchTable(AjPList unorderedList)876 static void wordOrderPosMatchTable(AjPList unorderedList)
877 {
878     ajListSort(unorderedList, &wordMatchCmpPos);
879 
880     return;
881 }
882 
883 
884 
885 
886 /* @funcstatic wordMatchCmpPos ************************************************
887 **
888 ** Compares two sequence matches so the result can be used in sorting.
889 ** The comparison is done by seq1
890 ** start position.  If the seq1 start positions are equal they are
891 ** sorted by seq2 start position.
892 **
893 ** @param [r] v1 [const void*] First word
894 ** @param [r] v2 [const void*] Comparison word
895 ** @return [ajint] Comparison value. 0 if equal, -1 if first is lower,
896 **               +1 if first is higher.
897 **
898 ** @release 1.0.0
899 ** @@
900 ******************************************************************************/
901 
wordMatchCmpPos(const void * v1,const void * v2)902 static ajint wordMatchCmpPos(const void* v1, const void* v2)
903 {
904     EmbPWordMatch m1;
905     EmbPWordMatch m2;
906 
907     m1 = *(EmbPWordMatch const *) v1;
908     m2 = *(EmbPWordMatch const *) v2;
909 
910     /*
911        ajDebug("m1 %x %5d %5d %5d\n",
912        m1, m1->length, m1->seq2start, m1->seq1start);
913        ajDebug("m2 %x %5d %5d %5d\n",
914        m2, m2->length, m2->seq2start, m2->seq1start);
915        */
916 
917     if(m1->seq1start != m2->seq1start)
918     {
919 	if(m1->seq1start > m2->seq1start)
920 	{
921 	    /*ajDebug("return 1\n");*/
922 	    return 1;
923 	}
924 	else return -1;
925     }
926 
927     if(m1->seq2start != m2->seq2start)
928     {
929 	if(m1->seq2start > m2->seq2start)
930 	{
931 	    /*ajDebug("return 1\n");*/
932 	    return 1;
933 	}
934 	else return -1;
935     }
936 
937     return 0;
938 }
939 
940 
941 
942 
943 /* @func embWordBuildMatchTable ***********************************************
944 **
945 ** Create a linked list of all the matches and order them by the
946 ** second sequence.
947 **
948 ** We need three lists:
949 **   (a) all hits, added in positional order
950 **   (b) ongoing hits, where we have not reached the end yet
951 **                 which is a list of items in "all hits" being updated
952 **   (c) new hits, found in the word table from the other sequence.
953 **
954 ** @param [r] seq1MatchTable [const AjPTable] Match table
955 ** @param [r] seq2 [const AjPSeq] Second sequence
956 ** @param [r] orderit [ajint] 1 to sort results at end, else 0.
957 ** @return [AjPList] List of matches.
958 ** @error NULL table was not built due to an error.
959 **
960 ** @release 1.0.0
961 ** @@
962 ******************************************************************************/
963 
embWordBuildMatchTable(const AjPTable seq1MatchTable,const AjPSeq seq2,ajint orderit)964 AjPList embWordBuildMatchTable(const AjPTable seq1MatchTable,
965 			       const AjPSeq seq2,
966 				ajint orderit)
967 {
968     ajuint i = 0;
969     ajuint ilast;
970     AjPList hitlist = NULL;
971     const AjPList newlist = NULL;
972     const AjPTable seqlocst;
973     EmbPWordSeqLocs* seqlocs=NULL;
974     const char *startptr;
975     EmbPWord wordmatch;
976     EmbPWordMatch newmatch;
977     EmbPWordMatch curmatch = NULL;
978     AjIList newiter;
979     AjIList curiter;
980     void *ptr = NULL;
981 
982     ajint *k = 0;
983     ajuint kcur = 0;
984     ajuint kcur2 = 0;
985     ajuint knew = 0;
986     AjBool matched = ajFalse;
987 
988     assert(wordLength > 0);
989 
990     hitlist = ajListNew();
991 
992     if(!wordCurList)
993 	wordCurList = ajListNew();
994 
995     if(ajSeqGetLen(seq2) < wordLength)
996     {
997 	ajWarn("ERROR: Sequence %S length %d less than word length %d",
998 	       ajSeqGetUsaS(seq2), ajSeqGetLen(seq2), wordLength);
999 
1000 	return hitlist;
1001     }
1002 
1003     startptr = ajSeqGetSeqC(seq2);
1004     ilast    = ajSeqGetLen(seq2) - wordLength;
1005 
1006     /*ajDebug("embWordBuildMatchTable ilast: %u\n", ilast);*/
1007 
1008     while(i < (ilast+1))
1009     {
1010 	if((wordmatch = ajTableFetchmodV(seq1MatchTable, startptr)))
1011 	{
1012 	    /* match found so create EmbSWordMatch structure and fill it
1013 	    ** in. Then set next pos accordingly
1014 	    ** BUT this could match several places so need to do for each
1015             ** position
1016             **
1017 	    ** there is a match between the two sequences
1018 	    ** this could extend an existing match or start a new one
1019 	    */
1020 
1021 	    seqlocst = wordmatch->seqlocs;
1022 	    ajTableToarrayValues(seqlocst, (void***)&seqlocs);
1023 	    /* TODO: assumes matching against single sequence */
1024 	    newlist = seqlocs[0]->locs;
1025 
1026 	    if(!ajListGetLength(newlist))
1027 		ajWarn("ERROR: newlist is empty\n");
1028 
1029             /*ajDebug("\nnewlist %u i:%u\n", ajListGetLength(newlist), i);*/
1030 	    newiter = ajListIterNewread(newlist);
1031 
1032 	    /* this is the list of matches for the current word and position */
1033 
1034 	    if(ajListGetLength(wordCurList))
1035 	    {
1036                 /*ajDebug("wordCurList size %d\n",
1037                   ajListGetLength(wordCurList));*/
1038 		curiter = ajListIterNew(wordCurList);
1039 
1040 		curmatch = ajListIterGet(curiter);
1041 		kcur = curmatch->seq1start + curmatch->length - wordLength + 1;
1042 		kcur2 = curmatch->seq2start + curmatch->length - wordLength + 1;
1043 	    }
1044 	    else
1045 		curiter = NULL;
1046 
1047 	    while(!ajListIterDone(newiter) )
1048 	    {
1049 		k = (ajint*) ajListIterGet(newiter);
1050 		knew = *k;
1051 
1052                 /*ajDebug("knew: %u i:%u\n", knew, i);*/
1053 		/* compare to current hits to test for extending */
1054 
1055                 ajListIterRewind(curiter);
1056                 matched = ajFalse;
1057 
1058                 while(!ajListIterDone(curiter) )
1059                 {
1060 		    curmatch = ajListIterGet(curiter);
1061 		    kcur = curmatch->seq1start + curmatch->length -
1062                         wordLength + 1;
1063                     kcur2 = curmatch->seq2start + curmatch->length -
1064                         wordLength + 1;
1065                     /*ajDebug(".test kcur/knew %u/%u kcur2/i %u/%u\n",
1066                       kcur, knew, kcur2, i);*/
1067 
1068                     /* when we test, we may have already incremented
1069                        one of the matches - so test old and new kcur2 */
1070                     if(kcur2 != i && kcur2 != i+1)
1071                     {
1072                         /*ajDebug("finished kcur: %u kcur2: %u i: %u\n",
1073                           kcur, kcur2,i);*/
1074                         ajListIterRemove(curiter);
1075                         continue;
1076                     }
1077 
1078                     if(kcur == knew && kcur2 == i)
1079                     {			/* check continued matches */
1080                         /* ajDebug("**match knew: %d kcur: %d kcur2: %d "
1081                            "start1: %d "
1082                                 "start2: %d len: %d i:%d\n",
1083                                 knew, kcur, kcur2,curmatch->seq1start,
1084                                 curmatch->seq2start,curmatch->length, i);*/
1085 			curmatch->length++;
1086                         matched = ajTrue;
1087                         continue;
1088                     }
1089                 }
1090 
1091                 if(!matched)
1092                 {			/* new current match */
1093                     /*ajDebug("save start1: %d start2: %d len: %d\n",
1094                             match2->seq1start, match2->seq2start,
1095                             match2->length);*/
1096                     /* add to hitlist */
1097                     newmatch = embWordMatchNew(seq2, knew, i, wordLength);
1098                     ajListPushAppend(hitlist, newmatch);
1099 
1100                     if(curiter)
1101                     {			/* add to wordCurList */
1102                         /*ajDebug("...ajListInsert using curiter %u\n",
1103                           ajListGetLength(wordCurList));*/
1104                         wordListInsertOld(curiter, newmatch);
1105                         /*wordCurListTrace(wordCurList);*/
1106                         /*wordCurIterTrace(curiter);*/
1107                     }
1108                     else
1109                     {
1110                         /*ajDebug("...ajListPushAppend to wordCurList %u\n",
1111                           ajListGetLength(wordCurList));*/
1112                         ajListPushAppend(wordCurList, newmatch);
1113                         /* wordCurListTrace(wordCurList); */
1114                     }
1115                 }
1116  		/* ajDebug("k: %d i: %d\n", *k, i); */
1117 	    }
1118 
1119 	    ajListIterDel(&newiter);
1120             ajListIterDel(&curiter);
1121             AJFREE(seqlocs);
1122 	}
1123 
1124 	/* no match, so all existing matches are completed */
1125 
1126 	i++;
1127 	startptr++;
1128     }
1129 
1130     /* wordCurListTrace(hitlist); */
1131     if(orderit)
1132 	wordOrderMatchTable(hitlist);
1133 
1134     /* wordCurListTrace(hitlist); */
1135 
1136     while(ajListPop(wordCurList,(void **)&ptr));
1137 
1138     return hitlist;
1139 }
1140 
1141 
1142 
1143 
1144 /* @func embWordMatchNew ******************************************************
1145 **
1146 ** Creates and initialises a word match object
1147 **
1148 ** @param [r] seq[const AjPSeq] Query sequence, match has been found
1149 ** @param [r] seq1start [ajuint] Start position in target sequence
1150 ** @param [r] seq2start [ajuint] Start position in query sequence
1151 ** @param [r] length [ajint] length of the word match
1152 ** @return [EmbPWordMatch] New word match object.
1153 **
1154 ** @release 6.3.0
1155 ** @@
1156 ******************************************************************************/
1157 
embWordMatchNew(const AjPSeq seq,ajuint seq1start,ajuint seq2start,ajint length)1158 EmbPWordMatch embWordMatchNew(const AjPSeq seq, ajuint seq1start,
1159 	                      ajuint seq2start, ajint length)
1160 {
1161     EmbPWordMatch match;
1162 
1163     AJNEW0(match);
1164 
1165     match->sequence  = seq;
1166     match->seq1start = seq1start;
1167     match->seq2start = seq2start;
1168     match->length = length;
1169 
1170     ajDebug("new word match start1: %d start2: %d len: %d\n",
1171             match->seq1start, match->seq2start,
1172             match->length);
1173 
1174     return match;
1175 }
1176 
1177 
1178 
1179 
1180 /* @funcstatic wordNewListTrace ***********************************************
1181 **
1182 ** Reports contents of a word list.
1183 **
1184 ** @param [r] i [ajint] Offset
1185 ** @param [r] newlist [const AjPList] word list.
1186 ** @return [void]
1187 **
1188 ** @release 1.0.0
1189 ** @@
1190 ******************************************************************************/
1191 
wordNewListTrace(ajint i,const AjPList newlist)1192 static void wordNewListTrace(ajint i, const AjPList newlist)
1193 {
1194     ajint *k;
1195     AjIList iter;
1196 
1197     iter = ajListIterNewread(newlist);
1198 
1199     ajDebug("\n++newlist... %d \n", i);
1200     ajDebug("++  k+len  i+len    k+1    i+1    len\n");
1201 
1202     while(!ajListIterDone(iter))
1203     {
1204 	k = (ajint*) ajListIterGet(iter);
1205 	ajDebug("++ %6d %6d %6d %6d %6d\n",
1206 		(*k)+wordLength, i+wordLength, (*k)+1, i+1, wordLength);
1207     }
1208 
1209     ajListIterDel(&iter);
1210 
1211     return;
1212 }
1213 
1214 
1215 
1216 
1217 /* @funcstatic wordCurListTrace ***********************************************
1218 **
1219 ** Reports contents of a word list.
1220 **
1221 ** @param [r] curlist [const AjPList] word list.
1222 ** @return [void]
1223 **
1224 ** @release 1.0.0
1225 ** @@
1226 ******************************************************************************/
1227 
wordCurListTrace(const AjPList curlist)1228 static void wordCurListTrace(const AjPList curlist)
1229 {
1230 /*
1231     EmbPWordMatch match;
1232     ajint i;
1233     ajint j;
1234     ajint ilen;
1235 */
1236     AjIList iter = ajListIterNewread(curlist);
1237 
1238     /*
1239        ajDebug("\ncurlist...\n");
1240        while(!ajListIterDone(iter))
1241        {
1242        match = ajListIterGet(iter);
1243        i = match->seq1start + 1;
1244        j = match->seq2start + 1;
1245        ilen = match->length;
1246        ajDebug("%6d %6d %6d %6d %6d\n",
1247        i+ilen, j+ilen, i, j, ilen);
1248        }
1249     */
1250 
1251     ajListIterDel(&iter);
1252 
1253     return;
1254 }
1255 
1256 
1257 
1258 
1259 /* @funcstatic wordCurIterTrace ***********************************************
1260 **
1261 ** Reports contents of a current word list iterator
1262 **
1263 ** @param [r] curiter [const AjIList] List iterator for the current word list
1264 ** @return [void]
1265 **
1266 ** @release 1.0.0
1267 ** @@
1268 ******************************************************************************/
1269 
wordCurIterTrace(const AjIList curiter)1270 static void wordCurIterTrace(const AjIList curiter)
1271 {
1272     AjPListNode node;
1273     EmbPWordMatch match;
1274     ajint i, j, ilen;
1275 
1276     ajDebug("curiter ...\n");
1277 
1278     if(curiter->Here)
1279     {
1280         node = curiter->Here;
1281         match = node->Item;
1282         i = match->seq1start + 1;
1283         j = match->seq2start + 1;
1284         ilen = match->length;
1285         ajDebug(" Here: %6d %6d %6d %6d %6d\n",
1286                 i+ilen, j+ilen, i, j, ilen);
1287     }
1288     else
1289         ajDebug(" Here: NULL\n");
1290 
1291     node = curiter->Head->First;
1292     match = node->Item;
1293     i = match->seq1start + 1;
1294     j = match->seq2start + 1;
1295     ilen = match->length;
1296     ajDebug(" Orig: %6d %6d %6d %6d %6d\n",
1297             i+ilen, j+ilen, i, j, ilen);
1298 
1299     return;
1300 }
1301 
1302 
1303 
1304 
1305 /* @funcstatic wordDeadZone ***************************************************
1306 **
1307 ** Determines if a match is within the region which is not overlapped by the
1308 ** match starting at position (deadx1, deady1) or ending at position
1309 ** (deadx2, deady2). If it is in this region
1310 ** (the 'live zone') then 0 is returned, if it is partially in this region
1311 ** then it is truncated to lie inside it and 2 is returned, else 1 is returned.
1312 **
1313 ** What is the 'live zone' and 'dead zone'?
1314 ** When an initial large match has been found we wish to remove any other
1315 ** (smaller) matches that overlap with it. The region in which other matches
1316 ** overlap with the first match are called here the 'dead zones'. The regions
1317 ** in which they don't overlap are called the 'live zones'. Other matches are
1318 ** OK if they are in live zones - they can co-exist with this match.
1319 **
1320 **
1321 **                   deadx2
1322 **       |              .
1323 **       |              .   live
1324 **       |              .   zone 2
1325 **       |     dead     ............deady2
1326 **       |     zone     /
1327 **  seq2 |             /
1328 **       |            /match
1329 **       |           /
1330 **   deady1..........
1331 **       |          .     dead
1332 **       |          .     zone
1333 **       |live      .
1334 **       |zone 1  deadx1
1335 **       -------------------------
1336 **                 seq1
1337 **
1338 ** @param [u] match [EmbPWordMatch] match to investigate
1339 ** @param [r] deadx1 [ajint] x position of end of live zone 1
1340 ** @param [r] deady1 [ajint] y position of end of live zone 1
1341 ** @param [r] deadx2 [ajint] x position of end of live zone 2
1342 ** @param [r] deady2 [ajint] y position of end of live zone 2
1343 ** @param [r] minlength [ajint] minimum length of match
1344 ** @return [ajint] 0=in live zone, 1=in dead zone, 2=truncated
1345 **
1346 ** @release 2.1.0
1347 ** @@
1348 ******************************************************************************/
1349 
wordDeadZone(EmbPWordMatch match,ajint deadx1,ajint deady1,ajint deadx2,ajint deady2,ajint minlength)1350 static ajint wordDeadZone(EmbPWordMatch match,
1351 			  ajint deadx1, ajint deady1,
1352 			  ajint deadx2, ajint deady2, ajint minlength)
1353 {
1354     ajint startx;
1355     ajint starty;
1356     ajint endx;
1357     ajint endy;
1358 
1359     startx = match->seq1start;
1360     starty = match->seq2start;
1361 
1362     endx = match->seq1start + match->length -1;
1363     endy = match->seq2start + match->length -1;
1364 
1365 
1366     /* is it in the top right live zone ? */
1367     if(startx > deadx2 && starty > deady2)
1368 	return 0;
1369 
1370     /* is it in the bottom right live zone ? */
1371     if(endx < deadx1 && endy < deady1)
1372 	return 0;
1373 
1374     /* is it in the top left dead zone? */
1375     if(starty >=  deady1 && endx <= deadx2)
1376 	return 1;
1377 
1378     /* is it in the bottom right dead zone? */
1379     if(endy <= deady2 && startx >= deadx1)
1380 	return 1;
1381 
1382     /* it must be partially in a dead zone - truncate it */
1383 
1384     if(endy < deady2)
1385     {
1386 	if(startx - starty < deadx1 - deady1)	        /* crosses deady1 */
1387 	    match->length = deady1 - starty;
1388 	else if(startx - starty > deadx1 - deady1)	/* crosses deadx1 */
1389 	    match->length = deadx1 - startx;
1390 	else
1391 	    ajFatal("Found a match where match is on the same diagonal \n"
1392 		    "startx=%d, starty=%d, endx=%d, endy=%d \n"
1393 		    "deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1394 		    startx, starty, endx, endy, deadx1, deady1, deadx2,
1395 		    deady2);
1396     }
1397     else if(starty > deady1)
1398     {
1399 	if(startx - starty < deadx1 - deady1)
1400 	{
1401 	    /* crosses deadx2 */
1402 	    match->length = endx - deadx2;
1403 	    match->seq1start = deadx2 +1;
1404 	    match->seq2start += deadx2 - startx +1;
1405 
1406 	}
1407 	else if(startx - starty > deadx1 - deady1)
1408 	{
1409 	    /* crosses deady2 */
1410 	    match->length = endy - deady2;
1411 	    match->seq1start += deady2 - starty +1;
1412 	    match->seq2start = deady2 +1;
1413 
1414 	}
1415 	else
1416 	    ajFatal("Found a match where match is on the same diagonal \n"
1417 		    "startx=%d, starty=%d, endx=%d, endy=%d \n"
1418 		    "deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1419 		    startx, starty, endx, endy, deadx1, deady1, deadx2,
1420 		    deady2);
1421     }
1422     else
1423 	ajFatal("Found a match that was not caught by any case \n"
1424 		"startx=%d, starty=%d, endx=%d, endy=%d \n"
1425 		"deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1426 		startx, starty, endx, endy, deadx1, deady1, deadx2, deady2);
1427 
1428     /*
1429     **  is the truncated match shorter than our allowed minimum length?
1430     **  If so it should be dead
1431     */
1432     if(match->length < minlength)
1433 	return 1;
1434 
1435     return 2;
1436 }
1437 
1438 
1439 
1440 
1441 /* @func embWordMatchMin ******************************************************
1442 **
1443 ** Given a list of matches, reduce it to the minimal set of best
1444 ** non-overlapping matches.
1445 **
1446 ** @param [u] matchlist [AjPList] list of matches to reduce to
1447 **                                non-overlapping set
1448 ** @return [void]
1449 **
1450 ** @release 2.0.0
1451 ** @@
1452 ******************************************************************************/
1453 
embWordMatchMin(AjPList matchlist)1454 void embWordMatchMin(AjPList matchlist)
1455 {
1456     AjIList iter = NULL;
1457     EmbPWordMatch match;
1458     EmbPWordMatch thismatch;
1459     AjPList minlist;			/* list of matches in min set */
1460     ajint deadx1;			/* positions of the dead zones */
1461     ajint deady1;
1462     ajint deadx2;
1463     ajint deady2;
1464     AjBool truncated;
1465     ajint result;
1466 
1467     minlist = ajListNew();
1468 
1469     /* order the matches by size - largest first */
1470     wordOrderMatchTable(matchlist);
1471 
1472     /*
1473     **  remove all other matches in the overlapping dead-zone, truncating
1474     **  those that extend into the live-zone
1475     */
1476 
1477     /* repeat until there are no more matches to process */
1478     while(ajListGetLength(matchlist))
1479     {
1480 	/* get next longest match and append to list of minimal matches */
1481 	ajListPop(matchlist, (void **)&thismatch);
1482 
1483 	ajListPushAppend(minlist, thismatch);
1484 
1485 	/* get the positions of the dead zones */
1486 	deadx1 = thismatch->seq1start;	/* first pos of match */
1487 	deady1 = thismatch->seq2start;	/* first pos of match */
1488 
1489 	/* last pos of match */
1490 	deadx2 = thismatch->seq1start + thismatch->length -1;
1491 
1492 	/* last pos of match */
1493 	deady2 = thismatch->seq2start + thismatch->length -1;
1494 
1495 	/* haven't truncated any matches yet */
1496 	truncated = ajFalse;
1497 
1498 	/* look at all remaining matches in matchlist */
1499 	iter = ajListIterNew(matchlist);
1500 
1501 	while(!ajListIterDone(iter))
1502 	{
1503 	    match = ajListIterGet(iter);
1504 
1505 	    /* want to remove this match if it is in the dead zone */
1506 	    result = wordDeadZone(match, deadx1, deady1, deadx2, deady2,
1507 				  wordLength);
1508 	    if(result == 1)
1509 	    {
1510 		/*
1511 		**  it is in the dead zone - remove it
1512 		**  Need to free up the match structure and remove the
1513 		**  current node of the list
1514 		*/
1515 		wordMatchListDelete((void **)&match, NULL);
1516 		ajListIterRemove(iter);
1517 	    }
1518 	    else if(result == 2)
1519 	    {
1520 		/* it is partially in the dead zone - now truncated */
1521 		truncated = ajTrue;
1522 	    }
1523 	}
1524 
1525 	ajListIterDel(&iter);
1526 
1527 	/*
1528         **  if some truncating done then need to sort the matchlist
1529         **  again by size
1530         */
1531 	if(truncated)
1532 	    wordOrderMatchTable(matchlist);
1533     }
1534 
1535 
1536     /* sort by x start position */
1537     wordOrderPosMatchTable(minlist);
1538 
1539 
1540     /* matchlist is now reduced to the minimal non-overlapping list */
1541     ajListPushlist(matchlist, &minlist);
1542 
1543     return;
1544 }
1545 
1546 
1547 
1548 
1549 /* @func embWordMatchFirstMax *************************************************
1550 **
1551 ** Given list of matches returns the first match with maximum similarity/score.
1552 **
1553 ** @param [r] matchlist [const AjPList] list of matches
1554 ** @return [EmbPWordMatch] maximum match
1555 **
1556 ** @release 6.3.0
1557 ** @@
1558 ******************************************************************************/
1559 
embWordMatchFirstMax(const AjPList matchlist)1560 EmbPWordMatch embWordMatchFirstMax(const AjPList matchlist)
1561 {
1562     ajint maxmatch = 0;
1563     EmbPWordMatch p;
1564     EmbPWordMatch max = NULL;
1565 
1566     AjIList iter;
1567 
1568     iter = ajListIterNewread(matchlist);
1569 
1570     while(!ajListIterDone(iter))
1571     {
1572 	p = (EmbPWordMatch) ajListIterGet(iter);
1573 
1574 	if(p->length>maxmatch)
1575 	{
1576 	    max = p;
1577 	    maxmatch = p->length;
1578 	}
1579 	else if(p->length==maxmatch)
1580 	{
1581 	    ajDebug("possible max match position start1:%d start2:%d"
1582 		    " length:%d\n",p->seq1start, p->seq2start, p->length);
1583 
1584 	    if(p->seq1start<max->seq1start)
1585 		max =p;
1586 	}
1587     }
1588 
1589     ajDebug("maximum match position start1:%d start2:%d"
1590 	    " length:%d\n",max->seq1start, max->seq2start, max->length);
1591 
1592     ajListIterDel(&iter);
1593 
1594     return max;
1595 }
1596 
1597 
1598 
1599 
1600 /* @funcstatic wordRabinKarpConstant ******************************************
1601 **
1602 ** Returns a value that helps recalculating consecutive hash values
1603 ** with less computation during Rabin-Karp search.
1604 **
1605 ** @param [r] m [ajuint] word length
1606 ** @return [ajulong] radix^(m-1) % modulus
1607 **
1608 ** @release 6.3.0
1609 ** @@
1610 ******************************************************************************/
1611 
wordRabinKarpConstant(ajuint m)1612 static ajulong wordRabinKarpConstant(ajuint m)
1613 {
1614     ajulong rm;
1615     ajuint i;
1616 
1617     rm = 1;
1618 
1619     for(i = 1; i <= m-1; i++)
1620         rm = (RK_RADIX * rm) % RK_MODULUS;
1621 
1622     return rm;
1623 }
1624 
1625 
1626 
1627 
1628 /* @funcstatic wordRabinKarpCmp ***********************************************
1629 **
1630 ** Comparison function for EmbPWordRK objects, based on their hash values
1631 **
1632 ** @param [r] trgseq [const void *] First EmbPWordRK object
1633 ** @param [r] qryseq [const void *] Second EmbPWordRK object
1634 **
1635 ** @return [ajint] difference of hash values
1636 **
1637 ** @release 6.3.0
1638 ******************************************************************************/
1639 
wordRabinKarpCmp(const void * trgseq,const void * qryseq)1640 static ajint wordRabinKarpCmp(const void *trgseq, const void *qryseq)
1641 {
1642     const EmbPWordRK ww1;
1643     const EmbPWordRK ww2;
1644 
1645     ww1 = *(const EmbPWordRK const *) trgseq;
1646     ww2 = *(const EmbPWordRK const *) qryseq;
1647 
1648     if(ww1->hash > ww2->hash)
1649         return 1;
1650 
1651     if(ww1->hash < ww2->hash)
1652         return -1;
1653 
1654     return 0;
1655 }
1656 
1657 
1658 
1659 
1660 /* @func embWordRabinKarpInit *************************************************
1661 **
1662 ** Scans word/pattern table and repackages the words in EmbPWordRK
1663 ** objects to improve access efficiency by Rabin-Karp search.
1664 ** Computes hash values for each word/pattern.
1665 **
1666 ** @param [r] table [const AjPTable] Table of patterns
1667 ** @param [u] ewords [EmbPWordRK**] Extended word objects to be used
1668 **                                  in Rabin-Karp search
1669 ** @param [r] wordlen [ajuint] Length of words/patterns, kmer size
1670 ** @param [r] seqset [const AjPSeqset] Sequence set, input patterns
1671 **                                     were derived from
1672 ** @return [ajuint] number of words
1673 **
1674 ** @release 6.3.0
1675 ** @@
1676 ******************************************************************************/
1677 
embWordRabinKarpInit(const AjPTable table,EmbPWordRK ** ewords,ajuint wordlen,const AjPSeqset seqset)1678 ajuint embWordRabinKarpInit(const AjPTable table, EmbPWordRK** ewords,
1679                             ajuint wordlen, const AjPSeqset seqset)
1680 {
1681     ajuint i;
1682     ajuint j;
1683     ajuint k;
1684     ajuint l;
1685     EmbPWord* words = NULL;
1686     const EmbPWord embword = NULL;
1687     ajulong patternHash;
1688     EmbPWordRK newword = NULL;
1689     AjIList iterp;
1690     EmbPWordSeqLocs* seqlocs = NULL;
1691     ajuint nseqlocs;
1692     const AjPSeq seq = NULL;
1693     const char* word;
1694     ajuint nseqs;
1695     ajuint nwords;
1696     ajuint pos;
1697 
1698     nseqs = ajSeqsetGetSize(seqset);
1699     nwords = (ajuint) ajTableToarrayValues(table, (void***)&words);
1700     AJCNEW(*ewords, nwords);
1701 
1702     for(i=0; i<nwords; i++)
1703     {
1704         seqlocs=NULL;
1705         embword = words[i];
1706         word = embword->fword;
1707 
1708         AJNEW0(newword);
1709 
1710         patternHash = 0;
1711 
1712         /* TODO: we can continuously calculate the hash value
1713          *       as we do in the search function */
1714         for(j=0; j<wordlen; j++)
1715             patternHash = (RK_RADIX * patternHash +
1716 			   toupper((int)word[j]))% RK_MODULUS;
1717 
1718         nseqlocs = (ajuint) ajTableToarrayValues(embword->seqlocs, (void***)&seqlocs);
1719         newword->nseqs = nseqlocs;
1720         newword->hash  = patternHash;
1721         newword->word = embword;
1722         AJCNEW(newword->seqindxs, nseqlocs);
1723         AJCNEW(newword->locs, nseqlocs);
1724         AJCNEW(newword->nnseqlocs, nseqlocs);
1725         AJCNEW(newword->nSeqMatches, nseqlocs);
1726 
1727         for(j=0; j<nseqlocs; j++)
1728         {
1729             seq= seqlocs[j]->seq;
1730 
1731             for(l=0;l<nseqs;l++)
1732                 if (seq == ajSeqsetGetseqSeq(seqset,l))
1733                 {
1734                     newword->seqindxs[j] = l;
1735                     break;
1736                 }
1737 
1738             if(l == nseqs)
1739             {
1740                 ajErr("something wrong, sequence not found in seqset");
1741                 ajExitBad();
1742             }
1743 
1744             iterp = ajListIterNewread(seqlocs[j]->locs);
1745             k = 0;
1746             newword->nnseqlocs[j] = (ajuint) ajListGetLength(seqlocs[j]->locs);
1747             AJCNEW(newword->locs[j],newword->nnseqlocs[j]);
1748 
1749             while(!ajListIterDone(iterp))
1750             {
1751                 pos = *(ajuint *) ajListIterGet(iterp);
1752                 newword->locs[j][k++] = pos;
1753             }
1754 
1755             ajListIterDel(&iterp);
1756         }
1757 
1758         AJFREE(seqlocs);
1759 
1760         (*ewords)[i] = newword;
1761 
1762     }
1763 
1764     AJFREE(words);
1765 
1766     qsort(*ewords, nwords, sizeof(EmbPWordRK), wordRabinKarpCmp);
1767 
1768     return nwords;
1769 }
1770 
1771 
1772 
1773 
1774 /* @func embWordRabinKarpSearch ***********************************************
1775 **
1776 ** Rabin Karp search for multiple patterns.
1777 **
1778 ** @param [r] sseq [const AjPStr] Sequence to be scanned for multiple patterns
1779 ** @param [r] seqset [const AjPSeqset] Sequence-set,
1780 **                                     where search patterns coming from
1781 ** @param [r] patterns [EmbPWordRK const *] Patterns to be searched
1782 ** @param [r] plen [ajuint] Length of patterns
1783 ** @param [r] npatterns [ajuint] Number of patterns
1784 ** @param [u] matchlist [AjPList*] List of matches for each sequence
1785 **                                 in the sequence set
1786 ** @param [u] lastlocation [ajuint*] Position of the search for each sequence
1787 **                                   in the sequence set
1788 ** @param [r] checkmode [AjBool] If true, not writing features or alignments
1789 **                               but running to produce match statistics only
1790 ** @return [ajuint] total number of matches
1791 **
1792 ** @release 6.3.0
1793 ** @@
1794 ******************************************************************************/
1795 
embWordRabinKarpSearch(const AjPStr sseq,const AjPSeqset seqset,EmbPWordRK const * patterns,ajuint plen,ajuint npatterns,AjPList * matchlist,ajuint * lastlocation,AjBool checkmode)1796 ajuint embWordRabinKarpSearch(const AjPStr sseq,
1797                               const AjPSeqset seqset,
1798                               EmbPWordRK const * patterns,
1799                               ajuint plen, ajuint npatterns,
1800                               AjPList* matchlist,
1801                               ajuint* lastlocation, AjBool checkmode)
1802 {
1803     const char *text;
1804     const AjPSeq seq;
1805     ajuint i;
1806     ajuint matchlen;
1807     ajuint tlen;
1808     ajuint ii;
1809     ajuint k;
1810     ajuint seqsetindx;
1811     ajuint indxloc;
1812     ajuint maxloc;
1813     ajuint nMatches = 0;
1814     EmbPWordRK* bsres; /* match found using binary search */
1815     EmbPWordRK cursor;
1816     ajulong rm;
1817     ajulong textHash = 0;
1818     ajuint seq2start;
1819     char* tmp;
1820 
1821     ajuint pos;
1822     const char *seq_;
1823 
1824     AJNEW0(cursor);
1825 
1826     rm = wordRabinKarpConstant(plen);
1827     text = ajStrGetPtr(sseq);
1828     tlen  = ajStrGetLen(sseq);
1829 
1830     for(i=0; i<plen; i++)
1831         textHash = (ajulong)(RK_RADIX * textHash   +
1832 			     toupper((int)text[i])) % RK_MODULUS;
1833 
1834     /* Scan the input sequence sseq for all patterns */
1835     for (i=plen; i<=tlen; i++)
1836     {
1837         cursor->hash = textHash;
1838         bsres = bsearch(&cursor, patterns, npatterns,
1839                 sizeof(EmbPWordRK), wordRabinKarpCmp);
1840 
1841         if(bsres!=NULL)
1842         {
1843             seq2start = i-plen;
1844 
1845             for(k=0;k<(*bsres)->nseqs;k++)
1846             {
1847                 seqsetindx = (*bsres)->seqindxs[k];
1848                 seq = ajSeqsetGetseqSeq(seqset, seqsetindx);
1849 
1850                 if(lastlocation[seqsetindx] < i)
1851                 {
1852                     maxloc = 0;
1853 
1854                     for(indxloc=0; indxloc < (*bsres)->nnseqlocs[k]; indxloc++)
1855                     {
1856                         pos = (*bsres)->locs[k][indxloc];
1857                         seq_ = ajSeqGetSeqC(seq);
1858                         matchlen=0;
1859                         ii = seq2start;
1860 
1861                         /* following loop is to make sure we never have
1862                          * false positives, after we are confident that
1863                          * we don't get false hits we can delete/disable it
1864                          */
1865                         while(matchlen<plen)
1866                         {
1867 			  if(toupper((int)seq_[pos+matchlen]) !=
1868 			     toupper((int)text[ii++]))
1869                             {
1870                                 AJCNEW0(tmp,plen+1);
1871                                 tmp[plen] = '\0';
1872                                 memcpy(tmp, text+i-plen, plen);
1873                                 ajDebug("unexpected match:   pat:%s  pat-pos:"
1874                                         "%u, txt-pos:%u text:%s hash:%u\n",
1875                                         (*bsres)->word->fword, pos,
1876                                         i+matchlen-plen, tmp, textHash);
1877                                 AJFREE(tmp);
1878                                 break;
1879                             }
1880 
1881                             matchlen++;
1882                         }
1883 
1884                         /* if the match was a false positive skip it */
1885                         if(matchlen<plen)
1886                             continue;
1887 
1888                         /* this is where we extend matches */
1889                         while(ii<tlen  && pos+matchlen<ajSeqGetLen(seq))
1890                         {
1891 			  if(toupper((int)seq_[pos+matchlen]) !=
1892 			     toupper((int)text[ii++]))
1893                                 break;
1894                             else
1895                                 ++matchlen;
1896                         }
1897 
1898                         nMatches ++;
1899 
1900                         if(!checkmode)
1901                             ajListPushAppend(matchlist[seqsetindx],
1902                                              embWordMatchNew(seq,pos,seq2start,
1903                                                              matchlen));
1904 
1905                         if(ii > maxloc)
1906                             maxloc = ii;
1907 
1908                         (*bsres)->lenMatches += matchlen;
1909                         (*bsres)->nMatches++;
1910                         (*bsres)->nSeqMatches[k]++;
1911 
1912                     }
1913 
1914                     if(maxloc>0)
1915                     {
1916                         lastlocation[seqsetindx] = maxloc;
1917                     }
1918                 }
1919             }
1920         }
1921 
1922         textHash = ((textHash + toupper((int)text[i-plen]) * (RK_MODULUS-rm))
1923 		    * RK_RADIX + toupper((int)text[i])) % RK_MODULUS;
1924     }
1925 
1926     AJFREE(cursor);
1927 
1928     return nMatches;
1929 }
1930 
1931 
1932 
1933 
1934 /* @func embWordMatchIter *****************************************************
1935 **
1936 ** Return the start positions and length for the next match.
1937 ** The caller iterates over the list, which is a standard AjPList
1938 **
1939 ** @param [u] iter [AjIList] List iterator
1940 ** @param [w] start1 [ajint*] Start in first sequence
1941 ** @param [w] start2 [ajint*] Start in second sequence
1942 ** @param [w] len [ajint*] Length of match
1943 ** @param [w] seq [const AjPSeq*] Pointer to sequence
1944 ** @return [AjBool] ajFalse if the iterator was exhausted
1945 **
1946 **
1947 ** @release 2.4.0
1948 ******************************************************************************/
1949 
embWordMatchIter(AjIList iter,ajint * start1,ajint * start2,ajint * len,const AjPSeq * seq)1950 AjBool embWordMatchIter(AjIList iter, ajint* start1, ajint* start2,
1951 			ajint* len, const AjPSeq* seq)
1952 {
1953     EmbPWordMatch p;
1954 
1955     if(ajListIterDone(iter))
1956 	return ajFalse;
1957 
1958     p = (EmbPWordMatch) ajListIterGet(iter);
1959     *start1 = p->seq1start;
1960     *start2 = p->seq2start;
1961     *len = p->length;
1962     *seq = p->sequence;
1963 
1964     return ajTrue;
1965 }
1966 
1967 
1968 
1969 
1970 /* @funcstatic wordListInsertOld **********************************************
1971 **
1972 ** Obsolete ajListInsert version emulation
1973 ** Insert an item in a list, using an iterator (if not null)
1974 ** to show which position to insert. Otherwise, simply push.
1975 **
1976 ** @param [u] iter [AjIList] List iterator.
1977 ** @param [r] x [void*] Data item to insert.
1978 ** @return [void]
1979 **
1980 ** @release 2.1.0
1981 ** @@
1982 ******************************************************************************/
1983 
wordListInsertOld(AjIList iter,void * x)1984 static void wordListInsertOld(AjIList iter, void* x)
1985 {
1986     AjPList list;
1987     AjPListNode p;
1988 
1989     list = iter->Head;
1990     p    = iter->Here;
1991 
1992     if(!p->Prev)
1993     {
1994 	ajListPush(list,(void *)x);
1995 	return;
1996     }
1997 
1998     if(p == list->First)
1999     {
2000 	if(!p->Prev->Prev)
2001 	    wordListInsertNodeOld(&list->First->Next,x);
2002 	else
2003 	    wordListInsertNodeOld(&p->Prev->Next,x);
2004     }
2005     else
2006     {
2007 	if(!p->Prev->Prev)
2008 	    wordListInsertNodeOld(&list->First,x);
2009 	else
2010 	    wordListInsertNodeOld(&p->Prev->Prev->Next,x);
2011     }
2012 
2013     list->Count++;
2014 
2015     return;
2016 }
2017 
2018 
2019 
2020 
2021 /* @funcstatic wordListInsertNodeOld ******************************************
2022 **
2023 ** Inserts a new node in a list at the current node position.
2024 **
2025 ** @param [u] pnode [AjPListNode*] Current node.
2026 ** @param [r] x [void*] Data item to insert.
2027 ** @return [void]
2028 **
2029 ** @release 2.1.0
2030 ** @@
2031 ******************************************************************************/
2032 
wordListInsertNodeOld(AjPListNode * pnode,void * x)2033 static void wordListInsertNodeOld(AjPListNode* pnode, void* x)
2034 {
2035     AjPListNode p;
2036 
2037     AJNEW0(p);
2038     p->Item = x;
2039     p->Next = (*pnode);
2040     p->Prev = (*pnode)->Prev;
2041     p->Next->Prev = p;
2042     *pnode = p;
2043 
2044     return;
2045 }
2046 
2047 
2048 
2049 
2050 /* @func embWordUnused ********************************************************
2051 **
2052 ** Unused functions. Here to keep compiler warnings away
2053 **
2054 ** @return [void]
2055 **
2056 ** @release 2.0.0
2057 ******************************************************************************/
2058 
embWordUnused(void)2059 void embWordUnused(void)
2060 {
2061 
2062     wordCurListTrace(NULL);	/* comment out in embWordBuildMatchTable */
2063     wordCurIterTrace(NULL);	/* comment out in embWordBuildMatchTable */
2064     wordNewListTrace(0, NULL);	/* comment out in embWordBuildMatchTable */
2065 
2066     return;
2067 }
2068 
2069 
2070 
2071 
2072 /* @func embWordExit **********************************************************
2073 **
2074 ** Cleanup word matching indexing internals on exit
2075 **
2076 ** @return [void]
2077 **
2078 ** @release 4.0.0
2079 ******************************************************************************/
2080 
embWordExit(void)2081 void embWordExit(void)
2082 {
2083     embWordClear();
2084     ajListFree(&wordCurList);
2085 
2086     return;
2087 }
2088 
2089 
2090 
2091 
2092 #ifdef AJ_COMPILE_DEPRECATED_BOOK
2093 #endif
2094 
2095 
2096 
2097 
2098 #ifdef AJ_COMPILE_DEPRECATED
2099 /* @obsolete embWordMatchListAppend
2100 ** @remove use embWordMatchNew followed by a list append call
2101 */
embWordMatchListAppend(AjPList hitlist,const AjPSeq seq,const ajuint seq1start,ajuint seq2start,ajint length)2102 __deprecated EmbPWordMatch embWordMatchListAppend(AjPList hitlist,
2103                                                   const AjPSeq seq,
2104                                                   const ajuint seq1start,
2105                                                   ajuint seq2start,
2106                                                   ajint length)
2107 {
2108     EmbPWordMatch match;
2109     AJNEW0(match);
2110     match->sequence  = seq;
2111     match->seq1start = seq1start;
2112     match->seq2start = seq2start;
2113     match->length = length;
2114     ajDebug("new word match start1: %d start2: %d len: %d\n",
2115             match->seq1start, match->seq2start,
2116             match->length);
2117     ajListPushAppend(hitlist, match);
2118     return match;
2119 }
2120 #endif
2121