1 /* @source embword ************************************************************
2 **
3 ** Wordmatch routines
4 **
5 ** @author Copyright (c) 1999 Gary Williams
6 ** @version $Revision: 1.65 $
7 ** @modified $Date: 2012/07/14 14:52:41 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA 02110-1301, USA.
24 **
25 ******************************************************************************/
26
27 #include "ajlib.h"
28
29 #include "embword.h"
30 #include "ajassert.h"
31 #include "ajseq.h"
32 #include "ajfeat.h"
33 #include "ajfile.h"
34 #include "ajlist.h"
35 #include "ajtable.h"
36 #include "ajutil.h"
37
38 #include <math.h>
39
40
41
42
43 /*
44 ** current wordlength - this is an easily accessible copy of the value
45 ** in the first node of wordLengthList
46 */
47 static ajuint wordLength = 0;
48
49 /* list of wordlengths with current one at top of list */
50 static AjPList wordLengthList = NULL;
51
52 static AjPList wordCurList = NULL;
53
54 /*
55 ** Rabin-Karp multi-pattern search parameters.
56 ** Modulus (a large prime) and radix are used in calculating hash values
57 ** efficiently.
58 **
59 ** Note: we should be able to replace binary search in Rabin-Karp search
60 ** algorithm with direct search by selecting a small q, however in this case
61 ** we should always check that a hit is a correct hit (as we currently do).
62 ** Radix has a relation with alphabet size, selecting its value depending
63 ** on input might be awarding in this context.
64 **
65 */
66
67 #define RK_MODULUS 1073741789UL
68 #define RK_RADIX 256UL
69
70
71
72 static ajint wordCmpStr(const void *x, const void *y);
73 static ajint wordCompare(const void *x, const void *y);
74 static void wordCurIterTrace(const AjIList curiter);
75 static void wordCurListTrace(const AjPList curlist);
76 static ajint wordDeadZone(EmbPWordMatch match,
77 ajint deadx1, ajint deady1,
78 int deadx2, ajint deady2, ajint minlength);
79 static void wordListInsertNodeOld(AjPListNode* pnode, void* x);
80 static void wordListInsertOld(AjIList iter, void* x);
81 static ajint wordMatchCmp(const void* v1, const void* v2);
82 static ajint wordMatchCmpPos(const void* v1, const void* v2);
83 static void wordNewListTrace(ajint i, const AjPList newlist);
84 static void wordOrderPosMatchTable(AjPList unorderedList);
85
86 static ajulong wordStrHash(const void *key, ajulong hashsize);
87
88 static void wordVFreeLocs(void **value);
89 static void wordVFreeSeqlocs(void **value);
90
91 static ajint wordRabinKarpCmp(const void *trgseq, const void *qryseq);
92 static ajulong wordRabinKarpConstant(ajuint m);
93
94
95
96
97 /* @funcstatic wordCmpStr *****************************************************
98 **
99 ** Compare two words for first n chars. n set by embWordLength.
100 **
101 ** @param [r] x [const void *] First word
102 ** @param [r] y [const void *] Second word
103 ** @return [ajint] difference
104 **
105 ** @release 1.0.0
106 ** @@
107 ******************************************************************************/
108
wordCmpStr(const void * x,const void * y)109 static ajint wordCmpStr(const void *x, const void *y)
110 {
111 return ajCharCmpCaseLen((const char *)x, (const char *)y, wordLength);
112 }
113
114
115
116
117 /* @funcstatic wordStrHash ****************************************************
118 **
119 ** Create hash value from key.
120 **
121 ** @param [r] key [const void *] key.
122 ** @param [r] hashsize [ajulong] Hash size
123 ** @return [ajulong] hash value
124 **
125 ** @release 1.0.0
126 ** @@
127 ******************************************************************************/
128
wordStrHash(const void * key,ajulong hashsize)129 static ajulong wordStrHash(const void *key, ajulong hashsize)
130 {
131 ajulong hashval;
132 const char *s;
133
134 ajuint i;
135
136 s = (const char *) key;
137
138 for(i=0, hashval = 0; i < wordLength; i++, s++)
139 hashval = toupper((ajint)*s) + 31 *hashval;
140
141 return hashval % hashsize;
142 }
143
144
145
146
147 /* @funcstatic wordCompare ****************************************************
148 **
149 ** Compare two words in descending order.
150 **
151 ** @param [r] x [const void *] First word
152 ** @param [r] y [const void *] Second word
153 ** @return [ajint] count difference for words.
154 **
155 ** @release 2.1.0
156 ** @@
157 ******************************************************************************/
158
wordCompare(const void * x,const void * y)159 static ajint wordCompare(const void *x, const void *y)
160 {
161 /*
162 const EmbPWord xw;
163 const EmbPWord yw;
164
165 xw = ((const EmbPWord2)x)->fword;
166 yw = ((const EmbPWord2)y)->fword;
167
168 return (yw->count - xw->count);
169 */
170
171 return ((*(EmbPWord const *)y)->count -
172 (*(EmbPWord const *)x)->count);
173 }
174
175
176
177
178 /* @func embWordLength ********************************************************
179 **
180 ** Sets the word length for all functions. Must be called first.
181 ** Creates the word length list if not yet done.
182 ** Pushes the latest word length value on the list.
183 **
184 ** @param [r] wordlen [ajint] Word length
185 ** @return [void]
186 **
187 ** @release 1.0.0
188 ** @@
189 ******************************************************************************/
190
embWordLength(ajint wordlen)191 void embWordLength(ajint wordlen)
192 {
193 ajint *pint;
194
195 if(!wordLengthList)
196 wordLengthList = ajListNew();
197
198 /* store the wordlength in case we do recursive word stuff */
199 AJNEW0(pint);
200 *pint = wordlen;
201 ajListPush(wordLengthList, pint);
202
203 /* set the current wordlength as an easily accessible static ajint */
204 wordLength = wordlen;
205
206 return;
207 }
208
209
210
211
212 /* @func embWordClear *********************************************************
213 **
214 ** Clears the word length for all functions. To be called when all is done.
215 ** Pops the last word length from the list and frees it.
216 ** If there is nothing else on the list, it frees the list.
217 **
218 ** @return [void]
219 **
220 ** @release 1.0.0
221 ** @@
222 ******************************************************************************/
223
embWordClear(void)224 void embWordClear(void)
225 {
226 ajint *pint;
227
228 /*
229 ** pop the previous word length from the list in case there's
230 ** recursive word stuff
231 */
232 if(ajListGetLength(wordLengthList))
233 {
234 ajListPop(wordLengthList, (void **)&pint);
235 AJFREE(pint);
236 }
237
238 if(!ajListGetLength(wordLengthList))
239 {
240 ajListFree(&wordLengthList);
241 wordLengthList = NULL; /* no valid word length set */
242 wordLength = 0; /* no valid word length set */
243 }
244 else
245 {
246 /* set the current wordlength as an easily accessible static ajint */
247 ajListPeekFirst(wordLengthList, (void **)&pint);
248 wordLength = *pint;
249 }
250
251 return;
252 }
253
254
255
256
257 /* @func embWordPrintTable ****************************************************
258 **
259 ** Print the words found with their frequencies.
260 **
261 ** @param [r] table [const AjPTable] table to be printed
262 ** @return [void]
263 **
264 ** @release 1.0.0
265 ** @@
266 ******************************************************************************/
267
embWordPrintTable(const AjPTable table)268 void embWordPrintTable(const AjPTable table)
269 {
270 void **valarray = NULL;
271 EmbPWord ajnew;
272 ajint i;
273
274 ajTableToarrayValues(table, &valarray);
275
276 qsort(valarray, (size_t) ajTableGetLength(table), sizeof (*valarray), wordCompare);
277
278 for(i = 0; valarray[i]; i++)
279 {
280 ajnew = (EmbPWord) valarray[i];
281 ajUser("%.*s\t%d", wordLength, ajnew->fword,ajnew->count);
282 }
283
284 AJFREE(valarray);
285
286 return;
287 }
288
289
290
291
292 /* @func embWordPrintTableFI **************************************************
293 **
294 ** Print the words found with their frequencies.
295 **
296 ** @param [r] table [const AjPTable] table to be printed
297 ** @param [r] mincount [ajint] Minimum frequency to report
298 ** @param [u] outf [AjPFile] Output file.
299 ** @return [void]
300 **
301 ** @release 4.0.0
302 ** @@
303 ******************************************************************************/
304
embWordPrintTableFI(const AjPTable table,ajint mincount,AjPFile outf)305 void embWordPrintTableFI(const AjPTable table, ajint mincount, AjPFile outf)
306 {
307 void **valarray = NULL;
308 EmbPWord ajnew;
309 ajint i;
310
311 if(!ajTableGetLength(table)) return;
312
313 i = (ajuint) ajTableToarrayValues(table, &valarray);
314
315 ajDebug("embWordPrintTableFI size %d mincount:%d\n", i, mincount);
316
317 for(i = 0; valarray[i]; i++)
318 {
319 ajnew = (EmbPWord) valarray[i];
320 ajDebug("embWordPrintTableFI unsorted [%d] %.*s %d\n",
321 i, wordLength, ajnew->fword,ajnew->count);
322 }
323
324 qsort(valarray, (size_t) ajTableGetLength(table), sizeof (*valarray), wordCompare);
325
326 for(i = 0; valarray[i]; i++)
327 {
328 ajnew = (EmbPWord) valarray[i];
329 ajDebug("embWordPrintTableFI sorted [%d] %.*s %d\n",
330 i, wordLength, ajnew->fword,ajnew->count);
331
332 if(ajnew->count < mincount)
333 break;
334
335 ajFmtPrintF(outf, "%.*s\t%d\n",
336 wordLength, ajnew->fword,ajnew->count);
337 }
338
339 AJFREE(valarray);
340
341 return;
342 }
343
344
345
346
347 /* @func embWordPrintTableF ***************************************************
348 **
349 ** Print the words found with their frequencies.
350 **
351 ** @param [r] table [const AjPTable] table to be printed
352 ** @param [u] outf [AjPFile] Output file.
353 ** @return [void]
354 **
355 ** @release 1.0.0
356 ** @@
357 ******************************************************************************/
358
embWordPrintTableF(const AjPTable table,AjPFile outf)359 void embWordPrintTableF(const AjPTable table, AjPFile outf)
360 {
361 embWordPrintTableFI(table, 1, outf);
362
363 return;
364 }
365
366
367
368
369 /* @funcstatic wordVFreeLocs **************************************************
370 **
371 ** Free the elements in a EmbPWord locations table.
372 **
373 ** @param [d] value [void**] Data value for a table item
374 ** @return [void]
375 **
376 ** @release 6.4.0
377 ** @@
378 ******************************************************************************/
379
wordVFreeLocs(void ** value)380 static void wordVFreeLocs(void **value)
381 {
382 AjPTable table = ((EmbPWord)*value)->seqlocs;
383
384 ajTableDelValdel(&table, &wordVFreeSeqlocs);
385
386 /* free the locations structure */
387 AJFREE(*value);
388
389 return;
390 }
391
392
393
394
395 /* @funcstatic wordVFreeSeqlocs ***********************************************
396 **
397 ** Free the elements in a EmbPWord sequence locations list.
398 **
399 ** @param [d] value [void**] Data values as void**
400 ** @return [void]
401 **
402 ** @release 6.4.0
403 ** @@
404 ******************************************************************************/
405
wordVFreeSeqlocs(void ** value)406 static void wordVFreeSeqlocs(void **value)
407 {
408 AjPList list = ((EmbPWordSeqLocs)*value)->locs;
409
410 /* free the elements in the list of the positions */
411 ajListFreeData(&list);
412
413 /* free the locations structure */
414 AJFREE(*value);
415
416 return;
417 }
418
419
420
421
422 /* @func embWordFreeTable *****************************************************
423 **
424 ** delete the word table and free the memory.
425 **
426 ** @param [d] table [AjPTable*] table to be deleted
427 ** @return [void]
428 **
429 ** @release 1.0.0
430 ** @@
431 ******************************************************************************/
432
embWordFreeTable(AjPTable * table)433 void embWordFreeTable(AjPTable *table)
434 {
435 ajTableDel(table);
436
437 return;
438 }
439
440
441
442
443 /* @funcstatic wordMatchListDelete ********************************************
444 **
445 ** deletes entries in a list of matches.
446 **
447 ** @param [d] x [void**] Data values as void**
448 ** @param [r] cl [void*] Ignored user data, usually NULL.
449 ** @return [void]
450 **
451 ** @release 2.1.0
452 ** @@
453 ******************************************************************************/
454
wordMatchListDelete(void ** x,void * cl)455 static void wordMatchListDelete(void **x,void *cl)
456 {
457 EmbPWordMatch p;
458
459 p = (EmbPWordMatch)*x;
460
461 AJFREE(p);
462
463 if(!cl)
464 return;
465
466 return;
467 }
468
469
470
471
472 /* @func embWordMatchListDelete ***********************************************
473 **
474 ** delete the word table.
475 **
476 ** @param [u] plist [AjPList*] list to be deleted.
477 ** @return [void]
478 **
479 ** @release 1.0.0
480 ** @@
481 ******************************************************************************/
482
embWordMatchListDelete(AjPList * plist)483 void embWordMatchListDelete(AjPList* plist)
484 {
485 if(!*plist)
486 return;
487
488 ajListMap(*plist, &wordMatchListDelete, NULL);
489 ajListFree(plist);
490
491 return;
492 }
493
494
495
496
497 /* @funcstatic wordMatchListPrint *********************************************
498 **
499 ** print the word table.
500 **
501 ** @param [r] x [void*] List item (EmbPWordMatch*)
502 ** @param [r] cl [void*] Output file AjPFile
503 ** @return [void]
504 **
505 ** @release 2.1.0
506 ** @@
507 ******************************************************************************/
508
wordMatchListPrint(void * x,void * cl)509 static void wordMatchListPrint(void *x,void *cl)
510 {
511 EmbPWordMatch p;
512 AjPFile file;
513
514 p = (EmbPWordMatch)x;
515 file = (AjPFile) cl;
516
517 ajFmtPrintF(file, "%10d %10d %10d\n",
518 p->seq1start+1,
519 p->seq2start+1,
520 p->length);
521
522 return;
523 }
524
525
526
527
528 /* @func embWordMatchListPrint ************************************************
529 **
530 ** print the word table.
531 **
532 ** @param [u] file [AjPFile] Output file
533 ** @param [r] list [const AjPList] list to be printed.
534 ** @return [void]
535 **
536 ** @release 1.0.0
537 ** @@
538 ******************************************************************************/
539
embWordMatchListPrint(AjPFile file,const AjPList list)540 void embWordMatchListPrint(AjPFile file, const AjPList list)
541 {
542 ajListMapread(list, &wordMatchListPrint, file);
543
544 return;
545 }
546
547
548
549
550 /* @func embWordMatchListConvToFeat *******************************************
551 **
552 ** convert the word table to feature tables.
553 **
554 ** @param [r] list [const AjPList] list to be printed.
555 ** @param [u] tab1 [AjPFeattable*] feature table for sequence 1
556 ** @param [u] tab2 [AjPFeattable*] feature table for sequence 2
557 ** @param [r] seq1 [const AjPSeq] sequence
558 ** @param [r] seq2 [const AjPSeq] second sequence
559 ** @return [void]
560 **
561 ** @release 1.0.0
562 ** @@
563 ******************************************************************************/
564
embWordMatchListConvToFeat(const AjPList list,AjPFeattable * tab1,AjPFeattable * tab2,const AjPSeq seq1,const AjPSeq seq2)565 void embWordMatchListConvToFeat(const AjPList list,
566 AjPFeattable *tab1, AjPFeattable *tab2,
567 const AjPSeq seq1, const AjPSeq seq2)
568 {
569 char strand = '+';
570 ajint frame = 0;
571 AjPStr source = NULL;
572 AjPStr type = NULL;
573 AjPStr tag = NULL;
574 AjPFeature feature;
575 AjIList iter = NULL;
576 float score = 1.0;
577
578 if(!*tab1)
579 *tab1 = ajFeattableNewSeq(seq1);
580
581 if(!*tab2)
582 *tab2 = ajFeattableNewSeq(seq2);
583
584 ajStrAssignC(&source,"wordmatch");
585 ajStrAssignC(&type,"misc_feature");
586 score = 1.0;
587 ajStrAssignC(&tag,"note");
588
589 iter = ajListIterNewread(list);
590
591 while(!ajListIterDone(iter))
592 {
593 EmbPWordMatch p = (EmbPWordMatch) ajListIterGet(iter);
594 feature = ajFeatNew(*tab1, source, type,
595 p->seq1start+1,p->seq1start+p->length , score,
596 strand, frame) ;
597
598 ajFeatTagSet(feature, tag, ajSeqGetNameS(seq2));
599
600 feature = ajFeatNew(*tab2, source, type,
601 p->seq2start+1,p->seq2start+p->length , score,
602 strand, frame) ;
603
604 ajFeatTagSet(feature, tag, ajSeqGetNameS(seq1));
605 }
606
607 ajListIterDel(&iter);
608 ajStrDel(&source);
609 ajStrDel(&type);
610 ajStrDel(&tag);
611
612 return;
613 }
614
615
616
617
618 /* @func embWordGetTable ******************************************************
619 **
620 ** Builds a table of all words in a sequence.
621 **
622 ** The word length must be defined by a call to embWordLength.
623 **
624 ** @param [u] table [AjPTable*] table to be created or updated.
625 ** @param [r] seq [const AjPSeq] Sequence to be "worded"
626 ** @return [AjBool] ajTrue if successful
627 **
628 ** @release 1.0.0
629 ** @@
630 ******************************************************************************/
631
embWordGetTable(AjPTable * table,const AjPSeq seq)632 AjBool embWordGetTable(AjPTable *table, const AjPSeq seq)
633 {
634 const char * startptr;
635 ajuint i;
636 ajuint j;
637 ajuint ilast;
638 ajuint *k;
639 EmbPWord rec;
640 EmbPWordSeqLocs seqlocs;
641 const AjPStr seqname;
642 char* key;
643
644 ajuint wordsize;
645 char skipchar;
646
647 wordsize = wordLength+1;
648
649 skipchar = 'X';
650
651 if(ajSeqIsNuc(seq))
652 skipchar = 'N';
653
654 assert(wordLength > 0);
655
656 ajDebug("embWordGetTable seq.len %d wordlength %d skipchar '%c'\n",
657 ajSeqGetLen(seq), wordLength, skipchar);
658
659 if(ajSeqGetLen(seq) < wordLength)
660 {
661 ajDebug("sequence too short: wordsize = %d, sequence length = %d",
662 wordLength, ajSeqGetLen(seq));
663
664 return ajFalse;
665 }
666
667 if(!*table)
668 {
669 *table = ajTableNewFunctionLen(ajSeqGetLen(seq),
670 &wordCmpStr, &wordStrHash,
671 &ajMemFree, &wordVFreeLocs);
672 ajDebug("make new table\n");
673 }
674
675 /* initialise ptr to start of seq string */
676 startptr = ajSeqGetSeqC(seq);
677
678 i = 0;
679 ilast = ajSeqGetLen(seq) - wordLength;
680
681 j=0;
682
683 while(j<wordLength)
684 {
685 if((char)toupper((ajint)startptr[j]) == skipchar)
686 {
687 ajDebug("Skip '%c' at start from %d",
688 skipchar, i+j+1);
689
690 while((char)toupper((ajint)startptr[j]) == skipchar)
691 {
692 i++;
693 startptr++;
694 }
695
696 ajDebug(" to %d\n",
697 i+j);
698 j = 0;
699
700 if(i > ilast)
701 {
702 ajDebug("sequence has no word without ambiguity code '%c'\n",
703 skipchar);
704
705 return ajFalse;
706 }
707 }
708 else
709 j++;
710 }
711
712 j = wordLength - 1;
713
714 while(i <= ilast)
715 {
716 if((char)toupper((ajint)startptr[j]) == skipchar)
717 {
718 ajDebug("Skip '%c' from %d", skipchar, j);
719
720 while((char)toupper((ajint)startptr[j]) == skipchar)
721 {
722 i++;
723 startptr++;
724 }
725
726 i += j;
727 startptr += j;
728 ajDebug(" to %d\n", i);
729 continue;
730 }
731
732 rec = (EmbPWord) ajTableFetchmodV(*table, startptr);
733
734 /* does it exist already */
735 if(rec)
736 {
737 /* if yes increment count */
738 rec->count++;
739 }
740 else
741 {
742 /* else create a new word */
743 AJNEW0(rec);
744 rec->count = 1;
745 key = ajCharNewResLenC(startptr, wordsize, wordLength);
746 rec->fword = key;
747 rec->seqlocs = ajTablestrNew(1000);
748 ajTablePut(*table, key, rec);
749 }
750
751 AJNEW0(k);
752 *k = i;
753 seqname = ajSeqGetNameS(seq);
754 seqlocs = (EmbPWordSeqLocs) ajTableFetchmodS(rec->seqlocs, seqname);
755
756 if (seqlocs == NULL)
757 {
758 AJNEW0(seqlocs);
759 seqlocs->seq = seq;
760 seqlocs->locs = ajListNew();
761 ajTablePut(rec->seqlocs, ajStrNewS(seqname), seqlocs);
762 }
763
764 ajListPushAppend(seqlocs->locs, k);
765
766 startptr++;
767 i++;
768
769 }
770
771 ajDebug("table done, size %Lu\n", ajTableGetLength(*table));
772
773 return ajTrue;
774 }
775
776
777
778
779 /* @funcstatic wordOrderMatchTable ********************************************
780 **
781 ** Sort the hits by length then seq1 start then by seq2 start
782 **
783 ** @param [u] unorderedList [AjPList] Unsorted list
784 ** @return [void]
785 **
786 ** @release 2.1.0
787 ** @@
788 ******************************************************************************/
789
wordOrderMatchTable(AjPList unorderedList)790 static void wordOrderMatchTable(AjPList unorderedList)
791 {
792 ajDebug("wordOrderMatchTable size %d\n", ajListGetLength(unorderedList));
793 ajListSort(unorderedList, &wordMatchCmp);
794
795 return;
796 }
797
798
799
800
801 /* @funcstatic wordMatchCmp ***************************************************
802 **
803 ** Compares two sequence matches so the result can be used in sorting.
804 ** The comparison is done by size and if the size is equal, by seq1
805 ** start position. If the seq1 start positions are equal they are
806 ** sorted by seq2 start position.
807 **
808 ** @param [r] v1 [const void*] First word
809 ** @param [r] v2 [const void*] Comparison word
810 ** @return [ajint] Comparison value. 0 if equal, -1 if first is lower,
811 ** +1 if first is higher.
812 **
813 ** @release 1.0.0
814 ** @@
815 ******************************************************************************/
816
wordMatchCmp(const void * v1,const void * v2)817 static ajint wordMatchCmp(const void* v1, const void* v2)
818 {
819 const EmbPWordMatch m1;
820 const EmbPWordMatch m2;
821
822 m1 = *(EmbPWordMatch const *) v1;
823 m2 = *(EmbPWordMatch const *) v2;
824
825 /*
826 ajDebug("m1 %x %5d %5d %5d\n",
827 m1, m1->length, m1->seq2start, m1->seq1start);
828 ajDebug("m2 %x %5d %5d %5d\n",
829 m2, m2->length, m2->seq2start, m2->seq1start);
830 */
831
832 if(m1->length != m2->length)
833 {
834 if(m1->length < m2->length)
835 {
836 /*ajDebug("return 1\n");*/
837 return 1;
838 }
839 else
840 return -1;
841 }
842
843 if(m1->seq1start != m2->seq1start)
844 {
845 if(m1->seq1start > m2->seq1start)
846 return 1;
847 else
848 return -1;
849 }
850
851 if(m1->seq2start != m2->seq2start)
852 {
853 if(m1->seq2start > m2->seq2start)
854 return 1;
855 else
856 return -1;
857 }
858
859 return 0;
860 }
861
862
863
864
865 /* @funcstatic wordOrderPosMatchTable *****************************************
866 **
867 ** Sort the hits by seq1 start then by seq2 start
868 **
869 ** @param [u] unorderedList [AjPList] Unsorted list
870 ** @return [void]
871 **
872 ** @release 2.1.0
873 ** @@
874 ******************************************************************************/
875
wordOrderPosMatchTable(AjPList unorderedList)876 static void wordOrderPosMatchTable(AjPList unorderedList)
877 {
878 ajListSort(unorderedList, &wordMatchCmpPos);
879
880 return;
881 }
882
883
884
885
886 /* @funcstatic wordMatchCmpPos ************************************************
887 **
888 ** Compares two sequence matches so the result can be used in sorting.
889 ** The comparison is done by seq1
890 ** start position. If the seq1 start positions are equal they are
891 ** sorted by seq2 start position.
892 **
893 ** @param [r] v1 [const void*] First word
894 ** @param [r] v2 [const void*] Comparison word
895 ** @return [ajint] Comparison value. 0 if equal, -1 if first is lower,
896 ** +1 if first is higher.
897 **
898 ** @release 1.0.0
899 ** @@
900 ******************************************************************************/
901
wordMatchCmpPos(const void * v1,const void * v2)902 static ajint wordMatchCmpPos(const void* v1, const void* v2)
903 {
904 EmbPWordMatch m1;
905 EmbPWordMatch m2;
906
907 m1 = *(EmbPWordMatch const *) v1;
908 m2 = *(EmbPWordMatch const *) v2;
909
910 /*
911 ajDebug("m1 %x %5d %5d %5d\n",
912 m1, m1->length, m1->seq2start, m1->seq1start);
913 ajDebug("m2 %x %5d %5d %5d\n",
914 m2, m2->length, m2->seq2start, m2->seq1start);
915 */
916
917 if(m1->seq1start != m2->seq1start)
918 {
919 if(m1->seq1start > m2->seq1start)
920 {
921 /*ajDebug("return 1\n");*/
922 return 1;
923 }
924 else return -1;
925 }
926
927 if(m1->seq2start != m2->seq2start)
928 {
929 if(m1->seq2start > m2->seq2start)
930 {
931 /*ajDebug("return 1\n");*/
932 return 1;
933 }
934 else return -1;
935 }
936
937 return 0;
938 }
939
940
941
942
943 /* @func embWordBuildMatchTable ***********************************************
944 **
945 ** Create a linked list of all the matches and order them by the
946 ** second sequence.
947 **
948 ** We need three lists:
949 ** (a) all hits, added in positional order
950 ** (b) ongoing hits, where we have not reached the end yet
951 ** which is a list of items in "all hits" being updated
952 ** (c) new hits, found in the word table from the other sequence.
953 **
954 ** @param [r] seq1MatchTable [const AjPTable] Match table
955 ** @param [r] seq2 [const AjPSeq] Second sequence
956 ** @param [r] orderit [ajint] 1 to sort results at end, else 0.
957 ** @return [AjPList] List of matches.
958 ** @error NULL table was not built due to an error.
959 **
960 ** @release 1.0.0
961 ** @@
962 ******************************************************************************/
963
embWordBuildMatchTable(const AjPTable seq1MatchTable,const AjPSeq seq2,ajint orderit)964 AjPList embWordBuildMatchTable(const AjPTable seq1MatchTable,
965 const AjPSeq seq2,
966 ajint orderit)
967 {
968 ajuint i = 0;
969 ajuint ilast;
970 AjPList hitlist = NULL;
971 const AjPList newlist = NULL;
972 const AjPTable seqlocst;
973 EmbPWordSeqLocs* seqlocs=NULL;
974 const char *startptr;
975 EmbPWord wordmatch;
976 EmbPWordMatch newmatch;
977 EmbPWordMatch curmatch = NULL;
978 AjIList newiter;
979 AjIList curiter;
980 void *ptr = NULL;
981
982 ajint *k = 0;
983 ajuint kcur = 0;
984 ajuint kcur2 = 0;
985 ajuint knew = 0;
986 AjBool matched = ajFalse;
987
988 assert(wordLength > 0);
989
990 hitlist = ajListNew();
991
992 if(!wordCurList)
993 wordCurList = ajListNew();
994
995 if(ajSeqGetLen(seq2) < wordLength)
996 {
997 ajWarn("ERROR: Sequence %S length %d less than word length %d",
998 ajSeqGetUsaS(seq2), ajSeqGetLen(seq2), wordLength);
999
1000 return hitlist;
1001 }
1002
1003 startptr = ajSeqGetSeqC(seq2);
1004 ilast = ajSeqGetLen(seq2) - wordLength;
1005
1006 /*ajDebug("embWordBuildMatchTable ilast: %u\n", ilast);*/
1007
1008 while(i < (ilast+1))
1009 {
1010 if((wordmatch = ajTableFetchmodV(seq1MatchTable, startptr)))
1011 {
1012 /* match found so create EmbSWordMatch structure and fill it
1013 ** in. Then set next pos accordingly
1014 ** BUT this could match several places so need to do for each
1015 ** position
1016 **
1017 ** there is a match between the two sequences
1018 ** this could extend an existing match or start a new one
1019 */
1020
1021 seqlocst = wordmatch->seqlocs;
1022 ajTableToarrayValues(seqlocst, (void***)&seqlocs);
1023 /* TODO: assumes matching against single sequence */
1024 newlist = seqlocs[0]->locs;
1025
1026 if(!ajListGetLength(newlist))
1027 ajWarn("ERROR: newlist is empty\n");
1028
1029 /*ajDebug("\nnewlist %u i:%u\n", ajListGetLength(newlist), i);*/
1030 newiter = ajListIterNewread(newlist);
1031
1032 /* this is the list of matches for the current word and position */
1033
1034 if(ajListGetLength(wordCurList))
1035 {
1036 /*ajDebug("wordCurList size %d\n",
1037 ajListGetLength(wordCurList));*/
1038 curiter = ajListIterNew(wordCurList);
1039
1040 curmatch = ajListIterGet(curiter);
1041 kcur = curmatch->seq1start + curmatch->length - wordLength + 1;
1042 kcur2 = curmatch->seq2start + curmatch->length - wordLength + 1;
1043 }
1044 else
1045 curiter = NULL;
1046
1047 while(!ajListIterDone(newiter) )
1048 {
1049 k = (ajint*) ajListIterGet(newiter);
1050 knew = *k;
1051
1052 /*ajDebug("knew: %u i:%u\n", knew, i);*/
1053 /* compare to current hits to test for extending */
1054
1055 ajListIterRewind(curiter);
1056 matched = ajFalse;
1057
1058 while(!ajListIterDone(curiter) )
1059 {
1060 curmatch = ajListIterGet(curiter);
1061 kcur = curmatch->seq1start + curmatch->length -
1062 wordLength + 1;
1063 kcur2 = curmatch->seq2start + curmatch->length -
1064 wordLength + 1;
1065 /*ajDebug(".test kcur/knew %u/%u kcur2/i %u/%u\n",
1066 kcur, knew, kcur2, i);*/
1067
1068 /* when we test, we may have already incremented
1069 one of the matches - so test old and new kcur2 */
1070 if(kcur2 != i && kcur2 != i+1)
1071 {
1072 /*ajDebug("finished kcur: %u kcur2: %u i: %u\n",
1073 kcur, kcur2,i);*/
1074 ajListIterRemove(curiter);
1075 continue;
1076 }
1077
1078 if(kcur == knew && kcur2 == i)
1079 { /* check continued matches */
1080 /* ajDebug("**match knew: %d kcur: %d kcur2: %d "
1081 "start1: %d "
1082 "start2: %d len: %d i:%d\n",
1083 knew, kcur, kcur2,curmatch->seq1start,
1084 curmatch->seq2start,curmatch->length, i);*/
1085 curmatch->length++;
1086 matched = ajTrue;
1087 continue;
1088 }
1089 }
1090
1091 if(!matched)
1092 { /* new current match */
1093 /*ajDebug("save start1: %d start2: %d len: %d\n",
1094 match2->seq1start, match2->seq2start,
1095 match2->length);*/
1096 /* add to hitlist */
1097 newmatch = embWordMatchNew(seq2, knew, i, wordLength);
1098 ajListPushAppend(hitlist, newmatch);
1099
1100 if(curiter)
1101 { /* add to wordCurList */
1102 /*ajDebug("...ajListInsert using curiter %u\n",
1103 ajListGetLength(wordCurList));*/
1104 wordListInsertOld(curiter, newmatch);
1105 /*wordCurListTrace(wordCurList);*/
1106 /*wordCurIterTrace(curiter);*/
1107 }
1108 else
1109 {
1110 /*ajDebug("...ajListPushAppend to wordCurList %u\n",
1111 ajListGetLength(wordCurList));*/
1112 ajListPushAppend(wordCurList, newmatch);
1113 /* wordCurListTrace(wordCurList); */
1114 }
1115 }
1116 /* ajDebug("k: %d i: %d\n", *k, i); */
1117 }
1118
1119 ajListIterDel(&newiter);
1120 ajListIterDel(&curiter);
1121 AJFREE(seqlocs);
1122 }
1123
1124 /* no match, so all existing matches are completed */
1125
1126 i++;
1127 startptr++;
1128 }
1129
1130 /* wordCurListTrace(hitlist); */
1131 if(orderit)
1132 wordOrderMatchTable(hitlist);
1133
1134 /* wordCurListTrace(hitlist); */
1135
1136 while(ajListPop(wordCurList,(void **)&ptr));
1137
1138 return hitlist;
1139 }
1140
1141
1142
1143
1144 /* @func embWordMatchNew ******************************************************
1145 **
1146 ** Creates and initialises a word match object
1147 **
1148 ** @param [r] seq[const AjPSeq] Query sequence, match has been found
1149 ** @param [r] seq1start [ajuint] Start position in target sequence
1150 ** @param [r] seq2start [ajuint] Start position in query sequence
1151 ** @param [r] length [ajint] length of the word match
1152 ** @return [EmbPWordMatch] New word match object.
1153 **
1154 ** @release 6.3.0
1155 ** @@
1156 ******************************************************************************/
1157
embWordMatchNew(const AjPSeq seq,ajuint seq1start,ajuint seq2start,ajint length)1158 EmbPWordMatch embWordMatchNew(const AjPSeq seq, ajuint seq1start,
1159 ajuint seq2start, ajint length)
1160 {
1161 EmbPWordMatch match;
1162
1163 AJNEW0(match);
1164
1165 match->sequence = seq;
1166 match->seq1start = seq1start;
1167 match->seq2start = seq2start;
1168 match->length = length;
1169
1170 ajDebug("new word match start1: %d start2: %d len: %d\n",
1171 match->seq1start, match->seq2start,
1172 match->length);
1173
1174 return match;
1175 }
1176
1177
1178
1179
1180 /* @funcstatic wordNewListTrace ***********************************************
1181 **
1182 ** Reports contents of a word list.
1183 **
1184 ** @param [r] i [ajint] Offset
1185 ** @param [r] newlist [const AjPList] word list.
1186 ** @return [void]
1187 **
1188 ** @release 1.0.0
1189 ** @@
1190 ******************************************************************************/
1191
wordNewListTrace(ajint i,const AjPList newlist)1192 static void wordNewListTrace(ajint i, const AjPList newlist)
1193 {
1194 ajint *k;
1195 AjIList iter;
1196
1197 iter = ajListIterNewread(newlist);
1198
1199 ajDebug("\n++newlist... %d \n", i);
1200 ajDebug("++ k+len i+len k+1 i+1 len\n");
1201
1202 while(!ajListIterDone(iter))
1203 {
1204 k = (ajint*) ajListIterGet(iter);
1205 ajDebug("++ %6d %6d %6d %6d %6d\n",
1206 (*k)+wordLength, i+wordLength, (*k)+1, i+1, wordLength);
1207 }
1208
1209 ajListIterDel(&iter);
1210
1211 return;
1212 }
1213
1214
1215
1216
1217 /* @funcstatic wordCurListTrace ***********************************************
1218 **
1219 ** Reports contents of a word list.
1220 **
1221 ** @param [r] curlist [const AjPList] word list.
1222 ** @return [void]
1223 **
1224 ** @release 1.0.0
1225 ** @@
1226 ******************************************************************************/
1227
wordCurListTrace(const AjPList curlist)1228 static void wordCurListTrace(const AjPList curlist)
1229 {
1230 /*
1231 EmbPWordMatch match;
1232 ajint i;
1233 ajint j;
1234 ajint ilen;
1235 */
1236 AjIList iter = ajListIterNewread(curlist);
1237
1238 /*
1239 ajDebug("\ncurlist...\n");
1240 while(!ajListIterDone(iter))
1241 {
1242 match = ajListIterGet(iter);
1243 i = match->seq1start + 1;
1244 j = match->seq2start + 1;
1245 ilen = match->length;
1246 ajDebug("%6d %6d %6d %6d %6d\n",
1247 i+ilen, j+ilen, i, j, ilen);
1248 }
1249 */
1250
1251 ajListIterDel(&iter);
1252
1253 return;
1254 }
1255
1256
1257
1258
1259 /* @funcstatic wordCurIterTrace ***********************************************
1260 **
1261 ** Reports contents of a current word list iterator
1262 **
1263 ** @param [r] curiter [const AjIList] List iterator for the current word list
1264 ** @return [void]
1265 **
1266 ** @release 1.0.0
1267 ** @@
1268 ******************************************************************************/
1269
wordCurIterTrace(const AjIList curiter)1270 static void wordCurIterTrace(const AjIList curiter)
1271 {
1272 AjPListNode node;
1273 EmbPWordMatch match;
1274 ajint i, j, ilen;
1275
1276 ajDebug("curiter ...\n");
1277
1278 if(curiter->Here)
1279 {
1280 node = curiter->Here;
1281 match = node->Item;
1282 i = match->seq1start + 1;
1283 j = match->seq2start + 1;
1284 ilen = match->length;
1285 ajDebug(" Here: %6d %6d %6d %6d %6d\n",
1286 i+ilen, j+ilen, i, j, ilen);
1287 }
1288 else
1289 ajDebug(" Here: NULL\n");
1290
1291 node = curiter->Head->First;
1292 match = node->Item;
1293 i = match->seq1start + 1;
1294 j = match->seq2start + 1;
1295 ilen = match->length;
1296 ajDebug(" Orig: %6d %6d %6d %6d %6d\n",
1297 i+ilen, j+ilen, i, j, ilen);
1298
1299 return;
1300 }
1301
1302
1303
1304
1305 /* @funcstatic wordDeadZone ***************************************************
1306 **
1307 ** Determines if a match is within the region which is not overlapped by the
1308 ** match starting at position (deadx1, deady1) or ending at position
1309 ** (deadx2, deady2). If it is in this region
1310 ** (the 'live zone') then 0 is returned, if it is partially in this region
1311 ** then it is truncated to lie inside it and 2 is returned, else 1 is returned.
1312 **
1313 ** What is the 'live zone' and 'dead zone'?
1314 ** When an initial large match has been found we wish to remove any other
1315 ** (smaller) matches that overlap with it. The region in which other matches
1316 ** overlap with the first match are called here the 'dead zones'. The regions
1317 ** in which they don't overlap are called the 'live zones'. Other matches are
1318 ** OK if they are in live zones - they can co-exist with this match.
1319 **
1320 **
1321 ** deadx2
1322 ** | .
1323 ** | . live
1324 ** | . zone 2
1325 ** | dead ............deady2
1326 ** | zone /
1327 ** seq2 | /
1328 ** | /match
1329 ** | /
1330 ** deady1..........
1331 ** | . dead
1332 ** | . zone
1333 ** |live .
1334 ** |zone 1 deadx1
1335 ** -------------------------
1336 ** seq1
1337 **
1338 ** @param [u] match [EmbPWordMatch] match to investigate
1339 ** @param [r] deadx1 [ajint] x position of end of live zone 1
1340 ** @param [r] deady1 [ajint] y position of end of live zone 1
1341 ** @param [r] deadx2 [ajint] x position of end of live zone 2
1342 ** @param [r] deady2 [ajint] y position of end of live zone 2
1343 ** @param [r] minlength [ajint] minimum length of match
1344 ** @return [ajint] 0=in live zone, 1=in dead zone, 2=truncated
1345 **
1346 ** @release 2.1.0
1347 ** @@
1348 ******************************************************************************/
1349
wordDeadZone(EmbPWordMatch match,ajint deadx1,ajint deady1,ajint deadx2,ajint deady2,ajint minlength)1350 static ajint wordDeadZone(EmbPWordMatch match,
1351 ajint deadx1, ajint deady1,
1352 ajint deadx2, ajint deady2, ajint minlength)
1353 {
1354 ajint startx;
1355 ajint starty;
1356 ajint endx;
1357 ajint endy;
1358
1359 startx = match->seq1start;
1360 starty = match->seq2start;
1361
1362 endx = match->seq1start + match->length -1;
1363 endy = match->seq2start + match->length -1;
1364
1365
1366 /* is it in the top right live zone ? */
1367 if(startx > deadx2 && starty > deady2)
1368 return 0;
1369
1370 /* is it in the bottom right live zone ? */
1371 if(endx < deadx1 && endy < deady1)
1372 return 0;
1373
1374 /* is it in the top left dead zone? */
1375 if(starty >= deady1 && endx <= deadx2)
1376 return 1;
1377
1378 /* is it in the bottom right dead zone? */
1379 if(endy <= deady2 && startx >= deadx1)
1380 return 1;
1381
1382 /* it must be partially in a dead zone - truncate it */
1383
1384 if(endy < deady2)
1385 {
1386 if(startx - starty < deadx1 - deady1) /* crosses deady1 */
1387 match->length = deady1 - starty;
1388 else if(startx - starty > deadx1 - deady1) /* crosses deadx1 */
1389 match->length = deadx1 - startx;
1390 else
1391 ajFatal("Found a match where match is on the same diagonal \n"
1392 "startx=%d, starty=%d, endx=%d, endy=%d \n"
1393 "deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1394 startx, starty, endx, endy, deadx1, deady1, deadx2,
1395 deady2);
1396 }
1397 else if(starty > deady1)
1398 {
1399 if(startx - starty < deadx1 - deady1)
1400 {
1401 /* crosses deadx2 */
1402 match->length = endx - deadx2;
1403 match->seq1start = deadx2 +1;
1404 match->seq2start += deadx2 - startx +1;
1405
1406 }
1407 else if(startx - starty > deadx1 - deady1)
1408 {
1409 /* crosses deady2 */
1410 match->length = endy - deady2;
1411 match->seq1start += deady2 - starty +1;
1412 match->seq2start = deady2 +1;
1413
1414 }
1415 else
1416 ajFatal("Found a match where match is on the same diagonal \n"
1417 "startx=%d, starty=%d, endx=%d, endy=%d \n"
1418 "deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1419 startx, starty, endx, endy, deadx1, deady1, deadx2,
1420 deady2);
1421 }
1422 else
1423 ajFatal("Found a match that was not caught by any case \n"
1424 "startx=%d, starty=%d, endx=%d, endy=%d \n"
1425 "deadx1=%d, deady1=%d, deadx2=%d, deady2=%d\n",
1426 startx, starty, endx, endy, deadx1, deady1, deadx2, deady2);
1427
1428 /*
1429 ** is the truncated match shorter than our allowed minimum length?
1430 ** If so it should be dead
1431 */
1432 if(match->length < minlength)
1433 return 1;
1434
1435 return 2;
1436 }
1437
1438
1439
1440
1441 /* @func embWordMatchMin ******************************************************
1442 **
1443 ** Given a list of matches, reduce it to the minimal set of best
1444 ** non-overlapping matches.
1445 **
1446 ** @param [u] matchlist [AjPList] list of matches to reduce to
1447 ** non-overlapping set
1448 ** @return [void]
1449 **
1450 ** @release 2.0.0
1451 ** @@
1452 ******************************************************************************/
1453
embWordMatchMin(AjPList matchlist)1454 void embWordMatchMin(AjPList matchlist)
1455 {
1456 AjIList iter = NULL;
1457 EmbPWordMatch match;
1458 EmbPWordMatch thismatch;
1459 AjPList minlist; /* list of matches in min set */
1460 ajint deadx1; /* positions of the dead zones */
1461 ajint deady1;
1462 ajint deadx2;
1463 ajint deady2;
1464 AjBool truncated;
1465 ajint result;
1466
1467 minlist = ajListNew();
1468
1469 /* order the matches by size - largest first */
1470 wordOrderMatchTable(matchlist);
1471
1472 /*
1473 ** remove all other matches in the overlapping dead-zone, truncating
1474 ** those that extend into the live-zone
1475 */
1476
1477 /* repeat until there are no more matches to process */
1478 while(ajListGetLength(matchlist))
1479 {
1480 /* get next longest match and append to list of minimal matches */
1481 ajListPop(matchlist, (void **)&thismatch);
1482
1483 ajListPushAppend(minlist, thismatch);
1484
1485 /* get the positions of the dead zones */
1486 deadx1 = thismatch->seq1start; /* first pos of match */
1487 deady1 = thismatch->seq2start; /* first pos of match */
1488
1489 /* last pos of match */
1490 deadx2 = thismatch->seq1start + thismatch->length -1;
1491
1492 /* last pos of match */
1493 deady2 = thismatch->seq2start + thismatch->length -1;
1494
1495 /* haven't truncated any matches yet */
1496 truncated = ajFalse;
1497
1498 /* look at all remaining matches in matchlist */
1499 iter = ajListIterNew(matchlist);
1500
1501 while(!ajListIterDone(iter))
1502 {
1503 match = ajListIterGet(iter);
1504
1505 /* want to remove this match if it is in the dead zone */
1506 result = wordDeadZone(match, deadx1, deady1, deadx2, deady2,
1507 wordLength);
1508 if(result == 1)
1509 {
1510 /*
1511 ** it is in the dead zone - remove it
1512 ** Need to free up the match structure and remove the
1513 ** current node of the list
1514 */
1515 wordMatchListDelete((void **)&match, NULL);
1516 ajListIterRemove(iter);
1517 }
1518 else if(result == 2)
1519 {
1520 /* it is partially in the dead zone - now truncated */
1521 truncated = ajTrue;
1522 }
1523 }
1524
1525 ajListIterDel(&iter);
1526
1527 /*
1528 ** if some truncating done then need to sort the matchlist
1529 ** again by size
1530 */
1531 if(truncated)
1532 wordOrderMatchTable(matchlist);
1533 }
1534
1535
1536 /* sort by x start position */
1537 wordOrderPosMatchTable(minlist);
1538
1539
1540 /* matchlist is now reduced to the minimal non-overlapping list */
1541 ajListPushlist(matchlist, &minlist);
1542
1543 return;
1544 }
1545
1546
1547
1548
1549 /* @func embWordMatchFirstMax *************************************************
1550 **
1551 ** Given list of matches returns the first match with maximum similarity/score.
1552 **
1553 ** @param [r] matchlist [const AjPList] list of matches
1554 ** @return [EmbPWordMatch] maximum match
1555 **
1556 ** @release 6.3.0
1557 ** @@
1558 ******************************************************************************/
1559
embWordMatchFirstMax(const AjPList matchlist)1560 EmbPWordMatch embWordMatchFirstMax(const AjPList matchlist)
1561 {
1562 ajint maxmatch = 0;
1563 EmbPWordMatch p;
1564 EmbPWordMatch max = NULL;
1565
1566 AjIList iter;
1567
1568 iter = ajListIterNewread(matchlist);
1569
1570 while(!ajListIterDone(iter))
1571 {
1572 p = (EmbPWordMatch) ajListIterGet(iter);
1573
1574 if(p->length>maxmatch)
1575 {
1576 max = p;
1577 maxmatch = p->length;
1578 }
1579 else if(p->length==maxmatch)
1580 {
1581 ajDebug("possible max match position start1:%d start2:%d"
1582 " length:%d\n",p->seq1start, p->seq2start, p->length);
1583
1584 if(p->seq1start<max->seq1start)
1585 max =p;
1586 }
1587 }
1588
1589 ajDebug("maximum match position start1:%d start2:%d"
1590 " length:%d\n",max->seq1start, max->seq2start, max->length);
1591
1592 ajListIterDel(&iter);
1593
1594 return max;
1595 }
1596
1597
1598
1599
1600 /* @funcstatic wordRabinKarpConstant ******************************************
1601 **
1602 ** Returns a value that helps recalculating consecutive hash values
1603 ** with less computation during Rabin-Karp search.
1604 **
1605 ** @param [r] m [ajuint] word length
1606 ** @return [ajulong] radix^(m-1) % modulus
1607 **
1608 ** @release 6.3.0
1609 ** @@
1610 ******************************************************************************/
1611
wordRabinKarpConstant(ajuint m)1612 static ajulong wordRabinKarpConstant(ajuint m)
1613 {
1614 ajulong rm;
1615 ajuint i;
1616
1617 rm = 1;
1618
1619 for(i = 1; i <= m-1; i++)
1620 rm = (RK_RADIX * rm) % RK_MODULUS;
1621
1622 return rm;
1623 }
1624
1625
1626
1627
1628 /* @funcstatic wordRabinKarpCmp ***********************************************
1629 **
1630 ** Comparison function for EmbPWordRK objects, based on their hash values
1631 **
1632 ** @param [r] trgseq [const void *] First EmbPWordRK object
1633 ** @param [r] qryseq [const void *] Second EmbPWordRK object
1634 **
1635 ** @return [ajint] difference of hash values
1636 **
1637 ** @release 6.3.0
1638 ******************************************************************************/
1639
wordRabinKarpCmp(const void * trgseq,const void * qryseq)1640 static ajint wordRabinKarpCmp(const void *trgseq, const void *qryseq)
1641 {
1642 const EmbPWordRK ww1;
1643 const EmbPWordRK ww2;
1644
1645 ww1 = *(const EmbPWordRK const *) trgseq;
1646 ww2 = *(const EmbPWordRK const *) qryseq;
1647
1648 if(ww1->hash > ww2->hash)
1649 return 1;
1650
1651 if(ww1->hash < ww2->hash)
1652 return -1;
1653
1654 return 0;
1655 }
1656
1657
1658
1659
1660 /* @func embWordRabinKarpInit *************************************************
1661 **
1662 ** Scans word/pattern table and repackages the words in EmbPWordRK
1663 ** objects to improve access efficiency by Rabin-Karp search.
1664 ** Computes hash values for each word/pattern.
1665 **
1666 ** @param [r] table [const AjPTable] Table of patterns
1667 ** @param [u] ewords [EmbPWordRK**] Extended word objects to be used
1668 ** in Rabin-Karp search
1669 ** @param [r] wordlen [ajuint] Length of words/patterns, kmer size
1670 ** @param [r] seqset [const AjPSeqset] Sequence set, input patterns
1671 ** were derived from
1672 ** @return [ajuint] number of words
1673 **
1674 ** @release 6.3.0
1675 ** @@
1676 ******************************************************************************/
1677
embWordRabinKarpInit(const AjPTable table,EmbPWordRK ** ewords,ajuint wordlen,const AjPSeqset seqset)1678 ajuint embWordRabinKarpInit(const AjPTable table, EmbPWordRK** ewords,
1679 ajuint wordlen, const AjPSeqset seqset)
1680 {
1681 ajuint i;
1682 ajuint j;
1683 ajuint k;
1684 ajuint l;
1685 EmbPWord* words = NULL;
1686 const EmbPWord embword = NULL;
1687 ajulong patternHash;
1688 EmbPWordRK newword = NULL;
1689 AjIList iterp;
1690 EmbPWordSeqLocs* seqlocs = NULL;
1691 ajuint nseqlocs;
1692 const AjPSeq seq = NULL;
1693 const char* word;
1694 ajuint nseqs;
1695 ajuint nwords;
1696 ajuint pos;
1697
1698 nseqs = ajSeqsetGetSize(seqset);
1699 nwords = (ajuint) ajTableToarrayValues(table, (void***)&words);
1700 AJCNEW(*ewords, nwords);
1701
1702 for(i=0; i<nwords; i++)
1703 {
1704 seqlocs=NULL;
1705 embword = words[i];
1706 word = embword->fword;
1707
1708 AJNEW0(newword);
1709
1710 patternHash = 0;
1711
1712 /* TODO: we can continuously calculate the hash value
1713 * as we do in the search function */
1714 for(j=0; j<wordlen; j++)
1715 patternHash = (RK_RADIX * patternHash +
1716 toupper((int)word[j]))% RK_MODULUS;
1717
1718 nseqlocs = (ajuint) ajTableToarrayValues(embword->seqlocs, (void***)&seqlocs);
1719 newword->nseqs = nseqlocs;
1720 newword->hash = patternHash;
1721 newword->word = embword;
1722 AJCNEW(newword->seqindxs, nseqlocs);
1723 AJCNEW(newword->locs, nseqlocs);
1724 AJCNEW(newword->nnseqlocs, nseqlocs);
1725 AJCNEW(newword->nSeqMatches, nseqlocs);
1726
1727 for(j=0; j<nseqlocs; j++)
1728 {
1729 seq= seqlocs[j]->seq;
1730
1731 for(l=0;l<nseqs;l++)
1732 if (seq == ajSeqsetGetseqSeq(seqset,l))
1733 {
1734 newword->seqindxs[j] = l;
1735 break;
1736 }
1737
1738 if(l == nseqs)
1739 {
1740 ajErr("something wrong, sequence not found in seqset");
1741 ajExitBad();
1742 }
1743
1744 iterp = ajListIterNewread(seqlocs[j]->locs);
1745 k = 0;
1746 newword->nnseqlocs[j] = (ajuint) ajListGetLength(seqlocs[j]->locs);
1747 AJCNEW(newword->locs[j],newword->nnseqlocs[j]);
1748
1749 while(!ajListIterDone(iterp))
1750 {
1751 pos = *(ajuint *) ajListIterGet(iterp);
1752 newword->locs[j][k++] = pos;
1753 }
1754
1755 ajListIterDel(&iterp);
1756 }
1757
1758 AJFREE(seqlocs);
1759
1760 (*ewords)[i] = newword;
1761
1762 }
1763
1764 AJFREE(words);
1765
1766 qsort(*ewords, nwords, sizeof(EmbPWordRK), wordRabinKarpCmp);
1767
1768 return nwords;
1769 }
1770
1771
1772
1773
1774 /* @func embWordRabinKarpSearch ***********************************************
1775 **
1776 ** Rabin Karp search for multiple patterns.
1777 **
1778 ** @param [r] sseq [const AjPStr] Sequence to be scanned for multiple patterns
1779 ** @param [r] seqset [const AjPSeqset] Sequence-set,
1780 ** where search patterns coming from
1781 ** @param [r] patterns [EmbPWordRK const *] Patterns to be searched
1782 ** @param [r] plen [ajuint] Length of patterns
1783 ** @param [r] npatterns [ajuint] Number of patterns
1784 ** @param [u] matchlist [AjPList*] List of matches for each sequence
1785 ** in the sequence set
1786 ** @param [u] lastlocation [ajuint*] Position of the search for each sequence
1787 ** in the sequence set
1788 ** @param [r] checkmode [AjBool] If true, not writing features or alignments
1789 ** but running to produce match statistics only
1790 ** @return [ajuint] total number of matches
1791 **
1792 ** @release 6.3.0
1793 ** @@
1794 ******************************************************************************/
1795
embWordRabinKarpSearch(const AjPStr sseq,const AjPSeqset seqset,EmbPWordRK const * patterns,ajuint plen,ajuint npatterns,AjPList * matchlist,ajuint * lastlocation,AjBool checkmode)1796 ajuint embWordRabinKarpSearch(const AjPStr sseq,
1797 const AjPSeqset seqset,
1798 EmbPWordRK const * patterns,
1799 ajuint plen, ajuint npatterns,
1800 AjPList* matchlist,
1801 ajuint* lastlocation, AjBool checkmode)
1802 {
1803 const char *text;
1804 const AjPSeq seq;
1805 ajuint i;
1806 ajuint matchlen;
1807 ajuint tlen;
1808 ajuint ii;
1809 ajuint k;
1810 ajuint seqsetindx;
1811 ajuint indxloc;
1812 ajuint maxloc;
1813 ajuint nMatches = 0;
1814 EmbPWordRK* bsres; /* match found using binary search */
1815 EmbPWordRK cursor;
1816 ajulong rm;
1817 ajulong textHash = 0;
1818 ajuint seq2start;
1819 char* tmp;
1820
1821 ajuint pos;
1822 const char *seq_;
1823
1824 AJNEW0(cursor);
1825
1826 rm = wordRabinKarpConstant(plen);
1827 text = ajStrGetPtr(sseq);
1828 tlen = ajStrGetLen(sseq);
1829
1830 for(i=0; i<plen; i++)
1831 textHash = (ajulong)(RK_RADIX * textHash +
1832 toupper((int)text[i])) % RK_MODULUS;
1833
1834 /* Scan the input sequence sseq for all patterns */
1835 for (i=plen; i<=tlen; i++)
1836 {
1837 cursor->hash = textHash;
1838 bsres = bsearch(&cursor, patterns, npatterns,
1839 sizeof(EmbPWordRK), wordRabinKarpCmp);
1840
1841 if(bsres!=NULL)
1842 {
1843 seq2start = i-plen;
1844
1845 for(k=0;k<(*bsres)->nseqs;k++)
1846 {
1847 seqsetindx = (*bsres)->seqindxs[k];
1848 seq = ajSeqsetGetseqSeq(seqset, seqsetindx);
1849
1850 if(lastlocation[seqsetindx] < i)
1851 {
1852 maxloc = 0;
1853
1854 for(indxloc=0; indxloc < (*bsres)->nnseqlocs[k]; indxloc++)
1855 {
1856 pos = (*bsres)->locs[k][indxloc];
1857 seq_ = ajSeqGetSeqC(seq);
1858 matchlen=0;
1859 ii = seq2start;
1860
1861 /* following loop is to make sure we never have
1862 * false positives, after we are confident that
1863 * we don't get false hits we can delete/disable it
1864 */
1865 while(matchlen<plen)
1866 {
1867 if(toupper((int)seq_[pos+matchlen]) !=
1868 toupper((int)text[ii++]))
1869 {
1870 AJCNEW0(tmp,plen+1);
1871 tmp[plen] = '\0';
1872 memcpy(tmp, text+i-plen, plen);
1873 ajDebug("unexpected match: pat:%s pat-pos:"
1874 "%u, txt-pos:%u text:%s hash:%u\n",
1875 (*bsres)->word->fword, pos,
1876 i+matchlen-plen, tmp, textHash);
1877 AJFREE(tmp);
1878 break;
1879 }
1880
1881 matchlen++;
1882 }
1883
1884 /* if the match was a false positive skip it */
1885 if(matchlen<plen)
1886 continue;
1887
1888 /* this is where we extend matches */
1889 while(ii<tlen && pos+matchlen<ajSeqGetLen(seq))
1890 {
1891 if(toupper((int)seq_[pos+matchlen]) !=
1892 toupper((int)text[ii++]))
1893 break;
1894 else
1895 ++matchlen;
1896 }
1897
1898 nMatches ++;
1899
1900 if(!checkmode)
1901 ajListPushAppend(matchlist[seqsetindx],
1902 embWordMatchNew(seq,pos,seq2start,
1903 matchlen));
1904
1905 if(ii > maxloc)
1906 maxloc = ii;
1907
1908 (*bsres)->lenMatches += matchlen;
1909 (*bsres)->nMatches++;
1910 (*bsres)->nSeqMatches[k]++;
1911
1912 }
1913
1914 if(maxloc>0)
1915 {
1916 lastlocation[seqsetindx] = maxloc;
1917 }
1918 }
1919 }
1920 }
1921
1922 textHash = ((textHash + toupper((int)text[i-plen]) * (RK_MODULUS-rm))
1923 * RK_RADIX + toupper((int)text[i])) % RK_MODULUS;
1924 }
1925
1926 AJFREE(cursor);
1927
1928 return nMatches;
1929 }
1930
1931
1932
1933
1934 /* @func embWordMatchIter *****************************************************
1935 **
1936 ** Return the start positions and length for the next match.
1937 ** The caller iterates over the list, which is a standard AjPList
1938 **
1939 ** @param [u] iter [AjIList] List iterator
1940 ** @param [w] start1 [ajint*] Start in first sequence
1941 ** @param [w] start2 [ajint*] Start in second sequence
1942 ** @param [w] len [ajint*] Length of match
1943 ** @param [w] seq [const AjPSeq*] Pointer to sequence
1944 ** @return [AjBool] ajFalse if the iterator was exhausted
1945 **
1946 **
1947 ** @release 2.4.0
1948 ******************************************************************************/
1949
embWordMatchIter(AjIList iter,ajint * start1,ajint * start2,ajint * len,const AjPSeq * seq)1950 AjBool embWordMatchIter(AjIList iter, ajint* start1, ajint* start2,
1951 ajint* len, const AjPSeq* seq)
1952 {
1953 EmbPWordMatch p;
1954
1955 if(ajListIterDone(iter))
1956 return ajFalse;
1957
1958 p = (EmbPWordMatch) ajListIterGet(iter);
1959 *start1 = p->seq1start;
1960 *start2 = p->seq2start;
1961 *len = p->length;
1962 *seq = p->sequence;
1963
1964 return ajTrue;
1965 }
1966
1967
1968
1969
1970 /* @funcstatic wordListInsertOld **********************************************
1971 **
1972 ** Obsolete ajListInsert version emulation
1973 ** Insert an item in a list, using an iterator (if not null)
1974 ** to show which position to insert. Otherwise, simply push.
1975 **
1976 ** @param [u] iter [AjIList] List iterator.
1977 ** @param [r] x [void*] Data item to insert.
1978 ** @return [void]
1979 **
1980 ** @release 2.1.0
1981 ** @@
1982 ******************************************************************************/
1983
wordListInsertOld(AjIList iter,void * x)1984 static void wordListInsertOld(AjIList iter, void* x)
1985 {
1986 AjPList list;
1987 AjPListNode p;
1988
1989 list = iter->Head;
1990 p = iter->Here;
1991
1992 if(!p->Prev)
1993 {
1994 ajListPush(list,(void *)x);
1995 return;
1996 }
1997
1998 if(p == list->First)
1999 {
2000 if(!p->Prev->Prev)
2001 wordListInsertNodeOld(&list->First->Next,x);
2002 else
2003 wordListInsertNodeOld(&p->Prev->Next,x);
2004 }
2005 else
2006 {
2007 if(!p->Prev->Prev)
2008 wordListInsertNodeOld(&list->First,x);
2009 else
2010 wordListInsertNodeOld(&p->Prev->Prev->Next,x);
2011 }
2012
2013 list->Count++;
2014
2015 return;
2016 }
2017
2018
2019
2020
2021 /* @funcstatic wordListInsertNodeOld ******************************************
2022 **
2023 ** Inserts a new node in a list at the current node position.
2024 **
2025 ** @param [u] pnode [AjPListNode*] Current node.
2026 ** @param [r] x [void*] Data item to insert.
2027 ** @return [void]
2028 **
2029 ** @release 2.1.0
2030 ** @@
2031 ******************************************************************************/
2032
wordListInsertNodeOld(AjPListNode * pnode,void * x)2033 static void wordListInsertNodeOld(AjPListNode* pnode, void* x)
2034 {
2035 AjPListNode p;
2036
2037 AJNEW0(p);
2038 p->Item = x;
2039 p->Next = (*pnode);
2040 p->Prev = (*pnode)->Prev;
2041 p->Next->Prev = p;
2042 *pnode = p;
2043
2044 return;
2045 }
2046
2047
2048
2049
2050 /* @func embWordUnused ********************************************************
2051 **
2052 ** Unused functions. Here to keep compiler warnings away
2053 **
2054 ** @return [void]
2055 **
2056 ** @release 2.0.0
2057 ******************************************************************************/
2058
embWordUnused(void)2059 void embWordUnused(void)
2060 {
2061
2062 wordCurListTrace(NULL); /* comment out in embWordBuildMatchTable */
2063 wordCurIterTrace(NULL); /* comment out in embWordBuildMatchTable */
2064 wordNewListTrace(0, NULL); /* comment out in embWordBuildMatchTable */
2065
2066 return;
2067 }
2068
2069
2070
2071
2072 /* @func embWordExit **********************************************************
2073 **
2074 ** Cleanup word matching indexing internals on exit
2075 **
2076 ** @return [void]
2077 **
2078 ** @release 4.0.0
2079 ******************************************************************************/
2080
embWordExit(void)2081 void embWordExit(void)
2082 {
2083 embWordClear();
2084 ajListFree(&wordCurList);
2085
2086 return;
2087 }
2088
2089
2090
2091
2092 #ifdef AJ_COMPILE_DEPRECATED_BOOK
2093 #endif
2094
2095
2096
2097
2098 #ifdef AJ_COMPILE_DEPRECATED
2099 /* @obsolete embWordMatchListAppend
2100 ** @remove use embWordMatchNew followed by a list append call
2101 */
embWordMatchListAppend(AjPList hitlist,const AjPSeq seq,const ajuint seq1start,ajuint seq2start,ajint length)2102 __deprecated EmbPWordMatch embWordMatchListAppend(AjPList hitlist,
2103 const AjPSeq seq,
2104 const ajuint seq1start,
2105 ajuint seq2start,
2106 ajint length)
2107 {
2108 EmbPWordMatch match;
2109 AJNEW0(match);
2110 match->sequence = seq;
2111 match->seq1start = seq1start;
2112 match->seq2start = seq2start;
2113 match->length = length;
2114 ajDebug("new word match start1: %d start2: %d len: %d\n",
2115 match->seq1start, match->seq2start,
2116 match->length);
2117 ajListPushAppend(hitlist, match);
2118 return match;
2119 }
2120 #endif
2121