1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <search/extern.h>
28 #include <compiler.h>
29 #include <os-native.h>
30 #include <sysalloc.h>
31
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37
38 #include "search-priv.h"
39
40
41 /*
42 We don't use longs here because the addition means we need to keep
43 some bits in reserve to calculate carries.
44 */
45
46 typedef int32_t schunk;
47 typedef uint32_t uchunk;
48
49 typedef struct CHUNK {
50 int32_t size;
51 uchunk *chunks;
52 } CHUNK;
53
54 struct MyersUnlimitedSearch {
55 int32_t m;
56 CHUNK *PEq[256];
57 CHUNK *PEq_R[256];
58 };
59
60 static const char NA2KEY [] = "ACGT";
61 static const char NA4KEY [] = " ACMGRSVTWYHKDBN";
62
63 static
any_non_4na_chars(const char * pattern)64 int32_t any_non_4na_chars(const char *pattern)
65 {
66 int32_t len = strlen(pattern);
67 int32_t i;
68 char *p;
69
70 for ( i=0; i<len; i++ )
71 {
72 p = strchr( NA4KEY, pattern[i] );
73 if ( p == NULL )
74 return 1;
75 }
76 return 0;
77 }
78
79 static
na4key_matches(AgrepFlags mode,char na4,char acgt)80 int32_t na4key_matches(AgrepFlags mode, char na4, char acgt)
81 {
82 char *p;
83 int32_t pos4, pos2;
84 p = strchr( NA4KEY, na4 );
85 if ( p == NULL )
86 {
87 p = strchr( NA4KEY, 'N' );
88 }
89 pos4 = p - NA4KEY;
90 pos2 = strchr( NA2KEY, acgt ) - NA2KEY;
91 if ( pos4 & ( 1 << pos2 ) )
92 return 1;
93 return 0;
94 }
95
96
97
98
chunksize(int32_t size)99 int32_t chunksize(int32_t size) {
100 int32_t ret;
101 ret = 1 + (size / (8*sizeof(uchunk)));
102 return ret;
103 }
104
chunk_or_in(CHUNK * chunk,CHUNK * or)105 void chunk_or_in(CHUNK *chunk, CHUNK *or)
106 {
107 int32_t i;
108 int32_t size = chunk->size;
109 for (i=0; i<size; i++)
110 chunk->chunks[i] |= or->chunks[i];
111 }
112
chunk_xor_in(CHUNK * chunk,CHUNK * xor)113 void chunk_xor_in(CHUNK *chunk, CHUNK *xor)
114 {
115 int32_t i;
116 int32_t size = chunk->size;
117 for (i=0; i<size; i++)
118 chunk->chunks[i] ^= xor->chunks[i];
119 }
120
121
chunk_and_in(CHUNK * chunk,CHUNK * and)122 void chunk_and_in(CHUNK *chunk, CHUNK *and)
123 {
124 int32_t i;
125 int32_t size = chunk->size;
126 for (i=0; i<size; i++)
127 chunk->chunks[i] &= and->chunks[i];
128 }
129
chunk_add_in(CHUNK * chunk,CHUNK * add)130 void chunk_add_in(CHUNK *chunk, CHUNK *add)
131 {
132 int32_t i;
133 int32_t size = chunk->size;
134 uint64_t carry = 0;
135 uint64_t newcarry;
136 for (i=size-1; i>=0; i--) {
137 newcarry =
138 ((uint64_t)chunk->chunks[i] + (uint64_t)add->chunks[i]
139 + carry)
140 >> (8*sizeof(uchunk));
141 chunk->chunks[i] += add->chunks[i] + carry;
142 carry = newcarry;
143 }
144 }
145
146
chunk_set(CHUNK * chunk,CHUNK * copy)147 void chunk_set(CHUNK *chunk, CHUNK *copy)
148 {
149 int32_t i;
150 for (i=0; i<chunk->size; i++) {
151 chunk->chunks[i] = copy->chunks[i];
152 }
153 }
154
155
chunk_isbit_set(CHUNK * chunk,int32_t bit)156 int32_t chunk_isbit_set(CHUNK *chunk, int32_t bit)
157 {
158 int32_t cn = chunk->size - 1 - (bit / (8*sizeof(uchunk)));
159 int32_t chunkbit = bit % (8*sizeof(uchunk));
160 return chunk->chunks[cn] & (1 << chunkbit);
161 }
162
163
chunk_set_bit(CHUNK * chunk,int32_t bit)164 void chunk_set_bit(CHUNK *chunk, int32_t bit)
165 {
166 int32_t cn = chunk->size - 1 - (bit / (8*sizeof(uchunk)));
167 int32_t chunkbit = bit % (8*sizeof(uchunk));
168 chunk->chunks[cn] |= (1 << chunkbit);
169 }
170
171
chunk_negate(CHUNK * chunk)172 void chunk_negate(CHUNK *chunk)
173 {
174 int32_t i;
175 for (i=0; i<chunk->size; i++) {
176 chunk->chunks[i] = ~chunk->chunks[i];
177 }
178 }
179
alloc_chunk_and_zero(CHUNK * chunk,int32_t size)180 void alloc_chunk_and_zero(CHUNK *chunk, int32_t size)
181 {
182 chunk->chunks = malloc(size * sizeof(uchunk));
183 memset(chunk->chunks, 0, size * sizeof(uchunk));
184 }
185
free_chunk_parts(CHUNK * chunk)186 void free_chunk_parts(CHUNK *chunk)
187 {
188 free(chunk->chunks);
189 chunk->chunks = NULL;
190 }
191
free_chunk(CHUNK * chunk)192 void free_chunk(CHUNK *chunk)
193 {
194 free(chunk->chunks);
195 free(chunk);
196 }
197
198
alloc_chunk(int32_t size)199 CHUNK *alloc_chunk(int32_t size)
200 {
201 CHUNK *ret;
202 ret = malloc(sizeof(CHUNK));
203 ret->size = size;
204 ret->chunks = malloc(size * sizeof(uchunk));
205 memset(ret->chunks, 0, size * sizeof(uchunk));
206 return ret;
207 }
208
chunk_set_minusone(CHUNK * src)209 void chunk_set_minusone(CHUNK *src)
210 {
211 int32_t i;
212 for (i=0; i<src->size; i++) {
213 src->chunks[i] = (uchunk)-1;
214 }
215 }
216
chunk_zero(CHUNK * src)217 void chunk_zero(CHUNK *src)
218 {
219 int32_t i;
220 for (i=0; i<src->size; i++) {
221 src->chunks[i] = (uchunk)0;
222 }
223 }
224
print_chunk(CHUNK * src)225 void print_chunk(CHUNK *src)
226 {
227 uchunk chunk;
228 unsigned char byte;
229 char buf[9];
230 int32_t i, j, k;
231 buf[8] = '\0';
232 for (i=0; i<src->size; i++) {
233 chunk = src->chunks[i];
234 for (j=0; j<sizeof(uchunk); j++) {
235 byte = chunk >> (8*(sizeof(uchunk) - j - 1));
236 for (k=0; k<8; k++) {
237 buf[k] = "01"[1&(byte>>(7-k))];
238 }
239 printf("%s ", buf);
240 }
241 }
242 printf("\n");
243 }
244
245
246 /*
247 This currently does not preserve the sign bit.
248 */
lshift_chunk(CHUNK * dest,CHUNK * src,int32_t n)249 void lshift_chunk(CHUNK *dest, CHUNK *src, int32_t n)
250 {
251 int32_t i, j;
252 uchunk slop;
253 int32_t chunkshift = n / (8*sizeof(uchunk));
254 int32_t rem = n % (8*sizeof(uchunk));
255 /* Assumes they're both the same size. */
256 int32_t size = src->size;
257
258
259 /* i is the destination chunk */
260 slop = 0;
261 for (i=size-1; i>=0; i--) {
262 j = i+chunkshift;
263 if (j >= size) {
264 dest->chunks[i] = 0;
265 } else {
266 dest->chunks[i] = slop | src->chunks[j] << rem;
267 slop = src->chunks[j] >> (8*sizeof(uchunk) - rem);
268 }
269 }
270 }
271
chunk_lshift_one_inplace(CHUNK * dest)272 void chunk_lshift_one_inplace(CHUNK *dest)
273 {
274 int32_t i;
275 uchunk slop, newslop;
276 int32_t size = dest->size;
277
278 slop = 0;
279 for (i=size-1; i>=0; i--) {
280 newslop = dest->chunks[i] >> (8*sizeof(uchunk) - 1);
281 dest->chunks[i] = slop | dest->chunks[i] << 1;
282 slop = newslop;
283 }
284 }
285
286
chunk_lshift_one(CHUNK * dest,CHUNK * src)287 void chunk_lshift_one(CHUNK *dest, CHUNK *src)
288 {
289 int32_t i;
290 uchunk slop;
291 int32_t size = src->size;
292
293 slop = 0;
294 for (i=size-1; i>=0; i--) {
295 dest->chunks[i] = slop | src->chunks[i] << 1;
296 slop = src->chunks[i] >> (8*sizeof(uchunk) - 1);
297 }
298 }
299
300
301
302 #ifdef UNUSED_MAY_NOT_WORK
303
304 /*
305 * In this, chunks are big-endian, meaning 0 is the most significant chunk.
306 */
rshift_uchunk(CHUNK * dest,CHUNK * src,int32_t n)307 void rshift_uchunk(CHUNK *dest, CHUNK *src, int32_t n)
308 {
309 int32_t size = src->size;
310 uchunk slop;
311 int32_t chunkshift = n / (8*sizeof(uchunk));
312 int32_t rem = n % (8*sizeof(uchunk));
313 int32_t i, j;
314
315 /* i is the destination chunk */
316 slop = 0;
317 for (i=0; i<size; i++) {
318 j = i-chunkshift;
319 if (j < 0) {
320 dest->chunks[i] = 0;
321 } else {
322 dest->chunks[i] = slop | src->chunks[j] >> rem;
323 slop = src->chunks[j] << (8*sizeof(uchunk) - rem);
324 }
325 }
326 }
327
rshift_schunk(CHUNK * dest,CHUNK * src,int32_t n)328 void rshift_schunk(CHUNK *dest, CHUNK *src, int32_t n)
329 {
330 uchunk slop;
331 int32_t chunkshift = n / (8*sizeof(uchunk));
332 int32_t rem = n % (8*sizeof(uchunk));
333 int32_t i, j, size;
334
335 /* i is the destination chunk */
336 slop = -1;
337 for (i=0; i<size; i++) {
338 j = i-chunkshift;
339 if (j < 0) {
340 dest->chunks[i] = -1;
341 } else {
342 dest->chunks[i] = slop | src->chunks[j] >> rem;
343 slop = src->chunks[j] << (8*sizeof(uchunk) - rem);
344 }
345 }
346 }
347
348 #endif
349
350
MyersUnlimitedFree(MyersUnlimitedSearch * self)351 void MyersUnlimitedFree ( MyersUnlimitedSearch *self )
352 {
353 int32_t j;
354 for (j=0; j<256; j++) {
355 free_chunk(self->PEq[j]);
356 free_chunk(self->PEq_R[j]);
357 }
358 free(self);
359 }
360
361
MyersUnlimitedMake(MyersUnlimitedSearch ** self,AgrepFlags mode,const char * pattern)362 rc_t MyersUnlimitedMake ( MyersUnlimitedSearch **self,
363 AgrepFlags mode, const char *pattern )
364 {
365 const unsigned char *upattern = (const unsigned char *)pattern;
366 int32_t rc;
367 int32_t len = strlen(pattern);
368 int32_t plen = len;
369 int32_t i, j;
370 int32_t m;
371 int32_t chunks;
372
373 if (!(mode & AGREP_ANYTHING_ELSE_IS_N) &&
374 any_non_4na_chars(pattern))
375 /* This should be a return code. */
376 return RC( rcText, rcString, rcSearching, rcParam, rcUnrecognized);
377
378
379 *self = malloc(sizeof(MyersUnlimitedSearch));
380 m = (*self)->m = len;
381 chunks = chunksize(m);
382 for (j=0; j<256; j++) {
383 (*self)->PEq[j] = alloc_chunk(chunks);
384 (*self)->PEq_R[j] = alloc_chunk(chunks);
385 }
386
387 for(j = 0; j < m; j++) {
388 chunk_set_bit((*self)->PEq[upattern[j]], j);
389 if( pattern[j] == 'a' ) {
390 chunk_set_bit((*self)->PEq['t'], j); /* t == a */
391 }
392 }
393
394 for(j = 0; j < m; j++) {
395 chunk_set_bit((*self)->PEq_R[upattern[m-j-1]], j);
396 if( pattern[m-j-1] == 'a' ) {
397 chunk_set_bit((*self)->PEq_R['t'], j); /* t == a */
398 }
399 }
400
401 for (i=0; i<4; i++) {
402 unsigned char acgt = NA2KEY[i];
403 for (j=0; j<plen; j++) {
404 if (na4key_matches(mode, pattern[j], acgt)) {
405 /* bits |= (unsigned long)1<<j; */
406 chunk_set_bit((*self)->PEq[acgt], j);
407 if (mode & AGREP_TEXT_EXPANDED_2NA)
408 chunk_set_bit((*self)->PEq[i], j);
409 }
410 }
411 }
412 for (i=0; i<4; i++) {
413 unsigned char acgt = NA2KEY[i];
414 for (j=0; j<plen; j++) {
415 if (na4key_matches(mode, pattern[plen-j-1], acgt)) {
416 chunk_set_bit((*self)->PEq_R[acgt], j);
417 if (mode & AGREP_TEXT_EXPANDED_2NA)
418 chunk_set_bit((*self)->PEq_R[i], j);
419 }
420 }
421 }
422 return 0;
423
424 for (j=0; j<256; j++) {
425 free_chunk((*self)->PEq[j]);
426 free_chunk((*self)->PEq_R[j]);
427 }
428 free(*self);
429 *self = NULL;
430 return rc;
431
432 }
433
434
435 /*
436 This finds the first match forward in the string less than or equal
437 the threshold. If nonzero, this will be a prefix of any exact match.
438 */
MyersUnlimitedFindFirst(MyersUnlimitedSearch * self,int32_t threshold,const char * text,size_t n,AgrepMatch * match)439 uint32_t MyersUnlimitedFindFirst ( MyersUnlimitedSearch *self,
440 int32_t threshold, const char* text, size_t n, AgrepMatch *match )
441 {
442 const unsigned char *utext = (const unsigned char *)text;
443 CHUNK *Pv;
444 CHUNK *Mv;
445 CHUNK *Xv, *Xh, *Ph, *Mh;
446
447 int32_t m = self->m;
448 int32_t csize = chunksize(m);
449 int32_t Score;
450 int32_t BestScore = m;
451 int32_t from = 0;
452 int32_t to = -1;
453
454 int32_t j;
455
456 CHUNK *Eq;
457
458 Pv = alloc_chunk(csize);
459 Mv = alloc_chunk(csize);
460 Xv = alloc_chunk(csize);
461 Xh = alloc_chunk(csize);
462 Ph = alloc_chunk(csize);
463 Mh = alloc_chunk(csize);
464
465
466 Score = m;
467 chunk_set_minusone(Pv);
468 chunk_zero(Mv);
469
470 for(j = 0; j < n; j++) {
471 #ifdef DEBUG
472 printf("%d j loop\n", j);
473 #endif
474 Eq = self->PEq[utext[j]];
475 #ifdef DEBUG
476 printf("Eq: "); print_chunk(Eq); printf("\n");
477 #endif
478 /* Xv = Eq | Mv; */
479 chunk_set(Xv, Eq);
480 chunk_or_in(Xv, Mv);
481 #ifdef DEBUG
482 printf("Xv: "); print_chunk(Xv); printf("\n");
483 #endif
484 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
485 chunk_set(Xh, Eq);
486 chunk_and_in(Xh, Pv);
487 chunk_add_in(Xh, Pv);
488 chunk_xor_in(Xh, Pv);
489 chunk_or_in(Xh, Eq);
490 #ifdef DEBUG
491 printf("Xh: "); print_chunk(Xh); printf("\n");
492 #endif
493 /* Ph = Mv | ~ (Xh | Pv); */
494 chunk_set(Ph, Xh);
495 chunk_or_in(Ph, Pv);
496 chunk_negate(Ph);
497 chunk_or_in(Ph, Mv);
498 #ifdef DEBUG
499 printf("Ph: "); print_chunk(Ph); printf("\n");
500 #endif
501 /* Mh = Pv & Xh; */
502 chunk_set(Mh, Pv);
503 chunk_and_in(Mh, Xh);
504 #ifdef DEBUG
505 printf("Mh: "); print_chunk(Mh); printf("\n");
506 #endif
507 /* Ph & (1 << (m - 1)) */
508 if (chunk_isbit_set(Ph, m-1)) {
509 Score++;
510 /* Mh & (1 << (m - 1)) */
511 } else if (chunk_isbit_set(Mh, m-1)) {
512 Score--;
513 }
514 /* Ph <<= 1; */
515 chunk_lshift_one_inplace(Ph);
516 #ifdef DEBUG
517 printf("Ph: "); print_chunk(Ph); printf("\n");
518 #endif
519 /* Mh <<= 1; */
520 chunk_lshift_one_inplace(Mh);
521 #ifdef DEBUG
522 printf("Mh: "); print_chunk(Mh); printf("\n");
523 #endif
524 /* Pv = Mh | ~(Xv | Ph); */
525 chunk_set(Pv, Xv);
526 chunk_or_in(Pv, Ph);
527 chunk_negate(Pv);
528 chunk_or_in(Pv, Mh);
529 #ifdef DEBUG
530 printf("Pv: "); print_chunk(Pv); printf("\n");
531 #endif
532 /* Mv = Ph & Xv; */
533 chunk_set(Mv, Ph);
534 chunk_and_in(Mv, Xv);
535 #ifdef DEBUG
536 printf("Mv: "); print_chunk(Mv); printf("\n");
537 #endif
538 #ifdef DEBUG
539 printf("%3d. score %d\n", j, Score);
540 #endif
541 if (Score <= threshold) {
542 BestScore = Score;
543 to = j;
544 break;
545 }
546 }
547
548 /* Continue while score decreases under the threshold */
549 for(j++; j < n; j++) {
550 Eq = self->PEq[utext[j]];
551 /* Xv = Eq | Mv; */
552 chunk_set(Xv, Eq);
553 chunk_or_in(Xv, Mv);
554 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
555 chunk_set(Xh, Eq);
556 chunk_and_in(Xh, Pv);
557 chunk_add_in(Xh, Pv);
558 chunk_xor_in(Xh, Pv);
559 chunk_or_in(Xh, Eq);
560 /* Ph = Mv | ~ (Xh | Pv); */
561 chunk_set(Ph, Xh);
562 chunk_or_in(Ph, Pv);
563 chunk_negate(Ph);
564 chunk_or_in(Ph, Mv);
565 /* Mh = Pv & Xh; */
566 chunk_set(Mh, Pv);
567 chunk_and_in(Mh, Xh);
568 /* Ph & (1 << (m - 1)) */
569 if (chunk_isbit_set(Ph, m-1)) {
570 Score++;
571 /* Mh & (1 << (m - 1)) */
572 } else if (chunk_isbit_set(Mh, m-1)) {
573 Score--;
574 }
575 /* Ph <<= 1; */
576 chunk_lshift_one_inplace(Ph);
577 /* Mh <<= 1; */
578 chunk_lshift_one_inplace(Mh);
579 /* Pv = Mh | ~(Xv | Ph); */
580 chunk_set(Pv, Xv);
581 chunk_or_in(Pv, Ph);
582 chunk_negate(Pv);
583 chunk_or_in(Pv, Mh);
584 /* Mv = Ph & Xv; */
585 chunk_set(Mv, Ph);
586 chunk_and_in(Mv, Xv);
587
588 #ifdef DEBUG
589 printf("%3d. score %d\n", j, Score);
590 #endif
591 if (Score < BestScore) {
592 BestScore = Score;
593 to = j;
594 } else {
595 break;
596 }
597 }
598
599 /* Re-initialize for next scan! */
600 Score = m;
601 chunk_set_minusone(Pv);
602 chunk_zero(Mv);
603
604 for(j = to; j >= 0; j--) {
605 Eq = self->PEq_R[utext[j]]; /* This line is different. */
606 /* Xv = Eq | Mv; */
607 chunk_set(Xv, Eq);
608 chunk_or_in(Xv, Mv);
609 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
610 chunk_set(Xh, Eq);
611 chunk_and_in(Xh, Pv);
612 chunk_add_in(Xh, Pv);
613 chunk_xor_in(Xh, Pv);
614 chunk_or_in(Xh, Eq);
615 /* Ph = Mv | ~ (Xh | Pv); */
616 chunk_set(Ph, Xh);
617 chunk_or_in(Ph, Pv);
618 chunk_negate(Ph);
619 chunk_or_in(Ph, Mv);
620 /* Mh = Pv & Xh; */
621 chunk_set(Mh, Pv);
622 chunk_and_in(Mh, Xh);
623 /* Ph & (1 << (m - 1)) */
624 if (chunk_isbit_set(Ph, m-1)) {
625 Score++;
626 /* Mh & (1 << (m - 1)) */
627 } else if (chunk_isbit_set(Mh, m-1)) {
628 Score--;
629 }
630 /* Ph <<= 1; */
631 chunk_lshift_one_inplace(Ph);
632 /* Mh <<= 1; */
633 chunk_lshift_one_inplace(Mh);
634 /* Pv = Mh | ~(Xv | Ph); */
635 chunk_set(Pv, Xv);
636 chunk_or_in(Pv, Ph);
637 chunk_negate(Pv);
638 chunk_or_in(Pv, Mh);
639 /* Mv = Ph & Xv; */
640 chunk_set(Mv, Ph);
641 chunk_and_in(Mv, Xv);
642
643 #ifdef DEBUG
644 printf("%3d. score %d\n", j, Score);
645 #endif
646 if(Score <= BestScore) {
647 #ifdef DEBUG
648 printf("rev match at position %d\n", j);
649 #endif
650 from = j;
651 break;
652 }
653 }
654
655 free_chunk(Pv);
656 free_chunk(Mv);
657 free_chunk(Xv);
658 free_chunk(Xh);
659 free_chunk(Ph);
660 free_chunk(Mh);
661
662 if (BestScore <= threshold) {
663 match->position = from;
664 match->length = to-from+1;
665 match->score = BestScore;
666 return 1;
667 } else {
668 return 0;
669 }
670 }
671
672 /*
673 This finds the first match forward in the string less than or equal
674 the threshold. If nonzero, this will be a prefix of any exact match.
675 */
MyersUnlimitedFindAll(const AgrepCallArgs * args)676 void MyersUnlimitedFindAll ( const AgrepCallArgs *args )
677 {
678 AgrepFlags mode = args->self->mode;
679 MyersUnlimitedSearch *self = args->self->myersunltd;
680 int32_t threshold = args->threshold;
681 const unsigned char *utext = (const unsigned char *)args->buf;
682 int32_t n = args->buflen;
683 AgrepMatchCallback cb = dp_end_callback;
684 const void *cbinfo = args;
685
686 AgrepMatch match;
687 AgrepContinueFlag cont;
688
689 CHUNK *Pv;
690 CHUNK *Mv;
691 CHUNK *Xv, *Xh, *Ph, *Mh;
692
693 int32_t m = self->m;
694 int32_t csize = chunksize(m);
695 int32_t Score;
696
697 int32_t curscore = 0;
698 int32_t curlast = 0;
699 int32_t continuing = 0;
700
701 int32_t j;
702
703 CHUNK *Eq;
704
705
706
707 Pv = alloc_chunk(csize);
708 Mv = alloc_chunk(csize);
709 Xv = alloc_chunk(csize);
710 Xh = alloc_chunk(csize);
711 Ph = alloc_chunk(csize);
712 Mh = alloc_chunk(csize);
713
714 Score = m;
715 chunk_set_minusone(Pv);
716 chunk_zero(Mv);
717
718 for(j = 0; j < n; j++) {
719 #ifdef DEBUG
720 printf("%d j loop\n", j);
721 #endif
722 Eq = self->PEq[utext[j]];
723 #ifdef DEBUG
724 printf("Eq: "); print_chunk(Eq); printf("\n");
725 #endif
726 /* Xv = Eq | Mv; */
727 chunk_set(Xv, Eq);
728 chunk_or_in(Xv, Mv);
729 #ifdef DEBUG
730 printf("Xv: "); print_chunk(Xv); printf("\n");
731 #endif
732 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
733 chunk_set(Xh, Eq);
734 chunk_and_in(Xh, Pv);
735 chunk_add_in(Xh, Pv);
736 chunk_xor_in(Xh, Pv);
737 chunk_or_in(Xh, Eq);
738 #ifdef DEBUG
739 printf("Xh: "); print_chunk(Xh); printf("\n");
740 #endif
741 /* Ph = Mv | ~ (Xh | Pv); */
742 chunk_set(Ph, Xh);
743 chunk_or_in(Ph, Pv);
744 chunk_negate(Ph);
745 chunk_or_in(Ph, Mv);
746 #ifdef DEBUG
747 printf("Ph: "); print_chunk(Ph); printf("\n");
748 #endif
749 /* Mh = Pv & Xh; */
750 chunk_set(Mh, Pv);
751 chunk_and_in(Mh, Xh);
752 #ifdef DEBUG
753 printf("Mh: "); print_chunk(Mh); printf("\n");
754 #endif
755 /* Ph & (1 << (m - 1)) */
756 if (chunk_isbit_set(Ph, m-1)) {
757 Score++;
758 /* Mh & (1 << (m - 1)) */
759 } else if (chunk_isbit_set(Mh, m-1)) {
760 Score--;
761 }
762 /* Ph <<= 1; */
763 chunk_lshift_one_inplace(Ph);
764 #ifdef DEBUG
765 printf("Ph: "); print_chunk(Ph); printf("\n");
766 #endif
767 /* Mh <<= 1; */
768 chunk_lshift_one_inplace(Mh);
769 #ifdef DEBUG
770 printf("Mh: "); print_chunk(Mh); printf("\n");
771 #endif
772 /* Pv = Mh | ~(Xv | Ph); */
773 chunk_set(Pv, Xv);
774 chunk_or_in(Pv, Ph);
775 chunk_negate(Pv);
776 chunk_or_in(Pv, Mh);
777 #ifdef DEBUG
778 printf("Pv: "); print_chunk(Pv); printf("\n");
779 #endif
780 /* Mv = Ph & Xv; */
781 chunk_set(Mv, Ph);
782 chunk_and_in(Mv, Xv);
783 #ifdef DEBUG
784 printf("Mv: "); print_chunk(Mv); printf("\n");
785 #endif
786 #ifdef DEBUG
787 printf("%3d. score %d\n", j, Score);
788 #endif
789 if (Score <= threshold) {
790 if (continuing) {
791 if (Score < curscore &&
792 ((mode & AGREP_EXTEND_BETTER) ||
793 (mode & AGREP_EXTEND_SAME))) {
794 curscore = Score;
795 curlast = j;
796 } else if (Score == curscore &&
797 ((mode & AGREP_EXTEND_BETTER) ||
798 (mode & AGREP_EXTEND_SAME))) {
799 if (mode & AGREP_EXTEND_SAME) {
800 curlast = j;
801 }
802 } else {
803 continuing = 0;
804 match.score = curscore;
805 match.position = curlast;
806 match.length = -1;
807 cont = AGREP_CONTINUE;
808 (*cb)(cbinfo, &match, &cont);
809 if (cont != AGREP_CONTINUE)
810 goto EXIT;
811 }
812 } else if ((mode & AGREP_EXTEND_SAME) ||
813 (mode & AGREP_EXTEND_BETTER)) {
814 curscore = Score;
815 curlast = j;
816 continuing = 1;
817 } else {
818 match.score = Score;
819 match.position = j;
820 match.length = -1;
821 cont = AGREP_CONTINUE;
822 (*cb)(cbinfo, &match, &cont);
823 if (cont != AGREP_CONTINUE)
824 goto EXIT;
825 }
826 /* If we're no longer under the threshold, we might
827 have been moving forward looking for a better match
828 */
829 } else if (continuing) {
830 continuing = 0;
831 match.score = curscore;
832 match.position = curlast;
833 match.length = -1;
834 cont = AGREP_CONTINUE;
835 (*cb)(cbinfo, &match, &cont);
836 if (cont != AGREP_CONTINUE)
837 goto EXIT;
838 }
839 /* print_col_as_row(nxt, plen); */
840 }
841 if (continuing) {
842 continuing = 0;
843 match.score = curscore;
844 match.position = curlast;
845 match.length = -1;
846 (*cb)(cbinfo, &match, &cont);
847 }
848
849 EXIT:
850
851 free_chunk(Pv);
852 free_chunk(Mv);
853 free_chunk(Xv);
854 free_chunk(Xh);
855 free_chunk(Ph);
856 free_chunk(Mh);
857 }
858
859
MyersUnlimitedFindBest(MyersUnlimitedSearch * self,const char * text,size_t n,int32_t * pos,int32_t * len)860 LIB_EXPORT int32_t CC MyersUnlimitedFindBest ( MyersUnlimitedSearch *self,
861 const char* text, size_t n, int32_t *pos, int32_t *len )
862 {
863 const unsigned char *utext = (const unsigned char *)text;
864 CHUNK *Pv;
865 CHUNK *Mv;
866 CHUNK *Xv, *Xh, *Ph, *Mh;
867
868 int32_t m = self->m;
869 int32_t csize = chunksize(m);
870 int32_t Score;
871 int32_t BestScore = m;
872 int32_t from = 0;
873 int32_t to = -1;
874
875 int32_t j;
876
877 CHUNK *Eq;
878
879 Pv = alloc_chunk(csize);
880 Mv = alloc_chunk(csize);
881 Xv = alloc_chunk(csize);
882 Xh = alloc_chunk(csize);
883 Ph = alloc_chunk(csize);
884 Mh = alloc_chunk(csize);
885
886
887 Score = m;
888 chunk_set_minusone(Pv);
889 chunk_zero(Mv);
890
891 for(j = 0; j < n; j++) {
892 #ifdef DEBUG
893 printf("%d j loop\n", j);
894 #endif
895 Eq = self->PEq[utext[j]];
896 #ifdef DEBUG
897 printf("Eq: "); print_chunk(Eq); printf("\n");
898 #endif
899 /* Xv = Eq | Mv; */
900 chunk_set(Xv, Eq);
901 chunk_or_in(Xv, Mv);
902 #ifdef DEBUG
903 printf("Xv: "); print_chunk(Xv); printf("\n");
904 #endif
905 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
906 chunk_set(Xh, Eq);
907 chunk_and_in(Xh, Pv);
908 chunk_add_in(Xh, Pv);
909 chunk_xor_in(Xh, Pv);
910 chunk_or_in(Xh, Eq);
911 #ifdef DEBUG
912 printf("Xh: "); print_chunk(Xh); printf("\n");
913 #endif
914 /* Ph = Mv | ~ (Xh | Pv); */
915 chunk_set(Ph, Xh);
916 chunk_or_in(Ph, Pv);
917 chunk_negate(Ph);
918 chunk_or_in(Ph, Mv);
919 #ifdef DEBUG
920 printf("Ph: "); print_chunk(Ph); printf("\n");
921 #endif
922 /* Mh = Pv & Xh; */
923 chunk_set(Mh, Pv);
924 chunk_and_in(Mh, Xh);
925 #ifdef DEBUG
926 printf("Mh: "); print_chunk(Mh); printf("\n");
927 #endif
928 /* Ph & (1 << (m - 1)) */
929 if (chunk_isbit_set(Ph, m-1)) {
930 Score++;
931 /* Mh & (1 << (m - 1)) */
932 } else if (chunk_isbit_set(Mh, m-1)) {
933 Score--;
934 }
935 /* Ph <<= 1; */
936 chunk_lshift_one_inplace(Ph);
937 #ifdef DEBUG
938 printf("Ph: "); print_chunk(Ph); printf("\n");
939 #endif
940 /* Mh <<= 1; */
941 chunk_lshift_one_inplace(Mh);
942 #ifdef DEBUG
943 printf("Mh: "); print_chunk(Mh); printf("\n");
944 #endif
945 /* Pv = Mh | ~(Xv | Ph); */
946 chunk_set(Pv, Xv);
947 chunk_or_in(Pv, Ph);
948 chunk_negate(Pv);
949 chunk_or_in(Pv, Mh);
950 #ifdef DEBUG
951 printf("Pv: "); print_chunk(Pv); printf("\n");
952 #endif
953 /* Mv = Ph & Xv; */
954 chunk_set(Mv, Ph);
955 chunk_and_in(Mv, Xv);
956 #ifdef DEBUG
957 printf("Mv: "); print_chunk(Mv); printf("\n");
958 #endif
959 #ifdef DEBUG
960 printf("%3d. score %d\n", j, Score);
961 #endif
962 if (Score < BestScore) {
963 BestScore = Score;
964 to = j;
965 }
966 }
967
968 /* Re-initialize for next scan! */
969 Score = m;
970 chunk_set_minusone(Pv);
971 chunk_zero(Mv);
972
973 for(j = to; j >= 0; j--) {
974 Eq = self->PEq_R[utext[j]]; /* This line is different. */
975 /* Xv = Eq | Mv; */
976 chunk_set(Xv, Eq);
977 chunk_or_in(Xv, Mv);
978 /* Xh = (((Eq & Pv) + Pv) ^ Pv) | Eq; */
979 chunk_set(Xh, Eq);
980 chunk_and_in(Xh, Pv);
981 chunk_add_in(Xh, Pv);
982 chunk_xor_in(Xh, Pv);
983 chunk_or_in(Xh, Eq);
984 /* Ph = Mv | ~ (Xh | Pv); */
985 chunk_set(Ph, Xh);
986 chunk_or_in(Ph, Pv);
987 chunk_negate(Ph);
988 chunk_or_in(Ph, Mv);
989 /* Mh = Pv & Xh; */
990 chunk_set(Mh, Pv);
991 chunk_and_in(Mh, Xh);
992 /* Ph & (1 << (m - 1)) */
993 if (chunk_isbit_set(Ph, m-1)) {
994 Score++;
995 /* Mh & (1 << (m - 1)) */
996 } else if (chunk_isbit_set(Mh, m-1)) {
997 Score--;
998 }
999 /* Ph <<= 1; */
1000 chunk_lshift_one_inplace(Ph);
1001 /* Mh <<= 1; */
1002 chunk_lshift_one_inplace(Mh);
1003 /* Pv = Mh | ~(Xv | Ph); */
1004 chunk_set(Pv, Xv);
1005 chunk_or_in(Pv, Ph);
1006 chunk_negate(Pv);
1007 chunk_or_in(Pv, Mh);
1008 /* Mv = Ph & Xv; */
1009 chunk_set(Mv, Ph);
1010 chunk_and_in(Mv, Xv);
1011
1012 #ifdef DEBUG
1013 printf("%3d. score %d\n", j, Score);
1014 #endif
1015 if(Score <= BestScore) {
1016 #ifdef DEBUG
1017 printf("rev match at position %d\n", j);
1018 #endif
1019 from = j;
1020 break;
1021 }
1022 }
1023
1024 free_chunk(Pv);
1025 free_chunk(Mv);
1026 free_chunk(Xv);
1027 free_chunk(Xh);
1028 free_chunk(Ph);
1029 free_chunk(Mh);
1030
1031 *pos = from;
1032 *len = to-from+1;
1033
1034 return BestScore;
1035
1036 /* printf("Found [%d,%d]\n", from, to);
1037 printf("Found '%.*s'\n", to - from + 1, &text[from]); */
1038 }
1039
1040