1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 // Utility code for enumerating all strings in edit or Hamming distance 35 // around a given string. 36 // ========================================================================== 37 38 // TODO(holtgrew): It would be nice if the maximal distance would be given as a run time parameter. 39 // TODO(holtgrew): Document iterator? 40 41 #ifndef SEQAN_INCLUDE_MISC_EDIT_ENVIRONMENT_H_ 42 #define SEQAN_INCLUDE_MISC_EDIT_ENVIRONMENT_H_ 43 44 namespace seqan { 45 46 // ========================================================================== 47 // Forwards 48 // ========================================================================== 49 50 template <typename TDistanceSpec, unsigned DISTANCE /*= 1*/> 51 struct EditEnvironment; 52 53 // ========================================================================== 54 // Tags, Classes, Enums 55 // ========================================================================== 56 57 // -------------------------------------------------------------------------- 58 // Class StringEnumerator 59 // -------------------------------------------------------------------------- 60 61 /*! 62 * @class StringEnumerator 63 * @headerfile <seqan/misc/edit_environment.h> 64 * @brief Class to enumerate all strings within a given edit/Hamming distance. 65 * 66 * @signature template <typename TString, typename TSpec> 67 * class StringEnumerator<TString, TSpec>; 68 * 69 * @tparam TString Type of the string to enumerate the environment of. 70 * @tparam TSpec Specialization tag. 71 * 72 * 73 * @fn StringEnumerator::StringEnumerator 74 * @brief Constructor 75 * 76 * @signature StringEnumerator::StringEnumerator(string[, minDist]); 77 * 78 * @param[in] string The string to use as the center. Types: <tt>TString</tt>. 79 * @param[in] minDist The smallest distance to generate strings with. Type: <tt>unsigned</tt>. Default: 0 80 * 81 * 82 * @var bool StringEnumerator::trim 83 * @brief Indicate whether to ignore substitutions in first or last character of string in Levenshtein mode 84 * (optimization for approximate search). 85 * 86 * This is useful when searching for such enumerated strings in large texts. Patterns with substitutions in the first 87 * base would also be found. 88 * 89 * @section Examples 90 * 91 * @include demos/dox/misc/enumerate_strings.cpp 92 * 93 * @include demos/dox/misc/enumerate_strings.cpp.stdout 94 */ 95 96 /*! 97 * @class HammingStringEnumerator 98 * @extends StringEnumerator 99 * @headerfile <seqan/misc/edit_environment.h> 100 * @brief Enumerate all strings within a given edit distance of a "center string". 101 * 102 * @signature template <typename TString, unsigned DISTANCE> 103 * class StringEnumerator<TString, EditEnvironment<HammingDistance, DISTANCE> >; 104 * 105 * @tparam TString Type of the string to enumerate the environment of. 106 * @tparam DISTANCE The maximal distance to generate strings with. 107 * 108 * See @link StringEnumerator @endlink for examples. 109 */ 110 111 /*! 112 * @class LevenshteinStringEnumerator 113 * @extends StringEnumerator 114 * @headerfile <seqan/misc/edit_environment.h> 115 * @brief Enumerate all strings within a given edit distance of a "center string" (of edit distance < 3). 116 * 117 * @signature template <typename TString, unsigned DISTANCE> 118 * class StringEnumerator<TString, EditEnvironment<LevenshteinDistance, DISTANCE> >; 119 * 120 * @tparam TString Type of the string to enumerate the environment of. 121 * @tparam DISTANCE The maximal distance to generate strings with. 122 * 123 * See @link StringEnumerator @endlink for examples. 124 * 125 * @note The @link StringEnumerator#length LevenshteinStringEnumerator#length @endlink function does not work for 126 * <tt>DISTANCE > 2</tt>. 127 */ 128 129 template <typename TObject, typename TSpec> 130 class StringEnumerator 131 { 132 public: 133 Holder<TObject> data_host; 134 unsigned minDist; 135 bool trim; 136 StringEnumerator(TObject & _original)137 StringEnumerator(TObject & _original) : data_host(_original), minDist(0), trim(true) 138 {} 139 StringEnumerator(TObject & _original,unsigned _minDist)140 StringEnumerator(TObject & _original, unsigned _minDist) : data_host(_original), minDist(_minDist), trim(true) 141 {} 142 StringEnumerator(TObject const & _original)143 StringEnumerator(TObject const & _original) : data_host(_original), minDist(0), trim(true) 144 {} 145 StringEnumerator(TObject const & _original,unsigned _minDist)146 StringEnumerator(TObject const & _original, unsigned _minDist) : data_host(_original), minDist(_minDist), trim(true) 147 {} 148 }; 149 150 // -------------------------------------------------------------------------- 151 // Spec EditEnvironment Iter for Hamming Distance 152 // -------------------------------------------------------------------------- 153 154 template <typename TSize> 155 struct StringEnumeratorHammingModifier_ 156 { 157 TSize errorPos; // position of substitution 158 unsigned character; // replacement character 159 unsigned skipChar; // skip the original character 160 StringEnumeratorHammingModifier_StringEnumeratorHammingModifier_161 StringEnumeratorHammingModifier_() : errorPos(0), character(0), skipChar(0) 162 {} 163 }; 164 165 template <typename TObject, unsigned DISTANCE> 166 class Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> 167 { 168 public: 169 typedef typename Value<TObject>::Type TValue; 170 typedef typename Size<TObject>::Type TSize; 171 typedef typename MakeSigned_<TSize>::Type TSignedSize; 172 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 173 174 TObject & orig; 175 // typename RemoveConst_<TObject>::Type tmp; 176 String<TValue> tmp; 177 178 TModifier mod[DISTANCE]; 179 unsigned minDist; 180 bool trim; 181 Iter(TObject & _original)182 Iter(TObject & _original) : 183 orig(_original), 184 minDist(0), 185 trim(true) 186 { 187 goBegin(*this); 188 } 189 Iter(TObject & _original,unsigned _minDist,bool _trim)190 Iter(TObject & _original, unsigned _minDist, bool _trim) : 191 orig(_original), 192 minDist(_minDist), 193 trim(_trim) 194 { 195 goBegin(*this); 196 } 197 Iter(TObject & _original,MinimalCtor)198 Iter(TObject & _original, MinimalCtor) : 199 orig(_original), 200 minDist(0), 201 trim(true) {} 202 Iter(TObject & _original,unsigned _minDist,bool _trim,MinimalCtor)203 Iter(TObject & _original, unsigned _minDist, bool _trim, MinimalCtor) : 204 orig(_original), 205 minDist(_minDist), 206 trim(_trim) {} 207 }; 208 209 // -------------------------------------------------------------------------- 210 // Spec EditEnvironment Iter for Edit Distance 211 // -------------------------------------------------------------------------- 212 213 template <typename TSize> 214 struct StringEnumeratorLevenshteinModifier_ 215 { 216 enum TState {DISABLED_, SUBST_, DELETE_, INSERT_, Eof_}; 217 TSize errorPosOrig; // position of edit operation in original string 218 TSize errorPos; // position of edit operation in modified string 219 TSize errorPosEnd; // errorPos < errorPosEnd must be fulfilled 220 unsigned character; // replacement character 221 unsigned skipChar; // skip the original character 222 TState state; // current state subst/insert before/delete 223 StringEnumeratorLevenshteinModifier_StringEnumeratorLevenshteinModifier_224 StringEnumeratorLevenshteinModifier_() : 225 errorPosOrig(0), errorPos(0), errorPosEnd(0), character(0), skipChar(0), state(DISABLED_) 226 {} 227 }; 228 229 template <typename TObject, unsigned DISTANCE> 230 class Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> 231 { 232 public: 233 typedef typename Value<TObject>::Type TValue; 234 typedef typename Size<TObject>::Type TSize; 235 typedef typename MakeSigned_<TSize>::Type TSignedSize; 236 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 237 238 TObject & orig; 239 // typename RemoveConst_<TObject>::Type tmp; 240 String<TValue> tmp; 241 242 TModifier mod[DISTANCE + 1]; 243 unsigned minDist; 244 unsigned currentDistance; // upper bound for dist(original, *this) 245 bool trim; 246 Iter(TObject & _original)247 Iter(TObject & _original) : 248 orig(_original), 249 minDist(0), 250 currentDistance(0), 251 trim(true) 252 { 253 goBegin(*this); 254 } 255 Iter(TObject & _original,unsigned _minDist,bool _trim)256 Iter(TObject & _original, unsigned _minDist, bool _trim) : 257 orig(_original), 258 minDist(_minDist), 259 currentDistance(0), 260 trim(_trim) 261 { 262 goBegin(*this); 263 } 264 Iter(TObject & _original,MinimalCtor)265 Iter(TObject & _original, MinimalCtor) : 266 orig(_original), 267 minDist(0), 268 currentDistance(0), 269 trim(true) {} 270 Iter(TObject & _original,unsigned _minDist,bool _trim,MinimalCtor)271 Iter(TObject & _original, unsigned _minDist, bool _trim, MinimalCtor) : 272 orig(_original), 273 minDist(_minDist), 274 currentDistance(0), 275 trim(_trim) {} 276 _reinit(int pos,int posOrig)277 inline bool _reinit(int pos, int posOrig) 278 { 279 tmp = orig; 280 int posOrigEnd = length(tmp); 281 typename TModifier::TState lastState = TModifier::DISABLED_; 282 // i from high to low = modifier from left to right 283 for (int i = currentDistance - 1; i >= 0; --i) 284 { 285 TModifier & _mod = mod[i]; 286 switch (_mod.state) 287 { 288 case TModifier::SUBST_: 289 // INSERT+SUBST is SUBST+INSERT (already enumerated) 290 // DELETE+SUBST is SUBST+DELETE (already enumerated) 291 // eventually trim front SUBSTs 292 if (lastState == TModifier::INSERT_ || lastState == TModifier::DELETE_ || 293 (trim && posOrig == 0)) 294 { 295 ++posOrig; 296 ++pos; 297 } 298 if (posOrig >= posOrigEnd) 299 return false; 300 301 _mod.errorPosOrig = posOrig; 302 _mod.errorPos = pos; 303 _mod.skipChar = ordValue(orig[posOrig]); 304 _mod.character = (0 == _mod.skipChar) ? 1 : 0; 305 assignValue(tmp, pos, (TValue) _mod.character); 306 ++pos; 307 ++posOrig; 308 break; 309 310 case TModifier::DELETE_: 311 // INSERT after DELETE is one SUBST (already enumerated) 312 if (lastState == TModifier::INSERT_) 313 { 314 ++posOrig; 315 ++pos; 316 } 317 if (posOrig >= posOrigEnd) 318 return false; 319 320 _mod.errorPosOrig = posOrig; 321 _mod.errorPos = pos; 322 _mod.character = ValueSize<TValue>::VALUE - 1; 323 _mod.skipChar = -1; 324 ++posOrig; 325 erase(tmp, pos); 326 break; 327 328 case TModifier::INSERT_: 329 default: 330 // DELETE after INSERT is one SUBST (already enumerated) 331 // eventually trim front SUBSTs 332 if (lastState == TModifier::DELETE_ || (trim && posOrig == 0)) 333 { 334 ++posOrig; 335 ++pos; 336 } 337 if (posOrig > posOrigEnd) 338 return false; 339 340 _mod.errorPosOrig = posOrig; 341 _mod.errorPos = pos; 342 _mod.character = 0; 343 _mod.skipChar = -1; 344 insertValue(tmp, pos, (TValue)0); 345 ++pos; 346 break; 347 } 348 lastState = _mod.state; 349 } 350 351 pos = length(tmp); 352 bool cut = trim; 353 for (unsigned i = 0; i < currentDistance; ++i) 354 { 355 TModifier & _mod = mod[i]; 356 if (_mod.state != TModifier::DELETE_) 357 { 358 if (cut) 359 { 360 if (_mod.errorPos >= (TSignedSize)(pos - 1)) 361 return false; 362 363 _mod.errorPosEnd = pos - 1; 364 } 365 else 366 { 367 if (_mod.errorPos >= (TSignedSize)pos) 368 return false; 369 370 _mod.errorPosEnd = pos; 371 } 372 --pos; 373 } 374 else 375 { 376 cut = false; 377 if (_mod.errorPos > (TSignedSize)pos) 378 return false; 379 380 _mod.errorPosEnd = pos + 1; 381 } 382 } 383 return true; 384 } 385 386 }; 387 388 // ========================================================================== 389 // Metafunctions 390 // ========================================================================== 391 392 // -------------------------------------------------------------------------- 393 // Metafunction Value StringEnumerator 394 // -------------------------------------------------------------------------- 395 396 /*! 397 * @mfn StringEnumerator#Value 398 * @brief Return value type of the string to enumerate. 399 * 400 * @signature Value<TStringEnumerator>::Type; 401 */ 402 403 template <typename TObject, typename TSpec> 404 struct Value<StringEnumerator<TObject, TSpec> > : Value<TObject> 405 {}; 406 407 // -------------------------------------------------------------------------- 408 // Metafunction Reference StringEnumerator 409 // -------------------------------------------------------------------------- 410 411 /*! 412 * @mfn StringEnumerator#Reference 413 * @brief Returns reference type of the enumerated strings. 414 * 415 * @signature Reference<TStringEnumerator>::Type; 416 */ 417 418 template <typename TObject, typename TSpec> 419 struct Reference<StringEnumerator<TObject, TSpec> > 420 { 421 typedef TObject & Type; 422 }; 423 424 template <typename TObject, typename TSpec> 425 struct Reference<StringEnumerator<TObject, TSpec> const> 426 { 427 typedef TObject const & Type; 428 }; 429 430 // -------------------------------------------------------------------------- 431 // Metafunction Size StringEnumerator 432 // -------------------------------------------------------------------------- 433 434 /*! 435 * @mfn StringEnumerator#Size 436 * @brief Returns size type. 437 * 438 * @signature Size<TStringEnumerator>::Type; 439 */ 440 441 template <typename TObject, typename TSpec> 442 struct Size<StringEnumerator<TObject, TSpec> > : Size<TObject> 443 {}; 444 445 // -------------------------------------------------------------------------- 446 // Metafunction Difference StringEnumerator 447 // -------------------------------------------------------------------------- 448 449 /*! 450 * @mfn StringEnumerator#Difference 451 * @brief Returns difference type. 452 * 453 * @signature Difference<TStringEnumerator>::Type; 454 */ 455 456 template <typename TObject, typename TSpec> 457 struct Difference<StringEnumerator<TObject, TSpec> > : Difference<TObject> 458 {}; 459 460 // -------------------------------------------------------------------------- 461 // Metafunction Position StringEnumerator 462 // -------------------------------------------------------------------------- 463 464 /*! 465 * @mfn StringEnumerator#Position 466 * @brief Returns position type. 467 * 468 * @signature Position<TStringEnumerator>::Type; 469 */ 470 471 template <typename TObject, typename TSpec> 472 struct Position<StringEnumerator<TObject, TSpec> > : Position<TObject> 473 {}; 474 475 // -------------------------------------------------------------------------- 476 // Metafunction Iterator StringEnumerator 477 // -------------------------------------------------------------------------- 478 479 /*! 480 * @mfn StringEnumerator#Iterator 481 * @brief Returns iterator type. 482 * 483 * @signature Position<TStringEnumerator, TSpec>::Type; 484 */ 485 486 template <typename TObject, typename TSpec> 487 struct Iterator<StringEnumerator<TObject, TSpec>, Standard> 488 { 489 typedef Iter<StringEnumerator<TObject, TSpec>, Standard> Type; 490 }; 491 492 template <typename TObject, typename TSpec> 493 struct Iterator<StringEnumerator<TObject, TSpec> const, Standard> 494 { 495 typedef Iter<StringEnumerator<TObject, TSpec>, Standard> Type; 496 }; 497 498 // -------------------------------------------------------------------------- 499 // Metafunction Host StringEnumerator 500 // -------------------------------------------------------------------------- 501 502 /*! 503 * @mfn StringEnumerator#Host 504 * @brief Returns host type. 505 * 506 * @signature Host<TStringEnumerator>::Type; 507 */ 508 509 template <typename TObject, typename TSpec> 510 struct Host<StringEnumerator<TObject, TSpec> > 511 { 512 typedef TObject Type; 513 }; 514 515 template <typename TObject, typename TSpec> 516 struct Host<StringEnumerator<TObject, TSpec> const> 517 { 518 typedef TObject const Type; 519 }; 520 521 // ========================================================================== 522 // Functions 523 // ========================================================================== 524 525 // -------------------------------------------------------------------------- 526 // Function _dataHost() StringEnumerator 527 // -------------------------------------------------------------------------- 528 529 template <typename TText, typename TSpec> 530 inline Holder<TText> & 531 _dataHost(StringEnumerator<TText, TSpec> & enumerator) 532 { 533 return enumerator.data_host; 534 } 535 536 template <typename TText, typename TSpec> 537 inline Holder<TText> const & 538 _dataHost(StringEnumerator<TText, TSpec> const & enumerator) 539 { 540 return enumerator.data_host; 541 } 542 543 // -------------------------------------------------------------------------- 544 // Function begin() StringEnumerator 545 // -------------------------------------------------------------------------- 546 547 /*! 548 * @fn StringEnumerator#begin 549 * @brief Return begin iterator. 550 * 551 * @signature TIter begin(stringEnum[, tag]); 552 * 553 * @param[in] stringEnum StringEnumerator to query. 554 * @param[in] tag Iterator tag to use. 555 * 556 * @return TIter Iterator to the first string in the enumerator. 557 */ 558 559 template <typename TObject, typename TSpec> 560 inline Iter<StringEnumerator<TObject, TSpec>, Standard> 561 begin(StringEnumerator<TObject, TSpec> & enumerator, Standard) 562 { 563 return Iter<StringEnumerator<TObject, TSpec>, Standard>(host(enumerator), enumerator.minDist, enumerator.trim); 564 } 565 566 template <typename TObject, typename TSpec> 567 inline Iter<StringEnumerator<TObject, TSpec>, Standard> 568 begin(StringEnumerator<TObject, TSpec> const & enumerator, Standard) 569 { 570 return Iter<StringEnumerator<TObject, TSpec>, Standard>(host(enumerator), enumerator.minDist, enumerator.trim); 571 } 572 573 // -------------------------------------------------------------------------- 574 // Function end() StringEnumerator 575 // -------------------------------------------------------------------------- 576 577 /*! 578 * @fn StringEnumerator#end 579 * @brief Return end iterator. 580 * 581 * @signature TIter end(stringEnum[, tag]); 582 * 583 * @param[in] stringEnum StringEnumerator to query. 584 * @param[in] tag Iterator tag to use. 585 * 586 * @return TIter End iterator for the string enumerator. 587 */ 588 589 template <typename TObject, typename TSpec> 590 inline Iter<StringEnumerator<TObject, TSpec>, Standard> 591 end(StringEnumerator<TObject, TSpec> & enumerator, Standard) 592 { 593 Iter<StringEnumerator<TObject, TSpec>, Standard> iter(host(enumerator), enumerator.minDist, enumerator.trim, MinimalCtor()); 594 goEnd(iter); 595 return iter; 596 } 597 598 template <typename TObject, typename TSpec> 599 inline Iter<StringEnumerator<TObject, TSpec>, Standard> 600 end(StringEnumerator<TObject, TSpec> const & enumerator, Standard) 601 { 602 Iter<StringEnumerator<TObject, TSpec>, Standard> iter(host(enumerator), enumerator.minDist, enumerator.trim, MinimalCtor()); 603 goEnd(iter); 604 return iter; 605 } 606 607 // -------------------------------------------------------------------------- 608 // Function length() Hamming StringEnumerator 609 // -------------------------------------------------------------------------- 610 611 /*! 612 * @fn StringEnumerator#length 613 * @brief Return number of strings that will be enumerated. 614 * 615 * @signature TSize length(stringEnum); 616 * 617 * @param[in] stringEnum StringEnumerator to query. 618 * 619 * @return TSize The number of elements in the enumerator (Metafunction: @link StringEnumerator#Size @endlink). 620 */ 621 622 template <typename TObject, unsigned DISTANCE> 623 inline typename Size<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> > >::Type 624 length(StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> > const & me) 625 { 626 typedef typename Value<TObject>::Type TValue; 627 typedef typename Size<TObject>::Type TSize; 628 629 static const unsigned alphabetSize = ValueSize<TValue>::VALUE; 630 TSize sum = 0; 631 TSize numerator = 1; 632 TSize alpha = 1; 633 TSize divisor = 1; 634 635 for (unsigned i = 0; i <= DISTANCE; ++i) 636 { 637 if (i >= me.minDist) 638 sum += alpha * (numerator / divisor); 639 640 divisor *= i + 1; 641 numerator *= length(host(me)) - i; 642 alpha *= alphabetSize - 1; 643 } 644 645 return sum; 646 } 647 648 // -------------------------------------------------------------------------- 649 // Function operator*() StringEnumerator Iter 650 // -------------------------------------------------------------------------- 651 652 template <typename TObject, typename TSpec> 653 inline TObject const & 654 operator*(Iter<StringEnumerator<TObject, TSpec>, Standard> & it) 655 { 656 return it.tmp; 657 } 658 659 template <typename TObject, typename TSpec> 660 inline TObject const & 661 operator*(Iter<StringEnumerator<TObject, TSpec>, Standard> const & it) 662 { 663 return it.tmp; 664 } 665 666 // -------------------------------------------------------------------------- 667 // Function goBegin() Hamming StringEnumerator Iter 668 // -------------------------------------------------------------------------- 669 670 template <typename TObject, unsigned DISTANCE> 671 inline void 672 goBegin(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & it) 673 { 674 typedef typename Value<TObject>::Type TValue; 675 typedef typename Size<TObject>::Type TSize; 676 typedef typename MakeSigned_<TSize>::Type TSignedSize; 677 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 678 679 if (empty(it.orig) || it.minDist > DISTANCE || it.minDist > length(it.orig)) 680 { 681 goEnd(it); 682 return; 683 } 684 685 it.tmp = it.orig; 686 687 unsigned i = 0; 688 unsigned mDist = it.minDist; 689 690 if (mDist > length(it.orig)) 691 mDist = length(it.orig); 692 693 if (mDist == 0) 694 { 695 it.mod[0].errorPos = 0; 696 it.mod[0].skipChar = -1; 697 it.mod[0].character = 0; 698 assignValue(it.tmp, 0, (TValue) 0); 699 i = 1; 700 } 701 else 702 for (; i < mDist; ++i) 703 { 704 TModifier & mod = it.mod[i]; 705 mod.errorPos = (mDist - 1) - i; 706 mod.skipChar = ordValue(it.orig[mod.errorPos]); 707 mod.character = (0 == mod.skipChar) ? 1 : 0; 708 assignValue(it.tmp, mod.errorPos, (TValue) mod.character); 709 } 710 for (; i < DISTANCE; ++i) 711 { 712 TModifier & mod = it.mod[i]; 713 mod.errorPos = -1; 714 mod.character = 0; 715 mod.skipChar = -1; 716 } 717 } 718 719 // -------------------------------------------------------------------------- 720 // Function atBegin() Hamming StringEnumerator Iter 721 // -------------------------------------------------------------------------- 722 723 template <typename TObject, unsigned DISTANCE> 724 inline void 725 atBegin(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & it) 726 { 727 Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> tmp(it.orig, it.minDist, it.trim); 728 return tmp == it; 729 } 730 731 template <typename TObject, unsigned DISTANCE> 732 inline bool 733 atBegin(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & it) 734 { 735 return atBegin(const_cast<Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const &>(it)); 736 } 737 738 // -------------------------------------------------------------------------- 739 // Function goEnd() StringEnumerator Iter Hamming 740 // -------------------------------------------------------------------------- 741 742 template <typename TObject, unsigned DISTANCE> 743 inline void 744 goEnd(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & it) 745 { 746 typedef typename Size<TObject>::Type TSize; 747 typedef typename MakeSigned_<TSize>::Type TSignedSize; 748 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 749 750 for (unsigned i = 0; i < DISTANCE; ++i) 751 { 752 TModifier & mod = it.mod[i]; 753 mod.errorPos = -1; 754 mod.character = 0; 755 } 756 } 757 758 // -------------------------------------------------------------------------- 759 // Function atEnd() Hamming StringEnumerator Iter 760 // -------------------------------------------------------------------------- 761 762 template <typename TObject, unsigned DISTANCE> 763 inline bool 764 atEnd(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & it) 765 { 766 typedef typename Size<TObject>::Type TSize; 767 typedef typename MakeSigned_<TSize>::Type TSignedSize; 768 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 769 770 for (unsigned i = 0; i < DISTANCE; ++i) 771 { 772 TModifier const & mod = it.mod[i]; 773 if (mod.errorPos != (TSignedSize) - 1 || mod.character != 0u) 774 return false; 775 } 776 return true; 777 } 778 779 template <typename TObject, unsigned DISTANCE> 780 inline bool 781 atEnd(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & it) 782 { 783 return atEnd(const_cast<Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const &>(it)); 784 } 785 786 // -------------------------------------------------------------------------- 787 // Function operator++() Hamming StringEnumerator Iter 788 // -------------------------------------------------------------------------- 789 790 template <typename TObject, unsigned DISTANCE> 791 inline Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & 792 operator++(Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> & it) 793 { 794 typedef typename Value<TObject>::Type TValue; 795 typedef typename Size<TObject>::Type TSize; 796 typedef typename MakeSigned_<TSize>::Type TSignedSize; 797 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 798 799 for (unsigned i = 0; true; ) 800 { 801 TModifier * mod = &it.mod[i]; 802 803 // next replacement value 804 if (++mod->character < ValueSize<TValue>::VALUE) 805 { 806 // output the original tuple only once 807 if (mod->character == mod->skipChar) 808 continue; 809 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 810 break; 811 } 812 mod->character = (0 == mod->skipChar) ? 1 : 0; 813 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 814 815 if (++i == DISTANCE || (mod + 1)->errorPos == (TSignedSize) - 1) 816 { 817 for (i = 0; i < DISTANCE; ++i) 818 { 819 mod = &it.mod[i]; 820 821 // restore char at old position 822 if (mod->errorPos >= 0) 823 { 824 // std::cout << "org" << it.orig << " tmp" << it.tmp << " ___ "; 825 assignValue(it.tmp, mod->errorPos, it.orig[mod->errorPos]); 826 // std::cout << "org" << it.orig << " tmp" << it.tmp << std::endl; 827 } 828 829 // next error position 830 if (++(mod->errorPos) < (TSignedSize)(length(it.tmp) - i)) 831 { 832 mod->skipChar = ordValue(it.orig[mod->errorPos]); 833 mod->character = (0 == mod->skipChar) ? 1 : 0; 834 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 835 836 for (; i > 0; ) 837 { 838 it.mod[i - 1].errorPos = mod->errorPos + 1; 839 mod = &it.mod[--i]; 840 mod->skipChar = ordValue(it.orig[mod->errorPos]); 841 mod->character = (0 == mod->skipChar) ? 1 : 0; 842 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 843 } 844 return it; 845 } 846 } 847 // end 848 for (i = 0; i < DISTANCE; ++i) 849 { 850 mod = &it.mod[i]; 851 mod->errorPos = -1; 852 mod->character = 0; 853 } 854 return it; 855 } 856 } 857 return it; 858 } 859 860 // -------------------------------------------------------------------------- 861 // Function goBegin() Levensthein StringEnumerator Iter 862 // -------------------------------------------------------------------------- 863 864 template <typename TObject, unsigned DISTANCE> 865 inline void 866 goBegin(Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> & it) 867 { 868 typedef typename Value<TObject>::Type TValue; 869 typedef typename Size<TObject>::Type TSize; 870 typedef typename MakeSigned_<TSize>::Type TSignedSize; 871 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 872 873 if (empty(it.orig) || it.minDist > DISTANCE || it.minDist > length(it.orig)) 874 { 875 goEnd(it); 876 return; 877 } 878 879 it.tmp = it.orig; 880 for (unsigned i = 0; i <= DISTANCE; ++i) 881 { 882 it.mod[i].errorPosOrig = -1; 883 it.mod[i].errorPos = -1; 884 it.mod[i].errorPosEnd = -1; 885 it.mod[i].character = ValueSize<TValue>::VALUE - 1; 886 it.mod[i].state = TModifier::DISABLED_; 887 } 888 it.currentDistance = it.minDist; 889 if (!it._reinit(0, 0)) 890 goEnd(it); 891 } 892 893 // -------------------------------------------------------------------------- 894 // Function goEnd() Levensthein StringEnumerator Iter 895 // -------------------------------------------------------------------------- 896 897 template <typename TObject, unsigned DISTANCE> 898 inline void 899 goEnd(Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> & it) 900 { 901 typedef typename Size<TObject>::Type TSize; 902 typedef typename MakeSigned_<TSize>::Type TSignedSize; 903 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 904 905 for (unsigned i = 0; i < DISTANCE; ++i) 906 { 907 TModifier & mod = it.mod[i]; 908 mod.errorPos = -1; 909 mod.character = 0; 910 mod.state = TModifier::SUBST_; 911 } 912 } 913 914 // -------------------------------------------------------------------------- 915 // Function atEnd() Edit StringEnumerator Iter 916 // -------------------------------------------------------------------------- 917 918 template <typename TObject, unsigned DISTANCE> 919 inline bool 920 atEnd(Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const & it) 921 { 922 typedef typename Size<TObject>::Type TSize; 923 typedef typename MakeSigned_<TSize>::Type TSignedSize; 924 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 925 926 for (unsigned i = 0; i < DISTANCE; ++i) 927 { 928 TModifier const & mod = it.mod[i]; 929 if (mod.errorPos != -1 || mod.character != 0u || mod.state != TModifier::SUBST_) 930 return false; 931 } 932 return true; 933 } 934 935 template <typename TObject, unsigned DISTANCE> 936 inline bool 937 atEnd(Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> & it) 938 { 939 return atEnd(const_cast<Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const &>(it)); 940 } 941 942 // -------------------------------------------------------------------------- 943 // Function operator++() Levensthein StringEnumerator Iter 944 // -------------------------------------------------------------------------- 945 946 template <typename TObject, unsigned DISTANCE> 947 inline Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> & 948 operator++(Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> & it) 949 { 950 typedef typename Value<TObject>::Type TValue; 951 typedef typename Size<TObject>::Type TSize; 952 typedef typename MakeSigned_<TSize>::Type TSignedSize; 953 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 954 typedef typename TModifier::TState TState; 955 956 // increment characters 957 TModifier * mod = it.mod; 958 do 959 { 960 // next replacement/insert value (loop core) 961 if (++(mod->character) < ValueSize<TValue>::VALUE) 962 { 963 // output the original tuple only once 964 if (mod->character == mod->skipChar) 965 continue; 966 if (mod->errorPos >= 0) 967 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 968 return it; 969 } 970 971 // reset counter 972 if (mod->state != mod->DELETE_) 973 { 974 mod->character = (0 == mod->skipChar) ? 1 : 0; 975 if (mod->errorPos >= 0) 976 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 977 } 978 979 // next modifier 980 ++mod; 981 } 982 while (mod->state != TModifier::DISABLED_); 983 984 // increment positions 985 mod = it.mod; 986 do 987 { 988 // restore char at old position 989 if (mod->errorPos >= 0 && static_cast<unsigned>(mod->errorPos) < length(it.tmp)) 990 assignValue(it.tmp, mod->errorPos, it.orig[mod->errorPosOrig]); 991 992 // int iMax = (TSignedSize)(length(it.tmp) - i); 993 // if (mod->state == mod->INSERT_) ++iMax; 994 995 // next error position 996 if (++(mod->errorPos) < mod->errorPosEnd) 997 { 998 ++(mod->errorPosOrig); 999 1000 // set next char 1001 if (mod == it.mod) // for the first modifier we use an optimization 1002 { 1003 if (mod->state != mod->DELETE_) 1004 { 1005 if (mod->state == mod->SUBST_) 1006 mod->skipChar = ordValue(it.orig[mod->errorPosOrig]); 1007 else 1008 mod->skipChar = -1; 1009 mod->character = (0 == mod->skipChar) ? 1 : 0; 1010 assignValue(it.tmp, mod->errorPos, (TValue) mod->character); 1011 } 1012 } 1013 else if (!it._reinit(mod->errorPos, mod->errorPosOrig)) 1014 break; 1015 1016 return it; 1017 } 1018 ++mod; 1019 } 1020 while (mod->state != TModifier::DISABLED_); 1021 1022 // increment states 1023 mod = it.mod; 1024 TModifier * modEnd = mod + DISTANCE; 1025 do 1026 { 1027 // next edit state (subst->delete->insert) 1028 if (mod->state != mod->INSERT_) 1029 { 1030 mod->state = (TState)(mod->state + 1); 1031 if (mod->state == TModifier::SUBST_) 1032 ++it.currentDistance; 1033 1034 if (!it._reinit(0, 0)) 1035 { 1036 mod = it.mod; 1037 continue; 1038 } 1039 return it; 1040 } 1041 else 1042 mod->state = mod->SUBST_; 1043 ++mod; 1044 } 1045 while (mod != modEnd); 1046 1047 // end 1048 goEnd(it); 1049 return it; 1050 } 1051 1052 // -------------------------------------------------------------------------- 1053 // Function length() Levensthein StringEnumerator Iter 1054 // -------------------------------------------------------------------------- 1055 1056 template <typename TObject, unsigned DISTANCE> 1057 inline typename Size<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> > >::Type 1058 length(StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> > const & me) 1059 { 1060 typedef typename Value<TObject>::Type TValue; 1061 typedef typename Size<TObject>::Type TSize; 1062 1063 static const unsigned alpha = ValueSize<TValue>::VALUE; 1064 TSize sum = 0; 1065 // TSize numerator = 1; 1066 // TSize divisor = 1; 1067 TSize len = length(host(me)); 1068 1069 if (me.minDist == 0 && len > 0) 1070 ++sum; 1071 1072 if (me.minDist <= 1 && DISTANCE >= 1) 1073 { 1074 if (me.trim) 1075 { 1076 if (len > 2) 1077 sum += (alpha - 1) * (len - 2); // substitutions 1078 sum += len; // deletions 1079 if (len > 1) 1080 sum += alpha * (len - 1); // inserts 1081 } 1082 else 1083 { 1084 sum += (alpha - 1) * len; // substitutions 1085 sum += len; // deletions 1086 sum += alpha * (len + 1); // inserts 1087 } 1088 } 1089 1090 if (me.minDist <= 2 && DISTANCE >= 2) 1091 { 1092 if (me.trim) 1093 { 1094 sum += (alpha - 1) * (alpha - 1) * len * (len - 1) / 2; // subst^2 1095 sum += (alpha - 1) * (len - 1) * (len - 1); // subst*del 1096 sum += (alpha - 1) * alpha * len * (len + 1); // subst*ins 1097 sum += len * (len - 1) / 2; // del^2 1098 sum += alpha * alpha * (len + 1) * (len + 1) / 2; // ins^2 1099 } 1100 else 1101 { 1102 sum += (alpha - 1) * (alpha - 1) * len * (len - 1) / 2; // subst^2 1103 sum += (alpha - 1) * (len - 1) * (len - 1); // subst*del 1104 sum += (alpha - 1) * alpha * len * (len + 1); // subst*ins 1105 sum += len * (len - 1) / 2; // del^2 1106 sum += alpha * alpha * (len + 1) * (len + 1) / 2; // ins^2 1107 } 1108 } 1109 1110 // TODO: length function for DISTANCE >= 3 (if anyone needs should this) 1111 if (DISTANCE > 2u) 1112 SEQAN_FAIL("length() not implemented for DISTANCE >= 3!"); 1113 1114 return sum; 1115 } 1116 1117 // -------------------------------------------------------------------------- 1118 // Function operator==() Hamming StringEnumerator Iter 1119 // -------------------------------------------------------------------------- 1120 1121 template <typename TObject, unsigned DISTANCE> 1122 inline bool 1123 operator==( 1124 Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & a, 1125 Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & b) 1126 { 1127 typedef typename Size<TObject>::Type TSize; 1128 typedef typename MakeSigned_<TSize>::Type TSignedSize; 1129 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 1130 1131 if (&a.orig != &b.orig) 1132 return false; 1133 1134 for (unsigned i = 0; i < DISTANCE; ++i) 1135 { 1136 TModifier const & modA = a.mod[i]; 1137 TModifier const & modB = b.mod[i]; 1138 if (modA.errorPos != modB.errorPos || modA.character != modB.character) 1139 return false; 1140 } 1141 1142 return true; 1143 } 1144 1145 // -------------------------------------------------------------------------- 1146 // Function operator!=() Hamming StringEnumerator Iter 1147 // -------------------------------------------------------------------------- 1148 1149 template <typename TObject, unsigned DISTANCE> 1150 inline bool 1151 operator!=( 1152 Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & a, 1153 Iter<StringEnumerator<TObject, EditEnvironment<HammingDistance, DISTANCE> >, Standard> const & b) 1154 { 1155 typedef typename Size<TObject>::Type TSize; 1156 typedef typename MakeSigned_<TSize>::Type TSignedSize; 1157 typedef StringEnumeratorHammingModifier_<TSignedSize> TModifier; 1158 1159 if (&a.orig != &b.orig) 1160 return true; 1161 1162 for (unsigned i = 0; i < DISTANCE; ++i) 1163 { 1164 TModifier const & modA = a.mod[i]; 1165 TModifier const & modB = b.mod[i]; 1166 if (modA.errorPos != modB.errorPos || modA.character != modB.character) 1167 return true; 1168 } 1169 1170 return false; 1171 } 1172 1173 // -------------------------------------------------------------------------- 1174 // Function operator==() Levensthein StringEnumerator Iter 1175 // -------------------------------------------------------------------------- 1176 1177 template <typename TObject, unsigned DISTANCE> 1178 inline bool 1179 operator==( 1180 Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const & a, 1181 Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const & b) 1182 { 1183 typedef typename Size<TObject>::Type TSize; 1184 typedef typename MakeSigned_<TSize>::Type TSignedSize; 1185 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 1186 1187 if (&a.orig != &b.orig) 1188 return false; 1189 1190 for (unsigned i = 0; i < DISTANCE; ++i) 1191 { 1192 TModifier const & modA = a.mod[i]; 1193 TModifier const & modB = b.mod[i]; 1194 if (modA.errorPos != modB.errorPos || 1195 modA.character != modB.character || 1196 modA.state != modB.state) 1197 return false; 1198 } 1199 1200 return true; 1201 } 1202 1203 // -------------------------------------------------------------------------- 1204 // Function operator!=() Levenshtein StringEnumerator Iter 1205 // -------------------------------------------------------------------------- 1206 1207 template <typename TObject, unsigned DISTANCE> 1208 inline bool 1209 operator!=( 1210 Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const & a, 1211 Iter<StringEnumerator<TObject, EditEnvironment<LevenshteinDistance, DISTANCE> >, Standard> const & b) 1212 { 1213 typedef typename Size<TObject>::Type TSize; 1214 typedef typename MakeSigned_<TSize>::Type TSignedSize; 1215 typedef StringEnumeratorLevenshteinModifier_<TSignedSize> TModifier; 1216 1217 if (&a.orig != &b.orig) 1218 return true; 1219 1220 for (unsigned i = 0; i < DISTANCE; ++i) 1221 { 1222 TModifier const & modA = a.mod[i]; 1223 TModifier const & modB = b.mod[i]; 1224 if (modA.errorPos != modB.errorPos || 1225 modA.character != modB.character || 1226 modA.state != modB.state) 1227 return true; 1228 } 1229 1230 return false; 1231 } 1232 1233 } // namespace seqan 1234 1235 #endif // #ifndef SEQAN_INCLUDE_MISC_EDIT_ENVIRONMENT_H_ 1236 1237 // LocalWords: StringEnumerator 1238