1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: David Weese <david.weese@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef SEQAN_HEADER_INDEX_QGRAM_H 36 #define SEQAN_HEADER_INDEX_QGRAM_H 37 38 namespace SEQAN_NAMESPACE_MAIN 39 { 40 41 42 ////////////////////////////////////////////////////////////////////////////// 43 // q-gram index fibres 44 45 /** 46 .Tag.QGram Index Fibres 47 ..summary:Tag to select a specific fibre (e.g. table, object, ...) of a @Spec.IndexQGram.q-gram Index@. 48 ..remarks:These tags can be used to get @Metafunction.Fibre.Fibres@ of a @Spec.IndexQGram.q-gram Index@. 49 ..cat:Index 50 51 ..tag.QGramText:The original text the index should be based on. 52 53 ..tag.QGram_RawText:The concatenation of all text sequences. 54 ...remarks:$QGramText$ and $QGram_RawText$ fibres are equal by default. 55 They differ if the index text is a set of strings. Then, raw text is the concatenation of all strings in this set. 56 57 ..tag.QGramSA:The suffix array. 58 ...remarks:Contains all occurrences of q-grams, s.t. the occurrences of a single q-gram are stored in a contiguous block (q-gram bucket). 59 q-grams exceeding the end of the text are ignored. 60 The beginning of each bucket can be determined by the q-gram directory ($QGramDir$, see below). 61 ...remarks:It corresponds to a suffix array which is sorted by the first q-gram. 62 ...remarks:@Metafunction.Fibre@ returns a @Class.String@ over the alphabet of the @Metafunction.SAValue@ of $TIndex$. 63 64 ..tag.QGramDir:The directory/hash table. 65 ...remarks:The directory contains for every possible q-gram hash value the start index of the q-gram bucket. 66 A q-gram bucket is a contiguous interval in the suffix array ($QGramSA$, see above). 67 Each suffix in this interval begins with the same q-gram. 68 The end index is the start index of the next bucket. 69 ...remarks:@Metafunction.Fibre@ returns a @Class.String@ over the alphabet of a size type. 70 71 ..tag.QGramBucketMap:Maps q-gram hashes to buckets. 72 This fibre is used by the @Spec.OpenAddressing@ index and stores all parameters of the open addressing hash function and hash value occupancy in the QGramDir fibre. 73 In contrast to @Spec.OpenAddressing@, @Spec.IndexQGram@ uses a trivial 1-to-1 mapping from q-gram hash values to buckets. 74 For that index the fibre is of type @Tag.Nothing@. 75 76 ..tag.QGramCounts:The counts array. 77 ...remarks:Contains the numbers of occurrences per sequence of each q-gram, s.t. the numbers of the same q-gram are stored in a contiguous block (q-gram count bucket). 78 A bucket contains entries (seqNo,count) of sequences with at least one q-gram occurrence. q-grams exceeding the end of the text are ignored. 79 The beginning of each count bucket can be determined by the q-gram counts directory ($QGramCountsDir$, see below). 80 ...remarks:@Metafunction.Fibre@ returns a @Class.String@ over the alphabet of the @Metafunction.SAValue@ of $TIndex$. 81 82 ..tag.QGramCountsDir:The counts directory. 83 ...remarks:The counts directory contains for every possible q-gram hash value the start index of the q-gram count bucket. 84 A q-gram count bucket is a contiguous interval in the counts array ($QGramCounts$, see above). 85 The end index is the start index of the next bucket. 86 ...remarks:@Metafunction.Fibre@ returns a @Class.String@ over the alphabet of a size type. 87 88 ..tag.QGramShape:The shape the index is based on. 89 ...remarks:The q-gram index needs an underlying @Class.Shape@. This shape can be gapped or ungapped. 90 The number of '1's (relevant positions) in the shape determines $q$ and the size of the directory table. 91 ...remarks:Dynamic shapes (@Spec.SimpleShape@, @Spec.GenericShape@, ...) must be initialized before the index can be used. 92 93 ..tag.QGramSADir:The union of suffix array and directory. 94 ...remarks:In most applications a q-gram index consisting of both of these table is required. 95 To efficiently create them at once use this tag for @Function.indexRequire@ or @Function.indexCreate@. 96 97 ..see:Metafunction.Fibre 98 ..see:Function.getFibre 99 ..see:Spec.IndexQGram 100 ..include:seqan/index.h 101 */ 102 103 struct FibreDir_; // directory/hash table, contains start indices of buckets 104 struct FibreSADir_; // identifies algorithm to construct both SA and directory at once 105 struct Fibre_Shape_; // underlying shape 106 struct FibreCounts_; // counts each q-gram 107 struct FibreCountsDir_; // directory for counts buckets 108 struct FibreBucketMap_; // stores a q-gram hash value for each directory entry (-1 if empty) 109 110 typedef Tag<FibreDir_> const FibreDir; 111 typedef Tag<FibreSADir_> const FibreSADir; 112 typedef Tag<Fibre_Shape_> const FibreShape; 113 typedef Tag<FibreCounts_> const FibreCounts; 114 typedef Tag<FibreCountsDir_> const FibreCountsDir; 115 typedef Tag<FibreBucketMap_> const FibreBucketMap; 116 117 ////////////////////////////////////////////////////////////////////////////// 118 119 typedef FibreText QGramText; 120 typedef FibreRawText QGram_RawText; 121 typedef FibreSA QGramSA; 122 typedef FibreRawSA QGramRawSA; 123 typedef FibreDir QGramDir; 124 typedef FibreSADir QGramSADir; 125 typedef FibreShape QGramShape; 126 typedef FibreCounts QGramCounts; 127 typedef FibreCountsDir QGramCountsDir; 128 typedef FibreBucketMap QGramBucketMap; 129 130 ////////////////////////////////////////////////////////////////////////////// 131 // q-gram index 132 133 /** 134 .Spec.IndexQGram: 135 ..summary:An index based on an array of sorted q-grams. 136 ..cat:Index 137 ..general:Class.Index 138 ..signature:Index<TText, IndexQGram<TShapeSpec[, TSpec]> > 139 ..param.TText:The text type. 140 ...type:Class.String 141 ..param.TShapeSpec:The @Class.Shape@ specialization type. 142 ...note:This can be either a $TSpec$ argument (e.g. $SimpleShape$) or a complete @Class.Shape@ class (e.g. Shape<Dna, SimpleShape>). 143 ..param.TSpec:The specializing type. 144 ...default:Default 145 ...type:Spec.OpenAddressing 146 ..remarks:The fibres (see @Class.Index@ and @Metafunction.Fibre@) of this index are a suffix array sorted by the first q characters (see @Tag.QGram Index Fibres.QGramSA@) and a q-gram directory (see @Tag.QGram Index Fibres.QGramDir@). 147 ..include:seqan/index.h 148 */ 149 150 template < typename TShapeSpec, typename TSpec = Default > 151 struct IndexQGram {}; 152 153 // use the index value type as shape value type 154 template < typename TObject, typename TShapeSpec, typename TSpec > 155 struct Fibre< Index<TObject, IndexQGram<TShapeSpec, TSpec> >, FibreShape> 156 { 157 typedef Index< TObject, IndexQGram<TShapeSpec, TSpec> > TIndex; 158 typedef Shape< typename Value<TIndex>::Type, TShapeSpec > Type; 159 }; 160 161 // allow different value types for the shape 162 template < typename TObject, typename TShapeValue, typename TShapeSpec, typename TSpec > 163 struct Fibre< Index<TObject, IndexQGram<Shape<TShapeValue, TShapeSpec>, TSpec> >, FibreShape> 164 { 165 typedef Shape<TShapeValue, TShapeSpec> Type; 166 }; 167 168 template < typename TObject, typename TShapeSpec, typename TSpec > 169 struct Fibre< Index<TObject, IndexQGram<TShapeSpec, TSpec> >, FibreBucketMap> 170 { 171 typedef Nothing Type; 172 }; 173 174 #ifdef PLATFORM_WINDOWS_VS 175 #pragma warning( push ) 176 // Disable warning C4521 locally (multiple copy constructors). 177 #pragma warning( disable: 4521 ) 178 // Disable warning C4522 locally (multiple assignment operators). 179 #pragma warning( disable: 4522 ) 180 #endif // PLATFORM_WINDOWS_VS 181 182 template < typename TObject, typename TShapeSpec, typename TSpec > 183 class Index<TObject, IndexQGram<TShapeSpec, TSpec> > { 184 public: 185 typedef typename Fibre<Index, QGramText>::Type TText; 186 typedef typename Fibre<Index, QGramSA>::Type TSA; 187 typedef typename Fibre<Index, QGramDir>::Type TDir; 188 typedef typename Fibre<Index, QGramCounts>::Type TCounts; 189 typedef typename Fibre<Index, QGramCountsDir>::Type TCountsDir; 190 typedef typename Fibre<Index, QGramShape>::Type TShape; 191 typedef typename Fibre<Index, QGramBucketMap>::Type TBucketMap; 192 typedef typename Cargo<Index>::Type TCargo; 193 typedef typename Size<Index>::Type TSize; 194 195 Holder<TText> text; // underlying text 196 TSA sa; // suffix array sorted by the first q chars 197 TDir dir; // bucket directory 198 TCounts counts; // counts each q-gram per sequence 199 TCountsDir countsDir; // directory for count buckets 200 TShape shape; // underlying shape 201 TCargo cargo; // user-defined cargo 202 TBucketMap bucketMap; // bucketMap table (used by open-addressing index) 203 TSize stepSize; // store every <stepSize>'th q-gram in the index 204 205 Index(): 206 stepSize(1) {} 207 208 Index(Index &other): 209 text(other.text), 210 sa(other.sa), 211 dir(other.dir), 212 counts(other.counts), 213 countsDir(other.countsDir), 214 shape(other.shape), 215 cargo(other.cargo), 216 stepSize(1) {} 217 218 Index(Index const &other): 219 text(other.text), 220 sa(other.sa), 221 dir(other.dir), 222 counts(other.counts), 223 countsDir(other.countsDir), 224 shape(other.shape), 225 cargo(other.cargo), 226 stepSize(1) {} 227 228 template <typename TText_> 229 Index(TText_ &_text): 230 text(_text), 231 stepSize(1) {} 232 233 template <typename TText_> 234 Index(TText_ const &_text): 235 text(_text), 236 stepSize(1) {} 237 238 template <typename TText_, typename TShape_> 239 Index(TText_ &_text, TShape_ const &_shape): 240 text(_text), 241 shape(_shape), 242 stepSize(1) {} 243 244 template <typename TText_, typename TShape_> 245 Index(TText_ const &_text, TShape_ const &_shape): 246 text(_text), 247 shape(_shape), 248 stepSize(1) {} 249 }; 250 251 #ifdef PLATFORM_WINDOWS_VS 252 // Reset warning state to previous values for C4521, C4522. 253 #pragma warning( pop ) 254 #endif // PLATFORM_WINDOWS_VS 255 256 template < typename TText, typename TShapeSpec, typename TSpec > 257 struct Value< Index<TText, IndexQGram<TShapeSpec, TSpec> > > { 258 typedef typename Value< typename Fibre< Index<TText, IndexQGram<TShapeSpec, TSpec> >, QGram_RawText >::Type >::Type Type; 259 }; 260 261 template < typename TText, typename TShapeSpec, typename TSpec > 262 struct Size< Index<TText, IndexQGram<TShapeSpec, TSpec> > > { 263 typedef typename Size< typename Fibre< Index<TText, IndexQGram<TShapeSpec, TSpec> >, QGram_RawText >::Type >::Type Type; 264 }; 265 266 267 ////////////////////////////////////////////////////////////////////////////// 268 // default fibre creators 269 270 template < typename TText, typename TShapeSpec, typename TSpec > 271 struct DefaultIndexCreator<Index<TText, IndexQGram<TShapeSpec, TSpec> >, FibreSA> { 272 typedef Default Type; 273 }; 274 275 ////////////////////////////////////////////////////////////////////////////// 276 // counts array type 277 278 template < typename TText, typename TSpec > 279 struct Fibre< Index<TText, TSpec>, FibreCounts> { 280 typedef String< 281 Pair< 282 typename Size< TText >::Type, 283 typename Size< Index<TText, TSpec> >::Type 284 >, 285 typename DefaultIndexStringSpec< Index<TText, TSpec> >::Type 286 > Type; 287 }; 288 289 290 ////////////////////////////////////////////////////////////////////////////// 291 292 template <typename TText, typename TSpec> 293 inline typename Fibre<Index<TText, TSpec>, FibreDir>::Type & 294 getFibre(Index<TText, TSpec> &index, FibreDir) { 295 return index.dir; 296 } 297 template <typename TText, typename TSpec> 298 inline typename Fibre<Index<TText, TSpec> const, FibreDir>::Type & 299 getFibre(Index<TText, TSpec> const &index, FibreDir) { 300 return index.dir; 301 } 302 303 template <typename TText, typename TSpec> 304 inline typename Fibre<Index<TText, TSpec>, FibreCounts>::Type & 305 getFibre(Index<TText, TSpec> &index, FibreCounts) { 306 return index.counts; 307 } 308 template <typename TText, typename TSpec> 309 inline typename Fibre<Index<TText, TSpec> const, FibreCounts>::Type & 310 getFibre(Index<TText, TSpec> const &index, FibreCounts) { 311 return index.counts; 312 } 313 314 template <typename TText, typename TSpec> 315 inline typename Fibre<Index<TText, TSpec>, FibreCountsDir>::Type & 316 getFibre(Index<TText, TSpec> &index, FibreCountsDir) { 317 return index.countsDir; 318 } 319 template <typename TText, typename TSpec> 320 inline typename Fibre<Index<TText, TSpec> const, FibreCountsDir>::Type & 321 getFibre(Index<TText, TSpec> const &index, FibreCountsDir) { 322 return index.countsDir; 323 } 324 325 template <typename TText, typename TSpec> 326 inline typename Fibre<Index<TText, TSpec>, FibreBucketMap>::Type & 327 getFibre(Index<TText, TSpec> &index, FibreBucketMap) { 328 return index.bucketMap; 329 } 330 template <typename TText, typename TSpec> 331 inline typename Fibre<Index<TText, TSpec> const, FibreBucketMap>::Type & 332 getFibre(Index<TText, TSpec> const &index, FibreBucketMap) { 333 return index.bucketMap; 334 } 335 336 template <typename TText, typename TSpec> 337 inline typename Fibre<Index<TText, TSpec>, FibreShape>::Type & 338 getFibre(Index<TText, TSpec> &index, FibreShape) { 339 return index.shape; 340 } 341 template <typename TText, typename TSpec> 342 inline typename Fibre<Index<TText, TSpec> const, FibreShape>::Type & 343 getFibre(Index<TText, TSpec> const &index, FibreShape) { 344 return index.shape; 345 } 346 347 ////////////////////////////////////////////////////////////////////////////// 348 /** 349 .Function.indexDir 350 ..summary:Shortcut for $getFibre(.., QGramDir)$. 351 ..cat:Index 352 ..signature:indexDir(index) 353 ..param.index:The @Class.Index@ object holding the fibre. 354 ...type:Spec.IndexQGram 355 ..returns:A reference to the @Tag.QGram Index Fibres.QGramDir@ fibre (q-gram directory). 356 ..include:seqan/index.h 357 */ 358 359 template <typename TText, typename TSpec> 360 inline typename Fibre<Index<TText, TSpec>, FibreDir>::Type & 361 indexDir(Index<TText, TSpec> &index) { 362 return getFibre(index, FibreDir()); 363 } 364 template <typename TText, typename TSpec> 365 inline typename Fibre<Index<TText, TSpec> const, FibreDir>::Type & 366 indexDir(Index<TText, TSpec> const &index) { 367 return getFibre(index, FibreDir()); 368 } 369 370 ////////////////////////////////////////////////////////////////////////////// 371 /** 372 .Function.dirAt 373 ..summary:Shortcut for $value(indexDir(..), ..)$. 374 ..cat:Index 375 ..signature:dirAt(position, index) 376 ..param.position:A position in the array on which the value should be accessed. 377 ..param.index:The @Class.Index@ object holding the fibre. 378 ...type:Spec.IndexQGram 379 ..returns:A reference or proxy to the value. 380 ..include:seqan/index.h 381 */ 382 383 template <typename TPos, typename TIndex> 384 inline typename Reference<typename Fibre<TIndex, FibreDir>::Type>::Type 385 dirAt(TPos i, TIndex &index) { 386 return value(getFibre(index, FibreDir()), i); 387 } 388 template <typename TPos, typename TIndex> 389 inline typename Reference<typename Fibre<TIndex const, FibreDir>::Type>::Type 390 dirAt(TPos i, TIndex const &index) { 391 return value(getFibre(index, FibreDir()), i); 392 } 393 394 395 ////////////////////////////////////////////////////////////////////////////// 396 /** 397 .Function.indexCounts 398 ..summary:Shortcut for $getFibre(.., QGramCounts)$. 399 ..cat:Index 400 ..signature:indexCounts(index) 401 ..param.index:The @Class.Index@ object holding the fibre. 402 ...type:Spec.IndexQGram 403 ..returns:A reference to the @Tag.QGram Index Fibres.QGramCounts@ fibre (counts array). 404 ..include:seqan/index.h 405 */ 406 407 template <typename TText, typename TSpec> 408 inline typename Fibre<Index<TText, TSpec>, FibreCounts>::Type & 409 indexCounts(Index<TText, TSpec> &index) { 410 return getFibre(index, FibreCounts()); 411 } 412 template <typename TText, typename TSpec> 413 inline typename Fibre<Index<TText, TSpec> const, FibreCounts>::Type & 414 indexCounts(Index<TText, TSpec> const &index) { 415 return getFibre(index, FibreCounts()); 416 } 417 418 ////////////////////////////////////////////////////////////////////////////// 419 /** 420 .Function.indexCountsDir 421 ..summary:Shortcut for $getFibre(.., QGramCountsDir)$. 422 ..cat:Index 423 ..signature:indexCountsDir(index) 424 ..param.index:The @Class.Index@ object holding the fibre. 425 ...type:Spec.IndexQGram 426 ..returns:A reference to the @Tag.QGram Index Fibres.QGramCountsDir@ fibre (counts directory). 427 ..include:seqan/index.h 428 */ 429 430 template <typename TText, typename TSpec> 431 inline typename Fibre<Index<TText, TSpec>, FibreCountsDir>::Type & 432 indexCountsDir(Index<TText, TSpec> &index) { 433 return getFibre(index, FibreCountsDir()); 434 } 435 template <typename TText, typename TSpec> 436 inline typename Fibre<Index<TText, TSpec> const, FibreCountsDir>::Type & 437 indexCountsDir(Index<TText, TSpec> const &index) { 438 return getFibre(index, FibreCountsDir()); 439 } 440 441 ////////////////////////////////////////////////////////////////////////////// 442 /** 443 .Function.indexBucketMap 444 ..summary:Shortcut for $getFibre(.., QGramBucketMap)$. 445 ..cat:Index 446 ..signature:indexBucketMap(index) 447 ..param.index:The @Class.Index@ object holding the fibre. 448 ...type:Spec.IndexQGram 449 ..returns:A reference to the @Tag.QGram Index Fibres.QGramBucketMap@ fibre (maps q-gram hashes to buckets). 450 ..include:seqan/index.h 451 */ 452 453 template <typename TText, typename TSpec> 454 inline typename Fibre<Index<TText, TSpec>, FibreBucketMap>::Type & 455 indexBucketMap(Index<TText, TSpec> &index) { 456 return getFibre(index, FibreBucketMap()); 457 } 458 template <typename TText, typename TSpec> 459 inline typename Fibre<Index<TText, TSpec> const, FibreBucketMap>::Type & 460 indexBucketMap(Index<TText, TSpec> const &index) { 461 return getFibre(index, FibreBucketMap()); 462 } 463 464 ////////////////////////////////////////////////////////////////////////////// 465 /** 466 .Function.indexShape 467 ..summary:Shortcut for $getFibre(.., QGramShape)$. 468 ..cat:Index 469 ..signature:indexShape(index) 470 ..param.index:The @Class.Index@ object holding the fibre. 471 ...type:Spec.IndexQGram 472 ..returns:Returns a reference to the @Class.Shape@ object of a q-gram index. 473 Formally, this is a reference to the @Tag.QGram Index Fibres.QGramShape@ fibre. 474 ...type:Class.Shape 475 ..include:seqan/index.h 476 */ 477 478 template <typename TText, typename TSpec> 479 inline typename Fibre<Index<TText, TSpec>, FibreShape>::Type & 480 indexShape(Index<TText, TSpec> &index) { 481 return getFibre(index, FibreShape()); 482 } 483 template <typename TText, typename TSpec> 484 inline typename Fibre<Index<TText, TSpec> const, FibreShape>::Type & 485 indexShape(Index<TText, TSpec> const &index) { 486 return getFibre(index, FibreShape()); 487 } 488 489 ////////////////////////////////////////////////////////////////////////////// 490 /** 491 .Function.getStepSize 492 ..summary:Return the q-gram step size used for index creation. 493 ..cat:Index 494 ..signature:getStepSize(index) 495 ..param.index:A q-gram index. 496 ...type:Spec.IndexQGram 497 ..returns:The step size. If $x$ is returned every $x$'th q-gram is stored in the index. 498 ..include:seqan/index.h 499 */ 500 501 template <typename TText, typename TShapeSpec, typename TSpec> 502 inline typename Size<TText>::Type 503 getStepSize(Index<TText, IndexQGram<TShapeSpec, TSpec> > const &index) 504 { 505 return (index.stepSize != 0)? index.stepSize: length(indexShape(index)); 506 } 507 508 ////////////////////////////////////////////////////////////////////////////// 509 /** 510 .Function.setStepSize 511 ..summary:Change the q-gram step size used for index creation. 512 ..cat:Index 513 ..signature:setStepSize(index, stepSize) 514 ..param.index:A q-gram index. 515 ...type:Spec.IndexQGram 516 ..param.stepSize:Store every $stepSize$'th q-gram in the index. 517 ..remarks:The default step size of a q-gram index is 1, which corresponds to all overlapping q-grams. 518 To take effect of changing the $stepSize$ the q-gram index should be empty or recreated. 519 ..remarks:A $stepSize$ of 0 corresponds to $stepSize=length(indexShape(index))$, i.e. all non-overlapping q-grams. 520 ..see:Function.getStepSize 521 ..include:seqan/index.h 522 */ 523 524 template <typename TText, typename TShapeSpec, typename TSpec, typename TSize> 525 inline void 526 setStepSize(Index<TText, IndexQGram<TShapeSpec, TSpec> > &index, TSize stepSize) 527 { 528 index.stepSize = stepSize; 529 } 530 531 ////////////////////////////////////////////////////////////////////////////// 532 533 template <typename TIndex> 534 inline __int64 _fullDirLength(TIndex const &index) 535 { 536 typedef typename Fibre<TIndex, FibreShape>::Type TShape; 537 typedef typename Host<TShape>::Type TTextValue; 538 return _intPow((__int64)ValueSize<TTextValue>::VALUE, weight(indexShape(index))) + 1; 539 } 540 541 template <typename TIndex> 542 inline __int64 _fullDir2Length(TIndex const &index) 543 { 544 typedef typename Fibre<TIndex, FibreShape>::Type TShape; 545 typedef typename Host<TShape>::Type TTextValue; 546 return (_intPow( 547 (__int64)ValueSize<TTextValue>::VALUE, 548 weight(indexShape(index)) + 1) - 1) 549 / ((unsigned)ValueSize<TTextValue>::VALUE - 1) + 1; 550 } 551 552 553 ////////////////////////////////////////////////////////////////////////////// 554 // QGramLess 555 // 556 // compare two q-grams of a given text (q-grams can be smaller than q) 557 template < typename TSAValue, typename TText > 558 struct QGramLess_ : 559 public ::std::binary_function < TSAValue, TSAValue, bool > 560 { 561 typedef typename Iterator<TText, Standard>::Type TIter; 562 TIter _begin, _end; 563 typename Size<TText>::Type _q; 564 565 template <typename TSize> 566 QGramLess_(TText &text, TSize q): 567 _begin(begin(text, Standard())), 568 _end(end(text, Standard())), 569 _q(q) {} 570 571 // skip the first <offset> characters 572 template <typename TSize1, typename TSize2> 573 QGramLess_(TText &text, TSize1 q, TSize2 offset): 574 _begin(begin(text, Standard()) + offset), 575 _end(end(text, Standard())), 576 _q(q) {} 577 578 inline bool operator() (TSAValue const a, TSAValue const b) const 579 { 580 if (a == b) return false; 581 TIter itA = _begin + a; 582 TIter itB = _begin + b; 583 if (a <= b) { 584 TIter itEnd = itB + _q; 585 if (itEnd > _end) 586 itEnd = _end; 587 for(; itB != itEnd; ++itB, ++itA) { 588 if (lexLess(*itA, *itB)) return true; 589 if (lexLess(*itB, *itA)) return false; 590 } 591 return false; 592 } else { 593 TIter itEnd = itA + _q; 594 if (itEnd > _end) 595 itEnd = _end; 596 for(; itA != itEnd; ++itA, ++itB) { 597 if (lexLess(*itA, *itB)) return true; 598 if (lexLess(*itB, *itA)) return false; 599 } 600 return true; 601 } 602 } 603 }; 604 605 template < typename TSAValue, typename TString, typename TSpec > 606 struct QGramLess_<TSAValue, StringSet<TString, TSpec> const> : 607 public ::std::binary_function < TSAValue, TSAValue, bool > 608 { 609 typedef typename Iterator<TString, Standard>::Type TIter; 610 StringSet<TString, TSpec> const &_stringSet; 611 typename Size<TString>::Type _q; 612 613 template <typename TSize> 614 QGramLess_(StringSet<TString, TSpec> const &text, TSize q): 615 _stringSet(text), 616 _q(q) {} 617 618 inline bool operator() (TSAValue const &a, TSAValue const &b) const 619 { 620 if (a == b) return false; 621 TIter itA = begin(_stringSet[getValueI1(a)], Standard()) + getValueI2(a); 622 TIter itB = begin(_stringSet[getValueI1(b)], Standard()) + getValueI2(b); 623 if (suffixLength(a, _stringSet) > suffixLength(b, _stringSet)) { 624 TIter _end = end(_stringSet[getValueI1(b)], Standard()); 625 TIter itEnd = itB + _q; 626 if (itEnd > _end) 627 itEnd = _end; 628 for(; itB != itEnd; ++itB, ++itA) { 629 if (lexLess(*itA, *itB)) return true; 630 if (lexLess(*itB, *itA)) return false; 631 } 632 return false; 633 } else { 634 TIter _end = end(_stringSet[getValueI1(a)], Standard()); 635 TIter itEnd = itA + _q; 636 if (itEnd > _end) 637 itEnd = _end; 638 for(; itA != itEnd; ++itA, ++itB) { 639 if (lexLess(*itA, *itB)) return true; 640 if (lexLess(*itB, *itA)) return false; 641 } 642 return true; 643 } 644 } 645 }; 646 647 648 ////////////////////////////////////////////////////////////////////////////// 649 // QGramLessOffset 650 // 651 // compare two q-grams of a given text and skip the first <offset> characters 652 template < typename TSAValue, typename TText > 653 struct QGramLessOffset_ : 654 QGramLess_<TSAValue, TText> 655 { 656 template <typename TSize1, typename TSize2> 657 QGramLessOffset_(TText &text, TSize1 q, TSize2 offset): 658 QGramLess_<TSAValue, TText> (text, q, offset) {} 659 }; 660 661 template < typename TSAValue, typename TString, typename TSpec > 662 struct QGramLessOffset_<TSAValue, StringSet<TString, TSpec> const> : 663 public ::std::binary_function < TSAValue, TSAValue, bool > 664 { 665 typedef typename Iterator<TString, Standard>::Type TIter; 666 typedef typename Size<TString>::Type TSize; 667 StringSet<TString, TSpec> const &_stringSet; 668 typename Size<TString>::Type _q, _offset; 669 670 template <typename TSize1, typename TSize2> 671 QGramLessOffset_(StringSet<TString, TSpec> const &text, TSize1 q, TSize2 offset): 672 _stringSet(text), 673 _q(q), 674 _offset(offset) {} 675 676 inline bool operator() (TSAValue const &a, TSAValue const &b) const 677 { 678 if (a == b) return false; 679 TString const &sA = _stringSet[getValueI1(a)]; 680 TString const &sB = _stringSet[getValueI1(b)]; 681 TIter itA = begin(sA, Standard()) + getValueI2(a) + _offset; 682 TIter itB = begin(sB, Standard()) + getValueI2(b) + _offset; 683 TSize restA = length(sA) - getValueI2(a); 684 TSize restB = length(sB) - getValueI2(b); 685 if (restA > restB) { 686 TIter itEnd; 687 if (restB >= _q) 688 itEnd = itB + _q; 689 else 690 itEnd = end(sB, Standard()); 691 for(; itB != itEnd; ++itB, ++itA) { 692 if (lexLess(*itA, *itB)) return true; 693 if (lexLess(*itB, *itA)) return false; 694 } 695 return false; 696 } else { 697 TIter itEnd; 698 if (restA >= _q) 699 itEnd = itA + _q; 700 else 701 itEnd = end(sA, Standard()); 702 for(; itA != itEnd; ++itA, ++itB) { 703 if (lexLess(*itA, *itB)) return true; 704 if (lexLess(*itB, *itA)) return false; 705 } 706 return true; 707 } 708 } 709 }; 710 711 712 ////////////////////////////////////////////////////////////////////////////// 713 // QGramLessNoCheck 714 // 715 // compare two q-grams of a given text (no check for q-grams smaller than q) 716 template < typename TSAValue, typename TText > 717 struct QGramLessNoCheck_ : 718 public ::std::binary_function < TSAValue, TSAValue, bool > 719 { 720 typedef typename Iterator<TText, Standard>::Type TIter; 721 TIter _begin; 722 typename Size<TText>::Type _q; 723 724 template <typename TSize> 725 QGramLessNoCheck_(TText &text, TSize q): 726 _begin(begin(text, Standard())), 727 _q(q) {} 728 729 // skip the first <offset> characters 730 template <typename TSize> 731 QGramLessNoCheck_(TText &text, TSize q, TSize offset): 732 _begin(begin(text, Standard()) + offset), 733 _q(q) {} 734 735 inline bool operator() (TSAValue const a, TSAValue const b) const 736 { 737 if (a == b) return false; 738 TIter itA = _begin + a; 739 TIter itB = _begin + b; 740 if (a <= b) { 741 TIter itEnd = itB + _q; 742 for(; itB != itEnd; ++itB, ++itA) { 743 if (lexLess(*itA, *itB)) return true; 744 if (lexLess(*itB, *itA)) return false; 745 } 746 return false; 747 } else { 748 TIter itEnd = itA + _q; 749 for(; itA != itEnd; ++itA, ++itB) { 750 if (lexLess(*itA, *itB)) return true; 751 if (lexLess(*itB, *itA)) return false; 752 } 753 return true; 754 } 755 } 756 }; 757 758 template < typename TSAValue, typename TString, typename TSpec > 759 struct QGramLessNoCheck_<TSAValue, StringSet<TString, TSpec> const> : 760 public ::std::binary_function < TSAValue, TSAValue, bool > 761 { 762 typedef typename Iterator<TString, Standard>::Type TIter; 763 StringSet<TString, TSpec> const &_stringSet; 764 typename Size<TString>::Type _q; 765 766 template <typename TSize> 767 QGramLessNoCheck_(StringSet<TString, TSpec> const &text, TSize q): 768 _stringSet(text), 769 _q(q) {} 770 771 inline bool operator() (TSAValue const &a, TSAValue const &b) const 772 { 773 if (a == b) return false; 774 TIter itA = begin(_stringSet[getValueI1(a)], Standard()) + getValueI2(a); 775 TIter itB = begin(_stringSet[getValueI1(b)], Standard()) + getValueI2(b); 776 if (suffixLength(a, _stringSet) > suffixLength(b, _stringSet)) { 777 TIter itEnd = itB + _q; 778 for(; itB != itEnd; ++itB, ++itA) { 779 if (lexLess(*itA, *itB)) return true; 780 if (lexLess(*itB, *itA)) return false; 781 } 782 return false; 783 } else { 784 TIter itEnd = itA + _q; 785 for(; itA != itEnd; ++itA, ++itB) { 786 if (lexLess(*itA, *itB)) return true; 787 if (lexLess(*itB, *itA)) return false; 788 } 789 return true; 790 } 791 } 792 }; 793 794 795 ////////////////////////////////////////////////////////////////////////////// 796 // QGramLessNoCheckOffset_ 797 // 798 // compare two q-grams of a given text and skip the first <offset> characters 799 template < typename TSAValue, typename TText > 800 struct QGramLessNoCheckOffset_: QGramLessNoCheck_<TSAValue, TText> 801 { 802 template <typename TSize1, typename TSize2> 803 QGramLessNoCheckOffset_(TText &text, TSize1 q, TSize2 offset): 804 QGramLessNoCheck_<TSAValue, TText> (text, q, offset) {} 805 }; 806 807 template < typename TSAValue, typename TString, typename TSpec > 808 struct QGramLessNoCheckOffset_<TSAValue, StringSet<TString, TSpec> const> : 809 public ::std::binary_function < TSAValue, TSAValue, bool > 810 { 811 typedef typename Iterator<TString, Standard>::Type TIter; 812 StringSet<TString, TSpec> const &_stringSet; 813 typename Size<TString>::Type _q, _offset; 814 815 template <typename TSize1, typename TSize2> 816 QGramLessNoCheckOffset_(StringSet<TString, TSpec> const &text, TSize1 q, TSize2 offset): 817 _stringSet(text), 818 _q(q), 819 _offset(offset) {} 820 821 inline bool operator() (TSAValue const &a, TSAValue const &b) const 822 { 823 if (a == b) return false; 824 TIter itA = begin(_stringSet[getValueI1(a)], Standard()) + getValueI2(a) + _offset; 825 TIter itB = begin(_stringSet[getValueI1(b)], Standard()) + getValueI2(b) + _offset; 826 if (a <= b) { 827 TIter itEnd = itB + _q; 828 for(; itB != itEnd; ++itB, ++itA) { 829 if (lexLess(*itA, *itB)) return true; 830 if (lexLess(*itB, *itA)) return false; 831 } 832 return false; 833 } else { 834 TIter itEnd = itA + _q; 835 for(; itA != itEnd; ++itA, ++itB) { 836 if (lexLess(*itA, *itB)) return true; 837 if (lexLess(*itB, *itA)) return false; 838 } 839 return true; 840 } 841 } 842 }; 843 844 ////////////////////////////////////////////////////////////////////////////// 845 // Counting sort - little helpers 846 // 847 848 // map hashes 1:1 to directory 849 template < typename THashValue > 850 inline THashValue 851 requestBucket(Nothing &, THashValue hash) 852 { 853 return hash; 854 } 855 856 template < typename THashValue > 857 inline THashValue 858 getBucket(Nothing const &, THashValue hash) 859 { 860 return hash; 861 } 862 863 ////////////////////////////////////////////////////////////////////////////// 864 // Counting sort - Step 1: Clear directory 865 template < typename TDir > 866 inline void 867 _qgramClearDir(TDir &dir, Nothing &) 868 { 869 arrayFill(begin(dir, Standard()), end(dir, Standard()), 0); 870 } 871 872 ////////////////////////////////////////////////////////////////////////////// 873 // Counting sort - Step 2: Count q-grams 874 template < typename TDir, typename TBucketMap, typename TText, typename TShape, typename TStepSize > 875 inline void 876 _qgramCountQGrams(TDir &dir, TBucketMap &bucketMap, TText const &text, TShape shape, TStepSize stepSize) 877 { 878 SEQAN_CHECKPOINT 879 typedef typename Iterator<TText const, Standard>::Type TIterator; 880 typedef typename Value<TDir>::Type TSize; 881 882 if (length(text) < length(shape)) return; 883 TSize num_qgrams = (length(text) - length(shape)) / stepSize + 1; 884 885 TIterator itText = begin(text, Standard()); 886 ++dir[requestBucket(bucketMap, hash(shape, itText))]; 887 if (stepSize == 1) 888 for(TSize i = 1; i < num_qgrams; ++i) 889 { 890 ++itText; 891 ++dir[requestBucket(bucketMap, hashNext(shape, itText))]; 892 } 893 else 894 for(TSize i = 1; i < num_qgrams; ++i) 895 { 896 itText += stepSize; 897 ++dir[requestBucket(bucketMap, hash(shape, itText))]; 898 } 899 } 900 901 template < typename TDir, typename TBucketMap, typename TString, typename TSpec, typename TShape, typename TStepSize > 902 inline void 903 _qgramCountQGrams(TDir &dir, TBucketMap &bucketMap, StringSet<TString, TSpec> const &stringSet, TShape shape, TStepSize stepSize) 904 { 905 SEQAN_CHECKPOINT 906 typedef typename Iterator<TString const, Standard>::Type TIterator; 907 typedef typename Value<TDir>::Type TSize; 908 909 if (stepSize == 1) 910 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 911 { 912 TString const &sequence = value(stringSet, seqNo); 913 if (length(sequence) < length(shape)) continue; 914 TSize num_qgrams = length(sequence) - length(shape) + 1; 915 916 TIterator itText = begin(sequence, Standard()); 917 ++dir[requestBucket(bucketMap, hash(shape, itText))]; 918 for(TSize i = 1; i < num_qgrams; ++i) 919 { 920 ++itText; 921 ++dir[requestBucket(bucketMap, hashNext(shape, itText))]; 922 } 923 } 924 else 925 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 926 { 927 TString const &sequence = value(stringSet, seqNo); 928 if (length(sequence) < length(shape)) continue; 929 TSize num_qgrams = (length(sequence) - length(shape)) / stepSize + 1; 930 931 TIterator itText = begin(sequence, Standard()); 932 for(TSize i = 0; i < num_qgrams; ++i) 933 { 934 ++dir[requestBucket(bucketMap, hash(shape, itText))]; 935 itText += stepSize; 936 } 937 } 938 } 939 940 ////////////////////////////////////////////////////////////////////////////// 941 // Counting sort - Step 3: Cumulative sum 942 943 // First two entries are 0. 944 // Step 4 increments the entries hash(qgram)+1 on-the-fly while filling the SA table. 945 // After step 4 each entry (0..n-1) is the beginning of a qgram bucket. 946 template < typename TDir, typename TWithConstraints > 947 inline typename Value<TDir>::Type 948 _qgramCummulativeSum(TDir &dir, TWithConstraints) 949 { 950 SEQAN_CHECKPOINT 951 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 952 typedef typename Value<TDir>::Type TSize; 953 954 TDirIterator it = begin(dir, Standard()); 955 TDirIterator itEnd = end(dir, Standard()); 956 TSize diff = 0, diff_prev = 0, sum = 0; 957 while (it != itEnd) 958 { 959 if (TWithConstraints::VALUE && diff == (TSize)-1) 960 { 961 sum += diff_prev; 962 diff_prev = 0; 963 diff = *it; 964 *it = (TSize)-1; // disable bucket 965 } else { 966 sum += diff_prev; 967 diff_prev = diff; 968 diff = *it; 969 *it = sum; 970 } 971 ++it; 972 } 973 return sum + diff_prev; 974 } 975 976 // The first entry is 0. 977 // This function is used when Step 4 is ommited. 978 template < typename TDir, typename TWithConstraints > 979 inline typename Value<TDir>::Type 980 _qgramCummulativeSumAlt(TDir &dir, TWithConstraints const) 981 { 982 SEQAN_CHECKPOINT 983 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 984 typedef typename Value<TDir>::Type TSize; 985 986 TDirIterator it = begin(dir, Standard()); 987 TDirIterator itEnd = end(dir, Standard()); 988 TSize diff = 0, sum = 0; 989 while (it != itEnd) 990 { 991 sum += diff; 992 diff = *it; 993 if (TWithConstraints::VALUE && diff == (TSize)-1) 994 diff = 0; // ignore disabled buckets 995 *it = sum; 996 ++it; 997 } 998 return sum + diff; 999 } 1000 1001 ////////////////////////////////////////////////////////////////////////////// 1002 // Counting sort - Step 4: Fill suffix array 1003 // w/o constraints 1004 template < 1005 typename TSA, 1006 typename TText, 1007 typename TShape, 1008 typename TDir, 1009 typename TBucketMap, 1010 typename TWithConstraints, 1011 typename TStepSize > 1012 inline void 1013 _qgramFillSuffixArray( 1014 TSA &sa, 1015 TText const &text, 1016 TShape shape, 1017 TDir &dir, 1018 TBucketMap &bucketMap, 1019 TStepSize stepSize, 1020 TWithConstraints const) 1021 { 1022 SEQAN_CHECKPOINT 1023 typedef typename Iterator<TText const, Standard>::Type TIterator; 1024 typedef typename Value<TDir>::Type TSize; 1025 1026 TSize num_qgrams = length(text) - length(shape) + 1; 1027 TIterator itText = begin(text, Standard()); 1028 1029 if (TWithConstraints::VALUE) { 1030 TSize bktNo = getBucket(bucketMap, hash(shape, itText)) + 1; // first hash 1031 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = 0; // if bucket is enabled 1032 } else 1033 sa[dir[getBucket(bucketMap, hash(shape, itText)) + 1]++] = 0; // first hash 1034 1035 if (stepSize == 1) 1036 for(TSize i = 1; i < num_qgrams; ++i) 1037 { 1038 ++itText; 1039 if (TWithConstraints::VALUE) { 1040 TSize bktNo = getBucket(bucketMap, hashNext(shape, itText)) + 1; // next hash 1041 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = i; // if bucket is enabled 1042 } else 1043 sa[dir[getBucket(bucketMap, hashNext(shape, itText)) + 1]++] = i; // next hash 1044 } 1045 else 1046 for(TSize i = 1; i < num_qgrams; i += stepSize) 1047 { 1048 itText += stepSize; 1049 if (TWithConstraints::VALUE) { 1050 TSize bktNo = getBucket(bucketMap, hash(shape, itText)) + 1; // next hash (we mustn't use hashNext here) 1051 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = i; // if bucket is enabled 1052 } else 1053 sa[dir[getBucket(bucketMap, hash(shape, itText)) + 1]++] = i; // next hash 1054 } 1055 } 1056 1057 // multiple sequences 1058 template < 1059 typename TSA, 1060 typename TString, 1061 typename TSpec, 1062 typename TShape, 1063 typename TDir, 1064 typename TBucketMap, 1065 typename TStepSize, 1066 typename TWithConstraints > 1067 inline void 1068 _qgramFillSuffixArray( 1069 TSA &sa, 1070 StringSet<TString, TSpec> const &stringSet, 1071 TShape shape, 1072 TDir &dir, 1073 TBucketMap &bucketMap, 1074 TStepSize stepSize, 1075 TWithConstraints const) 1076 { 1077 SEQAN_CHECKPOINT 1078 typedef typename Iterator<TString const, Standard>::Type TIterator; 1079 typedef typename Value<TDir>::Type TSize; 1080 1081 if (stepSize == 1) 1082 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1083 { 1084 TString const &sequence = value(stringSet, seqNo); 1085 if (length(sequence) < length(shape)) continue; 1086 TSize num_qgrams = length(sequence) - length(shape) + 1; 1087 1088 typename Value<TSA>::Type localPos; 1089 assignValueI1(localPos, seqNo); 1090 assignValueI2(localPos, 0); 1091 1092 TIterator itText = begin(sequence, Standard()); 1093 if (TWithConstraints::VALUE) { 1094 TSize bktNo = getBucket(bucketMap, hash(shape, itText)) + 1; // first hash 1095 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = localPos; // if bucket is enabled 1096 } else 1097 sa[dir[getBucket(bucketMap, hash(shape, itText)) + 1]++] = localPos; // first hash 1098 1099 for(TSize i = 1; i < num_qgrams; ++i) 1100 { 1101 ++itText; 1102 assignValueI2(localPos, i); 1103 if (TWithConstraints::VALUE) { 1104 TSize bktNo = getBucket(bucketMap, hashNext(shape, itText)) + 1; // next hash 1105 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = localPos; // if bucket is enabled 1106 } else 1107 sa[dir[getBucket(bucketMap, hashNext(shape, itText)) + 1]++] = localPos; // next hash 1108 } 1109 } 1110 else 1111 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1112 { 1113 TString const &sequence = value(stringSet, seqNo); 1114 if (length(sequence) < length(shape)) continue; 1115 TSize num_qgrams = length(sequence) - length(shape) + 1; 1116 1117 typename Value<TSA>::Type localPos; 1118 assignValueI1(localPos, seqNo); 1119 assignValueI2(localPos, 0); 1120 1121 TIterator itText = begin(sequence, Standard()); 1122 for(TSize i = 1; i < num_qgrams; i += stepSize) 1123 { 1124 assignValueI2(localPos, i); 1125 if (TWithConstraints::VALUE) { 1126 TSize bktNo = getBucket(bucketMap, hash(shape, itText)) + 1; // hash 1127 if (dir[bktNo] != (TSize)-1) sa[dir[bktNo]++] = localPos; // if bucket is enabled 1128 } else 1129 sa[dir[getBucket(bucketMap, hash(shape, itText)) + 1]++] = localPos; // hash 1130 itText += stepSize; 1131 } 1132 } 1133 } 1134 1135 ////////////////////////////////////////////////////////////////////////////// 1136 // Step 5: Correct disabled buckets 1137 template < typename TDir > 1138 inline void 1139 _qgramPostprocessBuckets(TDir &dir) 1140 { 1141 SEQAN_CHECKPOINT 1142 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1143 typedef typename Value<TDir>::Type TSize; 1144 1145 TDirIterator it = begin(dir, Standard()); 1146 TDirIterator itEnd = end(dir, Standard()); 1147 TSize prev = 0; 1148 for (; it != itEnd; ++it) 1149 if (*it == (TSize)-1) 1150 *it = prev; 1151 else 1152 prev = *it; 1153 } 1154 1155 1156 ////////////////////////////////////////////////////////////////////////////// 1157 /** 1158 .Function.createQGramIndex: 1159 ..summary:Builds a q-gram index on a sequence. 1160 ..cat:Index 1161 ..signature:createQGramIndex(index) 1162 ..signature:createQGramIndex(sa, dir, bucketMap, text, shape, stepSize) [DEPRECATED] 1163 ..param.index:The q-gram index. 1164 ...type:Spec.IndexQGram 1165 ..param.sa:The resulting list in which all q-grams are sorted alphabetically. 1166 ..param.dir:The resulting array that indicates at which position in index the corresponding q-grams can be found. 1167 ..param.bucketMap:Stores the q-gram hashes for the openaddressing hash maps, see @Function.indexBucketMap@. 1168 If bucketMap is of the type @Tag.Nothing@ the q-gram hash determines the bucket address in the index. 1169 ..param.text:The sequence. 1170 ..param.shape:The shape to be used. 1171 ...type:Class.Shape 1172 ..param.stepSize:Store every $stepSize$'th q-gram in the index. 1173 ..returns:Index contains the sorted list of qgrams. For each q-gram $dir$ contains the first position in index that corresponds to this q-gram. 1174 ..remarks:This function should not be called directly. Please use @Function.indexCreate@ or @Function.indexRequire@. 1175 The resulting tables must have appropriate size before calling this function. 1176 ..include:seqan/index.h 1177 */ 1178 1179 template < typename TIndex > 1180 inline bool _qgramDisableBuckets(TIndex &) 1181 { 1182 return false; // we disable no buckets by default 1183 } 1184 1185 template < typename TIndex > 1186 void createQGramIndex(TIndex &index) 1187 { 1188 SEQAN_CHECKPOINT 1189 typename Fibre<TIndex, QGramText>::Type const &text = indexText(index); 1190 typename Fibre<TIndex, QGramSA>::Type &sa = indexSA(index); 1191 typename Fibre<TIndex, QGramDir>::Type &dir = indexDir(index); 1192 typename Fibre<TIndex, QGramShape>::Type &shape = indexShape(index); 1193 typename Fibre<TIndex, QGramBucketMap>::Type &bucketMap = index.bucketMap; 1194 1195 // 1. clear counters 1196 _qgramClearDir(dir, bucketMap); 1197 1198 // 2. count q-grams 1199 _qgramCountQGrams(dir, bucketMap, text, shape, getStepSize(index)); 1200 1201 if (_qgramDisableBuckets(index)) 1202 { 1203 // 3. cumulative sum 1204 _qgramCummulativeSum(dir, True()); 1205 1206 // 4. fill suffix array 1207 _qgramFillSuffixArray(sa, text, shape, dir, bucketMap, getStepSize(index), True()); 1208 1209 // 5. correct disabled buckets 1210 _qgramPostprocessBuckets(dir); 1211 } 1212 else 1213 { 1214 // 3. cumulative sum 1215 _qgramCummulativeSum(dir, False()); 1216 1217 // 4. fill suffix array 1218 _qgramFillSuffixArray(sa, text, shape, dir, bucketMap, getStepSize(index), False()); 1219 } 1220 } 1221 1222 // DEPRECATED 1223 // better use createQGramIndex(index) (above) 1224 template < 1225 typename TSA, 1226 typename TDir, 1227 typename TBucketMap, 1228 typename TText, 1229 typename TShape, 1230 typename TStepSize > 1231 void createQGramIndex( 1232 TSA &sa, 1233 TDir &dir, 1234 TBucketMap &bucketMap, 1235 TText const &text, 1236 TShape &shape, 1237 TStepSize stepSize) 1238 { 1239 SEQAN_CHECKPOINT 1240 1241 // 1. clear counters 1242 _qgramClearDir(dir, bucketMap); 1243 1244 // 2. count q-grams 1245 _qgramCountQGrams(dir, bucketMap, text, shape, stepSize); 1246 1247 // 3. cumulative sum 1248 SEQAN_DO(_qgramCummulativeSum(dir, False()) == length(sa)); 1249 1250 // 4. fill suffix array 1251 _qgramFillSuffixArray(sa, text, shape, dir, bucketMap, stepSize, False()); 1252 } 1253 1254 // DEPRECATED 1255 // better use createQGramIndex(index) (above) 1256 template < 1257 typename TSA, 1258 typename TDir, 1259 typename TBucketMap, 1260 typename TText, 1261 typename TShape, 1262 typename TStepSize > 1263 void createQGramIndex( 1264 TSA &sa, 1265 TDir &dir, 1266 TBucketMap &bucketMap, 1267 TText const &text, 1268 TShape &shape) 1269 { 1270 createQGramIndex(sa, dir, bucketMap, text, shape, 1); 1271 } 1272 1273 ////////////////////////////////////////////////////////////////////////////// 1274 /** 1275 .Function.createQGramIndexSAOnly: 1276 ..summary:Builds the suffix array of a q-gram index on a sequence. 1277 ..cat:Index 1278 ..signature:createQGramIndexSAOnly(sa, text, shape, stepSize) 1279 ..param.sa:The resulting list in which all q-grams are sorted alphabetically. 1280 ..param.text:The sequence. 1281 ..param.shape:The shape to be used. q is the length of this shape 1282 ...type:Class.Shape 1283 ..param.stepSize:Store every $stepSize$'th q-gram in the index. 1284 ..remarks:This function should not be called directly. Please use @Function.indexCreate@ or @Function.indexRequire@. 1285 The resulting tables must have appropriate size before calling this function. 1286 ..include:seqan/index.h 1287 */ 1288 1289 template < 1290 typename TSA, 1291 typename TText, 1292 typename TShape, 1293 typename TStepSize > 1294 void createQGramIndexSAOnly( 1295 TSA &sa, 1296 TText const &text, 1297 TShape &shape, 1298 TStepSize stepSize) 1299 { 1300 SEQAN_CHECKPOINT 1301 typedef typename Size<TSA>::Type TSize; 1302 typedef typename Iterator<TSA, Standard>::Type TIter; 1303 1304 // 1. Fill suffix array with a permutation (the identity) 1305 TIter it = begin(sa, Standard()); 1306 TIter itEnd = end(sa, Standard()); 1307 TSize i = 0; 1308 for(; it != itEnd; ++it, i += stepSize) 1309 *it = i; 1310 1311 // 2. Sort suffix array with quicksort 1312 TSize span = length(shape); 1313 if (i + span > length(text) + 1) 1314 ::std::sort( 1315 begin(sa, Standard()), 1316 end(sa, Standard()), 1317 QGramLess_<typename Value<TSA>::Type, TText const>(text, span)); 1318 else 1319 ::std::sort( 1320 begin(sa, Standard()), 1321 end(sa, Standard()), 1322 QGramLessNoCheck_<typename Value<TSA>::Type, TText const>(text, span)); 1323 } 1324 1325 template < 1326 typename TSA, 1327 typename TString, 1328 typename TSpec, 1329 typename TShape, 1330 typename TStepSize > 1331 void createQGramIndexSAOnly( 1332 TSA &sa, 1333 StringSet<TString, TSpec> const &stringSet, 1334 TShape &shape, 1335 TStepSize stepSize) 1336 { 1337 SEQAN_CHECKPOINT 1338 typedef typename Iterator<TSA, Standard>::Type TIter; 1339 typedef typename Value<TSA>::Type TValue; 1340 typedef typename Size<TString>::Type TSize; 1341 typedef StringSet<TString, TSpec> TStringSet; 1342 1343 // 1. Fill suffix array with a permutation (the identity) 1344 TIter it = begin(sa, Standard()); 1345 TValue pair; 1346 unsigned int const q1 = length(shape) - 1; 1347 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1348 { 1349 unsigned int const strlen = length(stringSet[seqNo]); 1350 if (strlen > q1) { 1351 assignValueI1(pair, seqNo); 1352 for (TSize i = 0; i < strlen - q1; ++it, i += stepSize) { 1353 assignValueI2(pair, i); 1354 *it = pair; 1355 } 1356 } 1357 } 1358 1359 // 2. Sort suffix array with quicksort 1360 TSize q = length(shape); 1361 if (lengthSum(stringSet) == length(sa)) 1362 ::std::sort( 1363 begin(sa, Standard()), 1364 end(sa, Standard()), 1365 QGramLess_<typename Value<TSA>::Type, TStringSet const>(stringSet, q)); 1366 else 1367 ::std::sort( 1368 begin(sa, Standard()), 1369 end(sa, Standard()), 1370 QGramLessNoCheck_<typename Value<TSA>::Type, TStringSet const>(stringSet, q)); 1371 } 1372 1373 template < 1374 typename TSA, 1375 typename TDir, 1376 typename TText, 1377 typename TSize1, 1378 typename TSize2 > 1379 void _refineQGramIndex( 1380 TSA &sa, 1381 TDir &dir, 1382 TText const &text, 1383 TSize1 oldQ, 1384 TSize2 newQ) 1385 { 1386 SEQAN_CHECKPOINT 1387 typedef typename Size<TSA>::Type TSize; 1388 typedef typename Iterator<TSA, Standard>::Type TIter; 1389 typedef typename Iterator<TDir, Standard>::Type TDirIter; 1390 1391 if (newQ <= (TSize2)oldQ) return; 1392 1393 if (length(dir) < 2) { 1394 ::std::sort( 1395 begin(sa, Standard()), 1396 end(sa, Standard()), 1397 QGramLess_<typename Value<TSA>::Type, TText const>(text, newQ)); 1398 return; 1399 } 1400 1401 // 1. Sort each bucket with quicksort and compare substrings s[i+oldQ..i+newQ) 1402 TDirIter dirIt = begin(dir, Standard()); 1403 TDirIter dirItEnd = end(dir, Standard()); 1404 TIter itBegin = begin(sa, Standard()); 1405 TIter itBktBegin = itBegin + *dirIt; 1406 TIter itBktEnd; 1407 ++dirIt; 1408 for(; dirIt != dirItEnd; ++dirIt, itBktBegin = itBktEnd) { 1409 itBktEnd = itBegin + *dirIt; 1410 if (itBktEnd - itBktBegin < 2) continue; 1411 ::std::sort( 1412 itBktBegin, 1413 itBktEnd, 1414 QGramLessOffset_<typename Value<TSA>::Type, TText const>(text, newQ - oldQ, oldQ)); 1415 } 1416 } 1417 1418 1419 ////////////////////////////////////////////////////////////////////////////// 1420 /** 1421 .Function.createQGramIndexDirOnly: 1422 ..summary:Builds the directory of a q-gram index on a sequence. 1423 ..cat:Index 1424 ..signature:createQGramIndexDirOnly(dir, bucketMap, text, shape, stepSize) 1425 ..param.dir:The resulting array that indicates at which position in index the corresponding q-grams can be found. 1426 ..param.bucketMap:Stores the q-gram hashes for the openaddressing hash maps, see @Function.indexBucketMap@. 1427 If bucketMap is of the type @Tag.Nothing@ the q-gram hash determines the bucket address in the index. 1428 ..param.text:The sequence. 1429 ..param.shape:The shape to be used. 1430 ...type:Class.Shape 1431 ..param.stepSize:Store every $stepSize$'th q-gram in the index. 1432 ..returns:Index contains the sorted list of qgrams. For each possible q-gram pos contains the first position in index that corresponds to this q-gram. 1433 ..remarks:This function should not be called directly. Please use @Function.indexCreate@ or @Function.indexRequire@. 1434 The resulting tables must have appropriate size before calling this function. 1435 ..include:seqan/index.h 1436 */ 1437 1438 template < 1439 typename TDir, 1440 typename TBucketMap, 1441 typename TText, 1442 typename TShape, 1443 typename TStepSize > 1444 void createQGramIndexDirOnly( 1445 TDir &dir, 1446 TBucketMap &bucketMap, 1447 TText const &text, 1448 TShape &shape, 1449 TStepSize stepSize) 1450 { 1451 SEQAN_CHECKPOINT 1452 1453 // 1. clear counters 1454 _qgramClearDir(dir, bucketMap); 1455 1456 // 2. count q-grams 1457 _qgramCountQGrams(dir, bucketMap, text, shape, stepSize); 1458 1459 // 3. cumulative sum (Step 4 is ommited) 1460 _qgramCummulativeSumAlt(dir, False()); 1461 } 1462 1463 template < 1464 typename TDir, 1465 typename TBucketMap, 1466 typename TString, 1467 typename TSpec, 1468 typename TShape, 1469 typename TStepSize > 1470 void createQGramIndexDirOnly( 1471 TDir &dir, 1472 TBucketMap &bucketMap, 1473 StringSet<TString, TSpec> const &stringSet, 1474 TShape &shape, 1475 TStepSize stepSize) 1476 { 1477 SEQAN_CHECKPOINT 1478 1479 // 1. clear counters 1480 _qgramClearDir(dir, bucketMap); 1481 1482 // 2. count q-grams 1483 _qgramCountQGrams(dir, bucketMap, stringSet, shape, stepSize); 1484 1485 // 3. cumulative sum (Step 4 is ommited) 1486 _qgramCummulativeSumAlt(dir, False()); 1487 } 1488 1489 1490 ////////////////////////////////////////////////////////////////////////////// 1491 /** 1492 .Function.createCountArray: 1493 ..summary:Builds an index on a StringSet storing how often a q-gram occurs in each sequence. 1494 ..cat:Index 1495 ..signature:createCountsArray(counts, dir, bucketMap, stringSet, shape, stepSize) 1496 ..param.counts:The resulting list of pairs (seqNo,count). 1497 ..param.dir:The resulting array that indicates at which position in the count table the corresponding a certain q-gram can be found. 1498 ..param.bucketMap:Stores the q-gram hashes for the openaddressing hash maps, see @Function.indexBucketMap@. 1499 If bucketMap is of the type @Tag.Nothing@ the q-gram hash determines the bucket address in the index. 1500 ..param.stringSet:The StringSet. 1501 ...type:Class.StringSet 1502 ..param.shape:The shape to be used. 1503 ...type:Class.Shape 1504 ..param.stepSize:Store every $stepSize$'th q-gram in the index. 1505 ..remarks:This function should not be called directly. Please use @Function.indexCreate@ or @Function.indexRequire@. 1506 The resulting tables must have appropriate size before calling this function. 1507 ..include:seqan/index.h 1508 */ 1509 1510 template < 1511 typename TCounts, 1512 typename TDir, 1513 typename TBucketMap, 1514 typename TString, 1515 typename TSpec, 1516 typename TShape, 1517 typename TStepSize > 1518 void createCountsArray( 1519 TCounts &counts, 1520 TDir &dir, 1521 TBucketMap &bucketMap, 1522 StringSet<TString, TSpec> const &stringSet, 1523 TShape shape, 1524 TStepSize stepSize) 1525 { 1526 SEQAN_CHECKPOINT 1527 typedef typename Iterator<TString const, Standard>::Type TIterator; 1528 typedef typename Iterator<TDir, Standard>::Type TDirIterator; 1529 typedef typename Value<TShape>::Type TValue; 1530 typedef typename Size<TString>::Type TSize; 1531 1532 TDir lastSeqSeen; 1533 resize(lastSeqSeen, length(dir)); 1534 1535 // 1. clear counters 1536 _qgramClearDir(dir, bucketMap); 1537 arrayFill(begin(lastSeqSeen, Standard()), end(lastSeqSeen, Standard()), -1); 1538 1539 // 2. count distinct sequences for each q-gram 1540 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1541 { 1542 TString const &sequence = value(stringSet, seqNo); 1543 if (length(sequence) < length(shape)) continue; 1544 TSize num_qgrams = (length(sequence) - length(shape)) / stepSize + 1; 1545 1546 TIterator itText = begin(sequence, Standard()); 1547 TValue bktNo = requestBucket(bucketMap, hash(shape, itText)); 1548 lastSeqSeen[bktNo] = seqNo; 1549 ++dir[bktNo]; 1550 if (stepSize == 1) 1551 for(TSize i = 1; i < num_qgrams; ++i) 1552 { 1553 ++itText; 1554 bktNo = requestBucket(bucketMap, hashNext(shape, itText)); 1555 if (seqNo != lastSeqSeen[bktNo]) { 1556 lastSeqSeen[bktNo] = seqNo; 1557 ++dir[bktNo]; 1558 } 1559 } 1560 else 1561 for(TSize i = 1; i < num_qgrams; ++i) 1562 { 1563 itText += stepSize; 1564 bktNo = requestBucket(bucketMap, hash(shape, itText)); 1565 if (seqNo != lastSeqSeen[bktNo]) { 1566 lastSeqSeen[bktNo] = seqNo; 1567 ++dir[bktNo]; 1568 } 1569 } 1570 } 1571 1572 // 3. cumulative sum 1573 resize(counts, _qgramCummulativeSum(dir, False())); 1574 1575 // 4. fill count array 1576 arrayFill(begin(lastSeqSeen, Standard()), end(lastSeqSeen, Standard()), -1); 1577 for(unsigned seqNo = 0; seqNo < length(stringSet); ++seqNo) 1578 { 1579 TString const &sequence = value(stringSet, seqNo); 1580 if (length(sequence) < length(shape)) continue; 1581 TSize num_qgrams = (length(sequence) - length(shape)) / stepSize + 1; 1582 1583 TIterator itText = begin(sequence, Standard()); 1584 TValue bktNo = getBucket(bucketMap, hash(shape, itText)); 1585 lastSeqSeen[bktNo] = seqNo; 1586 TSize pos = dir[bktNo + 1]++; 1587 counts[pos].i1 = seqNo; // first hash 1588 counts[pos].i2 = 1; 1589 1590 if (stepSize == 1) 1591 for(TSize i = 1; i < num_qgrams; ++i) 1592 { 1593 ++itText; 1594 bktNo = getBucket(bucketMap, hashNext(shape, itText)); 1595 if (seqNo == lastSeqSeen[bktNo]) 1596 ++(counts[dir[bktNo + 1] - 1].i2); 1597 else { 1598 lastSeqSeen[bktNo] = seqNo; 1599 pos = dir[bktNo + 1]++; 1600 counts[pos].i1 = seqNo; // next hash 1601 counts[pos].i2 = 1; 1602 } 1603 } 1604 else 1605 for(TSize i = 1; i < num_qgrams; ++i) 1606 { 1607 itText += stepSize; 1608 bktNo = getBucket(bucketMap, hash(shape, itText)); 1609 if (seqNo == lastSeqSeen[bktNo]) 1610 ++(counts[dir[bktNo + 1] - 1].i2); 1611 else { 1612 lastSeqSeen[bktNo] = seqNo; 1613 pos = dir[bktNo + 1]++; 1614 counts[pos].i1 = seqNo; // next hash 1615 counts[pos].i2 = 1; 1616 } 1617 } 1618 } 1619 } 1620 1621 1622 ////////////////////////////////////////////////////////////////////////////// 1623 // create q-gram index of *one* sequence in external memory 1624 1625 // *** COMPARATORS & MAPS *** 1626 1627 template <typename InType, typename Result = int> 1628 struct _qgramComp : public ::std::binary_function<InType,InType,Result> { 1629 inline Result operator()(InType const &a, InType const &b) const 1630 { 1631 typedef typename Value<InType, 2>::Type TQGram; 1632 typename Value<TQGram>::Type const *sa = a.i2.i; 1633 typename Value<TQGram>::Type const *sb = b.i2.i; 1634 typename Value<TQGram>::Type const *saEnd = sa + length(a.i2); 1635 1636 for(; sa != saEnd; ++sa, ++sb) { 1637 if (*sa == *sb) continue; 1638 return (*sa < *sb)? -1 : 1; 1639 } 1640 return posCompare(a.i1, b.i1); 1641 } 1642 }; 1643 1644 // optimized for bitvectors 1645 template <typename T1, typename TValue, unsigned _size, typename Result> 1646 struct _qgramComp< Pair<T1, Tuple<TValue, _size, Compressed>, Compressed >, Result > : 1647 public ::std::binary_function< 1648 Pair<T1, Tuple<TValue, _size, Compressed>, Compressed >, 1649 Pair<T1, Tuple<TValue, _size, Compressed>, Compressed >, 1650 Result> { 1651 inline Result operator()( 1652 const Pair<T1, Tuple<TValue, _size, Compressed>, Compressed > &a, 1653 const Pair<T1, Tuple<TValue, _size, Compressed>, Compressed > &b) const 1654 { 1655 if (a.i2 < b.i2) return -1; 1656 if (a.i2 > b.i2) return 1; 1657 return posCompare(a.i1, b.i1); 1658 } 1659 }; 1660 1661 1662 template <typename TValue, typename TResult = unsigned> 1663 struct _qgramHash : public ::std::unary_function<TValue, TResult> { 1664 inline TResult operator()(TValue const &a) const 1665 { 1666 typedef typename Value<TValue, 2>::Type TQGram; 1667 TResult hash = 0; 1668 unsigned len = length(a.i2); 1669 for (unsigned i = 0; i < len; ++i) { 1670 hash *= ValueSize< typename Value<TQGram>::Type >::VALUE; 1671 hash += ordValue(a.i2[i]); 1672 } 1673 return hash; 1674 } 1675 }; 1676 1677 // TODO: replace fixed tuple size of 6 with q and add q to Shape template arguments 1678 template < 1679 typename TSA, 1680 typename TDir, 1681 typename TText, 1682 typename TValue, 1683 unsigned q > 1684 void createQGramIndexExt( 1685 TSA &suffixArray, 1686 TDir &dir, 1687 TText &text, 1688 Shape<TValue, UngappedShape<q> >) 1689 { 1690 // signed characters behave different than unsigned when compared 1691 // to get the same index with signed or unsigned chars we simply cast them to unsigned 1692 // before feeding them into the pipeline 1693 typedef Shape<TValue, UngappedShape<q> > TShape; 1694 typedef typename MakeUnsigned_<TValue>::Type TUValue; 1695 1696 // *** SPECIALIZATION *** 1697 1698 typedef Pipe< TText, Source<> > TSource; 1699 typedef Pipe< TSource, Caster<TUValue, CasterConvert> > TUnsigner; 1700 typedef Pipe< TUnsigner, Tupler<q> > TTupler; 1701 typedef _qgramComp<TypeOf_(TTupler)> qcomp_t; 1702 typedef Pool< 1703 TypeOf_(TTupler), 1704 SorterSpec< SorterConfigSize<qcomp_t, TSizeOf_(TTupler) > > 1705 > TSortTuples; 1706 typedef _qgramHash<TypeOf_(TTupler), typename Size<TDir>::Type> qhash_t; 1707 1708 // *** INSTANTIATION *** 1709 1710 TSource src(text); 1711 TUnsigner unsigner(src); 1712 TTupler tupler(unsigner); 1713 TSortTuples sorter; 1714 1715 // sort q-grams 1716 sorter << tupler; 1717 1718 // fill sa and dir 1719 if (!beginRead(sorter)) return; 1720 1721 typename Iterator<TSA>::Type itSA = begin(suffixArray); 1722 typename Iterator<TDir>::Type itDir = begin(dir); 1723 1724 qcomp_t qcomp; 1725 qhash_t qhash; 1726 1727 typename Value<TSortTuples>::Type old_qgram; 1728 typename Size<TDir>::Type hash, old_hash = 0; 1729 typename Size<TSortTuples>::Type leftToRead = length(sorter); 1730 bool first = true; 1731 1732 for (leftToRead = length(sorter); leftToRead > 0; --leftToRead, ++sorter, ++itSA) 1733 { 1734 // copy occurence position 1735 *itSA = (*sorter).i1; 1736 if (first || qcomp(old_qgram, *sorter) != 0) 1737 { 1738 old_qgram = *sorter; 1739 hash = qhash(old_qgram); 1740 1741 SEQAN_ASSERT(old_hash < hash); 1742 1743 // copy bucket begin 1744 typename Size<TSortTuples>::Type i = length(sorter) - leftToRead; 1745 for(; old_hash < hash; ++old_hash, ++itDir) 1746 *itDir = i; 1747 first = false; 1748 } 1749 } 1750 1751 // fill bucket table 1752 typename Size<TSortTuples>::Type i = length(sorter); 1753 hash = length(dir); 1754 for(; old_hash < hash; ++old_hash, ++itDir) 1755 *itDir = i; 1756 1757 endRead(sorter); 1758 } 1759 1760 1761 ////////////////////////////////////////////////////////////////////////////// 1762 // create q-gram index of *multiple* sequences in external memory 1763 1764 template < 1765 typename TSA, 1766 typename TDir, 1767 typename TString, 1768 typename TSpec, 1769 typename TValue, 1770 unsigned q, 1771 typename TLimitsString > 1772 void createQGramIndexExt( 1773 TSA &suffixArray, 1774 TDir &dir, 1775 StringSet<TString, TSpec> const &stringSet, 1776 Shape<TValue, UngappedShape<q> >, 1777 TLimitsString &limits) 1778 { 1779 // signed characters behave different than unsigned when compared 1780 // to get the same index with signed or unsigned chars we simply cast them to unsigned 1781 // before feeding them into the pipeline 1782 typedef typename Concatenator<StringSet<TString, TSpec> >::Type TConcat; 1783 typedef Shape<TValue, UngappedShape<q> > TShape; 1784 typedef typename MakeUnsigned_<TValue>::Type TUValue; 1785 typedef Multi< 1786 Tupler<q, true, Compressed>, 1787 typename Value<TSA>::Type, 1788 typename StringSetLimits< StringSet<TString, TSpec> >::Type > TTuplerSpec; 1789 1790 // *** SPECIALIZATION *** 1791 1792 typedef Pipe< TConcat, Source<> > TSource; 1793 typedef Pipe< TSource, Caster<TUValue, CasterConvert> > TUnsigner; 1794 typedef Pipe< TUnsigner, TTuplerSpec > TTupler; 1795 typedef _qgramComp<TypeOf_(TTupler)> qcomp_t; 1796 typedef Pool< 1797 TypeOf_(TTupler), 1798 SorterSpec< SorterConfigSize<qcomp_t, TSizeOf_(TTupler) > > 1799 > TSortTuples; 1800 typedef _qgramHash<TypeOf_(TTupler), typename Size<TDir>::Type> qhash_t; 1801 1802 // *** INSTANTIATION *** 1803 1804 TSource src(concat(stringSet)); 1805 TUnsigner unsigner(src); 1806 TTupler tupler(unsigner, limits); 1807 TSortTuples sorter; 1808 1809 // sort q-grams 1810 sorter << tupler; 1811 1812 // fill sa and dir 1813 if (!beginRead(sorter)) return; 1814 1815 typename Iterator<TSA>::Type itSA = begin(suffixArray); 1816 typename Iterator<TDir>::Type itDir = begin(dir); 1817 1818 qcomp_t qcomp; 1819 qhash_t qhash; 1820 1821 typename Value<TSortTuples>::Type old_qgram = *sorter; 1822 typename Size<TDir>::Type hash, old_hash = 0; 1823 typename Size<TSortTuples>::Type leftToRead; 1824 bool first = true; 1825 1826 for (leftToRead = length(sorter); leftToRead > 0; --leftToRead, ++sorter, ++itSA) 1827 { 1828 // copy occurence position 1829 *itSA = (*sorter).i1; 1830 1831 if (first || qcomp(old_qgram, *sorter) != 0) 1832 { 1833 old_qgram = *sorter; 1834 hash = qhash(old_qgram); 1835 1836 SEQAN_ASSERT_LEQ(old_hash, hash); 1837 1838 // copy bucket begin 1839 typename Size<TSortTuples>::Type i = length(sorter) - leftToRead; 1840 for(; old_hash < hash; ++old_hash, ++itDir) 1841 *itDir = i; 1842 first = false; 1843 } 1844 } 1845 1846 // fill bucket table 1847 typename Size<TSortTuples>::Type i = length(sorter); 1848 hash = length(dir); 1849 for(; old_hash < hash; ++old_hash, ++itDir) 1850 *itDir = i; 1851 1852 endRead(sorter); 1853 } 1854 1855 1856 1857 ////////////////////////////////////////////////////////////////////////////// 1858 // interface for automatic index creation 1859 1860 template <typename TText, typename TShapeSpec, typename TSpec> 1861 inline typename Size<Index<TText, IndexQGram<TShapeSpec, TSpec> > >::Type 1862 _qgramQGramCount(Index<TText, IndexQGram<TShapeSpec, TSpec> > const &index) 1863 { 1864 typedef Index<TText, IndexQGram<TShapeSpec, TSpec> > TIndex; 1865 typedef typename Fibre<TIndex, QGramShape>::Type TShape; 1866 typedef typename Size<TIndex>::Type TSize; 1867 1868 TShape const &shape = indexShape(index); 1869 1870 // count all overlapping q-grams 1871 TSize stepSize = getStepSize(index); 1872 TSize qgramCount = 0; 1873 for(unsigned i = 0; i < countSequences(index); ++i) 1874 if (sequenceLength(i, index) >= length(shape)) 1875 qgramCount += (sequenceLength(i, index) - length(shape)) / stepSize + 1; 1876 return qgramCount; 1877 } 1878 1879 template <typename TText, typename TShapeSpec, typename TSpec> 1880 inline bool indexCreate( 1881 Index<TText, IndexQGram<TShapeSpec, TSpec> > &index, 1882 FibreSADir, 1883 Default const) 1884 { 1885 resize(indexSA(index), _qgramQGramCount(index), Exact()); 1886 resize(indexDir(index), _fullDirLength(index), Exact()); 1887 createQGramIndex(index); 1888 return true; 1889 } 1890 1891 template <typename TText, typename TSpec> 1892 inline bool indexSupplied(Index<TText, TSpec> &index, FibreSADir) { 1893 return !(empty(getFibre(index, FibreSA())) || empty(getFibre(index, FibreDir()))); 1894 } 1895 1896 template <typename TText, typename TShapeSpec, typename TSpec> 1897 inline bool indexCreate( 1898 Index<TText, IndexQGram<TShapeSpec, TSpec> > &index, 1899 FibreSA, 1900 Default const) 1901 { 1902 resize(indexSA(index), _qgramQGramCount(index), Exact()); 1903 createQGramIndexSAOnly(indexSA(index), indexText(index), indexShape(index), getStepSize(index)); 1904 return true; 1905 } 1906 1907 template <typename TText, typename TShapeSpec, typename TSpec> 1908 inline bool indexCreate( 1909 Index<TText, IndexQGram<TShapeSpec, TSpec> > &index, 1910 FibreCounts, 1911 Default const) 1912 { 1913 resize(indexCountsDir(index), _fullDirLength(index), Exact()); 1914 createCountsArray(indexCounts(index), indexCountsDir(index), indexBucketMap(index), indexText(index), indexShape(index), getStepSize(index)); 1915 return true; 1916 } 1917 1918 template <typename TText, typename TShapeSpec, typename TSpec> 1919 inline bool indexCreate( 1920 Index<TText, IndexQGram<TShapeSpec, TSpec> > &index, 1921 FibreDir, 1922 Default const) 1923 { 1924 typedef Index<TText, IndexQGram<TShapeSpec, TSpec> > TIndex; 1925 1926 resize(indexDir(index), _fullDirLength(index), Exact()); 1927 createQGramIndexDirOnly(indexDir(index), indexBucketMap(index), indexText(index), indexShape(index), getStepSize(index)); 1928 return true; 1929 } 1930 1931 1932 ////////////////////////////////////////////////////////////////////////////// 1933 /** 1934 .Function.getKmerSimilarityMatrix: 1935 ..summary:Creates a matrix storing the number of common q-grams between all pairs of sequences. 1936 ..cat:Index 1937 ..signature:getKmerSimilarityMatrix(index, distMat[, seqSet]) 1938 ..param.index:A q-gram index. 1939 ...type:Spec.IndexQGram 1940 ..param.distMat:The resulting q-gram similarity matrix. 1941 ...type:Concept.Container 1942 ..param.seqSet:Contains sequence numbers if only a subset of sequences should be compared. 1943 ...type:Concept.Container 1944 ..remarks:$distMat$ will be resized to $seqCount*seqCount$, where $seqCount$ is the number of sequences in the index/in $seqSet$. 1945 The number of common q-grams between sequence $i$ and $j$ is stored at position $i*seqCount + j$. 1946 It sums up the minimum number of q-gram occurrences between both sequences for each q-gram. 1947 ..include:seqan/index.h 1948 */ 1949 1950 template < typename TObject, typename TShapeSpec, typename TSpec, typename TDistMatrix > 1951 inline void getKmerSimilarityMatrix( 1952 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 1953 TDistMatrix &distMat) 1954 { 1955 typedef Index< TObject, IndexQGram<TShapeSpec,TSpec> > TIndex; 1956 typedef typename Size<TIndex>::Type TSize; 1957 typedef typename Size<TDistMatrix>::Type TSizeMat; 1958 typedef typename Value<TDistMatrix>::Type TValueMat; 1959 1960 typedef typename Fibre<TIndex, QGramCountsDir>::Type TCountsDir; 1961 typedef typename Iterator<TCountsDir, Standard>::Type TIterCountsDir; 1962 typedef typename Fibre<TIndex, QGramCounts>::Type TCounts; 1963 typedef typename Iterator<TCounts, Standard>::Type TIterCounts; 1964 1965 // declare requirements 1966 indexRequire(index, QGramCounts()); 1967 1968 // initialize distance matrix 1969 TSizeMat seqNoLength = countSequences(index); 1970 clear(distMat); 1971 resize(distMat, seqNoLength * seqNoLength); 1972 arrayFill(begin(distMat, Standard()), end(distMat, Standard()), 0); 1973 1974 TIterCountsDir itCountsDir = begin(indexCountsDir(index), Standard()); 1975 TIterCountsDir itCountsDirEnd = end(indexCountsDir(index), Standard()); 1976 TIterCounts itCountsBegin = begin(indexCounts(index), Standard()); 1977 1978 // for each bucket count common q-grams for each sequence pair 1979 TSize bucketBegin = *itCountsDir; 1980 for(++itCountsDir; itCountsDir != itCountsDirEnd; ++itCountsDir) 1981 { 1982 TSize bucketEnd = *itCountsDir; 1983 1984 // q-gram must occur in at least 2 different sequences 1985 if (bucketBegin != bucketEnd) 1986 { 1987 TIterCounts itA = itCountsBegin + bucketBegin; 1988 TIterCounts itEnd = itCountsBegin + bucketEnd; 1989 for(; itA != itEnd; ++itA) 1990 { 1991 TSizeMat ofs = (*itA).i1 * seqNoLength; 1992 TSize countA = (*itA).i2; 1993 TIterCounts itB = itA; 1994 1995 for(; itB != itEnd; ++itB) 1996 { 1997 TSize countB = (*itB).i2; 1998 if (countA < countB) 1999 distMat[ofs + (*itB).i1] += countA; 2000 else 2001 distMat[ofs + (*itB).i1] += countB; 2002 } 2003 } 2004 } 2005 bucketBegin = bucketEnd; 2006 } 2007 2008 // copy upper triangle to lower triangle and scale 2009 for(TSizeMat row = 0; row < seqNoLength; ++row) 2010 { 2011 TValueMat maxValRow = distMat[row * (seqNoLength + 1)]; 2012 for(TSizeMat col = row + 1; col < seqNoLength; ++col) 2013 { 2014 // fractional common kmer count 2015 TValueMat maxValCol = distMat[col * (seqNoLength + 1)]; 2016 TValueMat val = distMat[row * seqNoLength + col]; 2017 2018 // number of common q-grams / Number of possible common q-grams 2019 if (maxValRow < maxValCol) { 2020 if (maxValRow != 0) 2021 val /= maxValRow; 2022 distMat[col * seqNoLength + row] = val; 2023 distMat[row * seqNoLength + col] = val; 2024 } else { 2025 if (maxValCol != 0) 2026 val /= maxValCol; 2027 distMat[col * seqNoLength + row] = val; 2028 distMat[row * seqNoLength + col] = val; 2029 } 2030 } 2031 } 2032 2033 // set diagonal to 1 2034 for(TSizeMat i = 0; i < seqNoLength; ++i) 2035 distMat[i * (seqNoLength + 1)] = 1; 2036 } 2037 2038 2039 ////////////////////////////////////////////////////////////////////////////// 2040 // getKmerSimilarityMatrix 2041 // for a subset of the StringSet given as a sorted string of sequence numbers 2042 2043 template < 2044 typename TObject, 2045 typename TShapeSpec, 2046 typename TSpec, 2047 typename TDistMatrix, 2048 typename TSeqNoString 2049 > 2050 inline void getKmerSimilarityMatrix( 2051 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2052 TDistMatrix &distMat, 2053 TSeqNoString const &seqNo) 2054 { 2055 typedef Index< TObject, IndexQGram<TShapeSpec,TSpec> > TIndex; 2056 typedef typename Size<TIndex>::Type TSize; 2057 typedef typename Size<TDistMatrix>::Type TSizeMat; 2058 typedef typename Value<TDistMatrix>::Type TValueMat; 2059 2060 typedef typename Fibre<TIndex, QGramCountsDir>::Type TCountsDir; 2061 typedef typename Iterator<TCountsDir, Standard>::Type TIterCountsDir; 2062 typedef typename Fibre<TIndex, QGramCounts>::Type TCounts; 2063 typedef typename Iterator<TCounts, Standard>::Type TIterCounts; 2064 typedef typename Iterator<TSeqNoString, Standard>::Type TIterSeqNo; 2065 2066 // declare requirements 2067 indexRequire(index, QGramCounts()); 2068 2069 // initialize distance matrix 2070 TSizeMat seqNoLength = length(seqNo); 2071 clear(distMat); 2072 resize(distMat, seqNoLength * seqNoLength); 2073 arrayFill(begin(distMat, Standard()), end(distMat, Standard()), 0); 2074 2075 TIterCountsDir itCountsDir = begin(indexCountsDir(index), Standard()); 2076 TIterCountsDir itCountsDirEnd = end(indexCountsDir(index), Standard()); 2077 TIterCounts itCountsBegin = begin(indexCounts(index), Standard()); 2078 TIterSeqNo itSetBegin = begin(seqNo, Standard()); 2079 TIterSeqNo itSetEnd = end(seqNo, Standard()); 2080 2081 // for each bucket count common q-grams for each sequence pair 2082 TSize bucketBegin = *itCountsDir; 2083 for(++itCountsDir; itCountsDir != itCountsDirEnd; ++itCountsDir) 2084 { 2085 TSize bucketEnd = *itCountsDir; 2086 2087 // q-gram must occur in at least 2 different sequences 2088 if (bucketBegin != bucketEnd) 2089 { 2090 TIterCounts itA = itCountsBegin + bucketBegin; 2091 TIterCounts itEnd = itCountsBegin + bucketEnd; 2092 TIterSeqNo itSetA = itSetBegin; 2093 2094 while (itA != itEnd && itSetA != itSetEnd) 2095 { 2096 if ((*itA).i1 < *itSetA) 2097 ++itA; 2098 else if ((*itA).i1 > *itSetA) 2099 ++itSetA; 2100 else 2101 { 2102 TSizeMat ofs = (itSetA - itSetBegin) * seqNoLength; 2103 TSize countA = (*itA).i2; 2104 TIterCounts itB = itA; 2105 TIterSeqNo itSetB = itSetA; 2106 2107 while (itB != itEnd && itSetB != itSetEnd) 2108 { 2109 if ((*itB).i1 < *itSetB) 2110 ++itB; 2111 else if ((*itB).i1 > *itSetB) 2112 ++itSetB; 2113 else 2114 { 2115 TSize countB = (*itB).i2; 2116 if (countA < countB) 2117 distMat[ofs + (itSetB - itSetBegin)] += countA; 2118 else 2119 distMat[ofs + (itSetB - itSetBegin)] += countB; 2120 ++itB; 2121 ++itSetB; 2122 } 2123 } 2124 ++itA; 2125 ++itSetA; 2126 } 2127 } 2128 } 2129 bucketBegin = bucketEnd; 2130 } 2131 2132 // copy upper triangle to lower triangle and scale 2133 for(TSizeMat row = 0; row < seqNoLength; ++row) 2134 { 2135 TValueMat maxValRow = distMat[row * (seqNoLength + 1)]; 2136 for(TSizeMat col = row + 1; col < seqNoLength; ++col) 2137 { 2138 // fractional common kmer count 2139 TValueMat maxValCol = distMat[col * (seqNoLength + 1)]; 2140 TValueMat val = distMat[row * seqNoLength + col]; 2141 2142 // number of common q-grams / Number of possible common q-grams 2143 if (maxValRow < maxValCol) { 2144 if (maxValRow != 0) 2145 val /= maxValRow; 2146 distMat[col * seqNoLength + row] = val; 2147 distMat[row * seqNoLength + col] = val; 2148 } else { 2149 if (maxValCol != 0) 2150 val /= maxValCol; 2151 distMat[col * seqNoLength + row] = val; 2152 distMat[row * seqNoLength + col] = val; 2153 } 2154 } 2155 } 2156 2157 // set diagonal to 1 2158 for(TSizeMat i = 0; i < seqNoLength; ++i) 2159 distMat[i * (seqNoLength + 1)] = 1; 2160 } 2161 2162 2163 ////////////////////////////////////////////////////////////////////////////// 2164 /** 2165 .Function.range: 2166 ..signature:range(index, shape) 2167 ..param.index:A q-gram index. 2168 ...type:Spec.IndexQGram 2169 ..param.shape:A shape object. 2170 ...note:The shape stores the q-gram of the last call to @Function.hash@ or @Function.hashNext@. 2171 ...type:Class.Shape 2172 ..returns:All positions where the q-gram stored in $shape$ occurs in the text (see @Tag.QGram Index Fibres.QGramText@) 2173 are stored in a contiguous range of the suffix array. 2174 $range$ returns begin and end position of this range. 2175 If the type of $index$ is $TIndex$ the return type is $Pair<Size<TIndex>::Type>. 2176 ..note:The necessary index tables are built on-demand via @Function.indexRequire@ if index is not $const$. 2177 ..include:seqan/index.h 2178 */ 2179 2180 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2181 inline Pair<typename Size< Index< TObject, IndexQGram<TShapeSpec, TSpec> > >::Type> 2182 range( 2183 Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &index, 2184 Shape< TValue, TShapeSpec2 > const &shape) 2185 { 2186 typedef typename Size< Index< TObject, IndexQGram<TShapeSpec, TSpec> > >::Type TSize; 2187 typedef typename Size< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreDir>::Type>::Type TDirSize; 2188 TDirSize bucket = getBucket(indexBucketMap(index), value(shape)); 2189 return Pair<TSize>(indexDir(index)[bucket], indexDir(index)[bucket + 1]); 2190 } 2191 2192 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2193 inline Pair<typename Size< Index< TObject, IndexQGram<TShapeSpec, TSpec> > >::Type> 2194 range( 2195 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2196 Shape< TValue, TShapeSpec2 > const &shape) 2197 { 2198 indexRequire(index, QGramDir()); 2199 return getOccurrences(const_cast<Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &>(index), shape); 2200 } 2201 2202 ////////////////////////////////////////////////////////////////////////////// 2203 /** 2204 .Function.getOccurrence: 2205 ..signature:getOccurrence(index, shape) 2206 ..param.index:A q-gram index. 2207 ...type:Spec.IndexQGram 2208 ..param.shape:A shape object. 2209 ...note:The shape stores the q-gram of the last call to @Function.hash@ or @Function.hashNext@. 2210 ...type:Class.Shape 2211 ..returns:A position where the q-gram stored in $shape$ occurs in the text (see @Tag.QGram Index Fibres.QGramText@). 2212 If the type of $index$ is $TIndex$ the return type is $SAValue<TIndex>::Type$. 2213 ..include:seqan/index.h 2214 */ 2215 2216 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2217 inline typename SAValue< Index< TObject, IndexQGram<TShapeSpec, TSpec> > >::Type 2218 getOccurrence( 2219 Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &index, 2220 Shape< TValue, TShapeSpec2 > const &shape) 2221 { 2222 return saAt(indexDir(index)[getBucket(indexBucketMap(index), value(shape))], index); 2223 } 2224 2225 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2226 inline typename SAValue< Index< TObject, IndexQGram<TShapeSpec, TSpec> > >::Type 2227 getOccurrence( 2228 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2229 Shape< TValue, TShapeSpec2 > const &shape) 2230 { 2231 indexRequire(index, QGramSADir()); 2232 return getOccurrence(const_cast<Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &>(index), shape); 2233 } 2234 2235 ////////////////////////////////////////////////////////////////////////////// 2236 /** 2237 .Function.getOccurrences: 2238 ..signature:getOccurrences(index, shape) 2239 ..param.index:A q-gram index. 2240 ...type:Spec.IndexQGram 2241 ..param.shape:A shape object. 2242 ...note:The shape stores the q-gram of the last call to @Function.hash@ or @Function.hashNext@. 2243 ...type:Class.Shape 2244 ..returns:All positions where the q-gram stored in $shape$ occurs in the text (see @Tag.QGram Index Fibres.QGramText@). 2245 If the type of $index$ is $TIndex$ the return type is $Infix<Fibre<TIndex, QGramSA>::Type const>::Type$. 2246 ..remarks:The necessary index tables are built on-demand via @Function.indexRequire@ if index is not $const$. 2247 ..include:seqan/index.h 2248 */ 2249 2250 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2251 inline typename Infix< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreSA>::Type const >::Type 2252 getOccurrences( 2253 Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &index, 2254 Shape< TValue, TShapeSpec2 > const &shape) 2255 { 2256 typedef typename Size<typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreDir>::Type>::Type TDirSize; 2257 TDirSize bucket = getBucket(indexBucketMap(index), value(shape)); 2258 return infix(indexSA(index), indexDir(index)[bucket], indexDir(index)[bucket + 1]); 2259 } 2260 2261 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2262 inline typename Infix< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreSA>::Type const >::Type 2263 getOccurrences( 2264 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2265 Shape< TValue, TShapeSpec2 > const &shape) 2266 { 2267 indexRequire(index, QGramSADir()); 2268 return getOccurrences(const_cast<Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &>(index), shape); 2269 } 2270 2271 ////////////////////////////////////////////////////////////////////////////// 2272 /** 2273 .Function.countOccurrences: 2274 ..signature:countOccurrences(index, shape) 2275 ..param.index:A q-gram index. 2276 ...type:Spec.IndexQGram 2277 ..param.shape:A shape object. 2278 ...note:The shape stores the q-gram of the last call to @Function.hash@ or @Function.hashNext@. 2279 ...type:Class.Shape 2280 ..returns:The number of positions where the q-gram stored in $shape$ occurs in the text (see @Tag.QGram Index Fibres.QGramText@). 2281 If the type of $index$ is $TIndex$ the return type is $Size<TIndex>::Type$. 2282 ..note:The necessary index tables are built on-demand via @Function.indexRequire@ if index is not $const$. 2283 ..include:seqan/index.h 2284 */ 2285 2286 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2287 inline typename Size< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreSA>::Type const >::Type 2288 countOccurrences( 2289 Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &index, 2290 Shape< TValue, TShapeSpec2 > const &shape) 2291 { 2292 typedef typename Size<typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreDir>::Type>::Type TDirSize; 2293 TDirSize bucket = getBucket(indexBucketMap(index), value(shape)); 2294 return indexDir(index)[bucket + 1] - indexDir(index)[bucket]; 2295 } 2296 2297 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2298 inline typename Size< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreSA>::Type const >::Type 2299 countOccurrences( 2300 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2301 Shape< TValue, TShapeSpec2 > const &shape) 2302 { 2303 indexRequire(index, QGramDir()); 2304 return countOccurrences(const_cast<Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &>(index), shape); 2305 } 2306 2307 ////////////////////////////////////////////////////////////////////////////// 2308 /** 2309 .Function.countOccurrencesMultiple: 2310 ..summary:Returns the number of occurences of a q-gram for every sequence of a @Class.StringSet@ . 2311 ..signature:countOccurrencesMultiple(index, shape) 2312 ..param.index:A q-gram index of a @Class.StringSet@. 2313 ...type:Spec.IndexQGram 2314 ..param.shape:A shape object. 2315 ...note:The shape stores the q-gram of the last call to @Function.hash@ or @Function.hashNext@. 2316 ...type:Class.Shape 2317 ..returns:A sequence of @Class.Pair.pairs@ (seqNo,count), count>0. 2318 For every @Class.StringSet@ sequence the q-gram occurs in, seqNo is the sequence number and count the number of occurrences. 2319 If the type of $index$ is $TIndex$ the return type is $Infix<Fibre<TIndex, QGramCounts>::Type const>::Type$. 2320 ..remarks:The necessary index tables are built on-demand via @Function.indexRequire@ if index is not $const$. 2321 ..see:Function.countOccurrences 2322 ..include:seqan/index.h 2323 */ 2324 2325 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2326 inline typename Infix< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreCounts>::Type const >::Type 2327 countOccurrencesMultiple( 2328 Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &index, 2329 Shape< TValue, TShapeSpec2 > const &shape) 2330 { 2331 typedef typename Size<typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreCountsDir>::Type>::Type TDirSize; 2332 TDirSize bucket = getBucket(indexBucketMap(index), value(shape)); 2333 return infix(indexCounts(index), indexCountsDir(index)[bucket], indexCountsDir(index)[bucket + 1]); 2334 } 2335 2336 template < typename TObject, typename TShapeSpec, typename TSpec, typename TShapeSpec2, typename TValue > 2337 inline typename Infix< typename Fibre< Index< TObject, IndexQGram<TShapeSpec, TSpec> >, FibreCounts>::Type const >::Type 2338 countOccurrencesMultiple( 2339 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2340 Shape< TValue, TShapeSpec2 > const &shape) 2341 { 2342 indexRequire(index, QGramCounts()); 2343 return countOccurrencesMultiple(const_cast<Index< TObject, IndexQGram<TShapeSpec, TSpec> > const &>(index), shape); 2344 } 2345 2346 2347 ////////////////////////////////////////////////////////////////////////////// 2348 // clear 2349 2350 template < typename TText, typename TShapeSpec, typename TSpec > 2351 inline void clear(Index<TText, IndexQGram<TShapeSpec, TSpec> > &index) 2352 { 2353 clear(getFibre(index, QGramSA())); 2354 clear(getFibre(index, QGramDir())); 2355 clear(getFibre(index, QGramCounts())); 2356 clear(getFibre(index, QGramCountsDir())); 2357 } 2358 2359 ////////////////////////////////////////////////////////////////////////////// 2360 // open 2361 2362 template < typename TObject, typename TShapeSpec, typename TSpec > 2363 inline bool open( 2364 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2365 const char *fileName, 2366 int openMode) 2367 { 2368 String<char> name; 2369 name = fileName; append(name, ".txt"); 2370 bool result = true; 2371 if ((!open(getFibre(index, QGramText()), toCString(name), openMode)) && 2372 (!open(getFibre(index, QGramText()), fileName, openMode))) 2373 result = false; 2374 2375 name = fileName; append(name, ".sa"); open(getFibre(index, QGramSA()), toCString(name), openMode); 2376 name = fileName; append(name, ".dir"); open(getFibre(index, QGramDir()), toCString(name), openMode); 2377 return result; 2378 } 2379 template < typename TObject, typename TShapeSpec, typename TSpec > 2380 inline bool open( 2381 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2382 const char *fileName) 2383 { 2384 return open(index, fileName, OPEN_RDONLY); 2385 } 2386 2387 2388 ////////////////////////////////////////////////////////////////////////////// 2389 // save 2390 2391 template < typename TObject, typename TShapeSpec, typename TSpec > 2392 inline bool save( 2393 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2394 const char *fileName, 2395 int openMode) 2396 { 2397 String<char> name; 2398 name = fileName; append(name, ".txt"); 2399 if ((!save(getFibre(index, QGramText()), toCString(name), openMode)) && 2400 (!save(getFibre(index, QGramText()), fileName, openMode))) return false; 2401 2402 name = fileName; append(name, ".sa"); save(getFibre(index, QGramSA()), toCString(name), openMode); 2403 name = fileName; append(name, ".dir"); save(getFibre(index, QGramDir()), toCString(name), openMode); 2404 return true; 2405 } 2406 template < typename TObject, typename TShapeSpec, typename TSpec > 2407 inline bool save( 2408 Index< TObject, IndexQGram<TShapeSpec, TSpec> > &index, 2409 const char *fileName) 2410 { 2411 return save(index, fileName, OPEN_WRONLY | OPEN_CREATE); 2412 } 2413 2414 } 2415 2416 #endif //#ifndef SEQAN_HEADER_... 2417