1 /* $Id: compart_matching.cpp 536658 2017-05-22 15:48:20Z zaretska $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Yuri Kapustin
27 *
28 * ===========================================================================
29 */
30
31
32 #include <ncbi_pch.hpp>
33
34 #include <math.h>
35
36 #include <corelib/ncbi_system.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <util/random_gen.hpp>
39 #include <algo/align/util/compartment_finder.hpp>
40 #include <objtools/blast/seqdb_reader/seqdb.hpp>
41 #include <algo/align/splign/compart_matching.hpp>
42
43 BEGIN_NCBI_SCOPE
44
45 vector<CSeq_id_Handle> CBlastSequenceSource::s_ids;
46
47
48 namespace {
49
50 // N-mer size for repeat filtering
51 const Uint4 kNr (14);
52
53 // total number of Nr-mers for repeat filtering
54 const size_t kNrMersTotal (1 << (kNr * 2));
55
56 // percentile used for repeat filtering
57 const double kRepeatsPercentile (0.995);
58
59 // N-mer size for matching
60 const Uint4 kN (16);
61
62 // total number of N-mers for indexing
63 const Uint8 kNMersTotal (Uint8(1) << (kN * 2));
64
65 // extension to indicate repeat filtering table
66 const char *kFileExt_Masked = ".rep";
67
68 // extension to indicate ID and coordinate remapping table
69 const char *kFileExt_Remap = ".idc";
70 const char *kFileExt_Offsets = ".ofs";
71 const char *kFileExt_Positions = ".pos";
72
73 const Uint8 kUI8_LoWord (0xFFFFFFFF);
74 const Uint8 kUI8_LoByte (kUI8_LoWord >> 24);
75 const Uint8 kUI8_MidWord (kUI8_LoWord << 16);
76 const Uint8 kUI8_HiWord (kUI8_LoWord << 32);
77 const Uint8 kUI8_LoFive (kUI8_LoWord | (kUI8_LoWord << 8));
78
79 const Uint8 kUI8_LoHalfWordEachByte ((Uint8(0x0F0F0F0F) << 32) | 0x0F0F0F0F);
80
81 const Uint4 kUI4_Lo28 (0xFFFFFFFF >> 4);
82
83 const Uint8 kUI8_SeqDb_lo (0x03030303);
84 const Uint8 kUI8_SeqDb ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);
85
86 const Int8 kDiagMax (numeric_limits<Int8>::max());
87
88 const Uint8 kSeqDbMemBound (512 * 1024 * 1024);
89 const size_t kMapGran (512 * 1024 * 1024);
90 }
91
92
DecodeSeqDbChar(Uint1 c)93 char DecodeSeqDbChar(Uint1 c)
94 {
95 char rv ('*');
96 switch(c) {
97 case 0: rv = 'A'; break;
98 case 1: rv = 'C'; break;
99 case 2: rv = 'G'; break;
100 case 3: rv = 'T'; break;
101 }
102 return rv;
103 }
104
105 //#define ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
107
108 template<typename T>
GetBinString(T v)109 string GetBinString(T v)
110 {
111 vector<bool> vb;
112
113 const size_t bitdim (sizeof(T) * 8);
114 for(size_t i(0); i < bitdim; ++i) {
115 vb.push_back(v & 1);
116 v >>= 1;
117 }
118
119 CNcbiOstrstream ostr;
120 for(size_t i (bitdim); i > 0; --i) {
121 if(i < bitdim && i % 8 == 0) {
122 ostr << ' ';
123 }
124 ostr << vb[i-1];
125 }
126
127 const string rv = CNcbiOstrstreamToString(ostr);
128 return rv;
129 }
130
131 #endif
132
CheckWrittenFile(const string & filename,const Uint8 & len_bytes)133 void CheckWrittenFile(const string& filename, const Uint8& len_bytes)
134 {
135 Int8 reported_len (-1);
136 for(size_t attempt(0); attempt < 1; ++attempt) {
137 reported_len = CFile(filename).GetLength();
138 if(reported_len >= 0 && Uint8(reported_len) == len_bytes) {
139 return;
140 }
141 else {
142 SleepSec(1);
143 }
144 }
145
146 CNcbiOstrstream ostr;
147
148 if(reported_len < 0) {
149 ostr << "Cannot write " << filename
150 << " (error code = " << reported_len << "). ";
151 }
152 else {
153 ostr << "The size of " << filename << " (" << reported_len << ')'
154 << " is different from the expected " << len_bytes << ". ";
155 }
156 ostr << "Please make sure there is enough disk space.";
157
158 const string errmsg = CNcbiOstrstreamToString(ostr);
159 NCBI_THROW(CException, eUnknown, errmsg);
160 }
161
162
163 //
164 // Brute-force reversal & complementation
165
166 template<typename T>
ReverseAndComplement(T v)167 T ReverseAndComplement(T v)
168 {
169 T rv (0);
170 for(size_t i (0), imax (4*sizeof(T)); i < imax; ++i, v >>= 2) {
171 rv = (rv << 2) | (v & 3);
172 }
173 rv = ~rv;
174
175 return rv;
176 }
177
178
179 //
180 // Table-based bit reversal & complementation
181
182 template<typename T>
183 class CReverseAndComplement {
184
185 public:
186
CReverseAndComplement(void)187 CReverseAndComplement(void) {
188
189 m_Table.resize(256);
190
191 for(Uint1 i(1); i < 0xFF; ++i) {
192 m_Table[i] = ReverseAndComplement(i);
193 }
194
195 m_Table[0] = 0xFF;
196 m_Table[0xFF] = 0;
197 }
198
operator ()(T v) const199 T operator() (T v) const {
200
201 T rv (0);
202 for(size_t i(0), imax(sizeof(T)); i < imax; ++i) {
203 rv <<= 8;
204 rv |= m_Table[v & 0xFF];
205 v >>= 8;
206 }
207 return rv;
208 }
209
210 private:
211
212 vector<Uint1> m_Table;
213 };
214
215
216 namespace {
217 CReverseAndComplement<Uint4> g_RC;
218 }
219
220
s_IsLowComplexity(Uint4 key)221 bool CElementaryMatching::s_IsLowComplexity(Uint4 key)
222 {
223 // evaluate the lower 28 bits (14 residues) only
224
225 const Uint4 kMaxTwoBaseContent (14);
226
227 vector<Uint4> counts(4, 0);
228
229 for(Uint4 k = 0; k < 14; ++k) {
230 ++counts[0x00000003 & key];
231 key >>= 2;
232 }
233
234 const Uint4 ag (counts[0] + counts[1]);
235 const Uint4 at (counts[0] + counts[2]);
236 const Uint4 ac (counts[0] + counts[3]);
237 const Uint4 gt (counts[1] + counts[2]);
238 const Uint4 gc (counts[1] + counts[3]);
239 const Uint4 tc (counts[2] + counts[3]);
240
241 return
242 ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||
243 ac >= kMaxTwoBaseContent || gt >= kMaxTwoBaseContent ||
244 gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;
245 }
246
247
GetLocalBaseName(const string & extended_name,const string & sfx)248 string GetLocalBaseName(const string& extended_name, const string & sfx)
249 {
250 string dir, base, ext;
251 CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
252 string rv (base);
253 if(!ext.empty()) {
254 rv += ext;
255 }
256 rv += "." + sfx;
257 return rv;
258 }
259
ReplaceExt(const string & extended_name,const string & new_ext)260 string ReplaceExt(const string& extended_name, const string & new_ext)
261 {
262 string dir, base, ext;
263 CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
264 string rv;
265 if(!dir.empty()) {
266 rv = dir; // + CDirEntry::GetPathSeparator();
267 }
268 if(!base.empty()) {
269 rv += base;
270 }
271 rv += new_ext;
272
273 return rv;
274 }
275
276
277 template <typename VectorT>
g_SaveToTemp(const VectorT & v,const string & path)278 string g_SaveToTemp(const VectorT& v, const string& path)
279 {
280 typedef typename VectorT::value_type TElem;
281
282 const string filename(CFile::GetTmpNameEx(path, "splqcomp_"));
283 const Uint8 len_bytes (v.size() * sizeof(TElem));
284
285 {
286 CNcbiOfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
287 tempcgrfile.write((const char* ) & v.front(), len_bytes);
288 tempcgrfile.close();
289 }
290
291 CheckWrittenFile(filename, len_bytes);
292
293 return filename;
294 }
295
296
297 template <typename VectorT>
g_RestoreFromTemp(const string & filename,VectorT * pvd)298 void g_RestoreFromTemp(const string& filename, VectorT* pvd)
299 {
300 typedef typename VectorT::value_type TElem;
301
302 VectorT& v = *pvd;
303
304 const Uint8 dim (CFile(filename).GetLength() / sizeof(TElem));
305
306 CNcbiIfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
307 tempcgrfile.read((char* ) & v.front(), dim * sizeof(TElem));
308
309 CFile(filename).Remove();
310 }
311
312
x_InitParticipationVector(bool strand)313 void CElementaryMatching::x_InitParticipationVector(bool strand)
314 {
315 // Skim over the query offset volumes and set the bitmask
316
317 m_Mers.Init(kNMersTotal, false);
318
319 CDir dir (m_FilePath);
320 const string sfx (string(strand?".p": ".m") + ".v*");
321 const string mask_ofs_q (m_lbn_q + sfx + kFileExt_Offsets);
322 CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
323 ITERATE(CDir::TEntries, ii, vols_ofs_q) {
324
325 const string filename ((*ii)->GetPath());
326 const Int8 vol_length (CFile(filename).GetLength());
327
328 CMemoryFile mf (filename);
329 for(const Uint8 * p8 (reinterpret_cast<const Uint8 *> (mf.Map())),
330 * p8e (p8 + vol_length / 8); p8 != p8e;
331 m_Mers.set_at(*p8++ & kUI8_LoWord));
332 mf.Unmap();
333 }
334
335 m_Mers.reset_at(0);
336 }
337
338
x_InitFilteringVector(const string & sdb,bool strand)339 void CElementaryMatching::x_InitFilteringVector(const string& sdb, bool strand)
340 {
341 // for plus strand create using the actual sequence data;
342 // use the plus strand table to init the minus strand table
343
344 if(strand) {
345
346 // create repeat filtering table (genome);
347 CRef<CSeqDB> subjdb (new CSeqDB(sdb, CSeqDB::eNucleotide));
348 if(subjdb->GetTotalLength() > numeric_limits<Uint4>::max())
349 {
350 CNcbiOstrstream ostr;
351 ostr << "Sequence volumes with total length exceeding "
352 << numeric_limits<Uint4>::max()
353 << " are not yet supported. Please split your FASTA file and re-run "
354 << " formatdb.";
355 const string err = CNcbiOstrstreamToString(ostr);
356 NCBI_THROW(CException, eUnknown, err);
357 }
358
359 Uint4 current_offset (0);
360
361 auto_ptr<vector<Uint4> > pNrCounts (new vector<Uint4> (kNrMersTotal, 0));
362 vector<Uint4> & NrCounts (*pNrCounts);
363
364 cerr << " Scanning " << subjdb->GetNumSeqs() << " genomic sequences ... ";
365 for(int oid (0); subjdb->CheckOrFindOID(oid); ++oid)
366 {
367 const char * pcb (0);
368 const Uint4 bases (subjdb->GetSequence(oid, &pcb));
369 const char * pcbe (pcb + bases / 4);
370 uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
371 npcb -= npcb % 8;
372 if(npcb < npcb0) npcb += 8;
373 const size_t gcbase (4*(npcb - npcb0));
374 const Uint8 * pui8 (reinterpret_cast<const Uint8*>(npcb));
375 const Uint8 * pui8_e (reinterpret_cast<const Uint8*>(pcbe));
376
377 for(size_t gccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {
378
379 Uint8 ui8 (*pui8);
380
381 // counting the hi 28 bits correspond to
382 // a subsequence at positions 1,2,5-16
383 #define QCOMP_COUNT_NrMERS {{ \
384 if(gccur + 16 >= current_offset + bases) { \
385 break; \
386 } \
387 const Uint4 mer4 (ui8 & kUI8_LoWord); \
388 ++NrCounts[mer4 >> 4]; \
389 gccur += 4; \
390 ui8 >>= 8; \
391 }}
392
393 QCOMP_COUNT_NrMERS;
394 QCOMP_COUNT_NrMERS;
395 QCOMP_COUNT_NrMERS;
396 QCOMP_COUNT_NrMERS;
397
398 if(pui8 + 1 < pui8_e) {
399
400 ui8 |= (*(pui8 + 1) << 32);
401
402 QCOMP_COUNT_NrMERS;
403 QCOMP_COUNT_NrMERS;
404 QCOMP_COUNT_NrMERS;
405 QCOMP_COUNT_NrMERS;
406 }
407
408 #undef QCOMP_COUNT_NMERS
409 }
410
411 subjdb->RetSequence(&pcb);
412
413 current_offset += bases;
414 } // OIDs
415
416 subjdb.Reset(0);
417
418 cerr << "Ok" << endl;
419 cerr << " Constructing FV ... ";
420
421 string filename_temp_01;
422 auto_ptr<vector<Uint4> > pNrCounts2 (new vector<Uint4>);
423 vector<Uint4> & NrCounts2 (* pNrCounts2);
424 try {
425 NrCounts2 = NrCounts;
426 }
427 catch(...) {
428 filename_temp_01 = g_SaveToTemp(NrCounts, m_FilePath);
429 pNrCounts2.reset(0);
430 }
431
432 // find the percentile
433 size_t total_mers (0);
434 ITERATE(vector<Uint4>, ii, NrCounts) {
435 if(*ii > 0) ++total_mers;
436 }
437 const size_t percentile ((size_t)(kNrMersTotal -
438 total_mers * (1 - kRepeatsPercentile)));
439 nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());
440 const Uint4 max_repeat_count (NrCounts[percentile]);
441
442 if(filename_temp_01.empty()) {
443 NrCounts = NrCounts2;
444 }
445 else {
446 g_RestoreFromTemp(filename_temp_01, &NrCounts);
447 }
448 pNrCounts2.reset(0);
449
450 m_Mers.Init(kNrMersTotal, false);
451 for(size_t i (0); i < kNrMersTotal; ++i) {
452 const bool v (NrCounts[i] <= max_repeat_count && !s_IsLowComplexity(i));
453 if(v) {
454 m_Mers.set_at(i);
455 }
456 }
457 pNrCounts.reset(0);
458
459 // serialize
460 const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
461 const Uint8 len_bytes (kNrMersTotal / 8);
462 {{
463 CMemoryFile mf (masked_filename, CMemoryFile::eMMP_Write,
464 CMemoryFile::eMMS_Shared, 0, kNrMersTotal / 8,
465 CMemoryFile::eCreate,
466 len_bytes);
467 Uint8* dest (reinterpret_cast<Uint8*>(mf.Map()));
468
469 const Uint8 * src (m_Mers.GetBuffer());
470 copy(src, src + kNrMersTotal / 64, dest);
471 }}
472
473 CheckWrittenFile(masked_filename, len_bytes);
474 }
475 else {
476
477 cerr << " Reading/transforming FV ... ";
478
479 // read the plus strand vector and trasnform
480 const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
481
482 CMemoryFile mf (masked_filename);
483 const Uint8 * p8 (reinterpret_cast<Uint8*>(mf.Map()));
484 CBoolVector nrmers_plus (kNrMersTotal, p8);
485 mf.Unmap();
486
487 m_Mers.Init(kNrMersTotal, false);
488
489 for(size_t i(0); i < kNrMersTotal; ++i) {
490 if(nrmers_plus.get_at(i)) {
491 // todo: optimize using tables
492 const size_t irc (g_RC(Uint4(i) << 4) & kUI4_Lo28);
493 m_Mers.set_at(irc);
494 }
495 }
496 }
497
498 cerr << "Ok" << endl;
499 }
500
501
x_CreateRemapData(const string & db,EIndexMode mode)502 void CElementaryMatching::x_CreateRemapData(const string& db, EIndexMode mode)
503 {
504 // create ID and coordinate remapping tables
505
506 CSeqDB blastdb (db, CSeqDB::eNucleotide);
507
508 TSeqInfos seq_infos;
509 const size_t num_seqs (blastdb.GetNumSeqs());
510 seq_infos.reserve(num_seqs);
511
512 Uint4 current_offset (0);
513 for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
514
515 const int seqlen (blastdb.GetSeqLength(oid));
516 if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
517 CNcbiOstrstream ostr;
518 ostr << "Cannot create remap data for:\t" <<
519 blastdb.GetSeqIDs(oid).front()->GetSeqIdString(true);
520 const string err = CNcbiOstrstreamToString(ostr);
521 NCBI_THROW(CException, eUnknown, err);
522 }
523
524 const Uint4 bases (seqlen);
525 seq_infos.push_back(SSeqInfo(current_offset, bases, oid));
526 current_offset += bases;
527 }
528
529 const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
530 kFileExt_Remap);
531
532 const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
533
534 CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
535 const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
536 ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
537 ofstr_remap.close();
538
539 CheckWrittenFile(full_remap_filename, len_bytes);
540
541 cerr << " Remap data created for " << db << "; max offset = "
542 << current_offset << endl;
543 }
544
x_CreateRemapData(ISequenceSource * m_qsrc,EIndexMode mode)545 void CElementaryMatching::x_CreateRemapData(ISequenceSource *m_qsrc, EIndexMode mode)
546 {
547 // create ID and coordinate remapping tables
548
549 TSeqInfos seq_infos;
550 const size_t num_seqs (m_qsrc->GetNumSeqs());
551 seq_infos.reserve(num_seqs);
552
553 Uint4 current_offset (0);
554 for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
555
556 const int seqlen (m_qsrc->GetSeqLength());
557 if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
558 CNcbiOstrstream ostr;
559 ostr << "Cannot create remap data for:\t" <<
560 m_qsrc->GetSeqID()->GetSeqIdString(true);
561 const string err = CNcbiOstrstreamToString(ostr);
562 NCBI_THROW(CException, eUnknown, err);
563 }
564
565 const Uint4 bases (seqlen);
566 seq_infos.push_back(SSeqInfo(current_offset, bases, m_qsrc->GetCurrentIndex()));
567 current_offset += bases;
568 }
569
570 const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
571 kFileExt_Remap);
572
573 const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
574
575 CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
576 const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
577 ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
578 ofstr_remap.close();
579
580 CheckWrittenFile(full_remap_filename, len_bytes);
581
582 cerr << " Remap data created for sequences; max offset = "
583 << current_offset << endl;
584 }
585
586
x_CreateIndex(const string & db,EIndexMode mode,bool strand)587 void CElementaryMatching::x_CreateIndex(const string& db, EIndexMode mode, bool strand)
588 {
589 // sort all adjacent genomic N-mers and their positions
590 // except for those marked in the Nr-mer bit vector
591
592 cerr << " Scanning " << db << " for N-mers and their positions." << endl;
593
594 if(m_Mers.get_at(0)) {
595 NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
596 }
597
598 const size_t mcidx_max (m_MaxVolSize / 8);
599 vector<Uint8> MersAndCoords (mcidx_max, 0);
600 size_t mcidx (0);
601 size_t current_offset (0);
602
603 CRef<CSeqDB> blastdb (new CSeqDB(db, CSeqDB::eNucleotide));
604
605 const Uint8 blastdb_total_length (blastdb->GetTotalLength());
606 if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
607 || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
608 {
609 CNcbiOstrstream ostr;
610 ostr << "Sequence volumes with total length exceeding "
611 << numeric_limits<Uint4>::max()
612 << " are not yet supported. Please split your FASTA file and re-run "
613 << " formatdb.";
614 const string err = CNcbiOstrstreamToString(ostr);
615 NCBI_THROW(CException, eUnknown, err);
616 }
617
618 size_t volume(0);
619 for(int oid (0); blastdb->CheckOrFindOID(oid); ++oid) {
620
621 const char * pcb (0);
622 const Uint4 bases (blastdb->GetSequence(oid, &pcb));
623 const char * pce (pcb + bases/4);
624 uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
625 npcb -= npcb % 8;
626 if(npcb < npcb0) npcb += 8;
627 const Uint4 gcbase (4*(npcb - npcb0));
628 const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
629 const Uint8* pui8 (pui8_start);
630
631 // It helps not to break volumes in the middle of a sequence.
632 // We explicitly check here if the volume is close to its limit.
633
634 const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
635 bases);
636
637 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
638 MersAndCoords.resize(mcidx);
639 x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
640 MersAndCoords.assign(mcidx_max, 0);
641 mcidx = 0;
642 }
643
644 if(mode == eIM_Genomic) {
645
646 // index every other position
647 for(size_t gccur (current_offset + gcbase);
648 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
649 ++pui8)
650 {
651 Uint8 w8 (*pui8);
652
653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
654 size_t gccur2 (gccur + 2); \
655 const Uint8 ui8_2930 (w8 >> 60); \
656 Uint8 ui8_ls4 (w8 << 4); \
657 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
658 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
659 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
660
661 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
662
663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
664 { \
665 if(gccur + 16 >= current_offset + bases) { \
666 break; \
667 } \
668 const Uint8 mer (w8 & kUI8_LoWord); \
669 if(strand) { \
670 if(m_Mers.get_at(mer)) { \
671 MersAndCoords[mcidx++] = (mer << 32) | gccur; \
672 } \
673 } \
674 else { \
675 const Uint4 rc (g_RC(Uint4(mer))); \
676 if(m_Mers.get_at(rc)) { \
677 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
678 ((current_offset + bases - gccur - 16) \
679 + current_offset); \
680 } \
681 } \
682 gccur += 4; \
683 w8 >>= 8; \
684 }
685
686 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
687 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
688 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
689 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
690
691 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
692 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
693 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
694 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
695
696 if(gccur + 32 >= current_offset + bases) {
697 break;
698 }
699 else {
700
701 w8 |= ((*(pui8 + 1)) << 32);
702
703 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
704
705 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
706 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
707 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
708 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
709
710 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
711 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
712 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
713 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
714 }
715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
716 #undef QCOMP_CREATE_GENOMIC_IDX
717 }
718 }
719 else { // cDNA mode
720
721 if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
722
723 // head
724 Uint8 fivebytes (0);
725 for(const char * p (pcb),
726 *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
727 p < pche; ++p)
728 {
729 const Uint8 c8 (*p & kUI8_LoByte);
730 const Uint8 ui8curbyte (c8);
731 if(pcb + 5 > p) {
732 fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
733 }
734 else {
735
736 for(Uint4 k(0); k < 4; ++k) {
737
738 const Uint4 mer (fivebytes & kUI8_LoWord);
739 if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
740 const Uint8 ui8 (mer);
741 const Uint4 coord (current_offset +
742 4*(p - pcb - 5) + k);
743 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
744 }
745
746 fivebytes &= kUI8_LoFive;
747 fivebytes <<= 2;
748 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
749 fivebytes &= ~kUI8_SeqDb;
750 fivebytes |= ui8m;
751 }
752 fivebytes |= ui8curbyte << 32;
753 }
754 }
755
756 // body
757 Uint8 ui8 (0);
758 Uint4 gccur (current_offset + gcbase);
759 for(; gccur + 32 < current_offset + bases; ++pui8)
760 {
761 ui8 = *pui8;
762
763 for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
764
765 const Uint8 loword = ui8 & kUI8_LoWord;
766 if(m_Mers.get_at(strand? (loword >> 4):
767 (loword & kUI4_Lo28)))
768 {
769 MersAndCoords[mcidx++] = (loword << 32) | gccur;
770 }
771
772 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
773 ui8 <<= 2;
774 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
775 ui8 &= ~kUI8_SeqDb;
776 ui8 |= (ui8m | ui8hi2);
777 }
778
779 if(gccur + 32 < current_offset + bases) {
780
781 // pre-read next 16 residues
782 const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
783 const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
784 Uint4 ui4 (*pui4);
785 Uint8 ui8_4 (ui4);
786 ui8 |= (ui8_4 << 32);
787
788 for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
789
790 const Uint8 loword = ui8 & kUI8_LoWord;
791 if(m_Mers.get_at(strand? (loword >> 4):
792 (loword & kUI4_Lo28)))
793 {
794 MersAndCoords[mcidx++] = (loword << 32) | gccur;
795 }
796
797 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
798 ui8 <<= 2;
799 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
800 ui8 &= ~kUI8_SeqDb;
801 ui8 |= (ui8m | ui8hi2);
802 }
803 }
804 }
805
806 // tail
807 if(gccur + 16 <= current_offset + bases) {
808
809 fivebytes = (gccur == current_offset + gcbase)? fivebytes:
810 (ui8 & kUI8_LoWord);
811 const char * p (reinterpret_cast<const char*>(pui8_start) + 4
812 + (gccur - current_offset - gcbase) / 4);
813 size_t k (0);
814 do {
815 const Uint8 loword = fivebytes & kUI8_LoWord;
816
817 if(m_Mers.get_at(strand? (loword >> 4):
818 (loword & kUI4_Lo28)))
819 {
820 MersAndCoords[mcidx++] = (loword << 32) | gccur;
821 }
822
823 if(k % 4 == 0) {
824 if(p < pce) {
825 const Uint8 ui8 (*p++ & kUI8_LoByte);
826 fivebytes |= (ui8 << 32);
827 }
828 else {
829 break;
830 }
831 }
832
833 fivebytes &= kUI8_LoFive;
834 fivebytes <<= 2;
835 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
836 fivebytes &= ~kUI8_SeqDb;
837 fivebytes |= ui8m;
838
839 ++k;
840 ++gccur;
841 } while (true);
842 }
843
844 } // min cDNA length check
845
846 } // genomic or cDNA
847
848 blastdb->RetSequence(&pcb);
849
850 current_offset += bases;
851
852 if(mcidx >= mcidx_max) {
853 CNcbiOstrstream ostr;
854 ostr << "Selected max volume size is too small: "
855 << "it must be large enough to fit the index for the "
856 << "longest input sequence.";
857 const string err = CNcbiOstrstreamToString(ostr);
858 NCBI_THROW(CException, eUnknown, err);
859 }
860 } // seqdb iteration
861
862 blastdb.Reset(0);
863
864 if(mcidx > 0) {
865 MersAndCoords.resize(mcidx);
866 x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
867 mcidx = 0;
868 }
869
870 m_Mers.Clear();
871 }
872
x_CreateIndex(ISequenceSource * m_qsrc,EIndexMode mode,bool strand)873 void CElementaryMatching::x_CreateIndex(ISequenceSource *m_qsrc, EIndexMode mode, bool strand)
874 {
875 // sort all adjacent genomic N-mers and their positions
876 // except for those marked in the Nr-mer bit vector
877
878 cerr << " Scanning sequences for N-mers and their positions." << endl;
879
880 if(m_Mers.get_at(0)) {
881 NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
882 }
883
884 const size_t mcidx_max (m_MaxVolSize / 8);
885 vector<Uint8> MersAndCoords (mcidx_max, 0);
886 size_t mcidx (0);
887 size_t current_offset (0);
888
889
890 const Uint8 blastdb_total_length (m_qsrc->GetTotalLength());
891 if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
892 || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
893 {
894 CNcbiOstrstream ostr;
895 ostr << "Sequence volumes with total length exceeding "
896 << numeric_limits<Uint4>::max()
897 << " are not yet supported. Please split your FASTA file and re-run "
898 << " formatdb.";
899 const string err = CNcbiOstrstreamToString(ostr);
900 NCBI_THROW(CException, eUnknown, err);
901 }
902
903 size_t volume(0);
904 for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
905
906 const char * pcb (0);
907 const Uint4 bases (m_qsrc->GetSeq(&pcb));
908 const char * pce (pcb + bases/4);
909 uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
910 npcb -= npcb % 8;
911 if(npcb < npcb0) npcb += 8;
912 const Uint4 gcbase (4*(npcb - npcb0));
913 const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
914 const Uint8* pui8 (pui8_start);
915
916 // It helps not to break volumes in the middle of a sequence.
917 // We explicitly check here if the volume is close to its limit.
918
919 const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
920 bases);
921
922 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
923 MersAndCoords.resize(mcidx);
924 x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
925 MersAndCoords.assign(mcidx_max, 0);
926 mcidx = 0;
927 }
928
929 if(mode == eIM_Genomic) {
930
931 // index every other position
932 for(size_t gccur (current_offset + gcbase);
933 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
934 ++pui8)
935 {
936 Uint8 w8 (*pui8);
937
938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
939 size_t gccur2 (gccur + 2); \
940 const Uint8 ui8_2930 (w8 >> 60); \
941 Uint8 ui8_ls4 (w8 << 4); \
942 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
943 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
944 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
945
946 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
947
948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
949 { \
950 if(gccur + 16 >= current_offset + bases) { \
951 break; \
952 } \
953 const Uint8 mer (w8 & kUI8_LoWord); \
954 if(strand) { \
955 if(m_Mers.get_at(mer)) { \
956 MersAndCoords[mcidx++] = (mer << 32) | gccur; \
957 } \
958 } \
959 else { \
960 const Uint4 rc (g_RC(Uint4(mer))); \
961 if(m_Mers.get_at(rc)) { \
962 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
963 ((current_offset + bases - gccur - 16) \
964 + current_offset); \
965 } \
966 } \
967 gccur += 4; \
968 w8 >>= 8; \
969 }
970
971 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
972 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
973 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
974 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
975
976 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
977 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
978 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
979 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
980
981 if(gccur + 32 >= current_offset + bases) {
982 break;
983 }
984 else {
985
986 w8 |= ((*(pui8 + 1)) << 32);
987
988 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
989
990 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
991 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
992 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
993 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
994
995 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
996 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
997 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
998 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
999 }
1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
1001 #undef QCOMP_CREATE_GENOMIC_IDX
1002 }
1003 }
1004 else { // cDNA mode
1005
1006 if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
1007
1008 // head
1009 Uint8 fivebytes (0);
1010 for(const char * p (pcb),
1011 *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
1012 p < pche; ++p)
1013 {
1014 const Uint8 c8 (*p & kUI8_LoByte);
1015 const Uint8 ui8curbyte (c8);
1016 if(pcb + 5 > p) {
1017 fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
1018 }
1019 else {
1020
1021 for(Uint4 k(0); k < 4; ++k) {
1022
1023 const Uint4 mer (fivebytes & kUI8_LoWord);
1024 if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
1025 const Uint8 ui8 (mer);
1026 const Uint4 coord (current_offset +
1027 4*(p - pcb - 5) + k);
1028 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
1029 }
1030
1031 fivebytes &= kUI8_LoFive;
1032 fivebytes <<= 2;
1033 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1034 fivebytes &= ~kUI8_SeqDb;
1035 fivebytes |= ui8m;
1036 }
1037 fivebytes |= ui8curbyte << 32;
1038 }
1039 }
1040
1041 // body
1042 Uint8 ui8 (0);
1043 Uint4 gccur (current_offset + gcbase);
1044 for(; gccur + 32 < current_offset + bases; ++pui8)
1045 {
1046 ui8 = *pui8;
1047
1048 for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1049
1050 const Uint8 loword = ui8 & kUI8_LoWord;
1051 if(m_Mers.get_at(strand? (loword >> 4):
1052 (loword & kUI4_Lo28)))
1053 {
1054 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1055 }
1056
1057 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1058 ui8 <<= 2;
1059 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1060 ui8 &= ~kUI8_SeqDb;
1061 ui8 |= (ui8m | ui8hi2);
1062 }
1063
1064 if(gccur + 32 < current_offset + bases) {
1065
1066 // pre-read next 16 residues
1067 const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
1068 const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
1069 Uint4 ui4 (*pui4);
1070 Uint8 ui8_4 (ui4);
1071 ui8 |= (ui8_4 << 32);
1072
1073 for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1074
1075 const Uint8 loword = ui8 & kUI8_LoWord;
1076 if(m_Mers.get_at(strand? (loword >> 4):
1077 (loword & kUI4_Lo28)))
1078 {
1079 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1080 }
1081
1082 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1083 ui8 <<= 2;
1084 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1085 ui8 &= ~kUI8_SeqDb;
1086 ui8 |= (ui8m | ui8hi2);
1087 }
1088 }
1089 }
1090
1091 // tail
1092 if(gccur + 16 <= current_offset + bases) {
1093
1094 fivebytes = (gccur == current_offset + gcbase)? fivebytes:
1095 (ui8 & kUI8_LoWord);
1096 const char * p (reinterpret_cast<const char*>(pui8_start) + 4
1097 + (gccur - current_offset - gcbase) / 4);
1098 size_t k (0);
1099 do {
1100 const Uint8 loword = fivebytes & kUI8_LoWord;
1101
1102 if(m_Mers.get_at(strand? (loword >> 4):
1103 (loword & kUI4_Lo28)))
1104 {
1105 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1106 }
1107
1108 if(k % 4 == 0) {
1109 if(p < pce) {
1110 const Uint8 ui8 (*p++ & kUI8_LoByte);
1111 fivebytes |= (ui8 << 32);
1112 }
1113 else {
1114 break;
1115 }
1116 }
1117
1118 fivebytes &= kUI8_LoFive;
1119 fivebytes <<= 2;
1120 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1121 fivebytes &= ~kUI8_SeqDb;
1122 fivebytes |= ui8m;
1123
1124 ++k;
1125 ++gccur;
1126 } while (true);
1127 }
1128
1129 } // min cDNA length check
1130
1131 } // genomic or cDNA
1132
1133 m_qsrc->RetSequence(&pcb);
1134
1135 current_offset += bases;
1136
1137 if(mcidx >= mcidx_max) {
1138 CNcbiOstrstream ostr;
1139 ostr << "Selected max volume size is too small: "
1140 << "it must be large enough to fit the index for the "
1141 << "longest input sequence.";
1142 const string err = CNcbiOstrstreamToString(ostr);
1143 NCBI_THROW(CException, eUnknown, err);
1144 }
1145 } // seqdb iteration
1146
1147 if(mcidx > 0) {
1148 MersAndCoords.resize(mcidx);
1149 x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
1150 mcidx = 0;
1151 }
1152
1153 m_Mers.Clear();
1154 }
1155
1156
x_WriteIndexFile(size_t volume,EIndexMode mode,bool strand,vector<Uint8> & MersAndCoords)1157 size_t CElementaryMatching::x_WriteIndexFile(
1158 size_t volume,
1159 EIndexMode mode,
1160 bool strand,
1161 vector<Uint8>& MersAndCoords)
1162 {
1163 const size_t t0 (time(0));
1164
1165 string basename;
1166 if(mode == eIM_Genomic) {
1167 basename = m_lbn_s;
1168 }
1169 else {
1170 basename = m_lbn_q;
1171 }
1172 basename += string(strand? ".p": ".m") + ".v" + NStr::NumericToString(volume);
1173 const string filename_offs ( m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Offsets );
1174
1175 CNcbiOfstream ofstr_offs (filename_offs.c_str(), IOS_BASE::binary);
1176 Uint8 bytes_offs (0);
1177
1178 const string filename_positions (m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Positions);
1179 CNcbiOfstream ofstr_positions (filename_positions.c_str(), IOS_BASE::binary);
1180 Uint8 bytes_positions (0);
1181
1182 cerr << " Generating index volume: " << basename << " ... ";
1183
1184 Uint4 curmer (numeric_limits<Uint4>::max());
1185 Uint4 curofs (0);
1186
1187 sort(MersAndCoords.begin(), MersAndCoords.end());
1188
1189 ITERATE(vector<Uint8>, ii, MersAndCoords) {
1190
1191 const Uint4 mer ((*ii & kUI8_HiWord) >> 32);
1192 if(mer != curmer) {
1193
1194 ofstr_offs.write((const char*) &mer, sizeof mer);
1195 ofstr_offs.write((const char*) &curofs, sizeof curofs);
1196 curmer = mer;
1197 bytes_offs += sizeof(mer) + sizeof(curofs);
1198 }
1199 const Uint4 pos (*ii & kUI8_LoWord);
1200
1201 ofstr_positions.write((const char*) &pos, sizeof pos);
1202 bytes_positions += sizeof(pos);
1203 ++curofs;
1204 }
1205
1206 // termination zero mer
1207 const Uint4 mer (0);
1208 ofstr_offs.write((const char*)&mer, sizeof mer);
1209 ofstr_offs.write((const char*)&curofs, sizeof curofs);
1210 bytes_offs += sizeof(mer) + sizeof(curofs);
1211
1212 ofstr_offs.close();
1213 ofstr_positions.close();
1214
1215 CheckWrittenFile(filename_offs, bytes_offs);
1216 CheckWrittenFile(filename_positions, bytes_positions);
1217
1218 cerr << "Ok" << endl;
1219
1220 return time(0) - t0;
1221 }
1222
1223 #define CHECK_MEMMAP(mm,ofs,len) \
1224 {{ \
1225 const size_t ofs1 (mm.GetOffset()); \
1226 if(ofs1 != ofs) { \
1227 cerr << "Real offset " << ofs1 << " different from " << ofs << endl; \
1228 } \
1229 const size_t len1 (mm.GetSize()); \
1230 if(len1 != len) { \
1231 cerr << "Real length " << len1 << " different from " << len << endl; \
1232 } \
1233 }}
1234
1235
x_Search(bool strand)1236 void CElementaryMatching::x_Search(bool strand)
1237 {
1238 m_Mers.Clear();
1239
1240 cerr << " Matching (strand = " << (strand? "plus": "minus") << ") ... ";
1241 m_CurGenomicStrand = strand;
1242
1243 CDir dir (m_FilePath);
1244
1245 const string sfx (string(strand?".p": ".m") + ".v*");
1246
1247 const string mask_ofs_q (m_lbn_q + sfx + kFileExt_Offsets);
1248 const string mask_ofs_s (m_lbn_s + sfx + kFileExt_Offsets);
1249 CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
1250 CDir::TEntries vols_ofs_s (dir.GetEntries(mask_ofs_s));
1251
1252 Uint8 elem_hits_total (0);
1253
1254 ITERATE(CDir::TEntries, ii_s, vols_ofs_s) {
1255
1256 const string filename_ofs_s ((*ii_s)->GetPath());
1257 const string filename_pos_s (ReplaceExt(filename_ofs_s, kFileExt_Positions));
1258 const CFile file_subj (filename_ofs_s);
1259 const size_t dim_ofs_s (file_subj.GetLength() / 8);
1260
1261 ITERATE(CDir::TEntries, ii_q, vols_ofs_q) {
1262
1263 const string filename_ofs_q ((*ii_q)->GetPath());
1264 const string filename_pos_q (ReplaceExt(filename_ofs_q,
1265 kFileExt_Positions));
1266
1267 Uint8 hit_index_dim (0), elem_hits_this_pair (0);
1268 string filename_hit_index;
1269
1270 {{
1271
1272 // map offset files
1273 CMemoryFile mf_ofs_s ((*ii_s)->GetPath());
1274 const Uint8 * ofs_s (reinterpret_cast<const Uint8*>
1275 (mf_ofs_s.Map()));
1276
1277 const CFile file_query ((*ii_q)->GetPath());
1278 const size_t dim_ofs_q (file_query.GetLength() / 8);
1279
1280 CMemoryFile mf_ofs_q ((*ii_q)->GetPath());
1281 const Uint8 * ofs_q (reinterpret_cast<const Uint8*>
1282 (mf_ofs_q.Map()));
1283
1284 // build hit index
1285 THitIndex hit_index;
1286 hit_index.reserve(min(dim_ofs_q, dim_ofs_s));
1287
1288 while((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {
1289
1290 while( (*ofs_q & kUI8_LoWord)
1291 && ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))
1292 {
1293 ++ofs_q;
1294 }
1295 if((*ofs_q & kUI8_LoWord) == 0) break;
1296
1297 while( (*ofs_s & kUI8_LoWord)
1298 && ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))
1299 {
1300 ++ofs_s;
1301 }
1302 if((*ofs_s & kUI8_LoWord) == 0) break;
1303
1304 if((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {
1305
1306 hit_index.push_back(SHitIndexEntry());
1307 SHitIndexEntry& elem (hit_index.back());
1308
1309 elem.m_QueryOfs = (*ofs_q) >> 32;
1310 size_t count ((*(ofs_q + 1) >> 32) - elem.m_QueryOfs);
1311 elem.m_QueryCount = (count > 0xFFFF)? 0xFFFF: count;
1312
1313 elem.m_SubjOfs = (*ofs_s) >> 32;
1314 count = ((*(ofs_s + 1) >> 32) - elem.m_SubjOfs);
1315 elem.m_SubjCount = (count > 0xFFFF)? 0xFFFF: count;
1316
1317 ++ofs_s;
1318 ++ofs_q;
1319
1320 elem_hits_this_pair += elem.m_QueryCount * elem.m_SubjCount;
1321 }
1322 }
1323
1324 // unload offset files; save hit index
1325 filename_hit_index = g_SaveToTemp(hit_index, m_FilePath);
1326 hit_index_dim = (hit_index.size());
1327 elem_hits_total += elem_hits_this_pair;
1328 }}
1329
1330 // load position files
1331
1332 const CFile file_pos_s (filename_pos_s);
1333 const size_t dim_pos_s (file_pos_s.GetLength() / 4);
1334
1335 CMemoryFile mf_pos_s (filename_pos_s);
1336 size_t map_offset_s (0);
1337 size_t map_length_s (min(kMapGran, 4*dim_pos_s - map_offset_s));
1338 const Uint4 * pos_s (reinterpret_cast<const Uint4*>(
1339 mf_pos_s.Map(map_offset_s, map_length_s)));
1340
1341 const CFile file_pos_q (filename_pos_q);
1342 const size_t dim_pos_q (file_pos_q.GetLength() / 4);
1343
1344 CMemoryFile mf_pos_q (filename_pos_q);
1345 size_t map_offset_q (0);
1346 size_t map_length_q (min(kMapGran, 4*dim_pos_q - map_offset_q));
1347 const Uint4 * pos_q (reinterpret_cast<const Uint4*>(
1348 mf_pos_q.Map(map_offset_q, map_length_q)));
1349
1350 // scan hit index file and build hits
1351 vector<Uint8> hits (elem_hits_this_pair);
1352 size_t hidx (0);
1353
1354 CNcbiIfstream ifstr (filename_hit_index.c_str(), IOS_BASE::binary);
1355 for(size_t cnt(0); cnt < hit_index_dim; ++cnt) {
1356
1357 SHitIndexEntry hie;
1358 ifstr.read((char*) &hie, sizeof(hie));
1359
1360 const size_t idx_s_max (hie.m_SubjOfs + hie.m_SubjCount);
1361 const size_t idx_q_max (hie.m_QueryOfs + hie.m_QueryCount);
1362 if(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {
1363 NCBI_THROW(CException, eUnknown, "Coordinate out of scope");
1364 }
1365
1366 if(4*idx_s_max >= map_offset_s + map_length_s) {
1367
1368 map_offset_s = 4 * hie.m_SubjOfs;
1369 map_length_s = min(kMapGran, 4*dim_pos_s - map_offset_s);
1370 mf_pos_s.Unmap();
1371 pos_s = (reinterpret_cast<const Uint4*>
1372 (mf_pos_s.Map(map_offset_s, map_length_s)));
1373 }
1374
1375 if(4*idx_q_max >= map_offset_q + map_length_q) {
1376
1377 map_offset_q = 4 * hie.m_QueryOfs;
1378 map_length_q = min(kMapGran, 4*dim_pos_q - map_offset_q);
1379 mf_pos_q.Unmap();
1380 pos_q = (reinterpret_cast<const Uint4*>
1381 (mf_pos_q.Map(map_offset_q, map_length_q)));
1382 }
1383
1384 for(size_t idx_s (hie.m_SubjOfs); idx_s < idx_s_max; ++idx_s) {
1385
1386 Uint8 hiword = pos_s[idx_s - map_offset_s/4];
1387 hiword <<= 32;
1388 for(size_t idx_q (hie.m_QueryOfs); idx_q < idx_q_max; ++idx_q) {
1389
1390 const Uint8 loword = pos_q[idx_q - map_offset_q/4];
1391 hits[hidx++] = (hiword | loword);
1392 }
1393 }
1394 }
1395
1396 if(hidx != elem_hits_this_pair) {
1397 CNcbiOstrstream ostr;
1398 ostr << "The number of hits found (" << hidx
1399 << ") does not match the expected " << elem_hits_this_pair;
1400 const string str = CNcbiOstrstreamToString(ostr);
1401 NCBI_THROW(CException, eUnknown, str);
1402 }
1403
1404 // remove the hit index file
1405 CFile(filename_hit_index).Remove();
1406
1407 // detect compartments
1408 x_CompartVolume (&hits);
1409
1410 } // query vols
1411
1412 } // subj vols
1413
1414 cerr << "Ok" << endl;
1415 }
1416
1417 #undef CHECK_MEMMAP
1418
1419
PLoWord(const Uint8 & lhs,const Uint8 & rhs)1420 bool PLoWord(const Uint8& lhs, const Uint8& rhs)
1421 {
1422 return (lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);
1423 }
1424
1425
PDiag(const Uint8 & lhs,const Uint8 & rhs)1426 bool PDiag(const Uint8& lhs, const Uint8& rhs)
1427 {
1428 const double lhs_q (double(lhs & kUI8_LoWord) / 2);
1429 const double lhs_s (double(lhs >> 32) / 2);
1430 const double rhs_q (double(rhs & kUI8_LoWord) / 2);
1431 const double rhs_s (double(rhs >> 32) / 2);
1432
1433 const double lhs_q1 (lhs_q + lhs_s);
1434 const double lhs_s1 (lhs_s - lhs_q);
1435 const double rhs_q1 (rhs_q + rhs_s);
1436 const double rhs_s1 (rhs_s - rhs_q);
1437
1438 if(lhs_s1 == rhs_s1) {
1439 return lhs_q1 < rhs_q1;
1440 }
1441 else {
1442 return lhs_s1 < rhs_s1;
1443 }
1444 }
1445
1446
x_CleanVolumes(const string & lbn,const TStrings & vol_extensions)1447 void CElementaryMatching::x_CleanVolumes(const string& lbn,
1448 const TStrings& vol_extensions)
1449 {
1450
1451 // make sure there are no offset or position files left before
1452 // we generate the new ones
1453
1454 CDir dir (m_FilePath);
1455
1456 CFileDeleteList fdl;
1457 ITERATE(TStrings, ii, vol_extensions) {
1458
1459 const string mask (lbn + "*" + *ii);
1460 CDir::TEntries dir_entries (dir.GetEntries(mask));
1461 ITERATE(CDir::TEntries, jj, dir_entries) {
1462 fdl.Add((*jj)->GetPath());
1463 }
1464 }
1465 }
1466
1467
x_CompartVolume(vector<Uint8> * phits)1468 void CElementaryMatching::x_CompartVolume(vector<Uint8>* phits)
1469 {
1470 if(phits->size() == 0) {
1471 return;
1472 }
1473
1474 // sort by the global genomic corrdinate
1475 sort(phits->begin(), phits->end());
1476
1477 TSeqInfos::const_iterator ii_genomic_b (m_SeqInfos_Genomic.begin());
1478 TSeqInfos::const_iterator ii_genomic_e (m_SeqInfos_Genomic.end());
1479 TSeqInfos::const_iterator ii_genomic (ii_genomic_b);
1480 TSeqInfos::const_iterator ii_cdna_b (m_SeqInfos_cdna.begin());
1481 TSeqInfos::const_iterator ii_cdna_e (m_SeqInfos_cdna.end());
1482
1483 CSeqDB seqdb_genomic (m_sdb, CSeqDB::eNucleotide);
1484
1485 m_CurSeq_cDNA = m_CurSeq_Genomic = 0;
1486
1487 size_t idx_compacted (0);
1488 for(size_t idx (0), idx_hi (phits->size()); idx < idx_hi; ) {
1489
1490 Uint4 gc_s (((*phits)[idx]) >> 32);
1491
1492 // find the relevant genomic record
1493 ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e, SSeqInfo(gc_s, 0, 0));
1494 if(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {
1495 --ii_genomic;
1496 }
1497
1498 if(gc_s < ii_genomic->m_Start ||
1499 ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)
1500 {
1501 CNcbiOstrstream ostr;
1502 ostr << "Global genomic coordinate "
1503 << gc_s << " out of range: ["
1504 << ii_genomic->m_Start << ", "
1505 << (ii_genomic->m_Start + ii_genomic->m_Length) << "), "
1506 << (m_GenomicSeqIds[ii_genomic->m_Oid]->GetSeqIdString(true));
1507 const string str = CNcbiOstrstreamToString(ostr);
1508 NCBI_THROW(CException, eUnknown, str);
1509 }
1510
1511 const Uint4 gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);
1512
1513 // preload genomic sequence
1514 seqdb_genomic.GetSequence(ii_genomic->m_Oid, &m_CurSeq_Genomic);
1515
1516 // find all hits that belong to the current genomic record
1517 size_t idx0 (idx);
1518 while(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {
1519 ++idx;
1520 }
1521
1522 // sort by the global cDNA coordinate
1523 sort(phits->begin() + idx0, phits->begin() + idx, PLoWord);
1524
1525 // find the relevant cDNA record
1526 TSeqInfos::const_iterator ii_cdna (ii_cdna_b);
1527 Uint4 gc_q_min (ii_cdna->m_Start);
1528 Uint4 gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);
1529 size_t jdx (idx0), jdx0 (jdx);
1530 for(; jdx < idx; ++jdx) {
1531
1532 Uint4 gc_q (((*phits)[jdx]) & kUI8_LoWord);
1533
1534 if(gc_q < gc_q_min || gc_q >= gc_q_max) {
1535
1536 const THit::TCoord min_matches1 = THit::TCoord(
1537 round(m_MinCompartmentIdty * ii_cdna->m_Length));
1538
1539 const THit::TCoord min_matches2 = THit::TCoord(
1540 round(m_MinSingletonIdty * ii_cdna->m_Length));
1541
1542 const THit::TCoord min_matches (min(min_matches1,min_matches2));
1543
1544 if(kN * (jdx - jdx0) >= min_matches / 2) {
1545
1546 // preload genomic sequence
1547 m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1548 x_CompartPair(phits, ii_cdna, ii_genomic,
1549 jdx0, jdx, &idx_compacted);
1550 m_qsrc->RetSequence(&m_CurSeq_cDNA);
1551 m_CurSeq_cDNA = 0;
1552 }
1553
1554 ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e, SSeqInfo(gc_q, 0, 0));
1555
1556 if(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {
1557 --ii_cdna;
1558 }
1559
1560 if(gc_q < ii_cdna->m_Start ||
1561 ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)
1562 {
1563 CNcbiOstrstream ostr;
1564 ostr << "Global cDNA coordinate "
1565 << gc_q << " out of range: ["
1566 << ii_cdna->m_Start << ", "
1567 << (ii_cdna->m_Start + ii_cdna->m_Length) << "), "
1568 << (m_cDNASeqIds[ii_cdna->m_Oid]->GetSeqIdString(true));
1569 const string str = CNcbiOstrstreamToString(ostr);
1570 NCBI_THROW(CException, eUnknown, str);
1571 }
1572
1573 gc_q_min = ii_cdna->m_Start;
1574 gc_q_max = ii_cdna->m_Start + ii_cdna->m_Length;
1575 jdx0 = jdx;
1576 }
1577 }
1578
1579 const THit::TCoord min_matches1 = THit::TCoord(
1580 round(m_MinCompartmentIdty * ii_cdna->m_Length));
1581
1582 const THit::TCoord min_matches2 = THit::TCoord(
1583 round(m_MinSingletonIdty * ii_cdna->m_Length));
1584
1585 const THit::TCoord min_matches (min(min_matches1,min_matches2));
1586
1587 if(kN * (jdx - jdx0) >= min_matches / 2) {
1588
1589 m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1590
1591 x_CompartPair(phits, ii_cdna, ii_genomic,
1592 jdx0, jdx, &idx_compacted);
1593 m_qsrc->RetSequence(&m_CurSeq_cDNA);
1594 m_CurSeq_cDNA = 0;
1595 }
1596
1597 seqdb_genomic.RetSequence(&m_CurSeq_Genomic);
1598 m_CurSeq_Genomic = 0;
1599 }
1600 }
1601
1602
x_IsMatch(Uint4 q,Uint4 s) const1603 bool CElementaryMatching::x_IsMatch(Uint4 q, Uint4 s) const
1604 {
1605 const Uint1 cq (((m_CurSeq_cDNA[q / 4]) >> ((3 - (q % 4))*2)) & 0x03);
1606 const Uint1 cs (((m_CurSeq_Genomic[s / 4]) >> ((3 - (s % 4))*2)) & 0x03);
1607 const bool rv (m_CurGenomicStrand? (cq == cs): ((cq^cs) == 3));
1608 return rv;
1609 }
1610
1611
x_ExtendHit(const Int8 & left_limit0,const Int8 & right_limit0,THitRef hitref)1612 Int8 CElementaryMatching::x_ExtendHit(const Int8 & left_limit0,
1613 const Int8 & right_limit0,
1614 THitRef hitref)
1615 {
1616 const int Wm (1), Wms (-2);
1617 int score (int(hitref->GetLength()) * Wm
1618 + int(hitref->GetMismatches()) * (Wms - Wm));
1619 int score_max (score);
1620
1621 const int overrun (6); // enables connecting hits over a mismatch
1622 const Int8 left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);
1623 const Int8 right_limit (right_limit0 >= kDiagMax - overrun?
1624 kDiagMax: right_limit0 + overrun);
1625
1626 // extend left
1627 Int8 q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());
1628 size_t mm (0), mm0 (0);
1629 bool no_overrun_yet (true);
1630 for(Int8 q (q0 - 1), s (s0 - 1);
1631 (q + s > left_limit && score + m_XDropOff >= score_max
1632 && q >= m_ii_cdna->m_Start && s >= m_ii_genomic->m_Start);
1633 --q, --s)
1634 {
1635
1636 if(q + s == left_limit0) {
1637 no_overrun_yet = false;
1638 mm0 += mm;
1639 }
1640
1641 Uint4 qq (q - m_ii_cdna->m_Start);
1642 Uint4 ss (s - m_ii_genomic->m_Start);
1643 if(m_CurGenomicStrand == false) {
1644 ss = m_ii_genomic->m_Length - ss - 1;
1645 }
1646
1647 if(x_IsMatch(qq, ss)) {
1648
1649 score += Wm;
1650 if(score > score_max) {
1651 q0 = q;
1652 s0 = s;
1653 score_max = score;
1654 if(no_overrun_yet) {
1655 mm0 += mm;
1656 mm = 0;
1657 }
1658 }
1659 }
1660 else {
1661 score += Wms;
1662 ++mm;
1663 }
1664 }
1665
1666 while(q0 + s0 <= left_limit0) {++q0; ++s0;}
1667
1668 bool extended_left (false);
1669 if(q0 < hitref->GetQueryStart()) {
1670 hitref->SetQueryStart(q0);
1671 hitref->SetSubjStart(s0);
1672 extended_left = true;
1673 }
1674
1675 // extend right
1676 q0 = (hitref->GetQueryStop());
1677 s0 = (hitref->GetSubjStop());
1678 score = score_max;
1679 mm = mm0;
1680
1681 no_overrun_yet = true;
1682
1683 for(Int8 q (q0 + 1), s (s0 + 1);
1684 (q + s < right_limit && score + m_XDropOff >= score_max
1685 && q < m_ii_cdna->m_Start + m_ii_cdna->m_Length
1686 && s < m_ii_genomic->m_Start + m_ii_genomic->m_Length);
1687 ++q, ++s)
1688 {
1689 if(q + s == right_limit0) {
1690 no_overrun_yet = false;
1691 mm0 += mm;
1692 }
1693
1694 Uint4 qq (q - m_ii_cdna->m_Start);
1695 Uint4 ss (s - m_ii_genomic->m_Start);
1696 if(m_CurGenomicStrand == false) {
1697 ss = m_ii_genomic->m_Length - ss - 1;
1698 }
1699
1700 if(x_IsMatch(qq, ss)) {
1701
1702 score += Wm;
1703 if(score > score_max) {
1704 q0 = q;
1705 s0 = s;
1706 score_max = score;
1707 if(no_overrun_yet) {
1708 mm0 += mm;
1709 mm = 0;
1710 }
1711 }
1712 }
1713 else {
1714 score += Wms;
1715 ++mm;
1716 }
1717 }
1718
1719 while(q0 + s0 >= right_limit0) {--q0; --s0; }
1720
1721 bool extended_right (false);
1722 if(q0 > hitref->GetQueryStop()) {
1723 hitref->SetQueryStop(q0);
1724 hitref->SetSubjStop(s0);
1725 extended_right = true;
1726 }
1727
1728 if(extended_left || extended_right) {
1729
1730 hitref->SetMismatches(mm0);
1731 const THit::TCoord len (hitref->GetQueryStop() - hitref->GetQueryStart() + 1);
1732 hitref->SetLength(len);
1733 hitref->SetIdentity((len - mm0) / double(len));
1734 hitref->SetScore(2 * len);
1735 }
1736
1737 return q0 + s0;
1738 }
1739
1740
x_CompartPair(vector<Uint8> * pvol,TSeqInfos::const_iterator ii_cdna,TSeqInfos::const_iterator ii_genomic,size_t idx_start,size_t idx_stop,size_t * pidx_compacted)1741 void CElementaryMatching::x_CompartPair(vector<Uint8>* pvol,
1742 TSeqInfos::const_iterator ii_cdna,
1743 TSeqInfos::const_iterator ii_genomic,
1744 size_t idx_start,
1745 size_t idx_stop,
1746 size_t* pidx_compacted)
1747 {
1748 if(idx_start >= idx_stop) {
1749 return;
1750 }
1751
1752 // init "global" members
1753 m_ii_cdna = ii_cdna;
1754 m_ii_genomic = ii_genomic;
1755
1756 // re-sort for better compactization
1757 sort(pvol->begin() + idx_start, pvol->begin() + idx_stop, PDiag);
1758
1759 const size_t idx_compacted_start (*pidx_compacted);
1760
1761 vector<Uint4> lens;
1762 lens.reserve(pvol->size() / 2);
1763
1764 Uint8 qs0 ((*pvol)[idx_start]);
1765 Uint4 q0 (qs0 & kUI8_LoWord);
1766 Uint4 s0 (qs0 >> 32);
1767 (*pvol)[(*pidx_compacted)++] = qs0;
1768 lens.push_back(16);
1769
1770 for(size_t idx (idx_start + 1); idx < idx_stop; ++idx) {
1771
1772 const Uint8 qs ((*pvol)[idx]);
1773 const Uint4 q (qs & kUI8_LoWord);
1774 const Uint4 s (qs >> 32);
1775
1776 if((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||
1777 (q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )
1778 {
1779 lens.back() += q - q0;
1780 }
1781 else {
1782 (*pvol)[(*pidx_compacted)++] = qs;
1783 lens.push_back(16);
1784 }
1785
1786 q0 = q;
1787 s0 = s;
1788 }
1789
1790 THitRefs hitrefs (lens.size());
1791
1792 for(size_t idx (idx_compacted_start); idx < *pidx_compacted; ++idx) {
1793
1794 const Uint8 qs ((*pvol)[idx]);
1795 const Uint4 q (qs & kUI8_LoWord);
1796 const Uint4 s (qs >> 32);
1797
1798 THitRef hitref (new THit);
1799 hitref->SetQueryStart(q);
1800 hitref->SetSubjStart(s);
1801 const Uint4 len (lens[idx - idx_compacted_start]);
1802 hitref->SetQueryStop(q + len - 1);
1803 hitref->SetSubjStop(s + len - 1);
1804 hitref->SetMismatches(0);
1805 hitref->SetGaps(0);
1806 hitref->SetEValue(0);
1807 hitref->SetIdentity(1.0);
1808 hitref->SetLength(len);
1809 hitref->SetScore(2*len);
1810 hitrefs[idx - idx_compacted_start] = hitref;
1811 }
1812
1813 #define EXTEND_USING_SEQUENCE_CHARS
1814 #ifdef EXTEND_USING_SEQUENCE_CHARS
1815
1816 // try to extend even further over possible mismatches
1817
1818 Int8 left_limit (0); // diag extension left limit
1819 Int8 right_limit (kDiagMax); // diag ext right limit
1820 Int8 s_prime_cur (0);
1821
1822 const int kn (hitrefs.size());
1823 for(int k(0); k < kn; ++k) {
1824
1825 // diag coords: q_prime = q + s; s_prime = s - q;
1826 const THit::TCoord * box (hitrefs[k]->GetBox());
1827
1828 if(Int8(box[2]) - Int8(box[0]) != s_prime_cur) {
1829 left_limit = box[0];
1830 s_prime_cur = Int8(box[2]) - Int8(box[0]);
1831 }
1832
1833 right_limit = kDiagMax;
1834 if(k + 1 < kn) {
1835 const THit::TCoord * boX (hitrefs[k+1]->GetBox());
1836 if(Int8(boX[2]) - Int8(boX[0]) == s_prime_cur) {
1837 right_limit = boX[0] + boX[2];
1838 }
1839 }
1840
1841 left_limit = x_ExtendHit(left_limit, right_limit, hitrefs[k]);
1842 }
1843
1844 // merge adjacent hits
1845 int d (-1);
1846 Int8 rlimit (0), spc (0);
1847 for(int k (0); k < kn; ++k) {
1848
1849 const THit::TCoord cur_diag_stop (hitrefs[k]->GetQueryStop()
1850 + hitrefs[k]->GetSubjStop());
1851
1852 const Int8 s_prime (-Int8(hitrefs[k]->GetQueryStart())
1853 + Int8(hitrefs[k]->GetSubjStart()));
1854 if(k == 0 || spc != s_prime) {
1855
1856 hitrefs[++d] = hitrefs[k];
1857 rlimit = cur_diag_stop;
1858 spc = s_prime;
1859 }
1860 else {
1861
1862 const THit::TCoord cur_diag_start (hitrefs[k]->GetQueryStart()
1863 + hitrefs[k]->GetSubjStart());
1864
1865 THitRef & hrd (hitrefs[d]);
1866
1867 if(rlimit + 2 == cur_diag_start) {
1868
1869 rlimit = cur_diag_stop;
1870 hrd->SetQueryStop(hitrefs[k]->GetQueryStop());
1871 hrd->SetSubjStop(hitrefs[k]->GetSubjStop());
1872 hrd->SetMismatches(hrd->GetMismatches()
1873 + hitrefs[k]->GetMismatches());
1874 const THit::TCoord len (hrd->GetQueryStop()
1875 - hrd->GetQueryStart() + 1);
1876 hrd->SetLength(len);
1877 hrd->SetIdentity(double(len - hrd->GetMismatches()) / len);
1878 hrd->SetScore(2 * len);
1879 }
1880 else if (rlimit + 1 < cur_diag_start) {
1881
1882 hitrefs[++d] = hitrefs[k];
1883 rlimit = cur_diag_stop;
1884 }
1885 else {
1886 NCBI_THROW(CException, eUnknown,
1887 "x_CompartPair(): Unexpected alignment overlap");
1888 }
1889 }
1890 }
1891 hitrefs.resize(min(d + 1, kn));
1892
1893 #undef EXTEND_USING_SEQUENCE_CHARS
1894 #endif
1895
1896 // make sure nothing is stretching out
1897 const Uint4 qmax (m_ii_cdna->m_Start + m_ii_cdna->m_Length);
1898 NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1899 if((*ii)->GetQueryStop() >= qmax) {
1900 (*ii)->Modify(1, qmax - 1);
1901 }
1902 }
1903
1904 // Remap hit coordinates.
1905 NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1906 THit& h (**ii);
1907
1908 h.SetQueryId (m_cDNASeqIds[m_ii_cdna->m_Oid]);
1909 h.SetSubjId (m_GenomicSeqIds[m_ii_genomic->m_Oid]);
1910
1911 const Uint4 * box0 (h.GetBox());
1912 Uint4 box [4] = {
1913 box0[0] - m_ii_cdna->m_Start,
1914 box0[1] - m_ii_cdna->m_Start,
1915 box0[2] - m_ii_genomic->m_Start,
1916 box0[3] - m_ii_genomic->m_Start
1917 };
1918
1919 if(m_CurGenomicStrand == false) {
1920 box[2] = m_ii_genomic->m_Length - box[2] - 1;
1921 box[3] = m_ii_genomic->m_Length - box[3] - 1;
1922 }
1923
1924 h.SetBox(box);
1925 }
1926
1927 if (m_HitsOnly) {
1928 ITERATE(THitRefs, ii, hitrefs) {
1929 const THit& h (**ii);
1930 if(h.GetLength() < m_MinHitLength) continue;
1931 cout << h << endl;
1932 }
1933 } else {
1934
1935 // identify compartments
1936 const Uint4 qlen (m_ii_cdna->m_Length);
1937
1938 const THit::TCoord min_matches1 = THit::TCoord(
1939 round(m_MinCompartmentIdty * qlen));
1940
1941 const THit::TCoord min_matches2 = THit::TCoord(
1942 round(m_MinSingletonIdty * qlen));
1943
1944 const THit::TCoord penalty = THit::TCoord(round(m_Penalty * qlen));
1945
1946 CCompartmentAccessor<THit> ca(penalty,
1947 min_matches1,
1948 min_matches2,
1949 true);
1950 ca.SetMaxIntron(m_MaxIntron);
1951 ca.Run(hitrefs.begin(), hitrefs.end());
1952
1953 if (m_OutputMethod) {
1954 // print individual compartments
1955 THitRefs comp;
1956 for(bool b0 (ca.GetFirst(comp)); b0; b0 = ca.GetNext(comp)) {
1957 ITERATE(THitRefs, ii, comp) {
1958 const THit& h (**ii);
1959 cout << h << endl;
1960 }
1961 // empty line to separate compartments
1962 cout << endl;
1963 }
1964 } else {
1965 TResults pair_results = ca.AsSeqAlignSet();
1966 if (! m_Results) { // Store first results, or append to existing?
1967 m_Results = pair_results;
1968 } else {
1969 m_Results->Set().insert(m_Results->Set().end(),
1970 pair_results->Set().begin(),
1971 pair_results->Set().end());
1972 }
1973 }
1974 }
1975 }
1976
1977
x_LoadRemapData(ISequenceSource * m_qsrc,const string & sdb)1978 void CElementaryMatching::x_LoadRemapData(ISequenceSource *m_qsrc, const string& sdb)
1979 {
1980 const string filename_genomic (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Remap);
1981 const size_t elems_genomic (CFile(filename_genomic).GetLength()/sizeof(SSeqInfo));
1982 m_SeqInfos_Genomic.resize(elems_genomic);
1983 CNcbiIfstream ifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);
1984 ifstr_genomic.read((char *) &m_SeqInfos_Genomic.front(),
1985 elems_genomic * sizeof (SSeqInfo));
1986 ifstr_genomic.close();
1987
1988 const string filename_cdna (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_q + kFileExt_Remap);
1989 const size_t elems_cdna (CFile(filename_cdna).GetLength()/sizeof(SSeqInfo));
1990 m_SeqInfos_cdna.resize(elems_cdna);
1991 CNcbiIfstream ifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);
1992 ifstr_cdna.read((char *) &m_SeqInfos_cdna.front(),
1993 elems_cdna * sizeof (SSeqInfo));
1994 ifstr_cdna.close();
1995
1996 {{
1997 CSeqDB blastdb (sdb, CSeqDB::eNucleotide);
1998 m_GenomicSeqIds.clear();
1999 for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
2000 THit::TId id (blastdb.GetSeqIDs(oid).front());
2001 m_GenomicSeqIds.push_back(id);
2002 }
2003 }}
2004
2005 {{
2006 m_cDNASeqIds.clear();
2007 for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
2008
2009 THit::TId id (m_qsrc->GetSeqID());
2010 m_cDNASeqIds.push_back(id);
2011 }
2012 }}
2013 }
2014
2015
2016 ////////////////////////////////////////////////////////////////////////////////////
2017 ////
2018
2019
GenerateSeed(const string & str)2020 CRandom::TValue GenerateSeed(const string & str)
2021 {
2022 CRandom::TValue rv (0);
2023 ITERATE(string, ii, str) {
2024 rv = (rv * 3 + (*ii)) % 3571;
2025 }
2026 return time(0) - 5000 + rv;
2027 }
2028
2029
x_InitBasic(void)2030 void CElementaryMatching::x_InitBasic(void)
2031 {
2032 CRandom rand (GenerateSeed("qq" + m_sdb));
2033 const string base_sfx ( NStr::NumericToString(rand.GetRand()));
2034 m_lbn_q = GetLocalBaseName("qq", base_sfx);
2035 m_lbn_s = GetLocalBaseName(m_sdb, base_sfx);
2036
2037 m_OutputMethod = false;
2038
2039 m_Penalty = 0.55;
2040 m_MinSingletonIdty = m_MinCompartmentIdty = .75;
2041 m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
2042 m_HitsOnly = false;
2043 m_MaxVolSize = 512 * 1024 * 1024;
2044
2045 m_MinQueryLength = 50;
2046 m_MaxQueryLength = 500000;
2047 m_MinHitLength = 1;
2048 }
2049
2050
Run(void)2051 void CElementaryMatching::Run(void)
2052 {
2053 x_Cleanup();
2054
2055 // create genomic ID and coordinate remapping tables.
2056 x_CreateRemapData(m_sdb, eIM_Genomic);
2057
2058 // create the filtering (repeats + low complexity) table
2059 // using the plus strand of the genomic sequence;
2060 x_InitFilteringVector(m_sdb, true);
2061
2062 // create cDNA ID and coordinate remapping tables;
2063 x_CreateRemapData(m_qsrc, eIM_cDNA);
2064
2065 // create cDNA index using the current filtering table;
2066 x_CreateIndex(m_qsrc, eIM_cDNA, true);
2067
2068 // init the N-mer participation vector;
2069 x_InitParticipationVector(true);
2070
2071 // create plus-strand genomic index using the participation vector
2072 x_CreateIndex(m_sdb, eIM_Genomic, true);
2073
2074 // generate N-hits for both strands of the genome;
2075 // compact and identify compartments;
2076 // restore original coordinates and IDs in final hits;
2077 // print compartments
2078 x_LoadRemapData(m_qsrc, m_sdb);
2079
2080 x_Search(true);
2081
2082 TStrings vol_exts;
2083 vol_exts.push_back(kFileExt_Offsets);
2084 vol_exts.push_back(kFileExt_Positions);
2085 x_CleanVolumes(m_lbn_q, vol_exts);
2086 x_CleanVolumes(m_lbn_s, vol_exts);
2087
2088 // create the filtering table for the minus genomic strand
2089 // from the plus strand filtering table
2090 x_InitFilteringVector(m_sdb, false);
2091
2092 // repeat the steps to create cDNA and genomic indices
2093 x_CreateIndex(m_qsrc, eIM_cDNA, false);
2094 x_InitParticipationVector(false);
2095 x_CreateIndex(m_sdb, eIM_Genomic, false);
2096
2097 x_Search(false);
2098 }
2099
2100
x_Cleanup(void)2101 void CElementaryMatching::x_Cleanup(void)
2102 {
2103 m_Mers.Clear();
2104 TStrings vol_exts;
2105 vol_exts.push_back(kFileExt_Offsets);
2106 vol_exts.push_back(kFileExt_Positions);
2107 vol_exts.push_back(kFileExt_Masked);
2108 vol_exts.push_back(kFileExt_Remap);
2109 x_CleanVolumes(m_lbn_q, vol_exts);
2110 x_CleanVolumes(m_lbn_s, vol_exts);
2111 m_Results.Reset();
2112 }
2113
2114
2115 END_NCBI_SCOPE
2116