1 /* $Id: compart_matching.cpp 536658 2017-05-22 15:48:20Z zaretska $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Yuri Kapustin
27 *
28 * ===========================================================================
29 */
30 
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <math.h>
35 
36 #include <corelib/ncbi_system.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <util/random_gen.hpp>
39 #include <algo/align/util/compartment_finder.hpp>
40 #include <objtools/blast/seqdb_reader/seqdb.hpp>
41 #include <algo/align/splign/compart_matching.hpp>
42 
43 BEGIN_NCBI_SCOPE
44 
45 vector<CSeq_id_Handle> CBlastSequenceSource::s_ids;
46 
47 
48 namespace {
49 
50     // N-mer size for repeat filtering
51     const Uint4        kNr                (14);
52 
53     // total number of Nr-mers for repeat filtering
54     const size_t       kNrMersTotal       (1 << (kNr * 2));
55 
56     // percentile used for repeat filtering
57     const double       kRepeatsPercentile (0.995);
58 
59     // N-mer size for matching
60     const Uint4        kN                 (16);
61 
62     // total number of N-mers for indexing
63     const Uint8        kNMersTotal        (Uint8(1) << (kN * 2));
64 
65     // extension to indicate repeat filtering table
66     const char *kFileExt_Masked = ".rep";
67 
68     // extension to indicate ID and coordinate remapping table
69     const char *kFileExt_Remap = ".idc";
70     const char *kFileExt_Offsets = ".ofs";
71     const char *kFileExt_Positions = ".pos";
72 
73     const Uint8  kUI8_LoWord              (0xFFFFFFFF);
74     const Uint8  kUI8_LoByte              (kUI8_LoWord >> 24);
75     const Uint8  kUI8_MidWord             (kUI8_LoWord << 16);
76     const Uint8  kUI8_HiWord              (kUI8_LoWord << 32);
77     const Uint8  kUI8_LoFive              (kUI8_LoWord | (kUI8_LoWord << 8));
78 
79     const Uint8  kUI8_LoHalfWordEachByte  ((Uint8(0x0F0F0F0F) << 32) | 0x0F0F0F0F);
80 
81     const Uint4  kUI4_Lo28                (0xFFFFFFFF >> 4);
82 
83     const Uint8  kUI8_SeqDb_lo            (0x03030303);
84     const Uint8  kUI8_SeqDb               ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);
85 
86     const Int8   kDiagMax                 (numeric_limits<Int8>::max());
87 
88     const Uint8  kSeqDbMemBound           (512 * 1024 * 1024);
89     const size_t kMapGran                 (512 * 1024 * 1024);
90 }
91 
92 
DecodeSeqDbChar(Uint1 c)93 char DecodeSeqDbChar(Uint1 c)
94 {
95     char rv ('*');
96     switch(c) {
97     case 0: rv = 'A'; break;
98     case 1: rv = 'C'; break;
99     case 2: rv = 'G'; break;
100     case 3: rv = 'T'; break;
101     }
102     return rv;
103 }
104 
105 //#define ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
107 
108 template<typename T>
GetBinString(T v)109 string GetBinString(T v)
110 {
111     vector<bool> vb;
112 
113     const size_t bitdim (sizeof(T) * 8);
114     for(size_t i(0); i < bitdim; ++i) {
115         vb.push_back(v & 1);
116         v >>= 1;
117     }
118 
119     CNcbiOstrstream ostr;
120     for(size_t i (bitdim); i > 0; --i) {
121         if(i < bitdim && i % 8 == 0) {
122             ostr << ' ';
123         }
124         ostr << vb[i-1];
125     }
126 
127     const string rv = CNcbiOstrstreamToString(ostr);
128     return rv;
129 }
130 
131 #endif
132 
CheckWrittenFile(const string & filename,const Uint8 & len_bytes)133 void CheckWrittenFile(const string& filename, const Uint8& len_bytes)
134 {
135     Int8 reported_len (-1);
136     for(size_t attempt(0); attempt < 1; ++attempt) {
137         reported_len = CFile(filename).GetLength();
138         if(reported_len >= 0 && Uint8(reported_len) == len_bytes) {
139             return;
140         }
141         else {
142             SleepSec(1);
143         }
144     }
145 
146     CNcbiOstrstream ostr;
147 
148     if(reported_len < 0) {
149         ostr << "Cannot write " << filename
150              << " (error code = " << reported_len << "). ";
151     }
152     else {
153         ostr << "The size of " << filename << " (" << reported_len << ')'
154              << " is different from the expected " << len_bytes << ". ";
155     }
156     ostr << "Please make sure there is enough disk space.";
157 
158     const string errmsg = CNcbiOstrstreamToString(ostr);
159     NCBI_THROW(CException, eUnknown, errmsg);
160 }
161 
162 
163 //
164 // Brute-force reversal & complementation
165 
166 template<typename T>
ReverseAndComplement(T v)167 T ReverseAndComplement(T v)
168 {
169     T rv (0);
170     for(size_t i (0), imax (4*sizeof(T)); i < imax; ++i, v >>= 2) {
171         rv = (rv << 2) | (v & 3);
172     }
173     rv = ~rv;
174 
175     return rv;
176 }
177 
178 
179 //
180 // Table-based bit reversal & complementation
181 
182 template<typename T>
183 class CReverseAndComplement {
184 
185 public:
186 
CReverseAndComplement(void)187     CReverseAndComplement(void) {
188 
189         m_Table.resize(256);
190 
191         for(Uint1 i(1); i < 0xFF; ++i) {
192             m_Table[i] = ReverseAndComplement(i);
193         }
194 
195         m_Table[0]     = 0xFF;
196         m_Table[0xFF]  = 0;
197     }
198 
operator ()(T v) const199     T operator() (T v) const {
200 
201         T rv (0);
202         for(size_t i(0), imax(sizeof(T)); i < imax; ++i)  {
203             rv <<= 8;
204             rv |= m_Table[v & 0xFF];
205             v >>= 8;
206         }
207         return rv;
208     }
209 
210 private:
211 
212     vector<Uint1> m_Table;
213 };
214 
215 
216 namespace {
217     CReverseAndComplement<Uint4> g_RC;
218 }
219 
220 
s_IsLowComplexity(Uint4 key)221 bool CElementaryMatching::s_IsLowComplexity(Uint4 key)
222 {
223     // evaluate the lower 28 bits (14 residues) only
224 
225     const Uint4 kMaxTwoBaseContent (14);
226 
227     vector<Uint4> counts(4, 0);
228 
229     for(Uint4 k = 0; k < 14; ++k) {
230         ++counts[0x00000003 & key];
231         key >>= 2;
232     }
233 
234     const Uint4 ag (counts[0] + counts[1]);
235     const Uint4 at (counts[0] + counts[2]);
236     const Uint4 ac (counts[0] + counts[3]);
237     const Uint4 gt (counts[1] + counts[2]);
238     const Uint4 gc (counts[1] + counts[3]);
239     const Uint4 tc (counts[2] + counts[3]);
240 
241     return
242         ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||
243         ac >= kMaxTwoBaseContent || gt >= kMaxTwoBaseContent ||
244         gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;
245 }
246 
247 
GetLocalBaseName(const string & extended_name,const string & sfx)248 string GetLocalBaseName(const string& extended_name, const string & sfx)
249 {
250     string dir, base, ext;
251     CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
252     string rv (base);
253     if(!ext.empty()) {
254         rv += ext;
255     }
256     rv += "." + sfx;
257     return rv;
258 }
259 
ReplaceExt(const string & extended_name,const string & new_ext)260 string ReplaceExt(const string& extended_name, const string & new_ext)
261 {
262     string dir, base, ext;
263     CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
264     string rv;
265     if(!dir.empty()) {
266         rv = dir; // + CDirEntry::GetPathSeparator();
267     }
268     if(!base.empty()) {
269         rv += base;
270     }
271     rv += new_ext;
272 
273     return rv;
274 }
275 
276 
277 template <typename VectorT>
g_SaveToTemp(const VectorT & v,const string & path)278 string g_SaveToTemp(const VectorT& v, const string& path)
279 {
280     typedef typename VectorT::value_type TElem;
281 
282     const string filename(CFile::GetTmpNameEx(path, "splqcomp_"));
283     const Uint8 len_bytes (v.size() * sizeof(TElem));
284 
285     {
286         CNcbiOfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
287         tempcgrfile.write((const char* ) & v.front(), len_bytes);
288         tempcgrfile.close();
289     }
290 
291     CheckWrittenFile(filename, len_bytes);
292 
293     return filename;
294 }
295 
296 
297 template <typename VectorT>
g_RestoreFromTemp(const string & filename,VectorT * pvd)298 void g_RestoreFromTemp(const string& filename, VectorT* pvd)
299 {
300     typedef typename  VectorT::value_type TElem;
301 
302     VectorT& v = *pvd;
303 
304     const Uint8 dim (CFile(filename).GetLength() / sizeof(TElem));
305 
306     CNcbiIfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
307     tempcgrfile.read((char* ) & v.front(), dim * sizeof(TElem));
308 
309     CFile(filename).Remove();
310 }
311 
312 
x_InitParticipationVector(bool strand)313 void CElementaryMatching::x_InitParticipationVector(bool strand)
314 {
315     // Skim over the query offset volumes and set the bitmask
316 
317     m_Mers.Init(kNMersTotal, false);
318 
319     CDir dir (m_FilePath);
320     const string sfx (string(strand?".p": ".m") + ".v*");
321     const string mask_ofs_q (m_lbn_q + sfx + kFileExt_Offsets);
322     CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
323     ITERATE(CDir::TEntries, ii, vols_ofs_q) {
324 
325         const string  filename ((*ii)->GetPath());
326         const Int8    vol_length (CFile(filename).GetLength());
327 
328         CMemoryFile   mf (filename);
329         for(const Uint8 * p8 (reinterpret_cast<const Uint8 *> (mf.Map())),
330                 * p8e (p8 + vol_length / 8);  p8 != p8e;
331             m_Mers.set_at(*p8++ & kUI8_LoWord));
332         mf.Unmap();
333     }
334 
335     m_Mers.reset_at(0);
336 }
337 
338 
x_InitFilteringVector(const string & sdb,bool strand)339 void CElementaryMatching::x_InitFilteringVector(const string& sdb, bool strand)
340 {
341     // for plus strand create using the actual sequence data;
342     // use the plus strand table to init the minus strand table
343 
344     if(strand) {
345 
346         // create repeat filtering table (genome);
347         CRef<CSeqDB> subjdb (new CSeqDB(sdb, CSeqDB::eNucleotide));
348         if(subjdb->GetTotalLength() > numeric_limits<Uint4>::max())
349         {
350             CNcbiOstrstream ostr;
351             ostr << "Sequence volumes with total length exceeding "
352                  << numeric_limits<Uint4>::max()
353                  << " are not yet supported. Please split your FASTA file and re-run "
354                  << " formatdb.";
355             const string err = CNcbiOstrstreamToString(ostr);
356             NCBI_THROW(CException, eUnknown, err);
357         }
358 
359         Uint4 current_offset (0);
360 
361         auto_ptr<vector<Uint4> > pNrCounts (new vector<Uint4> (kNrMersTotal, 0));
362         vector<Uint4> & NrCounts (*pNrCounts);
363 
364         cerr << " Scanning " << subjdb->GetNumSeqs() << " genomic sequences ... ";
365         for(int oid (0); subjdb->CheckOrFindOID(oid); ++oid)
366         {
367             const char * pcb (0);
368             const Uint4 bases (subjdb->GetSequence(oid, &pcb));
369             const char * pcbe (pcb + bases / 4);
370             uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
371             npcb -= npcb % 8;
372             if(npcb < npcb0) npcb += 8;
373             const size_t gcbase (4*(npcb - npcb0));
374             const Uint8 * pui8 (reinterpret_cast<const Uint8*>(npcb));
375             const Uint8 * pui8_e (reinterpret_cast<const Uint8*>(pcbe));
376 
377             for(size_t gccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {
378 
379                 Uint8 ui8 (*pui8);
380 
381                 // counting the hi 28 bits correspond to
382                 // a subsequence at positions 1,2,5-16
383 #define QCOMP_COUNT_NrMERS  {{ \
384                 if(gccur + 16 >= current_offset + bases) { \
385                     break; \
386                 } \
387                 const Uint4 mer4 (ui8 & kUI8_LoWord); \
388                 ++NrCounts[mer4 >> 4];           \
389                 gccur += 4; \
390                 ui8 >>= 8; \
391                 }}
392 
393                 QCOMP_COUNT_NrMERS;
394                 QCOMP_COUNT_NrMERS;
395                 QCOMP_COUNT_NrMERS;
396                 QCOMP_COUNT_NrMERS;
397 
398                 if(pui8 + 1 < pui8_e) {
399 
400                     ui8 |= (*(pui8 + 1) << 32);
401 
402                     QCOMP_COUNT_NrMERS;
403                     QCOMP_COUNT_NrMERS;
404                     QCOMP_COUNT_NrMERS;
405                     QCOMP_COUNT_NrMERS;
406                 }
407 
408 #undef QCOMP_COUNT_NMERS
409             }
410 
411             subjdb->RetSequence(&pcb);
412 
413             current_offset += bases;
414         } // OIDs
415 
416         subjdb.Reset(0);
417 
418         cerr << "Ok" << endl;
419         cerr << " Constructing FV ... ";
420 
421         string filename_temp_01;
422         auto_ptr<vector<Uint4> > pNrCounts2 (new vector<Uint4>);
423         vector<Uint4> & NrCounts2 (* pNrCounts2);
424         try {
425             NrCounts2 = NrCounts;
426         }
427         catch(...) {
428             filename_temp_01 = g_SaveToTemp(NrCounts, m_FilePath);
429             pNrCounts2.reset(0);
430         }
431 
432         // find the percentile
433         size_t total_mers (0);
434         ITERATE(vector<Uint4>, ii, NrCounts) {
435             if(*ii > 0) ++total_mers;
436         }
437         const size_t percentile ((size_t)(kNrMersTotal -
438                                           total_mers * (1 - kRepeatsPercentile)));
439         nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());
440         const Uint4 max_repeat_count (NrCounts[percentile]);
441 
442         if(filename_temp_01.empty()) {
443             NrCounts = NrCounts2;
444         }
445         else {
446             g_RestoreFromTemp(filename_temp_01, &NrCounts);
447         }
448         pNrCounts2.reset(0);
449 
450         m_Mers.Init(kNrMersTotal, false);
451         for(size_t i (0); i < kNrMersTotal; ++i) {
452             const bool v (NrCounts[i] <= max_repeat_count && !s_IsLowComplexity(i));
453             if(v) {
454                 m_Mers.set_at(i);
455             }
456         }
457         pNrCounts.reset(0);
458 
459         // serialize
460         const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
461         const Uint8 len_bytes (kNrMersTotal / 8);
462         {{
463             CMemoryFile mf (masked_filename, CMemoryFile::eMMP_Write,
464                             CMemoryFile::eMMS_Shared, 0, kNrMersTotal / 8,
465                             CMemoryFile::eCreate,
466                             len_bytes);
467             Uint8* dest (reinterpret_cast<Uint8*>(mf.Map()));
468 
469             const Uint8 * src (m_Mers.GetBuffer());
470             copy(src, src + kNrMersTotal / 64, dest);
471         }}
472 
473         CheckWrittenFile(masked_filename, len_bytes);
474     }
475     else {
476 
477         cerr << " Reading/transforming FV ... ";
478 
479         // read the plus strand vector and trasnform
480         const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
481 
482         CMemoryFile mf (masked_filename);
483         const Uint8 * p8 (reinterpret_cast<Uint8*>(mf.Map()));
484         CBoolVector nrmers_plus (kNrMersTotal, p8);
485         mf.Unmap();
486 
487         m_Mers.Init(kNrMersTotal, false);
488 
489         for(size_t i(0); i < kNrMersTotal; ++i) {
490             if(nrmers_plus.get_at(i)) {
491                 // todo: optimize using tables
492                 const size_t irc (g_RC(Uint4(i) << 4) & kUI4_Lo28);
493                 m_Mers.set_at(irc);
494             }
495         }
496     }
497 
498     cerr << "Ok" << endl;
499 }
500 
501 
x_CreateRemapData(const string & db,EIndexMode mode)502 void CElementaryMatching::x_CreateRemapData(const string& db, EIndexMode mode)
503 {
504     // create ID and coordinate remapping tables
505 
506     CSeqDB blastdb (db, CSeqDB::eNucleotide);
507 
508     TSeqInfos seq_infos;
509     const size_t num_seqs (blastdb.GetNumSeqs());
510     seq_infos.reserve(num_seqs);
511 
512     Uint4 current_offset (0);
513     for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
514 
515         const int seqlen (blastdb.GetSeqLength(oid));
516         if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
517             CNcbiOstrstream ostr;
518             ostr << "Cannot create remap data for:\t" <<
519                 blastdb.GetSeqIDs(oid).front()->GetSeqIdString(true);
520             const string err = CNcbiOstrstreamToString(ostr);
521             NCBI_THROW(CException, eUnknown, err);
522         }
523 
524         const Uint4 bases (seqlen);
525         seq_infos.push_back(SSeqInfo(current_offset, bases, oid));
526         current_offset += bases;
527     }
528 
529     const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
530                                  kFileExt_Remap);
531 
532     const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
533 
534     CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
535     const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
536     ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
537     ofstr_remap.close();
538 
539     CheckWrittenFile(full_remap_filename, len_bytes);
540 
541     cerr << " Remap data created for " << db << "; max offset = "
542          << current_offset << endl;
543 }
544 
x_CreateRemapData(ISequenceSource * m_qsrc,EIndexMode mode)545 void CElementaryMatching::x_CreateRemapData(ISequenceSource *m_qsrc, EIndexMode mode)
546 {
547     // create ID and coordinate remapping tables
548 
549     TSeqInfos seq_infos;
550     const size_t num_seqs (m_qsrc->GetNumSeqs());
551     seq_infos.reserve(num_seqs);
552 
553     Uint4 current_offset (0);
554     for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
555 
556         const int seqlen (m_qsrc->GetSeqLength());
557         if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
558             CNcbiOstrstream ostr;
559             ostr << "Cannot create remap data for:\t" <<
560                 m_qsrc->GetSeqID()->GetSeqIdString(true);
561             const string err = CNcbiOstrstreamToString(ostr);
562             NCBI_THROW(CException, eUnknown, err);
563         }
564 
565         const Uint4 bases (seqlen);
566         seq_infos.push_back(SSeqInfo(current_offset, bases, m_qsrc->GetCurrentIndex()));
567         current_offset += bases;
568     }
569 
570     const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
571                                  kFileExt_Remap);
572 
573     const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
574 
575     CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
576     const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
577     ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
578     ofstr_remap.close();
579 
580     CheckWrittenFile(full_remap_filename, len_bytes);
581 
582     cerr << " Remap data created for sequences; max offset = "
583          << current_offset << endl;
584 }
585 
586 
x_CreateIndex(const string & db,EIndexMode mode,bool strand)587 void CElementaryMatching::x_CreateIndex(const string& db, EIndexMode mode, bool strand)
588 {
589     // sort all adjacent genomic N-mers and their positions
590     // except for those marked in the Nr-mer bit vector
591 
592     cerr << " Scanning " << db << " for N-mers and their positions." << endl;
593 
594     if(m_Mers.get_at(0)) {
595         NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
596     }
597 
598     const size_t mcidx_max (m_MaxVolSize / 8);
599     vector<Uint8> MersAndCoords (mcidx_max, 0);
600     size_t mcidx (0);
601     size_t current_offset (0);
602 
603     CRef<CSeqDB> blastdb (new CSeqDB(db, CSeqDB::eNucleotide));
604 
605     const Uint8 blastdb_total_length (blastdb->GetTotalLength());
606     if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
607        || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
608     {
609         CNcbiOstrstream ostr;
610         ostr << "Sequence volumes with total length exceeding "
611              << numeric_limits<Uint4>::max()
612              << " are not yet supported. Please split your FASTA file and re-run "
613              << " formatdb.";
614         const string err = CNcbiOstrstreamToString(ostr);
615         NCBI_THROW(CException, eUnknown, err);
616     }
617 
618     size_t volume(0);
619     for(int oid (0); blastdb->CheckOrFindOID(oid); ++oid) {
620 
621         const char * pcb (0);
622         const Uint4 bases (blastdb->GetSequence(oid, &pcb));
623         const char * pce (pcb + bases/4);
624         uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
625         npcb -= npcb % 8;
626         if(npcb < npcb0) npcb += 8;
627         const Uint4  gcbase (4*(npcb - npcb0));
628         const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
629         const Uint8* pui8 (pui8_start);
630 
631         // It helps not to break volumes in the middle of a sequence.
632         // We explicitly check here if the volume is close to its limit.
633 
634         const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
635                                     bases);
636 
637         if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
638             MersAndCoords.resize(mcidx);
639             x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
640             MersAndCoords.assign(mcidx_max, 0);
641             mcidx = 0;
642         }
643 
644         if(mode == eIM_Genomic) {
645 
646             // index every other position
647             for(size_t gccur (current_offset + gcbase);
648                 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
649                 ++pui8)
650             {
651                 Uint8 w8 (*pui8);
652 
653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
654                 size_t gccur2 (gccur + 2); \
655                 const Uint8 ui8_2930 (w8 >> 60); \
656                 Uint8 ui8_ls4 (w8 << 4); \
657                 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
658                 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
659                 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
660 
661                 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
662 
663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
664                 { \
665                     if(gccur + 16 >= current_offset + bases) { \
666                         break; \
667                     } \
668                     const Uint8 mer (w8 & kUI8_LoWord); \
669                     if(strand) { \
670                         if(m_Mers.get_at(mer)) { \
671                             MersAndCoords[mcidx++] = (mer << 32) | gccur; \
672                         } \
673                     } \
674                     else { \
675                         const Uint4 rc (g_RC(Uint4(mer))); \
676                         if(m_Mers.get_at(rc)) { \
677                             MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
678                                 ((current_offset + bases - gccur - 16) \
679                                  + current_offset); \
680                         } \
681                     } \
682                     gccur += 4; \
683                     w8 >>= 8; \
684                 }
685 
686                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
687                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
688                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
689                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
690 
691                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
692                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
693                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
694                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
695 
696                 if(gccur + 32 >= current_offset + bases) {
697                     break;
698                 }
699                 else {
700 
701                     w8 |= ((*(pui8 + 1)) << 32);
702 
703                     QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
704 
705                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
706                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
707                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
708                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
709 
710                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
711                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
712                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
713                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
714                 }
715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
716 #undef QCOMP_CREATE_GENOMIC_IDX
717             }
718         }
719         else {  // cDNA mode
720 
721             if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
722 
723                 // head
724                 Uint8 fivebytes (0);
725                 for(const char * p (pcb),
726                     *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
727                     p < pche; ++p)
728                 {
729                     const Uint8 c8 (*p & kUI8_LoByte);
730                     const Uint8 ui8curbyte (c8);
731                     if(pcb + 5 > p) {
732                         fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
733                     }
734                     else {
735 
736                         for(Uint4 k(0); k < 4; ++k) {
737 
738                             const Uint4 mer (fivebytes & kUI8_LoWord);
739                             if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
740                                 const Uint8 ui8   (mer);
741                                 const Uint4 coord (current_offset +
742                                                    4*(p - pcb - 5) + k);
743                                 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
744                             }
745 
746                             fivebytes &= kUI8_LoFive;
747                             fivebytes <<= 2;
748                             const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
749                             fivebytes &= ~kUI8_SeqDb;
750                             fivebytes |= ui8m;
751                         }
752                         fivebytes |= ui8curbyte << 32;
753                     }
754                 }
755 
756                 // body
757                 Uint8 ui8 (0);
758                 Uint4 gccur (current_offset + gcbase);
759                 for(; gccur + 32 < current_offset + bases; ++pui8)
760                 {
761                     ui8 = *pui8;
762 
763                     for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
764 
765                         const Uint8 loword = ui8 & kUI8_LoWord;
766                         if(m_Mers.get_at(strand? (loword >> 4):
767                                          (loword & kUI4_Lo28)))
768                         {
769                             MersAndCoords[mcidx++] = (loword << 32) | gccur;
770                         }
771 
772                         const Uint8 ui8hi2 ((ui8 >> 62) << 48);
773                         ui8 <<= 2;
774                         const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
775                         ui8 &= ~kUI8_SeqDb;
776                         ui8 |= (ui8m | ui8hi2);
777                     }
778 
779                     if(gccur + 32 < current_offset + bases) {
780 
781                         // pre-read next 16 residues
782                         const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
783                         const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
784                         Uint4 ui4 (*pui4);
785                         Uint8 ui8_4 (ui4);
786                         ui8 |= (ui8_4 << 32);
787 
788                         for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
789 
790                             const Uint8 loword = ui8 & kUI8_LoWord;
791                             if(m_Mers.get_at(strand? (loword >> 4):
792                                              (loword & kUI4_Lo28)))
793                             {
794                                 MersAndCoords[mcidx++] = (loword << 32) | gccur;
795                             }
796 
797                             const Uint8 ui8hi2 ((ui8 >> 62) << 48);
798                             ui8 <<= 2;
799                             const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
800                             ui8 &= ~kUI8_SeqDb;
801                             ui8 |= (ui8m | ui8hi2);
802                         }
803                     }
804                 }
805 
806                 // tail
807                 if(gccur + 16 <= current_offset + bases) {
808 
809                     fivebytes = (gccur == current_offset + gcbase)? fivebytes:
810                         (ui8 & kUI8_LoWord);
811                     const char * p (reinterpret_cast<const char*>(pui8_start) + 4
812                                     + (gccur - current_offset - gcbase) / 4);
813                     size_t k (0);
814                     do {
815                         const Uint8 loword = fivebytes & kUI8_LoWord;
816 
817                         if(m_Mers.get_at(strand? (loword >> 4):
818                                          (loword & kUI4_Lo28)))
819                         {
820                             MersAndCoords[mcidx++] = (loword << 32) | gccur;
821                         }
822 
823                         if(k % 4 == 0) {
824                             if(p < pce) {
825                                 const Uint8 ui8 (*p++ & kUI8_LoByte);
826                                 fivebytes |= (ui8 << 32);
827                             }
828                             else {
829                                 break;
830                             }
831                         }
832 
833                         fivebytes &= kUI8_LoFive;
834                         fivebytes <<= 2;
835                         const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
836                         fivebytes &= ~kUI8_SeqDb;
837                         fivebytes |= ui8m;
838 
839                         ++k;
840                         ++gccur;
841                     } while (true);
842                 }
843 
844             } // min cDNA length check
845 
846         } // genomic or cDNA
847 
848         blastdb->RetSequence(&pcb);
849 
850         current_offset += bases;
851 
852         if(mcidx >= mcidx_max) {
853             CNcbiOstrstream ostr;
854             ostr << "Selected max volume size is too small: "
855                  << "it must be large enough to fit the index for the "
856                  << "longest input sequence.";
857             const string err = CNcbiOstrstreamToString(ostr);
858             NCBI_THROW(CException, eUnknown, err);
859         }
860     } // seqdb iteration
861 
862     blastdb.Reset(0);
863 
864     if(mcidx > 0) {
865         MersAndCoords.resize(mcidx);
866         x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
867         mcidx = 0;
868     }
869 
870     m_Mers.Clear();
871 }
872 
x_CreateIndex(ISequenceSource * m_qsrc,EIndexMode mode,bool strand)873 void CElementaryMatching::x_CreateIndex(ISequenceSource *m_qsrc, EIndexMode mode, bool strand)
874 {
875     // sort all adjacent genomic N-mers and their positions
876     // except for those marked in the Nr-mer bit vector
877 
878     cerr << " Scanning sequences for N-mers and their positions." << endl;
879 
880     if(m_Mers.get_at(0)) {
881         NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
882     }
883 
884     const size_t mcidx_max (m_MaxVolSize / 8);
885     vector<Uint8> MersAndCoords (mcidx_max, 0);
886     size_t mcidx (0);
887     size_t current_offset (0);
888 
889 
890     const Uint8 blastdb_total_length (m_qsrc->GetTotalLength());
891     if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
892        || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
893     {
894         CNcbiOstrstream ostr;
895         ostr << "Sequence volumes with total length exceeding "
896              << numeric_limits<Uint4>::max()
897              << " are not yet supported. Please split your FASTA file and re-run "
898              << " formatdb.";
899         const string err = CNcbiOstrstreamToString(ostr);
900         NCBI_THROW(CException, eUnknown, err);
901     }
902 
903     size_t volume(0);
904     for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
905 
906         const char * pcb (0);
907         const Uint4 bases (m_qsrc->GetSeq(&pcb));
908         const char * pce (pcb + bases/4);
909         uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
910         npcb -= npcb % 8;
911         if(npcb < npcb0) npcb += 8;
912         const Uint4  gcbase (4*(npcb - npcb0));
913         const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
914         const Uint8* pui8 (pui8_start);
915 
916         // It helps not to break volumes in the middle of a sequence.
917         // We explicitly check here if the volume is close to its limit.
918 
919         const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
920                                     bases);
921 
922         if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
923             MersAndCoords.resize(mcidx);
924             x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
925             MersAndCoords.assign(mcidx_max, 0);
926             mcidx = 0;
927         }
928 
929         if(mode == eIM_Genomic) {
930 
931             // index every other position
932             for(size_t gccur (current_offset + gcbase);
933                 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
934                 ++pui8)
935             {
936                 Uint8 w8 (*pui8);
937 
938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
939                 size_t gccur2 (gccur + 2); \
940                 const Uint8 ui8_2930 (w8 >> 60); \
941                 Uint8 ui8_ls4 (w8 << 4); \
942                 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
943                 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
944                 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
945 
946                 QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
947 
948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
949                 { \
950                     if(gccur + 16 >= current_offset + bases) { \
951                         break; \
952                     } \
953                     const Uint8 mer (w8 & kUI8_LoWord); \
954                     if(strand) { \
955                         if(m_Mers.get_at(mer)) { \
956                             MersAndCoords[mcidx++] = (mer << 32) | gccur; \
957                         } \
958                     } \
959                     else { \
960                         const Uint4 rc (g_RC(Uint4(mer))); \
961                         if(m_Mers.get_at(rc)) { \
962                             MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
963                                 ((current_offset + bases - gccur - 16) \
964                                  + current_offset); \
965                         } \
966                     } \
967                     gccur += 4; \
968                     w8 >>= 8; \
969                 }
970 
971                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
972                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
973                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
974                 QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
975 
976                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
977                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
978                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
979                 QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
980 
981                 if(gccur + 32 >= current_offset + bases) {
982                     break;
983                 }
984                 else {
985 
986                     w8 |= ((*(pui8 + 1)) << 32);
987 
988                     QCOMP_PREPARE_SHIFTED_GENOMIC_IDX;
989 
990                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
991                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
992                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
993                     QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
994 
995                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
996                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
997                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
998                     QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
999                 }
1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
1001 #undef QCOMP_CREATE_GENOMIC_IDX
1002             }
1003         }
1004         else {  // cDNA mode
1005 
1006             if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
1007 
1008                 // head
1009                 Uint8 fivebytes (0);
1010                 for(const char * p (pcb),
1011                     *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
1012                     p < pche; ++p)
1013                 {
1014                     const Uint8 c8 (*p & kUI8_LoByte);
1015                     const Uint8 ui8curbyte (c8);
1016                     if(pcb + 5 > p) {
1017                         fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
1018                     }
1019                     else {
1020 
1021                         for(Uint4 k(0); k < 4; ++k) {
1022 
1023                             const Uint4 mer (fivebytes & kUI8_LoWord);
1024                             if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
1025                                 const Uint8 ui8   (mer);
1026                                 const Uint4 coord (current_offset +
1027                                                    4*(p - pcb - 5) + k);
1028                                 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
1029                             }
1030 
1031                             fivebytes &= kUI8_LoFive;
1032                             fivebytes <<= 2;
1033                             const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1034                             fivebytes &= ~kUI8_SeqDb;
1035                             fivebytes |= ui8m;
1036                         }
1037                         fivebytes |= ui8curbyte << 32;
1038                     }
1039                 }
1040 
1041                 // body
1042                 Uint8 ui8 (0);
1043                 Uint4 gccur (current_offset + gcbase);
1044                 for(; gccur + 32 < current_offset + bases; ++pui8)
1045                 {
1046                     ui8 = *pui8;
1047 
1048                     for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1049 
1050                         const Uint8 loword = ui8 & kUI8_LoWord;
1051                         if(m_Mers.get_at(strand? (loword >> 4):
1052                                          (loword & kUI4_Lo28)))
1053                         {
1054                             MersAndCoords[mcidx++] = (loword << 32) | gccur;
1055                         }
1056 
1057                         const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1058                         ui8 <<= 2;
1059                         const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1060                         ui8 &= ~kUI8_SeqDb;
1061                         ui8 |= (ui8m | ui8hi2);
1062                     }
1063 
1064                     if(gccur + 32 < current_offset + bases) {
1065 
1066                         // pre-read next 16 residues
1067                         const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
1068                         const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
1069                         Uint4 ui4 (*pui4);
1070                         Uint8 ui8_4 (ui4);
1071                         ui8 |= (ui8_4 << 32);
1072 
1073                         for(Uint4 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1074 
1075                             const Uint8 loword = ui8 & kUI8_LoWord;
1076                             if(m_Mers.get_at(strand? (loword >> 4):
1077                                              (loword & kUI4_Lo28)))
1078                             {
1079                                 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1080                             }
1081 
1082                             const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1083                             ui8 <<= 2;
1084                             const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1085                             ui8 &= ~kUI8_SeqDb;
1086                             ui8 |= (ui8m | ui8hi2);
1087                         }
1088                     }
1089                 }
1090 
1091                 // tail
1092                 if(gccur + 16 <= current_offset + bases) {
1093 
1094                     fivebytes = (gccur == current_offset + gcbase)? fivebytes:
1095                         (ui8 & kUI8_LoWord);
1096                     const char * p (reinterpret_cast<const char*>(pui8_start) + 4
1097                                     + (gccur - current_offset - gcbase) / 4);
1098                     size_t k (0);
1099                     do {
1100                         const Uint8 loword = fivebytes & kUI8_LoWord;
1101 
1102                         if(m_Mers.get_at(strand? (loword >> 4):
1103                                          (loword & kUI4_Lo28)))
1104                         {
1105                             MersAndCoords[mcidx++] = (loword << 32) | gccur;
1106                         }
1107 
1108                         if(k % 4 == 0) {
1109                             if(p < pce) {
1110                                 const Uint8 ui8 (*p++ & kUI8_LoByte);
1111                                 fivebytes |= (ui8 << 32);
1112                             }
1113                             else {
1114                                 break;
1115                             }
1116                         }
1117 
1118                         fivebytes &= kUI8_LoFive;
1119                         fivebytes <<= 2;
1120                         const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1121                         fivebytes &= ~kUI8_SeqDb;
1122                         fivebytes |= ui8m;
1123 
1124                         ++k;
1125                         ++gccur;
1126                     } while (true);
1127                 }
1128 
1129             } // min cDNA length check
1130 
1131         } // genomic or cDNA
1132 
1133         m_qsrc->RetSequence(&pcb);
1134 
1135         current_offset += bases;
1136 
1137         if(mcidx >= mcidx_max) {
1138             CNcbiOstrstream ostr;
1139             ostr << "Selected max volume size is too small: "
1140                  << "it must be large enough to fit the index for the "
1141                  << "longest input sequence.";
1142             const string err = CNcbiOstrstreamToString(ostr);
1143             NCBI_THROW(CException, eUnknown, err);
1144         }
1145     } // seqdb iteration
1146 
1147     if(mcidx > 0) {
1148         MersAndCoords.resize(mcidx);
1149         x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
1150         mcidx = 0;
1151     }
1152 
1153     m_Mers.Clear();
1154 }
1155 
1156 
x_WriteIndexFile(size_t volume,EIndexMode mode,bool strand,vector<Uint8> & MersAndCoords)1157 size_t CElementaryMatching::x_WriteIndexFile(
1158           size_t volume,
1159           EIndexMode mode,
1160           bool   strand,
1161           vector<Uint8>& MersAndCoords)
1162 {
1163     const size_t t0 (time(0));
1164 
1165     string basename;
1166     if(mode == eIM_Genomic) {
1167         basename = m_lbn_s;
1168     }
1169     else {
1170         basename = m_lbn_q;
1171     }
1172     basename += string(strand? ".p": ".m") + ".v" + NStr::NumericToString(volume);
1173     const string filename_offs ( m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Offsets );
1174 
1175     CNcbiOfstream ofstr_offs   (filename_offs.c_str(), IOS_BASE::binary);
1176     Uint8 bytes_offs (0);
1177 
1178     const string filename_positions (m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Positions);
1179     CNcbiOfstream ofstr_positions (filename_positions.c_str(), IOS_BASE::binary);
1180     Uint8 bytes_positions (0);
1181 
1182     cerr << " Generating index volume: " << basename << " ... ";
1183 
1184     Uint4 curmer (numeric_limits<Uint4>::max());
1185     Uint4 curofs (0);
1186 
1187     sort(MersAndCoords.begin(), MersAndCoords.end());
1188 
1189     ITERATE(vector<Uint8>, ii, MersAndCoords) {
1190 
1191         const Uint4 mer ((*ii & kUI8_HiWord) >> 32);
1192         if(mer != curmer) {
1193 
1194             ofstr_offs.write((const char*) &mer,    sizeof mer);
1195             ofstr_offs.write((const char*) &curofs, sizeof curofs);
1196             curmer = mer;
1197             bytes_offs += sizeof(mer) + sizeof(curofs);
1198         }
1199         const Uint4 pos (*ii & kUI8_LoWord);
1200 
1201         ofstr_positions.write((const char*) &pos, sizeof pos);
1202         bytes_positions += sizeof(pos);
1203         ++curofs;
1204     }
1205 
1206     // termination zero mer
1207     const Uint4 mer (0);
1208     ofstr_offs.write((const char*)&mer, sizeof mer);
1209     ofstr_offs.write((const char*)&curofs, sizeof curofs);
1210     bytes_offs += sizeof(mer) + sizeof(curofs);
1211 
1212     ofstr_offs.close();
1213     ofstr_positions.close();
1214 
1215     CheckWrittenFile(filename_offs,      bytes_offs);
1216     CheckWrittenFile(filename_positions, bytes_positions);
1217 
1218     cerr << "Ok" << endl;
1219 
1220     return time(0) - t0;
1221 }
1222 
1223 #define CHECK_MEMMAP(mm,ofs,len) \
1224 {{ \
1225     const size_t ofs1 (mm.GetOffset());     \
1226     if(ofs1 != ofs) {                                                        \
1227         cerr << "Real offset " << ofs1 << " different from " << ofs << endl; \
1228     }                                                                        \
1229     const size_t len1 (mm.GetSize());                                    \
1230     if(len1 != len) {                                                        \
1231         cerr << "Real length " << len1 << " different from " << len << endl; \
1232     } \
1233 }}
1234 
1235 
x_Search(bool strand)1236 void CElementaryMatching::x_Search(bool strand)
1237 {
1238     m_Mers.Clear();
1239 
1240     cerr << " Matching (strand = " << (strand? "plus": "minus") << ") ... ";
1241     m_CurGenomicStrand = strand;
1242 
1243     CDir dir (m_FilePath);
1244 
1245     const string sfx (string(strand?".p": ".m") + ".v*");
1246 
1247     const string mask_ofs_q   (m_lbn_q + sfx + kFileExt_Offsets);
1248     const string mask_ofs_s   (m_lbn_s + sfx + kFileExt_Offsets);
1249     CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
1250     CDir::TEntries vols_ofs_s (dir.GetEntries(mask_ofs_s));
1251 
1252     Uint8 elem_hits_total (0);
1253 
1254     ITERATE(CDir::TEntries, ii_s, vols_ofs_s) {
1255 
1256         const string filename_ofs_s ((*ii_s)->GetPath());
1257         const string filename_pos_s (ReplaceExt(filename_ofs_s, kFileExt_Positions));
1258         const CFile  file_subj (filename_ofs_s);
1259         const size_t dim_ofs_s (file_subj.GetLength() / 8);
1260 
1261         ITERATE(CDir::TEntries, ii_q, vols_ofs_q) {
1262 
1263             const string filename_ofs_q ((*ii_q)->GetPath());
1264             const string filename_pos_q (ReplaceExt(filename_ofs_q,
1265                                                     kFileExt_Positions));
1266 
1267             Uint8 hit_index_dim (0), elem_hits_this_pair (0);
1268             string filename_hit_index;
1269 
1270             {{
1271 
1272             // map offset files
1273             CMemoryFile mf_ofs_s ((*ii_s)->GetPath());
1274             const Uint8 * ofs_s (reinterpret_cast<const Uint8*>
1275                                                (mf_ofs_s.Map()));
1276 
1277             const CFile  file_query ((*ii_q)->GetPath());
1278             const size_t dim_ofs_q (file_query.GetLength() / 8);
1279 
1280             CMemoryFile mf_ofs_q ((*ii_q)->GetPath());
1281             const Uint8 * ofs_q (reinterpret_cast<const Uint8*>
1282                                                (mf_ofs_q.Map()));
1283 
1284             // build hit index
1285             THitIndex hit_index;
1286             hit_index.reserve(min(dim_ofs_q, dim_ofs_s));
1287 
1288             while((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {
1289 
1290                 while( (*ofs_q & kUI8_LoWord)
1291                        && ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))
1292                 {
1293                     ++ofs_q;
1294                 }
1295                 if((*ofs_q & kUI8_LoWord) == 0) break;
1296 
1297                 while( (*ofs_s & kUI8_LoWord)
1298                        && ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))
1299                 {
1300                     ++ofs_s;
1301                 }
1302                 if((*ofs_s & kUI8_LoWord) == 0) break;
1303 
1304                 if((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {
1305 
1306                     hit_index.push_back(SHitIndexEntry());
1307                     SHitIndexEntry& elem (hit_index.back());
1308 
1309                     elem.m_QueryOfs = (*ofs_q) >> 32;
1310                     size_t count ((*(ofs_q + 1) >> 32) - elem.m_QueryOfs);
1311                     elem.m_QueryCount = (count > 0xFFFF)? 0xFFFF: count;
1312 
1313                     elem.m_SubjOfs = (*ofs_s) >> 32;
1314                     count = ((*(ofs_s + 1) >> 32) - elem.m_SubjOfs);
1315                     elem.m_SubjCount = (count > 0xFFFF)? 0xFFFF: count;
1316 
1317                     ++ofs_s;
1318                     ++ofs_q;
1319 
1320                     elem_hits_this_pair += elem.m_QueryCount * elem.m_SubjCount;
1321                 }
1322             }
1323 
1324             // unload offset files; save hit index
1325             filename_hit_index = g_SaveToTemp(hit_index, m_FilePath);
1326             hit_index_dim = (hit_index.size());
1327             elem_hits_total += elem_hits_this_pair;
1328             }}
1329 
1330             // load position files
1331 
1332             const CFile  file_pos_s (filename_pos_s);
1333             const size_t dim_pos_s (file_pos_s.GetLength() / 4);
1334 
1335             CMemoryFile mf_pos_s (filename_pos_s);
1336             size_t map_offset_s (0);
1337             size_t map_length_s (min(kMapGran, 4*dim_pos_s - map_offset_s));
1338             const Uint4 * pos_s (reinterpret_cast<const Uint4*>(
1339                                            mf_pos_s.Map(map_offset_s, map_length_s)));
1340 
1341             const CFile  file_pos_q (filename_pos_q);
1342             const size_t dim_pos_q (file_pos_q.GetLength() / 4);
1343 
1344             CMemoryFile mf_pos_q (filename_pos_q);
1345             size_t map_offset_q (0);
1346             size_t map_length_q (min(kMapGran, 4*dim_pos_q - map_offset_q));
1347             const Uint4 * pos_q (reinterpret_cast<const Uint4*>(
1348                                            mf_pos_q.Map(map_offset_q, map_length_q)));
1349 
1350             // scan hit index file and build hits
1351             vector<Uint8> hits (elem_hits_this_pair);
1352             size_t hidx (0);
1353 
1354             CNcbiIfstream ifstr (filename_hit_index.c_str(), IOS_BASE::binary);
1355             for(size_t cnt(0); cnt < hit_index_dim; ++cnt) {
1356 
1357                 SHitIndexEntry hie;
1358                 ifstr.read((char*) &hie, sizeof(hie));
1359 
1360                 const size_t idx_s_max (hie.m_SubjOfs + hie.m_SubjCount);
1361                 const size_t idx_q_max (hie.m_QueryOfs + hie.m_QueryCount);
1362                 if(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {
1363                     NCBI_THROW(CException, eUnknown, "Coordinate out of scope");
1364                 }
1365 
1366                 if(4*idx_s_max >= map_offset_s + map_length_s) {
1367 
1368                     map_offset_s = 4 * hie.m_SubjOfs;
1369                     map_length_s = min(kMapGran, 4*dim_pos_s - map_offset_s);
1370                     mf_pos_s.Unmap();
1371                     pos_s = (reinterpret_cast<const Uint4*>
1372                              (mf_pos_s.Map(map_offset_s, map_length_s)));
1373                 }
1374 
1375                 if(4*idx_q_max >= map_offset_q + map_length_q) {
1376 
1377                     map_offset_q = 4 * hie.m_QueryOfs;
1378                     map_length_q = min(kMapGran, 4*dim_pos_q - map_offset_q);
1379                     mf_pos_q.Unmap();
1380                     pos_q = (reinterpret_cast<const Uint4*>
1381                              (mf_pos_q.Map(map_offset_q, map_length_q)));
1382                 }
1383 
1384                 for(size_t idx_s (hie.m_SubjOfs); idx_s < idx_s_max; ++idx_s) {
1385 
1386                     Uint8 hiword = pos_s[idx_s - map_offset_s/4];
1387                     hiword <<= 32;
1388                     for(size_t idx_q (hie.m_QueryOfs); idx_q < idx_q_max; ++idx_q) {
1389 
1390                         const Uint8 loword = pos_q[idx_q - map_offset_q/4];
1391                         hits[hidx++] = (hiword | loword);
1392                     }
1393                 }
1394             }
1395 
1396             if(hidx != elem_hits_this_pair) {
1397                 CNcbiOstrstream ostr;
1398                 ostr << "The number of hits found (" << hidx
1399                      << ") does not match the expected " << elem_hits_this_pair;
1400                 const string str = CNcbiOstrstreamToString(ostr);
1401                 NCBI_THROW(CException, eUnknown, str);
1402             }
1403 
1404             // remove the hit index file
1405             CFile(filename_hit_index).Remove();
1406 
1407             // detect compartments
1408             x_CompartVolume (&hits);
1409 
1410         } // query vols
1411 
1412     } // subj vols
1413 
1414     cerr << "Ok" << endl;
1415 }
1416 
1417 #undef CHECK_MEMMAP
1418 
1419 
PLoWord(const Uint8 & lhs,const Uint8 & rhs)1420 bool PLoWord(const Uint8& lhs, const Uint8& rhs)
1421 {
1422     return (lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);
1423 }
1424 
1425 
PDiag(const Uint8 & lhs,const Uint8 & rhs)1426 bool PDiag(const Uint8& lhs, const Uint8& rhs)
1427 {
1428     const double lhs_q (double(lhs & kUI8_LoWord) / 2);
1429     const double lhs_s (double(lhs >> 32) / 2);
1430     const double rhs_q (double(rhs & kUI8_LoWord) / 2);
1431     const double rhs_s (double(rhs >> 32) / 2);
1432 
1433     const double lhs_q1 (lhs_q + lhs_s);
1434     const double lhs_s1 (lhs_s - lhs_q);
1435     const double rhs_q1 (rhs_q + rhs_s);
1436     const double rhs_s1 (rhs_s - rhs_q);
1437 
1438     if(lhs_s1 == rhs_s1) {
1439         return lhs_q1 < rhs_q1;
1440     }
1441     else {
1442         return lhs_s1 < rhs_s1;
1443     }
1444 }
1445 
1446 
x_CleanVolumes(const string & lbn,const TStrings & vol_extensions)1447 void CElementaryMatching::x_CleanVolumes(const string& lbn,
1448                                        const TStrings& vol_extensions)
1449 {
1450 
1451     // make sure there are no offset or position files left before
1452     // we generate the new ones
1453 
1454     CDir dir (m_FilePath);
1455 
1456     CFileDeleteList fdl;
1457     ITERATE(TStrings, ii, vol_extensions) {
1458 
1459         const string mask (lbn + "*" + *ii);
1460         CDir::TEntries dir_entries (dir.GetEntries(mask));
1461         ITERATE(CDir::TEntries, jj, dir_entries) {
1462             fdl.Add((*jj)->GetPath());
1463         }
1464     }
1465 }
1466 
1467 
x_CompartVolume(vector<Uint8> * phits)1468 void CElementaryMatching::x_CompartVolume(vector<Uint8>* phits)
1469 {
1470     if(phits->size() == 0) {
1471         return;
1472     }
1473 
1474     // sort by the global genomic corrdinate
1475     sort(phits->begin(), phits->end());
1476 
1477     TSeqInfos::const_iterator ii_genomic_b (m_SeqInfos_Genomic.begin());
1478     TSeqInfos::const_iterator ii_genomic_e (m_SeqInfos_Genomic.end());
1479     TSeqInfos::const_iterator ii_genomic   (ii_genomic_b);
1480     TSeqInfos::const_iterator ii_cdna_b    (m_SeqInfos_cdna.begin());
1481     TSeqInfos::const_iterator ii_cdna_e    (m_SeqInfos_cdna.end());
1482 
1483     CSeqDB seqdb_genomic (m_sdb, CSeqDB::eNucleotide);
1484 
1485     m_CurSeq_cDNA = m_CurSeq_Genomic = 0;
1486 
1487     size_t idx_compacted (0);
1488     for(size_t idx (0), idx_hi (phits->size()); idx < idx_hi; ) {
1489 
1490         Uint4 gc_s (((*phits)[idx]) >> 32);
1491 
1492         // find the relevant genomic record
1493         ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e, SSeqInfo(gc_s, 0, 0));
1494         if(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {
1495             --ii_genomic;
1496         }
1497 
1498         if(gc_s < ii_genomic->m_Start ||
1499            ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)
1500         {
1501             CNcbiOstrstream ostr;
1502             ostr << "Global genomic coordinate "
1503                  << gc_s << " out of range: ["
1504                  << ii_genomic->m_Start << ", "
1505                  << (ii_genomic->m_Start + ii_genomic->m_Length) << "), "
1506                  << (m_GenomicSeqIds[ii_genomic->m_Oid]->GetSeqIdString(true));
1507             const string str = CNcbiOstrstreamToString(ostr);
1508             NCBI_THROW(CException, eUnknown, str);
1509         }
1510 
1511         const Uint4 gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);
1512 
1513         // preload genomic sequence
1514         seqdb_genomic.GetSequence(ii_genomic->m_Oid, &m_CurSeq_Genomic);
1515 
1516         // find all hits that belong to the current genomic record
1517         size_t idx0 (idx);
1518         while(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {
1519             ++idx;
1520         }
1521 
1522         // sort by the global cDNA coordinate
1523         sort(phits->begin() + idx0, phits->begin() + idx, PLoWord);
1524 
1525         // find the relevant cDNA record
1526         TSeqInfos::const_iterator ii_cdna (ii_cdna_b);
1527         Uint4 gc_q_min (ii_cdna->m_Start);
1528         Uint4 gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);
1529         size_t jdx (idx0), jdx0 (jdx);
1530         for(; jdx < idx; ++jdx) {
1531 
1532             Uint4 gc_q (((*phits)[jdx]) & kUI8_LoWord);
1533 
1534             if(gc_q < gc_q_min || gc_q >= gc_q_max) {
1535 
1536                 const THit::TCoord min_matches1 = THit::TCoord(
1537                       round(m_MinCompartmentIdty * ii_cdna->m_Length));
1538 
1539                 const THit::TCoord min_matches2 = THit::TCoord(
1540                       round(m_MinSingletonIdty * ii_cdna->m_Length));
1541 
1542                 const THit::TCoord min_matches (min(min_matches1,min_matches2));
1543 
1544                 if(kN * (jdx - jdx0) >= min_matches / 2) {
1545 
1546                     // preload genomic sequence
1547                     m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1548                     x_CompartPair(phits, ii_cdna, ii_genomic,
1549                                   jdx0, jdx, &idx_compacted);
1550                     m_qsrc->RetSequence(&m_CurSeq_cDNA);
1551                     m_CurSeq_cDNA = 0;
1552                 }
1553 
1554                 ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e, SSeqInfo(gc_q, 0, 0));
1555 
1556                 if(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {
1557                     --ii_cdna;
1558                 }
1559 
1560                 if(gc_q < ii_cdna->m_Start ||
1561                    ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)
1562                 {
1563                     CNcbiOstrstream ostr;
1564                     ostr << "Global cDNA coordinate "
1565                          << gc_q << " out of range: ["
1566                          << ii_cdna->m_Start << ", "
1567                          << (ii_cdna->m_Start + ii_cdna->m_Length) << "), "
1568                          << (m_cDNASeqIds[ii_cdna->m_Oid]->GetSeqIdString(true));
1569                     const string str = CNcbiOstrstreamToString(ostr);
1570                     NCBI_THROW(CException, eUnknown, str);
1571                 }
1572 
1573                 gc_q_min  = ii_cdna->m_Start;
1574                 gc_q_max  = ii_cdna->m_Start + ii_cdna->m_Length;
1575                 jdx0 = jdx;
1576             }
1577         }
1578 
1579         const THit::TCoord min_matches1 = THit::TCoord(
1580                       round(m_MinCompartmentIdty * ii_cdna->m_Length));
1581 
1582         const THit::TCoord min_matches2 = THit::TCoord(
1583                       round(m_MinSingletonIdty * ii_cdna->m_Length));
1584 
1585         const THit::TCoord min_matches (min(min_matches1,min_matches2));
1586 
1587         if(kN * (jdx - jdx0) >= min_matches / 2) {
1588 
1589             m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1590 
1591             x_CompartPair(phits, ii_cdna, ii_genomic,
1592                           jdx0, jdx, &idx_compacted);
1593             m_qsrc->RetSequence(&m_CurSeq_cDNA);
1594             m_CurSeq_cDNA = 0;
1595         }
1596 
1597         seqdb_genomic.RetSequence(&m_CurSeq_Genomic);
1598         m_CurSeq_Genomic = 0;
1599     }
1600 }
1601 
1602 
x_IsMatch(Uint4 q,Uint4 s) const1603 bool CElementaryMatching::x_IsMatch(Uint4 q, Uint4 s) const
1604 {
1605     const Uint1 cq (((m_CurSeq_cDNA[q / 4]) >> ((3 - (q % 4))*2)) & 0x03);
1606     const Uint1 cs (((m_CurSeq_Genomic[s / 4]) >> ((3 - (s % 4))*2)) & 0x03);
1607     const bool rv (m_CurGenomicStrand? (cq == cs): ((cq^cs) == 3));
1608     return rv;
1609 }
1610 
1611 
x_ExtendHit(const Int8 & left_limit0,const Int8 & right_limit0,THitRef hitref)1612 Int8 CElementaryMatching::x_ExtendHit(const Int8 & left_limit0,
1613                                       const Int8 & right_limit0,
1614                                       THitRef hitref)
1615 {
1616     const int Wm (1), Wms (-2);
1617     int score (int(hitref->GetLength()) * Wm
1618                + int(hitref->GetMismatches()) * (Wms - Wm));
1619     int score_max (score);
1620 
1621     const int overrun (6); // enables connecting hits over a mismatch
1622     const Int8 left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);
1623     const Int8 right_limit (right_limit0 >= kDiagMax - overrun?
1624                             kDiagMax: right_limit0 + overrun);
1625 
1626     // extend left
1627     Int8 q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());
1628     size_t mm (0), mm0 (0);
1629     bool no_overrun_yet (true);
1630     for(Int8 q (q0 - 1), s (s0 - 1);
1631         (q + s > left_limit && score + m_XDropOff >= score_max
1632          && q >= m_ii_cdna->m_Start && s >= m_ii_genomic->m_Start);
1633         --q, --s)
1634     {
1635 
1636         if(q + s == left_limit0) {
1637             no_overrun_yet = false;
1638             mm0 += mm;
1639         }
1640 
1641         Uint4 qq (q - m_ii_cdna->m_Start);
1642         Uint4 ss (s - m_ii_genomic->m_Start);
1643         if(m_CurGenomicStrand == false) {
1644             ss = m_ii_genomic->m_Length - ss - 1;
1645         }
1646 
1647         if(x_IsMatch(qq, ss)) {
1648 
1649             score += Wm;
1650             if(score > score_max) {
1651                 q0 = q;
1652                 s0 = s;
1653                 score_max = score;
1654                 if(no_overrun_yet) {
1655                     mm0 += mm;
1656                     mm = 0;
1657                 }
1658             }
1659         }
1660         else {
1661             score += Wms;
1662             ++mm;
1663         }
1664     }
1665 
1666     while(q0 + s0 <= left_limit0) {++q0; ++s0;}
1667 
1668     bool extended_left (false);
1669     if(q0 < hitref->GetQueryStart()) {
1670         hitref->SetQueryStart(q0);
1671         hitref->SetSubjStart(s0);
1672         extended_left = true;
1673     }
1674 
1675     // extend right
1676     q0 = (hitref->GetQueryStop());
1677     s0 = (hitref->GetSubjStop());
1678     score = score_max;
1679     mm = mm0;
1680 
1681     no_overrun_yet = true;
1682 
1683     for(Int8 q (q0 + 1), s (s0 + 1);
1684         (q + s < right_limit && score + m_XDropOff >= score_max
1685          && q < m_ii_cdna->m_Start + m_ii_cdna->m_Length
1686          && s < m_ii_genomic->m_Start + m_ii_genomic->m_Length);
1687         ++q, ++s)
1688     {
1689         if(q + s == right_limit0) {
1690             no_overrun_yet = false;
1691             mm0 += mm;
1692         }
1693 
1694         Uint4 qq (q - m_ii_cdna->m_Start);
1695         Uint4 ss (s - m_ii_genomic->m_Start);
1696         if(m_CurGenomicStrand == false) {
1697             ss = m_ii_genomic->m_Length - ss - 1;
1698         }
1699 
1700         if(x_IsMatch(qq, ss)) {
1701 
1702             score += Wm;
1703             if(score > score_max) {
1704                 q0 = q;
1705                 s0 = s;
1706                 score_max = score;
1707                 if(no_overrun_yet) {
1708                     mm0 += mm;
1709                     mm = 0;
1710                 }
1711             }
1712         }
1713         else {
1714             score += Wms;
1715             ++mm;
1716         }
1717     }
1718 
1719     while(q0 + s0 >= right_limit0) {--q0; --s0; }
1720 
1721     bool extended_right (false);
1722     if(q0 > hitref->GetQueryStop()) {
1723         hitref->SetQueryStop(q0);
1724         hitref->SetSubjStop(s0);
1725         extended_right = true;
1726     }
1727 
1728     if(extended_left || extended_right) {
1729 
1730         hitref->SetMismatches(mm0);
1731         const THit::TCoord len (hitref->GetQueryStop() - hitref->GetQueryStart() + 1);
1732         hitref->SetLength(len);
1733         hitref->SetIdentity((len - mm0) / double(len));
1734         hitref->SetScore(2 * len);
1735     }
1736 
1737     return q0 + s0;
1738 }
1739 
1740 
x_CompartPair(vector<Uint8> * pvol,TSeqInfos::const_iterator ii_cdna,TSeqInfos::const_iterator ii_genomic,size_t idx_start,size_t idx_stop,size_t * pidx_compacted)1741 void CElementaryMatching::x_CompartPair(vector<Uint8>* pvol,
1742                                         TSeqInfos::const_iterator ii_cdna,
1743                                         TSeqInfos::const_iterator ii_genomic,
1744                                         size_t  idx_start,
1745                                         size_t  idx_stop,
1746                                         size_t* pidx_compacted)
1747 {
1748     if(idx_start >= idx_stop) {
1749         return;
1750     }
1751 
1752     // init "global" members
1753     m_ii_cdna    = ii_cdna;
1754     m_ii_genomic = ii_genomic;
1755 
1756     // re-sort for better compactization
1757     sort(pvol->begin() + idx_start, pvol->begin() + idx_stop, PDiag);
1758 
1759     const size_t idx_compacted_start (*pidx_compacted);
1760 
1761     vector<Uint4> lens;
1762     lens.reserve(pvol->size() / 2);
1763 
1764     Uint8 qs0 ((*pvol)[idx_start]);
1765     Uint4 q0  (qs0 & kUI8_LoWord);
1766     Uint4 s0  (qs0 >> 32);
1767     (*pvol)[(*pidx_compacted)++] = qs0;
1768     lens.push_back(16);
1769 
1770     for(size_t idx (idx_start + 1); idx < idx_stop; ++idx) {
1771 
1772         const Uint8 qs  ((*pvol)[idx]);
1773         const Uint4 q   (qs & kUI8_LoWord);
1774         const Uint4 s   (qs >> 32);
1775 
1776         if((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||
1777            (q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )
1778         {
1779             lens.back() += q - q0;
1780         }
1781         else {
1782             (*pvol)[(*pidx_compacted)++] = qs;
1783             lens.push_back(16);
1784         }
1785 
1786         q0 = q;
1787         s0 = s;
1788     }
1789 
1790     THitRefs hitrefs (lens.size());
1791 
1792     for(size_t idx (idx_compacted_start); idx < *pidx_compacted; ++idx) {
1793 
1794         const Uint8 qs  ((*pvol)[idx]);
1795         const Uint4 q   (qs & kUI8_LoWord);
1796         const Uint4 s   (qs >> 32);
1797 
1798         THitRef hitref (new THit);
1799         hitref->SetQueryStart(q);
1800         hitref->SetSubjStart(s);
1801         const Uint4 len (lens[idx - idx_compacted_start]);
1802         hitref->SetQueryStop(q + len - 1);
1803         hitref->SetSubjStop(s + len - 1);
1804         hitref->SetMismatches(0);
1805         hitref->SetGaps(0);
1806         hitref->SetEValue(0);
1807         hitref->SetIdentity(1.0);
1808         hitref->SetLength(len);
1809         hitref->SetScore(2*len);
1810         hitrefs[idx - idx_compacted_start] = hitref;
1811     }
1812 
1813 #define EXTEND_USING_SEQUENCE_CHARS
1814 #ifdef  EXTEND_USING_SEQUENCE_CHARS
1815 
1816     // try to extend even further over possible mismatches
1817 
1818     Int8 left_limit (0);         // diag extension left limit
1819     Int8 right_limit (kDiagMax); // diag ext right limit
1820     Int8 s_prime_cur (0);
1821 
1822     const int kn (hitrefs.size());
1823     for(int k(0); k < kn; ++k) {
1824 
1825         // diag coords: q_prime = q + s; s_prime = s - q;
1826         const THit::TCoord * box (hitrefs[k]->GetBox());
1827 
1828         if(Int8(box[2]) - Int8(box[0]) != s_prime_cur) {
1829             left_limit = box[0];
1830             s_prime_cur = Int8(box[2]) - Int8(box[0]);
1831         }
1832 
1833         right_limit = kDiagMax;
1834         if(k + 1 < kn) {
1835             const THit::TCoord * boX (hitrefs[k+1]->GetBox());
1836             if(Int8(boX[2]) - Int8(boX[0]) == s_prime_cur) {
1837                 right_limit = boX[0] + boX[2];
1838             }
1839         }
1840 
1841         left_limit = x_ExtendHit(left_limit, right_limit, hitrefs[k]);
1842     }
1843 
1844     // merge adjacent hits
1845     int d (-1);
1846     Int8 rlimit (0), spc (0);
1847     for(int k (0); k < kn; ++k) {
1848 
1849         const THit::TCoord cur_diag_stop (hitrefs[k]->GetQueryStop()
1850                                           + hitrefs[k]->GetSubjStop());
1851 
1852         const Int8 s_prime (-Int8(hitrefs[k]->GetQueryStart())
1853                             + Int8(hitrefs[k]->GetSubjStart()));
1854         if(k == 0 || spc != s_prime) {
1855 
1856             hitrefs[++d] = hitrefs[k];
1857             rlimit = cur_diag_stop;
1858             spc = s_prime;
1859         }
1860         else {
1861 
1862             const THit::TCoord cur_diag_start (hitrefs[k]->GetQueryStart()
1863                                                + hitrefs[k]->GetSubjStart());
1864 
1865             THitRef & hrd (hitrefs[d]);
1866 
1867             if(rlimit + 2 == cur_diag_start) {
1868 
1869                 rlimit = cur_diag_stop;
1870                 hrd->SetQueryStop(hitrefs[k]->GetQueryStop());
1871                 hrd->SetSubjStop(hitrefs[k]->GetSubjStop());
1872                 hrd->SetMismatches(hrd->GetMismatches()
1873                                    + hitrefs[k]->GetMismatches());
1874                 const THit::TCoord len (hrd->GetQueryStop()
1875                                         - hrd->GetQueryStart() + 1);
1876                 hrd->SetLength(len);
1877                 hrd->SetIdentity(double(len - hrd->GetMismatches()) / len);
1878                 hrd->SetScore(2 * len);
1879             }
1880             else if (rlimit + 1 < cur_diag_start) {
1881 
1882                 hitrefs[++d] = hitrefs[k];
1883                 rlimit = cur_diag_stop;
1884             }
1885             else {
1886                 NCBI_THROW(CException, eUnknown,
1887                            "x_CompartPair(): Unexpected alignment overlap");
1888             }
1889         }
1890     }
1891     hitrefs.resize(min(d + 1, kn));
1892 
1893 #undef EXTEND_USING_SEQUENCE_CHARS
1894 #endif
1895 
1896     // make sure nothing is stretching out
1897     const Uint4 qmax (m_ii_cdna->m_Start + m_ii_cdna->m_Length);
1898     NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1899         if((*ii)->GetQueryStop() >= qmax) {
1900             (*ii)->Modify(1, qmax - 1);
1901         }
1902     }
1903 
1904     // Remap hit coordinates.
1905     NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1906         THit& h (**ii);
1907 
1908         h.SetQueryId (m_cDNASeqIds[m_ii_cdna->m_Oid]);
1909         h.SetSubjId  (m_GenomicSeqIds[m_ii_genomic->m_Oid]);
1910 
1911         const Uint4 * box0 (h.GetBox());
1912         Uint4 box [4] = {
1913             box0[0] - m_ii_cdna->m_Start,
1914             box0[1] - m_ii_cdna->m_Start,
1915             box0[2] - m_ii_genomic->m_Start,
1916             box0[3] - m_ii_genomic->m_Start
1917         };
1918 
1919         if(m_CurGenomicStrand == false) {
1920             box[2] = m_ii_genomic->m_Length - box[2] - 1;
1921             box[3] = m_ii_genomic->m_Length - box[3] - 1;
1922         }
1923 
1924         h.SetBox(box);
1925     }
1926 
1927     if (m_HitsOnly) {
1928         ITERATE(THitRefs, ii, hitrefs) {
1929             const THit& h (**ii);
1930             if(h.GetLength() < m_MinHitLength) continue;
1931             cout << h << endl;
1932         }
1933     } else {
1934 
1935         // identify compartments
1936         const Uint4 qlen (m_ii_cdna->m_Length);
1937 
1938         const THit::TCoord min_matches1 = THit::TCoord(
1939                       round(m_MinCompartmentIdty * qlen));
1940 
1941         const THit::TCoord min_matches2 = THit::TCoord(
1942                       round(m_MinSingletonIdty * qlen));
1943 
1944         const THit::TCoord penalty = THit::TCoord(round(m_Penalty * qlen));
1945 
1946         CCompartmentAccessor<THit> ca(penalty,
1947                                       min_matches1,
1948                                       min_matches2,
1949                                       true);
1950         ca.SetMaxIntron(m_MaxIntron);
1951         ca.Run(hitrefs.begin(), hitrefs.end());
1952 
1953         if (m_OutputMethod) {
1954             // print individual compartments
1955             THitRefs comp;
1956             for(bool b0 (ca.GetFirst(comp)); b0; b0 = ca.GetNext(comp)) {
1957                 ITERATE(THitRefs, ii, comp) {
1958                     const THit& h (**ii);
1959                     cout << h << endl;
1960                 }
1961                 // empty line to separate compartments
1962                 cout << endl;
1963             }
1964         } else {
1965             TResults pair_results = ca.AsSeqAlignSet();
1966             if (! m_Results) { // Store first results, or append to existing?
1967                 m_Results = pair_results;
1968             } else {
1969                 m_Results->Set().insert(m_Results->Set().end(),
1970                                         pair_results->Set().begin(),
1971                                         pair_results->Set().end());
1972             }
1973         }
1974     }
1975 }
1976 
1977 
x_LoadRemapData(ISequenceSource * m_qsrc,const string & sdb)1978 void CElementaryMatching::x_LoadRemapData(ISequenceSource *m_qsrc, const string& sdb)
1979 {
1980     const string filename_genomic (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Remap);
1981     const size_t elems_genomic (CFile(filename_genomic).GetLength()/sizeof(SSeqInfo));
1982     m_SeqInfos_Genomic.resize(elems_genomic);
1983     CNcbiIfstream ifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);
1984     ifstr_genomic.read((char *) &m_SeqInfos_Genomic.front(),
1985                        elems_genomic * sizeof (SSeqInfo));
1986     ifstr_genomic.close();
1987 
1988     const string filename_cdna (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_q + kFileExt_Remap);
1989     const size_t elems_cdna (CFile(filename_cdna).GetLength()/sizeof(SSeqInfo));
1990     m_SeqInfos_cdna.resize(elems_cdna);
1991     CNcbiIfstream ifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);
1992     ifstr_cdna.read((char *) &m_SeqInfos_cdna.front(),
1993                        elems_cdna * sizeof (SSeqInfo));
1994     ifstr_cdna.close();
1995 
1996     {{
1997         CSeqDB blastdb (sdb, CSeqDB::eNucleotide);
1998         m_GenomicSeqIds.clear();
1999         for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
2000             THit::TId id (blastdb.GetSeqIDs(oid).front());
2001             m_GenomicSeqIds.push_back(id);
2002         }
2003     }}
2004 
2005     {{
2006         m_cDNASeqIds.clear();
2007         for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
2008 
2009             THit::TId id (m_qsrc->GetSeqID());
2010             m_cDNASeqIds.push_back(id);
2011         }
2012     }}
2013 }
2014 
2015 
2016 ////////////////////////////////////////////////////////////////////////////////////
2017 ////
2018 
2019 
GenerateSeed(const string & str)2020 CRandom::TValue GenerateSeed(const string & str)
2021 {
2022     CRandom::TValue rv (0);
2023     ITERATE(string, ii, str) {
2024         rv = (rv * 3 + (*ii)) % 3571;
2025     }
2026     return time(0) - 5000 + rv;
2027 }
2028 
2029 
x_InitBasic(void)2030 void CElementaryMatching::x_InitBasic(void)
2031 {
2032     CRandom rand (GenerateSeed("qq" + m_sdb));
2033     const string base_sfx ( NStr::NumericToString(rand.GetRand()));
2034     m_lbn_q = GetLocalBaseName("qq", base_sfx);
2035     m_lbn_s = GetLocalBaseName(m_sdb, base_sfx);
2036 
2037     m_OutputMethod = false;
2038 
2039     m_Penalty = 0.55;
2040     m_MinSingletonIdty = m_MinCompartmentIdty = .75;
2041     m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
2042     m_HitsOnly = false;
2043     m_MaxVolSize = 512 * 1024 * 1024;
2044 
2045     m_MinQueryLength = 50;
2046     m_MaxQueryLength = 500000;
2047     m_MinHitLength = 1;
2048 }
2049 
2050 
Run(void)2051 void CElementaryMatching::Run(void)
2052 {
2053     x_Cleanup();
2054 
2055     // create genomic ID and coordinate remapping tables.
2056     x_CreateRemapData(m_sdb, eIM_Genomic);
2057 
2058     // create the filtering (repeats + low complexity) table
2059     // using the plus strand of the genomic sequence;
2060     x_InitFilteringVector(m_sdb, true);
2061 
2062     // create cDNA ID and coordinate remapping tables;
2063     x_CreateRemapData(m_qsrc, eIM_cDNA);
2064 
2065     // create cDNA index using the current filtering table;
2066     x_CreateIndex(m_qsrc, eIM_cDNA, true);
2067 
2068     // init the N-mer participation vector;
2069     x_InitParticipationVector(true);
2070 
2071     // create plus-strand genomic index using the participation vector
2072     x_CreateIndex(m_sdb, eIM_Genomic, true);
2073 
2074     // generate N-hits for both strands of the genome;
2075     // compact and identify compartments;
2076     // restore original coordinates and IDs in final hits;
2077     // print compartments
2078     x_LoadRemapData(m_qsrc, m_sdb);
2079 
2080     x_Search(true);
2081 
2082     TStrings vol_exts;
2083     vol_exts.push_back(kFileExt_Offsets);
2084     vol_exts.push_back(kFileExt_Positions);
2085     x_CleanVolumes(m_lbn_q, vol_exts);
2086     x_CleanVolumes(m_lbn_s, vol_exts);
2087 
2088     // create the filtering table for the minus genomic strand
2089     // from the plus strand filtering table
2090     x_InitFilteringVector(m_sdb, false);
2091 
2092     // repeat the steps to create cDNA and genomic indices
2093     x_CreateIndex(m_qsrc, eIM_cDNA, false);
2094     x_InitParticipationVector(false);
2095     x_CreateIndex(m_sdb, eIM_Genomic, false);
2096 
2097     x_Search(false);
2098 }
2099 
2100 
x_Cleanup(void)2101 void CElementaryMatching::x_Cleanup(void)
2102 {
2103     m_Mers.Clear();
2104     TStrings vol_exts;
2105     vol_exts.push_back(kFileExt_Offsets);
2106     vol_exts.push_back(kFileExt_Positions);
2107     vol_exts.push_back(kFileExt_Masked);
2108     vol_exts.push_back(kFileExt_Remap);
2109     x_CleanVolumes(m_lbn_q, vol_exts);
2110     x_CleanVolumes(m_lbn_s, vol_exts);
2111     m_Results.Reset();
2112 }
2113 
2114 
2115 END_NCBI_SCOPE
2116