1 /*  $Id: seq_writer.cpp 617919 2020-10-08 11:57:15Z gouriano $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file seq_writer.cpp
31  *  Implementation of the CSeqFormatter class
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <objtools/blast/blastdb_format/seq_writer.hpp>
36 #include <objtools/blast/blastdb_format/blastdb_dataextract.hpp>
37 #include <objtools/blast/blastdb_format/invalid_data_exception.hpp>
38 #include <objects/seq/Seqdesc.hpp>
39 #include <objects/seq/Seq_descr.hpp>
40 #include <numeric>      // for std::accumulate
41 #include <objmgr/object_manager.hpp>
42 #include <objmgr/scope.hpp>
43 
44 BEGIN_NCBI_SCOPE
45 USING_SCOPE(objects);
46 
CSeqFormatter(const string & format_spec,CSeqDB & blastdb,CNcbiOstream & out,CSeqFormatterConfig config)47 CSeqFormatter::CSeqFormatter(const string& format_spec, CSeqDB& blastdb,
48                  CNcbiOstream& out,
49                  CSeqFormatterConfig config /* = CSeqFormatterConfig() */)
50     : m_Out(out), m_FmtSpec(format_spec), m_BlastDb(blastdb),
51       m_DataExtractor(blastdb,
52                       config.m_SeqRange,
53                       config.m_Strand,
54                       config.m_FiltAlgoId,
55                       config.m_FmtAlgoId,
56                       config.m_LineWidth,
57                       config.m_TargetOnly,
58                       config.m_UseCtrlA)
59 {
60     // Validate the algo id
61     if (config.m_FiltAlgoId >= 0 || config.m_FmtAlgoId >= 0) {
62         vector<int> algo_ids;
63         if (config.m_FiltAlgoId >= 0)
64             algo_ids.push_back(config.m_FiltAlgoId);
65         if (config.m_FmtAlgoId >= 0)
66             algo_ids.push_back(config.m_FmtAlgoId);
67         vector<int> invalid_algo_ids =
68             m_BlastDb.ValidateMaskAlgorithms(algo_ids);
69         if ( !invalid_algo_ids.empty()) {
70             NCBI_THROW(CInvalidDataException, eInvalidInput,
71                 "Invalid filtering algorithm ID.");
72         }
73     }
74 
75     // Record where the offsets where the replacements must occur
76     for (SIZE_TYPE i = 0; i < m_FmtSpec.size(); i++) {
77         if (m_FmtSpec[i] == '%' && m_FmtSpec[i+1] == '%') {
78             // remove the escape character for '%'
79             m_FmtSpec.erase(i++, 1);
80             continue;
81         }
82 
83         if (m_FmtSpec[i] == '%') {
84             m_ReplOffsets.push_back(i);
85             m_ReplTypes.push_back(m_FmtSpec[i+1]);
86         }
87     }
88 
89     if (m_ReplOffsets.empty() || m_ReplTypes.size() != m_ReplOffsets.size()) {
90         NCBI_THROW(CInvalidDataException, eInvalidInput,
91                    "Invalid format specification");
92     }
93 
94     m_Fasta = (m_ReplTypes[0] == 'f');
95 }
96 
x_RequireData() const97 bool CSeqFormatter::x_RequireData() const
98 {
99 	bool retval=false;
100 
101 	ITERATE(vector<char>, fmt, m_ReplTypes) {
102 	switch (*fmt) {
103 		case 's':
104 		case 'h':
105 		case 'm':
106 		case 'e':
107 		case 'd':
108 		case 'b':
109 			retval = true;
110 			break;
111 	}
112     }
113     return retval;
114 }
115 
x_Builder(vector<string> & data2write)116 void CSeqFormatter::x_Builder(vector<string>& data2write)
117 {
118     data2write.reserve(m_ReplTypes.size());
119 
120     ITERATE(vector<char>, fmt, m_ReplTypes) {
121         switch (*fmt) {
122 
123         case 's':
124             data2write.push_back(m_DataExtractor.ExtractSeqData());
125             break;
126 
127         case 'a':
128             data2write.push_back(m_DataExtractor.ExtractAccession());
129             break;
130 
131         case 'i':
132             data2write.push_back(m_DataExtractor.ExtractSeqId());
133             break;
134 
135         case 'g':
136             data2write.push_back(m_DataExtractor.ExtractGi());
137             break;
138 
139         case 'o':
140             data2write.push_back(m_DataExtractor.ExtractOid());
141             break;
142 
143         case 't':
144             data2write.push_back(m_DataExtractor.ExtractTitle());
145             break;
146 
147         case 'h':
148             data2write.push_back(m_DataExtractor.ExtractHash());
149             break;
150 
151         case 'l':
152             data2write.push_back(m_DataExtractor.ExtractSeqLen());
153             break;
154 
155         case 'T':
156             data2write.push_back(m_DataExtractor.ExtractTaxId());
157             break;
158 
159         case 'X':
160             data2write.push_back(m_DataExtractor.ExtractLeafTaxIds());
161             break;
162 
163         case 'P':
164             data2write.push_back(m_DataExtractor.ExtractPig());
165             break;
166 
167         case 'L':
168             data2write.push_back(m_DataExtractor.ExtractCommonTaxonomicName());
169             break;
170 
171         case 'C':
172             data2write.push_back(m_DataExtractor.ExtractLeafCommonTaxonomicNames());
173             break;
174 
175         case 'B':
176             data2write.push_back(m_DataExtractor.ExtractBlastName());
177             break;
178 
179         case 'K':
180             data2write.push_back(m_DataExtractor.ExtractSuperKingdom());
181             break;
182 
183         case 'S':
184             data2write.push_back(m_DataExtractor.ExtractScientificName());
185             break;
186 
187         case 'N':
188             data2write.push_back(m_DataExtractor.ExtractLeafScientificNames());
189             break;
190 
191         case 'm':
192             data2write.push_back(m_DataExtractor.ExtractMaskingData());
193             break;
194 
195         case 'e':
196             data2write.push_back(m_DataExtractor.ExtractMembershipInteger());
197             break;
198 
199         case 'n':
200             data2write.push_back(m_DataExtractor.ExtractLinksInteger());
201             break;
202 
203         case 'd':
204             data2write.push_back(m_DataExtractor.ExtractAsn1Defline());
205             break;
206 
207         case 'b':
208             data2write.push_back(m_DataExtractor.ExtractAsn1Bioseq());
209             break;
210 
211         default:
212             CNcbiOstrstream os;
213             os << "Unrecognized format specification: '%" << *fmt << "'";
214             NCBI_THROW(CInvalidDataException, eInvalidInput,
215                        CNcbiOstrstreamToString(os));
216         }
217     }
218 }
219 
Write(CBlastDBSeqId & id)220 void CSeqFormatter::Write(CBlastDBSeqId& id)
221 {
222     if (m_Fasta) {
223         m_Out << m_DataExtractor.ExtractFasta(id);
224         return;
225     }
226 
227     bool get_data = x_RequireData();
228     m_DataExtractor.SetSeqId(id, get_data);
229     vector<string> data2write;
230     x_Builder(data2write);
231     m_Out << x_Replacer(data2write) << endl;
232 }
233 
s_GetTitle(CConstRef<CBioseq> bioseq)234 static string s_GetTitle(CConstRef<CBioseq> bioseq)
235 {
236     ITERATE(CSeq_descr::Tdata, desc, bioseq->GetDescr().Get()) {
237         if ((*desc)->Which() == CSeqdesc::e_Title) {
238             return (*desc)->GetTitle();
239         }
240     }
241     return string();
242 }
243 
s_ReplaceCtrlAsInTitle(CRef<CBioseq> bioseq)244 static void s_ReplaceCtrlAsInTitle(CRef<CBioseq> bioseq)
245 {
246     static const string kTarget(" >gi|");
247     static const string kCtrlA = string(1, '\001') + string("gi|");
248     NON_CONST_ITERATE(CSeq_descr::Tdata, desc, bioseq->SetDescr().Set()) {
249         if ((*desc)->Which() == CSeqdesc::e_Title) {
250             NStr::ReplaceInPlace((*desc)->SetTitle(), kTarget, kCtrlA);
251             break;
252         }
253     }
254 }
255 
GetBareId(const CSeq_id & id)256 string GetBareId(const CSeq_id& id)
257 {
258     string retval;
259 
260     if (id.IsGi() || id.IsPrf() || id.IsPir()) {
261         retval = id.AsFastaString();
262     }
263     else {
264         retval = id.GetSeqIdString(true);
265     }
266 
267     return retval;
268 }
269 
DumpAll(CSeqDB & blastdb,CSeqFormatterConfig config)270 void CSeqFormatter::DumpAll(CSeqDB& blastdb, CSeqFormatterConfig config)
271 {
272     CFastaOstream fasta(m_Out);
273     fasta.SetWidth(config.m_LineWidth);
274     fasta.SetAllFlags(CFastaOstream::fKeepGTSigns|CFastaOstream::fNoExpensiveOps|CFastaOstream::fEnableGI);
275 
276     bool long_seqids = false;
277     CNcbiApplication* app = CNcbiApplication::Instance();
278     if (app) {
279         const CNcbiRegistry& registry = app->GetConfig();
280         long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
281     }
282 
283     CRef<CBioseq> bioseq;
284     for (int i=0; blastdb.CheckOrFindOID(i); i++) {
285          bioseq.Reset(blastdb.GetBioseq(i));
286          if (bioseq.Empty()) {
287              continue;
288          }
289          // TODO: remove gnl|BL_ORD_ID
290          CRef<CSeq_id> id(*(bioseq->GetId().begin()));
291          if (id->IsGeneral() &&
292              id->GetGeneral().GetDb() == "BL_ORD_ID") {
293              m_Out << ">" << s_GetTitle(bioseq) << '\n';
294              CScope scope(*CObjectManager::GetInstance());
295              fasta.WriteSequence(scope.AddBioseq(*bioseq));
296          }
297          else if (id->IsLocal()) {
298 	     string lcl_tmp = id->AsFastaString();
299 	     lcl_tmp = lcl_tmp.erase(0,4);
300 	     m_Out << ">" << lcl_tmp << " " << s_GetTitle(bioseq) << '\n';
301              CScope scope(*CObjectManager::GetInstance());
302              fasta.WriteSequence(scope.AddBioseq(*bioseq));
303          }
304          else if (long_seqids) {
305 
306              if (config.m_UseCtrlA) {
307                  s_ReplaceCtrlAsInTitle(bioseq);
308              }
309              fasta.Write(*bioseq, 0, true);
310          }
311          else {
312 
313             string separator = config.m_UseCtrlA ? "\001" : " >";
314 
315             m_Out << '>';
316             id = FindBestChoice(bioseq->GetId(), CSeq_id::Score);
317             m_Out << GetBareId(*id);
318 
319             string title = s_GetTitle(bioseq);
320 
321             if (!title.empty()) {
322                 m_Out << ' ';
323 
324                 NStr::ReplaceInPlace(title, " >", "\001");
325 
326                 vector<string> tokens;
327                 NStr::Split(title, "\001", tokens);
328                 auto it = tokens.begin();
329                 m_Out << *it;
330                 ++it;
331                 for (; it != tokens.end(); ++it) {
332                     size_t pos = it->find (" ");
333                     string str_id(*it, 0, pos != NPOS ? pos : it->length());
334                     list< CRef<CSeq_id> > seqids;
335                     CSeq_id::ParseFastaIds(seqids, str_id);
336 
337                     // no valid sequence ids indicates that '>' was within the
338                     // defline text
339                     if (seqids.empty()) {
340                         m_Out << " >" << *it;
341                         continue;
342                     }
343                     m_Out << separator;
344                     id = FindBestChoice(seqids, CSeq_id::Score);
345                     m_Out << GetBareId(*id);
346                     if (pos != NPOS) {
347                         m_Out << it->substr(pos, it->length() - pos);
348                     }
349                 }
350             }
351             m_Out << endl;
352 
353             CScope scope(*CObjectManager::GetInstance());
354             fasta.WriteSequence(scope.AddBioseq(*bioseq));
355          }
356     }
357 }
358 
359 /// Auxiliary functor to compute the length of a string
360 struct StrLenAdd
361 {
operator ()StrLenAdd362     SIZE_TYPE operator() (SIZE_TYPE a, const string& b) const {
363         return a + b.size();
364     }
365 };
366 
367 string
x_Replacer(const vector<string> & data2write) const368 CSeqFormatter::x_Replacer(const vector<string>& data2write) const
369 {
370     SIZE_TYPE data2write_size = accumulate(data2write.begin(), data2write.end(),
371                                            0, StrLenAdd());
372 
373     string retval;
374     retval.reserve(m_FmtSpec.size() + data2write_size -
375                    (m_ReplTypes.size() * 2));
376 
377     SIZE_TYPE fmt_idx = 0;
378     for (SIZE_TYPE i = 0, kSize = m_ReplOffsets.size(); i < kSize; i++) {
379         retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_ReplOffsets[i]]);
380         retval.append(data2write[i]);
381         fmt_idx = m_ReplOffsets[i] + 2;
382     }
383     if (fmt_idx <= m_FmtSpec.size()) {
384         retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_FmtSpec.size()]);
385     }
386 
387     return retval;
388 }
389 
SetConfig(TSeqRange range,objects::ENa_strand strand,int filt_algo_id)390 void CSeqFormatter::SetConfig(TSeqRange range, objects::ENa_strand strand,
391 							  int filt_algo_id)
392 {
393 	m_DataExtractor.SetConfig(range, strand, filt_algo_id);
394 }
395 
396 
397 END_NCBI_SCOPE
398