1 /* $Id: seq_writer.cpp 617919 2020-10-08 11:57:15Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /** @file seq_writer.cpp
31 * Implementation of the CSeqFormatter class
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <objtools/blast/blastdb_format/seq_writer.hpp>
36 #include <objtools/blast/blastdb_format/blastdb_dataextract.hpp>
37 #include <objtools/blast/blastdb_format/invalid_data_exception.hpp>
38 #include <objects/seq/Seqdesc.hpp>
39 #include <objects/seq/Seq_descr.hpp>
40 #include <numeric> // for std::accumulate
41 #include <objmgr/object_manager.hpp>
42 #include <objmgr/scope.hpp>
43
44 BEGIN_NCBI_SCOPE
45 USING_SCOPE(objects);
46
CSeqFormatter(const string & format_spec,CSeqDB & blastdb,CNcbiOstream & out,CSeqFormatterConfig config)47 CSeqFormatter::CSeqFormatter(const string& format_spec, CSeqDB& blastdb,
48 CNcbiOstream& out,
49 CSeqFormatterConfig config /* = CSeqFormatterConfig() */)
50 : m_Out(out), m_FmtSpec(format_spec), m_BlastDb(blastdb),
51 m_DataExtractor(blastdb,
52 config.m_SeqRange,
53 config.m_Strand,
54 config.m_FiltAlgoId,
55 config.m_FmtAlgoId,
56 config.m_LineWidth,
57 config.m_TargetOnly,
58 config.m_UseCtrlA)
59 {
60 // Validate the algo id
61 if (config.m_FiltAlgoId >= 0 || config.m_FmtAlgoId >= 0) {
62 vector<int> algo_ids;
63 if (config.m_FiltAlgoId >= 0)
64 algo_ids.push_back(config.m_FiltAlgoId);
65 if (config.m_FmtAlgoId >= 0)
66 algo_ids.push_back(config.m_FmtAlgoId);
67 vector<int> invalid_algo_ids =
68 m_BlastDb.ValidateMaskAlgorithms(algo_ids);
69 if ( !invalid_algo_ids.empty()) {
70 NCBI_THROW(CInvalidDataException, eInvalidInput,
71 "Invalid filtering algorithm ID.");
72 }
73 }
74
75 // Record where the offsets where the replacements must occur
76 for (SIZE_TYPE i = 0; i < m_FmtSpec.size(); i++) {
77 if (m_FmtSpec[i] == '%' && m_FmtSpec[i+1] == '%') {
78 // remove the escape character for '%'
79 m_FmtSpec.erase(i++, 1);
80 continue;
81 }
82
83 if (m_FmtSpec[i] == '%') {
84 m_ReplOffsets.push_back(i);
85 m_ReplTypes.push_back(m_FmtSpec[i+1]);
86 }
87 }
88
89 if (m_ReplOffsets.empty() || m_ReplTypes.size() != m_ReplOffsets.size()) {
90 NCBI_THROW(CInvalidDataException, eInvalidInput,
91 "Invalid format specification");
92 }
93
94 m_Fasta = (m_ReplTypes[0] == 'f');
95 }
96
x_RequireData() const97 bool CSeqFormatter::x_RequireData() const
98 {
99 bool retval=false;
100
101 ITERATE(vector<char>, fmt, m_ReplTypes) {
102 switch (*fmt) {
103 case 's':
104 case 'h':
105 case 'm':
106 case 'e':
107 case 'd':
108 case 'b':
109 retval = true;
110 break;
111 }
112 }
113 return retval;
114 }
115
x_Builder(vector<string> & data2write)116 void CSeqFormatter::x_Builder(vector<string>& data2write)
117 {
118 data2write.reserve(m_ReplTypes.size());
119
120 ITERATE(vector<char>, fmt, m_ReplTypes) {
121 switch (*fmt) {
122
123 case 's':
124 data2write.push_back(m_DataExtractor.ExtractSeqData());
125 break;
126
127 case 'a':
128 data2write.push_back(m_DataExtractor.ExtractAccession());
129 break;
130
131 case 'i':
132 data2write.push_back(m_DataExtractor.ExtractSeqId());
133 break;
134
135 case 'g':
136 data2write.push_back(m_DataExtractor.ExtractGi());
137 break;
138
139 case 'o':
140 data2write.push_back(m_DataExtractor.ExtractOid());
141 break;
142
143 case 't':
144 data2write.push_back(m_DataExtractor.ExtractTitle());
145 break;
146
147 case 'h':
148 data2write.push_back(m_DataExtractor.ExtractHash());
149 break;
150
151 case 'l':
152 data2write.push_back(m_DataExtractor.ExtractSeqLen());
153 break;
154
155 case 'T':
156 data2write.push_back(m_DataExtractor.ExtractTaxId());
157 break;
158
159 case 'X':
160 data2write.push_back(m_DataExtractor.ExtractLeafTaxIds());
161 break;
162
163 case 'P':
164 data2write.push_back(m_DataExtractor.ExtractPig());
165 break;
166
167 case 'L':
168 data2write.push_back(m_DataExtractor.ExtractCommonTaxonomicName());
169 break;
170
171 case 'C':
172 data2write.push_back(m_DataExtractor.ExtractLeafCommonTaxonomicNames());
173 break;
174
175 case 'B':
176 data2write.push_back(m_DataExtractor.ExtractBlastName());
177 break;
178
179 case 'K':
180 data2write.push_back(m_DataExtractor.ExtractSuperKingdom());
181 break;
182
183 case 'S':
184 data2write.push_back(m_DataExtractor.ExtractScientificName());
185 break;
186
187 case 'N':
188 data2write.push_back(m_DataExtractor.ExtractLeafScientificNames());
189 break;
190
191 case 'm':
192 data2write.push_back(m_DataExtractor.ExtractMaskingData());
193 break;
194
195 case 'e':
196 data2write.push_back(m_DataExtractor.ExtractMembershipInteger());
197 break;
198
199 case 'n':
200 data2write.push_back(m_DataExtractor.ExtractLinksInteger());
201 break;
202
203 case 'd':
204 data2write.push_back(m_DataExtractor.ExtractAsn1Defline());
205 break;
206
207 case 'b':
208 data2write.push_back(m_DataExtractor.ExtractAsn1Bioseq());
209 break;
210
211 default:
212 CNcbiOstrstream os;
213 os << "Unrecognized format specification: '%" << *fmt << "'";
214 NCBI_THROW(CInvalidDataException, eInvalidInput,
215 CNcbiOstrstreamToString(os));
216 }
217 }
218 }
219
Write(CBlastDBSeqId & id)220 void CSeqFormatter::Write(CBlastDBSeqId& id)
221 {
222 if (m_Fasta) {
223 m_Out << m_DataExtractor.ExtractFasta(id);
224 return;
225 }
226
227 bool get_data = x_RequireData();
228 m_DataExtractor.SetSeqId(id, get_data);
229 vector<string> data2write;
230 x_Builder(data2write);
231 m_Out << x_Replacer(data2write) << endl;
232 }
233
s_GetTitle(CConstRef<CBioseq> bioseq)234 static string s_GetTitle(CConstRef<CBioseq> bioseq)
235 {
236 ITERATE(CSeq_descr::Tdata, desc, bioseq->GetDescr().Get()) {
237 if ((*desc)->Which() == CSeqdesc::e_Title) {
238 return (*desc)->GetTitle();
239 }
240 }
241 return string();
242 }
243
s_ReplaceCtrlAsInTitle(CRef<CBioseq> bioseq)244 static void s_ReplaceCtrlAsInTitle(CRef<CBioseq> bioseq)
245 {
246 static const string kTarget(" >gi|");
247 static const string kCtrlA = string(1, '\001') + string("gi|");
248 NON_CONST_ITERATE(CSeq_descr::Tdata, desc, bioseq->SetDescr().Set()) {
249 if ((*desc)->Which() == CSeqdesc::e_Title) {
250 NStr::ReplaceInPlace((*desc)->SetTitle(), kTarget, kCtrlA);
251 break;
252 }
253 }
254 }
255
GetBareId(const CSeq_id & id)256 string GetBareId(const CSeq_id& id)
257 {
258 string retval;
259
260 if (id.IsGi() || id.IsPrf() || id.IsPir()) {
261 retval = id.AsFastaString();
262 }
263 else {
264 retval = id.GetSeqIdString(true);
265 }
266
267 return retval;
268 }
269
DumpAll(CSeqDB & blastdb,CSeqFormatterConfig config)270 void CSeqFormatter::DumpAll(CSeqDB& blastdb, CSeqFormatterConfig config)
271 {
272 CFastaOstream fasta(m_Out);
273 fasta.SetWidth(config.m_LineWidth);
274 fasta.SetAllFlags(CFastaOstream::fKeepGTSigns|CFastaOstream::fNoExpensiveOps|CFastaOstream::fEnableGI);
275
276 bool long_seqids = false;
277 CNcbiApplication* app = CNcbiApplication::Instance();
278 if (app) {
279 const CNcbiRegistry& registry = app->GetConfig();
280 long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
281 }
282
283 CRef<CBioseq> bioseq;
284 for (int i=0; blastdb.CheckOrFindOID(i); i++) {
285 bioseq.Reset(blastdb.GetBioseq(i));
286 if (bioseq.Empty()) {
287 continue;
288 }
289 // TODO: remove gnl|BL_ORD_ID
290 CRef<CSeq_id> id(*(bioseq->GetId().begin()));
291 if (id->IsGeneral() &&
292 id->GetGeneral().GetDb() == "BL_ORD_ID") {
293 m_Out << ">" << s_GetTitle(bioseq) << '\n';
294 CScope scope(*CObjectManager::GetInstance());
295 fasta.WriteSequence(scope.AddBioseq(*bioseq));
296 }
297 else if (id->IsLocal()) {
298 string lcl_tmp = id->AsFastaString();
299 lcl_tmp = lcl_tmp.erase(0,4);
300 m_Out << ">" << lcl_tmp << " " << s_GetTitle(bioseq) << '\n';
301 CScope scope(*CObjectManager::GetInstance());
302 fasta.WriteSequence(scope.AddBioseq(*bioseq));
303 }
304 else if (long_seqids) {
305
306 if (config.m_UseCtrlA) {
307 s_ReplaceCtrlAsInTitle(bioseq);
308 }
309 fasta.Write(*bioseq, 0, true);
310 }
311 else {
312
313 string separator = config.m_UseCtrlA ? "\001" : " >";
314
315 m_Out << '>';
316 id = FindBestChoice(bioseq->GetId(), CSeq_id::Score);
317 m_Out << GetBareId(*id);
318
319 string title = s_GetTitle(bioseq);
320
321 if (!title.empty()) {
322 m_Out << ' ';
323
324 NStr::ReplaceInPlace(title, " >", "\001");
325
326 vector<string> tokens;
327 NStr::Split(title, "\001", tokens);
328 auto it = tokens.begin();
329 m_Out << *it;
330 ++it;
331 for (; it != tokens.end(); ++it) {
332 size_t pos = it->find (" ");
333 string str_id(*it, 0, pos != NPOS ? pos : it->length());
334 list< CRef<CSeq_id> > seqids;
335 CSeq_id::ParseFastaIds(seqids, str_id);
336
337 // no valid sequence ids indicates that '>' was within the
338 // defline text
339 if (seqids.empty()) {
340 m_Out << " >" << *it;
341 continue;
342 }
343 m_Out << separator;
344 id = FindBestChoice(seqids, CSeq_id::Score);
345 m_Out << GetBareId(*id);
346 if (pos != NPOS) {
347 m_Out << it->substr(pos, it->length() - pos);
348 }
349 }
350 }
351 m_Out << endl;
352
353 CScope scope(*CObjectManager::GetInstance());
354 fasta.WriteSequence(scope.AddBioseq(*bioseq));
355 }
356 }
357 }
358
359 /// Auxiliary functor to compute the length of a string
360 struct StrLenAdd
361 {
operator ()StrLenAdd362 SIZE_TYPE operator() (SIZE_TYPE a, const string& b) const {
363 return a + b.size();
364 }
365 };
366
367 string
x_Replacer(const vector<string> & data2write) const368 CSeqFormatter::x_Replacer(const vector<string>& data2write) const
369 {
370 SIZE_TYPE data2write_size = accumulate(data2write.begin(), data2write.end(),
371 0, StrLenAdd());
372
373 string retval;
374 retval.reserve(m_FmtSpec.size() + data2write_size -
375 (m_ReplTypes.size() * 2));
376
377 SIZE_TYPE fmt_idx = 0;
378 for (SIZE_TYPE i = 0, kSize = m_ReplOffsets.size(); i < kSize; i++) {
379 retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_ReplOffsets[i]]);
380 retval.append(data2write[i]);
381 fmt_idx = m_ReplOffsets[i] + 2;
382 }
383 if (fmt_idx <= m_FmtSpec.size()) {
384 retval.append(&m_FmtSpec[fmt_idx], &m_FmtSpec[m_FmtSpec.size()]);
385 }
386
387 return retval;
388 }
389
SetConfig(TSeqRange range,objects::ENa_strand strand,int filt_algo_id)390 void CSeqFormatter::SetConfig(TSeqRange range, objects::ENa_strand strand,
391 int filt_algo_id)
392 {
393 m_DataExtractor.SetConfig(range, strand, filt_algo_id);
394 }
395
396
397 END_NCBI_SCOPE
398