1 /* $Id: dbsource_item.cpp 607132 2020-04-30 12:54:53Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat, NCBI
27 *
28 * File Description:
29 *
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 #include <corelib/ncbiutil.hpp>
34
35 #include <objects/general/Dbtag.hpp>
36 #include <objects/general/Date.hpp>
37 #include <objects/general/Object_id.hpp>
38 #include <objects/seqblock/PIR_block.hpp>
39 #include <objects/seqblock/PRF_block.hpp>
40 #include <objects/seqblock/PRF_ExtraSrc.hpp>
41 #include <objects/seqblock/PDB_block.hpp>
42 #include <objects/seqblock/PDB_replace.hpp>
43 #include <objects/seqblock/SP_block.hpp>
44 #include <objects/seqloc/PDB_seq_id.hpp>
45 #include <objects/seqloc/Textseq_id.hpp>
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seq/seq_id_handle.hpp>
48 #include <objects/seqset/Bioseq_set.hpp>
49 #include <objmgr/seq_entry_handle.hpp>
50 #include <objmgr/bioseq_handle.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/feat_ci.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
54 #include <objmgr/bioseq_ci.hpp>
55 #include <objmgr/util/seq_loc_util.hpp>
56 #include <objmgr/util/sequence.hpp>
57
58 #include <objtools/format/formatter.hpp>
59 #include <objtools/format/text_ostream.hpp>
60 #include <objtools/format/items/dbsource_item.hpp>
61 #include <objtools/format/context.hpp>
62 #include <objmgr/util/objutil.hpp>
63
64
65 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)66 BEGIN_SCOPE(objects)
67
68
69 CDBSourceItem::CDBSourceItem(CBioseqContext& ctx) :
70 CFlatItem(&ctx)
71 {
72 x_GatherInfo(ctx);
73 }
74
GetItemType(void) const75 IFlatItem::EItem CDBSourceItem::GetItemType(void) const
76 {
77 return eItem_DbSource;
78 }
79
Format(IFormatter & formatter,IFlatTextOStream & text_os) const80 void CDBSourceItem::Format
81 (IFormatter& formatter,
82 IFlatTextOStream& text_os) const
83
84 {
85 formatter.FormatDBSource(*this, text_os);
86 }
87
88
s_ScoreForDBSource(const CSeq_id_Handle & idh)89 static int s_ScoreForDBSource(const CSeq_id_Handle& idh)
90 {
91 CConstRef<CSeq_id> id = idh.GetSeqId();
92 switch (id->Which()) {
93 case CSeq_id::e_not_set: return kMax_Int;
94 case CSeq_id::e_Gi: return 31;
95 case CSeq_id::e_Giim: return 30;
96 case CSeq_id::e_Local: case CSeq_id::e_General: return 20;
97 case CSeq_id::e_Other: return 18;
98 case CSeq_id::e_Gibbmt: return 16;
99 case CSeq_id::e_Gibbsq: case CSeq_id::e_Patent: return 15;
100 case CSeq_id::e_Pdb: return 12;
101 default: return 10;
102 }
103 }
104
105
// Ask the scope for all ids associated with 'idh' and return the one that
// FindBestChoice selects using the DBSOURCE scoring function.
static const CSeq_id_Handle s_FindBestChoiceForDbsource(const CSeq_id_Handle& idh, CScope& scope)
{
    const vector<CSeq_id_Handle> candidates = scope.GetIds(idh);
    return FindBestChoice(candidates, s_ScoreForDBSource);
}
110
111
s_AddToUniqueIdList(const CSeq_id_Handle & idh,vector<CSeq_id_Handle> & unique_ids)112 static void s_AddToUniqueIdList(const CSeq_id_Handle& idh, vector<CSeq_id_Handle>& unique_ids)
113 {
114 ITERATE (vector<CSeq_id_Handle>, it, unique_ids) {
115 if (idh == *it) {
116 return;
117 }
118 }
119 unique_ids.push_back(idh);
120 }
121
122
s_HasLocalBioseq(const CSeq_loc & loc,const CSeq_entry_Handle & tse)123 static bool s_HasLocalBioseq(const CSeq_loc& loc, const CSeq_entry_Handle& tse)
124 {
125 CScope& scope = tse.GetScope();
126 for (CSeq_loc_CI li(loc); li; ++li) {
127 CBioseq_Handle local =
128 scope.GetBioseqHandleFromTSE(li.GetSeq_id(), tse);
129 if (local) {
130 return true;
131 }
132 }
133 return false;
134 }
135
136
x_GatherInfo(CBioseqContext & ctx)137 void CDBSourceItem::x_GatherInfo(CBioseqContext& ctx)
138 {
139 const bool bHtml = ctx.Config().DoHTML();
140
141 const CBioseq_Handle& seq = ctx.GetHandle();
142 const CBioseq_Handle::TId& ids = seq.GetId();
143 CSeq_id_Handle idh = FindBestChoice(ids, s_ScoreForDBSource);
144
145 if (!idh) {
146 m_DBSource.push_back("UNKNOWN");
147 return;
148 }
149
150 switch (idh.Which()) {
151 case CSeq_id::e_Pir:
152 m_DBSource.push_back(x_FormatDBSourceID(idh));
153 x_AddPIRBlock(ctx);
154 break;
155
156 case CSeq_id::e_Swissprot:
157 m_DBSource.push_back(x_FormatDBSourceID(idh));
158 x_AddSPBlock(ctx);
159 break;
160
161 case CSeq_id::e_Prf:
162 m_DBSource.push_back(x_FormatDBSourceID(idh));
163 x_AddPRFBlock(ctx);
164 break;
165
166 case CSeq_id::e_Pdb:
167 m_DBSource.push_back(x_FormatDBSourceID(idh));
168 x_AddPDBBlock(ctx);
169 break;
170
171 case CSeq_id::e_General:
172 if (!NStr::StartsWith(idh.GetSeqId()->GetGeneral().GetDb(), "PID")) {
173 m_DBSource.push_back("UNKNOWN");
174 break;
175 }
176 // otherwise, fall through
177 case CSeq_id::e_Gibbsq: case CSeq_id::e_Gibbmt: case CSeq_id::e_Giim:
178 case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Other:
179 case CSeq_id::e_Gi: case CSeq_id::e_Ddbj:
180 case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd:
181 {
182 CScope& scope = ctx.GetScope();
183 vector<CSeq_id_Handle> unique_ids;
184
185 // find generating feature
186 const CSeq_feat* feat = sequence::GetCDSForProduct(seq);
187 if (feat == NULL) {
188 // may also be protein product of mature peptide feature
189 feat = sequence::GetPROTForProduct(seq);
190 }
191
192 if (feat != NULL) {
193 const CSeq_loc& loc = feat->GetLocation();
194 CSeq_entry_Handle topLevelEntry = seq.GetTopLevelEntry();
195 if (s_HasLocalBioseq(loc, topLevelEntry)) {
196 for (CSeq_loc_CI li(loc); li; ++li) {
197 s_AddToUniqueIdList(li.GetSeq_id_Handle(), unique_ids);
198 }
199 } /* else {
200 const CSeq_id *cds_seq_id = loc.GetId();
201 if( NULL != cds_seq_id && cds_seq_id->IsGi() ) {
202 CSeq_id_Base::TGi cds_gi = cds_seq_id->GetGi();
203 s_AddToUniqueIdList( CSeq_id_Handle::GetHandle(cds_gi), unique_ids);
204 }
205 } */
206 }
207
208 string str;
209 ITERATE (vector<CSeq_id_Handle>, it, unique_ids) {
210 CSeq_id_Handle idh2 = s_FindBestChoiceForDbsource(*it, scope);
211 if (idh2) {
212 str.erase();
213 str = x_FormatDBSourceID(idh2);
214 if (!NStr::IsBlank(str)) {
215 m_DBSource.push_back(str);
216 }
217 } else {
218 m_DBSource.push_back( x_FormatDBSourceID( *it ) );
219 }
220 }
221
222 if( m_DBSource.empty() && feat != NULL ) {
223 const CSeq_loc& loc = feat->GetLocation();
224 const CSeq_id *cds_seq_id = loc.GetId();
225 if( NULL != cds_seq_id && cds_seq_id->IsGi() ) {
226 CSeq_id_Base::TGi cds_gi = cds_seq_id->GetGi();
227 // s_AddToUniqueIdList( CSeq_id_Handle::GetHandle(cds_gi), unique_ids);
228 m_DBSource.push_back( x_FormatDBSourceID( CSeq_id_Handle::GetHandle(cds_gi) ) );
229 }
230 }
231
232 if (m_DBSource.empty()) {
233 m_DBSource.push_back(x_FormatDBSourceID(idh));
234 }
235 break;
236 }
237 default:
238 m_DBSource.push_back("UNKNOWN");
239 }
240
241 // turn double-quotes to single-quotes in all m_DBSources,
242 // except inside HTML tags
243 NON_CONST_ITERATE( list<string>, it, m_DBSource ) {
244 if( bHtml ) {
245 ConvertQuotesNotInHTMLTags( *it );
246 } else {
247 replace( it->begin(), it->end(), '\"', '\'' );
248 }
249 }
250 }
251
x_AddPIRBlock(CBioseqContext & ctx)252 void CDBSourceItem::x_AddPIRBlock(CBioseqContext& ctx)
253 {
254 // In this function, the newlines seem weird because the C toolkit
255 // outputs this way. Hopefully in the future we can do something
256 // more consistent.
257
258
259 CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Pir);
260 if ( !dsc ) {
261 return;
262 }
263
264 x_SetObject(*dsc);
265
266 bool containsHostLine = false; // try to match C's whitespace
267
268 const CPIR_block& pir = dsc->GetPir();
269 if (pir.CanGetHost()) {
270 m_DBSource.push_back("host:" + pir.GetHost() + "\n");
271 containsHostLine = true;
272 }
273 if (pir.CanGetSource()) {
274 m_DBSource.push_back("source: " + pir.GetSource() + "\n");
275 }
276 if (pir.CanGetSummary()) {
277 m_DBSource.push_back("summary: " + pir.GetSummary() + "\n");
278 }
279 if (pir.CanGetGenetic()) {
280 m_DBSource.push_back("genetic: " + pir.GetGenetic() + "\n");
281 }
282 if (pir.CanGetIncludes()) {
283 m_DBSource.push_back("includes: " + pir.GetIncludes() + "\n");
284 }
285 if (pir.CanGetPlacement()) {
286 m_DBSource.push_back("placement: " + pir.GetPlacement() + "\n");
287 }
288 if (pir.CanGetSuperfamily()) {
289 m_DBSource.push_back("superfamily: " + pir.GetSuperfamily() + "\n");
290 }
291 if (pir.CanGetCross_reference()) {
292 m_DBSource.push_back("xref: " + pir.GetCross_reference() + "\n");
293 }
294 if (pir.CanGetDate()) {
295 m_DBSource.push_back("PIR dates: " + pir.GetDate() + "\n");
296 }
297 if (pir.CanGetHad_punct() && pir.GetHad_punct() ) {
298 m_DBSource.push_back("punctuation in sequence");
299 }
300 if (pir.CanGetSeqref()) {
301 list<string> xrefs;
302 ITERATE (CPIR_block::TSeqref, it, pir.GetSeqref()) {
303 const char* type = 0;
304 switch ((*it)->Which()) {
305 case CSeq_id::e_Genbank: type = "genbank "; break;
306 case CSeq_id::e_Embl: type = "embl "; break;
307 case CSeq_id::e_Pir: type = "pir "; break;
308 case CSeq_id::e_Swissprot: type = "swissprot "; break;
309 case CSeq_id::e_Gi: type = "gi: "; break;
310 case CSeq_id::e_Ddbj: type = "ddbj "; break;
311 case CSeq_id::e_Prf: type = "prf "; break;
312 default: break;
313 }
314 if (type) {
315 xrefs.push_back(type + (*it)->GetSeqIdString(true));
316 }
317 }
318 if ( !xrefs.empty() ) {
319 m_DBSource.push_back("xrefs: " + NStr::Join(xrefs, ", "));
320 }
321 }
322
323 NON_CONST_ITERATE (list<string>, it, m_DBSource) {
324 if( &*it == &m_DBSource.front() ) {
325 // first one has newline AFTER the semicolon
326 *it += ";\n";
327 // match C toolkit
328 /* if( (it + 1) != m_DBSource.end() && ! NStr::StartsWith(*(it + 1), "host") ) {
329 *it += ";\n";
330 } */
331 } else if( &*it == &m_DBSource.back() ) {
332 // last one ends in period
333 *it += ".";
334 } else {
335 // The C version puts newlines before some of these for some reason
336 *it += ";\n";
337 }
338 // *it += (&*it == &m_DBSource.back() ? "." : "\n;");
339 }
340
341 // match C's whitespace
342 if( ! containsHostLine ) {
343 m_DBSource.front() += "\n";
344 }
345 }
346
s_FormatDate(const CDate & date,string & str)347 static void s_FormatDate(const CDate& date, string& str)
348 {
349 CTime time = date.AsCTime();
350 str += time.AsString(CTimeFormat("b d, Y"));
351 }
352
353
x_AddSPBlock(CBioseqContext & ctx)354 void CDBSourceItem::x_AddSPBlock(CBioseqContext& ctx)
355 {
356 CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Sp);
357 if ( !dsc ) {
358 return;
359 }
360 x_SetObject(*dsc);
361
362 const CSP_block& sp = dsc->GetSp();
363 switch (sp.GetClass()) {
364 case CSP_block::eClass_standard:
365 m_DBSource.push_back("class: standard.");
366 break;
367 case CSP_block::eClass_prelim:
368 m_DBSource.push_back("class: preliminary.");
369 break;
370 default:
371 break;
372 }
373 // laid out slightly differently from the C version, but I think that's
374 // a bug in the latter (which runs some things together)
375 if (sp.CanGetExtra_acc() && !sp.GetExtra_acc().empty() ) {
376 m_DBSource.push_back("extra accessions:"
377 + NStr::Join(sp.GetExtra_acc(), ","));
378 }
379 if (sp.GetImeth()) {
380 m_DBSource.push_back("seq starts with Met");
381 }
382 if (sp.CanGetPlasnm() && !sp.GetPlasnm().empty() ) {
383 m_DBSource.push_back("plasmid:" + NStr::Join(sp.GetPlasnm(), ","));
384 }
385 if (sp.CanGetCreated()) {
386 string s("created: ");
387 //sp.GetCreated().GetDate(&s, "%3N %D %Y");
388 s_FormatDate(sp.GetCreated(), s);
389 m_DBSource.push_back(s + '.');
390 }
391 if (sp.CanGetSequpd()) {
392 string s("sequence updated: ");
393 //sp.GetSequpd().GetDate(&s, "%3N %D %Y");
394 s_FormatDate(sp.GetSequpd(), s);
395 m_DBSource.push_back(s + '.');
396 }
397 if (sp.CanGetAnnotupd()) {
398 string s("annotation updated: ");
399 //sp.GetAnnotupd().GetDate(&s, "%3N %D %Y");
400 s_FormatDate(sp.GetAnnotupd(), s);
401 m_DBSource.push_back(s + '.');
402 }
403 if (sp.CanGetSeqref() && !sp.GetSeqref().empty() ) {
404 list<string> xrefs;
405 ITERATE (CSP_block::TSeqref, it, sp.GetSeqref()) {
406 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(**it);
407 CSeq_id_Handle best = sequence::GetId(idh, ctx.GetScope(),
408 sequence::eGetId_Best);
409 if ( !best ) {
410 best = idh;
411 }
412 if (best) {
413 string acc = best.GetSeqId()->GetSeqIdString(true);
414 xrefs.push_back(acc);
415 }
416 /**
417 const char* s = 0;
418 switch ((*it)->Which()) {
419 case CSeq_id::e_Genbank: s = "genbank accession "; break;
420 case CSeq_id::e_Embl: s = "embl accession "; break;
421 case CSeq_id::e_Pir: s = "pir locus "; break;
422 case CSeq_id::e_Swissprot: s = "swissprot accession "; break;
423 case CSeq_id::e_Gi: s = "gi: "; break;
424 case CSeq_id::e_Ddbj: s = "ddbj accession "; break;
425 case CSeq_id::e_Prf: s = "prf accession "; break;
426 case CSeq_id::e_Pdb: s = "pdb accession "; break;
427 case CSeq_id::e_Tpg: s = "genbank third party accession "; break;
428 case CSeq_id::e_Tpe: s = "embl third party accession "; break;
429 case CSeq_id::e_Tpd: s = "ddbj third party accession "; break;
430 default: break;
431 }
432 if ( s ) {
433 string acc = (*it)->GetSeqIdString(true);
434 xrefs.push_back(s + acc);
435 }
436 **/
437 }
438 if ( !xrefs.empty() ) {
439 m_DBSource.push_back("xrefs: " + NStr::Join(xrefs, ", "));
440 }
441 }
442 if (sp.CanGetDbref() && !sp.GetDbref().empty() ) {
443 list<string> xrefs;
444 ITERATE (CSP_block::TDbref, it, sp.GetDbref()) {
445 const CObject_id& tag = (*it)->GetTag();
446 string id = (tag.IsStr() ? tag.GetStr()
447 : NStr::IntToString(tag.GetId()));
448 string db = (*it)->GetDb();
449 if ( db == "MIM") {
450 if (ctx.Config().DoHTML()) {
451 xrefs.push_back
452 ("MIM <a href=\""
453 "https://omim.org/entry/" + id
454 + "\">" + id + "</a>");
455 } else {
456 xrefs.push_back("MIM:" + id);
457 }
458 } else {
459 // For exmaple, HGNC has HGNC as part of its identifier, so we may need to eliminate
460 // such redundancies (example accession: Q02094.1)
461 if( id.substr(0, db.length() + 1) == (db + ":") ) {
462 xrefs.push_back(id); // in this case, id already has db at beginning
463 } else {
464 xrefs.push_back(db + ':' + id); // no space(!)
465 }
466 }
467 }
468 m_DBSource.push_back
469 ("xrefs (non-sequence databases): " + NStr::Join(xrefs, ", "));
470 }
471 }
472
473
x_AddPRFBlock(CBioseqContext & ctx)474 void CDBSourceItem::x_AddPRFBlock(CBioseqContext& ctx)
475 {
476 CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Prf);
477 if ( !dsc ) {
478 return;
479 }
480
481 x_SetObject(*dsc);
482
483 const CPRF_block& prf = dsc->GetPrf();
484 if (prf.CanGetExtra_src()) {
485 const CPRF_ExtraSrc& es = prf.GetExtra_src();
486 if (es.CanGetHost()) {
487 m_DBSource.push_back("host:" + es.GetHost());
488 }
489 if (es.CanGetPart()) {
490 m_DBSource.push_back("part: " + es.GetPart());
491 }
492 if (es.CanGetState()) {
493 m_DBSource.push_back("state: " + es.GetState());
494 }
495 if (es.CanGetStrain()) {
496 m_DBSource.push_back("strain: " + es.GetStrain());
497 }
498 if (es.CanGetTaxon()) {
499 m_DBSource.push_back("taxonomy: " + es.GetTaxon());
500 }
501 }
502 NON_CONST_ITERATE (list<string>, it, m_DBSource) {
503 *it += (&*it == &m_DBSource.back() ? '.' : ';');
504 }
505 }
506
507
x_AddPDBBlock(CBioseqContext & ctx)508 void CDBSourceItem::x_AddPDBBlock(CBioseqContext& ctx)
509 {
510 CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Pdb);
511 if ( !dsc ) {
512 return;
513 }
514
515 x_SetObject(*dsc);
516
517 const CPDB_block& pdb = dsc->GetPdb();
518 {{
519 string s("deposition: ");
520 s_FormatDate(pdb.GetDeposition(), s);
521 m_DBSource.push_back(s);
522 }}
523 m_DBSource.push_back("class: " + pdb.GetClass());
524 if (!pdb.GetSource().empty() ) {
525 m_DBSource.push_back("source: " + x_FormatPDBSource(pdb));
526 }
527 if (pdb.CanGetExp_method()) {
528 m_DBSource.push_back("Exp. method: " + pdb.GetExp_method());
529 }
530 if (pdb.CanGetReplace()) {
531 const CPDB_replace& rep = pdb.GetReplace();
532 if ( !rep.GetIds().empty() ) {
533 m_DBSource.push_back
534 ("ids replaced: " + x_FormatPDBSource(pdb));
535 }
536 string s("replacement date: ");
537 DateToString(rep.GetDate(), s);
538 m_DBSource.push_back(s);
539 }
540 NON_CONST_ITERATE (list<string>, it, m_DBSource) {
541 *it += (&*it == &m_DBSource.back() ? '.' : ';');
542 }
543 }
544
545
x_FormatDBSourceID(const CSeq_id_Handle & idh)546 string CDBSourceItem::x_FormatDBSourceID(const CSeq_id_Handle& idh)
547 {
548 #ifndef NEW_HTML_FMT
549 const bool is_html = ( GetContext()->Config().DoHTML() );
550 #endif
551
552 CConstRef<CSeq_id> id;
553 if (idh) {
554 id = idh.GetSeqId();
555 }
556 if (!id) {
557 return kEmptyStr;
558 }
559
560 CSeq_id::E_Choice choice = id->Which();
561
562 switch (choice) {
563 case CSeq_id::e_Local:
564 {{
565 const CObject_id& oi = id->GetLocal();
566 return (oi.IsStr() ? oi.GetStr() : NStr::IntToString(oi.GetId()));
567 }}
568 case CSeq_id::e_Gi:
569 {{
570 return "gi: " + NStr::NumericToString(id->GetGi());
571 }}
572 case CSeq_id::e_Pdb:
573 {{
574 const CPDB_seq_id& pdb = id->GetPdb();
575 string s("pdb: "), sep;
576 if ( !pdb.GetMol().Get().empty() ) {
577 s += "molecule " + pdb.GetMol().Get();
578 sep = ", ";
579 }
580 if (pdb.IsSetChain() && pdb.GetChain() > 0) {
581 s += sep + "chain " + NStr::IntToString(pdb.GetChain());
582 sep = ", ";
583 }
584 if (pdb.IsSetChain_id()) {
585 s += sep + "chain " + pdb.GetChain_id();
586 sep = ", ";
587 }
588 if (pdb.CanGetRel()) {
589 s += sep + "release ";
590 s_FormatDate(pdb.GetRel(), s);
591 sep = ", ";
592 }
593 return s;
594 }}
595 default:
596 {{
597 const CTextseq_id* tsid = id->GetTextseq_Id();
598 if (tsid == NULL) {
599 return kEmptyStr;
600 }
601 string s, sep, comma, ht;
602 bool is_uniprot = false;
603 switch (choice) {
604 case CSeq_id::e_Embl: s = "embl "; comma = ","; break;
605 case CSeq_id::e_Other: s = "REFSEQ: "; break;
606 case CSeq_id::e_Swissprot: s = "UniProtKB: "; is_uniprot = true; comma = ","; break;
607 case CSeq_id::e_Pir: s = "UniProtKB: "; is_uniprot = true; break;
608 case CSeq_id::e_Prf: s = "prf: "; break;
609 default: break;
610 }
611 if (tsid->CanGetName()) {
612 s += "locus " + tsid->GetName();
613 sep = " ";
614 } else {
615 comma.erase();
616 }
617 if (tsid->CanGetAccession()) {
618 string acc = tsid->GetAccession();
619 if (tsid->CanGetVersion() &&
620 choice != CSeq_id::e_Swissprot) {
621 acc += '.' + NStr::IntToString(tsid->GetVersion());
622 }
623 #ifdef NEW_HTML_FMT
624 #if 0
625 GetContext()->Config().GetHTMLFormatter().FormatNucId(ht, *idh.GetSeqId(), GetContext()->GetScope().GetGi(idh), acc);
626 #else
627 if (is_uniprot) {
628 GetContext()->Config().GetHTMLFormatter().FormatUniProtId(ht, acc);
629 } else {
630 GetContext()->Config().GetHTMLFormatter().FormatNucId(ht, *idh.GetSeqId(),
631 GI_TO(TIntId, GetContext()->GetScope().GetGi(idh)), acc);
632 }
633 #endif
634 s += comma + sep + "accession " + ht;
635 #else
636 if (is_html) {
637 const TIntId gi = GetContext()->GetScope().GetGi(idh);
638 s += comma + sep + "accession <a href=\"" + strLinkBaseNuc +
639 NStr::NumericToString(gi) + "\">" + acc + "</a>";
640 } else {
641 s += comma + sep + "accession " + acc;
642 }
643 #endif
644 sep = " ";
645 }
646 /**
647 if (tsid->CanGetRelease()) {
648 s += sep + "release " + tsid->GetRelease();
649 }
650 **/
651 if (id->IsSwissprot()) {
652 s += ';';
653 }
654 return s;
655 }}
656 }
657
658 return kEmptyStr;
659 }
660
x_FormatPDBSource(const CPDB_block & pdb)661 string CDBSourceItem::x_FormatPDBSource(const CPDB_block& pdb)
662 {
663 if( ! pdb.IsSetSource() || pdb.GetSource().empty() ) {
664 return kEmptyStr;
665 }
666
667 const bool bIsHtml = ( GetContext() && GetContext()->Config().DoHTML() );
668
669 string answer;
670 const CPDB_block::TSource & source = pdb.GetSource();
671 ITERATE( CPDB_block::TSource, source_iter, source ) {
672 const string & a_source = *source_iter;
673 if( ! answer.empty() ) {
674 answer += ", ";
675 }
676
677 const static string kMmdbIdPrefix = "Mmdb_id:";
678 string prefix;
679 string url;
680 string url_suffix;
681 if( bIsHtml && x_ExtractLinkableSource(a_source, prefix, url, url_suffix) ) {
682 answer += prefix;
683 answer += " <a href=\"" + url + url_suffix + "\">";
684 answer += url_suffix;
685 answer += "</a>";
686 } else {
687 answer += a_source;
688 }
689 }
690
691 return answer;
692 }
693
x_ExtractLinkableSource(const string & a_source,string & out_prefix,string & out_url,string & out_url_suffix)694 bool CDBSourceItem::x_ExtractLinkableSource(
695 const string & a_source,
696 string & out_prefix,
697 string & out_url,
698 string & out_url_suffix )
699 {
700 const static struct {
701 string m_prefix;
702 string m_url;
703 bool m_must_be_all_digits;
704 } potentialPrefixes[] = {
705 { "Mmdb_id:", "https://www.ncbi.nlm.nih.gov/Structure/mmdb/mmdbsrv.cgi?uid=", true }
706 };
707
708 const static size_t numPotentialPrefixes = sizeof(potentialPrefixes)/sizeof(potentialPrefixes[0]);
709
710 for( size_t idx = 0; idx < numPotentialPrefixes; ++idx ) {
711 const string & prefix = potentialPrefixes[idx].m_prefix;
712 const string & url = potentialPrefixes[idx].m_url;
713 const bool must_be_all_digits = potentialPrefixes[idx].m_must_be_all_digits;
714
715 if( a_source.length() <= prefix.length() ) {
716 continue;
717 }
718
719 if( ! NStr::StartsWith(a_source, prefix, NStr::eNocase) ) {
720 continue;
721 }
722
723 // first_non_space_pos points to first non-space character after the prefix.
724 string::size_type first_non_space_pos = prefix.length();
725 for( ; first_non_space_pos < a_source.length(); ++first_non_space_pos ) {
726 if( ! isspace(a_source[first_non_space_pos]) ) {
727 break;
728 }
729 }
730 if( first_non_space_pos >= a_source.length() ) {
731 continue;
732 }
733
734 // some require extra test to make sure they're all digits
735 if( must_be_all_digits ) {
736 bool non_digit_found = false;
737 string::size_type test_pos = first_non_space_pos;
738 for( ; test_pos < a_source.length(); ++test_pos ) {
739 if( ! isdigit(a_source[test_pos]) ) {
740 non_digit_found = true;
741 break;
742 }
743 }
744 if( non_digit_found ) {
745 continue;
746 }
747 }
748
749 // all tests passed, so prepare to give result to caller
750 out_prefix = prefix;
751 out_url = url;
752 out_url_suffix = NStr::TruncateSpaces(a_source.substr(first_non_space_pos));
753 return true;
754 }
755
756 // didn't find any matches
757 return false;
758 }
759
760
761 END_SCOPE(objects)
762 END_NCBI_SCOPE
763