1 /*  $Id: dbsource_item.cpp 607132 2020-04-30 12:54:53Z grichenk $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Mati Shomrat, NCBI
27 *
28 * File Description:
29 *
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 #include <corelib/ncbiutil.hpp>
34 
35 #include <objects/general/Dbtag.hpp>
36 #include <objects/general/Date.hpp>
37 #include <objects/general/Object_id.hpp>
38 #include <objects/seqblock/PIR_block.hpp>
39 #include <objects/seqblock/PRF_block.hpp>
40 #include <objects/seqblock/PRF_ExtraSrc.hpp>
41 #include <objects/seqblock/PDB_block.hpp>
42 #include <objects/seqblock/PDB_replace.hpp>
43 #include <objects/seqblock/SP_block.hpp>
44 #include <objects/seqloc/PDB_seq_id.hpp>
45 #include <objects/seqloc/Textseq_id.hpp>
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seq/seq_id_handle.hpp>
48 #include <objects/seqset/Bioseq_set.hpp>
49 #include <objmgr/seq_entry_handle.hpp>
50 #include <objmgr/bioseq_handle.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/feat_ci.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
54 #include <objmgr/bioseq_ci.hpp>
55 #include <objmgr/util/seq_loc_util.hpp>
56 #include <objmgr/util/sequence.hpp>
57 
58 #include <objtools/format/formatter.hpp>
59 #include <objtools/format/text_ostream.hpp>
60 #include <objtools/format/items/dbsource_item.hpp>
61 #include <objtools/format/context.hpp>
62 #include <objmgr/util/objutil.hpp>
63 
64 
65 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)66 BEGIN_SCOPE(objects)
67 
68 
69 CDBSourceItem::CDBSourceItem(CBioseqContext& ctx) :
70     CFlatItem(&ctx)
71 {
72     x_GatherInfo(ctx);
73 }
74 
GetItemType(void) const75 IFlatItem::EItem CDBSourceItem::GetItemType(void) const
76 {
77     return eItem_DbSource;
78 }
79 
Format(IFormatter & formatter,IFlatTextOStream & text_os) const80 void CDBSourceItem::Format
81 (IFormatter& formatter,
82  IFlatTextOStream& text_os) const
83 
84 {
85     formatter.FormatDBSource(*this, text_os);
86 }
87 
88 
s_ScoreForDBSource(const CSeq_id_Handle & idh)89 static int s_ScoreForDBSource(const CSeq_id_Handle& idh)
90 {
91     CConstRef<CSeq_id> id = idh.GetSeqId();
92     switch (id->Which()) {
93     case CSeq_id::e_not_set:                        return kMax_Int;
94     case CSeq_id::e_Gi:                             return 31;
95     case CSeq_id::e_Giim:                           return 30;
96     case CSeq_id::e_Local: case CSeq_id::e_General: return 20;
97     case CSeq_id::e_Other:                          return 18;
98     case CSeq_id::e_Gibbmt:                         return 16;
99     case CSeq_id::e_Gibbsq: case CSeq_id::e_Patent: return 15;
100     case CSeq_id::e_Pdb:                            return 12;
101     default:                                        return 10;
102     }
103 }
104 
105 
// Ask the scope for every Seq-id known for `idh` and return the one that
// FindBestChoice() selects using the DBSOURCE scoring function.
static const CSeq_id_Handle s_FindBestChoiceForDbsource(const CSeq_id_Handle& idh, CScope& scope)
{
    const CScope::TIds all_ids = scope.GetIds(idh);
    return FindBestChoice(all_ids, s_ScoreForDBSource);
}
110 
111 
s_AddToUniqueIdList(const CSeq_id_Handle & idh,vector<CSeq_id_Handle> & unique_ids)112 static void s_AddToUniqueIdList(const CSeq_id_Handle& idh, vector<CSeq_id_Handle>& unique_ids)
113 {
114     ITERATE (vector<CSeq_id_Handle>, it, unique_ids) {
115         if (idh == *it) {
116             return;
117         }
118     }
119     unique_ids.push_back(idh);
120 }
121 
122 
s_HasLocalBioseq(const CSeq_loc & loc,const CSeq_entry_Handle & tse)123 static bool s_HasLocalBioseq(const CSeq_loc& loc, const CSeq_entry_Handle& tse)
124 {
125     CScope& scope = tse.GetScope();
126     for (CSeq_loc_CI li(loc); li; ++li) {
127         CBioseq_Handle local =
128             scope.GetBioseqHandleFromTSE(li.GetSeq_id(), tse);
129         if (local) {
130             return true;
131         }
132     }
133     return false;
134 }
135 
136 
x_GatherInfo(CBioseqContext & ctx)137 void CDBSourceItem::x_GatherInfo(CBioseqContext& ctx)
138 {
139     const bool bHtml = ctx.Config().DoHTML();
140 
141     const CBioseq_Handle& seq = ctx.GetHandle();
142     const CBioseq_Handle::TId& ids = seq.GetId();
143     CSeq_id_Handle idh = FindBestChoice(ids, s_ScoreForDBSource);
144 
145     if (!idh) {
146         m_DBSource.push_back("UNKNOWN");
147         return;
148     }
149 
150     switch (idh.Which()) {
151     case CSeq_id::e_Pir:
152         m_DBSource.push_back(x_FormatDBSourceID(idh));
153         x_AddPIRBlock(ctx);
154         break;
155 
156     case CSeq_id::e_Swissprot:
157         m_DBSource.push_back(x_FormatDBSourceID(idh));
158         x_AddSPBlock(ctx);
159         break;
160 
161     case CSeq_id::e_Prf:
162         m_DBSource.push_back(x_FormatDBSourceID(idh));
163         x_AddPRFBlock(ctx);
164         break;
165 
166     case CSeq_id::e_Pdb:
167         m_DBSource.push_back(x_FormatDBSourceID(idh));
168         x_AddPDBBlock(ctx);
169         break;
170 
171     case CSeq_id::e_General:
172         if (!NStr::StartsWith(idh.GetSeqId()->GetGeneral().GetDb(), "PID")) {
173             m_DBSource.push_back("UNKNOWN");
174             break;
175         }
176         // otherwise, fall through
177     case CSeq_id::e_Gibbsq: case CSeq_id::e_Gibbmt: case CSeq_id::e_Giim:
178     case CSeq_id::e_Genbank: case CSeq_id::e_Embl: case CSeq_id::e_Other:
179     case CSeq_id::e_Gi: case CSeq_id::e_Ddbj:
180     case CSeq_id::e_Tpg: case CSeq_id::e_Tpe: case CSeq_id::e_Tpd:
181     {
182         CScope& scope = ctx.GetScope();
183         vector<CSeq_id_Handle> unique_ids;
184 
185         // find generating feature
186         const CSeq_feat* feat = sequence::GetCDSForProduct(seq);
187         if (feat == NULL) {
188             // may also be protein product of mature peptide feature
189             feat = sequence::GetPROTForProduct(seq);
190         }
191 
192         if (feat != NULL) {
193             const CSeq_loc& loc = feat->GetLocation();
194             CSeq_entry_Handle topLevelEntry = seq.GetTopLevelEntry();
195             if (s_HasLocalBioseq(loc, topLevelEntry)) {
196                 for (CSeq_loc_CI li(loc); li; ++li) {
197                     s_AddToUniqueIdList(li.GetSeq_id_Handle(), unique_ids);
198                 }
199             } /* else {
200                 const CSeq_id *cds_seq_id = loc.GetId();
201                 if( NULL != cds_seq_id && cds_seq_id->IsGi() ) {
202                     CSeq_id_Base::TGi cds_gi = cds_seq_id->GetGi();
203                     s_AddToUniqueIdList( CSeq_id_Handle::GetHandle(cds_gi), unique_ids);
204                 }
205             } */
206         }
207 
208         string str;
209         ITERATE (vector<CSeq_id_Handle>, it, unique_ids) {
210             CSeq_id_Handle idh2 = s_FindBestChoiceForDbsource(*it, scope);
211             if (idh2) {
212                 str.erase();
213                 str = x_FormatDBSourceID(idh2);
214                 if (!NStr::IsBlank(str)) {
215                     m_DBSource.push_back(str);
216                 }
217             } else {
218                 m_DBSource.push_back( x_FormatDBSourceID( *it ) );
219             }
220         }
221 
222         if( m_DBSource.empty() && feat != NULL ) {
223             const CSeq_loc& loc = feat->GetLocation();
224             const CSeq_id *cds_seq_id = loc.GetId();
225             if( NULL != cds_seq_id && cds_seq_id->IsGi() ) {
226                 CSeq_id_Base::TGi cds_gi = cds_seq_id->GetGi();
227                 // s_AddToUniqueIdList( CSeq_id_Handle::GetHandle(cds_gi), unique_ids);
228                 m_DBSource.push_back( x_FormatDBSourceID( CSeq_id_Handle::GetHandle(cds_gi) ) );
229             }
230         }
231 
232         if (m_DBSource.empty()) {
233             m_DBSource.push_back(x_FormatDBSourceID(idh));
234         }
235         break;
236     }
237     default:
238         m_DBSource.push_back("UNKNOWN");
239     }
240 
241     // turn double-quotes to single-quotes in all m_DBSources,
242     // except inside HTML tags
243     NON_CONST_ITERATE( list<string>, it, m_DBSource ) {
244         if( bHtml ) {
245             ConvertQuotesNotInHTMLTags( *it );
246         } else {
247             replace( it->begin(), it->end(), '\"', '\'' );
248         }
249     }
250 }
251 
x_AddPIRBlock(CBioseqContext & ctx)252 void CDBSourceItem::x_AddPIRBlock(CBioseqContext& ctx)
253 {
254     // In this function, the newlines seem weird because the C toolkit
255     // outputs this way.  Hopefully in the future we can do something
256     // more consistent.
257 
258 
259     CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Pir);
260     if ( !dsc ) {
261         return;
262     }
263 
264     x_SetObject(*dsc);
265 
266     bool containsHostLine = false; // try to match C's whitespace
267 
268     const CPIR_block& pir = dsc->GetPir();
269     if (pir.CanGetHost()) {
270         m_DBSource.push_back("host:" + pir.GetHost() + "\n");
271         containsHostLine = true;
272     }
273     if (pir.CanGetSource()) {
274         m_DBSource.push_back("source: " + pir.GetSource() + "\n");
275     }
276     if (pir.CanGetSummary()) {
277         m_DBSource.push_back("summary: " + pir.GetSummary() + "\n");
278     }
279     if (pir.CanGetGenetic()) {
280         m_DBSource.push_back("genetic: " + pir.GetGenetic() + "\n");
281     }
282     if (pir.CanGetIncludes()) {
283         m_DBSource.push_back("includes: " + pir.GetIncludes() + "\n");
284     }
285     if (pir.CanGetPlacement()) {
286         m_DBSource.push_back("placement: " + pir.GetPlacement() + "\n");
287     }
288     if (pir.CanGetSuperfamily()) {
289         m_DBSource.push_back("superfamily: " + pir.GetSuperfamily() + "\n");
290     }
291     if (pir.CanGetCross_reference()) {
292         m_DBSource.push_back("xref: " + pir.GetCross_reference() + "\n");
293     }
294     if (pir.CanGetDate()) {
295         m_DBSource.push_back("PIR dates: " + pir.GetDate() + "\n");
296     }
297     if (pir.CanGetHad_punct() && pir.GetHad_punct() ) {
298         m_DBSource.push_back("punctuation in sequence");
299     }
300     if (pir.CanGetSeqref()) {
301         list<string> xrefs;
302         ITERATE (CPIR_block::TSeqref, it, pir.GetSeqref()) {
303             const char* type = 0;
304             switch ((*it)->Which()) {
305             case CSeq_id::e_Genbank:    type = "genbank ";    break;
306             case CSeq_id::e_Embl:       type = "embl ";       break;
307             case CSeq_id::e_Pir:        type = "pir ";        break;
308             case CSeq_id::e_Swissprot:  type = "swissprot ";  break;
309             case CSeq_id::e_Gi:         type = "gi: ";        break;
310             case CSeq_id::e_Ddbj:       type = "ddbj ";       break;
311             case CSeq_id::e_Prf:        type = "prf ";        break;
312             default:                    break;
313             }
314             if (type) {
315                 xrefs.push_back(type + (*it)->GetSeqIdString(true));
316             }
317         }
318         if ( !xrefs.empty() ) {
319             m_DBSource.push_back("xrefs: " + NStr::Join(xrefs, ", "));
320         }
321     }
322 
323     NON_CONST_ITERATE (list<string>, it, m_DBSource) {
324         if( &*it == &m_DBSource.front() ) {
325             // first one has newline AFTER the semicolon
326             *it += ";\n";
327             // match C toolkit
328             /* if( (it + 1) != m_DBSource.end() && ! NStr::StartsWith(*(it + 1), "host")  ) {
329                 *it += ";\n";
330             } */
331         } else if( &*it == &m_DBSource.back() ) {
332             // last one ends in period
333             *it += ".";
334         } else {
335             // The C version puts newlines before some of these for some reason
336             *it += ";\n";
337         }
338         // *it += (&*it == &m_DBSource.back() ? "." : "\n;");
339     }
340 
341     // match C's whitespace
342     if( ! containsHostLine ) {
343         m_DBSource.front() += "\n";
344     }
345 }
346 
s_FormatDate(const CDate & date,string & str)347 static void s_FormatDate(const CDate& date, string& str)
348 {
349     CTime time = date.AsCTime();
350     str += time.AsString(CTimeFormat("b d, Y"));
351 }
352 
353 
x_AddSPBlock(CBioseqContext & ctx)354 void CDBSourceItem::x_AddSPBlock(CBioseqContext& ctx)
355 {
356     CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Sp);
357     if ( !dsc ) {
358         return;
359     }
360     x_SetObject(*dsc);
361 
362     const CSP_block& sp = dsc->GetSp();
363     switch (sp.GetClass()) {
364     case CSP_block::eClass_standard:
365         m_DBSource.push_back("class: standard.");
366         break;
367     case CSP_block::eClass_prelim:
368         m_DBSource.push_back("class: preliminary.");
369         break;
370     default:
371         break;
372     }
373     // laid out slightly differently from the C version, but I think that's
374     // a bug in the latter (which runs some things together)
375     if (sp.CanGetExtra_acc()  &&  !sp.GetExtra_acc().empty() ) {
376         m_DBSource.push_back("extra accessions:"
377                              + NStr::Join(sp.GetExtra_acc(), ","));
378     }
379     if (sp.GetImeth()) {
380         m_DBSource.push_back("seq starts with Met");
381     }
382     if (sp.CanGetPlasnm()  &&  !sp.GetPlasnm().empty() ) {
383         m_DBSource.push_back("plasmid:" + NStr::Join(sp.GetPlasnm(), ","));
384     }
385     if (sp.CanGetCreated()) {
386         string s("created: ");
387         //sp.GetCreated().GetDate(&s, "%3N %D %Y");
388         s_FormatDate(sp.GetCreated(), s);
389         m_DBSource.push_back(s + '.');
390     }
391     if (sp.CanGetSequpd()) {
392         string s("sequence updated: ");
393         //sp.GetSequpd().GetDate(&s, "%3N %D %Y");
394         s_FormatDate(sp.GetSequpd(), s);
395         m_DBSource.push_back(s + '.');
396     }
397     if (sp.CanGetAnnotupd()) {
398         string s("annotation updated: ");
399         //sp.GetAnnotupd().GetDate(&s, "%3N %D %Y");
400         s_FormatDate(sp.GetAnnotupd(), s);
401         m_DBSource.push_back(s + '.');
402     }
403     if (sp.CanGetSeqref()  &&  !sp.GetSeqref().empty() ) {
404         list<string> xrefs;
405         ITERATE (CSP_block::TSeqref, it, sp.GetSeqref()) {
406             CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(**it);
407             CSeq_id_Handle best = sequence::GetId(idh, ctx.GetScope(),
408                                                   sequence::eGetId_Best);
409             if ( !best ) {
410                 best = idh;
411             }
412             if (best) {
413                 string acc = best.GetSeqId()->GetSeqIdString(true);
414                 xrefs.push_back(acc);
415             }
416             /**
417             const char* s = 0;
418             switch ((*it)->Which()) {
419             case CSeq_id::e_Genbank:  s = "genbank accession ";          break;
420             case CSeq_id::e_Embl:     s = "embl accession ";             break;
421             case CSeq_id::e_Pir:      s = "pir locus ";                  break;
422             case CSeq_id::e_Swissprot: s = "swissprot accession ";       break;
423             case CSeq_id::e_Gi:       s = "gi: ";                        break;
424             case CSeq_id::e_Ddbj:     s = "ddbj accession ";             break;
425             case CSeq_id::e_Prf:      s = "prf accession ";              break;
426             case CSeq_id::e_Pdb:      s = "pdb accession ";              break;
427             case CSeq_id::e_Tpg:   s = "genbank third party accession "; break;
428             case CSeq_id::e_Tpe:      s = "embl third party accession "; break;
429             case CSeq_id::e_Tpd:      s = "ddbj third party accession "; break;
430             default:                  break;
431             }
432             if ( s ) {
433                 string acc = (*it)->GetSeqIdString(true);
434                 xrefs.push_back(s + acc);
435             }
436             **/
437         }
438         if ( !xrefs.empty() ) {
439             m_DBSource.push_back("xrefs: " + NStr::Join(xrefs, ", "));
440         }
441     }
442     if (sp.CanGetDbref()  &&  !sp.GetDbref().empty() ) {
443         list<string> xrefs;
444         ITERATE (CSP_block::TDbref, it, sp.GetDbref()) {
445             const CObject_id& tag = (*it)->GetTag();
446             string id = (tag.IsStr() ? tag.GetStr()
447                                      : NStr::IntToString(tag.GetId()));
448             string db = (*it)->GetDb();
449             if ( db == "MIM") {
450                 if (ctx.Config().DoHTML()) {
451                     xrefs.push_back
452                         ("MIM <a href=\""
453                          "https://omim.org/entry/" + id
454                          + "\">" + id + "</a>");
455                 } else {
456                     xrefs.push_back("MIM:" + id);
457                 }
458             } else {
459                 // For exmaple, HGNC has HGNC as part of its identifier, so we may need to eliminate
460                 // such redundancies (example accession: Q02094.1)
461                 if( id.substr(0, db.length() + 1) == (db + ":") ) {
462                     xrefs.push_back(id); // in this case, id already has db at beginning
463                 } else {
464                     xrefs.push_back(db + ':' + id); // no space(!)
465                 }
466             }
467         }
468         m_DBSource.push_back
469             ("xrefs (non-sequence databases): " + NStr::Join(xrefs, ", "));
470     }
471 }
472 
473 
x_AddPRFBlock(CBioseqContext & ctx)474 void CDBSourceItem::x_AddPRFBlock(CBioseqContext& ctx)
475 {
476     CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Prf);
477     if ( !dsc ) {
478         return;
479     }
480 
481     x_SetObject(*dsc);
482 
483     const CPRF_block& prf = dsc->GetPrf();
484     if (prf.CanGetExtra_src()) {
485         const CPRF_ExtraSrc& es = prf.GetExtra_src();
486         if (es.CanGetHost()) {
487             m_DBSource.push_back("host:" + es.GetHost());
488         }
489         if (es.CanGetPart()) {
490             m_DBSource.push_back("part: " + es.GetPart());
491         }
492         if (es.CanGetState()) {
493             m_DBSource.push_back("state: " + es.GetState());
494         }
495         if (es.CanGetStrain()) {
496             m_DBSource.push_back("strain: " + es.GetStrain());
497         }
498         if (es.CanGetTaxon()) {
499             m_DBSource.push_back("taxonomy: " + es.GetTaxon());
500         }
501     }
502     NON_CONST_ITERATE (list<string>, it, m_DBSource) {
503         *it += (&*it == &m_DBSource.back() ? '.' : ';');
504     }
505 }
506 
507 
x_AddPDBBlock(CBioseqContext & ctx)508 void CDBSourceItem::x_AddPDBBlock(CBioseqContext& ctx)
509 {
510     CSeqdesc_CI dsc(ctx.GetHandle(), CSeqdesc::e_Pdb);
511     if ( !dsc ) {
512         return;
513     }
514 
515     x_SetObject(*dsc);
516 
517     const CPDB_block& pdb = dsc->GetPdb();
518     {{
519         string s("deposition: ");
520         s_FormatDate(pdb.GetDeposition(), s);
521         m_DBSource.push_back(s);
522     }}
523     m_DBSource.push_back("class: " + pdb.GetClass());
524     if (!pdb.GetSource().empty() ) {
525         m_DBSource.push_back("source: " + x_FormatPDBSource(pdb));
526     }
527     if (pdb.CanGetExp_method()) {
528         m_DBSource.push_back("Exp. method: " + pdb.GetExp_method());
529     }
530     if (pdb.CanGetReplace()) {
531         const CPDB_replace& rep = pdb.GetReplace();
532         if ( !rep.GetIds().empty() ) {
533             m_DBSource.push_back
534                 ("ids replaced: " + x_FormatPDBSource(pdb));
535         }
536         string s("replacement date: ");
537         DateToString(rep.GetDate(), s);
538         m_DBSource.push_back(s);
539     }
540     NON_CONST_ITERATE (list<string>, it, m_DBSource) {
541         *it += (&*it == &m_DBSource.back() ? '.' : ';');
542     }
543 }
544 
545 
x_FormatDBSourceID(const CSeq_id_Handle & idh)546 string CDBSourceItem::x_FormatDBSourceID(const CSeq_id_Handle& idh)
547 {
548 #ifndef NEW_HTML_FMT
549     const bool is_html = ( GetContext()->Config().DoHTML() );
550 #endif
551 
552     CConstRef<CSeq_id> id;
553     if (idh) {
554         id = idh.GetSeqId();
555     }
556     if (!id) {
557         return kEmptyStr;
558     }
559 
560     CSeq_id::E_Choice choice = id->Which();
561 
562     switch (choice) {
563     case CSeq_id::e_Local:
564         {{
565             const CObject_id& oi = id->GetLocal();
566             return (oi.IsStr() ? oi.GetStr() : NStr::IntToString(oi.GetId()));
567         }}
568     case CSeq_id::e_Gi:
569         {{
570             return "gi: " + NStr::NumericToString(id->GetGi());
571         }}
572     case CSeq_id::e_Pdb:
573         {{
574             const CPDB_seq_id& pdb = id->GetPdb();
575             string s("pdb: "), sep;
576             if ( !pdb.GetMol().Get().empty() ) {
577                 s += "molecule " + pdb.GetMol().Get();
578                 sep = ", ";
579             }
580             if (pdb.IsSetChain() && pdb.GetChain() > 0) {
581                 s += sep + "chain " + NStr::IntToString(pdb.GetChain());
582                 sep = ", ";
583             }
584             if (pdb.IsSetChain_id()) {
585                 s += sep + "chain " + pdb.GetChain_id();
586                 sep = ", ";
587             }
588             if (pdb.CanGetRel()) {
589                 s += sep + "release ";
590                 s_FormatDate(pdb.GetRel(), s);
591                 sep = ", ";
592             }
593             return s;
594         }}
595     default:
596         {{
597             const CTextseq_id* tsid = id->GetTextseq_Id();
598             if (tsid == NULL) {
599                 return kEmptyStr;
600             }
601             string s, sep, comma, ht;
602             bool is_uniprot = false;
603             switch (choice) {
604             case CSeq_id::e_Embl:       s = "embl ";        comma = ",";  break;
605             case CSeq_id::e_Other:      s = "REFSEQ: ";                   break;
606             case CSeq_id::e_Swissprot:  s = "UniProtKB: "; is_uniprot = true; comma = ",";  break;
607             case CSeq_id::e_Pir:        s = "UniProtKB: "; is_uniprot = true;               break;
608             case CSeq_id::e_Prf:        s = "prf: ";                      break;
609             default:                    break;
610             }
611             if (tsid->CanGetName()) {
612                 s += "locus " + tsid->GetName();
613                 sep = " ";
614             } else {
615                 comma.erase();
616             }
617             if (tsid->CanGetAccession()) {
618                 string acc = tsid->GetAccession();
619                 if (tsid->CanGetVersion()  &&
620                     choice != CSeq_id::e_Swissprot) {
621                     acc += '.' + NStr::IntToString(tsid->GetVersion());
622                 }
623 #ifdef NEW_HTML_FMT
624 #if 0
625                 GetContext()->Config().GetHTMLFormatter().FormatNucId(ht, *idh.GetSeqId(), GetContext()->GetScope().GetGi(idh), acc);
626 #else
627                 if (is_uniprot) {
628                     GetContext()->Config().GetHTMLFormatter().FormatUniProtId(ht, acc);
629                 } else {
630                     GetContext()->Config().GetHTMLFormatter().FormatNucId(ht, *idh.GetSeqId(),
631                         GI_TO(TIntId, GetContext()->GetScope().GetGi(idh)), acc);
632                 }
633 #endif
634                 s += comma + sep + "accession " + ht;
635 #else
636                 if (is_html) {
637                     const TIntId gi = GetContext()->GetScope().GetGi(idh);
638                     s += comma + sep + "accession <a href=\"" + strLinkBaseNuc +
639                         NStr::NumericToString(gi) + "\">" + acc + "</a>";
640                 } else {
641                     s += comma + sep + "accession " + acc;
642                 }
643 #endif
644                 sep = " ";
645             }
646             /**
647             if (tsid->CanGetRelease()) {
648                 s += sep + "release " + tsid->GetRelease();
649             }
650             **/
651             if (id->IsSwissprot()) {
652                 s += ';';
653             }
654             return s;
655         }}
656     }
657 
658     return kEmptyStr;
659 }
660 
x_FormatPDBSource(const CPDB_block & pdb)661 string CDBSourceItem::x_FormatPDBSource(const CPDB_block& pdb)
662 {
663     if( ! pdb.IsSetSource() || pdb.GetSource().empty() ) {
664         return kEmptyStr;
665     }
666 
667     const bool bIsHtml = ( GetContext() && GetContext()->Config().DoHTML() );
668 
669     string answer;
670     const CPDB_block::TSource & source = pdb.GetSource();
671     ITERATE( CPDB_block::TSource, source_iter, source ) {
672         const string & a_source = *source_iter;
673         if( ! answer.empty() ) {
674             answer += ", ";
675         }
676 
677         const static string kMmdbIdPrefix = "Mmdb_id:";
678         string prefix;
679         string url;
680         string url_suffix;
681         if( bIsHtml && x_ExtractLinkableSource(a_source, prefix, url, url_suffix) ) {
682             answer += prefix;
683             answer += " <a href=\"" + url + url_suffix + "\">";
684             answer += url_suffix;
685             answer += "</a>";
686         } else {
687             answer += a_source;
688         }
689     }
690 
691     return answer;
692 }
693 
x_ExtractLinkableSource(const string & a_source,string & out_prefix,string & out_url,string & out_url_suffix)694 bool CDBSourceItem::x_ExtractLinkableSource(
695     const string & a_source,
696     string & out_prefix,
697     string & out_url,
698     string & out_url_suffix )
699 {
700     const static struct {
701         string m_prefix;
702         string m_url;
703         bool   m_must_be_all_digits;
704     } potentialPrefixes[] = {
705         { "Mmdb_id:", "https://www.ncbi.nlm.nih.gov/Structure/mmdb/mmdbsrv.cgi?uid=", true }
706     };
707 
708     const static size_t numPotentialPrefixes = sizeof(potentialPrefixes)/sizeof(potentialPrefixes[0]);
709 
710     for( size_t idx = 0; idx < numPotentialPrefixes; ++idx ) {
711         const string & prefix = potentialPrefixes[idx].m_prefix;
712         const string & url = potentialPrefixes[idx].m_url;
713         const bool must_be_all_digits = potentialPrefixes[idx].m_must_be_all_digits;
714 
715         if( a_source.length() <= prefix.length() ) {
716             continue;
717         }
718 
719         if( ! NStr::StartsWith(a_source, prefix, NStr::eNocase) ) {
720             continue;
721         }
722 
723         // first_non_space_pos points to first non-space character after the prefix.
724         string::size_type first_non_space_pos = prefix.length();
725         for( ; first_non_space_pos < a_source.length(); ++first_non_space_pos ) {
726             if( ! isspace(a_source[first_non_space_pos]) ) {
727                 break;
728             }
729         }
730         if( first_non_space_pos >= a_source.length() ) {
731             continue;
732         }
733 
734         // some require extra test to make sure they're all digits
735         if( must_be_all_digits ) {
736             bool non_digit_found = false;
737             string::size_type test_pos = first_non_space_pos;
738             for( ; test_pos < a_source.length(); ++test_pos ) {
739                 if( ! isdigit(a_source[test_pos]) ) {
740                     non_digit_found = true;
741                     break;
742                 }
743             }
744             if( non_digit_found ) {
745                 continue;
746             }
747         }
748 
749         // all tests passed, so prepare to give result to caller
750         out_prefix = prefix;
751         out_url = url;
752         out_url_suffix = NStr::TruncateSpaces(a_source.substr(first_non_space_pos));
753         return true;
754     }
755 
756     // didn't find any matches
757     return false;
758 }
759 
760 
761 END_SCOPE(objects)
762 END_NCBI_SCOPE
763