1 /* utilfeat.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  utilfeat.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description: functions for features parsing
32  *
33  */
34 #include <ncbi_pch.hpp>
35 
36 #include "ftacpp.hpp"
37 
38 #include <objects/seqfeat/BioSource.hpp>
39 #include <objects/seqset/Bioseq_set.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 #include <objects/seq/Seq_descr.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/SubSource.hpp>
45 
46 #include "index.h"
47 
48 #include <objtools/flatfile/flatdefn.h>
49 
50 #include "ftaerr.hpp"
51 #include "asci_blk.h"
52 #include "add.h"
53 #include "utilfeat.h"
54 #include "utilfun.h"
55 
56 #ifdef THIS_FILE
57 #    undef THIS_FILE
58 #endif
59 #define THIS_FILE "utilfeat.cpp"
60 
// Forward declaration of ValidAminoAcid(...), which finds the correct amino
// acid letter for a given abbreviation. The primary declaration is located in
// ../src/objtools/cleanup/cleanup_utils.hpp
// TODO: remove this forward declaration once ValidAminoAcid(...) has been
// moved into a public header file.
66 BEGIN_NCBI_SCOPE
67 BEGIN_SCOPE(objects)
68 char ValidAminoAcid(const string &abbrev);
69 END_SCOPE(objects)
70 
71 
/* Genome/organelle keywords looked up by GetGenomeInfo() through
 * StringMatchIcase().  GetGenomeInfo() converts the INDEX of the matching
 * entry into a genome code, so the order of the entries is significant.
 * Several entries are word stems rather than complete words (presumably so
 * that longer spellings still match; that depends on StringMatchIcase(),
 * which is not defined in this file).
 * The table is terminated by a NULL entry.
 */
const char *ParFlat_GImod[] = {
    "Mitochondr",                       /*  0 */
    "Chloroplast",                      /*  1 */
    "Kinetoplas",                       /*  2 */
    "Cyanelle",                         /*  3 */
    "Chromoplast",                      /*  4 */
    "Plastid",                          /*  5 */
    "Macronuclear",                     /*  6 */
    "Extrachrom",                       /*  7 */
    "Plasmid",                          /*  8 */
    "Leucoplast",                       /*  9 */
    "Apicoplast",                       /* 10 */
    NULL                                /* end-of-table marker */
};
86 
/* Organelle names that CheckDelGbblockSourceFromDescrs() matches (through
 * StringMatchIcase()) against the first word of a GenBank block "source"
 * string; a matching leading word is stripped before the source is compared
 * with the organism names.
 * The table is terminated by a NULL entry.
 */
const char *valid_organelle[] = {
    "apicoplast",
    "chloroplast",
    "chromatophore",
    "chromoplast",
    "cyanelle",
    "hydrogenosome",
    "kinetoplast",
    "leucoplast",
    "mitochondrion",
    "nucleomorph",
    "plastid",
    "proplastid",
    NULL                                /* end-of-table marker */
};
102 
103 /**********************************************************/
SeqLocHaveFuzz(const objects::CSeq_loc & loc)104 bool SeqLocHaveFuzz(const objects::CSeq_loc& loc)
105 {
106     bool flag;
107 
108     std::string loc_str;
109     loc.GetLabel(&loc_str);
110 
111     if (loc_str.find('<') == std::string::npos && loc_str.find('>') == std::string::npos)
112         flag = false;
113     else
114         flag = true;
115 
116     return(flag);
117 }
118 
119 /**********************************************************
120  *
121  *   char* CpTheQualValue(qlist, qual):
122  *
123  *      Return qual's value if found the "qual" in the
124  *   "qlist"; otherwise, return NULL.
125  *
126  **********************************************************/
CpTheQualValue(const TQualVector & qlist,const Char * qual)127 char* CpTheQualValue(const TQualVector& qlist, const Char *qual)
128 {
129     std::string qvalue;
130     ITERATE(TQualVector, cur, qlist)
131     {
132         if ((*cur)->GetQual() != qual)
133             continue;
134 
135         const std::string& val = (*cur)->GetVal();
136         if (val == "\"\"")
137         {
138             ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling,
139                       "Empty qual %s : %s", qual, val.c_str());
140             break;
141         }
142 
143         qvalue = NStr::Sanitize(val);
144         break;
145     }
146 
147     char* ret = NULL;
148     if (!qvalue.empty())
149         ret = StringSave(qvalue.c_str());
150     return ret;
151 }
152 
153 /**********************************************************
154  *
155  *   char* GetTheQualValue(qlist, qual):
156  *
157  *      Return qual's value if found the "qual" in the
158  *   "qlist", and remove the "qual" from the qlist;
159  *   otherwise, return NULL.
160  *
161  **********************************************************/
GetTheQualValue(TQualVector & qlist,const Char * qual)162 char* GetTheQualValue(TQualVector& qlist, const Char *qual)
163 {
164     char*   qvalue = NULL;
165 
166     NON_CONST_ITERATE(TQualVector, cur, qlist)
167     {
168         if ((*cur)->GetQual() != qual)
169             continue;
170 
171         const std::string& val = (*cur)->GetVal();
172         if (val == "\"\"")
173         {
174             ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling,
175                       "Empty qual %s : %s", qual, val.c_str());
176             break;
177         }
178 
179         std::vector<Char> buf(val.begin(), val.end());
180         buf.push_back(0);
181         qvalue = tata_save(&buf[0]);
182 
183         qlist.erase(cur);
184         break;
185     }
186 
187     return(qvalue);
188 }
189 
190 /**********************************************************
191  *
192  *   bool DeleteQual(qlist, qual):
193  *
194  *      Return TRUE the "qual" has found in and removed
195  *   from the "qlist".
196  *
197  **********************************************************/
DeleteQual(TQualVector & qlist,const Char * qual)198 bool DeleteQual(TQualVector& qlist, const Char *qual)
199 {
200     bool got = false;
201     for (TQualVector::iterator cur = qlist.begin(); cur != qlist.end();)
202     {
203         if ((*cur)->GetQual() != qual)
204         {
205             ++cur;
206             continue;
207         }
208 
209         cur = qlist.erase(cur);
210         got = true;
211     }
212 
213     return(got);
214 }
215 
216 /**********************************************************
217  *
218  *   Uint1 GetQualValueAa(qual, checkseq):
219  *
220  *      Return 255 if not a valid amino acid, not in
221  *   "ParFlat_AA_array".
222  *
223  **********************************************************/
GetQualValueAa(char * qval,bool checkseq)224 Uint1 GetQualValueAa(char* qval, bool checkseq)
225 {
226     char* str;
227     char* p;
228     Uint1   aa;
229     Char    ch;
230 
231     str = StringStr(qval, "aa:");
232     if(str == NULL)
233         return(255);
234 
235     for(str += 3; *str == ' ';)
236         str++;
237     for(p = str; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
238         p++;
239 
240     if(checkseq && StringStr(p, "seq:") == NULL)
241         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_AntiCodonLacksSequence,
242                   "Anticodon qualifier \"%s\" lacks a 'seq' field for the sequence of the anticodon.",
243                   qval);
244     ch = *p;
245     *p = '\0';
246     aa = objects::ValidAminoAcid(str);
247     *p = ch;
248 
249     return(aa);
250 }
251 
252 /**********************************************************/
GetGenomeInfo(objects::CBioSource & bsp,const Char * bptr)253 bool GetGenomeInfo(objects::CBioSource& bsp, const Char* bptr)
254 {
255     Int4 i = StringMatchIcase(ParFlat_GImod, bptr);
256     if(i == -1)
257         return false;
258 
259     if(i == 0)
260         bsp.SetGenome(5);
261     else if (i == 1)
262         bsp.SetGenome(2);
263     else if(i == 2)
264         bsp.SetGenome(4);
265     else if(i == 3)
266         bsp.SetGenome(12);
267     else if(i == 4)
268         bsp.SetGenome(3);
269     else if(i == 5)
270         bsp.SetGenome(6);
271     else if(i == 6)
272         bsp.SetGenome(7);
273     else if(i == 7)
274         bsp.SetGenome(8);
275     else if(i == 8)
276         bsp.SetGenome(9);
277     else
278         bsp.SetGenome(17);
279 
280     return true;
281 }
282 
283 /**********************************************************/
GetTaxnameNameFromDescrs(TSeqdescList & descrs,std::vector<std::string> & names)284 static void GetTaxnameNameFromDescrs(TSeqdescList& descrs, std::vector<std::string>& names)
285 {
286     NON_CONST_ITERATE(TSeqdescList, descr, descrs)
287     {
288         if (!(*descr)->IsSource() || !(*descr)->GetSource().IsSetOrg() ||
289             !(*descr)->GetSource().GetOrg().IsSetTaxname())
290             continue;
291 
292         const objects::COrg_ref& org_ref = (*descr)->GetSource().GetOrg();
293         names[0] = org_ref.GetTaxname();
294 
295         if (org_ref.IsSetOrgname() && org_ref.GetOrgname().IsSetMod())
296         {
297             ITERATE(objects::COrgName::TMod, mod, org_ref.GetOrgname().GetMod())
298             {
299                 if (!(*mod)->IsSetSubname() || !(*mod)->IsSetSubtype())
300                     continue;
301 
302                 int stype = (*mod)->GetSubtype();
303 
304                 if (stype == 254)        /* old-name */
305                     names[1] = (*mod)->GetSubname();
306                 /* acronym(19), synonym(28), anamorph(29), teleomorph(30),
307                 gb-acronym(32), gb-anamorph(33), gb-synonym(34) */
308                 else if (stype == 19 || stype == 28 || stype == 29 ||
309                          stype == 30 || stype == 32 || stype == 33 ||
310                          stype == 34)
311                 {
312                     names.push_back((*mod)->GetSubname());
313                 }
314             }
315         }
316 
317         if ((*descr)->GetSource().IsSetSubtype())
318         {
319             ITERATE(objects::CBioSource::TSubtype, subtype, (*descr)->GetSource().GetSubtype())
320             {
321                 /* subtype = "other"
322                 */
323                 if (!(*subtype)->IsSetSubtype() || (*subtype)->GetSubtype() != 255 || !(*subtype)->IsSetName())
324                     continue;
325 
326                 const Char* p = StringIStr((*subtype)->GetName().c_str(), "common:");
327                 if (p == NULL)
328                     continue;
329 
330                 for (p += 7; *p == ' ';)
331                     p++;
332                 if (*p == '\0')
333                     continue;
334 
335                 names.push_back(p);
336             }
337         }
338 
339         if (org_ref.IsSetCommon())
340             names[2] = org_ref.GetCommon();
341 
342         break;
343     }
344 }
345 
346 /**********************************************************/
GetTaxnameName(TEntryList & seq_entries,std::vector<std::string> & names)347 static void GetTaxnameName(TEntryList& seq_entries, std::vector<std::string>& names)
348 {
349     names.resize(3);
350 
351     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
352     {
353         for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set)
354         {
355             if (bio_set->IsSetDescr())
356                 GetTaxnameNameFromDescrs(bio_set->SetDescr().Set(), names);
357         }
358 
359         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq)
360         {
361             if (bioseq->IsSetDescr())
362                 GetTaxnameNameFromDescrs(bioseq->SetDescr().Set(), names);
363         }
364     }
365 }
366 
367 /**********************************************************/
CheckDelGbblockSourceFromDescrs(TSeqdescList & descrs,const std::vector<std::string> & names)368 static void CheckDelGbblockSourceFromDescrs(TSeqdescList& descrs, const std::vector<std::string>& names)
369 {
370     NON_CONST_ITERATE(TSeqdescList, descr, descrs)
371     {
372         if (!(*descr)->IsGenbank())
373             continue;
374 
375         if (!(*descr)->GetGenbank().IsSetSource())
376             break;
377 
378         objects::CGB_block& gb_block = (*descr)->SetGenbank();
379         char* p = StringSave(gb_block.GetSource().c_str());
380         char* pper = 0;
381 
382         size_t len = StringLen(p);
383         if (p[len - 1] == '.')
384         {
385             pper = StringSave(p);
386             p[len - 1] = '\0';
387         }
388 
389         char* q = StringChr(p, ' ');
390         if (q != NULL)
391             *q = '\0';
392 
393         if (StringMatchIcase(valid_organelle, p) > -1)
394         {
395             if (q != NULL)
396             {
397                 for (q++; *q == ' ';)
398                     q++;
399                 fta_StringCpy(p, q);
400             }
401         }
402         else if (q != NULL)
403             *q = ' ';
404 
405         std::vector<std::string>::const_iterator name = names.begin();
406         for (name += 2; name != names.end(); ++name)
407         {
408             if (name->empty())
409                 continue;
410 
411             len = name->size();
412             for (q = p;; q++)
413             {
414                 q = StringChr(q, '(');
415                 if (q == NULL)
416                     break;
417                 char* s = q + 1;
418                 if (StringNCmp(s, "acronym:", 8) == 0 ||
419                     StringNCmp(s, "synonym:", 8) == 0)
420                     s += 8;
421                 else if (StringNCmp(s, "anamorph:", 9) == 0)
422                     s += 9;
423                 else if (StringNCmp(s, "teleomorph:", 11) == 0)
424                     s += 11;
425                 if (*s == ' ')
426                     while (*s == ' ')
427                         s++;
428                 if (StringNICmp(s, name->c_str(), len) == 0 && s[len] == ')')
429                 {
430                     char* t = NULL;
431                     for (t = s + len + 1; *t == ' ';)
432                         t++;
433                     if (*t != '\0')
434                         fta_StringCpy(q, t);
435                     else
436                     {
437                         if (q > p)
438                             q--;
439                         *q = '\0';
440                     }
441                     break;
442                 }
443             }
444         }
445 
446         if (pper != NULL)
447         {
448             MemFree(pper);
449             pper = (char*)MemNew(StringLen(p) + 2);
450             StringCpy(pper, p);
451             StringCat(pper, ".");
452         }
453 
454         const std::string& first_name = names[0];
455         const std::string& second_name = names[1];
456 
457         if (StringICmp(p, first_name.c_str()) == 0 || (pper != NULL && StringICmp(pper, first_name.c_str()) == 0))
458         {
459             gb_block.ResetSource();
460         }
461         else if (StringICmp(p, second_name.c_str()) == 0 || (pper != NULL && StringICmp(pper, second_name.c_str()) == 0))
462         {
463             gb_block.ResetSource();
464         }
465 
466         MemFree(p);
467         if (pper != NULL)
468             MemFree(pper);
469         break;
470     }
471 }
472 
473 /**********************************************************/
CheckDelGbblockSource(TEntryList & seq_entries,std::vector<std::string> & names)474 static void CheckDelGbblockSource(TEntryList& seq_entries, std::vector<std::string>& names)
475 {
476     NON_CONST_ITERATE(TEntryList, entry, seq_entries)
477     {
478         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq)
479         {
480             if (bioseq->IsSetDescr())
481                 CheckDelGbblockSourceFromDescrs(bioseq->SetDescr().Set(), names);
482         }
483     }
484 
485 }
486 
487 /**********************************************************/
MaybeCutGbblockSource(TEntryList & seq_entries)488 void MaybeCutGbblockSource(TEntryList& seq_entries)
489 {
490     std::vector<std::string> names; /* 0 - taxname */
491                                     /* 1 - 254 old-name */
492                                     /* 2 etc. - common name */
493 
494     GetTaxnameName(seq_entries, names);
495 
496     if (!names[0].empty())
497         CheckDelGbblockSource(seq_entries, names);
498 }
499 
/**********************************************************
 *
 *   void MakeLocStrCompatible(str):
 *
 *      Rewrites a location label in place for backward
 *   compatibility:
 *
 *      - a leading '[' becomes '(' and a trailing ']'
 *        becomes ')';
 *      - every occurrence of "minus" is replaced by "c"
 *        (the form used by the C-toolkit version).
 *
 *      An empty string is left unchanged.
 *
 **********************************************************/
void MakeLocStrCompatible(std::string& str)
{
    static const char STR_TO_REPLACE[] = "minus";
    static const size_t STR_TO_REPLACE_LEN = sizeof(STR_TO_REPLACE) - 1;

    // changing brackets is for backward compatibility
    if (!str.empty())
    {
        if (str[0] == '[')
            str[0] = '(';

        const size_t last = str.size() - 1;
        if (str[last] == ']')
            str[last] = ')';
    }

    // for backward compatibility with C-toolkit version.
    // The search resumes right behind the inserted "c" instead of rescanning
    // from the start: "c" is not part of "minus", so a replacement can never
    // create a new match that begins at or before it.
    for (size_t pos = str.find(STR_TO_REPLACE);
         pos != std::string::npos;
         pos = str.find(STR_TO_REPLACE, pos + 1))
    {
        str.replace(pos, STR_TO_REPLACE_LEN, "c");
    }
}
524 
525 /**********************************************************/
location_to_string(const objects::CSeq_loc & loc)526 Char* location_to_string(const objects::CSeq_loc& loc)
527 {
528     std::string loc_str;
529     loc.GetLabel(&loc_str);
530 
531     MakeLocStrCompatible(loc_str);
532 
533     Char* ret = StringSave(loc_str.c_str());
534     if (ret != NULL && StringLen(ret) > 50)
535         ret[50] = '\0';
536 
537     return ret;
538 }
539 
540 END_NCBI_SCOPE
541