1 /* utilfeat.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: utilfeat.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description: functions for features parsing
32 *
33 */
34 #include <ncbi_pch.hpp>
35
36 #include "ftacpp.hpp"
37
38 #include <objects/seqfeat/BioSource.hpp>
39 #include <objects/seqset/Bioseq_set.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 #include <objects/seq/Seq_descr.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/SubSource.hpp>
45
46 #include "index.h"
47
48 #include <objtools/flatfile/flatdefn.h>
49
50 #include "ftaerr.hpp"
51 #include "asci_blk.h"
52 #include "add.h"
53 #include "utilfeat.h"
54 #include "utilfun.h"
55
56 #ifdef THIS_FILE
57 # undef THIS_FILE
58 #endif
59 #define THIS_FILE "utilfeat.cpp"
60
// Forward declaration of ValidAminoAcid(...), which finds the correct amino acid
// letter for a given abbreviation. The primary declaration is located in
// ../src/objtools/cleanup/cleanup_utils.hpp
// TODO: remove this forward declaration once ValidAminoAcid(...) has been moved
// into a public header file.
66 BEGIN_NCBI_SCOPE
67 BEGIN_SCOPE(objects)
68 char ValidAminoAcid(const string &abbrev);
69 END_SCOPE(objects)
70
71
/* Keywords looked up by GetGenomeInfo() through StringMatchIcase().
 * The position of a keyword in this table is what GetGenomeInfo() uses to
 * choose the genome value, so the order of the entries must not change.
 * The table is NULL-terminated.
 */
const char* ParFlat_GImod[] = {
    "Mitochondr",       /*  0 */
    "Chloroplast",      /*  1 */
    "Kinetoplas",       /*  2 */
    "Cyanelle",         /*  3 */
    "Chromoplast",      /*  4 */
    "Plastid",          /*  5 */
    "Macronuclear",     /*  6 */
    "Extrachrom",       /*  7 */
    "Plasmid",          /*  8 */
    "Leucoplast",       /*  9 */
    "Apicoplast",       /* 10 */
    NULL                /* end-of-table marker */
};
86
/* Organelle names that CheckDelGbblockSourceFromDescrs() accepts as the
 * leading word of a GenBank-block source string (looked up through
 * StringMatchIcase()). The table is NULL-terminated.
 */
const char* valid_organelle[] = {
    "apicoplast",       /*  0 */
    "chloroplast",      /*  1 */
    "chromatophore",    /*  2 */
    "chromoplast",      /*  3 */
    "cyanelle",         /*  4 */
    "hydrogenosome",    /*  5 */
    "kinetoplast",      /*  6 */
    "leucoplast",       /*  7 */
    "mitochondrion",    /*  8 */
    "nucleomorph",      /*  9 */
    "plastid",          /* 10 */
    "proplastid",       /* 11 */
    NULL                /* end-of-table marker */
};
102
103 /**********************************************************/
SeqLocHaveFuzz(const objects::CSeq_loc & loc)104 bool SeqLocHaveFuzz(const objects::CSeq_loc& loc)
105 {
106 bool flag;
107
108 std::string loc_str;
109 loc.GetLabel(&loc_str);
110
111 if (loc_str.find('<') == std::string::npos && loc_str.find('>') == std::string::npos)
112 flag = false;
113 else
114 flag = true;
115
116 return(flag);
117 }
118
119 /**********************************************************
120 *
121 * char* CpTheQualValue(qlist, qual):
122 *
123 * Return qual's value if found the "qual" in the
124 * "qlist"; otherwise, return NULL.
125 *
126 **********************************************************/
CpTheQualValue(const TQualVector & qlist,const Char * qual)127 char* CpTheQualValue(const TQualVector& qlist, const Char *qual)
128 {
129 std::string qvalue;
130 ITERATE(TQualVector, cur, qlist)
131 {
132 if ((*cur)->GetQual() != qual)
133 continue;
134
135 const std::string& val = (*cur)->GetVal();
136 if (val == "\"\"")
137 {
138 ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling,
139 "Empty qual %s : %s", qual, val.c_str());
140 break;
141 }
142
143 qvalue = NStr::Sanitize(val);
144 break;
145 }
146
147 char* ret = NULL;
148 if (!qvalue.empty())
149 ret = StringSave(qvalue.c_str());
150 return ret;
151 }
152
153 /**********************************************************
154 *
155 * char* GetTheQualValue(qlist, qual):
156 *
157 * Return qual's value if found the "qual" in the
158 * "qlist", and remove the "qual" from the qlist;
159 * otherwise, return NULL.
160 *
161 **********************************************************/
GetTheQualValue(TQualVector & qlist,const Char * qual)162 char* GetTheQualValue(TQualVector& qlist, const Char *qual)
163 {
164 char* qvalue = NULL;
165
166 NON_CONST_ITERATE(TQualVector, cur, qlist)
167 {
168 if ((*cur)->GetQual() != qual)
169 continue;
170
171 const std::string& val = (*cur)->GetVal();
172 if (val == "\"\"")
173 {
174 ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling,
175 "Empty qual %s : %s", qual, val.c_str());
176 break;
177 }
178
179 std::vector<Char> buf(val.begin(), val.end());
180 buf.push_back(0);
181 qvalue = tata_save(&buf[0]);
182
183 qlist.erase(cur);
184 break;
185 }
186
187 return(qvalue);
188 }
189
190 /**********************************************************
191 *
192 * bool DeleteQual(qlist, qual):
193 *
194 * Return TRUE the "qual" has found in and removed
195 * from the "qlist".
196 *
197 **********************************************************/
DeleteQual(TQualVector & qlist,const Char * qual)198 bool DeleteQual(TQualVector& qlist, const Char *qual)
199 {
200 bool got = false;
201 for (TQualVector::iterator cur = qlist.begin(); cur != qlist.end();)
202 {
203 if ((*cur)->GetQual() != qual)
204 {
205 ++cur;
206 continue;
207 }
208
209 cur = qlist.erase(cur);
210 got = true;
211 }
212
213 return(got);
214 }
215
216 /**********************************************************
217 *
218 * Uint1 GetQualValueAa(qual, checkseq):
219 *
220 * Return 255 if not a valid amino acid, not in
221 * "ParFlat_AA_array".
222 *
223 **********************************************************/
GetQualValueAa(char * qval,bool checkseq)224 Uint1 GetQualValueAa(char* qval, bool checkseq)
225 {
226 char* str;
227 char* p;
228 Uint1 aa;
229 Char ch;
230
231 str = StringStr(qval, "aa:");
232 if(str == NULL)
233 return(255);
234
235 for(str += 3; *str == ' ';)
236 str++;
237 for(p = str; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
238 p++;
239
240 if(checkseq && StringStr(p, "seq:") == NULL)
241 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_AntiCodonLacksSequence,
242 "Anticodon qualifier \"%s\" lacks a 'seq' field for the sequence of the anticodon.",
243 qval);
244 ch = *p;
245 *p = '\0';
246 aa = objects::ValidAminoAcid(str);
247 *p = ch;
248
249 return(aa);
250 }
251
252 /**********************************************************/
GetGenomeInfo(objects::CBioSource & bsp,const Char * bptr)253 bool GetGenomeInfo(objects::CBioSource& bsp, const Char* bptr)
254 {
255 Int4 i = StringMatchIcase(ParFlat_GImod, bptr);
256 if(i == -1)
257 return false;
258
259 if(i == 0)
260 bsp.SetGenome(5);
261 else if (i == 1)
262 bsp.SetGenome(2);
263 else if(i == 2)
264 bsp.SetGenome(4);
265 else if(i == 3)
266 bsp.SetGenome(12);
267 else if(i == 4)
268 bsp.SetGenome(3);
269 else if(i == 5)
270 bsp.SetGenome(6);
271 else if(i == 6)
272 bsp.SetGenome(7);
273 else if(i == 7)
274 bsp.SetGenome(8);
275 else if(i == 8)
276 bsp.SetGenome(9);
277 else
278 bsp.SetGenome(17);
279
280 return true;
281 }
282
283 /**********************************************************/
GetTaxnameNameFromDescrs(TSeqdescList & descrs,std::vector<std::string> & names)284 static void GetTaxnameNameFromDescrs(TSeqdescList& descrs, std::vector<std::string>& names)
285 {
286 NON_CONST_ITERATE(TSeqdescList, descr, descrs)
287 {
288 if (!(*descr)->IsSource() || !(*descr)->GetSource().IsSetOrg() ||
289 !(*descr)->GetSource().GetOrg().IsSetTaxname())
290 continue;
291
292 const objects::COrg_ref& org_ref = (*descr)->GetSource().GetOrg();
293 names[0] = org_ref.GetTaxname();
294
295 if (org_ref.IsSetOrgname() && org_ref.GetOrgname().IsSetMod())
296 {
297 ITERATE(objects::COrgName::TMod, mod, org_ref.GetOrgname().GetMod())
298 {
299 if (!(*mod)->IsSetSubname() || !(*mod)->IsSetSubtype())
300 continue;
301
302 int stype = (*mod)->GetSubtype();
303
304 if (stype == 254) /* old-name */
305 names[1] = (*mod)->GetSubname();
306 /* acronym(19), synonym(28), anamorph(29), teleomorph(30),
307 gb-acronym(32), gb-anamorph(33), gb-synonym(34) */
308 else if (stype == 19 || stype == 28 || stype == 29 ||
309 stype == 30 || stype == 32 || stype == 33 ||
310 stype == 34)
311 {
312 names.push_back((*mod)->GetSubname());
313 }
314 }
315 }
316
317 if ((*descr)->GetSource().IsSetSubtype())
318 {
319 ITERATE(objects::CBioSource::TSubtype, subtype, (*descr)->GetSource().GetSubtype())
320 {
321 /* subtype = "other"
322 */
323 if (!(*subtype)->IsSetSubtype() || (*subtype)->GetSubtype() != 255 || !(*subtype)->IsSetName())
324 continue;
325
326 const Char* p = StringIStr((*subtype)->GetName().c_str(), "common:");
327 if (p == NULL)
328 continue;
329
330 for (p += 7; *p == ' ';)
331 p++;
332 if (*p == '\0')
333 continue;
334
335 names.push_back(p);
336 }
337 }
338
339 if (org_ref.IsSetCommon())
340 names[2] = org_ref.GetCommon();
341
342 break;
343 }
344 }
345
346 /**********************************************************/
GetTaxnameName(TEntryList & seq_entries,std::vector<std::string> & names)347 static void GetTaxnameName(TEntryList& seq_entries, std::vector<std::string>& names)
348 {
349 names.resize(3);
350
351 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
352 {
353 for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*(*entry))); bio_set; ++bio_set)
354 {
355 if (bio_set->IsSetDescr())
356 GetTaxnameNameFromDescrs(bio_set->SetDescr().Set(), names);
357 }
358
359 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq)
360 {
361 if (bioseq->IsSetDescr())
362 GetTaxnameNameFromDescrs(bioseq->SetDescr().Set(), names);
363 }
364 }
365 }
366
367 /**********************************************************/
CheckDelGbblockSourceFromDescrs(TSeqdescList & descrs,const std::vector<std::string> & names)368 static void CheckDelGbblockSourceFromDescrs(TSeqdescList& descrs, const std::vector<std::string>& names)
369 {
370 NON_CONST_ITERATE(TSeqdescList, descr, descrs)
371 {
372 if (!(*descr)->IsGenbank())
373 continue;
374
375 if (!(*descr)->GetGenbank().IsSetSource())
376 break;
377
378 objects::CGB_block& gb_block = (*descr)->SetGenbank();
379 char* p = StringSave(gb_block.GetSource().c_str());
380 char* pper = 0;
381
382 size_t len = StringLen(p);
383 if (p[len - 1] == '.')
384 {
385 pper = StringSave(p);
386 p[len - 1] = '\0';
387 }
388
389 char* q = StringChr(p, ' ');
390 if (q != NULL)
391 *q = '\0';
392
393 if (StringMatchIcase(valid_organelle, p) > -1)
394 {
395 if (q != NULL)
396 {
397 for (q++; *q == ' ';)
398 q++;
399 fta_StringCpy(p, q);
400 }
401 }
402 else if (q != NULL)
403 *q = ' ';
404
405 std::vector<std::string>::const_iterator name = names.begin();
406 for (name += 2; name != names.end(); ++name)
407 {
408 if (name->empty())
409 continue;
410
411 len = name->size();
412 for (q = p;; q++)
413 {
414 q = StringChr(q, '(');
415 if (q == NULL)
416 break;
417 char* s = q + 1;
418 if (StringNCmp(s, "acronym:", 8) == 0 ||
419 StringNCmp(s, "synonym:", 8) == 0)
420 s += 8;
421 else if (StringNCmp(s, "anamorph:", 9) == 0)
422 s += 9;
423 else if (StringNCmp(s, "teleomorph:", 11) == 0)
424 s += 11;
425 if (*s == ' ')
426 while (*s == ' ')
427 s++;
428 if (StringNICmp(s, name->c_str(), len) == 0 && s[len] == ')')
429 {
430 char* t = NULL;
431 for (t = s + len + 1; *t == ' ';)
432 t++;
433 if (*t != '\0')
434 fta_StringCpy(q, t);
435 else
436 {
437 if (q > p)
438 q--;
439 *q = '\0';
440 }
441 break;
442 }
443 }
444 }
445
446 if (pper != NULL)
447 {
448 MemFree(pper);
449 pper = (char*)MemNew(StringLen(p) + 2);
450 StringCpy(pper, p);
451 StringCat(pper, ".");
452 }
453
454 const std::string& first_name = names[0];
455 const std::string& second_name = names[1];
456
457 if (StringICmp(p, first_name.c_str()) == 0 || (pper != NULL && StringICmp(pper, first_name.c_str()) == 0))
458 {
459 gb_block.ResetSource();
460 }
461 else if (StringICmp(p, second_name.c_str()) == 0 || (pper != NULL && StringICmp(pper, second_name.c_str()) == 0))
462 {
463 gb_block.ResetSource();
464 }
465
466 MemFree(p);
467 if (pper != NULL)
468 MemFree(pper);
469 break;
470 }
471 }
472
473 /**********************************************************/
CheckDelGbblockSource(TEntryList & seq_entries,std::vector<std::string> & names)474 static void CheckDelGbblockSource(TEntryList& seq_entries, std::vector<std::string>& names)
475 {
476 NON_CONST_ITERATE(TEntryList, entry, seq_entries)
477 {
478 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*(*entry))); bioseq; ++bioseq)
479 {
480 if (bioseq->IsSetDescr())
481 CheckDelGbblockSourceFromDescrs(bioseq->SetDescr().Set(), names);
482 }
483 }
484
485 }
486
487 /**********************************************************/
MaybeCutGbblockSource(TEntryList & seq_entries)488 void MaybeCutGbblockSource(TEntryList& seq_entries)
489 {
490 std::vector<std::string> names; /* 0 - taxname */
491 /* 1 - 254 old-name */
492 /* 2 etc. - common name */
493
494 GetTaxnameName(seq_entries, names);
495
496 if (!names[0].empty())
497 CheckDelGbblockSource(seq_entries, names);
498 }
499
500 /**********************************************************/
/* Rewrites a location label in place so that it matches the older format:
 *   - a leading '[' becomes '(' and a trailing ']' becomes ')';
 *   - every occurrence of "minus" is replaced by "c".
 * An empty string is left unchanged.
 */
void MakeLocStrCompatible(std::string& str)
{
    static const char kMinus[] = "minus";
    const size_t kMinusLen = sizeof(kMinus) - 1;

    // changing brackets is for backward compatibility
    if (!str.empty())
    {
        if (str[0] == '[')
            str[0] = '(';

        const size_t last = str.size() - 1;
        if (str[last] == ']')
            str[last] = ')';
    }

    // for backward compatibility with C-toolkit version
    // The search resumes just past the inserted "c" rather than at the start
    // of the string: "c" cannot be part of a new "minus", so nothing is missed
    // and the text already processed is not scanned again.
    for (size_t pos = str.find(kMinus);
         pos != std::string::npos;
         pos = str.find(kMinus, pos + 1))
    {
        str.replace(pos, kMinusLen, "c");
    }
}
524
525 /**********************************************************/
location_to_string(const objects::CSeq_loc & loc)526 Char* location_to_string(const objects::CSeq_loc& loc)
527 {
528 std::string loc_str;
529 loc.GetLabel(&loc_str);
530
531 MakeLocStrCompatible(loc_str);
532
533 Char* ret = StringSave(loc_str.c_str());
534 if (ret != NULL && StringLen(ret) > 50)
535 ret[50] = '\0';
536
537 return ret;
538 }
539
540 END_NCBI_SCOPE
541