1 /*   tax3api.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  tax3api.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   7/8/04
31 *
32 * $Revision: 1.94 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 #include <ncbi.h>
46 #include <objseq.h>
47 #include <objsset.h>
48 #include <tax3api.h>
49 #include <sqnutils.h>
50 #include <subutil.h>
51 #include <findrepl.h>
52 #define NLM_GENERATED_CODE_PROTO
53 #include <objmacro.h>
54 #include <macroapi.h>
55 
56 /* low-level connection functions */
57 
58 static Boolean text_tax_asn = FALSE;
59 static Boolean text_tax_set = FALSE;
60 
61 static Boolean test_tax_asn = FALSE;
62 static Boolean test_tax_set = FALSE;
63 
64 #if 1
65 static const CharPtr tax3servicename = "TaxService3";
66 #else
67 static const CharPtr tax3servicename = "TaxService3Test";
68 #endif
69 
/*
 * Tax3ReplyFixup
 *
 * Visits every T3Reply_data element of a taxonomy reply.  Where the
 * returned OrgName has a plastid genetic code of 0, the code is replaced
 * by the result of GetSpecialPlastidGenCode (taxname, lineage).
 * Other reply kinds, and data replies lacking an org or orgname, are left
 * untouched.  A NULL reply is ignored.
 */
static void Tax3ReplyFixup (
  Taxon3ReplyPtr t3ry
)

{
  T3DataPtr   data;
  OrgNamePtr  name;
  OrgRefPtr   org;
  T3ReplyPtr  reply;

  if (t3ry == NULL) return;

  for (reply = t3ry->reply; reply != NULL; reply = reply->next) {
    if (reply->choice == T3Reply_data) {
      data = (T3DataPtr) reply->data.ptrvalue;
      org = (data != NULL) ? (OrgRefPtr) data->org : NULL;
      name = (org != NULL) ? org->orgname : NULL;
      if (name != NULL && name->pgcode == 0) {
        name->pgcode = GetSpecialPlastidGenCode (org->taxname, name->lineage);
      }
    }
  }
}
94 
Tax3OpenConnection(void)95 NLM_EXTERN CONN Tax3OpenConnection (
96   void
97 )
98 
99 {
100 #ifdef OS_UNIX
101   CharPtr  str;
102 
103   if (! text_tax_set) {
104     str = (CharPtr) getenv ("TEXT_TAX_ASN");
105     if (StringDoesHaveText (str)) {
106       if (StringICmp (str, "TRUE") == 0) {
107         text_tax_asn = TRUE;
108       }
109     }
110     text_tax_set = TRUE;
111   }
112 
113   if (! test_tax_set) {
114     str = (CharPtr) getenv ("TEST_TAX_ASN");
115     if (StringDoesHaveText (str)) {
116       if (StringICmp (str, "TRUE") == 0) {
117         test_tax_asn = TRUE;
118       }
119     }
120     test_tax_set = TRUE;
121   }
122 #endif
123 
124   if (test_tax_asn) {
125     return QUERY_OpenServiceQuery ("TaxService3Test", NULL, 30);
126   }
127 
128   return QUERY_OpenServiceQuery (text_tax_asn ? "TaxService3Text" : tax3servicename, NULL, 30);
129 }
130 
131 #ifdef OS_MAC
132 #include <Events.h>
133 #endif
134 
Tax3WaitForReply(CONN conn)135 NLM_EXTERN Taxon3ReplyPtr Tax3WaitForReply (
136   CONN conn
137 )
138 
139 {
140   AsnIoConnPtr    aicp;
141   time_t          currtime, starttime;
142   time_t          max = 0;
143   EIO_Status      status;
144   STimeout        timeout;
145   Taxon3ReplyPtr  t3ry = NULL;
146 #ifdef OS_MAC
147   EventRecord     currEvent;
148 #endif
149 
150   if (conn == NULL) return NULL;
151 
152 #ifdef OS_MAC
153   timeout.sec = 0;
154   timeout.usec = 0;
155 #else
156   timeout.sec = 300;
157   timeout.usec = 0;
158 #endif
159 
160   starttime = GetSecs ();
161   while ((status = CONN_Wait (conn, eIO_Read, &timeout)) == eIO_Timeout && max < 300) {
162     currtime = GetSecs ();
163     max = currtime - starttime;
164 #ifdef OS_MAC
165     WaitNextEvent (0, &currEvent, 0, NULL);
166 #endif
167   }
168   if (status == eIO_Success) {
169     aicp = QUERY_AsnIoConnOpen (text_tax_asn ? "r" : "rb", conn);
170     t3ry = Taxon3ReplyAsnRead (aicp->aip, NULL);
171     Tax3ReplyFixup (t3ry);
172     QUERY_AsnIoConnClose (aicp);
173   }
174   CONN_Close (conn);
175 
176   return t3ry;
177 }
178 
179 /* high-level connection functions */
180 
Tax3SynchronousQuery(Taxon3RequestPtr t3rq)181 NLM_EXTERN Taxon3ReplyPtr Tax3SynchronousQuery (
182   Taxon3RequestPtr t3rq
183 )
184 
185 {
186   AsnIoConnPtr    aicp;
187   CONN            conn;
188   Taxon3ReplyPtr  t3ry;
189   time_t          t1, t2, t3;
190 
191   if (t3rq == NULL) return NULL;
192 
193   conn = Tax3OpenConnection ();
194 
195   if (conn == NULL) return NULL;
196 
197   aicp = QUERY_AsnIoConnOpen (text_tax_asn ? "w" : "wb", conn);
198 
199   Taxon3RequestAsnWrite (t3rq, aicp->aip, NULL);
200 
201   AsnIoFlush (aicp->aip);
202   QUERY_AsnIoConnClose (aicp);
203 
204   QUERY_SendQuery (conn);
205 
206   t1 = time(NULL);
207   t3ry = Tax3WaitForReply (conn);
208   t2 = time(NULL);
209   t3 = t2 - t1;
210 
211   return t3ry;
212 }
213 
Tax3AsynchronousQuery(Taxon3RequestPtr t3rq,QUEUE * queue,QueryResultProc resultproc,VoidPtr userdata)214 NLM_EXTERN Boolean Tax3AsynchronousQuery (
215   Taxon3RequestPtr t3rq,
216   QUEUE* queue,
217   QueryResultProc resultproc,
218   VoidPtr userdata
219 )
220 
221 {
222   AsnIoConnPtr  aicp;
223   CONN          conn;
224 
225   if (t3rq == NULL) return FALSE;
226 
227   conn = Tax3OpenConnection ();
228 
229   if (conn == NULL) return FALSE;
230 
231   aicp = QUERY_AsnIoConnOpen (text_tax_asn ? "w" : "wb", conn);
232 
233   Taxon3RequestAsnWrite (t3rq, aicp->aip, NULL);
234 
235   AsnIoFlush (aicp->aip);
236   QUERY_AsnIoConnClose (aicp);
237 
238   QUERY_SendQuery (conn);
239 
240   QUERY_AddToQueue (queue, conn, resultproc, userdata, TRUE);
241 
242   return TRUE;
243 }
244 
Tax3CheckQueue(QUEUE * queue)245 NLM_EXTERN Int4 Tax3CheckQueue (
246   QUEUE* queue
247 )
248 
249 {
250   return QUERY_CheckQueue (queue);
251 }
252 
Tax3ReadReply(CONN conn,EIO_Status status)253 NLM_EXTERN Taxon3ReplyPtr Tax3ReadReply (
254   CONN conn,
255   EIO_Status status
256 )
257 
258 {
259   AsnIoConnPtr    aicp;
260   Taxon3ReplyPtr  t3ry = NULL;
261 
262   if (conn != NULL && status == eIO_Success) {
263     aicp = QUERY_AsnIoConnOpen (text_tax_asn ? "r" : "rb", conn);
264     t3ry = Taxon3ReplyAsnRead (aicp->aip, NULL);
265     Tax3ReplyFixup (t3ry);
266     QUERY_AsnIoConnClose (aicp);
267   }
268   return t3ry;
269 }
270 
CreateTaxon3Request(Int4 taxid,CharPtr name,OrgRefPtr orp)271 NLM_EXTERN Taxon3RequestPtr CreateTaxon3Request (
272   Int4 taxid,
273   CharPtr name,
274   OrgRefPtr orp
275 )
276 
277 {
278   Taxon3RequestPtr  t2rp;
279 
280   t2rp = Taxon3RequestNew ();
281   if (t2rp == NULL) return NULL;
282 
283   if (StringDoesHaveText (name)) {
284     ValNodeCopyStr (&(t2rp->request), 2, name);
285   } else if (taxid > 0) {
286     ValNodeAddInt (&(t2rp->request), 1, taxid);
287   } else if (orp != NULL) {
288     orp = AsnIoMemCopy ((Pointer) orp,
289                         (AsnReadFunc) OrgRefAsnRead,
290                         (AsnWriteFunc) OrgRefAsnWrite);
291     ValNodeAddPointer (&(t2rp->request), 3, (Pointer) orp);
292   }
293 
294   return t2rp;
295 }
296 
297 
/*
 * SaveTaxon3Request
 *
 * Debugging aid: writes t3rp to the file at path through an AsnIo stream
 * opened in "w" mode.  Does nothing when t3rp is NULL or the file cannot
 * be opened.
 */
static void SaveTaxon3Request (Taxon3RequestPtr t3rp, CharPtr path)
{
  AsnIoPtr out;

  if (t3rp == NULL) return;

  out = AsnIoOpen (path, "w");
  if (out == NULL) return;

  Taxon3RequestAsnWrite (t3rp, out, NULL);
  AsnIoClose (out);
}
310 
CreateMultiTaxon3Request(ValNodePtr org_list)311 NLM_EXTERN Taxon3RequestPtr CreateMultiTaxon3Request (ValNodePtr org_list)
312 {
313   ValNodePtr vnp;
314   Taxon3RequestPtr t3rp;
315   OrgRefPtr orp;
316 
317   t3rp = Taxon3RequestNew ();
318   if (t3rp == NULL) return NULL;
319 
320   for (vnp = org_list; vnp != NULL; vnp = vnp->next)
321   {
322     switch (vnp->choice)
323     {
324       case T3Request_taxid:
325         ValNodeAddInt (&(t3rp->request), T3Request_taxid, vnp->data.intvalue);
326         break;
327       case T3Request_name:
328         ValNodeCopyStr (&(t3rp->request), T3Request_name, vnp->data.ptrvalue);
329         break;
330       case T3Request_org:
331         orp = AsnIoMemCopy (vnp->data.ptrvalue,
332                         (AsnReadFunc) OrgRefAsnRead,
333                         (AsnWriteFunc) OrgRefAsnWrite);
334         ValNodeAddPointer (&(t3rp->request), T3Request_org, (Pointer) orp);
335         break;
336     }
337   }
338 
339   /* SaveTaxon3Request(t3rp, "request.txt"); */
340   return t3rp;
341 }
342 
343 /* takes ValNode list of integers, creates request */
CreateJoinRequest(ValNodePtr taxon_list)344 NLM_EXTERN Taxon3RequestPtr CreateJoinRequest (ValNodePtr taxon_list)
345 {
346   Taxon3RequestPtr t3rp;
347   ValNodePtr vnp, data = NULL;
348 
349   t3rp = Taxon3RequestNew();
350   if (t3rp == NULL) return NULL;
351 
352   for (vnp = taxon_list; vnp != NULL; vnp = vnp->next) {
353     ValNodeAddInt (&data, T3Request_join, vnp->data.intvalue);
354   }
355   ValNodeAddPointer ((&t3rp->request), T3Request_join, data);
356 
357 /*  SaveTaxon3Request(t3rp, "join_request.txt"); */
358   return t3rp;
359 }
360 
361 
HasMisspellingFlag(T3DataPtr t)362 static Boolean HasMisspellingFlag (T3DataPtr t)
363 {
364   T3StatusFlagsPtr status;
365 
366   if (t == NULL) return FALSE;
367   status = t->status;
368   while (status != NULL) {
369     if (StringCmp (status->property, "misspelled_name") == 0) {
370       return TRUE;
371     }
372     status = status->next;
373   }
374   return FALSE;
375 }
376 
377 
GetStatusFlags(T3DataPtr t)378 static Uint1 GetStatusFlags (T3DataPtr t)
379 {
380   Uint1 flags = 0;
381   T3StatusFlagsPtr status;
382   ValNodePtr vnp;
383 
384   if (t == NULL) return FALSE;
385   status = t->status;
386   while (status != NULL) {
387     if (StringCmp (status->property, "unpublished_name") == 0) {
388       flags |= eReturnedOrgFlag_unpublished;
389     } else if (StringCmp (status->property, "misspelled_name") == 0) {
390       flags |= eReturnedOrgFlag_misspelled;
391     } else if (StringCmp (status->property, "old_name_class") == 0) {
392       for (vnp = status->Value_value; vnp != NULL; vnp = vnp->next) {
393         if (vnp->choice == Value_value_str) {
394           if (StringCmp ((CharPtr)(vnp->data.ptrvalue), "common name") == 0
395               || StringCmp ((CharPtr)(vnp->data.ptrvalue), "genbank common name")) {
396             flags |= eReturnedOrgFlag_common_name;
397           }
398         }
399       }
400     }
401 
402     status = status->next;
403   }
404   if (flags == 0) {
405     flags = eReturnedOrgFlag_normal;
406   }
407   return flags;
408 }
409 
410 
ObjectIdCompare(ObjectIdPtr a,ObjectIdPtr b)411 NLM_EXTERN int LIBCALL ObjectIdCompare (ObjectIdPtr a, ObjectIdPtr b)
412 {
413   int rval = 0;
414   Char buf[30];
415 
416     if (a == b) {
417         rval = 0;
418   } else if (a == NULL) {
419     rval = -1;
420   } else if (b == NULL) {
421     rval = 1;
422   } else if (a->str == NULL && b->str == NULL) {
423     if (a->id < b->id) {
424       rval = -1;
425     } else if (a->id > b->id) {
426       rval = 1;
427     }
428   } else if (a->str == NULL) {
429     sprintf (buf, "%d", a->id);
430     rval = StringCmp (buf, b->str);
431   } else if (b->str == NULL) {
432     sprintf (buf, "%d", b->id);
433     rval = StringCmp (a->str, buf);
434   } else {
435     rval = StringCmp (a->str, b->str);
436   }
437   return rval;
438 }
439 
440 
441 /*****************************************************************************
442 *
443 *   DbtagMatch(a, b)
444 *
445 *****************************************************************************/
DbtagCompare(DbtagPtr a,DbtagPtr b)446 NLM_EXTERN int LIBCALL DbtagCompare (DbtagPtr a, DbtagPtr b)
447 {
448   int rval = 0;
449 
450     if (a == b) {
451         rval = 0;
452   } else if (a == NULL) {
453     rval = -1;
454   } else if (b == NULL) {
455     rval = 1;
456   } else if ((rval = StringICmp (a->db, b->db)) == 0) {
457     rval = ObjectIdCompare (a->tag, b->tag);
458   }
459   return rval;
460 }
461 
462 
SortVnpByDbtag(VoidPtr ptr1,VoidPtr ptr2)463 static int LIBCALLBACK SortVnpByDbtag (VoidPtr ptr1, VoidPtr ptr2)
464 
465 {
466   ValNodePtr  vnp1;
467   ValNodePtr  vnp2;
468 
469   if (ptr1 != NULL && ptr2 != NULL) {
470     vnp1 = *((ValNodePtr PNTR) ptr1);
471     vnp2 = *((ValNodePtr PNTR) ptr2);
472     if (vnp1 != NULL && vnp2 != NULL) {
473       return DbtagCompare (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
474     }
475   }
476   return 0;
477 }
478 
OrgModSetCompare(OrgModPtr mod1,OrgModPtr mod2)479 NLM_EXTERN int LIBCALL OrgModSetCompare (OrgModPtr mod1, OrgModPtr mod2)
480 {
481   int rval = 0;
482 
483   while (mod1 != NULL && mod2 != NULL && rval == 0)
484   {
485     if (mod1->subtype < mod2->subtype)
486     {
487       rval = -1;
488     }
489     else if (mod1->subtype > mod2->subtype)
490     {
491       rval = 1;
492     }
493     else if ((rval = StringCmp (mod1->subname, mod2->subname)) == 0
494            && (rval = StringCmp (mod1->attrib, mod2->attrib)) == 0)
495     {
496       mod1 = mod1->next;
497       mod2 = mod2->next;
498     }
499   }
500 
501   if (rval == 0)
502   {
503     if (mod1 == NULL && mod2 == NULL)
504     {
505       rval = 0;
506     }
507     else if (mod1 == NULL)
508     {
509       rval = -1;
510     }
511     else if (mod2 == NULL)
512     {
513       rval = 1;
514     }
515   }
516   return rval;
517 }
518 
519 
OrgNameCompare(OrgNamePtr onp1,OrgNamePtr onp2)520 NLM_EXTERN int LIBCALL OrgNameCompare (OrgNamePtr onp1, OrgNamePtr onp2)
521 {
522   int rval = 0;
523 
524   while (onp1 != NULL && onp2 != NULL && rval == 0)
525   {
526     if ((rval = OrgModSetCompare(onp1->mod, onp2->mod)) != 0
527         || (rval = StringCmp (onp1->lineage, onp2->lineage)) != 0
528         || (rval = StringCmp (onp1->div, onp2->div)) != 0
529         || (rval = StringCmp (onp1->attrib, onp2->attrib)) != 0)
530     {
531       /* no further processing */
532     }
533     else if (onp1->choice < onp2->choice)
534     {
535       rval = -1;
536     }
537     else if (onp1->choice > onp2->choice)
538     {
539       rval = 1;
540     }
541     else if (onp1->gcode < onp2->gcode)
542     {
543       rval = -1;
544     }
545     else if (onp1->gcode > onp2->gcode)
546     {
547       rval = 1;
548     }
549     else if (onp1->mgcode < onp2->mgcode)
550     {
551       rval = -1;
552     }
553     else if (onp1->mgcode > onp2->mgcode)
554     {
555       rval = 1;
556     }
557     else if (onp1->pgcode < onp2->pgcode)
558     {
559       rval = -1;
560     }
561     else if (onp1->pgcode > onp2->pgcode)
562     {
563       rval = 1;
564     }
565     onp1 = onp1->next;
566     onp2 = onp2->next;
567   }
568   if (rval == 0)
569   {
570     if (onp1 == NULL && onp2 == NULL)
571     {
572       rval = 0;
573     }
574     else if (onp1 == NULL)
575     {
576       rval = -1;
577     }
578     else if (onp2 == NULL)
579     {
580       rval = 1;
581     }
582   }
583   return rval;
584 }
585 
586 
587 /*****************************************************************************
588 *
589 *   OrgRefCompare (orp1, orp2)
590 *
591 *****************************************************************************/
OrgRefCompare(OrgRefPtr orp1,OrgRefPtr orp2)592 NLM_EXTERN int LIBCALL OrgRefCompare (OrgRefPtr orp1, OrgRefPtr orp2)
593 {
594   int rval = 0;
595   if (orp1 == NULL && orp2 == NULL)
596   {
597     return 0;
598   }
599   else if (orp1 == NULL)
600   {
601     return -1;
602   }
603   else if (orp2 == NULL)
604   {
605     return 1;
606   }
607   else if ((rval = StringCmp (orp1->taxname, orp2->taxname)) != 0)
608   {
609     return rval;
610   }
611   else if ((rval = StringCmp (orp1->common, orp2->common)) != 0)
612   {
613     return rval;
614   }
615   else if ((rval = ValNodeCompare (orp1->syn, orp2->syn, SortVnpByString)) != 0)
616   {
617     return rval;
618   }
619   else if ((rval = ValNodeCompare (orp1->db, orp2->db, SortVnpByDbtag)) != 0)
620   {
621     return rval;
622   }
623   else
624   {
625     rval = OrgNameCompare (orp1->orgname, orp2->orgname);
626   }
627   return rval;
628 }
629 
630 
SortVnpByOrgRef(VoidPtr ptr1,VoidPtr ptr2)631 static int LIBCALLBACK SortVnpByOrgRef (VoidPtr ptr1, VoidPtr ptr2)
632 
633 {
634   ValNodePtr  vnp1;
635   ValNodePtr  vnp2;
636 
637   if (ptr1 != NULL && ptr2 != NULL) {
638     vnp1 = *((ValNodePtr PNTR) ptr1);
639     vnp2 = *((ValNodePtr PNTR) ptr2);
640     if (vnp1 != NULL && vnp2 != NULL) {
641       return OrgRefCompare (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
642     }
643   }
644   return 0;
645 }
646 
647 
Taxon3GetOrgRefList(ValNodePtr org_list)648 NLM_EXTERN ValNodePtr Taxon3GetOrgRefList (ValNodePtr org_list)
649 {
650   Taxon3RequestPtr t3rq;
651   Taxon3ReplyPtr   t3ry;
652   T3DataPtr        tdp;
653   OrgRefPtr        t3orp = NULL;
654   T3ReplyPtr       trp;
655   T3ErrorPtr       tep;
656   ValNodePtr       uniq_list, response_list = NULL, next_org_list, last_org;
657   Int4             request_num, max_requests = 2000;
658   ValNodePtr PNTR  ptr_array;
659   ValNodePtr       vnp, vnp_rq, vnp_rp;
660   Int4             i, num_orgs;
661   Uint1            choice;
662   TextFsaPtr       tags;
663 
664   if (org_list == NULL) {
665     return NULL;
666   }
667 
668   tags = GetOrgModSearch();
669 
670   /* make a copy of the original list - we will prepare the response list by substituting the OrgRef */
671   org_list = ValNodeCopyPtr (org_list);
672 
673   /* make array to show original order of ValNodes, so that we can restore after sorting */
674   num_orgs = ValNodeLen (org_list);
675   ptr_array = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_orgs);
676   for (vnp = org_list, i = 0; vnp != NULL; vnp = vnp->next, i++) {
677     ptr_array[i] = vnp;
678   }
679 
680   org_list = ValNodeSort (org_list, SortVnpByOrgRef);
681 
682   /* now make a list of just the unique requests */
683   uniq_list = ValNodeCopyPtr (org_list);
684   ValNodeUnique (&uniq_list, SortVnpByOrgRef, ValNodeFree);
685 
686   /* now break large lists into manageable chunks */
687   vnp = uniq_list;
688   while (vnp != NULL) {
689     next_org_list = vnp->next;
690     last_org = vnp;
691     request_num = 1;
692     while (next_org_list != NULL && request_num < max_requests) {
693       last_org = next_org_list;
694       next_org_list = next_org_list->next;
695       request_num++;
696     }
697     if (last_org != NULL) {
698       last_org->next = NULL;
699     }
700 
701     /* now create the request */
702 
703     t3rq = CreateMultiTaxon3Request (vnp);
704     if (t3rq == NULL) return NULL;
705     t3ry = Tax3SynchronousQuery (t3rq);
706     Taxon3RequestFree (t3rq);
707     if (t3ry != NULL) {
708       for (trp = t3ry->reply; trp != NULL; trp = trp->next) {
709         switch (trp->choice) {
710           case T3Reply_error :
711             tep = (T3ErrorPtr) trp->data.ptrvalue;
712             if (tep != NULL) {
713               ErrPostEx (SEV_ERROR, 0, 0, tep->message);
714             }
715             if (tep != NULL && StringStr (tep->message, "ambiguous") != NULL) {
716               ValNodeAddPointer (&response_list, eReturnedOrgFlag_ambiguous, NULL);
717             } else {
718               ValNodeAddPointer (&response_list, eReturnedOrgFlag_error, NULL);
719             }
720             break;
721           case T3Reply_data :
722             tdp = (T3DataPtr) trp->data.ptrvalue;
723             if (tdp != NULL) {
724               t3orp = (OrgRefPtr)(tdp->org);
725               choice = GetStatusFlags (tdp);
726               /* disable
727               ParseTaxNameToQuals(t3orp, tags); */
728               ValNodeAddPointer (&response_list, choice, (Pointer) t3orp);
729               tdp->org = NULL;
730             }
731             break;
732           default :
733             break;
734         }
735       }
736       Taxon3ReplyFree (t3ry);
737     }
738 
739     if (last_org != NULL) {
740         last_org->next = next_org_list;
741     }
742     vnp = next_org_list;
743   }
744 
745   /* now put responses in list */
746   vnp = uniq_list;
747   vnp_rq = org_list;
748   vnp_rp = response_list;
749 
750   while (vnp != NULL && vnp_rq != NULL && vnp_rp != NULL) {
751     while (vnp_rq != NULL && OrgRefCompare (vnp->data.ptrvalue, vnp_rq->data.ptrvalue) == 0) {
752       vnp_rq->data.ptrvalue = AsnIoMemCopy (vnp_rp->data.ptrvalue, (AsnReadFunc) OrgRefAsnRead, (AsnWriteFunc) OrgRefAsnWrite);
753       vnp_rq->choice = vnp_rp->choice;
754       vnp_rq = vnp_rq->next;
755     }
756     vnp_rp->data.ptrvalue = OrgRefFree (vnp_rp->data.ptrvalue);
757     vnp_rp = vnp_rp->next;
758     vnp = vnp->next;
759   }
760   /* if there were more requests than responses, set responses to NULL */
761   while (vnp_rq != NULL) {
762     vnp_rq->data.ptrvalue = NULL;
763     vnp_rq = vnp_rq->next;
764   }
765   /* if there were more responses than requests, free extra responses */
766   while (vnp_rp != NULL) {
767     vnp_rp->data.ptrvalue = OrgRefFree (vnp_rp->data.ptrvalue);
768     vnp_rp = vnp_rp->next;
769   }
770   response_list = ValNodeFree (response_list);
771   uniq_list = ValNodeFree (uniq_list);
772 
773   /* now restore original order */
774   for (i = 0; i < num_orgs - 1; i++) {
775     ptr_array[i]->next = ptr_array[i + 1];
776   }
777   ptr_array[num_orgs - 1]->next = NULL;
778   org_list = ptr_array[0];
779   ptr_array = MemFree (ptr_array);
780   tags = TextFsaFree (tags);
781 
782   return org_list;
783 }
784 
785 
TaxFixItemNew(void)786 NLM_EXTERN TaxFixItemPtr TaxFixItemNew (void)
787 {
788   TaxFixItemPtr t;
789 
790   t = (TaxFixItemPtr) MemNew (sizeof (TaxFixItemData));
791   MemSet (t, 0, sizeof (TaxFixItemData));
792   return t;
793 }
794 
795 
TaxFixItemFree(TaxFixItemPtr t)796 NLM_EXTERN TaxFixItemPtr TaxFixItemFree (TaxFixItemPtr t)
797 {
798   if (t != NULL) {
799     t->response_org = OrgRefFree (t->response_org);
800     t->taxname = MemFree (t->taxname);
801     t->suggested_fix = MemFree (t->suggested_fix);
802     t->rank = MemFree (t->rank);
803     t = MemFree (t);
804   }
805   return t;
806 }
807 
808 
TaxFixItemListFree(ValNodePtr vnp)809 NLM_EXTERN ValNodePtr LIBCALLBACK TaxFixItemListFree (ValNodePtr vnp)
810 {
811   ValNodePtr vnp_next;
812 
813   while (vnp != NULL) {
814     vnp_next = vnp->next;
815     vnp->next = NULL;
816     vnp->data.ptrvalue = TaxFixItemFree (vnp->data.ptrvalue);
817     vnp = ValNodeFree (vnp);
818     vnp = vnp_next;
819   }
820   return vnp;
821 }
822 
823 
/*
 * StringSum
 *
 * Returns a newly allocated string holding str1 followed by str2.
 * A NULL argument is treated as absent: the result is then a copy of the
 * other argument, or NULL when both are NULL.  Also returns NULL if the
 * result buffer cannot be allocated.  The caller frees the result.
 */
static CharPtr StringSum (CharPtr str1, CharPtr str2)
{
  CharPtr sum;

  if (str1 == NULL && str2 == NULL) return NULL;
  if (str1 == NULL) return StringSave (str2);
  if (str2 == NULL) return StringSave (str1);

  sum = (CharPtr) MemNew (sizeof (Char) * (StringLen (str1) + StringLen (str2) + 1));
  if (sum == NULL) return NULL;   /* never format into a failed allocation */

  sprintf (sum, "%s%s", str1, str2);
  return sum;
}
840 
841 
/*
 * MakeUnculturedName
 *
 * Builds "uncultured <taxname><suffix>" as a newly allocated string.
 *   - the "uncultured " prefix is omitted when taxname already starts
 *     with it
 *   - suffix is omitted when it is NULL or taxname already ends with it
 *
 * Returns NULL when taxname is NULL or the result cannot be allocated.
 * The caller frees the result.
 */
static CharPtr MakeUnculturedName (CharPtr taxname, CharPtr suffix)
{
  CharPtr uncultured = "uncultured ";
  Int4    len, taxname_len, suffix_len, uncultured_len;
  CharPtr name = NULL;
  Boolean add_suffix = FALSE, add_uncultured = FALSE;

  if (taxname == NULL) {
    return NULL;
  }

  taxname_len = StringLen (taxname);
  len = taxname_len + 1;   /* + 1 for the terminating NUL */

  if (suffix != NULL) {
    suffix_len = StringLen (suffix);
    /* the tail to compare starts suffix_len characters before the end of
       the text; the NUL byte counted in len must not shift that offset */
    if (taxname_len < suffix_len
        || StringCmp (taxname + taxname_len - suffix_len, suffix) != 0) {
      add_suffix = TRUE;
      len += suffix_len;
    }
  }

  uncultured_len = StringLen (uncultured);
  if (StringNCmp (taxname, uncultured, uncultured_len) != 0) {
    add_uncultured = TRUE;
    len += uncultured_len;
  }

  name = (CharPtr) MemNew (sizeof (Char) * len);
  if (name == NULL) {
    return NULL;
  }
  name[0] = 0;
  if (add_uncultured) {
    StringCat (name, uncultured);
  }
  StringCat (name, taxname);
  if (add_suffix) {
    StringCat (name, suffix);
  }
  return name;
}
881 
882 
IsArchaea(OrgRefPtr response_org)883 static Boolean IsArchaea(OrgRefPtr response_org)
884 {
885     if (response_org != NULL && response_org->orgname != NULL
886         && StringISearch (response_org->orgname->lineage, "archaea") != NULL) {
887         return TRUE;
888     } else {
889         return FALSE;
890     }
891 }
892 
893 
IsBacteria(OrgRefPtr response_org)894 static Boolean IsBacteria(OrgRefPtr response_org)
895 {
896     if (response_org != NULL && response_org->orgname != NULL
897         && StringISearch (response_org->orgname->lineage, "bacteria") != NULL) {
898         return TRUE;
899     } else {
900         return FALSE;
901     }
902 }
903 
904 
IsFungi(OrgRefPtr response_org)905 static Boolean IsFungi(OrgRefPtr response_org)
906 {
907     if (response_org != NULL && response_org->orgname != NULL
908         && StringISearch (response_org->orgname->lineage, " Fungi;") != NULL) {
909         return TRUE;
910     } else {
911         return FALSE;
912     }
913 }
914 
915 
SuggestedTaxNameFixFromOrgAndRank(TaxFixItemPtr t)916 static CharPtr SuggestedTaxNameFixFromOrgAndRank (TaxFixItemPtr t)
917 {
918   CharPtr fix = NULL;
919   CharPtr stop;
920 
921   if (t == NULL || t->response_org == NULL) {
922     return NULL;
923   }
924 
925   if (StringICmp (t->rank, "species") == 0) {
926     if (t->is_species_specific) {
927       fix = StringSave (t->response_org->taxname);
928     } else {
929       /* truncate binomial, need to check again */
930       stop = StringChr (t->taxname, ' ');
931       if (stop != NULL) {
932         *stop = 0;
933         t->truncate_binomial = TRUE;
934       }
935     }
936   } else if (t->response_org->orgname != NULL) {
937     if (StringICmp (t->rank, "genus") == 0) {
938       if (IsArchaea(t->response_org) || IsBacteria(t->response_org)) {
939         fix = MakeUnculturedName (t->response_org->taxname, " sp.");
940       } else if (IsFungi(t->response_org)) {
941         fix = MakeUnculturedName (t->response_org->taxname, NULL);
942       }
943     } else {
944       if (IsArchaea(t->response_org)) {
945         fix = MakeUnculturedName (t->response_org->taxname, " archaeon");
946       } else if (IsBacteria(t->response_org)) {
947         fix = MakeUnculturedName (t->response_org->taxname, " bacterium");
948       } else if (IsFungi(t->response_org)) {
949         fix = MakeUnculturedName (t->response_org->taxname, NULL);
950       }
951     }
952     if (t->is_species_specific) {
953       t->remove_species_specific = TRUE;
954     }
955   }
956   return fix;
957 }
958 
959 
/*
 * StripCommas
 *
 * Removes commas from str in place:
 *   - a comma followed by a space, or at the very end, is dropped
 *   - any other comma is replaced by a single space
 * All other characters are kept.  A NULL str is ignored.
 */
static void StripCommas (CharPtr str)
{
  CharPtr rd, wr;

  if (str == NULL) {
    return;
  }

  wr = str;
  for (rd = str; *rd != 0; rd++) {
    if (*rd != ',') {
      *wr++ = *rd;
    } else if (rd[1] != 0 && rd[1] != ' ') {
      /* comma glued to the next word: keep the words apart */
      *wr++ = ' ';
    }
  }
  *wr = 0;
}
988 
989 
/*
 * MakeTaxFixRequestList
 *
 * Builds a T3Request_org request list with one entry per node of
 * biop_list, in the same order.  Each entry is a private copy of the
 * BioSource's OrgRef whose taxname has been normalised:
 *   - a trailing " sp" gets a period appended
 *   - commas are removed (see StripCommas)
 *
 * When a node yields no BioSource or no OrgRef, an empty OrgRef is used
 * instead so the entry count and order still match biop_list.
 * NOTE(review): callers presumably pair requests with biop_list by
 * position - confirm.
 *
 * The caller owns the returned list and the OrgRefs in it.
 */
static ValNodePtr MakeTaxFixRequestList (ValNodePtr biop_list)
{
  ValNodePtr rq_list = NULL, prev = NULL;
  BioSourcePtr biop;
  OrgRefPtr  org;
  CharPtr    new_name;
  Int4       len;

  while (biop_list != NULL) {
    biop = GetBioSourceFromObject (biop_list->choice, biop_list->data.ptrvalue);
    org = NULL;
    if (biop != NULL && biop->org != NULL) {
      org = AsnIoMemCopy (biop->org, (AsnReadFunc) OrgRefAsnRead, (AsnWriteFunc) OrgRefAsnWrite);
    }
    if (org == NULL) {
      /* placeholder keeps this position occupied in the request list */
      org = OrgRefNew ();
    }

    if (org != NULL) {
      /* add period to end of sp */
      if ((len = StringLen (org->taxname)) > 3 && StringCmp (org->taxname + len - 3, " sp") == 0) {
        new_name = StringSum (org->taxname, ".");
        if (new_name != NULL) {
          /* only swap names once the replacement exists */
          org->taxname = MemFree (org->taxname);
          org->taxname = new_name;
        }
      }
      /* strip commas */
      StripCommas (org->taxname);
    }

    ValNodeAddPointer (&prev, T3Request_org, org);
    if (rq_list == NULL) {
      rq_list = prev;
    }
    biop_list = biop_list->next;
  }
  return rq_list;
}
1018 
1019 
/*
 * CheckSuggestedFixes
 *
 * Verifies the suggested_fix of every TaxFixItem in tax_fix_list against
 * the taxonomy server.
 *
 * Each non-NULL suggested_fix is sent as a name request, in chunks of at
 * most 2000.  Afterwards the item's suggested_fix is replaced by
 *   - the taxname returned by the server, when the reply carries a
 *     "rank" status whose first value is the string "species"
 *   - NULL for every other outcome (error reply, other rank, no usable
 *     OrgRef in the reply)
 * Items whose suggested_fix is NULL are skipped.
 *
 * A chunk whose query fails, or which yields fewer replies than requests,
 * is padded with NULL results so that later chunks stay lined up with
 * their own items; the affected suggestions are therefore cleared rather
 * than kept unverified.  Replies beyond the number of requests in a
 * chunk are ignored.
 *
 * If a request cannot be built, the function returns without touching
 * any suggested_fix.
 */
static void CheckSuggestedFixes (ValNodePtr tax_fix_list)
{
  ValNodePtr rq_list = NULL, rp_list = NULL, prev, vnp_rq, vnp_rp;
  ValNodePtr vnp, next_org_list, last_org;
  Int4             request_num, reply_num, max_requests = 2000;
  TaxFixItemPtr    t;
  Taxon3RequestPtr t3rq;
  Taxon3ReplyPtr   t3ry;
  T3DataPtr        tdp;
  T3ReplyPtr       trp;
  T3StatusFlagsPtr tfp;
  OrgRefPtr        org;
  Boolean          is_species;

  /* one name request per item that has a suggestion, in list order */
  prev = NULL;
  for (vnp = tax_fix_list; vnp != NULL; vnp = vnp->next) {
    t = (TaxFixItemPtr) vnp->data.ptrvalue;
    if (t != NULL && t->suggested_fix != NULL) {
      ValNodeAddPointer (&prev, T3Request_name, StringSave (t->suggested_fix));
    }
    if (rq_list == NULL) {
      rq_list = prev;
    }
  }

  /* now break large lists into manageable chunks */
  vnp = rq_list;
  while (vnp != NULL) {
    /* temporarily cut the list after at most max_requests nodes */
    next_org_list = vnp->next;
    last_org = vnp;
    request_num = 1;
    while (next_org_list != NULL && request_num < max_requests) {
      last_org = next_org_list;
      next_org_list = next_org_list->next;
      request_num++;
    }
    last_org->next = NULL;

    /* now create the request */

    t3rq = CreateMultiTaxon3Request (vnp);
    if (t3rq == NULL) {
      /* rejoin the chunk so the whole request list is released */
      last_org->next = next_org_list;
      rq_list = ValNodeFreeData (rq_list);
      rp_list = ValNodeFreeData (rp_list);
      return;
    }
    t3ry = Tax3SynchronousQuery (t3rq);
    Taxon3RequestFree (t3rq);

    reply_num = 0;
    if (t3ry != NULL) {
      /* never accept more replies than requests were sent in this chunk */
      for (trp = t3ry->reply; trp != NULL && reply_num < request_num; trp = trp->next) {
        switch (trp->choice) {
          case T3Reply_error :
            ValNodeAddPointer (&rp_list, 0, NULL);
            reply_num++;
            break;
          case T3Reply_data :
            tdp = (T3DataPtr) trp->data.ptrvalue;
            is_species = FALSE;
            org = NULL;
            if (tdp != NULL) {
              org = (OrgRefPtr) tdp->org;
              for (tfp = tdp->status; tfp != NULL; tfp = tfp->next) {
                if (StringICmp (tfp->property, "rank") == 0
                    && tfp->Value_value != NULL
                    && tfp->Value_value->choice == Value_value_str
                    && StringICmp (tfp->Value_value->data.ptrvalue, "species") == 0) {
                  is_species = TRUE;
                }
              }
            }
            /* a species-rank reply without an OrgRef has no name to offer */
            if (is_species && org != NULL) {
              ValNodeAddPointer (&rp_list, 0, StringSave (org->taxname));
            } else {
              ValNodeAddPointer (&rp_list, 0, NULL);
            }
            reply_num++;
            break;
          default :
            break;
        }
      }
      Taxon3ReplyFree (t3ry);
    }

    /* a failed or short reply must not shift the results of later chunks
       onto the wrong items */
    while (reply_num < request_num) {
      ValNodeAddPointer (&rp_list, 0, NULL);
      reply_num++;
    }

    /* rejoin the chunk with the rest of the list */
    last_org->next = next_org_list;
    vnp = next_org_list;
  }
  rq_list = ValNodeFreeData (rq_list);

  /* adjust suggested fixes: results are in the same order as the items
     that had a suggestion when the requests were built */
  vnp_rp = rp_list;
  for (vnp_rq = tax_fix_list; vnp_rq != NULL && vnp_rp != NULL; vnp_rq = vnp_rq->next) {
    t = (TaxFixItemPtr) vnp_rq->data.ptrvalue;
    if (t == NULL || t->suggested_fix == NULL) continue;

    t->suggested_fix = MemFree (t->suggested_fix);
    /* the item takes over the verified name (or NULL) from the result node */
    t->suggested_fix = vnp_rp->data.ptrvalue;
    vnp_rp->data.ptrvalue = NULL;
    vnp_rp = vnp_rp->next;
  }
  rp_list = ValNodeFreeData (rp_list);
}
1128 
1129 
1130 typedef CharPtr (*TryTaxFixChangeFunc) PROTO ((CharPtr orig));
1131 
TryNewSuggestedFixes(ValNodePtr tax_fix_list,TryTaxFixChangeFunc func)1132 static void TryNewSuggestedFixes (ValNodePtr tax_fix_list, TryTaxFixChangeFunc func)
1133 {
1134   ValNodePtr rq_list = NULL, rp_list = NULL, prev, vnp_rq, vnp_rp;
1135   ValNodePtr vnp, next_org_list, last_org;
1136   Int4             request_num, max_requests = 2000;
1137   TaxFixItemPtr t;
1138   Taxon3RequestPtr t3rq;
1139   Taxon3ReplyPtr   t3ry;
1140   T3DataPtr        tdp;
1141   T3ReplyPtr       trp;
1142   T3ErrorPtr       tep;
1143   T3StatusFlagsPtr tfp;
1144   OrgRefPtr        org;
1145   Boolean          is_species;
1146   ValNodePtr       fix_copy = NULL, fix_prev = NULL;
1147   CharPtr          tmp;
1148 
1149   prev = NULL;
1150   for (vnp = tax_fix_list; vnp != NULL; vnp = vnp->next) {
1151     t = (TaxFixItemPtr) vnp->data.ptrvalue;
1152     if (t->suggested_fix == NULL) {
1153       tmp = func(t->taxname);
1154       if (tmp != NULL) {
1155         ValNodeAddPointer (&prev, 2, tmp);
1156         if (rq_list == NULL) {
1157           rq_list = prev;
1158         }
1159         ValNodeAddPointer (&fix_prev, 0, t);
1160         if (fix_copy == NULL) {
1161           fix_copy = fix_prev;
1162         }
1163       }
1164     }
1165   }
1166 
1167   /* now break large lists into manageable chunks */
1168   vnp = rq_list;
1169   while (vnp != NULL) {
1170     next_org_list = vnp->next;
1171     last_org = vnp;
1172     request_num = 1;
1173     while (next_org_list != NULL && request_num < max_requests) {
1174       last_org = next_org_list;
1175       next_org_list = next_org_list->next;
1176       request_num++;
1177     }
1178     if (last_org != NULL) {
1179       last_org->next = NULL;
1180     }
1181 
1182     /* now create the request */
1183 
1184     t3rq = CreateMultiTaxon3Request (vnp);
1185     if (t3rq == NULL) return;
1186     t3ry = Tax3SynchronousQuery (t3rq);
1187     Taxon3RequestFree (t3rq);
1188     if (t3ry != NULL) {
1189       for (trp = t3ry->reply; trp != NULL; trp = trp->next) {
1190         switch (trp->choice) {
1191           case T3Reply_error :
1192             tep = (T3ErrorPtr) trp->data.ptrvalue;
1193             ValNodeAddPointer (&rp_list, 0, NULL);
1194             break;
1195           case T3Reply_data :
1196             tdp = (T3DataPtr) trp->data.ptrvalue;
1197             is_species = FALSE;
1198             if (tdp != NULL) {
1199               for (tfp = tdp->status; tfp != NULL; tfp = tfp->next) {
1200                 if (StringICmp (tfp->property, "rank") == 0
1201                     && tfp->Value_value != NULL
1202                     && tfp->Value_value->choice == Value_value_str
1203                     && StringICmp (tfp->Value_value->data.ptrvalue, "species") == 0) {
1204                   is_species = TRUE;
1205                 }
1206               }
1207             }
1208             if (is_species) {
1209               org = (OrgRefPtr) tdp->org;
1210               ValNodeAddPointer (&rp_list, 0, StringSave (org->taxname));
1211             } else {
1212               ValNodeAddPointer (&rp_list, 0, NULL);
1213             }
1214             break;
1215           default :
1216             ValNodeAddPointer (&rp_list, 0, NULL);
1217             break;
1218         }
1219       }
1220       Taxon3ReplyFree (t3ry);
1221     }
1222 
1223     if (last_org != NULL) {
1224         last_org->next = next_org_list;
1225     }
1226     vnp = next_org_list;
1227   }
1228   rq_list = ValNodeFreeData (rq_list);
1229 
1230   /* adjust suggested fixes */
1231   vnp_rq = fix_copy;
1232   vnp_rp = rp_list;
1233 
1234   while (vnp_rq != NULL && vnp_rp != NULL) {
1235     t = (TaxFixItemPtr) vnp_rq->data.ptrvalue;
1236     t->suggested_fix = MemFree (t->suggested_fix);
1237     t->suggested_fix = vnp_rp->data.ptrvalue;
1238     vnp_rp->data.ptrvalue = NULL;
1239     vnp_rq = vnp_rq->next;
1240     vnp_rp = vnp_rp->next;
1241   }
1242   rp_list = ValNodeFreeData (rp_list);
1243   /* note - fix_copy points to data that will be freed elsewhere */
1244   fix_copy = ValNodeFree (fix_copy);
1245 }
1246 
1247 static const CharPtr SUncultured = "uncultured ";
1248 const Int4 SUnculturedLen = 11;
1249 
StandardFixes(CharPtr orig)1250 static CharPtr StandardFixes (CharPtr orig)
1251 {
1252   CharPtr val = NULL;
1253   CharPtr cp, src, dst;
1254   Int4    len;
1255 
1256   if (StringNICmp (orig, SUncultured, SUnculturedLen) == 0) {
1257     val = StringSave(orig + 11);
1258   } else {
1259     val = StringSave (orig);
1260   }
1261   /* remove trailing sp. */
1262   len = StringLen (val);
1263   if (len > 3 && StringICmp(val + len - 3, " sp") == 0) {
1264     val[len - 3] = 0;
1265   } else if (len > 4 && StringICmp (val + len - 4, " sp.") == 0) {
1266     val[len - 4] = 0;
1267   }
1268 
1269   /* remove commas */
1270   cp = StringChr (val, ',');
1271   if (cp != NULL) {
1272     src = val;
1273     dst = val;
1274     while (*src != 0) {
1275       if (*src == ',') {
1276         if (*(src + 1) != ' ') {
1277           *dst = ' ';
1278           dst++;
1279         }
1280       } else {
1281         *dst = *src;
1282         dst++;
1283       }
1284       src++;
1285     }
1286     *dst = 0;
1287   }
1288   return val;
1289 }
1290 
1291 
/* Returns a newly allocated "uncultured <orig>" string, or NULL when orig has
 * no text, already starts with "uncultured " (compared case-insensitively),
 * or memory could not be allocated.
 */
static CharPtr AddUnculturedIfNotPresent (CharPtr orig)
{
  CharPtr val;

  if (StringHasNoText (orig) || StringNICmp (orig, "uncultured ", 11) == 0) {
    return NULL;
  }
  /* 11 characters of prefix plus the terminating NUL */
  val = (CharPtr) MemNew (sizeof (Char) * (StringLen (orig) + 12));
  if (val == NULL) {
    return NULL;
  }
  sprintf (val, "uncultured %s", orig);
  return val;
}
1301 
1302 
/* Returns a newly allocated copy of orig with "uncultured " prepended (when
 * orig does not already start with it, compared case-insensitively) and
 * " sp." appended (when orig is longer than four characters and does not
 * already end with it).  Returns NULL when orig has no text, when neither
 * change applies, or when memory could not be allocated.
 *
 * NOTE(review): names of four characters or fewer never receive the " sp."
 * suffix - confirm that this is intended.
 */
static CharPtr TryUnculturedAndSp (CharPtr orig)
{
  CharPtr val;
  Int4    len;
  Boolean prefix = FALSE, suffix = FALSE;

  if (StringHasNoText (orig)) {
    return NULL;
  }
  len = StringLen (orig);

  if (len > 4 && StringICmp (orig + len - 4, " sp.") != 0) {
    suffix = TRUE;
    len += 4;
  }
  if (StringNICmp (orig, "uncultured ", 11) != 0) {
    prefix = TRUE;
    len += 11;
  }

  if (!prefix && !suffix) {
    return NULL;
  }

  /* len now counts the final string; add one for the terminating NUL */
  val = (CharPtr) MemNew (sizeof (Char) * (len + 1));
  if (val == NULL) {
    return NULL;
  }
  sprintf (val, "%s%s%s", prefix ? "uncultured " : "", orig, suffix ? " sp." : "");
  return val;
}
1329 
1330 static const CharPtr sAmplifiedSpeciesSpecific[] = {
1331   "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
1332   "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
1333   "[BankIt_uncultured16S_wizard]; [species_specific primers]",
1334   "[uncultured (with species-specific primers)]",
1335   "[uncultured]; [amplified with species-specific primers]",
1336   "[uncultured (using species-specific primers) bacterial source]",
1337   "amplified with species-specific primers",
1338   NULL
1339 };
1340 
1341 
IsSpeciesSpecificNote(CharPtr note)1342 static Boolean IsSpeciesSpecificNote(CharPtr note)
1343 {
1344   Int4 i;
1345   Boolean rval = FALSE;
1346 
1347   if (note == NULL || StringHasNoText(note)) {
1348     return FALSE;
1349   }
1350   for (i = 0; sAmplifiedSpeciesSpecific[i] != NULL && !rval; i++) {
1351     if (StringISearch (note, sAmplifiedSpeciesSpecific[i]) != NULL) {
1352       rval = TRUE;
1353     }
1354   }
1355   return rval;
1356 }
1357 
1358 
IsSpeciesSpecific(BioSourcePtr biop)1359 static Boolean IsSpeciesSpecific (BioSourcePtr biop)
1360 {
1361     Boolean rval = FALSE;
1362     SubSourcePtr ssp;
1363 
1364     if (biop != NULL) {
1365         for (ssp = biop->subtype; ssp != NULL && !rval; ssp = ssp->next) {
1366             if (ssp->subtype == SUBSRC_other
1367                 && IsSpeciesSpecificNote (ssp->name)) {
1368                 rval = TRUE;
1369             }
1370         }
1371     }
1372     return rval;
1373 }
1374 
1375 
RemoveSpeciesSpecificFromNote(CharPtr note)1376 static Boolean RemoveSpeciesSpecificFromNote(CharPtr note)
1377 {
1378   Int4 i;
1379   CharPtr cp;
1380   Boolean rval = FALSE;
1381 
1382   if (note == NULL || StringHasNoText (note)) {
1383     return rval;
1384   }
1385   for (i = 0; sAmplifiedSpeciesSpecific[i] != NULL; i++) {
1386     if ((cp = StringISearch (note, sAmplifiedSpeciesSpecific[i])) != NULL) {
1387       StringCpy (cp, cp + StringLen (sAmplifiedSpeciesSpecific[i]));
1388       rval = TRUE;
1389     }
1390   }
1391   return rval;
1392 }
1393 
1394 
RemoveSpeciesSpecific(BioSourcePtr biop)1395 NLM_EXTERN void RemoveSpeciesSpecific (BioSourcePtr biop)
1396 {
1397     SubSourcePtr ssp, ssp_prev = NULL, ssp_next;
1398     CharPtr cp;
1399 
1400     if (biop != NULL) {
1401         for (ssp = biop->subtype; ssp != NULL; ssp = ssp_next) {
1402             ssp_next = ssp->next;
1403             if (ssp->subtype == SUBSRC_other
1404                 && RemoveSpeciesSpecificFromNote(ssp->name)
1405                 && StringHasNoText (ssp->name)) {
1406                 ssp = SubSourceFree (ssp);
1407                 if (ssp_prev == NULL) {
1408                     biop->subtype = ssp_next;
1409                 } else {
1410                     ssp_prev->next = ssp_next;
1411                 }
1412             } else {
1413                 ssp_prev = ssp;
1414             }
1415         }
1416     }
1417 }
1418 
1419 
1420 static CharPtr sUnfixable[] = {
1421   "rickettsia",
1422   "candidatus",
1423   "endosymbiont",
1424   "phytoplasma",
1425   "wolbachia",
1426   NULL
1427 };
1428 
OkToTaxFix(CharPtr orgname)1429 NLM_EXTERN Boolean OkToTaxFix(CharPtr orgname)
1430 {
1431   Int4 i;
1432   Boolean rval = TRUE;
1433 
1434   if (orgname == NULL || StringHasNoText (orgname)) {
1435     return FALSE;
1436   }
1437 
1438   for (i = 0; sUnfixable[i] != NULL && rval; i++) {
1439     if (StringISearch(orgname, sUnfixable[i]) != NULL) {
1440       rval = FALSE;
1441     }
1442   }
1443   return rval;
1444 }
1445 
1446 
/* Creates one TaxFixItem per node of biop_list, in the same order.  Each item
 * records the node's choice and data pointer, whether the BioSource obtained
 * from that object is species-specific, and - when the BioSource has an
 * organism with a taxname - the original OrgRef together with a normalised
 * copy of the taxname (see StandardFixes).
 */
static ValNodePtr BuildBlankTaxFixList (ValNodePtr biop_list)
{
  ValNodePtr    result = NULL;
  ValNodePtr    src;
  BioSourcePtr  biop;
  TaxFixItemPtr item;

  for (src = biop_list; src != NULL; src = src->next) {
    item = TaxFixItemNew ();
    item->data_choice = src->choice;
    item->data = src->data.ptrvalue;

    biop = GetBioSourceFromObject (src->choice, src->data.ptrvalue);
    if (biop != NULL && biop->org != NULL && biop->org->taxname != NULL) {
      item->orig_org = biop->org;
      item->taxname = StandardFixes (biop->org->taxname);
    }
    item->is_species_specific = IsSpeciesSpecific(biop);

    ValNodeAddPointer (&result, 0, item);
  }
  return result;
}
1471 
1472 
/* Builds a list of names (choice 2, newly allocated strings) with one entry
 * per item of taxfix_list: the item's suggested fix when it has one, its
 * taxname otherwise.
 */
static ValNodePtr BuildRequestFromTaxnameList (ValNodePtr taxfix_list)
{
  ValNodePtr    names = NULL;
  ValNodePtr    node;
  TaxFixItemPtr item;

  for (node = taxfix_list; node != NULL; node = node->next) {
    item = (TaxFixItemPtr) node->data.ptrvalue;
    ValNodeAddPointer (&names, 2,
                       StringSave (item->suggested_fix != NULL ? item->suggested_fix
                                                               : item->taxname));
  }
  return names;
}
1492 
1493 
1494 /* what is passed in is a list of names (request_list), a list of fixes (to be filled in),
1495  * and the number of names to include per server request.
1496  */
GetSuggestedNamesFromRank(ValNodePtr request_list,ValNodePtr taxfix_list,Int4 max_requests)1497 static void GetSuggestedNamesFromRank (ValNodePtr request_list, ValNodePtr taxfix_list, Int4 max_requests)
1498 {
1499   ValNodePtr start_request, vnp_rp;
1500   ValNodePtr next_org_list, last_org;
1501   Int4       request_num;
1502   TaxFixItemPtr t;
1503   Taxon3RequestPtr t3rq;
1504   Taxon3ReplyPtr   t3ry;
1505   T3DataPtr        tdp;
1506   T3ReplyPtr       trp;
1507   T3ErrorPtr       tep;
1508   T3StatusFlagsPtr tfp;
1509 
1510   /* break large lists into manageable chunks */
1511   start_request = request_list;
1512   vnp_rp = taxfix_list;
1513   while (start_request != NULL && vnp_rp != NULL) {
1514     next_org_list = start_request->next;
1515     last_org = start_request;
1516     request_num = 1;
1517     while (next_org_list != NULL && request_num < max_requests) {
1518       last_org = next_org_list;
1519       next_org_list = next_org_list->next;
1520       request_num++;
1521     }
1522     if (last_org != NULL) {
1523       last_org->next = NULL;
1524     }
1525 
1526     /* now create the request */
1527 
1528     t3rq = CreateMultiTaxon3Request (start_request);
1529     if (t3rq == NULL) return;
1530     t3ry = Tax3SynchronousQuery (t3rq);
1531     Taxon3RequestFree (t3rq);
1532     if (t3ry != NULL) {
1533       for (trp = t3ry->reply; trp != NULL; trp = trp->next) {
1534         switch (trp->choice) {
1535           case T3Reply_error :
1536             tep = (T3ErrorPtr) trp->data.ptrvalue;
1537             t = (TaxFixItemPtr) vnp_rp->data.ptrvalue;
1538             if (tep != NULL && StringCmp (tep->message, "Taxname is ambiguous") == 0) {
1539                 t->is_ambiguous = TRUE;
1540             }
1541             vnp_rp = vnp_rp->next;
1542             break;
1543           case T3Reply_data :
1544             tdp = (T3DataPtr) trp->data.ptrvalue;
1545             if (tdp != NULL) {
1546               t = (TaxFixItemPtr) vnp_rp->data.ptrvalue;
1547               if (t->suggested_fix == NULL) {
1548                 t->response_org = (OrgRefPtr)(tdp->org);
1549                 tdp->org = NULL;
1550                 for (tfp = tdp->status; tfp != NULL; tfp = tfp->next) {
1551                   if (StringICmp (tfp->property, "rank") == 0
1552                       && tfp->Value_value != NULL
1553                       && tfp->Value_value->choice == Value_value_str) {
1554                     t->rank = StringSave (tfp->Value_value->data.ptrvalue);
1555                   }
1556                 }
1557                 t->suggested_fix = SuggestedTaxNameFixFromOrgAndRank (t);
1558               }
1559               vnp_rp = vnp_rp->next;
1560             }
1561             break;
1562           default :
1563             vnp_rp = vnp_rp->next;
1564             break;
1565         }
1566       }
1567       Taxon3ReplyFree (t3ry);
1568     }
1569 
1570     if (last_org != NULL) {
1571         last_org->next = next_org_list;
1572     }
1573     start_request = next_org_list;
1574   }
1575 }
1576 
1577 
/* For every item of taxfix_list that is marked ambiguous and has no suggested
 * fix, sets the suggested fix to "uncultured <taxname> <domain>"; the
 * "uncultured " prefix is not repeated when the taxname already starts with
 * it (compared case-insensitively).  Items without a taxname, and items for
 * which memory cannot be allocated, are left unchanged.
 */
static void AddAmbiguousRequests (ValNodePtr taxfix_list, CharPtr domain)
{
  CharPtr       uncultured = "uncultured ";
  CharPtr       fix;
  Int4          len;
  Int4          uncultured_len;
  TaxFixItemPtr t;

  uncultured_len = StringLen (uncultured);
  for (; taxfix_list != NULL; taxfix_list = taxfix_list->next) {
    t = (TaxFixItemPtr) taxfix_list->data.ptrvalue;
    if (t == NULL || !t->is_ambiguous || t->suggested_fix != NULL
        || t->taxname == NULL) {
      continue;
    }
    /* prefix + taxname + one space + domain + terminating NUL */
    len = StringLen (t->taxname) + StringLen (domain) + uncultured_len + 2;
    fix = (CharPtr) MemNew (sizeof (Char) * len);
    if (fix == NULL) {
      continue;
    }
    if (StringNICmp(t->taxname, uncultured, uncultured_len) == 0) {
      sprintf(fix, "%s %s", t->taxname, domain);
    } else {
      sprintf(fix, "%s%s %s", uncultured, t->taxname, domain);
    }
    t->suggested_fix = fix;
  }
}
1600 
1601 
/* Collects the items of taxfix_list that are marked ambiguous and, for those,
 * proposes an "... bacterium" name and then an "... archaeon" name (see
 * AddAmbiguousRequests), running CheckSuggestedFixes after each proposal.
 */
static void TryAmbiguousFixes(ValNodePtr taxfix_list)
{
  ValNodeBlock  ambiguous;
  ValNodePtr    node;
  TaxFixItemPtr item;

  /* try ambiguous values */
  InitValNodeBlock(&ambiguous, NULL);
  node = taxfix_list;
  while (node != NULL) {
    item = node->data.ptrvalue;
    if (item->is_ambiguous) {
      ValNodeAddPointerToEnd(&ambiguous, 0, item);
    }
    node = node->next;
  }

  if (ambiguous.head == NULL) {
    return;
  }

  AddAmbiguousRequests(ambiguous.head, "bacterium");
  CheckSuggestedFixes (ambiguous.head);
  AddAmbiguousRequests(ambiguous.head, "archaeon");
  CheckSuggestedFixes (ambiguous.head);
  /* the nodes only borrow the items, so free the nodes alone */
  ambiguous.head = ValNodeFree(ambiguous.head);
}
1624 
1625 
/* Looks up every item of taxfix_list by name (2000 names per server request)
 * to fill in rank-based suggestions, then runs CheckSuggestedFixes on the
 * list.
 */
static void TryRankFix (ValNodePtr taxfix_list)
{
  ValNodePtr names = BuildRequestFromTaxnameList (taxfix_list);

  GetSuggestedNamesFromRank (names, taxfix_list, 2000);
  ValNodeFreeData (names);

  CheckSuggestedFixes (taxfix_list);
}
1636 
1637 
/* Runs TryRankFix on just those items of taxfix_list whose truncate_binomial
 * flag is set.
 */
static void TryBinomalTruncations(ValNodePtr taxfix_list)
{
  ValNodeBlock  selected;
  ValNodePtr    node;
  TaxFixItemPtr item;

  /* only fill in binomial truncations */
  InitValNodeBlock(&selected, NULL);
  node = taxfix_list;
  while (node != NULL) {
    item = node->data.ptrvalue;
    if (item->truncate_binomial) {
      ValNodeAddPointerToEnd(&selected, 0, item);
    }
    node = node->next;
  }

  if (selected.head == NULL) {
    return;
  }

  TryRankFix(selected.head);
  /* the nodes only borrow the items, so free the nodes alone */
  selected.head = ValNodeFree (selected.head);
}
1659 
1660 
/* Gives every item whose suggested fix is missing or blank a fallback
 * suggestion produced by MakeUnculturedName from the item's taxname.
 */
static void ProvideDefaultTaxFixes(ValNodePtr taxfix_list)
{
    ValNodePtr    node;
    TaxFixItemPtr item;

    node = taxfix_list;
    while (node != NULL) {
        item = (TaxFixItemPtr) node->data.ptrvalue;
        node = node->next;
        if (item == NULL || !StringHasNoText (item->suggested_fix)) {
            continue;
        }
        /* release a blank string before replacing it */
        MemFree (item->suggested_fix);
        item->suggested_fix = MakeUnculturedName(item->taxname, NULL);
    }
}
1674 
1675 
Taxon3GetTaxFixList(ValNodePtr biop_list)1676 NLM_EXTERN ValNodePtr Taxon3GetTaxFixList (ValNodePtr biop_list)
1677 {
1678   ValNodePtr       response_list = NULL;
1679 
1680   if (biop_list == NULL) {
1681     return NULL;
1682   }
1683 
1684   response_list = BuildBlankTaxFixList(biop_list);
1685 
1686   TryRankFix (response_list);
1687   TryBinomalTruncations (response_list);
1688   TryAmbiguousFixes (response_list);
1689 
1690   TryNewSuggestedFixes (response_list, AddUnculturedIfNotPresent);
1691   TryNewSuggestedFixes (response_list, TryUnculturedAndSp);
1692   ProvideDefaultTaxFixes (response_list);
1693 
1694   return response_list;
1695 }
1696 
1697 
Taxon3GetOrg(OrgRefPtr orp)1698 NLM_EXTERN OrgRefPtr Taxon3GetOrg (OrgRefPtr orp)
1699 
1700 {
1701   Taxon3RequestPtr t3rq;
1702   Taxon3ReplyPtr   t3ry;
1703   T3DataPtr        tdp;
1704   OrgRefPtr        t3orp = NULL;
1705   T3ReplyPtr        trp;
1706   T3ErrorPtr        tep;
1707 
1708   if (orp == NULL) return NULL;
1709 
1710   t3rq = CreateTaxon3Request (0, NULL, orp);
1711   if (t3rq == NULL) return NULL;
1712   t3ry = Tax3SynchronousQuery (t3rq);
1713   Taxon3RequestFree (t3rq);
1714   if (t3ry != NULL) {
1715     for (trp = t3ry->reply; trp != NULL; trp = trp->next) {
1716       switch (trp->choice) {
1717         case T3Reply_error :
1718           tep = (T3ErrorPtr) trp->data.ptrvalue;
1719           if (tep != NULL) {
1720             ErrPostEx (SEV_ERROR, 0, 0, tep->message);
1721           }
1722           break;
1723         case T3Reply_data :
1724           tdp = (T3DataPtr) trp->data.ptrvalue;
1725           if (tdp != NULL) {
1726             t3orp = (OrgRefPtr)(tdp->org);
1727             tdp->org = NULL;
1728           }
1729           break;
1730         default :
1731           break;
1732       }
1733     }
1734     Taxon3ReplyFree (t3ry);
1735   }
1736 
1737   return t3orp;
1738 }
1739 
DoOrgIdsMatch(BioSourcePtr b1,BioSourcePtr b2)1740 static Boolean DoOrgIdsMatch(BioSourcePtr b1, BioSourcePtr b2)
1741 {
1742   DbtagPtr d1 = NULL, d2 = NULL;
1743   ValNodePtr vnp;
1744 
1745   if (b1 == NULL || b2 == NULL)
1746   {
1747     return FALSE;
1748   }
1749   if (b1->org ==  NULL || b2->org == NULL)
1750   {
1751     return FALSE;
1752   }
1753   for (vnp = b1->org->db; vnp; vnp = vnp->next)
1754   {
1755     d1 = (DbtagPtr) vnp->data.ptrvalue;
1756     if (StringCmp(d1->db, "taxon") == 0)
1757     {
1758       break;
1759     }
1760   }
1761   for (vnp = b2->org->db; vnp; vnp = vnp->next)
1762   {
1763     d2 = (DbtagPtr) vnp->data.ptrvalue;
1764     if (StringCmp(d2->db, "taxon") == 0)
1765     {
1766       break;
1767     }
1768   }
1769   if (d1 && d2)
1770   {
1771     if (d1->tag->id == d2->tag->id)
1772     {
1773       return TRUE;
1774     }
1775   }
1776   else if (StringICmp(b1->org->taxname, b2->org->taxname) == 0)
1777   {
1778     return TRUE;
1779   }
1780   return FALSE;
1781 }
1782 
/* Merges guest into host and returns host.
 *   - both NULL: returns NULL;
 *   - host NULL: returns a copy of guest;
 *   - guest NULL: returns host unchanged.
 * Otherwise genome and origin are taken from guest when host has none, and
 * copies of all of guest's subsources and organism modifiers are appended to
 * host's lists.  guest itself is not modified.
 */
static BioSourcePtr Tax3BioSourceMerge(BioSourcePtr host, BioSourcePtr guest)
{
  SubSourcePtr ssp, sp, last_ssp;
  OrgModPtr    omp, homp, last_omp;
  OrgNamePtr   onp;

  if (host == NULL && guest == NULL)
  {
    return NULL;
  }
  if (host == NULL)
  {
    host = AsnIoMemCopy(guest, (AsnReadFunc) BioSourceAsnRead,
                               (AsnWriteFunc) BioSourceAsnWrite);
    return host;
  }
  if (guest == NULL)
  {
    return host;
  }

  if (host->genome == 0 && guest->genome != 0)
  {
    host->genome = guest->genome;
  }
  if (host->origin == 0 && guest->origin != 0)
  {
    host->origin = guest->origin;
  }

  /* append copies of the guest's subsources after the host's last one */
  last_ssp = host->subtype;
  while (last_ssp != NULL && last_ssp->next != NULL)
  {
    last_ssp = last_ssp->next;
  }
  for (ssp = guest->subtype; ssp != NULL; ssp = ssp->next)
  {
    sp = AsnIoMemCopy(ssp, (AsnReadFunc) SubSourceAsnRead,
                           (AsnWriteFunc) SubSourceAsnWrite);
    if (sp == NULL)
    {
      /* a failed copy must not reset the tail pointer */
      continue;
    }
    if (last_ssp == NULL)
    {
      host->subtype = sp;
    }
    else
    {
      last_ssp->next = sp;
    }
    /* the tail advances in both cases; otherwise every copy appended to an
     * initially empty list would overwrite the previous one
     */
    last_ssp = sp;
  }

  /* append copies of the guest's organism modifiers */
  if (guest->org != NULL && guest->org->orgname != NULL && host->org != NULL)
  {
    onp = host->org->orgname;
    if (onp == NULL)
    {
      onp = OrgNameNew();
      host->org->orgname = onp;
    }
    if (onp != NULL)
    {
      last_omp = onp->mod;
      while (last_omp != NULL && last_omp->next != NULL)
      {
        last_omp = last_omp->next;
      }
      for (omp = guest->org->orgname->mod; omp != NULL; omp = omp->next)
      {
        homp = AsnIoMemCopy(omp, (AsnReadFunc) OrgModAsnRead,
                                 (AsnWriteFunc) OrgModAsnWrite);
        if (homp == NULL)
        {
          continue;
        }
        if (last_omp == NULL)
        {
          onp->mod = homp;
        }
        else
        {
          last_omp->next = homp;
        }
        last_omp = homp;
      }
    }
  }
  return host;
}
1859 
1860 
1861 /**************************************************************************
1862 *    Compare BioSources in one bioseq->descr using Taxonomy to find
1863 *    their join parent
1864 *    merge if organisms are the same or create a feature if different
1865 *
1866 **************************************************************************/
Tax3MergeSourceDescr(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)1867 NLM_EXTERN void Tax3MergeSourceDescr (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
1868 {
1869     BioseqPtr    bsp = NULL;
1870     ValNodePtr   vnp, newlist;
1871     SeqFeatPtr   sfp;
1872     BioSourcePtr first_biop = NULL;
1873     BioSourcePtr other_biop;
1874     BioSourcePtr tmp_biop;
1875     ObjValNodePtr ovp;
1876 
1877     if (!IS_Bioseq(sep)) {
1878         return;
1879     }
1880     newlist = (ValNodePtr) data;
1881     bsp = (BioseqPtr) sep->data.ptrvalue;
1882     if ((bsp->repr != Seq_repr_raw) && (bsp->repr != Seq_repr_const)
1883             && (bsp->repr != Seq_repr_delta))
1884         return;
1885 
1886     if (! ISA_na(bsp->mol))
1887         return;
1888 
1889     /* add the descriptors in newlist to the end of the list in bsp->descr*/
1890     if (bsp->descr == NULL)
1891     {
1892       bsp->descr = newlist;
1893     }
1894     else
1895     {
1896       for (vnp = bsp->descr; vnp->next != NULL; vnp = vnp->next)
1897       {
1898       }
1899       vnp->next = newlist;
1900     }
1901 
1902     /* now find the first source descriptor in bsp->descr that has an org*/
1903     /* note - we can't use SeqMgrGetNextDescriptor here because we have just
1904      * added to the descriptors, so they are not indexed. */
1905     for (vnp = bsp->descr; vnp != NULL; vnp = vnp->next)
1906     {
1907       if (vnp->choice != Seq_descr_source) continue;
1908       if (vnp->data.ptrvalue == NULL)
1909       {
1910           ErrPostStr(SEV_WARNING, 0, 0, "Source descriptor missing data");
1911           if (vnp->extended)
1912           {
1913             ovp = (ObjValNodePtr) vnp;
1914             ovp->idx.deleteme = TRUE;
1915           }
1916       }
1917       if (first_biop == NULL)
1918       {
1919           first_biop = vnp->data.ptrvalue;
1920       }
1921       else
1922       {
1923         other_biop = vnp->data.ptrvalue;
1924         /* detach biosource pointer from descr, so that it will not be freed
1925          * when the descriptor is deleted.
1926          */
1927         vnp->data.ptrvalue = NULL;
1928         if (vnp->extended)
1929         {
1930           ovp = (ObjValNodePtr) vnp;
1931             ovp->idx.deleteme = TRUE;
1932         }
1933         if (DoOrgIdsMatch(first_biop, other_biop))
1934         {
1935           /* merge the two sources */
1936           tmp_biop = Tax3BioSourceMerge(first_biop, other_biop);
1937           if (tmp_biop == NULL)
1938           {
1939               ErrPostStr (SEV_WARNING, 0, 0, "Failed to merge biosources");
1940           }
1941           else
1942           {
1943               first_biop = tmp_biop;
1944           }
1945           other_biop = BioSourceFree (other_biop);
1946         } else {
1947           /* create a source feature */
1948           sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_BIOSRC, NULL);
1949           if (sfp != NULL)
1950           {
1951             sfp->data.value.ptrvalue = other_biop;
1952           }
1953         }
1954       }
1955     }
1956     return;
1957 }
1958 
GetTaxIdFromOrgRef(OrgRefPtr orp)1959 static Int4 GetTaxIdFromOrgRef (OrgRefPtr orp)
1960 {
1961   Int4       tax_id = -1;
1962   ValNodePtr vnp;
1963   DbtagPtr   d;
1964 
1965   if (orp != NULL)
1966   {
1967     for (vnp = orp->db; vnp != NULL; vnp = vnp->next)
1968     {
1969       d = (DbtagPtr) vnp->data.ptrvalue;
1970       if (StringCmp(d->db, "taxon") == 0)
1971       {
1972         tax_id = d->tag->id;
1973         break;
1974       }
1975     }
1976   }
1977   return tax_id;
1978 }
1979 
Taxon3GetTaxIdByOrgRef(OrgRefPtr orp)1980 NLM_EXTERN Int4 Taxon3GetTaxIdByOrgRef (OrgRefPtr orp)
1981 {
1982   OrgRefPtr  orp_repl;
1983   Int4       tax_id = -1;
1984 
1985   if (orp == NULL) return -1;
1986 
1987   orp_repl = Taxon3GetOrg (orp);
1988   tax_id = GetTaxIdFromOrgRef (orp_repl);
1989   OrgRefFree (orp_repl);
1990 
1991   return tax_id;
1992 }
1993 
Taxon3GetOrgRefByName(CharPtr orgname)1994 NLM_EXTERN OrgRefPtr Taxon3GetOrgRefByName (CharPtr orgname)
1995 {
1996   OrgRefPtr request, org;
1997 
1998   request = OrgRefNew ();
1999   if (request == NULL) return NULL;
2000   request->taxname = orgname;
2001   org = Taxon3GetOrg (request);
2002   request->taxname = NULL;
2003   OrgRefFree (request);
2004   return org;
2005 }
2006 
Taxon3GetTaxIdByName(CharPtr orgname)2007 NLM_EXTERN Int4 Taxon3GetTaxIdByName (CharPtr orgname)
2008 {
2009   OrgRefPtr orp;
2010   Int4      tax_id;
2011 
2012   orp = Taxon3GetOrgRefByName (orgname);
2013   tax_id = GetTaxIdFromOrgRef (orp);
2014 
2015   OrgRefFree(orp);
2016   return tax_id;
2017 }
2018 
/* Visitor callback: appends biop (choice 4) to the ValNode list whose head
 * pointer is passed in userdata.  Does nothing when either argument is NULL.
 */
static void AddBioSourceToList (BioSourcePtr biop, Pointer userdata)
{
  if (biop != NULL && userdata != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, 4, (Pointer) biop);
  }
}
2027 
Taxon3ReplaceOrgInSeqEntryEx(SeqEntryPtr sep,Boolean keep_syn,Boolean replace_unpub)2028 NLM_EXTERN void Taxon3ReplaceOrgInSeqEntryEx (SeqEntryPtr sep, Boolean keep_syn, Boolean replace_unpub)
2029 {
2030   ValNodePtr   biop_list = NULL;
2031   ValNodePtr   request_list = NULL;
2032   ValNodePtr   response_list = NULL;
2033   ValNodePtr   biop_vnp, response_vnp;
2034   BioSourcePtr biop;
2035   OrgRefPtr    swap_org, response_org;
2036 
2037   VisitBioSourcesInSep (sep, &biop_list, AddBioSourceToList);
2038 
2039   for (biop_vnp = biop_list; biop_vnp != NULL; biop_vnp = biop_vnp->next)
2040   {
2041     biop = (BioSourcePtr) biop_vnp->data.ptrvalue;
2042     ValNodeAddPointer (&request_list, 3, biop->org);
2043   }
2044   response_list = Taxon3GetOrgRefList (request_list);
2045 
2046   if (ValNodeLen (response_list) != ValNodeLen (request_list))
2047   {
2048     Message (MSG_POST, "Unable to retrieve information from tax server");
2049     return;
2050   }
2051 
2052   for (biop_vnp = biop_list, response_vnp = response_list;
2053        biop_vnp != NULL && response_vnp != NULL;
2054        biop_vnp = biop_vnp->next, response_vnp = response_vnp->next)
2055   {
2056     biop = (BioSourcePtr) biop_vnp->data.ptrvalue;
2057     swap_org = biop->org;
2058     response_org = response_vnp->data.ptrvalue;
2059     if (response_org != NULL
2060         && (replace_unpub || !(response_vnp->choice & eReturnedOrgFlag_unpublished)))
2061     {
2062       biop->org = response_org;
2063       response_vnp->data.ptrvalue = NULL;
2064       OrgRefFree (swap_org);
2065       if (! keep_syn)
2066       {
2067         biop->org->syn = ValNodeFreeData(biop->org->syn);
2068       }
2069     }
2070   }
2071   ValNodeFree (request_list);
2072   ValNodeFree (response_list);
2073   ValNodeFree (biop_list);
2074 }
2075 
2076 
/* Convenience wrapper: calls Taxon3ReplaceOrgInSeqEntryEx with
 * replace_unpub set to TRUE, passing sep and keep_syn through unchanged.
 */
NLM_EXTERN void Taxon3ReplaceOrgInSeqEntry (SeqEntryPtr sep, Boolean keep_syn)
{
  Taxon3ReplaceOrgInSeqEntryEx (sep, keep_syn, TRUE);
}
2081 
2082 
/* Feature-visit callback: records sfp (tagged OBJ_SEQFEAT) in the list
 * passed through userdata when it is a BioSource feature with non-NULL data.
 */
static void GetBioSourceFeaturesForCheck (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR item_list = (ValNodePtr PNTR) userdata;

  if (sfp == NULL || item_list == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;
  if (sfp->data.value.ptrvalue == NULL) return;

  ValNodeAddPointer (item_list, OBJ_SEQFEAT, sfp);
}
2092 
2093 
/* Descriptor-visit callback: records sdp (tagged OBJ_SEQDESC) in the list
 * passed through userdata when it is a source descriptor with non-NULL data.
 */
static void GetBioSourceDescriptorsForCheck (SeqDescrPtr sdp, Pointer userdata)
{
  ValNodePtr PNTR item_list = (ValNodePtr PNTR) userdata;

  if (sdp == NULL || item_list == NULL) return;
  if (sdp->choice != Seq_descr_source) return;
  if (sdp->data.ptrvalue == NULL) return;

  ValNodeAddPointer (item_list, OBJ_SEQDESC, sdp);
}
2103 
2104 
GetTaxonXref(OrgRefPtr org)2105 static DbtagPtr GetTaxonXref (OrgRefPtr org)
2106 {
2107   ValNodePtr vnp;
2108   DbtagPtr   dbt = NULL;
2109 
2110   if (org == NULL) return NULL;
2111   vnp = org->db;
2112   while (vnp != NULL && dbt == NULL) {
2113     dbt = (DbtagPtr) vnp->data.ptrvalue;
2114     if (dbt != NULL && StringICmp ((CharPtr) dbt->db, "taxon") != 0) {
2115       dbt = NULL;
2116     }
2117     vnp = vnp->next;
2118   }
2119   return dbt;
2120 }
2121 
DoTaxonIdsMatch(OrgRefPtr org1,OrgRefPtr org2)2122 static Boolean DoTaxonIdsMatch (OrgRefPtr org1, OrgRefPtr org2)
2123 {
2124   DbtagPtr   dbt1 = NULL, dbt2 = NULL;
2125 
2126   if (org1 == NULL || org2 == NULL) return FALSE;
2127 
2128   dbt1 = GetTaxonXref (org1);
2129   if (dbt1 == NULL) return FALSE;
2130   dbt2 = GetTaxonXref (org2);
2131   if (dbt2 == NULL) return FALSE;
2132 
2133   return DbtagMatch(dbt1, dbt2);
2134 }
2135 
2136 
Taxon3CheckOrgInSeqEntry(SeqEntryPtr sep,ValNodePtr PNTR not_found,ValNodePtr PNTR bad_match)2137 NLM_EXTERN void Taxon3CheckOrgInSeqEntry (SeqEntryPtr sep, ValNodePtr PNTR not_found, ValNodePtr PNTR bad_match)
2138 {
2139   ValNodePtr   request_list = NULL;
2140   ValNodePtr   response_list = NULL;
2141   ValNodePtr   biop_vnp, response_vnp;
2142   BioSourcePtr biop;
2143   OrgRefPtr    orig_org, response_org;
2144   ValNodePtr   item_list = NULL;
2145   SeqFeatPtr   sfp;
2146   SeqDescrPtr  sdp;
2147 
2148   VisitFeaturesInSep (sep, &item_list, GetBioSourceFeaturesForCheck);
2149   VisitDescriptorsInSep (sep, &item_list, GetBioSourceDescriptorsForCheck);
2150 
2151   for (biop_vnp = item_list; biop_vnp != NULL; biop_vnp = biop_vnp->next) {
2152     biop = NULL;
2153     if (biop_vnp->choice == OBJ_SEQFEAT) {
2154       sfp = (SeqFeatPtr) biop_vnp->data.ptrvalue;
2155       if (sfp != NULL) {
2156         biop = (BioSourcePtr) sfp->data.value.ptrvalue;
2157       }
2158     } else if (biop_vnp->choice == OBJ_SEQDESC) {
2159       sdp = (SeqDescrPtr) biop_vnp->data.ptrvalue;
2160       if (sdp != NULL) {
2161         biop = (BioSourcePtr) sdp->data.ptrvalue;
2162       }
2163     }
2164     if (biop != NULL) {
2165       ValNodeAddPointer (&request_list, 3, biop->org);
2166     }
2167   }
2168 
2169   response_list = Taxon3GetOrgRefList (request_list);
2170 
2171   if (ValNodeLen (response_list) != ValNodeLen (request_list))
2172   {
2173     Message (MSG_POST, "Unable to retrieve information from tax server");
2174     ValNodeFree (request_list);
2175     ValNodeFree (item_list);
2176     return;
2177   }
2178 
2179   for (biop_vnp = item_list, response_vnp = response_list;
2180        biop_vnp != NULL && response_vnp != NULL;
2181        biop_vnp = biop_vnp->next, response_vnp = response_vnp->next)
2182   {
2183     response_org = response_vnp->data.ptrvalue;
2184     biop = NULL;
2185     orig_org = NULL;
2186     if (biop_vnp->choice == OBJ_SEQFEAT) {
2187       sfp = (SeqFeatPtr) biop_vnp->data.ptrvalue;
2188       if (sfp != NULL) {
2189         biop = (BioSourcePtr) sfp->data.value.ptrvalue;
2190       }
2191     } else if (biop_vnp->choice == OBJ_SEQDESC) {
2192       sdp = (SeqDescrPtr) biop_vnp->data.ptrvalue;
2193       if (sdp != NULL) {
2194         biop = (BioSourcePtr) sdp->data.ptrvalue;
2195       }
2196     }
2197     if (biop == NULL) {
2198       Message (MSG_POST, "Error collecting data");
2199       ValNodeFree (request_list);
2200       ValNodeFree (item_list);
2201       return;
2202     } else {
2203       orig_org = biop->org;
2204       if (orig_org != NULL) {
2205         if (response_org == NULL) {
2206           ValNodeAddPointer (not_found, biop_vnp->choice, biop_vnp->data.ptrvalue);
2207         } else if (StringCmp (orig_org->taxname, response_org->taxname) != 0) {
2208           ValNodeAddPointer (bad_match, biop_vnp->choice, biop_vnp->data.ptrvalue);
2209         } else if (!DoTaxonIdsMatch(orig_org, response_org)) {
2210           ValNodeAddPointer (bad_match, biop_vnp->choice, biop_vnp->data.ptrvalue);
2211         }
2212       }
2213     }
2214     OrgRefFree (response_org);
2215   }
2216   ValNodeFree (request_list);
2217   ValNodeFree (response_list);
2218   ValNodeFree (item_list);
2219 }
2220 
2221 
CheckTaxNamesAgainstTaxDatabase(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)2222 NLM_EXTERN void CheckTaxNamesAgainstTaxDatabase (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
2223 {
2224   ValNodePtr  vnp;
2225   SeqEntryPtr sep;
2226   SeqEntryPtr orig_scope;
2227   ValNodePtr  not_found = NULL, bad_match = NULL;
2228   CharPtr     bad_match_fmt = "%d tax names do not match taxonomy lookup.";
2229   CharPtr     no_match_fmt = "%d organisms are not found in taxonomy lookup.";
2230   ClickableItemPtr dip;
2231 
2232   if (discrepancy_list == NULL) return;
2233 
2234 
2235   orig_scope = SeqEntryGetScope ();
2236   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
2237     sep = vnp->data.ptrvalue;
2238     SeqEntrySetScope (sep);
2239     Taxon3CheckOrgInSeqEntry (sep, &not_found, &bad_match);
2240   }
2241   SeqEntrySetScope (orig_scope);
2242   if (not_found != NULL) {
2243     dip = NewClickableItem (DISC_NO_TAXLOOKUP, no_match_fmt, not_found);
2244     dip->subcategories = NULL;
2245     ValNodeAddPointer (discrepancy_list, 0, dip);
2246   }
2247   if (bad_match != NULL) {
2248     dip = NewClickableItem (DISC_BAD_TAXLOOKUP, bad_match_fmt, bad_match);
2249     dip->subcategories = NULL;
2250     ValNodeAddPointer (discrepancy_list, 0, dip);
2251   }
2252 }
2253 
2254 
/* Frees a ValNode list whose data pointers are OrgRefs: each OrgRef is
 * released with OrgRefFree and each node with ValNodeFree.
 * Always returns NULL so callers can reset their list pointer.
 */
static ValNodePtr FreeOrgRefValNodeList (ValNodePtr vnp)
{
  ValNodePtr node, following;

  for (node = vnp; node != NULL; node = following)
  {
    following = node->next;
    /* detach first so ValNodeFree only releases this one node */
    node->next = NULL;
    node->data.ptrvalue = OrgRefFree ((OrgRefPtr) node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
2271 
2272 
EndsWithSp(CharPtr str)2273 static Boolean EndsWithSp (CharPtr str)
2274 {
2275   Int4 len;
2276 
2277   if (StringHasNoText (str)) return FALSE;
2278   len = StringLen (str);
2279   if (len < 4) return FALSE;
2280   if (StringCmp (str + len - 4, " sp.") == 0) return TRUE;
2281   return FALSE;
2282 }
2283 
2284 
/* If orig ends with " sp.", returns a newly allocated copy of orig without
 * that suffix; otherwise returns NULL.  The caller owns the returned string.
 */
static CharPtr RemoveSp (CharPtr orig)
{
  Int4    len;
  Int4    stem_len;
  CharPtr stem;

  len = StringLen (orig);
  if (len < 4 || StringCmp (orig + len - 4, " sp.") != 0) {
    return NULL;
  }

  stem_len = len - 4;
  /* room for the stem plus the terminating NUL */
  stem = (CharPtr) MemNew (sizeof (Char) * (stem_len + 1));
  StringNCpy (stem, orig, stem_len);
  stem[stem_len] = 0;
  return stem;
}
2298 
2299 
/* Appends taxonomy lookup requests derived from str to request_list, and a
 * copy of host to req_host_list for every request added, so the two lists
 * stay parallel.
 *
 * str is first truncated at the first ';'.  If the result ends with " sp."
 * a single request is made for the name without that suffix.  Otherwise a
 * request is made for the truncated name and, if it contains a space, the
 * function recurses on the name with its last word removed.
 *
 * Does nothing when str has no text or any pointer argument is NULL.
 */
static void AddRequestOrgForString (CharPtr str, CharPtr host, ValNodePtr PNTR request_list, ValNodePtr PNTR req_host_list)
{
  OrgRefPtr    request_org;
  CharPtr      cp, cpy;
  CharPtr      truncated_host;

  if (StringHasNoText (str) || host == NULL || request_list == NULL || req_host_list == NULL)
  {
    return;
  }
  truncated_host = StringSave (str);
  cp = StringChr (truncated_host, ';');
  if (cp != NULL) {
    *cp = 0;
  }

  /* if ends with " sp.", remove " sp." */
  cpy = RemoveSp (truncated_host);
  if (cpy != NULL) {
    request_org = OrgRefNew ();
    /* RemoveSp already allocated this string: hand it over instead of
     * copying it again, which used to leak the original */
    request_org->taxname = cpy;
    ValNodeAddPointer (request_list, 3, request_org);
    ValNodeAddPointer (req_host_list, 0, StringSave (host));
  } else {
    request_org = OrgRefNew ();
    request_org->taxname = StringSave (truncated_host);
    ValNodeAddPointer (request_list, 3, request_org);
    ValNodeAddPointer (req_host_list, 0, StringSave (host));

    /* if more than one word, try chopping off last to see if abbreviated name looks up */
    if (StringRChr (truncated_host, ' ') != NULL)
    {
      cpy = StringSave (truncated_host);
      cp = StringRChr (cpy, ' ');
      if (cp != NULL)
      {
        *cp = 0;
        AddRequestOrgForString (cpy, host, request_list, req_host_list);
      }
      cpy = MemFree (cpy);
    }
  }
  truncated_host = MemFree (truncated_host);
}
2346 
/* One entry per distinct specific-host string: groups the taxonomy requests
 * and responses made for that string with the BioSource items that carry it.
 * SpecificHostCheckListFree releases spec_host with MemFree, the request and
 * response lists with FreeOrgRefValNodeList, and only the nodes of biop_list.
 */
typedef struct specifichostcheck {
  CharPtr      spec_host;
  ValNodePtr   request_list;  /* ValNodeList of orgs */
  ValNodePtr   response_list; /* ValNodeList of orgs */
  ValNodePtr   biop_list;     /* ValNodeList of sources with this spec_host value */
} SpecificHostCheckData, PNTR SpecificHostCheckPtr;
2353 
2354 
/* Frees a ValNode list of SpecificHostCheckData entries, including each
 * entry's host string, request/response OrgRef lists and the nodes of its
 * biop_list.  Always returns NULL.
 */
static ValNodePtr SpecificHostCheckListFree (ValNodePtr vnp)
{
  ValNodePtr           node, following;
  SpecificHostCheckPtr entry;

  for (node = vnp; node != NULL; node = following)
  {
    following = node->next;
    /* detach so ValNodeFreeData releases just this node and its entry */
    node->next = NULL;
    entry = (SpecificHostCheckPtr) node->data.ptrvalue;
    if (entry != NULL)
    {
      entry->request_list = FreeOrgRefValNodeList (entry->request_list);
      entry->response_list = FreeOrgRefValNodeList (entry->response_list);
      entry->spec_host = MemFree (entry->spec_host);
      entry->biop_list = ValNodeFree (entry->biop_list);
    }
    ValNodeFreeData (node);
  }
  return NULL;
}
2377 
2378 
/* Walks the three parallel lists together and groups consecutive entries
 * that share the same host string into SpecificHostCheckData records.
 *
 * The request and response OrgRef pointers are moved into the records (the
 * corresponding data pointers in request_list and response_list are set to
 * NULL); the host string is copied.  Returns the list of records, to be
 * released with SpecificHostCheckListFree.
 */
static ValNodePtr SortSpecificHostOrgs (ValNodePtr host_list, ValNodePtr request_list, ValNodePtr response_list)
{
  ValNodePtr           check_list = NULL;
  SpecificHostCheckPtr p = NULL;
  CharPtr              host, prev_host = NULL;

  while (host_list != NULL
         && request_list != NULL
         && response_list != NULL)
  {
    host = (CharPtr) host_list->data.ptrvalue;
    /* p == NULL forces a record for the first entry even when its host is
     * NULL (StringCmp treats two NULLs as equal), so p is never
     * dereferenced unset */
    if (p == NULL || StringCmp (host, prev_host) != 0)
    {
      p = (SpecificHostCheckPtr) MemNew (sizeof (SpecificHostCheckData));
      if (p == NULL)
      {
        /* out of memory: stop grouping, unmoved OrgRefs stay with the caller */
        break;
      }
      p->spec_host = StringSave (host);
      ValNodeAddPointer (&check_list, 0, p);
      prev_host = host;
    }
    ValNodeAddPointer (&(p->request_list), request_list->choice, request_list->data.ptrvalue);
    ValNodeAddPointer (&(p->response_list), response_list->choice, response_list->data.ptrvalue);
    /* ownership moved into the record */
    request_list->data.ptrvalue = NULL;
    response_list->data.ptrvalue = NULL;
    host_list = host_list->next;
    request_list = request_list->next;
    response_list = response_list->next;
  }
  return check_list;
}
2407 
2408 
StringAlreadyInValNodeList(CharPtr str,ValNodePtr list)2409 static Boolean StringAlreadyInValNodeList (CharPtr str, ValNodePtr list)
2410 {
2411   if (StringHasNoText (str))
2412   {
2413     return TRUE;
2414   }
2415 
2416   while (list != NULL)
2417   {
2418     if (StringCmp (str, list->data.ptrvalue) == 0)
2419     {
2420       return TRUE;
2421     }
2422     list = list->next;
2423   }
2424   return FALSE;
2425 }
2426 
2427 
GetBioSourceFromValNode(ValNodePtr vnp)2428 static BioSourcePtr GetBioSourceFromValNode (ValNodePtr vnp)
2429 {
2430   SeqFeatPtr sfp;
2431   SeqDescrPtr sdp;
2432   BioSourcePtr biop = NULL;
2433 
2434   if (vnp == NULL || vnp->data.ptrvalue == NULL) return NULL;
2435 
2436   if (vnp->choice == OBJ_SEQFEAT)
2437   {
2438     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
2439     biop = (BioSourcePtr) sfp->data.value.ptrvalue;
2440   }
2441   else if (vnp->choice == OBJ_SEQDESC)
2442   {
2443     sdp = (SeqDescrPtr) vnp->data.ptrvalue;
2444     biop = (BioSourcePtr) sdp->data.ptrvalue;
2445   }
2446   return biop;
2447 }
2448 
2449 
/* NULL-terminated list of substrings that AdjustSpecificHostForTaxServer
 * removes from a specific-host string when they are preceded by whitespace.
 */
static CharPtr extract_list[] = {
  "cf.",
  "cf ",
  "aff ",
  "aff.",
  "near",
  "nr.",
  "nr ",
  NULL};
2459 
/* Edits spec_host in place: for each entry of extract_list, if its first
 * occurrence in spec_host is preceded by a whitespace character, that
 * occurrence and any whitespace following it are removed by shifting the
 * rest of the string left.
 */
static void AdjustSpecificHostForTaxServer (CharPtr spec_host)
{
  CharPtr cp, src, dst;
  Int4 i;

  /* ignore separator words */
  for (i = 0; extract_list[i] != NULL; i++) {
    cp = StringSearch (spec_host, extract_list[i]);
    /* cast to unsigned char: <ctype.h> functions are undefined for
     * negative char values such as non-ASCII bytes */
    if (cp == NULL || cp <= spec_host || !isspace ((unsigned char) *(cp - 1))) {
      continue;
    }
    src = cp + StringLen (extract_list[i]);
    dst = cp;
    while (isspace ((unsigned char) *src)) {
      src++;
    }
    while (*src != 0) {
      *dst = *src;
      dst++;
      src++;
    }
    *dst = 0;
  }
}
2482 
2483 
AddBioSourcesToSpecificHostChecklist(ValNodePtr biop_list,ValNodePtr check_list)2484 static void AddBioSourcesToSpecificHostChecklist (ValNodePtr biop_list, ValNodePtr check_list)
2485 {
2486   ValNodePtr biop_vnp, last_vnp = NULL, stop_search;
2487   BioSourcePtr biop;
2488   OrgModPtr    mod;
2489   SpecificHostCheckPtr p;
2490   CharPtr tmp;
2491 
2492   if (biop_list == NULL || check_list == NULL) return;
2493 
2494   for (biop_vnp = biop_list; biop_vnp != NULL; biop_vnp = biop_vnp->next)
2495   {
2496 
2497     biop = GetBioSourceFromValNode (biop_vnp);
2498     if (biop == NULL) continue;
2499 
2500     if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) continue;
2501     mod = biop->org->orgname->mod;
2502     while (mod != NULL)
2503     {
2504       if (mod->subtype == ORGMOD_nat_host
2505           && !StringHasNoText (mod->subname))
2506       {
2507         if (last_vnp == NULL)
2508         {
2509           last_vnp = check_list;
2510           stop_search = NULL;
2511         }
2512         else
2513         {
2514           stop_search = last_vnp;
2515         }
2516         tmp = StringSave (mod->subname);
2517         AdjustSpecificHostForTaxServer (tmp);
2518         p = NULL;
2519         while (last_vnp != NULL
2520                && (p = (SpecificHostCheckPtr) last_vnp->data.ptrvalue) != NULL
2521                && StringCmp (p->spec_host, tmp) != 0)
2522         {
2523           p = NULL;
2524           last_vnp = last_vnp->next;
2525         }
2526         if (p == NULL && stop_search != NULL)
2527         {
2528           last_vnp = check_list;
2529           while (last_vnp != stop_search
2530                  && (p = (SpecificHostCheckPtr) last_vnp->data.ptrvalue) != NULL
2531                  && StringCmp (p->spec_host, tmp) != 0)
2532           {
2533             p = NULL;
2534             last_vnp = last_vnp->next;
2535           }
2536         }
2537         tmp = MemFree (tmp);
2538         if (p != NULL)
2539         {
2540           ValNodeAddPointer (&(p->biop_list), biop_vnp->choice, biop_vnp->data.ptrvalue);
2541         }
2542       }
2543       mod = mod->next;
2544     }
2545   }
2546 }
2547 
2548 
ShouldCheckSpecificHostValueForValidator(CharPtr spec_host)2549 static Boolean ShouldCheckSpecificHostValueForValidator (CharPtr spec_host)
2550 {
2551   CharPtr semicolon, space;
2552   if (StringHasNoText (spec_host) || !isupper (*spec_host) || (space = StringChr(spec_host, ' ')) == NULL) {
2553     return FALSE;
2554   } else {
2555     semicolon = StringChr(spec_host, ';');
2556     if (semicolon != NULL && semicolon < space) {
2557       return FALSE;
2558     } else {
2559       return TRUE;
2560     }
2561   }
2562 }
2563 
/* Returns a newly allocated string holding the leading portion of spec_host
 * that should be looked up: the first word plus the second word, also
 * skipping over an intervening "hybrid " or "x ".  The second word is not
 * collected when it starts with '(' or "sp.".  The result is cut at the
 * first ';' if that comes earlier, and surrounding spaces are trimmed.
 *
 * Returns NULL when ShouldCheckSpecificHostValueForValidator rejects
 * spec_host or when allocation fails.  The caller owns the result.
 */
static CharPtr GetSpecificHostValueToCheckForValidator (CharPtr spec_host)
{
  CharPtr cp, check_val;
  Int4    len = 0;

  if (!ShouldCheckSpecificHostValueForValidator (spec_host)) {
    return NULL;
  }

  /* <ctype.h> calls below cast to unsigned char because they are undefined
   * for negative char values */
  cp = spec_host;
  /* skip first word */
  while (*cp != 0 && !isspace ((unsigned char) *cp)) {
    cp++;
    len++;
  }
  while (isspace ((unsigned char) *cp)) {
    cp++;
    len++;
  }

  /* if next word is "hybrid" or "x" then want this word plus the third word */
  if (StringNCmp (cp, "hybrid ", 7) == 0) {
    cp += 7;
    len += 7;
  } else if (StringNCmp (cp, "x ", 2) == 0) {
    cp += 2;
    len += 2;
  }

  if (*cp != '(' && StringNCmp (cp, "sp.", 3) != 0 && *cp != 0) {
    /* collect second word */
    while (*cp != 0 && !isspace ((unsigned char) *cp)) {
      cp++;
      len++;
    }
  }

  /* never look past a semicolon */
  cp = StringChr (spec_host, ';');
  if (cp != NULL && cp - spec_host < len) {
    len = (Int4) (cp - spec_host);
  }

  check_val = (CharPtr) MemNew (sizeof (Char) * (len + 1));
  if (check_val == NULL) {
    return NULL;
  }
  StringNCpy (check_val, spec_host, len);
  check_val[len] = 0;
  TrimSpacesAroundString (check_val);
  return check_val;
}
2610 
ShouldCheckSpecificHostInBioSource(BioSourcePtr biop)2611 static Boolean ShouldCheckSpecificHostInBioSource (BioSourcePtr biop)
2612 {
2613   OrgModPtr mod;
2614   Boolean   rval = FALSE;
2615 
2616   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
2617     return FALSE;
2618   }
2619   for (mod = biop->org->orgname->mod; mod != NULL && !rval; mod = mod->next) {
2620     if (mod->subtype == ORGMOD_nat_host) {
2621       rval = ShouldCheckSpecificHostValueForValidator (mod->subname);
2622     }
2623   }
2624   return rval;
2625 }
2626 
2627 
2628 
/* Feature-visit callback: adds sfp (tagged OBJ_SEQFEAT) to the list passed
 * through userdata when it is a BioSource feature accepted by
 * ShouldCheckSpecificHostInBioSource.
 */
static void AddValidatorSpecificHostBioSourceFeatToList (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) userdata;

  if (sfp == NULL || collected == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;

  if (ShouldCheckSpecificHostInBioSource (sfp->data.value.ptrvalue))
  {
    ValNodeAddPointer (collected, OBJ_SEQFEAT, sfp);
  }
}
2638 
2639 
/* Descriptor-visit callback: adds sdp (tagged OBJ_SEQDESC) to the list
 * passed through userdata when it is a source descriptor accepted by
 * ShouldCheckSpecificHostInBioSource.
 */
static void AddValidatorSpecificHostBioSourceDescToList (SeqDescrPtr sdp, Pointer userdata)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) userdata;

  if (sdp == NULL || collected == NULL) return;
  if (sdp->choice != Seq_descr_source) return;

  if (ShouldCheckSpecificHostInBioSource (sdp->data.ptrvalue))
  {
    ValNodeAddPointer (collected, OBJ_SEQDESC, sdp);
  }
}
2649 
2650 
/* Builds and returns a ValNode list of the BioSource features and
 * descriptors in sep that the two AddValidatorSpecificHostBioSource*ToList
 * callbacks accept.  Features are visited first, then descriptors.
 */
static ValNodePtr GetValidatorSpecificHostBioSourceList (SeqEntryPtr sep)
{
  ValNodePtr list = NULL;

  VisitFeaturesInSep (sep, &list, AddValidatorSpecificHostBioSourceFeatToList);
  VisitDescriptorsInSep (sep, &list, AddValidatorSpecificHostBioSourceDescToList);
  return list;
}
2659 
2660 
2661 static void
FormatValidatorSpecificHostRequests(ValNodePtr spec_host_list,ValNodePtr PNTR request_list,ValNodePtr PNTR req_host_list)2662 FormatValidatorSpecificHostRequests
2663 (ValNodePtr spec_host_list,
2664  ValNodePtr PNTR request_list,
2665  ValNodePtr PNTR req_host_list)
2666 {
2667   ValNodePtr vnp;
2668   CharPtr    orig;
2669   OrgRefPtr  request_org;
2670 
2671   /* now format requests for unique specific_host values */
2672   for (vnp = spec_host_list; vnp != NULL; vnp = vnp->next)
2673   {
2674     orig = (CharPtr) vnp->data.ptrvalue;
2675     request_org = OrgRefNew();
2676     request_org->taxname = GetSpecificHostValueToCheckForValidator (orig);
2677     ValNodeAddPointer (request_list, 3, request_org);
2678     ValNodeAddPointer (req_host_list, 0, StringSave (orig));
2679   }
2680 }
2681 
MatchesSynonym(CharPtr txt,OrgRefPtr response_org)2682 static Boolean MatchesSynonym (CharPtr txt, OrgRefPtr response_org)
2683 {
2684   ValNodePtr syn;
2685   Boolean    rval = FALSE;
2686   if (StringHasNoText (txt) || response_org == NULL) return FALSE;
2687 
2688   for (syn = response_org->syn; syn != NULL && !rval; syn = syn->next)
2689   {
2690     if (StringCmp (txt, syn->data.ptrvalue) == 0)
2691     {
2692       rval = TRUE;
2693     }
2694   }
2695   return rval;
2696 }
2697 
2698 
MatchesGenBankSynonym(CharPtr txt,OrgRefPtr response_org)2699 static Boolean MatchesGenBankSynonym (CharPtr txt, OrgRefPtr response_org)
2700 {
2701   OrgModPtr mod;
2702   Boolean   rval = FALSE;
2703 
2704   if (StringHasNoText (txt) || response_org == NULL || response_org->orgname == NULL) return FALSE;
2705   mod = response_org->orgname->mod;
2706   while (mod != NULL)
2707   {
2708     if ((mod->subtype == ORGMOD_gb_synonym || mod->subtype == ORGMOD_old_name) && StringCmp (txt, mod->subname) == 0)
2709     {
2710       rval = TRUE;
2711     }
2712     mod = mod->next;
2713   }
2714   return rval;
2715 }
2716 
2717 
/* Collects the text of every nat_host modifier on the BioSources in
 * biop_list, passes each copy through AdjustSpecificHostForTaxServer, and
 * returns the values as a sorted ValNode list with duplicates removed.
 * The returned list owns its strings.
 */
static ValNodePtr GetListOfUniqueSpecificHostValues (ValNodePtr biop_list)
{
  ValNodePtr   item;
  ValNodePtr   hosts = NULL;
  BioSourcePtr biop;
  OrgModPtr    mod;
  CharPtr      host;

  /* get a list of unique specific_host values */
  for (item = biop_list; item != NULL; item = item->next)
  {
    if (item->data.ptrvalue == NULL) continue;

    biop = GetBioSourceFromValNode (item);
    if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) continue;

    for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next)
    {
      if (mod->subtype != ORGMOD_nat_host || StringHasNoText (mod->subname)) continue;

      host = StringSave (mod->subname);
      AdjustSpecificHostForTaxServer (host);
      ValNodeAddPointer (&hosts, 0, host);
    }
  }

  hosts = ValNodeSort (hosts, SortVnpByString);
  ValNodeUnique (&hosts, SortVnpByString, ValNodeFreeData);
  return hosts;
}
2749 
2750 
/* Looks for a case-insensitive match of str in org, checking in order: the
 * taxname, the common name, the syn list, and the subnames of gb_synonym /
 * old_name modifiers.  Returns the matching string as stored in org (not a
 * copy), or NULL when nothing matches, str has no text, or org is NULL.
 */
static CharPtr FindMatchInOrgRef (CharPtr str, OrgRefPtr org)
{
  ValNodePtr syn_vnp;
  OrgModPtr  mod;

  if (StringHasNoText (str) || org == NULL) return NULL;

  if (StringICmp (org->taxname, str) == 0) return org->taxname;
  if (StringICmp (org->common, str) == 0) return org->common;

  for (syn_vnp = org->syn; syn_vnp != NULL; syn_vnp = syn_vnp->next) {
    if (StringICmp (str, (CharPtr) (syn_vnp->data.ptrvalue)) == 0) {
      return (CharPtr) (syn_vnp->data.ptrvalue);
    }
  }

  if (org->orgname == NULL) return NULL;

  for (mod = org->orgname->mod; mod != NULL; mod = mod->next) {
    if (mod->subtype != ORGMOD_gb_synonym && mod->subtype != ORGMOD_old_name) continue;
    if (StringICmp (str, mod->subname) == 0) {
      return mod->subname;
    }
  }
  return NULL;
}
2780 
2781 
2782 /* Want to check that specific host names are valid */
2783 NLM_EXTERN void
Taxon3ValidateSpecificHostsInSeqEntry(SeqEntryPtr sep,ValNodePtr PNTR misspelled_list,ValNodePtr PNTR bad_caps_list,ValNodePtr PNTR ambiguous_list,ValNodePtr PNTR unrecognized_list)2784 Taxon3ValidateSpecificHostsInSeqEntry
2785 (SeqEntryPtr sep,
2786  ValNodePtr PNTR misspelled_list,
2787  ValNodePtr PNTR bad_caps_list,
2788  ValNodePtr PNTR ambiguous_list,
2789  ValNodePtr PNTR unrecognized_list)
2790 {
2791   ValNodePtr   biop_list = NULL;
2792   ValNodePtr   req_host_list = NULL, spec_host_list = NULL;
2793   ValNodePtr   request_list = NULL;
2794   ValNodePtr   response_list = NULL;
2795   ValNodePtr   response_vnp, request_vnp;
2796   ValNodePtr   check_list, check_vnp;
2797   OrgRefPtr    request_org, response_org;
2798   SpecificHostCheckPtr p;
2799   Boolean              has_match;
2800   ErrSev               level;
2801   Boolean              misspelled_flag;
2802   Boolean              bad_caps_flag;
2803   Boolean              ambiguous_flag;
2804   CharPtr              match;
2805 
2806   biop_list = GetValidatorSpecificHostBioSourceList (sep);
2807 
2808   /* get a list of unique specific_host values */
2809   spec_host_list = GetListOfUniqueSpecificHostValues (biop_list);
2810 
2811   /* now format requests for unique specific_host values */
2812   FormatValidatorSpecificHostRequests (spec_host_list, &request_list, &req_host_list);
2813 
2814   spec_host_list = ValNodeFreeData (spec_host_list);
2815 
2816   level = ErrSetMessageLevel (SEV_MAX);
2817   response_list = Taxon3GetOrgRefList (request_list);
2818   ErrSetMessageLevel (level);
2819 
2820   if (ValNodeLen (response_list) != ValNodeLen (request_list))
2821   {
2822     Message (MSG_POST, "Unable to retrieve information from tax server");
2823   }
2824   else
2825   {
2826     /* resort requests so that we can check all responses for the same BioSource together */
2827     check_list = SortSpecificHostOrgs (req_host_list, request_list, response_list);
2828     AddBioSourcesToSpecificHostChecklist (biop_list, check_list);
2829 
2830     /* now look at responses */
2831     check_vnp = check_list;
2832     while (check_vnp != NULL)
2833     {
2834       p = (SpecificHostCheckPtr) check_vnp->data.ptrvalue;
2835       if (p != NULL)
2836       {
2837         has_match = FALSE;
2838         misspelled_flag = FALSE;
2839         bad_caps_flag = FALSE;
2840         ambiguous_flag = FALSE;
2841 
2842         request_vnp = p->request_list;
2843         response_vnp = p->response_list;
2844         while (!has_match && request_vnp != NULL && response_vnp != NULL)
2845         {
2846           request_org = (OrgRefPtr) request_vnp->data.ptrvalue;
2847           response_org = (OrgRefPtr) response_vnp->data.ptrvalue;
2848           if (response_vnp->choice & eReturnedOrgFlag_misspelled)
2849           {
2850             misspelled_flag = TRUE;
2851           }
2852           else if (response_vnp->choice & eReturnedOrgFlag_ambiguous)
2853           {
2854             ambiguous_flag = TRUE;
2855           }
2856           else
2857           {
2858             match = FindMatchInOrgRef (request_org->taxname, response_org);
2859             if (StringCmp (match, request_org->taxname) == 0)
2860             {
2861               has_match = TRUE;
2862             }
2863             else if (StringICmp (match, request_org->taxname) == 0)
2864             {
2865               if (response_vnp->choice & eReturnedOrgFlag_common_name) {
2866                 has_match = TRUE;
2867               } else {
2868                 bad_caps_flag = TRUE;
2869               }
2870             }
2871           }
2872           request_vnp = request_vnp->next;
2873           response_vnp = response_vnp->next;
2874         }
2875         if (!has_match)
2876         {
2877           /* add to the list of bad */
2878           if (misspelled_flag) {
2879             if (misspelled_list != NULL) {
2880               ValNodeLink (misspelled_list, p->biop_list);
2881               p->biop_list = NULL;
2882             }
2883           } else if (bad_caps_flag) {
2884             if (bad_caps_list != NULL) {
2885               ValNodeLink (bad_caps_list, p->biop_list);
2886               p->biop_list = NULL;
2887             }
2888           } else if (ambiguous_flag) {
2889             if (ambiguous_list != NULL) {
2890               ValNodeLink (ambiguous_list, p->biop_list);
2891               p->biop_list = NULL;
2892             }
2893           } else {
2894             if (unrecognized_list != NULL) {
2895               ValNodeLink (unrecognized_list, p->biop_list);
2896               p->biop_list = NULL;
2897             }
2898           }
2899         }
2900       }
2901       check_vnp = check_vnp->next;
2902     }
2903     check_list = SpecificHostCheckListFree (check_list);
2904   }
2905 
2906   biop_list = ValNodeFree (biop_list);
2907   request_list = FreeOrgRefValNodeList (request_list);
2908   response_list = FreeOrgRefValNodeList (response_list);
2909   req_host_list = ValNodeFreeData (req_host_list);
2910 }
2911 
2912 
/* Userdata for the AddSpecificHostBioSource*ToList visit callbacks: the
 * list being built plus the options that control which specific-host
 * strings qualify (see ShouldCheckSpecificHostString).
 */
typedef struct spechostgather {
  ValNodePtr list;
  Boolean    caps; /* if true, check only when first letter of first word is capitalized */
  Boolean    paren; /* if true, check portion inside parentheses as separate string */
} SpecHostGatherData, PNTR SpecHostGatherPtr;
2918 
2919 
ShouldCheckSpecificHostString(CharPtr str,SpecHostGatherPtr p)2920 static Boolean ShouldCheckSpecificHostString (CharPtr str, SpecHostGatherPtr p)
2921 {
2922   CharPtr cp_start;
2923   Boolean rval = FALSE;
2924 
2925   if (StringHasNoText (str) || p == NULL) {
2926     return FALSE;
2927   }
2928 
2929   if (!p->caps) {
2930     rval = TRUE;
2931   } else if (isupper (*str)) {
2932     rval = TRUE;
2933   } else if (p->paren) {
2934     cp_start = StringChr (str, '(');
2935     if (cp_start != NULL && ShouldCheckSpecificHostString (cp_start + 1, p)) {
2936       rval = TRUE;
2937     }
2938   }
2939   return rval;
2940 }
2941 
2942 
HasSpecificHostToBeChecked(BioSourcePtr biop,SpecHostGatherPtr p)2943 static Boolean HasSpecificHostToBeChecked (BioSourcePtr biop, SpecHostGatherPtr p)
2944 {
2945   OrgModPtr mod;
2946   Boolean   rval = FALSE;
2947 
2948   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL || p == NULL) return FALSE;
2949 
2950   for (mod = biop->org->orgname->mod; mod != NULL && !rval; mod = mod->next) {
2951     if (mod->subtype == ORGMOD_nat_host && ShouldCheckSpecificHostString (mod->subname, p)) {
2952       rval = TRUE;
2953     }
2954   }
2955   return rval;
2956 }
2957 
2958 
/* Feature-visit callback: userdata is a SpecHostGatherPtr.  Adds sfp (tagged
 * OBJ_SEQFEAT) to its list when sfp is a BioSource feature with a specific
 * host that should be checked.
 */
static void AddSpecificHostBioSourceFeatToList (SeqFeatPtr sfp, Pointer userdata)
{
  SpecHostGatherPtr gather = (SpecHostGatherPtr) userdata;

  if (sfp == NULL || gather == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;

  if (HasSpecificHostToBeChecked (sfp->data.value.ptrvalue, gather))
  {
    ValNodeAddPointer (&(gather->list), OBJ_SEQFEAT, sfp);
  }
}
2971 
2972 
/* Descriptor-visit callback: userdata is a SpecHostGatherPtr.  Adds sdp
 * (tagged OBJ_SEQDESC) to its list when sdp is a source descriptor with a
 * specific host that should be checked.
 */
static void AddSpecificHostBioSourceDescToList (SeqDescrPtr sdp, Pointer userdata)
{
  SpecHostGatherPtr gather = (SpecHostGatherPtr) userdata;

  if (sdp == NULL || gather == NULL) return;
  if (sdp->choice != Seq_descr_source) return;

  if (HasSpecificHostToBeChecked (sdp->data.ptrvalue, gather))
  {
    ValNodeAddPointer (&(gather->list), OBJ_SEQDESC, sdp);
  }
}
2985 
2986 
/* Builds and returns a ValNode list of the BioSource features and
 * descriptors in sep that have a specific host to be checked under the
 * given caps / paren options (see SpecHostGatherData).  Features are
 * visited first, then descriptors.
 */
static ValNodePtr GetSpecificHostBioSourceList (SeqEntryPtr sep, Boolean caps, Boolean paren)
{
  SpecHostGatherData   d;

  d.caps = caps;
  d.paren = paren;
  d.list = NULL;
  VisitFeaturesInSep (sep, &d, AddSpecificHostBioSourceFeatToList);
  VisitDescriptorsInSep (sep, &d, AddSpecificHostBioSourceDescToList);
  return d.list;
}
2998 
2999 
3000 static void
FormatSpecificHostRequests(ValNodePtr spec_host_list,ValNodePtr PNTR request_list,ValNodePtr PNTR req_host_list,Boolean caps,Boolean paren)3001 FormatSpecificHostRequests
3002 (ValNodePtr spec_host_list,
3003  ValNodePtr PNTR request_list,
3004  ValNodePtr PNTR req_host_list,
3005  Boolean caps,
3006  Boolean paren)
3007 {
3008   ValNodePtr vnp;
3009   CharPtr    orig, cp, str, cp2 = NULL;
3010 
3011   /* now format requests for unique specific_host values */
3012   for (vnp = spec_host_list; vnp != NULL; vnp = vnp->next)
3013   {
3014     orig = (CharPtr) vnp->data.ptrvalue;
3015     /* if we have a value in parentheses, submit it separately */
3016     cp = StringChr (orig, '(');
3017     if (cp != NULL)
3018     {
3019       cp2 = StringChr (cp, ')');
3020     }
3021     if (cp != NULL && cp2 != NULL
3022         && ((cp > orig && orig[StringLen (orig) - 1] == ')') /* ends with paren */
3023             || (cp == orig))) /* starts with paren */
3024     {
3025       if (cp > orig && orig[StringLen (orig) - 1] == ')')
3026       {
3027         str = StringSave (orig);
3028         /* remove trailing parenthesis */
3029         str [StringLen(str) - 1] = 0;
3030 
3031         cp = str + (cp - orig);
3032 
3033         /* remove opening parenthesis */
3034         *cp = 0;
3035         cp++;
3036       }
3037       else
3038       {
3039         str = StringSave (orig);
3040         /* remove leading parenthesis */
3041         str[0] = ' ';
3042         cp = str + (cp2 - orig);
3043         /* remove trailing parenthesis */
3044         *cp = 0;
3045         cp++;
3046       }
3047       TrimSpacesAroundString (cp);
3048       TrimSpacesAroundString (str);
3049       if (paren && (!caps || isupper (*cp))) {
3050         AddRequestOrgForString (cp, orig, request_list, req_host_list);
3051       }
3052       if (!caps || isupper (*str)) {
3053         AddRequestOrgForString (str, orig, request_list, req_host_list);
3054       }
3055     }
3056     else
3057     {
3058       if (!caps || isupper (*orig)) {
3059         AddRequestOrgForString (orig, orig, request_list, req_host_list);
3060       }
3061     }
3062   }
3063 }
3064 
3065 
3066 typedef struct replacementpair {
3067   CharPtr find;
3068   CharPtr repl;
3069 } ReplacementPairData, PNTR ReplacementPairPtr;
3070 
ReplacementPairNew(CharPtr find,CharPtr repl)3071 static ReplacementPairPtr ReplacementPairNew (CharPtr find, CharPtr repl)
3072 {
3073   ReplacementPairPtr r;
3074 
3075   r = (ReplacementPairPtr) MemNew (sizeof (ReplacementPairData));
3076   r->find = StringSave (find);
3077   r->repl = StringSave (repl);
3078   return r;
3079 }
3080 
/* Releases a ReplacementPairData together with both of its strings.
 * Passing NULL is allowed.  Returns the result of the final MemFree so the
 * caller can assign it back to the pointer it just released.
 */
static ReplacementPairPtr ReplacementPairFree (ReplacementPairPtr r)
{
  if (r == NULL) {
    return NULL;
  }

  r->find = MemFree (r->find);
  r->repl = MemFree (r->repl);
  return (ReplacementPairPtr) MemFree (r);
}
3090 
/* Releases a ValNode list whose data.ptrvalue entries are ReplacementPairPtr,
 * freeing each pair and each node.  Always returns NULL.
 */
static ValNodePtr ReplacementPairListFree (ValNodePtr list)
{
  ValNodePtr node, following;

  for (node = list; node != NULL; node = following) {
    following = node->next;
    /* detach the node before handing it to ValNodeFree */
    node->next = NULL;
    node->data.ptrvalue = ReplacementPairFree (node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
3104 
3105 
3106 static SpecificHostFixPtr
SpecificHostFixNew(ValNodePtr feat_or_desc,CharPtr bad_host,CharPtr old_taxname,CharPtr new_taxname,Uint1 fix_type)3107 SpecificHostFixNew
3108 (ValNodePtr feat_or_desc,
3109  CharPtr    bad_host,
3110  CharPtr    old_taxname,
3111  CharPtr    new_taxname,
3112  Uint1      fix_type)
3113 {
3114   SpecificHostFixPtr s;
3115 
3116   s = (SpecificHostFixPtr) MemNew (sizeof (SpecificHostFixData));
3117   if (feat_or_desc != NULL)
3118   {
3119     s->feat_or_desc = ValNodeNew(NULL);
3120     s->feat_or_desc->choice = feat_or_desc->choice;
3121     s->feat_or_desc->data.ptrvalue = feat_or_desc->data.ptrvalue;
3122   }
3123   s->bad_specific_host = StringSave (bad_host);
3124   s->old_taxname = StringSave (old_taxname);
3125   s->new_taxname = StringSave (new_taxname);
3126   s->fix_type = fix_type;
3127   return s;
3128 }
3129 
3130 
/* Releases a SpecificHostFixData: its feat_or_desc node (the object that node
 * points at is left alone), its three strings, and the structure itself.
 * Passing NULL is allowed.
 */
static SpecificHostFixPtr SpecificHostFixFree (SpecificHostFixPtr s)
{
  if (s == NULL)
  {
    return NULL;
  }

  s->feat_or_desc = ValNodeFree (s->feat_or_desc);
  s->bad_specific_host = MemFree (s->bad_specific_host);
  s->old_taxname = MemFree (s->old_taxname);
  s->new_taxname = MemFree (s->new_taxname);

  return (SpecificHostFixPtr) MemFree (s);
}
3143 
3144 
/* Releases a ValNode list whose data.ptrvalue entries are SpecificHostFixPtr,
 * freeing each fix and each node.  Always returns NULL.
 */
extern ValNodePtr SpecificHostFixListFree (ValNodePtr vnp)
{
  ValNodePtr node, following;

  for (node = vnp; node != NULL; node = following)
  {
    following = node->next;
    /* detach the node before handing it to ValNodeFree */
    node->next = NULL;
    node->data.ptrvalue = SpecificHostFixFree (node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
3159 
3160 
GetFixesForOneSpecificHostValue(SpecificHostCheckPtr p)3161 static ValNodePtr GetFixesForOneSpecificHostValue (SpecificHostCheckPtr p)
3162 {
3163   CharPtr      prev_success = NULL, new_val, prev_fail = NULL;
3164   Boolean      fix_needed = FALSE;
3165   ValNodePtr   suggested_fixes = NULL;
3166   OrgRefPtr    request_org, response_org;
3167   ValNodePtr   biop_vnp, response_vnp, request_vnp, vnp;
3168   SpecificHostFixPtr s;
3169   ValNodePtr         fix_list = NULL;
3170   ReplacementPairPtr r;
3171   Uint1              fix_type;
3172   Boolean            add_nontrunc_fix;
3173   Boolean            ambiguous = FALSE;
3174 
3175   if (p == NULL) return NULL;
3176 
3177   request_vnp = p->request_list;
3178   response_vnp = p->response_list;
3179 
3180   while (request_vnp != NULL && response_vnp != NULL)
3181   {
3182     request_org = (OrgRefPtr) request_vnp->data.ptrvalue;
3183     response_org = (OrgRefPtr) response_vnp->data.ptrvalue;
3184     if (prev_success != NULL
3185         && StringNCmp (request_org->taxname, prev_success, StringLen (request_org->taxname)) == 0) {
3186       /* we don't need to check this one */
3187     } else if (response_org == NULL) {
3188       fix_needed = TRUE;
3189       if (response_vnp->choice & eReturnedOrgFlag_ambiguous) {
3190         ambiguous = TRUE;
3191       }
3192       if (prev_fail == NULL) {
3193         prev_fail = request_org->taxname;
3194       } else if (StringNCmp (prev_fail, request_org->taxname, StringLen (request_org->taxname)) != 0) {
3195         if (response_vnp->choice & eReturnedOrgFlag_ambiguous) {
3196           ValNodeAddPointer (&suggested_fixes, eSpecificHostFix_ambiguous, ReplacementPairNew (request_org->taxname, NULL));
3197         } else {
3198           ValNodeAddPointer (&suggested_fixes, eSpecificHostFix_unrecognized, ReplacementPairNew (request_org->taxname, NULL));
3199         }
3200         prev_fail = request_org->taxname;
3201       }
3202     } else {
3203       prev_success = request_org->taxname;
3204       add_nontrunc_fix = FALSE;
3205       if (response_vnp->choice & eReturnedOrgFlag_misspelled) {
3206         fix_needed = TRUE;
3207         fix_type = eSpecificHostFix_spelling;
3208         new_val = response_org->taxname;
3209         add_nontrunc_fix = TRUE;
3210       } else {
3211         new_val = FindMatchInOrgRef (request_org->taxname, response_org);
3212         if (new_val == NULL) {
3213           fix_needed = TRUE;
3214           fix_type = eSpecificHostFix_replacement;
3215           new_val = response_org->taxname;
3216           add_nontrunc_fix = TRUE;
3217         } else if (StringCmp (new_val, request_org->taxname) != 0) {
3218           fix_needed = TRUE;
3219           fix_type = eSpecificHostFix_capitalization;
3220           add_nontrunc_fix = TRUE;
3221         }
3222       }
3223 
3224       /* add fix to truncate and correct spelling and capitalization first */
3225       /* this way the truncation won't fail when it looks for the old version that's already been corrected */
3226       if (prev_fail != NULL) {
3227         if (StringNCmp (prev_fail, request_org->taxname, StringLen (request_org->taxname)) == 0) {
3228           if (new_val != NULL) {
3229             ValNodeAddPointer (&suggested_fixes, eSpecificHostFix_truncation, ReplacementPairNew (prev_fail, new_val));
3230             fix_needed = TRUE;
3231           }
3232         } else {
3233           ValNodeAddPointer (&suggested_fixes, eSpecificHostFix_unrecognized, ReplacementPairNew (prev_fail, NULL));
3234         }
3235       }
3236       /* add fix for just spelling and capitalization after */
3237       if (add_nontrunc_fix) {
3238         ValNodeAddPointer (&suggested_fixes, fix_type, ReplacementPairNew (request_org->taxname, new_val));
3239       }
3240 
3241       prev_fail = NULL;
3242     }
3243     request_vnp = request_vnp->next;
3244     response_vnp = response_vnp->next;
3245   }
3246 
3247   if (fix_needed) {
3248     for (biop_vnp = p->biop_list; biop_vnp != NULL; biop_vnp = biop_vnp->next) {
3249       if (suggested_fixes == NULL) {
3250         s = SpecificHostFixNew (biop_vnp, p->spec_host, p->spec_host, NULL, ambiguous ? eSpecificHostFix_ambiguous : eSpecificHostFix_unrecognized);
3251         ValNodeAddPointer (&fix_list, 0, s);
3252       } else {
3253         for (vnp = suggested_fixes; vnp != NULL; vnp = vnp->next) {
3254           r = (ReplacementPairPtr) vnp->data.ptrvalue;
3255           s = SpecificHostFixNew (biop_vnp, p->spec_host, r->find, r->repl, vnp->choice);
3256           ValNodeAddPointer (&fix_list, 0, s);
3257         }
3258       }
3259     }
3260   }
3261   suggested_fixes = ReplacementPairListFree (suggested_fixes);
3262   return fix_list;
3263 }
3264 
3265 
Taxon3GetSpecificHostFixesInSeqEntry(SeqEntryPtr sep,Boolean caps,Boolean paren)3266 NLM_EXTERN ValNodePtr Taxon3GetSpecificHostFixesInSeqEntry (SeqEntryPtr sep, Boolean caps, Boolean paren)
3267 {
3268   ValNodePtr   biop_list = NULL;
3269   ValNodePtr   req_host_list = NULL, spec_host_list = NULL;
3270   ValNodePtr   request_list = NULL;
3271   ValNodePtr   response_list = NULL;
3272   ValNodePtr   check_list, check_vnp;
3273   SpecificHostCheckPtr p;
3274   ErrSev               level;
3275   ValNodePtr           fix_list = NULL;
3276 
3277   biop_list = GetSpecificHostBioSourceList (sep, caps, paren);
3278 
3279   /* get a list of unique specific_host values */
3280   spec_host_list = GetListOfUniqueSpecificHostValues (biop_list);
3281 
3282   /* now format requests for unique specific_host values */
3283   FormatSpecificHostRequests (spec_host_list, &request_list, &req_host_list, caps, paren);
3284 
3285   spec_host_list = ValNodeFreeData (spec_host_list);
3286 
3287   level = ErrSetMessageLevel (SEV_MAX);
3288   response_list = Taxon3GetOrgRefList (request_list);
3289   ErrSetMessageLevel (level);
3290 
3291   if (ValNodeLen (response_list) != ValNodeLen (request_list))
3292   {
3293     Message (MSG_POST, "Unable to retrieve information from tax server");
3294   }
3295   else
3296   {
3297     /* resort requests so that we can check all responses for the same BioSource together */
3298     check_list = SortSpecificHostOrgs (req_host_list, request_list, response_list);
3299     AddBioSourcesToSpecificHostChecklist (biop_list, check_list);
3300 
3301     /* now look at responses */
3302     check_vnp = check_list;
3303     while (check_vnp != NULL)
3304     {
3305       p = (SpecificHostCheckPtr) check_vnp->data.ptrvalue;
3306       ValNodeLink (&fix_list, GetFixesForOneSpecificHostValue (p));
3307       check_vnp = check_vnp->next;
3308     }
3309     check_list = SpecificHostCheckListFree (check_list);
3310   }
3311 
3312   biop_list = ValNodeFree (biop_list);
3313   request_list = FreeOrgRefValNodeList (request_list);
3314   response_list = FreeOrgRefValNodeList (response_list);
3315   req_host_list = ValNodeFreeData (req_host_list);
3316 
3317   return fix_list;
3318 }
3319 
3320 
ApplyOneSpecificHostFix(SpecificHostFixPtr s)3321 extern Boolean ApplyOneSpecificHostFix (SpecificHostFixPtr s)
3322 {
3323   BioSourcePtr biop = NULL;
3324   Boolean      rval = FALSE;
3325   CharPtr      new_spec_host;
3326   ValNode      vn;
3327 
3328   if (s == NULL || s->feat_or_desc == NULL
3329       || StringHasNoText (s->bad_specific_host)
3330       || StringHasNoText (s->old_taxname)
3331       || StringHasNoText (s->new_taxname)) {
3332     return rval;
3333   }
3334   biop = GetBioSourceFromValNode (s->feat_or_desc);
3335   if (biop == NULL) return rval;
3336 
3337   vn.choice = SourceQualChoice_textqual;
3338   vn.data.intvalue = Source_qual_nat_host;
3339   vn.next = NULL;
3340 
3341   new_spec_host = GetSourceQualFromBioSource (biop, &vn, NULL);
3342   FindReplaceString (&new_spec_host, s->old_taxname, s->new_taxname, TRUE, TRUE);
3343   if (StringCmp (new_spec_host, s->bad_specific_host) != 0)
3344   {
3345     rval = SetSourceQualInBioSource (biop, &vn, NULL, new_spec_host, ExistingTextOption_replace_old);
3346   }
3347   new_spec_host = MemFree (new_spec_host);
3348   return rval;
3349 }
3350 
/* Feature callback: appends every BioSource feature to the ValNode list whose
 * address is passed in userdata, tagging the node OBJ_SEQFEAT.
 */
static void AddBioSourceFeatToList (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR list_ptr = (ValNodePtr PNTR) userdata;

  if (sfp == NULL || list_ptr == NULL) {
    return;
  }
  if (sfp->data.choice == SEQFEAT_BIOSRC) {
    ValNodeAddPointer (list_ptr, OBJ_SEQFEAT, sfp);
  }
}
3357 
3358 
/* Descriptor callback: appends every source descriptor to the ValNode list
 * whose address is passed in userdata, tagging the node OBJ_SEQDESC.
 */
static void AddBioSourceDescToList (SeqDescrPtr sdp, Pointer userdata)
{
  ValNodePtr PNTR list_ptr = (ValNodePtr PNTR) userdata;

  if (sdp == NULL || list_ptr == NULL) {
    return;
  }
  if (sdp->choice == Seq_descr_source) {
    ValNodeAddPointer (list_ptr, OBJ_SEQDESC, sdp);
  }
}
3366 
3367 
GetBioSourceList(SeqEntryPtr sep)3368 static ValNodePtr GetBioSourceList (SeqEntryPtr sep)
3369 {
3370   ValNodePtr list = NULL;
3371 
3372   VisitFeaturesInSep (sep, &list, AddBioSourceFeatToList);
3373   VisitDescriptorsInSep (sep, &list, AddBioSourceDescToList);
3374   return list;
3375 }
3376 
3377 
/* Builds a list of the distinct, non-blank organism taxnames carried by the
 * BioSources in biop_list.
 *
 * The returned nodes point at the taxname strings owned by the OrgRefs; they
 * are not copies, so the list must be released without freeing its data.
 */
static ValNodePtr GetListOfOrganismNames (ValNodePtr biop_list)
{
  ValNodePtr   node;
  ValNodePtr   names = NULL;
  BioSourcePtr source;
  CharPtr      taxname;

  for (node = biop_list; node != NULL; node = node->next)
  {
    if (node->data.ptrvalue == NULL) {
      continue;
    }
    source = GetBioSourceFromValNode (node);
    if (source == NULL || source->org == NULL) {
      continue;
    }
    taxname = source->org->taxname;
    if (StringHasNoText (taxname)) {
      continue;
    }
    /* keep only the first occurrence of each name */
    if (StringAlreadyInValNodeList (taxname, names)) {
      continue;
    }
    ValNodeAddPointer (&names, 0, taxname);
  }
  return names;
}
3397 
3398 
/* Attaches each BioSource in biop_list to the check_list entry whose
 * spec_host string equals the BioSource's organism taxname.
 *
 * The search for each BioSource resumes at the entry where the previous
 * search stopped and, if nothing is found from there to the end, wraps
 * around to cover the entries before that point.  A check_list node with a
 * NULL payload ends the scan it is encountered in.  BioSources without an
 * org or orgname, and those for which no entry matches, are left out.
 */
static void AddBioSourcesToChecklist (ValNodePtr biop_list, ValNodePtr check_list)
{
  ValNodePtr           src_node;
  ValNodePtr           cursor = NULL;   /* resume point, kept between BioSources */
  ValNodePtr           wrap_limit;
  BioSourcePtr         source;
  SpecificHostCheckPtr candidate, match;
  CharPtr              taxname;

  if (biop_list == NULL || check_list == NULL) {
    return;
  }

  for (src_node = biop_list; src_node != NULL; src_node = src_node->next)
  {
    source = GetBioSourceFromValNode (src_node);
    if (source == NULL || source->org == NULL || source->org->orgname == NULL) {
      continue;
    }
    taxname = source->org->taxname;

    if (cursor == NULL) {
      /* start from the top; there is nothing before it to wrap around to */
      cursor = check_list;
      wrap_limit = NULL;
    } else {
      wrap_limit = cursor;
    }

    /* forward scan from the resume point */
    match = NULL;
    while (cursor != NULL) {
      candidate = (SpecificHostCheckPtr) cursor->data.ptrvalue;
      if (candidate == NULL) {
        break;
      }
      if (StringCmp (candidate->spec_host, taxname) == 0) {
        match = candidate;
        break;
      }
      cursor = cursor->next;
    }

    /* wrap-around scan over the entries that precede the resume point */
    if (match == NULL && wrap_limit != NULL) {
      cursor = check_list;
      while (cursor != wrap_limit) {
        candidate = (SpecificHostCheckPtr) cursor->data.ptrvalue;
        if (candidate == NULL) {
          break;
        }
        if (StringCmp (candidate->spec_host, taxname) == 0) {
          match = candidate;
          break;
        }
        cursor = cursor->next;
      }
    }

    if (match != NULL) {
      ValNodeAddPointer (&(match->biop_list), src_node->choice, src_node->data.ptrvalue);
    }
  }
}
3449 
3450 
/* Returns a new list holding the entries of biop_list (BioSource features or
 * source descriptors) whose organism taxname is exactly taxname.
 *
 * Each returned node repeats the choice and pointer of the matching input
 * node.  Returns NULL when taxname is blank, biop_list is NULL, or nothing
 * matches.
 */
static ValNodePtr GetBioSourcesWithTaxName (CharPtr taxname, ValNodePtr biop_list)
{
  ValNodePtr   node;
  ValNodePtr   matches = NULL;
  SeqFeatPtr   feat;
  SeqDescrPtr  desc;
  BioSourcePtr source;

  if (biop_list == NULL || StringHasNoText (taxname)) {
    return NULL;
  }

  for (node = biop_list; node != NULL; node = node->next) {
    /* dig the BioSource out of whichever container this node refers to */
    source = NULL;
    switch (node->choice) {
      case OBJ_SEQFEAT :
        feat = (SeqFeatPtr) node->data.ptrvalue;
        if (feat != NULL && feat->data.choice == SEQFEAT_BIOSRC) {
          source = (BioSourcePtr) feat->data.value.ptrvalue;
        }
        break;
      case OBJ_SEQDESC :
        desc = (SeqDescrPtr) node->data.ptrvalue;
        if (desc != NULL && desc->choice == Seq_descr_source) {
          source = (BioSourcePtr) desc->data.ptrvalue;
        }
        break;
      default :
        break;
    }

    if (source == NULL || source->org == NULL) {
      continue;
    }
    if (StringCmp (taxname, source->org->taxname) == 0) {
      ValNodeAddPointer (&matches, node->choice, node->data.ptrvalue);
    }
  }
  return matches;
}
3479 
3480 
GetOrganismTaxLookupFailuresInSeqEntry(SeqEntryPtr sep)3481 NLM_EXTERN ValNodePtr GetOrganismTaxLookupFailuresInSeqEntry (SeqEntryPtr sep)
3482 {
3483   ValNodePtr   biop_list = NULL;
3484   ValNodePtr   unique_list = NULL;
3485   ValNodePtr   request_list = NULL;
3486   ValNodePtr   response_list = NULL;
3487   ValNodePtr   req_vnp, resp_vnp;
3488   ErrSev               level;
3489   ValNodePtr           failed_list = NULL, vnp;
3490   OrgRefPtr            request_org;
3491 
3492   biop_list = GetBioSourceList (sep);
3493 
3494   /* get a list of unique specific_host values */
3495   unique_list = GetListOfOrganismNames (biop_list);
3496 
3497   /* now format requests for unique taxname values */
3498   for (vnp = unique_list; vnp != NULL; vnp = vnp->next)
3499   {
3500     request_org = OrgRefNew();
3501     request_org->taxname = StringSave (vnp->data.ptrvalue);
3502     ValNodeAddPointer (&request_list, 3, request_org);
3503   }
3504 
3505   unique_list = ValNodeFree (unique_list);
3506 
3507   level = ErrSetMessageLevel (SEV_MAX);
3508   response_list = Taxon3GetOrgRefList (request_list);
3509   ErrSetMessageLevel (level);
3510 
3511   if (ValNodeLen (response_list) != ValNodeLen (request_list))
3512   {
3513     Message (MSG_POST, "Unable to retrieve information from tax server");
3514   }
3515   else
3516   {
3517     for (req_vnp = request_list, resp_vnp = response_list;
3518          req_vnp != NULL && resp_vnp != NULL;
3519          req_vnp = req_vnp->next, resp_vnp = resp_vnp->next)
3520     {
3521       if (resp_vnp->data.ptrvalue == NULL)
3522       {
3523         request_org = (OrgRefPtr) req_vnp->data.ptrvalue;
3524         vnp = GetBioSourcesWithTaxName (request_org->taxname, biop_list);
3525         if (vnp != NULL) {
3526           ValNodeAddPointer (&failed_list, 0, StringSave (request_org->taxname));
3527           ValNodeLink (&failed_list, vnp);
3528         }
3529       }
3530     }
3531   }
3532 
3533   biop_list = ValNodeFree (biop_list);
3534   request_list = FreeOrgRefValNodeList (request_list);
3535   response_list = FreeOrgRefValNodeList (response_list);
3536 
3537   return failed_list;
3538 }
3539 
3540 
/* BioSource callback: appends the positive integer id of every "taxon" db
 * cross-reference on the BioSource's OrgRef to the ValNode list whose address
 * is passed in data.
 *
 * Cross-references with a NULL Dbtag or a NULL tag are skipped rather than
 * dereferenced.
 */
static void CollectTaxIds (BioSourcePtr biop, Pointer data)
{
  ValNodePtr vnp;
  DbtagPtr   dbtag;

  if (biop == NULL || biop->org == NULL || data == NULL) {
    return;
  }
  for (vnp = biop->org->db; vnp != NULL; vnp = vnp->next) {
    dbtag = (DbtagPtr) vnp->data.ptrvalue;
    if (dbtag == NULL || dbtag->tag == NULL) {
      /* incomplete cross-reference: there is no id to read */
      continue;
    }
    if (StringCmp ("taxon", dbtag->db) == 0 && dbtag->tag->id > 0) {
      ValNodeAddInt ((ValNodePtr PNTR) data, 0, dbtag->tag->id);
    }
  }
}
3556 
3557 
GetCommonOrgRefForSeqEntry(SeqEntryPtr sep)3558 NLM_EXTERN OrgRefPtr GetCommonOrgRefForSeqEntry (SeqEntryPtr sep)
3559 {
3560   ValNodePtr       list = NULL;
3561   Taxon3RequestPtr t3rq;
3562   T3ReplyPtr       trp;
3563   Taxon3ReplyPtr   t3ry;
3564   T3DataPtr        tdp;
3565   T3ErrorPtr       tep;
3566   OrgRefPtr        org = NULL;
3567 
3568   VisitBioSourcesInSep (sep, &list, CollectTaxIds);
3569   if (list == NULL) {
3570     ErrPostEx (SEV_ERROR, 0, 0, "No tax IDs found - cannot create PopSet Title");
3571     return NULL;
3572   }
3573   ValNodeUnique (&list, SortByIntvalue, ValNodeFree);
3574 
3575   t3rq = CreateJoinRequest (list);
3576   list = ValNodeFree (list);
3577 
3578   t3ry = Tax3SynchronousQuery (t3rq);
3579   Taxon3RequestFree (t3rq);
3580   if (t3ry != NULL) {
3581     for (trp = t3ry->reply; trp != NULL; trp = trp->next) {
3582       switch (trp->choice) {
3583         case T3Reply_error :
3584           tep = (T3ErrorPtr) trp->data.ptrvalue;
3585           if (tep != NULL) {
3586             ErrPostEx (SEV_ERROR, 0, 0, tep->message);
3587           }
3588           break;
3589         case T3Reply_data :
3590           tdp = (T3DataPtr) trp->data.ptrvalue;
3591           if (tdp != NULL) {
3592             org = (OrgRefPtr)(tdp->org);
3593             tdp->org = NULL;
3594           }
3595           break;
3596         default :
3597           break;
3598       }
3599     }
3600     Taxon3ReplyFree (t3ry);
3601   }
3602   if (org == NULL) {
3603     org = OrgRefNew ();
3604     org->taxname = StringSave ("Mixed organisms");
3605   }
3606   return org;
3607 }
3608 
3609 
SeqDescrFromBioSample(CharPtr number)3610 NLM_EXTERN SeqDescrPtr SeqDescrFromBioSample (CharPtr number)
3611 
3612 {
3613   SeqDescrPtr  sdp = NULL;
3614   CONN         conn;
3615   AsnIoConnPtr aicp;
3616   size_t       n_written;
3617   CharPtr      i_query_fmt = "id=%s&format=asn1";
3618   CharPtr      a_query_fmt = "accession=%s&format=asn1raw";
3619   CharPtr      query_fmt;
3620   CharPtr      query;
3621   EIO_Status   status;
3622   CharPtr      host = "api-int";
3623   CharPtr      url = "/biosample/fetch/";
3624 
3625   if (StringHasNoText (number)) return NULL;
3626   if (isalpha (*number)) {
3627     query_fmt = a_query_fmt;
3628   } else {
3629     query_fmt = i_query_fmt;
3630   }
3631   query = (CharPtr) MemNew (sizeof (Char) * (StringLen (query_fmt) + StringLen (number)));
3632   sprintf (query, query_fmt, number);
3633   conn = QUERY_OpenUrlQuery (host, 0, url,
3634                              query, "Sequin", 30, eMIME_T_NcbiData,
3635                              eMIME_Fasta, eENCOD_None, 0);
3636   query = MemFree (query);
3637   if (conn == NULL) return NULL;
3638   status = CONN_Write (conn, (const void *) query, StringLen (query),
3639                        &n_written, eIO_WritePersist);
3640   if (status != eIO_Success) return NULL;
3641   QUERY_SendQuery (conn);
3642   aicp = QUERY_AsnIoConnOpen ("r", conn);
3643   sdp = SeqDescrAsnRead (aicp->aip, NULL);
3644   if (sdp == NULL) {
3645     if (aicp->aip->buf != NULL) {
3646       Message (MSG_POSTERR, "%s [%s]", aicp->aip->buf, number);
3647     } else {
3648       Message (MSG_POSTERR, "Unable to retrieve BioSample Data for %s", number);
3649     }
3650   }
3651   QUERY_AsnIoConnClose (aicp);
3652 
3653   return sdp;
3654 }
3655 
3656 
3657