1 /* ref.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  ref.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *
34  */
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include <objects/biblio/Id_pat.hpp>
40 #include <objects/biblio/Id_pat_.hpp>
41 #include <objects/biblio/Auth_list.hpp>
42 #include <objects/biblio/Affil.hpp>
43 #include <objects/seq/Pubdesc.hpp>
44 #include <objects/pub/Pub_equiv.hpp>
45 #include <objects/pub/Pub.hpp>
46 #include <objects/biblio/Cit_gen.hpp>
47 #include <objects/biblio/PubMedId.hpp>
48 #include <objects/biblio/Cit_book.hpp>
49 #include <objects/biblio/Imprint.hpp>
50 #include <objects/biblio/Cit_let.hpp>
51 #include <objects/biblio/Cit_sub.hpp>
52 #include <objects/biblio/Cit_jour.hpp>
53 #include <objects/biblio/Cit_pat.hpp>
54 #include <objects/biblio/Cit_art.hpp>
55 #include <objects/biblio/ArticleIdSet.hpp>
56 #include <objects/biblio/ArticleId.hpp>
57 #include <objects/general/Dbtag.hpp>
58 #include <objects/general/Object_id.hpp>
59 #include <objects/general/Person_id.hpp>
60 #include <objects/medline/Medline_entry.hpp>
61 #include <objects/biblio/Cit_proc.hpp>
62 
63 #include "index.h"
64 #include "genbank.h"
65 #include "embl.h"
66 
67 #include <objtools/flatfile/flatdefn.h>
68 #include "ftanet.h"
69 
70 #include "ftaerr.hpp"
71 #include "indx_blk.h"
72 #include "utilref.h"
73 #include "asci_blk.h"
74 #include "add.h"
75 #include "utilfun.h"
76 #include "ind.hpp"
77 #include "ref.h"
78 #include "xgbfeat.h"
79 #include "xutils.h"
80 #include "fta_xml.h"
81 
82 #ifdef THIS_FILE
83 #    undef THIS_FILE
84 #endif
85 #define THIS_FILE "ref.cpp"
86 
87 #define MAXKW 38
88 
89 
90 BEGIN_NCBI_SCOPE
91 
/* NULL-terminated table of "to the <databases>" phrases, one entry for
 * every ordering of EMBL/GenBank/DDBJ plus the "to the INSDC" form.
 * NOTE(review): the name suggests these substrings are stripped from
 * reference text; the code that consumes the table is not in this part
 * of the file - confirm against the caller.
 */
static const char *strip_sub_str[] = {
    "to the EMBL/GenBank/DDBJ databases",
    "to the EMBL/DDBJ/GenBank databases",
    "to the DDBJ/GenBank/EMBL databases",
    "to the DDBJ/EMBL/GenBank databases",
    "to the GenBank/DDBJ/EMBL databases",
    "to the GenBank/EMBL/DDBJ databases",
    "to the INSDC",
    NULL
};
102 
/* NULL-terminated table of "Publication Status" remark spellings.
 * Each status is listed with three separators after "Publication"
 * (space, underscore, hyphen), each with and without a space before
 * the colon:
 *    1 -  6   Online-Only
 *    7 - 12   Available-Online
 *   13 - 18   Available-Online prior to print
 * NOTE(review): presumably the 1-based position of a matching entry is
 * what ends up in the "er" argument of get_art(); the matching code is
 * not in this part of the file - confirm.
 */
static const char *ERRemarks[] = {
    "Publication Status: Online-Only",                          /*  1 */
    "Publication Status : Online-Only",                         /*  2 */
    "Publication_Status: Online-Only",                          /*  3 */
    "Publication_Status : Online-Only",                         /*  4 */
    "Publication-Status: Online-Only",                          /*  5 */
    "Publication-Status : Online-Only",                         /*  6 */
    "Publication Status: Available-Online",                     /*  7 */
    "Publication Status : Available-Online",                    /*  8 */
    "Publication_Status: Available-Online",                     /*  9 */
    "Publication_Status : Available-Online",                    /* 10 */
    "Publication-Status: Available-Online",                     /* 11 */
    "Publication-Status : Available-Online",                    /* 12 */
    "Publication Status: Available-Online prior to print",      /* 13 */
    "Publication Status : Available-Online prior to print",     /* 14 */
    "Publication_Status: Available-Online prior to print",      /* 15 */
    "Publication_Status : Available-Online prior to print",     /* 16 */
    "Publication-Status: Available-Online prior to print",      /* 17 */
    "Publication-Status : Available-Online prior to print",     /* 18 */
    NULL
};
124 
/**********************************************************
 *
 *   static void normalize_comment(comment):
 *
 *      After every "; " separator removes the run of
 *   blanks and semicolons that immediately follows it,
 *   so that "a; ; b" becomes "a; b".
 *      The string is edited in place and its size shrinks
 *   together with its contents (no embedded '\0' and no
 *   stale tail is left behind).
 *
 **********************************************************/
static void normalize_comment(std::string& comment)
{
    static const char separator[] = "; ";
    const std::string::size_type sep_len = sizeof(separator) - 1;

    std::string::size_type pos = 0;
    while((pos = comment.find(separator, pos)) != std::string::npos)
    {
        /* keep the separator itself, drop what is glued to it
         */
        pos += sep_len;

        std::string::size_type next = comment.find_first_not_of(" ;", pos);
        if(next == std::string::npos)
            next = comment.size();

        comment.erase(pos, next - pos);

        /* searching resumes right after the separator, i.e. at the
         * first character that survived the erase
         */
    }
}
144 
145 /**********************************************************
146  *
147  *   static DatePtr get_lanl_date(s):
148  *
149  *      Get year, month, day  and return NCBI_DatePtr.
150  *      Temporary used for lanl form of date that
151  *   is (JUL 21 1993).
152  *
153  *                                              01-4-94
154  *
155  **********************************************************/
get_lanl_date(char * s)156 static CRef<objects::CDate> get_lanl_date(char* s)
157 {
158     int            day = 0;
159     int            month = 0;
160     int            year;
161     int            cal;
162 
163     const char     *months[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
164                                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
165 
166     CRef<objects::CDate> date(new objects::CDate);
167     for(cal = 0; cal < 12; cal++)
168     {
169         if(StringNICmp(s + 1, months[cal], 3) == 0)
170         {
171             month = cal + 1;
172             break;
173         }
174     }
175     day = atoi(s + 5);
176     year = atoi(s + 8);
177     if(year < 1900 || year > 1994)
178     {
179         ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
180                   "Illegal year: %d", year);
181     }
182 
183     date->SetStd().SetYear(year);
184     date->SetStd().SetMonth(month);
185     date->SetStd().SetDay(day);
186 
187     if (XDateCheck(date->GetStd()) != 0)
188     {
189         ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
190                   "Illegal date: %s", s);
191         date.Reset();
192     }
193 
194     return(date);
195 }
196 
/**********************************************************
 *
 *   static char* clean_up(str):
 *
 *      Returns a copy of "str" without leading and
 *   trailing double or single quotes.
 *      One trailing ';' is removed first; that removal is
 *   written into the caller's buffer as well.
 *      The copy is allocated with strdup(). Returns NULL
 *   if "str" is NULL or the allocation fails. An empty
 *   input yields an empty copy.
 *
 **********************************************************/
static char* clean_up(char* str)
{
    if(str == NULL)
        return(NULL);

    size_t len = strlen(str);

    /* The length check keeps an empty string from being
     * inspected one byte before its start.
     */
    if(len > 0 && str[len - 1] == ';')
        str[--len] = '\0';

    while(*str == '\"' || *str == '\'')
    {
        str++;
        len--;
    }

    char* newp = strdup(str);
    if(newp == NULL)
        return(NULL);

    while(len > 0 && (newp[len - 1] == '\"' || newp[len - 1] == '\''))
        newp[--len] = '\0';

    return(newp);
}
230 
231 /**********************************************************
232 *
233 *   static ValNodePtr get_num(str):
234 *
235 *      Get gb serial number and put it to PUB_Gen.
236 *
237 *                                              12-4-93
238 *
239 **********************************************************/
get_num(char * str)240 static CRef<objects::CPub> get_num(char* str)
241 {
242     int serial_num = NStr::StringToInt(str, NStr::fAllowTrailingSymbols);
243 
244     CRef<objects::CPub> ret(new objects::CPub);
245     ret->SetGen().SetSerial_number(serial_num);
246 
247     return ret;
248 }
249 
250 /**********************************************************
251  *
252  *   static ValNodePtr get_muid(str, format):
253  *
254  *      Get gb MUID and put it to PUB_Gen.
255  *
256  *                                              12-4-93
257  *
258  **********************************************************/
get_muid(char * str,Parser::EFormat format)259 static CRef<objects::CPub> get_muid(char* str, Parser::EFormat format)
260 {
261     char* p;
262     Int4    i;
263 
264     CRef<objects::CPub> muid;
265 
266     if(str == NULL)
267         return muid;
268 
269     if(format == Parser::EFormat::GenBank || format == Parser::EFormat::XML)
270         p = str;
271     else if(format == Parser::EFormat::EMBL)
272     {
273         p = StringIStr(str, "MEDLINE;");
274         if(p == NULL)
275             return muid;
276         for(p += 8; *p == ' ';)
277             p++;
278     }
279     else
280         return muid;
281 
282     i = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
283     if(i < 1)
284         return muid;
285 
286     muid.Reset(new objects::CPub);
287     muid->SetMuid(ENTREZ_ID_FROM(int, i));
288     return muid;
289 }
290 
291 /**********************************************************/
get_embl_str_pub_id(char * str,const Char * tag)292 static char* get_embl_str_pub_id(char* str, const Char *tag)
293 {
294     char* p;
295     char* q;
296     char* ret;
297     Char    ch;
298 
299     if(str == NULL || tag == NULL)
300         return(NULL);
301 
302     p = StringIStr(str, tag);
303     if(p == NULL)
304         return(NULL);
305     for(p += StringLen(tag); *p == ' ';)
306         p++;
307 
308     ret = NULL;
309     for(q = p; *q != ' ' && *q != '\0';)
310         q++;
311     q--;
312     if(*q != '.')
313         q++;
314     ch = *q;
315     *q = '\0';
316     ret = StringSave(p);
317     *q = ch;
318     return(ret);
319 }
320 
321 /**********************************************************/
get_embl_pmid(char * str)322 static Int4 get_embl_pmid(char* str)
323 {
324     char* p;
325     Int4    i;
326 
327     if(str == NULL)
328         return(0);
329 
330     p = StringIStr(str, "PUBMED;");
331     if(p == NULL)
332         return(0);
333     for(p += 7; *p == ' ';)
334         p++;
335     i = (Int4) atol(p);
336     if(i < 1)
337         return(0);
338     return(i);
339 }
340 
341 /**********************************************************
342  *
343  *   static char* check_book_tit(title):
344  *
345  *      Get volume from book title.
346  *
347  *                                              12-4-93
348  *
349  **********************************************************/
check_book_tit(char * title)350 static char* check_book_tit(char* title)
351 {
352     char* p;
353     char* q;
354     char* r;
355 
356     p = StringRStr(title, "Vol");
357     if(p == NULL)
358         return(NULL);
359 
360     if(p[3] == '.')
361         q = p + 4;
362     else if(StringNCmp(p + 3, "ume", 3) == 0)
363         q = p + 6;
364     else
365         return(NULL);
366 
367     while(*q == ' ' || *q == '\t')
368         q++;
369     for(r = q; *r >= '0' && *r <= '9';)
370         r++;
371 
372     if(r == q || *r != '\0')
373         return(NULL);
374 
375     if(p > title)
376     {
377         p--;
378         if(*p != ' ' && *p != '\t' && *p != ',' && *p != ';' && *p != '.')
379             return(NULL);
380 
381         while(*p == ' ' || *p == '\t' || *p == ',' || *p == ';' || *p == '.')
382         {
383             if(p == title)
384                 break;
385             p--;
386         }
387         if(*p != ' ' && *p != '\t' && *p != ',' && *p != ';' && *p != '.')
388             p++;
389     }
390     *p = '\0';
391 
392     return(q);
393 }
394 
/**********************************************************
 *
 *   static CRef<objects::CCit_pat> get_pat(pp, bptr, auth_list,
 *                                          title, eptr):
 *
 *      Return a Cit-pat for a patent reference in ncbi or
 *   embl or ddbj.
 *      Leading "I" or "AR" for NCBI or "A" for EMBL or
 *   "E" for DDBJ in accession number required.
 *
 *   JOURNAL   Patent: US 4446235-A 6 01-MAY-1984;
 *   or
 *   RL   Patent number US4446235-A/6, 01-MAY-1984.
 *
 *      The text at "bptr" is split in place into country,
 *   document number, document type, sequence number and
 *   issue date. For a patent sequence (ibp->is_pat) whose
 *   entry has no Patent-seq-id yet, one is created from
 *   the same pieces. If "auth_list" is set, its affiliation
 *   is filled from the text after the date (bounded by
 *   "eptr") or set to an empty string.
 *      Returns an empty CRef if the prefix, country,
 *   number or date cannot be parsed.
 *
 *                                              11-14-93
 *
 **********************************************************/
static CRef<objects::CCit_pat> get_pat(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title, char* eptr)
{
    IndexblkPtr ibp;

    CRef<objects::CCit_pat> cit_pat;

    char*     country;
    char*     number;
    char*     type;
    char*     app;
    char*     s;
    char*     p;
    char*     q;
    char*     temp;

    ErrSev      sev;
    Char        ch;

    ibp = pp->entrylist[pp->curindx];

    /* "temp" is a private copy used only in error messages
     */
    temp = StringSave(bptr);

    /* The patent line ends at the first '.' (EMBL) or ';' (others);
     * both the copy and the original are cut there.
     */
    ch = (pp->format == Parser::EFormat::EMBL) ? '.' : ';';
    p = StringChr(temp, ch);
    if(p != NULL)
        *p = '\0';

    /* NOTE(review): the delimiter in "bptr" is put back only after the
     * date has been parsed (see "*p = ch" below); on every earlier
     * return "bptr" stays truncated - confirm that callers expect this.
     */
    p = StringChr(bptr, ch);
    if(p != NULL)
        *p = '\0';

    if(ibp->is_pat && ibp->psip.NotEmpty())
    {
        ErrPostStr(SEV_ERROR, ERR_FORMAT_MultiplePatRefs,
                   "Too many patent references for patent sequence; ignoring all but the first.");
    }

    /* USPTO text starts directly with the country; otherwise the
     * format-specific prefix must be present and is skipped.
     */
    if(pp->source == Parser::ESource::USPTO)
        s = bptr;
    else
    {
        q = (pp->format == Parser::EFormat::EMBL) ? (char *) "Patent number" :
                                                    (char *) "Patent:";
        size_t len = StringLen(q);
        if(StringNICmp(q, bptr, len) != 0)
        {
            ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                      "Illegal format: \"%s\"", temp);
            MemFree(temp);
            return cit_pat;
        }

        for(s = bptr + len; *s == ' ';)
            s++;
    }

    /* Country: a run of letters and blanks; "q" ends up on its last
     * non-blank character. Because "q" starts on the first character,
     * a country of fewer than two letters is reported as missing.
     */
    for(country = s, q = s; isalpha((int) *s) != 0 || *s == ' '; s++)
        if(*s != ' ')
            q = s;
    if(country == q)
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Country: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }
    s = q + 1;

    /* Outside EMBL and XML the character after the country is
     * overwritten to terminate the country string. For EMBL format and
     * USPTO source the string is terminated later by "*number = '\0'".
     * NOTE(review): for XML input from a non-USPTO source neither
     * terminator is written, so "country" appears to run on into the
     * document number - confirm.
     */
    if(pp->format != Parser::EFormat::EMBL &&
       pp->format != Parser::EFormat::XML)
        *s++ = '\0';
    while(*s == ' ')
        s++;

    /* Document number: digits, with commas squeezed out in place
     */
    for(number = s, q = s; isdigit((int) *s) != 0 || *s == ','; s++)
        if(*s != ',')
            *q++ = *s;

    if(number == s)
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Number: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }

    /* commas were dropped, so the compacted number ends before "s"
     */
    if(q != s)
        *q = '\0';

    /* Document type: the text after '-' up to a blank, '/' or the end
     */
    if(*s == '-')
    {
        *s++ = '\0';
        for(type = s; *s != ' ' && *s != '/' && *s != '\0';)
            s++;
        if(type == s)
            type = NULL;
    }
    else
        type = NULL;
    if(*s != '\0')
        *s++ = '\0';

    /* a missing type is an error only for patent sequences
     */
    if(type == NULL)
    {
        sev = (ibp->is_pat ? SEV_ERROR : SEV_WARNING);
        ErrPostEx(sev, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Type: \"%s\"", temp);
    }

    /* Sequence number within the patent: digits that must be followed
     * by one of the accepted delimiters or the end of the text;
     * otherwise it is treated as absent and "s" is rewound.
     */
    for(app = s, q = s; *s >= '0' && *s <= '9';)
        s++;
    if(*s != '\0' && *s != ',' && *s != '.' && *s != ' ' && *s != ';' &&
       *s != '\n')
    {
        sev = (ibp->is_pat ? SEV_ERROR : SEV_WARNING);
        ErrPostEx(sev, ERR_REFERENCE_Fail_to_parse,
                  "No number of sequence in patent: \"%s\"", temp);
        app = NULL;
        s = q;
    }
    else if(*s != '\0')
        for(*s++ = '\0'; *s == ' ';)
            s++;

    /* whatever is left is expected to be the issue date
     */
    CRef<objects::CDate_std> std_date;
    if(*s != '\0')
    {
        std_date = get_full_date(s, true, pp->source);
    }

    if (std_date.Empty())
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "Illegal format: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }

    /* put back the line delimiter removed from "bptr" at the top
     */
    if(p != NULL)
        *p = ch;

    /* The number is copied out first: for EMBL format and USPTO source
     * the next statement overwrites its first character in order to
     * terminate the country string that directly precedes it.
     */
    std::string msg = NStr::Sanitize(number);
    if(pp->format == Parser::EFormat::EMBL ||
       pp->source == Parser::ESource::USPTO)
        *number = '\0';

    cit_pat.Reset(new objects::CCit_pat);

    cit_pat->SetCountry(country);
    cit_pat->SetNumber(msg);

    cit_pat->SetDoc_type(type == NULL ? "" : type);
    cit_pat->SetDate_issue().SetStd(*std_date);
    cit_pat->SetTitle(title.Empty() ? "" : title->GetName());

    /* without author names a single empty name is stored instead
     */
    if (auth_list.Empty() || !auth_list->IsSetNames())
    {
        objects::CAuth_list& pat_auth_list = cit_pat->SetAuthors();
        pat_auth_list.SetNames().SetStr().push_back("");
    }
    else
        cit_pat->SetAuthors(*auth_list);

    if (auth_list.NotEmpty())
    {
        objects::CAffil& affil = auth_list->SetAffil();

        /* NOTE(review): 13 presumably covers the 11-character
         * "DD-MON-YYYY" date plus its delimiter and one blank - confirm.
         * The result is dereferenced only when it is below "eptr".
         */
        s += 13;
        if (s < eptr && *s != '\0')
            affil.SetStr(s);
        else
            affil.SetStr("");
    }

    /* only the first patent reference of a patent sequence creates
     * the Patent-seq-id of the entry
     */
    if(ibp->is_pat && ibp->psip.Empty())
    {
        ibp->psip = new objects::CPatent_seq_id;
        ibp->psip->SetCit().SetCountry(country);
        ibp->psip->SetCit().SetId().SetNumber(msg);
        ibp->psip->SetSeqid(app != NULL ? atoi(app) : 0);
        if(type)
            ibp->psip->SetCit().SetDoc_type(type);
    }

    MemFree(temp);
    return cit_pat;
}
597 
598 /**********************************************************/
fta_get_part_sup(char * parts,objects::CImprint & imp)599 static void fta_get_part_sup(char* parts, objects::CImprint& imp)
600 {
601     char* start;
602     char* end;
603     char* p;
604     char* q;
605     Char    ch;
606     Int4    i;
607     Int4    j;
608 
609     if(parts == NULL || *parts == '\0')
610         return;
611 
612     for(p = parts, i = 0, j = 0; *p != '\0'; p++)
613     {
614         if(*p == '(')
615             i++;
616         else if(*p == ')')
617             j++;
618 
619         if(j > i || i - j > 1)
620             break;
621     }
622 
623     if(*p != '\0' || i < 2)
624         return;
625 
626     start = StringChr(parts, '(');
627     end = StringChr(start + 1, ')');
628 
629     for(p = start + 1; *p == ' ';)
630         p++;
631     if(p == end)
632         return;
633 
634     for(q = end - 1; *q == ' ' && q > p;)
635         q--;
636     if(*q != ' ')
637         q++;
638 
639     ch = *q;
640     *q = '\0';
641 
642     imp.SetPart_sup(p);
643     *q = ch;
644 
645     fta_StringCpy(start, end + 1);
646 }
647 
/**********************************************************
 *
 *   static bool get_parts(bptr, eptr, imp):
 *
 *      Fills volume, issue, part-sup and part-supi of
 *   "imp" from the text between "bptr" and "eptr"
 *   ("eptr" itself is excluded). The input is restored
 *   after a private copy has been taken.
 *      Expected shape: "volume part-sup (issue part-supi)".
 *      Returns false if either pointer is NULL or the
 *   parentheses are unpaired, out of order or appear more
 *   than once after fta_get_part_sup() has removed an
 *   extra leading group; otherwise returns true.
 *
 **********************************************************/
static bool get_parts(char* bptr, char* eptr, objects::CImprint& imp)
{
    char* parts;
    char* p;
    char* q;
    Char    ch;
    Int4    bad;

    if(bptr == NULL || eptr == NULL)
        return false;

    /* take a private copy of [bptr, eptr)
     */
    ch = *eptr;
    *eptr = '\0';
    parts = StringSave(bptr);
    *eptr = ch;

    for(p = parts; *p != '\0'; p++)
        if(*p == '\t')
            *p = ' ';

    /* with two or more groups in parentheses the first one is moved
     * into the part-sup and removed from "parts"
     */
    fta_get_part_sup(parts, imp);

    /* at most one "( ... )" pair, in this order, may remain
     */
    bad = 0;
    q = StringChr(parts, '(');
    p = StringChr(parts, ')');

    if(p != NULL && q != NULL)
    {
        if(p < q || StringChr(p + 1, ')') != NULL ||
           StringChr(q + 1, '(') != NULL)
            bad = 1;
    }
    else if(p != NULL || q != NULL)
        bad = 1;

    if(bad != 0)
    {
        MemFree(parts);
        return false;
    }

    if(q != NULL)
    {
        /* cut "parts" at '(' and drop ')': "q" now points at the text
         * that was inside the parentheses
         */
        *q++ = '\0';
        *p = '\0';

        /* first word inside the parentheses is the issue
         */
        for(p = q; *p == ' ';)
            p++;
        for(q = p; *q != '\0' && *q != ' ';)
            q++;
        if(*q != '\0')
            *q++ = '\0';
        if(q > p)
            imp.SetIssue(p);

        /* the rest, without trailing blanks and prefixed with one
         * blank, is the part-supi
         */
        for(p = q; *p == ' ';)
            p++;
        for(q = p; *q != '\0';)
            q++;
        if(q > p)
        {
            for(q--; *q == ' ';)
                q--;
            *++q = '\0';

            std::string supi(" ");
            supi += p;
            imp.SetPart_supi(supi);
        }

        /* an issue of exactly "PT" (any case) absorbs the part-supi:
         * the two are joined into the issue and the part-supi is reset
         */
        const Char* issue_str = imp.IsSetIssue() ? imp.GetIssue().c_str() : NULL;
        if (imp.IsSetPart_supi() && issue_str != NULL &&
            (issue_str[0] == 'P' || issue_str[0] == 'p') && (issue_str[1] == 'T' || issue_str[1] == 't') &&
            issue_str[2] == '\0')
        {
            std::string& issue = imp.SetIssue();
            issue += imp.GetPart_supi();
            imp.ResetPart_supi();
        }
    }

    /* text before the parentheses: first word is the volume
     */
    for(p = parts; *p == ' ';)
        p++;
    for(q = p; *q != '\0' && *q != ' ';)
        q++;
    if(*q != '\0')
        *q++ = '\0';
    if(q > p)
        imp.SetVolume(p);

    /* anything after the volume replaces the part-sup, including one
     * stored by fta_get_part_sup() above
     */
    for(p = q; *p == ' ';)
        p++;
    for(q = p; *q != '\0';)
        q++;
    if(q > p)
    {
        for(q--; *q == ' ';)
            q--;
        *++q = '\0';
        imp.SetPart_sup(p);
    }

    MemFree(parts);
    return true;
}
758 
759 /**********************************************************
760  *
761  *   static CitArtPtr get_art(pp, bptr, auth, title, pre,
762  *                            has_muid, all_zeros, er):
763  *
764  *      Return a CitArt pointer for GENBANK or EMBL mode.
765  *
766  **********************************************************/
get_art(ParserPtr pp,char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre,bool has_muid,bool * all_zeros,Int4 er)767 static CRef<objects::CCit_art> get_art(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title,
768                          int pre, bool has_muid, bool* all_zeros, Int4 er)
769 {
770     char*      eptr;
771     char*      end_tit;
772     char*      s;
773     char*      p;
774     char*      ss;
775     char*      end_volume;
776     char*      end_pages;
777     char*      buf;
778     char*      tit = NULL;
779     char*      volume = NULL;
780     char*      pages = NULL;
781     char*      year;
782     Char         symbol;
783 
784     Int4         i;
785     Int4         is_er;
786 
787     *all_zeros = false;
788 
789     is_er = 0;
790     if(er > 0)
791         is_er |= 01;                    /* based on REMARKs */
792     if(StringNCmp(bptr, "(er)", 4) == 0)
793         is_er |= 02;
794 
795     CRef<objects::CCit_art> cit_art;
796 
797     if(pp->format == Parser::EFormat::GenBank || pp->format == Parser::EFormat::PRF)
798         symbol = ',';
799     else if(pp->format == Parser::EFormat::EMBL)
800         symbol = ':';
801     else if(pp->format == Parser::EFormat::XML)
802     {
803         if(pp->source == Parser::ESource::EMBL)
804             symbol = ':';
805         else
806             symbol = ',';
807     }
808     else
809         return cit_art;
810 
811     end_volume = NULL;
812 
813     size_t len = StringLen(bptr);
814     buf = (char*) MemNew(len + 1);
815     StringCpy(buf, bptr);
816     eptr = buf + len - 1;
817     while(eptr > buf && (*eptr == ' ' || *eptr == '\t' || *eptr == '.'))
818         *eptr-- = '\0';
819     if(*eptr != ')')
820     {
821         MemFree(buf);
822         return cit_art;
823     }
824     for(s = eptr - 1; s > buf && *s != '(';)
825         s--;
826     if(*s != '(')
827     {
828         MemFree(buf);
829         return cit_art;
830     }
831 
832     if(pp->format == Parser::EFormat::PRF && s > buf &&
833        (StringLen(s) != 6 || s[1] < '1' || s[1] > '2' || s[2] < '0' ||
834         s[2] > '9' || s[3] < '0' || s[3] > '9' || s[4] < '0' || s[4] > '9'))
835     {
836         for(p = s - 1; p > buf && *p != '(';)
837             p--;
838         if(*p == '(' && p[5] == ')' && p[1] > '0' && p[1] < '3' &&
839            p[2] >= '0' && p[2] <= '9' && p[3] >= '0' && p[3] <= '9' &&
840            p[4] >= '0' && p[4] <= '9')
841         {
842             *s = '\0';
843             s = p;
844         }
845     }
846 
847     year = s + 1;
848     for(s--; s >= buf && isspace((int) *s) != 0;)
849         s--;
850     if(s < buf)
851         s = buf;
852     end_pages = s + 1;
853     if(buf[0] == 'G' && buf[1] == '3')
854         ss = buf + 2;
855     else
856         ss = buf;
857     for(i = 0; ss <= year; ss++)
858     {
859         if(*ss == '(')
860             i++;
861         else if(*ss == ')')
862             i--;
863         else if(*ss >= '0' && *ss <= '9' && i == 0)
864             break;
865     }
866 
867     for(s = end_pages; s >= buf && *s != symbol;)
868         s--;
869     if(s < buf)
870         s = buf;
871     if(*s != symbol)
872     {
873         /* try delimiter from other format
874          */
875         if(pp->format == Parser::EFormat::GenBank)
876             symbol = ':';
877         else if(pp->format == Parser::EFormat::EMBL)
878             symbol = ',';
879         else if(pp->format == Parser::EFormat::XML)
880         {
881             if(pp->source == Parser::ESource::EMBL)
882                 symbol = ',';
883             else
884                 symbol = ':';
885         }
886 
887         for(s = end_pages; s >= buf && *s != symbol;)
888             s--;
889         if(s < buf)
890             s = buf;
891     }
892 
893     if(*s == symbol && ss != year)
894     {
895         if(ss > s)
896             ss = s + 1;
897         end_volume = s;
898         for(pages = s + 1; IS_WHITESP(*pages) != 0;)
899             pages++;
900         end_tit = ss - 1;
901         if(end_volume > ss)
902         {
903             volume = ss;
904             if(*end_tit == '(')
905                 volume--;
906         }
907     }
908     else
909     {
910         if(pre != 1)
911             pre = 2;
912 
913         end_tit = end_pages;
914     }
915 
916     if(*year == '0')
917     {
918         if(pages != NULL && StringNCmp(pages, "0-0", 3) == 0 &&
919            pp->source == Parser::ESource::EMBL)
920             *all_zeros = true;
921         MemFree(buf);
922         return cit_art;
923     }
924 
925     tit = buf;
926     if(*tit == '\0')
927     {
928         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
929                    "No journal title.");
930         MemFree(buf);
931         return cit_art;
932     }
933 
934     cit_art.Reset(new objects::CCit_art);
935     objects::CCit_jour& journal = cit_art->SetFrom().SetJournal();
936     objects::CImprint& imp = journal.SetImp();
937 
938     if (pre > 0)
939         imp.SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
940 
941     *end_pages = '\0';
942     if(pages != NULL && StringNCmp(pages, "0-0", 3) != 0)
943     {
944         i = valid_pages_range(pages, tit, is_er, (pre == 2));
945         if(i == 0)
946             imp.SetPages(pages);
947         else if(i == 1)
948             end_tit = end_pages;
949         else if(i == -1 && is_er > 0)
950         {
951             MemFree(buf);
952             cit_art.Reset();
953             return cit_art;
954         }
955     }
956     else if(pre != 1)
957         pre = 2;
958 
959     if(volume != NULL)
960     {
961         if(!get_parts(volume, end_volume, imp))
962         {
963             MemFree(buf);
964             cit_art.Reset();
965             return cit_art;
966         }
967 
968         if(pre != 1 && !imp.IsSetVolume())
969         {
970             if(imp.IsSetPages())
971             {
972                 MemFree(buf);
973                 cit_art.Reset();
974                 return cit_art;
975             }
976             pre = 2;
977         }
978     }
979     else if(is_er > 0 && pre != 2)
980     {
981         MemFree(buf);
982         cit_art.Reset();
983         return cit_art;
984     }
985 
986     CRef<objects::CDate> date;
987     if (*year != '0')
988         date = get_date(year);
989 
990     if(date.Empty())
991     {
992         if(is_er == 0)
993             ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
994                        "No date in journal reference");
995 
996         MemFree(buf);
997         cit_art.Reset();
998         return cit_art;
999     }
1000 
1001     *end_tit = '\0';
1002 
1003     CRef<objects::CTitle::C_E> journal_title(new objects::CTitle::C_E);
1004 
1005     for (char* aux = end_tit - 1; aux > tit && *aux != '.' && *aux != ')' && !isalnum(*aux); --aux)
1006         *aux = 0;
1007 
1008     journal_title->SetIso_jta(NStr::Sanitize(tit));
1009     journal.SetTitle().Set().push_back(journal_title);
1010 
1011     imp.SetDate(*date);
1012     if (pre > 0)
1013         imp.SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1014 
1015     if((is_er & 01) == 01)
1016     {
1017         if(er == 1)
1018             imp.SetPubstatus(3);         /* epublish     */
1019         else
1020             imp.SetPubstatus(10);        /* aheadofprint */
1021     }
1022 
1023     /* check invalid "in-press"
1024      */
1025     if(pre == 2)
1026     {
1027         if(has_muid)
1028         {
1029             ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvalidInPress,
1030                       "Reference flagged as In-press, but Medline UID exists, In-press ignored: %s",
1031                       buf);
1032             imp.ResetPrepub();
1033         }
1034 
1035         if(imp.IsSetPages() && imp.IsSetVolume() && imp.IsSetDate())
1036         {
1037             ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvalidInPress,
1038                       "Reference flagged as In-press, but citation is complete, In-press ignored: %s",
1039                       buf);
1040             imp.ResetPrepub();
1041         }
1042     }
1043 
1044     /* Title and authors are optional for cit_art
1045      */
1046     if(title != NULL)
1047         cit_art->SetTitle().Set().push_back(title);
1048 
1049     if (auth_list.NotEmpty())
1050         cit_art->SetAuthors(*auth_list);
1051 
1052     MemFree(buf);
1053     return cit_art;
1054 }
1055 
1056 /**********************************************************
1057  *
1058  *   static CitGenPtr get_unpub(bptr, eptr, auth, title):
1059  *
1060  *      Return a CitGen pointer.
1061  *
1062  *                                              11-14-93
1063  *
1064  **********************************************************/
get_unpub(char * bptr,char * eptr,CRef<objects::CAuth_list> & auth_list,const Char * title)1065 static CRef<objects::CCit_gen> get_unpub(char* bptr, char* eptr, CRef<objects::CAuth_list>& auth_list,
1066                                                      const Char* title)
1067 {
1068     CRef<objects::CCit_gen> cit_gen(new objects::CCit_gen);
1069 
1070     char*   s;
1071     char*   str;
1072 
1073     if (bptr != NULL)
1074     {
1075         for(s = bptr; *s != '\0' && *s != '(';)
1076             s++;
1077         for(str = s - 1; str > bptr && IS_WHITESP(*str) != 0;)
1078             str--;
1079         if(*s == '(')
1080             s += 6;
1081 
1082         if (s < eptr && *s != '\0' && auth_list.NotEmpty())
1083             auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1084 
1085         cit_gen->SetCit(std::string(bptr, str + 1));
1086     }
1087 
1088     if (auth_list.NotEmpty())
1089         cit_gen->SetAuthors(*auth_list);
1090 
1091     if (title != NULL)
1092         cit_gen->SetTitle(title);
1093 
1094     return cit_gen;
1095 }
1096 
1097 /**********************************************************
1098  *
1099  *   static CitArtPtr get_book(bptr, auth, title, pre,
1100  *                             format, p):
1101  *
1102  *      Return a CitArt pointer (!!! that is an article
1103  *   from book!!).
1104  *
1105  *                                              11-14-93
1106  *
1107  **********************************************************/
get_book(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre,Parser::EFormat format,char * jour)1108 static CRef<objects::CCit_art> get_book(char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title,
1109                                                     int pre, Parser::EFormat format, char* jour)
1110 {
1111     char*    s;
1112     char*    ss;
1113     char*    tit;
1114     char*    volume;
1115     char*    pages;
1116     char*    press;
1117 
1118     Uint1      ref_fmt;
1119     bool       IS_AUTH = false;
1120     char*    tbptr;
1121     char*    p;
1122     Char       c;
1123     Int4       i;
1124 
1125     tit = NULL;
1126     ref_fmt = GB_REF;
1127 
1128     tbptr = (bptr == NULL) ? NULL : StringSave(bptr);
1129 
1130     switch(format)
1131     {
1132         case Parser::EFormat::EMBL:
1133             ref_fmt = EMBL_REF;
1134             break;
1135         case Parser::EFormat::GenBank:
1136             ref_fmt = GB_REF;
1137             break;
1138         case Parser::EFormat::PIR:
1139             ref_fmt = PIR_REF;
1140             break;
1141         case Parser::EFormat::SPROT:
1142             ref_fmt = SP_REF;
1143             break;
1144         default:
1145             break;
1146     }
1147 
1148     CRef<objects::CCit_art> cit_art(new objects::CCit_art);
1149     objects::CCit_book& cit_book = cit_art->SetFrom().SetBook();
1150 
1151     if (pre > 0)
1152         cit_book.SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1153 
1154     p = tbptr;
1155     CRef<objects::CTitle::C_E> book_title(new objects::CTitle::C_E);
1156 
1157     if(StringNCmp("(in)", tbptr, 4) == 0)
1158     {
1159         for(s = tbptr + 4; *s == ' ';)
1160             s++;
1161         for(bptr = s; *s != ';' && *s != '(' && *s != '\0';)
1162             s++;
1163         if(StringNICmp(s, "(Eds.)", 6) == 0)
1164         {
1165             tit = s + 6;
1166             IS_AUTH = true;
1167         }
1168         else if(StringNICmp(s, "(Ed.)", 5) == 0)
1169         {
1170             tit = s + 5;
1171             IS_AUTH = true;
1172         }
1173         else if(*s == ';')
1174             tit = s;
1175         if(tit != NULL)
1176             while(*tit == ' ' || *tit == ';' || *tit == '\n')
1177                 tit++;
1178         c = *s;
1179         *s++ = '\0';
1180         if(IS_AUTH && *bptr != '\0')
1181         {
1182             CRef<objects::CAuth_list> book_auth_list;
1183             get_auth(bptr, ref_fmt, jour, book_auth_list);
1184             if (book_auth_list.NotEmpty())
1185                 cit_book.SetAuthors(*book_auth_list);
1186         }
1187         else
1188         {
1189             ErrPostEx(SEV_ERROR, ERR_REFERENCE_UnusualBookFormat,
1190                       "Cannot parse unusually formatted book reference (generating Cit-gen instead): %s",
1191                       p);
1192             if(tbptr != NULL)
1193                 MemFree(tbptr);
1194 
1195             cit_art.Reset();
1196             return cit_art;
1197         }
1198 
1199         ss = StringRChr(tit, ';');
1200         if(ss == NULL)
1201             for(ss = tit; *ss != '\0';)
1202                 ss++;
1203         for(s = ss; *s != ':' && s != tit;)
1204             s--;
1205         if(*s != ':')
1206             s = ss;
1207         c = *s;
1208         if(*s != '\0')
1209             *s++ = '\0';
1210 
1211         book_title->SetName("");
1212         if(*tit != '\0')
1213         {
1214             volume = check_book_tit(tit);
1215             if(volume != NULL)
1216                 cit_book.SetImp().SetVolume(volume);
1217 
1218             book_title->SetName(NStr::Sanitize(tit));
1219         }
1220 
1221         if(c == ':')
1222         {
1223             for(pages = s; *s != '\0' && *s != ',' && *s != ';';)
1224                 s++;
1225             if(*s != '\0')
1226                 *s++ = '\0';
1227 
1228             while(*pages == ' ')
1229                 pages++;
1230 
1231             if (StringNCmp(pages, "0-0",  3) == 0)
1232                 cit_book.SetImp().SetPrepub(objects::CImprint::ePrepub_in_press);
1233             else
1234             {
1235                 bool is_in_press = cit_book.GetImp().IsSetPrepub() && cit_book.GetImp().GetPrepub() == objects::CImprint::ePrepub_in_press;
1236                 i = valid_pages_range(pages, book_title->GetName().c_str(), 0, is_in_press);
1237 
1238                 if(i == 0)
1239                     cit_book.SetImp().SetPages(NStr::Sanitize(pages));
1240                 else if(i == 1)
1241                 {
1242                     std::string new_title = book_title->GetName();
1243                     new_title += ": ";
1244                     new_title += pages;
1245                     book_title->SetName(new_title);
1246                 }
1247             }
1248         }
1249 
1250         for(press = s; *s != '(' && *s != '\0';)
1251             s++;
1252         if(*s != '\0')
1253             *s++ = '\0';
1254 
1255         cit_book.SetImp().SetPub().SetStr(NStr::Sanitize(press));
1256 
1257         CRef<objects::CDate> date = get_date(s);
1258         if (date.Empty())
1259         {
1260             ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1261                        "No date in book reference");
1262             ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
1263                       "Book format error (cit-gen created): %s", p);
1264             if(tbptr != NULL)
1265                 MemFree(tbptr);
1266 
1267             cit_art.Reset();
1268             return cit_art;
1269         }
1270 
1271         cit_book.SetImp().SetDate(*date);
1272     }
1273 
1274     cit_book.SetTitle().Set().push_back(book_title);
1275 
1276     if (title.NotEmpty())
1277         cit_art->SetTitle().Set().push_back(title);
1278 
1279     if (auth_list.NotEmpty())
1280         cit_art->SetAuthors(*auth_list);
1281 
1282     if(tbptr != NULL)
1283         MemFree(tbptr);
1284 
1285     return cit_art;
1286 }
1287 
1288 /**********************************************************
1289  *
1290  *   static CitBookPtr get_thesis(bptr, auth, title, pre):
1291  *
1292  *      Return a CitBook pointer.
1293  *
1294  *                                              11-14-93
1295  *
1296  **********************************************************/
get_thesis(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre)1297 static CRef<objects::CCit_let> get_thesis(char* bptr, CRef<objects::CAuth_list>& auth_list,
1298                                                       CRef<objects::CTitle::C_E>& title, int pre)
1299 {
1300     CRef<objects::CCit_let> cit_let(new objects::CCit_let);
1301 
1302     cit_let->SetType(objects::CCit_let::eType_thesis);
1303 
1304     objects::CCit_book& book = cit_let->SetCit();
1305 
1306     if (pre > 0)
1307         book.SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1308 
1309     char* s;
1310     for (s = bptr; *s != '\0' && *s != '(';)
1311         s++;
1312 
1313     if(*s == '(')
1314     {
1315         CRef<objects::CDate> date = get_date(s + 1);
1316         if (date.NotEmpty())
1317             book.SetImp().SetDate(*date);
1318 
1319         s = s + 6;
1320     }
1321 
1322     if (!book.GetImp().IsSetDate())
1323     {
1324         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1325                   "Fail to parse thesis: missing date");
1326 
1327         cit_let.Reset();
1328         return cit_let;
1329     }
1330 
1331     if(*s != '\0')
1332         book.SetImp().SetPub().SetStr(NStr::Sanitize(s));
1333 
1334     if (title.NotEmpty())
1335         book.SetTitle().Set().push_back(title);
1336     else
1337     {
1338         ErrPostStr(SEV_WARNING, ERR_REFERENCE_Thesis, "Missing thesis title");
1339 
1340         CRef<objects::CTitle::C_E> empty_title(new objects::CTitle::C_E);
1341         empty_title->SetName("");
1342         book.SetTitle().Set().push_back(empty_title);
1343     }
1344 
1345     if (auth_list.NotEmpty())
1346         book.SetAuthors(*auth_list);
1347     return cit_let;
1348 }
1349 
1350 /**********************************************************
1351  *
1352  *   static CitBookPtr get_whole_book(bptr, auth, title,
1353  *                                    pre):
1354  *
1355  *      Return a CitBook pointer.
1356  *
1357  *                                              11-14-93
1358  *
1359  **********************************************************/
get_whole_book(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre)1360 static CRef<objects::CCit_book> get_whole_book(char* bptr, CRef<objects::CAuth_list>& auth_list,
1361                                  CRef<objects::CTitle::C_E>& title, int pre)
1362 {
1363     CRef<objects::CCit_book> cit_book;
1364 
1365     char*    s;
1366 
1367     for(bptr += 5; IS_WHITESP(*bptr) != 0;)
1368         bptr++;
1369 
1370 
1371     for(s = bptr; *s != '\0' && *s != '(';)
1372         s++;
1373 
1374     if(*s != '(')
1375     {
1376         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1377                   "Fail to parse book: missing date");
1378         return cit_book;
1379     }
1380 
1381     cit_book.Reset(new objects::CCit_book);
1382 
1383     if (pre > 0)
1384         cit_book->SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1385 
1386     CRef<objects::CDate> date = get_date(s + 1);
1387     if (date.NotEmpty())
1388         cit_book->SetImp().SetDate(*date);
1389 
1390     *s = '\0';
1391     for(s = bptr; *s != '\0' && *s != '.';)
1392         s++;
1393 
1394     CRef<objects::CTitle::C_E> book_title(new objects::CTitle::C_E);
1395     book_title->SetName(std::string(bptr, s));
1396     cit_book->SetTitle().Set().push_back(book_title);
1397 
1398     if(*s == '.')
1399     {
1400         for(s++; IS_WHITESP(*s) != 0;)
1401             s++;
1402 
1403         cit_book->SetImp().SetPub().SetStr(NStr::Sanitize(s));
1404     }
1405 
1406     if (auth_list.Empty() || !auth_list->IsSetNames())
1407     {
1408         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1409                   "Fail to parse thesis: missing thesis author");
1410         cit_book.Reset();
1411         return cit_book;
1412     }
1413 
1414     cit_book->SetAuthors(*auth_list);
1415 
1416     return cit_book;
1417 }
1418 
1419 /**********************************************************
1420  *
1421  *   static CitSubPtr get_sub(pp, bptr, auth):
1422  *
1423  *      Return a CitSub pointer.
1424  *
1425  **********************************************************/
get_sub(ParserPtr pp,char * bptr,CRef<objects::CAuth_list> & auth_list)1426 static CRef<objects::CCit_sub> get_sub(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list)
1427 {
1428     const char  **b;
1429     char*     s;
1430     Int2        medium = OTHER_MEDIUM;
1431 
1432     CRef<objects::CCit_sub> ret;
1433 
1434     for(s = bptr; *s != '(' &&  *s != '\0';)
1435         s++;
1436     if(*s == '\0')
1437     {
1438         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1439                   "Fail to parse submission: missing date");
1440         return ret;
1441     }
1442 
1443     ret.Reset(new objects::CCit_sub);
1444     CRef<objects::CDate> date;
1445 
1446     if(pp != NULL && pp->entrylist != NULL &&
1447        IsNewAccessFormat(pp->entrylist[pp->curindx]->acnum) == 0 &&
1448        (StringChr(ParFlat_LANL_AC,
1449                   pp->entrylist[pp->curindx]->acnum[0]) != NULL) &&
1450        isdigit((int) *(s + 1)) == 0)
1451     {
1452         date = get_lanl_date(s);
1453     }
1454     else
1455     {
1456         CRef<objects::CDate_std> std_date = get_full_date(s + 1, true, pp->source);
1457         date.Reset(new objects::CDate);
1458         date->SetStd(*std_date);
1459     }
1460 
1461     if (date.Empty())
1462         return ret;
1463 
1464     ret.Reset(new objects::CCit_sub);
1465     ret->SetDate(*date);
1466 
1467     s = s + 13;
1468     if(StringStr(s, "E-mail") != NULL)
1469         medium = EMAIL_MEDIUM;
1470 
1471     if(StringNICmp(" on tape", s, 8) == 0)
1472     {
1473         medium = TAPE_MEDIUM;
1474         for(s += 8; *s != '\0' && *s != ':';)
1475             s++;
1476     }
1477     if(*s != '\0' && *(s + 1) != '\0')
1478     {
1479         while(*s == ' ')
1480             s++;
1481 
1482         if(*s == ':')
1483             s++;
1484         for(;;)
1485         {
1486             for(b = strip_sub_str; *b != NULL; b++)
1487             {
1488                 size_t l_str = StringLen(*b);
1489                 if(StringNCmp(s, *b, l_str) == 0)
1490                 {
1491                     for(s += l_str; *s == ' ' || *s == '.';)
1492                         s++;
1493                     break;
1494                 }
1495             }
1496             if(*b == NULL)
1497                 break;
1498         }
1499 
1500         if (*s != '\0' && auth_list.NotEmpty())
1501         {
1502             auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1503         }
1504     }
1505 
1506     if(*s == '\0')
1507     {
1508         ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1509                   "Missing contact info : %s", bptr);
1510     }
1511 
1512     if (auth_list.Empty() || !auth_list->IsSetNames())
1513     {
1514         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1515                   "Direct submission: missing author (cit-gen created)");
1516 
1517         ret.Reset();
1518         return ret;
1519     }
1520 
1521     ret->SetAuthors(*auth_list);
1522     ret->SetMedium(static_cast<objects::CCit_sub::EMedium>(medium));
1523 
1524     return ret;
1525 }
1526 
1527 /**********************************************************
1528  *
1529  *   static CitSubPtr get_sub_gsdb(bptr, auth, title, pp):
1530  *
1531  *      GSDB specific format for CitSub :
1532  *   REFERENCE   1  (bases 1 to 378)
1533  *     AUTHORS   Mundt,M.O.
1534  *     TITLE     Published by M.O. Mundt, Genomics LS-3,
1535  *               Los Alamos National Laboratory,
1536  *               Mail Stop M888, Los Alamos, NM, USA, 87545
1537  *     JOURNAL   Published in GSDB (11-OCT-1996)
1538  *
1539  **********************************************************/
get_sub_gsdb(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,ParserPtr pp)1540 static CRef<objects::CCit_sub> get_sub_gsdb(char* bptr, CRef<objects::CAuth_list>& auth_list,
1541                               CRef<objects::CTitle::C_E>& title, ParserPtr pp)
1542 {
1543     CRef<objects::CCit_sub> cit_sub;
1544 
1545     char*   s;
1546 
1547     for(s = bptr; *s != '(' && *s != '\0';)
1548         s++;
1549     if(*s == '\0')
1550     {
1551         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1552                   "Fail to parse submission: missing date");
1553         return cit_sub;
1554     }
1555 
1556     CRef<objects::CDate_std> std_date = get_full_date(s + 1, true, pp->source);
1557     if(std_date.Empty())
1558         return cit_sub;
1559 
1560     CRef<objects::CDate> date;
1561     date->SetStd(*std_date);
1562 
1563     if (auth_list.Empty() || !auth_list->IsSetNames())
1564     {
1565         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1566                   "Direct submission: missing author (cit-gen created)");
1567         return cit_sub;
1568     }
1569 
1570     cit_sub.Reset(new objects::CCit_sub);
1571     cit_sub->SetAuthors(*auth_list);
1572     cit_sub->SetDate(*date);
1573 
1574     if (title.NotEmpty())
1575     {
1576         const Char* s = title->GetName().c_str();
1577         size_t l_str = StringLen("Published by");
1578         if(StringNCmp(s, "Published by", l_str) == 0)
1579         {
1580             s += l_str;
1581             while(*s == ' ')
1582                 s++;
1583         }
1584 
1585         if(*s != '\0')
1586         {
1587             auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1588         }
1589         else
1590         {
1591             ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1592                       "Missing contact info : %s", bptr);
1593         }
1594     }
1595     else
1596     {
1597         ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1598                   "Missing contact info : %s", bptr);
1599     }
1600 
1601     return cit_sub;
1602 }
1603 
1604 /**********************************************************/
fta_get_citgen(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title)1605 static CRef<objects::CCit_gen> fta_get_citgen(char* bptr, CRef<objects::CAuth_list>& auth_list,
1606                                                           CRef<objects::CTitle::C_E>& title)
1607 {
1608     CRef<objects::CCit_gen> cit_gen;
1609 
1610     char*   p;
1611     char*   q;
1612     char*   r;
1613     Char      ch;
1614     Int2      year;
1615 
1616     if (bptr == NULL || auth_list.Empty() || !auth_list->IsSetNames() || title.Empty())
1617         return cit_gen;
1618 
1619     year = 0;
1620     p = StringChr(bptr, '(');
1621     if(p != NULL)
1622     {
1623         for(p++; *p == ' ' || *p == '\t';)
1624             p++;
1625         for(q = p; *p >= '0' && *p <= '9';)
1626             p++;
1627         for(r = p; *p == ' ' || *p == '\t' || *p == ')';)
1628             p++;
1629         if(*p == '\n' || *p == '\0')
1630         {
1631             ch = *r;
1632             *r = '\0';
1633             year = atoi(q);
1634             if(year < 1900)
1635                 *r = ch;
1636             else
1637             {
1638                 for(q--; *q == ' ' || *q == '\t' || *q == '(';)
1639                     q--;
1640                 *++q = '\0';
1641             }
1642         }
1643     }
1644 
1645     cit_gen.Reset(new objects::CCit_gen);
1646 
1647     if(bptr != NULL)
1648         cit_gen->SetCit(bptr);
1649 
1650     cit_gen->SetAuthors(*auth_list);
1651     cit_gen->SetTitle(title->GetName());
1652 
1653     if(year >= 1900)
1654         cit_gen->SetDate().SetStd().SetYear(year);
1655 
1656     return cit_gen;
1657 }
1658 
1659 /**********************************************************
1660  *
1661  *   ValNodePtr journal(pp, bptr, eptr, auth, title,
1662  *                      has_muid, cit_art, er):
1663  *
1664  *      Return a ValNodePtr.
1665  *
1666  **********************************************************/
journal(ParserPtr pp,char * bptr,char * eptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,bool has_muid,CRef<objects::CCit_art> & cit_art,Int4 er)1667 CRef<objects::CPub> journal(ParserPtr pp, char* bptr, char* eptr, CRef<objects::CAuth_list>& auth_list,
1668                                         CRef<objects::CTitle::C_E>& title, bool has_muid, CRef<objects::CCit_art>& cit_art, Int4 er)
1669 {
1670     int        pre = 0;
1671     char*    p;
1672     char*    nearend;
1673     char*    end;
1674     bool       all_zeros;
1675     int        retval = ParFlat_MISSING_JOURNAL;
1676 
1677     CRef<objects::CPub> ret(new objects::CPub);
1678     if(bptr == NULL)
1679     {
1680         const Char* title_str = title.Empty() ? NULL : title->GetName().c_str();
1681         ret->SetGen(*get_unpub(bptr, eptr, auth_list, title_str));
1682         return ret;
1683     }
1684 
1685     p = bptr;
1686     size_t my_len = StringLen(p);
1687     if(my_len > 7)
1688     {
1689         nearend = p + StringLen(p) - 1;
1690         while(*nearend == ' ' || *nearend == '\t' || *nearend == '.')
1691             *nearend-- = '\0';
1692 
1693         nearend -= 8;
1694         end = nearend + 2;
1695         if(StringNICmp("In press", nearend + 1, 8) == 0)
1696         {
1697             pre = 2;
1698             *(nearend + 1) = '\0';
1699         }
1700         if(StringNICmp("Submitted", nearend, 9) == 0)
1701         {
1702             pre = 1;
1703             *nearend = '\0';
1704         }
1705         if(pre == 0 && *end == '(' && IS_DIGIT(*(end + 1)) != 0)
1706         {
1707             for(nearend = end - 1; nearend > bptr && *nearend != ' ';)
1708                 nearend--;
1709             if(StringNICmp("In press", nearend + 1, 8) == 0)
1710             {
1711                 pre = 2;
1712                 *(nearend + 1) = '\0';
1713             }
1714         }
1715     }
1716 
1717     if(my_len >= 6 && *p == '(')
1718     {
1719         p += 6;
1720         my_len -= 6;
1721         if(StringNCmp(" In press", p, 9) == 0)
1722         {
1723             retval = ParFlat_IN_PRESS;
1724             pre = 2;
1725         }
1726     }
1727 
1728     p = bptr;
1729     my_len = StringLen(p);
1730     if(StringNCmp("Unpub", p, 5) == 0 || StringNCmp("Unknown", p, 7) == 0)
1731     {
1732         retval = ParFlat_UNPUB_JOURNAL;
1733         const Char* title_str = title.Empty() ? NULL : title->GetName().c_str();
1734         ret->SetGen(*get_unpub(bptr, eptr, auth_list, title_str));
1735     }
1736     else if(StringNCmp("(in)", p, 4) == 0)
1737     {
1738         retval = ParFlat_MONOGRAPH_NOT_JOURNAL;
1739 
1740         CRef<objects::CCit_art> article = get_book(bptr, auth_list, title, pre, pp->format, p);
1741 
1742         if (article.Empty())
1743             ret->SetGen(*get_error(bptr, auth_list, title));
1744         else
1745             ret->SetArticle(*article);
1746 
1747     }
1748     else if (StringNCmp("Thesis", p, 6) == 0)
1749     {
1750         retval = ParFlat_THESIS_CITATION;
1751 
1752         CRef<objects::CCit_let> cit_let = get_thesis(bptr, auth_list, title, pre);
1753         if (cit_let.Empty())
1754         {
1755             ret.Reset();
1756             return ret;
1757         }
1758         ret->SetMan(*cit_let);
1759     }
1760     else if (StringNCmp("Submi", p,  5) == 0)
1761     {
1762         retval = ParFlat_SUBMITTED;
1763 
1764         CRef<objects::CCit_sub> cit_sub = get_sub(pp, bptr, auth_list);
1765         if (cit_sub.Empty())
1766         {
1767             ret.Reset();
1768             return ret;
1769         }
1770 
1771         ret->SetSub(*cit_sub);
1772     }
1773     else if(StringNCmp("Published in GSDB", p,  17) == 0)
1774     {
1775         ErrPostEx(SEV_WARNING, ERR_REFERENCE_GsdbRefDropped,
1776                   "A published-in-gsdb reference was encountered and has been dropped [%s]",
1777                   bptr);
1778         retval = ParFlat_SUBMITTED;
1779 
1780         CRef<objects::CCit_sub> cit_sub = get_sub_gsdb(bptr, auth_list, title, pp);
1781         if (cit_sub.Empty())
1782         {
1783             ret.Reset();
1784             return ret;
1785         }
1786 
1787         ret->SetSub(*cit_sub);
1788     }
1789     else if(StringNCmp("Patent", p, 6) == 0 ||
1790             pp->source == Parser::ESource::USPTO)
1791     {
1792         retval = ParFlat_PATENT_CITATION;
1793 
1794         if (pp->seqtype == objects::CSeq_id::e_Genbank || pp->seqtype == objects::CSeq_id::e_Ddbj ||
1795             pp->seqtype == objects::CSeq_id::e_Embl || pp->seqtype == objects::CSeq_id::e_Other ||
1796             pp->seqtype == objects::CSeq_id::e_Tpe || pp->seqtype == objects::CSeq_id::e_Tpg ||
1797             pp->seqtype == objects::CSeq_id::e_Tpd ||
1798             pp->source == Parser::ESource::USPTO)
1799         {
1800             CRef<objects::CCit_pat> cit_pat = get_pat(pp, bptr, auth_list, title, eptr);
1801             if (cit_pat.Empty())
1802             {
1803                 ret.Reset();
1804                 return ret;
1805             }
1806 
1807             ret->SetPatent(*cit_pat);
1808         }
1809         else
1810         {
1811             ret.Reset();
1812             return ret;
1813         }
1814     }
1815     else if(StringNCmp("Book:", p, 5) == 0)
1816     {
1817         retval = ParFlat_BOOK_CITATION;
1818 
1819         CRef<objects::CCit_book> book = get_whole_book(bptr, auth_list, title, pre);
1820         if(book.Empty())
1821         {
1822             ret.Reset();
1823             return ret;
1824         }
1825 
1826         ret->SetBook(*book);
1827     }
1828     else if(StringNICmp("Published Only in Database", p, 26) == 0)
1829     {
1830         retval = ParFlat_GEN_CITATION;
1831         CRef<objects::CCit_gen> cit_gen = fta_get_citgen(bptr, auth_list, title);
1832 
1833         if (cit_gen.Empty())
1834         {
1835             ret.Reset();
1836             return ret;
1837         }
1838 
1839         ret->SetGen(*cit_gen);
1840     }
1841     else if(StringNICmp("Online Publication", p, 18) == 0)
1842     {
1843         retval = ParFlat_ONLINE_CITATION;
1844 
1845         CRef<objects::CCit_gen> cit_gen = fta_get_citgen(bptr, auth_list, title);
1846 
1847         if (cit_gen.Empty())
1848         {
1849             ret.Reset();
1850             return ret;
1851         }
1852 
1853         ret->SetGen(*cit_gen);
1854     }
1855 
1856     if(retval == ParFlat_MISSING_JOURNAL)
1857     {
1858         if(cit_art.NotEmpty())
1859             ret->SetArticle(*cit_art);
1860         else
1861         {
1862             CRef<objects::CCit_art> new_art = get_art(pp, bptr, auth_list, title, pre,
1863                                                                   has_muid, &all_zeros, er);
1864             if (new_art.Empty())
1865             {
1866                 if(!all_zeros &&
1867                    StringNCmp(bptr, "(er)", 4) != 0 && er == 0)
1868                     ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
1869                               "Journal format error (cit-gen created): %s",
1870                               bptr);
1871 
1872                 ret->SetGen(*get_error(bptr, auth_list, title));
1873             }
1874             else
1875                 ret->SetArticle(*new_art);
1876         }
1877     }
1878 
1879     return ret;
1880 }
1881 
/**********************************************************
 *
 *   static char* FindBackSemicolon(pchStart, pchCurrent):
 *
 *      Looks backwards from the character in front of
 *   "pchCurrent", skipping white space, and returns a
 *   pointer to the first other character if it is a ';'.
 *   The search never goes in front of "pchStart".
 *
 *      Returns NULL if either pointer is NULL, if
 *   "pchStart" is not below "pchCurrent", or if the first
 *   non-space character found is not a semicolon.
 *
 **********************************************************/
static char* FindBackSemicolon(char* pchStart, char* pchCurrent)
{
    if(pchStart == NULL || pchCurrent == NULL || pchStart >= pchCurrent)
        return(NULL);

    /* the pointer is decremented only while it is above the start,
     * so it never leaves the buffer
     */
    while(pchCurrent > pchStart)
    {
        pchCurrent--;

        /* isspace() needs a value in the range of unsigned char
         */
        if(isspace((unsigned char) *pchCurrent) != 0)
            continue;

        return((*pchCurrent == ';') ? pchCurrent : NULL);
    }

    return(NULL);
}
1899 
1900 /**********************************************************/
FindSemicolon(char * str)1901 static char* FindSemicolon(char* str)
1902 {
1903     if(str == NULL || *str == '\0')
1904         return(NULL);
1905 
1906     str = SkipSpaces(str);
1907 
1908     if(*str == ';')
1909         return(str);
1910 
1911     return(NULL);
1912 }
1913 
1914 /**********************************************************/
ExtractErratum(char * comm)1915 static char* ExtractErratum(char* comm)
1916 {
1917     char* start;
1918     char* pchNumber = NULL;
1919     char* end;
1920     char* p;
1921 
1922     if(comm == NULL)
1923         return(NULL);
1924 
1925     start = StringStr(comm, "Erratum:");
1926     if(start == NULL)
1927         return(comm);
1928 
1929     end = StringChr(start, ']');
1930     if(end == NULL)
1931         return(comm);
1932 
1933     pchNumber = end + 1;
1934     end = FindSemicolon(pchNumber);
1935     if(end != NULL)
1936         pchNumber = end + 1;
1937     p = FindBackSemicolon(comm, start);
1938     if(p != NULL)
1939         start = p;
1940     fta_StringCpy(start, pchNumber);
1941 
1942     /* Check if the string after cutting signature is empty. If it's really
1943      * empty we have to ignore the whole string (comment).
1944      * Do you want to have a comment which contains nothing!? Probably no.
1945      */
1946     for(p = comm; *p == ' ' || *p == '\t' || *p == '\n';)
1947         p++;
1948     if(*p == '\0')
1949         *comm = '\0';
1950 
1951     return(comm);
1952 }
1953 
1954 /**********************************************************/
XMLGetXrefs(char * entry,XmlIndexPtr xip,TQualVector & quals)1955 static void XMLGetXrefs(char* entry, XmlIndexPtr xip, TQualVector& quals)
1956 {
1957     XmlIndexPtr xipqual;
1958 
1959     if(entry == NULL || xip == NULL)
1960         return;
1961 
1962     for (; xip != NULL; xip = xip->next)
1963     {
1964         if(xip->subtags == NULL)
1965             continue;
1966 
1967         CRef<objects::CGb_qual> qual(new objects::CGb_qual);
1968 
1969         for(xipqual = xip->subtags; xipqual != NULL; xipqual = xipqual->next)
1970         {
1971             if (xipqual->tag == INSDXREF_DBNAME)
1972                 qual->SetQual(XMLGetTagValue(entry, xipqual));
1973             else if(xipqual->tag == INSDXREF_ID)
1974                 qual->SetVal(XMLGetTagValue(entry, xipqual));
1975         }
1976 
1977         if (qual->IsSetQual() && !qual->GetQual().empty())
1978             quals.push_back(qual);
1979     }
1980 }
1981 
1982 /**********************************************************/
fta_add_article_ids(objects::CPub & pub,const std::string & doi,const std::string & agricola)1983 static void fta_add_article_ids(objects::CPub& pub, const std::string& doi, const std::string& agricola)
1984 {
1985     if (doi.empty() && agricola.empty())
1986         return;
1987 
1988     if (pub.IsArticle())
1989     {
1990         objects::CCit_art& cit_art = pub.SetArticle();
1991 
1992         if (!agricola.empty())
1993         {
1994             CRef<objects::CArticleId> id(new objects::CArticleId);
1995             id->SetOther().SetDb("AGRICOLA");
1996             id->SetOther().SetTag().SetStr(agricola);
1997 
1998             cit_art.SetIds().Set().push_front(id);
1999         }
2000 
2001         if (!doi.empty())
2002         {
2003             CRef<objects::CArticleId> id(new objects::CArticleId);
2004             id->SetDoi().Set(doi);
2005 
2006             cit_art.SetIds().Set().push_front(id);
2007         }
2008     }
2009 }
2010 
2011 /**********************************************************/
fta_remark_is_er(const Char * str)2012 Int4 fta_remark_is_er(const Char* str)
2013 {
2014     const char **b;
2015     char*    s;
2016     Int4       i;
2017 
2018     s = StringSave(str);
2019     ShrinkSpaces(s);
2020     for(i = 1, b = ERRemarks; *b != NULL; b++, i++)
2021         if(StringIStr(s, *b) != NULL)
2022             break;
2023 
2024     MemFree(s);
2025     if(*b == NULL)
2026         return(0);
2027     if(i < 7)
2028         return(1);                      /* epublish     */
2029     return(2);                          /* aheadofprint */
2030 }
2031 
2032 /**********************************************************/
XMLRefs(ParserPtr pp,DataBlkPtr dbp,bool & no_auth,bool & rej)2033 static CRef<objects::CPubdesc> XMLRefs(ParserPtr pp, DataBlkPtr dbp, bool& no_auth, bool& rej)
2034 {
2035     char*           title;
2036 
2037     char*           p;
2038     char*           q;
2039     char*           r;
2040     bool              is_online;
2041     Int4              pmid;
2042     bool              retstat;
2043 
2044     XmlIndexPtr       xip;
2045 
2046     Int4              er;
2047 
2048     CRef<objects::CPubdesc> desc;
2049 
2050     if(pp == NULL || dbp == NULL || dbp->offset == NULL || dbp->data == NULL)
2051         return desc;
2052 
2053     desc.Reset(new objects::CPubdesc);
2054 
2055     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2056                         INSDREFERENCE_REFERENCE);
2057     if(p != NULL && isdigit((int) *p) != 0)
2058     {
2059         desc->SetPub().Set().push_back(get_num(p));
2060     }
2061     else
2062     {
2063         ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2064                   "No reference number.");
2065     }
2066 
2067     if(p != NULL)
2068         MemFree(p);
2069 
2070     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2071                         INSDREFERENCE_MEDLINE);
2072     if(p != NULL)
2073     {
2074         rej = true;
2075         MemFree(p);
2076         desc.Reset();
2077         return desc;
2078     }
2079 
2080     pmid = 0;
2081     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2082                         INSDREFERENCE_PUBMED);
2083     if(p != NULL)
2084     {
2085         pmid = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
2086         MemFree(p);
2087     }
2088 
2089     CRef<objects::CAuth_list> auth_list;
2090 
2091     p = XMLConcatSubTags(dbp->offset, (XmlIndexPtr) dbp->data,
2092                          INSDREFERENCE_AUTHORS, ',');
2093     if(p != NULL)
2094     {
2095         if(pp->xml_comp)
2096         {
2097             q = StringRChr(p, '.');
2098             if(q == NULL || q[1] != '\0')
2099             {
2100                 q = (char*) MemNew(StringLen(p) + 2);
2101                 StringCpy(q, p);
2102                 StringCat(q, ".");
2103                 MemFree(p);
2104                 p = q;
2105                 q = NULL;
2106             }
2107         }
2108         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2109             q++;
2110         if(*q != '\0')
2111         {
2112             q = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2113                                 INSDREFERENCE_JOURNAL);
2114             get_auth(p, (pp->source == Parser::ESource::EMBL) ? EMBL_REF : GB_REF, q, auth_list);
2115             MemFree(q);
2116         }
2117         MemFree(p);
2118     }
2119 
2120     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2121                         INSDREFERENCE_CONSORTIUM);
2122     if(p != NULL)
2123     {
2124         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2125             q++;
2126 
2127         if (*q != '\0')
2128             get_auth_consortium(p, auth_list);
2129 
2130         MemFree(p);
2131     }
2132 
2133     if (auth_list.Empty() || !auth_list->IsSetNames())
2134         no_auth = true;
2135 
2136     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2137                         INSDREFERENCE_TITLE);
2138 
2139     CRef<objects::CTitle::C_E> title_art(new objects::CTitle::C_E);
2140     if (p != NULL)
2141     {
2142         if(StringNCmp(p, "Direct Submission", 17) != 0 &&
2143            *p != '\0' && *p != ';')
2144         {
2145             title = clean_up(p);
2146             if(title != NULL)
2147             {
2148                 title_art->SetName(tata_save(title));
2149                 MemFree(title);
2150             }
2151         }
2152         MemFree(p);
2153     }
2154 
2155     is_online = false;
2156     p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2157                         INSDREFERENCE_JOURNAL);
2158     if(p == NULL)
2159     {
2160         ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2161                   "No JOURNAL line, reference dropped");
2162         desc.Reset();
2163         return desc;
2164     }
2165 
2166     if(*p == '\0' || *p == ';')
2167     {
2168         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2169                    "JOURNAL line is empty, reference dropped");
2170         MemFree(p);
2171         desc.Reset();
2172         return desc;
2173     }
2174 
2175     if (NStr::EqualNocase(p, 0, 18, "Online Publication"))
2176         is_online = true;
2177 
2178     r = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2179                         INSDREFERENCE_REMARK);
2180     if(r != NULL)
2181     {
2182         r = ExtractErratum(r);
2183         desc->SetComment(NStr::Sanitize(r));
2184         MemFree(r);
2185 
2186         if(!is_online)
2187             normalize_comment(desc->SetComment());
2188     }
2189 
2190     er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2191 
2192     CRef<objects::CCit_art> cit_art;
2193     if((StringNCmp(p, "(er)", 4) == 0 || er > 0) &&
2194        pmid > 0 && pp->medserver == 1)
2195     {
2196         cit_art = fta_citart_by_pmid(pmid, retstat);
2197         if(retstat && cit_art.Empty())
2198             pmid = 0;
2199     }
2200 
2201     if (pmid > 0)
2202     {
2203         CRef<objects::CPub> pub(new objects::CPub);
2204         pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2205         desc->SetPub().Set().push_back(pub);
2206     }
2207 
2208     CRef<objects::CPub> pub_ref = journal(pp, p, p + StringLen(p), auth_list, title_art, false, cit_art, er);
2209     MemFree(p);
2210 
2211     TQualVector xrefs;
2212     for (xip = (XmlIndexPtr)dbp->data; xip != NULL; xip = xip->next)
2213     {
2214         if (xip->tag == INSDREFERENCE_XREF)
2215             XMLGetXrefs(dbp->offset, xip->subtags, xrefs);
2216     }
2217 
2218     std::string doi;
2219     std::string agricola;
2220     ITERATE(TQualVector, xref, xrefs)
2221     {
2222         if (!(*xref)->IsSetQual())
2223             continue;
2224 
2225         if (NStr::EqualNocase((*xref)->GetQual(), "ARGICOLA") && agricola.empty())
2226             agricola = (*xref)->GetVal();
2227         else if (NStr::EqualNocase((*xref)->GetQual(), "DOI") && doi.empty())
2228             doi = (*xref)->GetVal();
2229     }
2230 
2231     fta_add_article_ids(*pub_ref, doi, agricola);
2232 
2233     if (pub_ref.Empty())
2234     {
2235         desc.Reset();
2236         return desc;
2237     }
2238 
2239     if(dbp->type == ParFlat_REF_NO_TARGET)
2240         desc->SetReftype(3);
2241 
2242     desc->SetPub().Set().push_back(pub_ref);
2243 
2244     return desc;
2245 }
2246 
2247 /**********************************************************/
gb_refs_common(ParserPtr pp,DataBlkPtr dbp,Int4 col_data,bool bParser,DataBlkPtr ** ppInd,bool & no_auth)2248 CRef<objects::CPubdesc> gb_refs_common(ParserPtr pp, DataBlkPtr dbp, Int4 col_data,
2249                                                    bool bParser, DataBlkPtr** ppInd, bool& no_auth)
2250 {
2251     static DataBlkPtr ind[MAXKW+1];
2252 
2253     bool              has_muid;
2254     char*           p;
2255     char*           q;
2256     char*           r;
2257     bool              is_online;
2258     Int4              pmid;
2259     bool              retstat;
2260     Int4              er;
2261 
2262     CRef<objects::CPubdesc> desc(new objects::CPubdesc);
2263 
2264     p = dbp->offset + col_data;
2265     if(bParser)
2266     {
2267         /* This branch works when this function called in context of PARSER
2268          */
2269         if(*p >= '0' && *p <= '9')
2270             desc->SetPub().Set().push_back(get_num(p));
2271         else
2272             ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2273                       "No reference number.");
2274         ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::GenBank);
2275     }
2276     else
2277     {
2278         /* This branch works when this function is called in context of GBDIFF
2279          */
2280         if(ppInd != NULL)
2281         {
2282             ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::GenBank);
2283             *ppInd = &ind[0];
2284 
2285             return desc;
2286         }
2287 
2288         if(*p < '0' || *p > '9')
2289             ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2290                       "No reference number.");
2291     }
2292 
2293     has_muid = false;
2294     if(ind[ParFlat_MEDLINE] != NULL)
2295     {
2296         p = ind[ParFlat_MEDLINE]->offset;
2297         CRef<objects::CPub> pub = get_muid(p, Parser::EFormat::GenBank);
2298         if (pub.NotEmpty())
2299         {
2300             has_muid = true;
2301             desc->SetPub().Set().push_back(get_num(p));
2302         }
2303     }
2304 
2305     pmid = 0;
2306     if(ind[ParFlat_PUBMED] != NULL)
2307     {
2308         p = ind[ParFlat_PUBMED]->offset;
2309         if(p != NULL)
2310             pmid = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
2311     }
2312 
2313     CRef<objects::CAuth_list> auth_list;
2314     if(ind[ParFlat_AUTHORS] != NULL)
2315     {
2316         p = ind[ParFlat_AUTHORS]->offset;
2317         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2318             q++;
2319 
2320         if(*q != '\0')
2321         {
2322             if(ind[ParFlat_JOURNAL] != NULL)
2323                 q = ind[ParFlat_JOURNAL]->offset;
2324 
2325             get_auth(p, GB_REF, q, auth_list);
2326         }
2327     }
2328 
2329     if(ind[ParFlat_CONSRTM] != NULL)
2330     {
2331         p = ind[ParFlat_CONSRTM]->offset;
2332         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2333             q++;
2334 
2335         if (*q != '\0')
2336             get_auth_consortium(p, auth_list);
2337     }
2338 
2339     if (auth_list.Empty() || !auth_list->IsSetNames())
2340         no_auth = true;
2341 
2342     CRef<objects::CTitle::C_E> title_art;
2343     if(ind[ParFlat_TITLE] != NULL)
2344     {
2345         p = ind[ParFlat_TITLE]->offset;
2346         if(StringNCmp(p, "Direct Submission", 17) != 0 &&
2347            *p != '\0' && *p != ';')
2348         {
2349             q = clean_up(p);
2350             if(q != NULL)
2351             {
2352                 title_art.Reset(new objects::CTitle::C_E);
2353                 title_art->SetName(NStr::Sanitize(q));
2354                 MemFree(q);
2355             }
2356         }
2357     }
2358 
2359     if(ind[ParFlat_JOURNAL] == NULL)
2360     {
2361         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2362                    "No JOURNAL line, reference dropped");
2363 
2364         desc.Reset();
2365         return desc;
2366     }
2367 
2368     p = ind[ParFlat_JOURNAL]->offset;
2369     if(*p == '\0' || *p == ';')
2370     {
2371         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2372                    "JOURNAL line is empty, reference dropped");
2373 
2374         desc.Reset();
2375         return desc;
2376     }
2377 
2378     is_online = (StringNICmp(p, "Online Publication", 18) == 0);
2379 
2380     if(ind[ParFlat_REMARK] != NULL)
2381     {
2382         r = ind[ParFlat_REMARK]->offset;
2383         r = ExtractErratum(r);
2384         desc->SetComment(NStr::Sanitize(r));
2385 
2386         if(!is_online)
2387             normalize_comment(desc->SetComment());
2388     }
2389 
2390     er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2391 
2392     CRef<objects::CCit_art> cit_art;
2393 
2394     if(pp->medserver == 1 && pmid > 0 &&
2395        (StringNCmp(p, "(er)", 4) == 0 || er > 0))
2396     {
2397         cit_art = fta_citart_by_pmid(pmid, retstat);
2398         if(retstat && cit_art == NULL)
2399             pmid = 0;
2400     }
2401 
2402     if (pmid > 0)
2403     {
2404         CRef<objects::CPub> pub(new objects::CPub);
2405         pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2406         desc->SetPub().Set().push_back(pub);
2407     }
2408 
2409     CRef<objects::CPub> pub_ref = journal(pp, p, p + ind[ParFlat_JOURNAL]->len,
2410                                                       auth_list, title_art, has_muid, cit_art, er);
2411 
2412     if (pub_ref.Empty())
2413     {
2414         desc.Reset();
2415         return desc;
2416     }
2417 
2418     if(dbp->type == ParFlat_REF_NO_TARGET)
2419         desc->SetReftype(3);
2420 
2421     desc->SetPub().Set().push_back(pub_ref);
2422 
2423     return desc;
2424 }
2425 
2426 /**********************************************************
2427  *
2428  *   static PubdescPtr embl_refs(pp, dbp, col_data, no_auth):
2429  *
2430  *      Parse EMBL references. Return a Pubdesc pointer.
2431  *
2432  *                                              11-14-93
2433  *
2434  **********************************************************/
embl_refs(ParserPtr pp,DataBlkPtr dbp,Int4 col_data,bool & no_auth)2435 static CRef<objects::CPubdesc> embl_refs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data, bool& no_auth)
2436 {
2437     static DataBlkPtr ind[MAXKW+1];
2438     char*           s;
2439 
2440     char*           title;
2441     bool              has_muid;
2442     char*           p;
2443     char*           q;
2444     Int4              pmid;
2445 
2446     bool              retstat;
2447     Int4              er;
2448 
2449     CRef<objects::CPubdesc> desc(new objects::CPubdesc);
2450 
2451     p = dbp->offset + col_data;
2452     while((*p < '0' || *p > '9') && dbp->len > 0)
2453         p++;
2454     if(*p >= '0' && *p <= '9')
2455         desc->SetPub().Set().push_back(get_num(p));
2456     else
2457         ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2458                   "No reference number.");
2459 
2460     ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::EMBL);
2461 
2462     has_muid = false;
2463     pmid = 0;
2464 
2465     std::string doi;
2466     std::string agricola;
2467 
2468     if(ind[ParFlat_RC] != NULL)
2469         desc->SetComment(NStr::Sanitize(ind[ParFlat_RC]->offset));
2470 
2471     er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2472 
2473     if(ind[ParFlat_RX] != NULL)
2474     {
2475         p = ind[ParFlat_RX]->offset;
2476         CRef<objects::CPub> pub = get_muid(p, Parser::EFormat::EMBL);
2477 
2478         const Char* id = get_embl_str_pub_id(p, "DOI;");
2479         if (id)
2480             doi = id;
2481 
2482         id = get_embl_str_pub_id(p, "AGRICOLA;");
2483         if (id)
2484             agricola = id;
2485 
2486         if (pub.NotEmpty())
2487         {
2488             desc->SetPub().Set().push_back(pub);
2489             has_muid = true;
2490         }
2491 
2492         pmid = get_embl_pmid(p);
2493     }
2494 
2495     CRef<objects::CAuth_list> auth_list;
2496     if(ind[ParFlat_RA] != NULL)
2497     {
2498         p = ind[ParFlat_RA]->offset;
2499         s = p + StringLen(p) - 1;
2500         if(*s == ';')
2501             *s = '\0';
2502         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2503             q++;
2504         if(*q != '\0')
2505         {
2506             if(ind[ParFlat_RL] != NULL)
2507                 q = ind[ParFlat_RL]->offset;
2508 
2509             get_auth(p, EMBL_REF, q, auth_list);
2510         }
2511     }
2512 
2513     if(ind[ParFlat_RG] != NULL)
2514     {
2515         p = ind[ParFlat_RG]->offset;
2516         s = p + StringLen(p) - 1;
2517         if(*s == ';')
2518             *s = '\0';
2519 
2520         for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2521             q++;
2522 
2523         if (*q != '\0')
2524             get_auth_consortium(p, auth_list);
2525     }
2526 
2527     if (auth_list.Empty() || !auth_list->IsSetNames())
2528         no_auth = true;
2529 
2530     CRef<objects::CTitle::C_E> title_art;
2531     if (ind[ParFlat_RT] != NULL)
2532     {
2533         p = ind[ParFlat_RT]->offset;
2534         if(*p != '\0' && *p != ';')
2535         {
2536             title = clean_up(p);
2537             if (title != NULL && title[0])
2538             {
2539                 title_art.Reset(new objects::CTitle::C_E);
2540                 title_art->SetName(NStr::Sanitize(title));
2541             }
2542             MemFree(title);
2543         }
2544     }
2545 
2546     if(ind[ParFlat_RL] == NULL)
2547     {
2548         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
2549                    "No JOURNAL line, reference dropped.");
2550 
2551         desc.Reset();
2552         return desc;
2553     }
2554 
2555     p = ind[ParFlat_RL]->offset;
2556     if(*p == '\0' || *p == ';')
2557     {
2558         ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
2559                    "JOURNAL line is empty, reference dropped.");
2560 
2561         desc.Reset();
2562         return desc;
2563     }
2564 
2565     CRef<objects::CCit_art> cit_art;
2566     if ((StringNCmp(p, "(er)", 4) == 0 || er > 0) &&
2567         pmid > 0 && pp->medserver == 1)
2568     {
2569         cit_art = fta_citart_by_pmid(pmid, retstat);
2570         if(retstat && cit_art == NULL)
2571             pmid = 0;
2572     }
2573 
2574     if (pmid > 0)
2575     {
2576         CRef<objects::CPub> pub(new objects::CPub);
2577         pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2578         desc->SetPub().Set().push_back(pub);
2579     }
2580 
2581     CRef<objects::CPub> pub_ref = journal(pp, p, p + ind[ParFlat_RL]->len, auth_list,
2582                       title_art, has_muid, cit_art, er);
2583 
2584     if (pub_ref.Empty())
2585     {
2586         desc.Reset();
2587         return desc;
2588     }
2589 
2590     fta_add_article_ids(*pub_ref, doi, agricola);
2591 
2592     if(dbp->type == ParFlat_REF_NO_TARGET)
2593         desc->SetReftype(3);
2594 
2595     desc->SetPub().Set().push_back(pub_ref);
2596 
2597     return desc;
2598 }
2599 
2600 /**********************************************************/
fta_sort_pubs(TPubList & pubs)2601 static void fta_sort_pubs(TPubList& pubs)
2602 {
2603     NON_CONST_ITERATE(TPubList, pub, pubs)
2604     {
2605         TPubList::iterator next_pub = pub;
2606         for (++next_pub; next_pub != pubs.end(); ++next_pub)
2607         {
2608             if ((*next_pub)->Which() > (*pub)->Which())
2609                 continue;
2610 
2611             if ((*next_pub)->Which() == (*pub)->Which())
2612             {
2613                 if (!(*pub)->IsMuid() || (*pub)->GetMuid() >= (*next_pub)->GetMuid())
2614                     continue;
2615             }
2616 
2617             pub->Swap(*next_pub);
2618         }
2619     }
2620 }
2621 
2622 /**********************************************************/
fta_check_long_last_name(const objects::CAuth_list & authors,bool soft_report)2623 static void fta_check_long_last_name(const objects::CAuth_list& authors, bool soft_report)
2624 {
2625     static const size_t MAX_LAST_NAME_LEN = 30;
2626 
2627     ErrSev     sev;
2628 
2629     if (!authors.IsSetNames() || !authors.GetNames().IsStd())
2630         return;
2631 
2632     ITERATE(objects::CAuth_list::C_Names::TStd, author, authors.GetNames().GetStd())
2633     {
2634         if (!(*author)->IsSetName() || !(*author)->GetName().IsName())
2635             continue;
2636 
2637         const objects::CName_std& name = (*author)->GetName().GetName();
2638 
2639         if (name.IsSetLast() && name.GetLast().size() > MAX_LAST_NAME_LEN)
2640         {
2641             /* Downgrade severity of this error to WARNING
2642              * if in HTGS mode. As of 7/31/2002, very long
2643              * consortium names were treated as if
2644              * they were author last names, for HTGS data.
2645              * This can be reverted to ERROR after the
2646              * consortium name slot is available and utilized
2647              * in the ASN.1.
2648              */
2649             sev = (soft_report ? SEV_WARNING : SEV_ERROR);
2650             ErrPostEx(sev, ERR_REFERENCE_LongAuthorName,
2651                       "Last name of author exceeds 30 characters in length. A format error in the reference data might have caused the author name to be parsed incorrectly. Name is \"%s\".",
2652                       name.GetLast().c_str());
2653         }
2654     }
2655 }
2656 
2657 /**********************************************************/
fta_check_long_name_in_article(const objects::CCit_art & cit_art,bool soft_report)2658 static void fta_check_long_name_in_article(const objects::CCit_art& cit_art, bool soft_report)
2659 {
2660     if (cit_art.IsSetAuthors())
2661         fta_check_long_last_name(cit_art.GetAuthors(), soft_report);
2662 
2663     if (cit_art.IsSetFrom())
2664     {
2665         const objects::CCit_book* book = nullptr;
2666         if (cit_art.GetFrom().IsBook())
2667             book = &cit_art.GetFrom().GetBook();
2668         else if (cit_art.GetFrom().IsProc())
2669         {
2670             if (cit_art.GetFrom().GetProc().IsSetBook())
2671                 book = &cit_art.GetFrom().GetProc().GetBook();
2672         }
2673 
2674         if (book != nullptr && book->IsSetAuthors())
2675             fta_check_long_last_name(book->GetAuthors(), soft_report);
2676     }
2677 }
2678 
2679 /**********************************************************/
fta_check_long_names(const objects::CPub & pub,bool soft_report)2680 static void fta_check_long_names(const objects::CPub& pub, bool soft_report)
2681 {
2682     if (pub.IsGen())                        /* CitGen */
2683     {
2684         const objects::CCit_gen& cit_gen = pub.GetGen();
2685         if (cit_gen.IsSetAuthors())
2686             fta_check_long_last_name(cit_gen.GetAuthors(), soft_report);
2687     }
2688     else if (pub.IsSub())                   /* CitSub */
2689     {
2690         if (!soft_report)
2691         {
2692             const objects::CCit_sub& cit_sub = pub.GetSub();
2693             if (cit_sub.IsSetAuthors())
2694                 fta_check_long_last_name(cit_sub.GetAuthors(), soft_report);
2695         }
2696     }
2697     else if (pub.IsMedline())                   /* Medline */
2698     {
2699         const objects::CMedline_entry& medline = pub.GetMedline();
2700         if (medline.IsSetCit())
2701         {
2702             fta_check_long_name_in_article(medline.GetCit(), soft_report);
2703         }
2704     }
2705     else if (pub.IsArticle())                   /* CitArt */
2706     {
2707         fta_check_long_name_in_article(pub.GetArticle(), soft_report);
2708     }
2709     else if (pub.IsBook() || pub.IsProc() || pub.IsMan())  /* CitBook or CitProc or
2710                                                               CitLet */
2711     {
2712         const objects::CCit_book* book = nullptr;
2713 
2714         if (pub.IsBook())
2715             book = &pub.GetBook();
2716         else if (pub.IsProc())
2717         {
2718             if (pub.GetProc().IsSetBook())
2719                 book = &pub.GetProc().GetBook();
2720         }
2721         else
2722         {
2723             if (pub.GetMan().IsSetCit())
2724                 book = &pub.GetMan().GetCit();
2725         }
2726 
2727         if (book != nullptr && book->IsSetAuthors())
2728             fta_check_long_last_name(book->GetAuthors(), soft_report);
2729     }
2730     else if (pub.IsPatent())                   /* CitPat */
2731     {
2732         const objects::CCit_pat& patent = pub.GetPatent();
2733 
2734         if (patent.IsSetAuthors())
2735             fta_check_long_last_name(patent.GetAuthors(), soft_report);
2736 
2737         if (patent.IsSetApplicants())
2738             fta_check_long_last_name(patent.GetApplicants(), soft_report);
2739 
2740         if (patent.IsSetAssignees())
2741             fta_check_long_last_name(patent.GetAssignees(), soft_report);
2742     }
2743     else if (pub.IsEquiv())                  /* PubEquiv */
2744     {
2745         ITERATE(TPubList, cur_pub, pub.GetEquiv().Get())
2746         {
2747             fta_check_long_names(*(*cur_pub), soft_report);
2748         }
2749     }
2750 }
2751 
2752 /**********************************************************/
fta_propagate_pmid_muid(objects::CPub_equiv & pub_equiv)2753 static void fta_propagate_pmid_muid(objects::CPub_equiv& pub_equiv)
2754 {
2755     Int4       pmid;
2756     Int4       muid;
2757 
2758     pmid = 0;
2759     muid = 0;
2760 
2761     objects::CCit_art* cit_art = nullptr;
2762     NON_CONST_ITERATE(TPubList, pub, pub_equiv.Set())
2763     {
2764         if ((*pub)->IsMuid() && muid == 0)
2765             muid = ENTREZ_ID_TO(int, (*pub)->GetMuid());
2766         else if ((*pub)->IsPmid() && pmid == 0)
2767             pmid = ENTREZ_ID_TO(int, (*pub)->GetPmid().Get());
2768         else if ((*pub)->IsArticle() && cit_art == nullptr)
2769             cit_art = &(*pub)->SetArticle();
2770     }
2771 
2772     if (cit_art == NULL || (muid == 0 && pmid == 0))
2773         return;
2774 
2775     if(muid != 0)
2776     {
2777         CRef<objects::CArticleId> id(new objects::CArticleId);
2778         id->SetMedline().Set(ENTREZ_ID_FROM(int, muid));
2779         cit_art->SetIds().Set().push_front(id);
2780     }
2781 
2782     if(pmid != 0)
2783     {
2784         CRef<objects::CArticleId> id(new objects::CArticleId);
2785         id->SetPubmed().Set(ENTREZ_ID_FROM(int, pmid));
2786         cit_art->SetIds().Set().push_front(id);
2787     }
2788 }
2789 
2790 /**********************************************************
2791  *
2792  *   PubdescPtr DescrRefs(pp, dbp, col_data):
2793  *
2794  *      Return a Pubdesc pointer.
2795  *
2796  *                                              4-14-93
2797  *
2798  **********************************************************/
DescrRefs(ParserPtr pp,DataBlkPtr dbp,Int4 col_data)2799 CRef<objects::CPubdesc> DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
2800 {
2801     bool soft_report = false;
2802 
2803     bool rej = false;
2804     bool no_auth = false;
2805 
2806     if(pp->mode == Parser::EMode::HTGS)
2807         soft_report = true;
2808 
2809     CRef<objects::CPubdesc> desc;
2810 
2811     if (pp->format == Parser::EFormat::SPROT)
2812         desc = sp_refs(pp, dbp, col_data);
2813     else if(pp->format == Parser::EFormat::XML)
2814         desc = XMLRefs(pp, dbp, no_auth, rej);
2815     else if(pp->format == Parser::EFormat::GenBank)
2816         desc = gb_refs_common(pp, dbp, col_data, true, NULL, no_auth);
2817     else if(pp->format == Parser::EFormat::EMBL)
2818         desc = embl_refs(pp, dbp, col_data, no_auth);
2819 
2820     if(desc && desc->IsSetComment())
2821     {
2822         char *comment = (char *) desc->GetComment().c_str();
2823         ShrinkSpaces(comment);
2824         desc->SetComment(comment);
2825     }
2826 
2827     if(no_auth)
2828     {
2829         if(pp->source == Parser::ESource::EMBL)
2830             ErrPostEx(SEV_ERROR, ERR_REFERENCE_MissingAuthors,
2831                       "Reference has no author names.");
2832         else
2833         {
2834             ErrPostEx(SEV_REJECT, ERR_REFERENCE_MissingAuthors,
2835                       "Reference has no author names. Entry dropped.");
2836             pp->entrylist[pp->curindx]->drop = 1;
2837         }
2838     }
2839 
2840     if(rej)
2841     {
2842         ErrPostEx(SEV_REJECT, ERR_REFERENCE_InvalidMuid,
2843                   "Use of Medline ID in INSDSeq format is not alowed. Entry dropped.");
2844         pp->entrylist[pp->curindx]->drop = 1;
2845     }
2846 
2847     if (desc.NotEmpty() && desc->IsSetPub())
2848     {
2849         fta_sort_pubs(desc->SetPub().Set());
2850 
2851         ITERATE(TPubList, pub, desc->GetPub().Get())
2852         {
2853             fta_check_long_names(*(*pub), soft_report);
2854         }
2855 
2856         fta_propagate_pmid_muid(desc->SetPub());
2857     }
2858 
2859     return desc;
2860 }
2861 
2862 END_NCBI_SCOPE
2863