1 /* utilref.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  utilref.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Utility routines for parsing reference block of flatfile.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/general/Person_id.hpp>
41 #include <objects/biblio/Auth_list.hpp>
42 #include <objects/biblio/Cit_gen.hpp>
43 
44 
45 #include "index.h"
46 
47 #include <objtools/flatfile/flatdefn.h>
48 
49 #include "ftaerr.hpp"
50 #include "asci_blk.h"
51 #include "utilref.h"
52 #include "add.h"
53 #include "utilfun.h"
54 
55 #ifdef THIS_FILE
56 #    undef THIS_FILE
57 #endif
58 #define THIS_FILE "utilref.cpp"
59 
60 #define MAX_PAGE     50
61 #define OTHER_MEDIUM 255
62 
63 BEGIN_NCBI_SCOPE
64 
65 /**********************************************************/
get_tokens(char * pt,const Char * delimeter)66 ValNodePtr get_tokens(char* pt, const Char *delimeter)
67 {
68     ValNodePtr token;
69     ValNodePtr vnp;
70 
71     bool    more;
72 
73     if(pt == NULL || *pt == '\0')
74         return(NULL);
75 
76     token = ValNodeNew(NULL);
77     vnp = token;
78     for(; *pt != '\0'; pt++)
79     {
80         for(; *pt != '\0'; pt++)
81         {
82             if(StringChr(" \n\t\f~,", *pt) == NULL)
83                 break;
84             *pt = '\0';
85         }
86         if(*pt == '\0')
87             break;
88 
89         vnp->next = ValNodeNew(NULL);
90         vnp = vnp->next;
91         vnp->data.ptrvalue = pt;
92         more = false;
93         for(; *pt != '\0'; pt++)
94         {
95             if(StringNCmp(pt, delimeter, StringLen(delimeter)) != 0 &&
96                StringNCmp(pt, ",\n", 2) != 0 && StringNCmp(pt, ",~", 2) != 0 &&
97                StringNCmp(pt, " and ", 5) != 0)
98                 continue;
99 
100             *pt = '\0';
101 
102             if(StringNCmp(pt + 1, "and ", 4) == 0)
103                 pt += 4;
104 
105             more = true;
106             break;
107         }
108 
109         if(!more)
110             break;
111     } /* for, completed parsing author list */
112 
113     vnp = token->next;
114     MemFree(token);
115     return(vnp);
116 }
117 
118 /**********************************************************/
AllUpperCase(char * p)119 static bool AllUpperCase(char* p)
120 {
121     if (p == NULL)
122         return false;
123     while (*p != '\0')
124     {
125         if (!IS_UPPER(*p))
126             return false;
127         p++;
128     }
129     return true;
130 }
131 
132 /**********************************************************/
SplitMlAuthorName(const Char * name,char * last,char * initials,char * suffix)133 static void SplitMlAuthorName(const Char* name, char* last, char* initials,
134                               char* suffix)
135 {
136     char* p;
137     char* p2;
138     Char    sbuf[20];
139     Char    ibuf[20];
140     Int2    i;
141 
142     /* Clear the ibuf field and transfer the entire name to 'last',
143     * excluding leading and trailing spaces
144     */
145     if (name == NULL)
146         return;
147 
148     ibuf[0] = '\0';
149     sbuf[0] = '\0';
150     last[0] = '\0';
151     initials[0] = '\0';
152     suffix[0] = '\0';
153     while (*name <= ' ')
154     {
155         name++;
156         if (*name == '\0')
157             return;
158     }
159     StringCpy(last, name);
160 
161     for (i = static_cast<Int2>(StringLen(last)) - 1; i >= 0 && last[i] <= ' '; i--)
162         last[i] = '\0';
163 
164     /* Strip off the last token (initials or name suffix (Jr, Sr, suffix.)
165     */
166     p = StringRChr(last, ' ');
167     if (p != NULL)                       /* more than just last name */
168     {
169         /* Separate the token from the last name
170         */
171         p2 = p + 1;
172         while (p > last && *p == ' ')
173         {
174             *p = '\0';
175             p--;
176         }
177 
178         /* If the last token is not all upper case, and there are more than
179         * two tokens, see if the next to the last are initials (upper case)
180         */
181         if (!AllUpperCase(p2) && (p = StringRChr(last, (int) ' ')) != NULL)
182         {
183             /* We have at least three tokens, is the next to last initials?
184             */
185             if (AllUpperCase(p + 1))
186             {
187                 /* Yes - concatenate the last two tokens as initials
188                 */
189                 StringCpy(ibuf, p + 1);
190                 StringCpy(sbuf, p2);
191                 while (p > last && *p == ' ')
192                 {
193                     *p = '\0';
194                     p--;
195                 }
196             }
197         }
198 
199         if (ibuf[0] == '\0')             /* Only the last token goes in ibuf */
200             StringCpy(ibuf, p2);
201     }
202 
203     /* now add periods to ibuf and convert suffix
204     */
205     for (p = initials, p2 = ibuf; *p2 != '\0'; p2++, p++)
206     {
207         *p = *p2;
208         if (!IS_LOWER(*(p2 + 1)))        /* watch out for foreign names */
209         {
210             p++;
211             *p = '.';
212         }
213     }
214     *p = '\0';
215 
216     if (sbuf[0])
217     {
218         if (StringCmp(sbuf, "1d") == 0)
219             p = StringMove(suffix, "I.");
220         else if (StringCmp(sbuf, "2d") == 0)
221             p = StringMove(suffix, "II.");
222         else if (StringCmp(sbuf, "3d") == 0)
223             p = StringMove(suffix, "III.");
224         else if (StringCmp(sbuf, "4th") == 0)
225             p = StringMove(suffix, "IV.");
226         else if (StringCmp(sbuf, "5th") == 0)
227             p = StringMove(suffix, "V.");
228         else if (StringCmp(sbuf, "6th") == 0)
229             p = StringMove(suffix, "VI.");
230         else if (StringCmp(sbuf, "Sr") == 0)
231             p = StringMove(suffix, "Sr.");
232         else if (StringCmp(sbuf, "Jr") == 0)
233             p = StringMove(suffix, "Jr.");
234         else
235             p = StringMove(suffix, sbuf);
236     }
237 }
238 
239 /**********************************************************/
GetNameStdFromMl(objects::CName_std & namestd,const Char * token)240 void GetNameStdFromMl(objects::CName_std& namestd, const Char* token)
241 {
242     Char last[80];
243     Char initials[20];
244     Char suffix[20];
245 
246     if (token == NULL)
247         return;
248 
249     SplitMlAuthorName(token, last, initials, suffix);
250     namestd.SetLast(last);
251     if (initials[0] != '\0')
252         namestd.SetInitials(initials);
253     if (suffix[0] != '\0')
254         namestd.SetSuffix(suffix);
255 }
256 
257 /**********************************************************/
/* Deletes every comma, tab and blank from "str"; all other characters
 * keep their relative order.
 */
static void RemoveSpacesAndCommas(std::string& str)
{
    /* Compact in place: 'kept' is the length of the filtered prefix. */
    std::string::size_type kept = 0;

    for (std::string::size_type scan = 0; scan < str.size(); ++scan)
    {
        const char ch = str[scan];

        if (ch == ',' || ch == '\t' || ch == ' ')
            continue;

        str[kept] = ch;
        ++kept;
    }

    str.resize(kept);
}
267 
268 /**********************************************************/
get_auth_from_toks(ValNodePtr token,Uint1 format,CRef<objects::CAuth_list> & auths)269 void get_auth_from_toks(ValNodePtr token, Uint1 format, CRef<objects::CAuth_list>& auths)
270 {
271     ValNodePtr  vnp;
272     char*     p;
273 
274     if (token == NULL)
275         return;
276 
277     for (vnp = token; vnp != NULL; vnp = vnp->next)
278     {
279         p = (char*)vnp->data.ptrvalue;
280         if (StringNCmp(p, "and ", 4) == 0)
281             p += 4;
282 
283         CRef<objects::CAuthor> author = get_std_auth(p, format);
284 
285         if (author.Empty())
286         {
287             ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalAuthorName, "%s", p);
288             continue;
289         }
290 
291         if (author->GetName().GetName().IsSetInitials())
292         {
293             std::string& initials = author->SetName().SetName().SetInitials();
294             RemoveSpacesAndCommas(initials);
295         }
296 
297         if (author->GetName().GetName().IsSetSuffix())
298         {
299             std::string& suffix = author->SetName().SetName().SetSuffix();
300             RemoveSpacesAndCommas(suffix);
301         }
302 
303         if (auths.Empty())
304             auths.Reset(new objects::CAuth_list);
305         auths->SetNames().SetStd().push_back(author);
306     }
307 }
308 
309 /**********************************************************/
get_std_auth(const Char * token,Uint1 format)310 CRef<objects::CAuthor> get_std_auth(const Char* token, Uint1 format)
311 {
312     const Char* auth;
313     const Char* eptr;
314 
315     CRef<objects::CAuthor> author;
316 
317     if (token == NULL || *token == '\0')
318         return author;
319 
320     author = new objects::CAuthor;
321     objects::CPerson_id& person_id = author->SetName();
322     objects::CName_std& namestd = person_id.SetName();
323 
324     for (eptr = token + StringLen(token) - 1; eptr > token && *eptr == ' ';)
325         eptr--;
326 
327     if (format == PIR_REF || format == GB_REF)
328     {
329         for (auth = token; *auth != ',' && *auth != '\0';)
330             auth++;
331         if (*auth == ',')
332         {
333             if (auth[1] != '\0')
334                 namestd.SetInitials(auth + 1);
335         }
336 
337         namestd.SetLast(std::string(token, auth));
338     }
339     else if (format == PDB_REF)
340     {
341         for (auth = eptr; auth > token && *auth != '.';)
342             auth--;
343         if (*auth == '.')
344         {
345             if (auth[1] != '\0' && auth[1] != '.')
346                 namestd.SetLast(auth + 1);
347             namestd.SetInitials(std::string(token, auth + 1));
348         }
349         else
350             namestd.SetLast(token);
351     }
352     else if (format == EMBL_REF || format == SP_REF)
353     {
354         for (auth = eptr; *auth != ' ' && auth > token;)
355             auth--;
356         if (*auth == ' ')
357         {
358             if (*(auth - 1) == '.')
359                 for (auth--; *auth != ' ' && auth > token;)
360                     auth--;
361             if (*auth == ' ')
362             {
363                 if (auth[1] != '\0')
364                     namestd.SetInitials(auth + 1);
365             }
366         }
367         else
368             auth = eptr + 1;
369 
370         namestd.SetLast(std::string(token, auth));
371     }
372     else if (format == ML_REF)
373         GetNameStdFromMl(namestd, token);
374 
375     if (!namestd.IsSetLast())
376     {
377         author.Reset();
378         return author;
379     }
380 
381     return author;
382 }
383 
384 /**********************************************************
385  *
386  *   AuthListPtr get_auth(pt, format, jour):
387  *
388  *      Get AuthListPtr for the authors. Delimiter between
389  *   the authors is ', ' for GenBank and EMBL. Delimiter
390  *   between the authors is ';' for PIR. Delimiter between
391  *   last name and initials is ',' for GenBank and PIR,
392  *   ' ' for EMBL.
393  *      Modified from ParseAuthorList (utilref.c).
394  *
395  *                                              12-4-93
396  *
397  **********************************************************/
get_auth(char * pt,Uint1 format,char * jour,CRef<objects::CAuth_list> & auths)398 void get_auth(char* pt, Uint1 format, char* jour, CRef<objects::CAuth_list>& auths)
399 {
400     static const char *delimiter;
401     static char*    eptr;
402     ValNodePtr        token;
403 
404     switch(format)
405     {
406         case GB_REF:
407         case EMBL_REF:
408         case SP_REF:
409             delimiter = ", ";
410             break;
411         case PIR_REF:
412         case PDB_REF:
413             delimiter = "; ";
414             break;
415         default:
416             break;
417     }
418     if(pt == NULL || *pt == '\0' || *pt == ';')
419         return;
420 
421     size_t len = StringLen(pt);
422     for(eptr = pt + len - 1; IS_ALPHANUM(*eptr) == 0; eptr--)
423         len--;
424 
425     if(len > 4 && StringNCmp(eptr - 4, "et al", 5) == 0)
426     {
427         if(jour == NULL)
428             ErrPostEx(SEV_WARNING, ERR_REFERENCE_EtAlInAuthors, "%s", pt);
429         else
430             ErrPostEx(SEV_WARNING, ERR_REFERENCE_EtAlInAuthors, "%s : %s",
431                       pt, jour);
432     }
433 
434     token = get_tokens(pt, delimiter);
435     get_auth_from_toks(token, format, auths);
436     ValNodeFree(token);
437 }
438 
439 /**********************************************************/
get_auth_consortium(char * cons,CRef<objects::CAuth_list> & auths)440 void get_auth_consortium(char* cons, CRef<objects::CAuth_list>& auths)
441 {
442     char*    p;
443     char*    q;
444 
445     if(cons == NULL || *cons == '\0')
446         return;
447 
448     for (q = cons;; q = p)
449     {
450         p = StringChr(q, ';');
451         if(p != NULL)
452             *p = '\0';
453 
454         CRef<objects::CAuthor> author(new objects::CAuthor);
455         author->SetName().SetConsortium(q);
456 
457         if (auths.Empty())
458             auths.Reset(new objects::CAuth_list);
459         auths->SetNames().SetStd().push_front(author);
460 
461         if(p == NULL)
462             break;
463 
464         for(*p++ = ';'; *p == ';' || *p == ' ';)
465             p++;
466 
467         if (NStr::EqualNocase(p, 0, 4, "and "))
468         {
469             for (p += 4; *p == ' ';)
470                 p++;
471         }
472     }
473 }
474 
475 /**********************************************************/
check_mix_pages_range(char * pages)476 static Int4 check_mix_pages_range(char* pages)
477 {
478     char* page1;
479     char* page2;
480     char* dash;
481     char* p;
482     char* q;
483     Char    ch1;
484     Char    ch2;
485     Int4    i;
486 
487     dash = StringChr(pages, '-');
488     if(dash == NULL)
489         return(0);
490 
491     *dash = '\0';
492     page1 = pages;
493     page2 = dash + 1;
494 
495     if((*page1 >= 'a' && *page1 <= 'z') || (*page1 >= 'A' && *page1 <= 'Z'))
496     {
497         for(p = page1; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
498             p++;
499 
500         if((*page2 < 'a' || *page2 > 'z') && (*page2 < 'A' || *page2 > 'Z'))
501         {
502             *dash = '-';
503             return(-1);
504         }
505 
506         for(q = page2; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z');)
507             q++;
508         ch1 = *p;
509         *p = '\0';
510         ch2 = *q;
511         *q = '\0';
512         i = StringCmp(page1, page2);
513         *p = ch1;
514         *q = ch2;
515         if(i != 0)
516         {
517             *dash = '-';
518             return(-1);
519         }
520         for(page1 = p; *p >= '0' && *p <= '9';)
521             p++;
522         for(page2 = q; *q >= '0' && *q <= '9';)
523             q++;
524 
525         i = atoi(page1) - atoi(page2);
526 
527         if(*p != '\0' || *q != '\0')
528         {
529             *dash = '-';
530             return(-1);
531         }
532         *dash = '-';
533         if(i > 0)
534             return(1);
535         return(0);
536     }
537 
538     if(*page1 < '0' || *page1 > '9' || *page2 < '0' || *page2 > '9')
539     {
540         *dash = '-';
541         return(-1);
542     }
543 
544     for(p = page1; *p >= '0' && *p <= '9';)
545         p++;
546     for(q = page2; *q >= '0' && *q <= '9';)
547         q++;
548     ch1 = *p;
549     *p = '\0';
550     ch2 = *q;
551     *q = '\0';
552     i = atoi(page2) - atoi(page1);
553     *p = ch1;
554     *q = ch2;
555 
556     for(page1 = p; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
557         p++;
558     for(page2 = q; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z');)
559         q++;
560     if(*p != '\0' || *q != '\0' || StringCmp(page1, page2) != 0)
561     {
562         *dash = '-';
563         return(-1);
564     }
565 
566     *dash = '-';
567     if(i < 0)
568         return(1);
569     return(0);
570 }
571 
572 /**********************************************************/
valid_pages_range(char * pages,const Char * title,Int4 er,bool inpress)573 Int4 valid_pages_range(char* pages, const Char* title, Int4 er, bool inpress)
574 {
575     char* p;
576     char* q;
577     char* s;
578     Int4    fps;
579     Int4    lps;
580     Int4    i;
581 
582     if(pages == NULL || *pages == '\0')
583         return(-1);
584 
585     if(title == NULL)
586         title = (char*) "";
587     while(*pages == ' ' || *pages == ';' || *pages == '\t' || *pages == ',')
588         pages++;
589     if(*pages == '\0')
590         return(-1);
591 
592     for(s = pages; *s != '\0';)
593         s++;
594     for(s--; *s == ' ' || *s == ';' || *s == ',' || *s == '\t';)
595         s--;
596     *++s = '\0';
597 
598     p = StringChr(pages, '-');
599     if(p == NULL)
600     {
601         for(q = pages; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z') ||
602                        (*q >= '0' && *q <= '9');)
603             q++;
604         if(*q == '\0')
605             return(0);
606         if((er & 01) == 01)
607             return(0);
608         else if(er > 0)
609             return(-1);
610         return(1);
611     }
612 
613     if(p == pages || p[1] == '\0')
614     {
615         if(er == 0)
616             ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegPageRange,
617                       "Incorrect pages range provided: \"%s\".", pages);
618         return(-1);
619     }
620 
621     if(inpress && (*(p - 1) == ' ' || *(p - 1) == '\t' ||
622        p[1] == ' ' || p[1] == '\t'))
623         return(1);
624 
625     for(q = p + 1; *q >= '0' && *q <= '9';)
626         q++;
627     for(p = pages; *p >= '0' && *p <= '9';)
628         p++;
629     if(*p == '-' && *q == '\0')
630     {
631         *p = '\0';
632         fps = atoi(pages);
633         *p = '-';
634         lps = atoi(p + 1);
635 
636         if(lps - fps >= MAX_PAGE)
637         {
638             ErrPostEx(SEV_WARNING, ERR_REFERENCE_LargePageRange,
639                       "Total pages exceed %d: %s: %s",
640                       MAX_PAGE, pages, title);
641         }
642         else if(fps > lps)
643         {
644             ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvertPageRange,
645                       "Page numbers may be inverted, %s: %s", pages, title);
646         }
647     }
648     else
649     {
650         i = check_mix_pages_range(pages);
651         if(i == -1)
652         {
653             if(er > 0 && (er & 01) != 01)
654                 return(-1);
655             ErrPostEx(SEV_WARNING, ERR_REFERENCE_UnusualPageNumber,
656                       "Pages numbers are not digits, letter+digits, or digits_letter: \"%s\": \"%s\".",
657                       pages, title);
658         }
659         else if(i == 1)
660         {
661             ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvertPageRange,
662                       "Page numbers may be inverted, %s: %s", pages, title);
663         }
664     }
665     return(0);
666 }
667 
668 /**********************************************************
669  *
670  *   NCBI_DatePtr get_date(year):
671  *
672  *      Gets only year and return NCBI_DatePtr.
673  *
674  **********************************************************/
get_date(const Char * year)675 CRef<objects::CDate> get_date(const Char* year)
676 {
677     CRef<objects::CDate> ret;
678 
679     if(year == NULL || *year == '\0')
680     {
681         ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
682                   "No year in reference.");
683         return ret;
684     }
685 
686     if(year[0] < '0' || year[0] > '9' || year[1] < '0' || year[1] > '9' ||
687        year[2] < '0' || year[2] > '9' || year[3] < '0' || year[3] > '9')
688     {
689         ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
690                   "Illegal year: \"%s\".", year);
691         return ret;
692     }
693 
694     std::string year_str(year, year + 4);
695     time_t now = 0;
696     time(&now);
697     struct tm *tm = localtime(&now);
698 
699     Int4 i = NStr::StringToInt(year_str, NStr::fAllowTrailingSymbols);
700 
701     if (i < 1900)
702     {
703         ErrPostEx(SEV_ERROR, ERR_REFERENCE_YearPrecedes1900,
704                   "Reference's year is extremely far in past: \"%s\".", year_str.c_str());
705         return ret;
706     }
707     else if (i < 1950)
708     {
709         ErrPostEx(SEV_WARNING, ERR_REFERENCE_YearPrecedes1950,
710                   "Reference's year is too far in past: \"%s\".", year_str.c_str());
711     }
712     else if (i > tm->tm_year + 1900 + 2)
713     {
714         ErrPostEx(SEV_WARNING, ERR_REFERENCE_ImpendingYear,
715                   "Reference's year is too far in future: \"%s\"", year_str.c_str());
716     }
717 
718     ret.Reset(new objects::CDate);
719     ret->SetStd().SetYear(i);
720 
721     return ret;
722 }
723 
724 /**********************************************************/
get_error(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title)725 CRef<objects::CCit_gen> get_error(char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title)
726 {
727     CRef<objects::CCit_gen> cit_gen(new objects::CCit_gen);
728 
729     char*    s;
730     bool       zero_year = false;
731     char*    end_tit;
732     char*    eptr;
733 
734     size_t len = StringLen(bptr);
735     eptr = bptr + len - 1;
736     while(*eptr == ' ' || *eptr == '\t' || *eptr == '.')
737         *eptr-- = '\0';
738 
739     if(*eptr == ')')
740     {
741         for(s = eptr - 1; s >= bptr && *s != '(';)
742             s--;
743         if(*s == '(' && s[1] == '0')
744         {
745             zero_year = true;
746             for(end_tit = bptr; isdigit((int) *end_tit) == 0;)
747                 end_tit++;
748             *end_tit = '\0';
749         }
750     }
751 
752     if(zero_year)
753     {
754         CRef<objects::CTitle::C_E> journal_title(new objects::CTitle::C_E);
755         if(StringNCmp(bptr, "(re)", 4) == 0)
756             journal_title->SetName(NStr::Sanitize(bptr));
757         else
758             journal_title->SetIso_jta(NStr::Sanitize(bptr));
759 
760         cit_gen->SetJournal().Set().push_back(journal_title);
761         cit_gen->SetCit("In press");
762     }
763     else if(bptr != NULL)
764     {
765         cit_gen->SetCit(NStr::Sanitize(bptr));
766     }
767 
768     if (auth_list.NotEmpty())
769         cit_gen->SetAuthors(*auth_list);
770 
771     if (title.NotEmpty())
772         cit_gen->SetTitle(title->GetName());
773 
774     return cit_gen;
775 }
776 
777 END_NCBI_SCOPE
778