1 /* $Id: SubSource.cpp 632184 2021-05-27 13:27:21Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  .......
27  *
28  * File Description:
29  *   .......
30  *
31  * Remark:
32  *   This code was originally generated by application DATATOOL
33  *   using the following specifications:
34  *   'seqfeat.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <serial/enumvalues.hpp>
40 
41 // generated includes
42 #include <objects/seqfeat/SubSource.hpp>
43 
44 #include <math.h>
45 #include <objects/misc/sequence_util_macros.hpp>
46 #include <corelib/ncbitime.hpp>
47 
48 #include <util/row_reader_ncbi_tsv.hpp>
49 
50 // generated classes
51 
52 BEGIN_NCBI_SCOPE
53 
54 BEGIN_objects_SCOPE // namespace ncbi::objects::
55 
// Out-of-class definitions of two static data members of CSubSource.
// Each is a unique_ptr to a CLatLonCountryMap and, having static storage
// duration with no initializer, starts out null.
56 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
57 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;
58 
59 
60 // destructor
~CSubSource(void)61 CSubSource::~CSubSource(void)
62 {
63 }
64 
GetLabel(string * str) const65 void CSubSource::GetLabel(string* str) const
66 {
67     *str += '/';
68     string type_name;
69     if (GetSubtype() == eSubtype_other) {
70         type_name = "other";
71     } else {
72         try {
73             // eVocabulary_insdc has some special cases not (historically)
74             // used here.
75             type_name = GetSubtypeName(GetSubtype());
76             replace(type_name.begin(), type_name.end(), '_', '-');
77         } catch (CSerialException&) {
78             type_name = "unknown";
79         }
80     }
81     *str += type_name;
82     *str += '=';
83     *str += GetName();
84     if (IsSetAttrib()) {
85         *str += " (";
86         *str += GetAttrib();
87         *str += ")";
88     }
89 }
90 
91 
GetSubtypeValue(const string & str,EVocabulary vocabulary)92 CSubSource::TSubtype CSubSource::GetSubtypeValue(const string& str,
93                                                  EVocabulary vocabulary)
94 {
95     string name = NStr::TruncateSpaces(str);
96     NStr::ToLower(name);
97     replace(name.begin(), name.end(), '_', '-');
98     replace(name.begin(), name.end(), ' ', '-');
99 
100     if ( NStr::EqualNocase(name, "note") ||
101          NStr::EqualNocase(name, "subsource-note") ||
102          NStr::EqualNocase(name, "subsrc-note") ||
103          NStr::EqualNocase(name, "note-subsource")) {
104         return eSubtype_other;
105     } else if (vocabulary == eVocabulary_insdc) {
106         // consider a table if more special cases arise.
107         if (name == "insertion-seq") {
108             return eSubtype_insertion_seq_name;
109         } else if (name == "plasmid") {
110             return eSubtype_plasmid_name;
111         } else if (name == "transposon") {
112             return eSubtype_transposon_name;
113         } else if (name == "sub-clone") {
114             return eSubtype_subclone;
115         }
116     }
117     return ENUM_METHOD_NAME(ESubtype)()->FindValue(name);
118 }
119 
120 
IsValidSubtypeName(const string & str,EVocabulary vocabulary)121 bool CSubSource::IsValidSubtypeName(const string& str,
122                                     EVocabulary vocabulary)
123 {
124 
125     string name = NStr::TruncateSpaces(str);
126     NStr::ToLower(name);
127     replace(name.begin(), name.end(), '_', '-');
128     replace(name.begin(), name.end(), ' ', '-');
129 
130     if ( NStr::EqualNocase(name, "note") ||
131          NStr::EqualNocase(name, "subsource-note") ||
132          NStr::EqualNocase(name, "subsrc-note") ||
133          NStr::EqualNocase(name, "note-subsource")) {
134          return true;
135     }
136     if (vocabulary == eVocabulary_insdc) {
137         // consider a table if more special cases arise.
138         if (name == "insertion-seq" ||
139             name == "plasmid" ||
140             name == "transposon" ||
141             name == "sub-clone") {
142             return true;
143         }
144     }
145     return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
146 }
147 
148 
GetSubtypeName(CSubSource::TSubtype stype,EVocabulary vocabulary)149 string CSubSource::GetSubtypeName(CSubSource::TSubtype stype,
150                                   EVocabulary vocabulary)
151 {
152     if (stype == CSubSource::eSubtype_other) {
153         return "note";
154     } else if (vocabulary == eVocabulary_insdc) {
155         switch (stype) {
156         case eSubtype_subclone:           return "sub_clone";
157         case eSubtype_plasmid_name:       return "plasmid";
158         case eSubtype_transposon_name:    return "transposon";
159         case eSubtype_insertion_seq_name: return "insertion_seq";
160         default:
161             return NStr::Replace
162                 (ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
163                  "-", "_");
164         }
165     } else {
166         return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
167     }
168 }
169 
170 
171 
IsMultipleValuesAllowed(TSubtype subtype)172 bool CSubSource::IsMultipleValuesAllowed(TSubtype subtype)
173 {
174     return subtype != eSubtype_chromosome
175         && subtype != eSubtype_sex
176         && subtype != eSubtype_germline
177         && subtype != eSubtype_rearranged
178         && subtype != eSubtype_plasmid_name
179         && subtype != eSubtype_segment
180         && subtype != eSubtype_country
181         && subtype != eSubtype_transgenic
182         && subtype != eSubtype_environmental_sample
183         && subtype != eSubtype_lat_lon
184         && subtype != eSubtype_collection_date
185         && subtype != eSubtype_collected_by
186         && subtype != eSubtype_identified_by
187         && subtype != eSubtype_fwd_primer_seq
188         && subtype != eSubtype_rev_primer_seq
189         && subtype != eSubtype_fwd_primer_name
190         && subtype != eSubtype_rev_primer_name
191         && subtype != eSubtype_metagenomic
192         && subtype != eSubtype_altitude
193         && subtype != eSubtype_clone;
194 }
195 
196 
NeedsNoText(const TSubtype & subtype)197 bool CSubSource::NeedsNoText(const TSubtype& subtype)
198 {
199     if (subtype == eSubtype_germline
200         || subtype == eSubtype_rearranged
201         || subtype == eSubtype_transgenic
202         || subtype == eSubtype_environmental_sample
203         || subtype == eSubtype_metagenomic) {
204         return true;
205     } else {
206         return false;
207     }
208 }
209 
210 
IsDiscouraged(const TSubtype subtype)211 bool CSubSource::IsDiscouraged(const TSubtype subtype)
212 {
213     if (subtype == eSubtype_frequency
214         || subtype == eSubtype_insertion_seq_name
215         || subtype == eSubtype_phenotype
216         || subtype == eSubtype_plastid_name
217         || subtype == eSubtype_transposon_name
218         || subtype == eSubtype_fwd_primer_seq
219         || subtype == eSubtype_rev_primer_seq
220         || subtype == eSubtype_fwd_primer_name
221         || subtype == eSubtype_rev_primer_name
222         || subtype == eSubtype_whole_replicon) {  // metagenomic subsrc qualifier taken off this list: GB-3384
223         return true;
224     } else {
225         return false;
226     }
227 }
228 
229 
IsDayValueOkForMonth(int day,int month,int year)230 bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
231 {
232     if (month < 1 || month > 12 || day < 1) {
233         return false;
234     }
235     bool rval = true;
236     if (year < 100) {
237         year += 2000;
238     } else if (year > 3000) {
239         return false;
240     } else if (year < 1538) {
241         return false;
242     }
243     CTime month_o(year, month, 1);
244     if (day > month_o.DaysInMonth()) {
245         rval = false;
246     }
247     return rval;
248 }
249 
250 
DateFromCollectionDate(const string & test)251 CRef<CDate> CSubSource::DateFromCollectionDate (const string& test) THROWS((CException))
252 {
253     if (NStr::IsBlank(test)) {
254         NCBI_THROW (CException, eUnknown,
255                         "collection-date string is blank");
256     }
257     string str = NStr::TruncateSpaces(test);
258 
259     if (IsISOFormatDate(str)) {
260         return GetDateFromISODate(str);
261     }
262 
263     size_t pos = NStr::Find(str, "-");
264     string year;
265     string month;
266     string day;
267 
268     if (pos == NPOS) {
269         year = str;
270     } else {
271         size_t pos2 = NStr::Find(str, "-", pos + 1);
272         if (pos2 == NPOS) {
273             month = str.substr(0, pos);
274             year = str.substr(pos + 1);
275             if (NStr::IsBlank(month)) {
276                 NCBI_THROW (CException, eUnknown,
277                                 "collection-date string is improperly formatted");
278             }
279         } else {
280             day = str.substr(0, pos);
281             month = str.substr(pos + 1, pos2 - pos - 1);
282             year = str.substr(pos2 + 1);
283             if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
284                 NCBI_THROW (CException, eUnknown,
285                                 "collection-date string is improperly formatted");
286             }
287         }
288     }
289 
290     int month_val = 0;
291     if (!NStr::IsBlank(month)) {
292         try {
293             month_val = CTime::MonthNameToNum(month);
294         } catch (CTimeException& ex) {
295             NCBI_THROW (CException, eUnknown,
296                             "collection-date string has invalid month");
297         }
298     }
299 
300     int day_val = 0;
301     if (!NStr::IsBlank(day)) {
302         try {
303             day_val = NStr::StringToInt (day);
304             if (day_val < 1) {
305                 NCBI_THROW (CException, eUnknown,
306                                 "collection-date string has invalid day value");
307             }
308         } catch ( const exception& ) {
309             // threw exception while converting to int
310             NCBI_THROW (CException, eUnknown,
311                             "collection-date string is improperly formatted");
312         }
313     }
314 
315     if (NStr::IsBlank(year)) {
316         NCBI_THROW (CException, eUnknown,
317                         "collection-date string is improperly formatted");
318     }
319 
320     int year_val = 0;
321     try {
322         year_val = NStr::StringToInt (year);
323     } catch ( const exception& ) {
324         // threw exception while converting to int
325         NCBI_THROW (CException, eUnknown,
326                         "collection-date string is improperly formatted");
327     }
328 
329     /*
330     if (year_val < 1000 || year_val >= 2100) {
331         NCBI_THROW (CException, eUnknown,
332                         "collection-date year is out of range");
333     }
334     */
335 
336     if (year_val < 1000) {
337         NCBI_THROW (CException, eUnknown,
338                         "collection-date year is out of range");
339     }
340 
341     if (year_val >= 2100) {
342         NCBI_THROW (CException, eUnknown,
343                         "collection-date year is out of range");
344     }
345 
346     if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
347         NCBI_THROW (CException, eUnknown,
348                         "collection-date day is greater than monthly maximum");
349     }
350 
351     CRef<CDate> date(new CDate);
352 
353     date->SetStd().SetYear (year_val);
354     if (month_val > 0) {
355         date->SetStd().SetMonth (month_val);
356     }
357     if (day_val > 0) {
358         date->SetStd().SetDay (day_val);
359     }
360 
361     time_t t;
362 
363     time(&t);
364 
365     CDate now(t);
366 
367     /*
368     if (IsCollectionDateAfterTime(*date, t)) {
369          NCBI_THROW (CException, eUnknown,
370                         "collection-date year is out of range");
371     }
372     */
373 
374     return date;
375 }
376 
377 
IsCollectionDateAfterTime(const string & collection_date,time_t t,bool & bad_format)378 bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
379 {
380     bad_format = false;
381     bool in_future = false;
382     vector<string> pieces;
383     NStr::Split(collection_date, "/", pieces);
384     if (pieces.size() > 2) {
385         bad_format = true;
386     } else {
387         ITERATE(vector<string>, it, pieces) {
388             CRef<CDate> coll_date = DateFromCollectionDate (*it);
389             if (!coll_date) {
390                 bad_format = true;
391             } else if (IsCollectionDateAfterTime(*coll_date, t)) {
392                 in_future = true;
393             }
394         }
395     }
396     return in_future;
397 }
398 
399 
IsCollectionDateAfterTime(const CDate & collection_date,time_t t)400 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
401 {
402     CDate now(t);
403     if (collection_date.Compare(now) == CDate::eCompare_after) {
404         return true;
405     } else {
406         return false;
407     }
408 }
409 
410 
IsCollectionDateAfterTime(const CDate & collection_date,CTime & ctime)411 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
412 {
413     time_t t = ctime.GetTimeT();
414     return IsCollectionDateAfterTime(collection_date, t);
415 }
416 
417 
IsCorrectDateFormat(const string & date_string,bool & bad_format,bool & in_future)418 void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
419 {
420     bad_format = false;
421     in_future = false;
422 
423     vector<string> pieces;
424     NStr::Split(date_string, "/", pieces);
425     if (pieces.size() > 2) {
426         bad_format = true;
427         return;
428     } else if (pieces.size() == 2) {
429         bool first_bad = false;
430         bool first_future = false;
431         bool second_bad = false;
432         bool second_future = false;
433         IsCorrectDateFormat(pieces[0], first_bad, first_future);
434         IsCorrectDateFormat(pieces[1], second_bad, second_future);
435         bad_format = first_bad || second_bad;
436         if (!bad_format) {
437             in_future = first_future || second_future;
438         }
439         return;
440     }
441 
442     try {
443         CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);
444 
445         if (!IsISOFormatDate(date_string)) {
446             // if there are two dashes, then the first token needs to be the day, and the
447             // day has to have two numbers, a leading zero if the day is less than 10
448             size_t pos = NStr::Find(date_string, "-");
449             if (pos != NPOS) {
450                 size_t pos2 = NStr::Find(date_string, "-", pos + 1);
451                 if (pos2 != NPOS  &&  pos != 2) {
452                     bad_format = true;
453                 }
454             }
455         }
456 
457         if (!bad_format) {
458             time_t t;
459 
460             time(&t);
461 
462             in_future = IsCollectionDateAfterTime(*coll_date, t);
463         }
464     } catch (CException ) {
465         bad_format = true;
466     }
467 }
468 
CheckDateFormat(const string & date_string)469 size_t CSubSource::CheckDateFormat(const string& date_string)
470 {
471     size_t rval = eDateFormatFlag_ok;
472     vector<string> pieces;
473     NStr::Split(date_string, "/", pieces);
474     if (pieces.size() > 2) {
475         rval |= eDateFormatFlag_bad_format;
476     } else if (pieces.size() == 2) {
477         rval |= CheckDateFormat(pieces[0]);
478         rval |= CheckDateFormat(pieces[1]);
479         if (rval == eDateFormatFlag_ok) {
480             try {
481                 CRef<CDate> d1 = CSubSource::DateFromCollectionDate(pieces[0]);
482                 CRef<CDate> d2 = CSubSource::DateFromCollectionDate(pieces[1]);
483                 if (d2->Compare(*d1) == CDate::eCompare_before) {
484                     rval |= eDateFormatFlag_out_of_order;
485                 }
486             } catch (CException) {
487                 rval |= eDateFormatFlag_bad_format;
488             }
489         }
490         return rval;
491     }
492 
493     try {
494         CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);
495 
496         if (!IsISOFormatDate(date_string)) {
497             // if there are two dashes, then the first token needs to be the day, and the
498             // day has to have two numbers, a leading zero if the day is less than 10
499             size_t pos = NStr::Find(date_string, "-");
500             if (pos != NPOS) {
501                 size_t pos2 = NStr::Find(date_string, "-", pos + 1);
502                 if (pos2 != NPOS  &&  pos != 2) {
503                     rval |= eDateFormatFlag_bad_format;
504                 }
505             }
506         }
507 
508         if (rval == eDateFormatFlag_ok) {
509             time_t t;
510 
511             time(&t);
512             if (IsCollectionDateAfterTime(*coll_date, t)) {
513                 rval |= eDateFormatFlag_in_future;
514             }
515         }
516     } catch (CException) {
517         rval |= eDateFormatFlag_bad_format;
518     }
519     return rval;
520 }
521 
// Turn the flags from CheckDateFormat() into a human-readable message.
// Only one message is produced, with priority bad format > in the future >
// out of order; an empty string means no problem was flagged.
string CSubSource::GetCollectionDateProblem (const string& date_string)
{
    const size_t flags = CheckDateFormat(date_string);

    if (flags & eDateFormatFlag_bad_format) {
        return "Collection_date format is not in DD-Mmm-YYYY format";
    }
    if (flags & eDateFormatFlag_in_future) {
        return "Collection_date is in the future";
    }
    if (flags & eDateFormatFlag_out_of_order) {
        return "Collection_dates are out of order";
    }
    return string();
}
535 
536 
// Try to interpret orig_date as "<date><delim><date>" and return the two
// dates, each normalized by FixDateFormat(), joined with '/'.
//
// Returns an empty string when the delimiter (matched case-insensitively)
// is absent, when it is found again searching from one character past the
// first match, or when either side is blank after fixing or has an
// ambiguous month.
string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
{
    const size_t delim_pos = NStr::Find(orig_date, delim, NStr::eNocase);
    if (delim_pos == NPOS) {
        return kEmptyStr;
    }

    // A second occurrence means this delimiter does not split the text
    // into exactly two parts.
    if (NStr::Find(orig_date.substr(delim_pos + 1), delim, NStr::eNocase) != NPOS) {
        return kEmptyStr;
    }

    bool ambiguous = false;

    const string range_start =
        FixDateFormat(orig_date.substr(0, delim_pos), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_start)) {
        return kEmptyStr;
    }

    const string range_end =
        FixDateFormat(orig_date.substr(delim_pos + delim.length()), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_end)) {
        return kEmptyStr;
    }

    return range_start + "/" + range_end;
}
559 
560 
// Normalize a collection-date string, assuming month-before-day order.
//
// First tries the string as a single date.  An ambiguous month yields an
// empty result.  If the single-date attempt produced nothing, the string is
// retried as a two-date range using each of the delimiters "/", " to ",
// " and ", "-", "_" in that order; the first non-blank result wins.
string CSubSource::FixDateFormat (const string& orig_date)
{
    bool ambiguous = false;
    string result = FixDateFormat(orig_date, true, ambiguous);

    if (ambiguous) {
        return kEmptyStr;
    }
    if (!NStr::IsBlank(result)) {
        return result;
    }

    static const char* kRangeDelimiters[] = {"/", " to ", " and ", "-", "_"};
    for (const char* delim : kRangeDelimiters) {
        result = x_ParseDateRangeWithDelimiter(orig_date, delim);
        if (!NStr::IsBlank(result)) {
            break;
        }
    }
    return result;
}
579 
580 // ISO Format for time is one of these:
581 // HH:MM:SS
582 // HH:MM
583 // HH
584 // Followed by either Z or +hh:mm to indicate an offset from Zulu
IsISOFormatTime(const string & orig_time,int & hour,int & min,int & sec,bool require_time_zone)585 bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
586 {
587     int offset_hour = 0;
588     int offset_min = 0;
589     size_t suffix = NStr::Find(orig_time, "Z");
590     if (suffix == NPOS) {
591         suffix = NStr::Find(orig_time, "+");
592         if (suffix == NPOS) {
593             if (require_time_zone) {
594                 return false;
595             } else {
596                 suffix = orig_time.length();
597             }
598         } else {
599             if (orig_time.substr(suffix).length() != 6 ||
600                 !isdigit((unsigned char)orig_time[suffix + 1]) ||
601                 !isdigit((unsigned char)orig_time[suffix + 2]) ||
602                 orig_time[suffix + 3] != ':' ||
603                 !isdigit((unsigned char)orig_time[suffix + 4]) ||
604                 !isdigit((unsigned char)orig_time[suffix + 5])) {
605                 return false;
606             }
607             try {
608                 offset_hour = NStr::StringToInt(orig_time.substr(suffix + 1, 2));
609                 offset_min = NStr::StringToInt(orig_time.substr(suffix + 4, 2));
610             } catch (...) {
611                 return false;
612             }
613         }
614     }
615     if (suffix != 2 && suffix != 5 && suffix != 8) {
616         return false;
617     }
618 
619     if (!isdigit((unsigned char)orig_time[0]) || !isdigit((unsigned char)orig_time[1])) {
620         return false;
621     }
622     hour = 0;
623     min = 0;
624     sec = 0;
625     try {
626         hour = NStr::StringToInt(orig_time.substr(0, 2));
627         if (hour < 0 || hour > 23) {
628             return false;
629         }
630         hour -= offset_hour;
631     } catch (...) {
632         return false;
633     }
634     if (suffix > 2) {
635         if (!isdigit((unsigned char)orig_time[3]) || !isdigit((unsigned char)orig_time[4])) {
636             return false;
637         }
638         try {
639             min = NStr::StringToInt(orig_time.substr(3, 2));
640             if (min < 0 || min > 59) {
641                 return false;
642             }
643         } catch (...) {
644             return false;
645         }
646         min -= offset_min;
647     }
648     if (suffix == 8) {
649         if (!isdigit((unsigned char)orig_time[6]) || !isdigit((unsigned char)orig_time[7])) {
650             return false;
651         }
652         try {
653             sec = NStr::StringToInt(orig_time.substr(6, 2));
654             if (sec < 0) {
655                 // negative number bad
656                 return false;
657             } else if (sec > 59) {
658                 // too big
659                 return false;
660             }
661         } catch (...) {
662             return false;
663         }
664     }
665 
666     return true;
667 }
668 
669 // ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
670 // For ten characters:
671 // First four characters must be digits, represent year.
672 // Fifth character must be dash.
673 // Sixth and seventh characters must be digits, represent month, use zero padding.
674 // Eighth character must be dash.
675 // Ninth and tenth characters must be digits, represent day, use zero padding.
676 // For 7 characters:
677 // First four characters must be digits, represent year.
678 // Fifth character must be dash.
679 // Sixth and seventh characters must be digits, represent month, use zero padding.
IsISOFormatDateOnly(const string & cpy)680 bool CSubSource::IsISOFormatDateOnly (const string& cpy)
681 {
682     if (cpy.length() != 10 && cpy.length() != 7) {
683         return false;
684     }
685     bool rval = true;
686     size_t pos = 0;
687     string::const_iterator it = cpy.begin();
688     while (it != cpy.end() && rval) {
689         if (pos == 4 || pos == 7) {
690             if (*it != '-') {
691                 rval = false;
692             }
693         } else if (!isdigit(*it)) {
694             rval = false;
695         }
696         ++it;
697         ++pos;
698     }
699     if (rval) {
700         try {
701             int year = NStr::StringToInt(cpy.substr(0, 4));
702             int month = NStr::StringToInt(cpy.substr(5, 2));
703             if (month < 1 || month > 12) {
704                 rval = false;
705             }
706             if (cpy.length() == 10) { // has day
707                 int day = NStr::StringToInt(cpy.substr(8, 2));
708                 if (!IsDayValueOkForMonth(day, month, year)) {
709                     rval = false;
710                 }
711             }
712         } catch (...) {
713             rval = false;
714         }
715     }
716     return rval;
717 }
718 
719 
x_IsFixableIsoDate(const string & orig_date)720 bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
721 {
722     string cpy = orig_date;
723     NStr::TruncateSpacesInPlace(cpy);
724     size_t time_pos = NStr::Find(cpy, "T");
725     bool rval = false;
726     if (time_pos == NPOS) {
727         rval = false;
728     } else {
729         if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
730             rval = false;
731         } else {
732             int h, m, s;
733             if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
734                 // already fine, not fixable
735                 rval = false;
736             } else {
737                 rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
738             }
739         }
740     }
741     return rval;
742 }
743 
744 
// Return orig_date with surrounding spaces trimmed and everything from the
// first 'T' onwards removed; without a 'T' only the trimming applies.
string CSubSource::x_RemoveIsoTime(const string& orig_date)
{
    string trimmed = orig_date;
    NStr::TruncateSpacesInPlace(trimmed);

    const size_t t_pos = NStr::Find(trimmed, "T");
    if (t_pos == NPOS) {
        return trimmed;
    }
    return trimmed.substr(0, t_pos);
}
755 
756 
IsISOFormatDate(const string & orig_date)757 bool CSubSource::IsISOFormatDate(const string& orig_date)
758 {
759     string cpy = orig_date;
760     NStr::TruncateSpacesInPlace(cpy);
761     size_t time_pos = NStr::Find(cpy, "T");
762     if (time_pos == NPOS) {
763         return IsISOFormatDateOnly(cpy);
764     } else {
765         int h, m, s;
766         return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
767             IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
768     }
769 
770 }
771 
GetDateFromISODate(const string & orig_date)772 CRef<CDate> CSubSource::GetDateFromISODate(const string& orig_date)
773 {
774     try {
775         string cpy = orig_date;
776         NStr::TruncateSpacesInPlace(cpy);
777         CRef<CDate> date(new CDate());
778         int year_val = NStr::StringToInt(cpy.substr(0, 4));
779         int month_val = NStr::StringToInt(cpy.substr(5, 2));
780         date->SetStd().SetYear (year_val);
781         date->SetStd().SetMonth (month_val);
782         if (cpy.length() > 7) {
783             int day_val = NStr::StringToInt(cpy.substr(8, 2));
784             date->SetStd().SetDay (day_val);
785         }
786         return date;
787     } catch (...) {
788         return CRef<CDate>(NULL);
789     }
790 }
791 
792 
x_GetDateTokens(const string & orig_date)793 vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
794 {
795     vector<string> tokens;
796     string token_delimiters = " ,-/=_.";
797 
798     string cpy = orig_date;
799     NStr::TruncateSpacesInPlace (cpy);
800 
801     string curr_token;
802     bool is_chars = false;
803     ITERATE(string, s, cpy) {
804         if (token_delimiters.find(*s) != NPOS) {
805             if (!NStr::IsBlank(curr_token)) {
806                 tokens.push_back(curr_token);
807             }
808             curr_token.clear();
809             is_chars = false;
810         } else if (is_chars && !isalpha((unsigned char)(*s))) {
811             // previous token was all letters, do not add non-letter characters
812             if (!NStr::IsBlank(curr_token)) {
813                 tokens.push_back(curr_token);
814             }
815             curr_token = *s;
816             is_chars = false;
817         } else if (!NStr::IsBlank(curr_token) && !is_chars && isalpha(*s)) {
818             // previous token had no letters
819             tokens.push_back(curr_token);
820             curr_token = *s;
821             is_chars = true;
822         } else {
823             curr_token += *s;
824             if (isalpha(*s)) {
825                 is_chars = true;
826             }
827         }
828     }
829     if (!NStr::IsBlank(curr_token)) {
830         tokens.push_back(curr_token);
831     }
832 
833     // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
834     if (tokens.size() > 3) {
835         vector<string>::iterator p = tokens.begin();
836         bool prev_is_number = isdigit((unsigned char)(*p)[0]);
837         vector<string>::iterator s = p;
838         ++s;
839         while (s != tokens.end()) {
840             if (prev_is_number &&
841                 (NStr::EqualNocase(*s, "st") ||
842                 NStr::EqualNocase(*s, "nd") ||
843                 NStr::EqualNocase(*s, "rd") ||
844                 NStr::EqualNocase(*s, "th"))) {
845                 *p += *s;
846                 s = tokens.erase(s);
847                 prev_is_number = false;
848             } else {
849                 ++p;
850                 ++s;
851                 prev_is_number = isdigit((unsigned char)(*p)[0]);
852             }
853         }
854     }
855 
856     return tokens;
857 }
858 
859 
s_ChooseMonthAndDay(const string & token1,const string & token2,bool month_first,string & month,int & day,bool & month_ambiguous)860 bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
861 {
862     try {
863         int val1 = NStr::StringToInt (token1);
864         int val2 = NStr::StringToInt (token2);
865         if (val1 > 12 && val2 > 12) {
866             // both numbers too big for month
867             return false;
868         } else if (val1 < 13 && val2 < 13) {
869             if (val1 == val2) {
870                 // no need to call this ambiguous
871                 month = CTime::MonthNumToName(val1, CTime::eAbbr);
872                 day = val2;
873             } else {
874                 // both numbers could be month
875                 month_ambiguous = true;
876                 if (month_first) {
877                     month = CTime::MonthNumToName(val1, CTime::eAbbr);
878                     day = val2;
879                 } else {
880                     month = CTime::MonthNumToName(val2, CTime::eAbbr);
881                     day = val1;
882                 }
883             }
884         } else if (val1 < 13) {
885             month = CTime::MonthNumToName(val1, CTime::eAbbr);
886             day = val2;
887         } else {
888             month = CTime::MonthNumToName(val2, CTime::eAbbr);
889             day = val1;
890         }
891         return true;
892     } catch ( ... ) {
893         return false;
894     }
895 }
896 
897 
FixDateFormat(const string & test,bool month_first,bool & month_ambiguous)898 string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
899 {
900     string orig_date = test;
901     NStr::TruncateSpacesInPlace(orig_date);
902 
903     if (IsISOFormatDate(orig_date)) {
904         return orig_date;
905     } else if (x_IsFixableIsoDate(orig_date)) {
906         return x_RemoveIsoTime(orig_date);
907     }
908 
909     string reformatted_date;
910     string month;
911     int year = 0, day = 0;
912     //string token_delimiters = " ,-/=_.";
913     size_t num_original_tokens = 0;
914 
915     month_ambiguous = false;
916     vector<string> tokens = x_GetDateTokens(orig_date);
917 
918     num_original_tokens = tokens.size();
919     if (tokens.size() < 1 || tokens.size() > 3) {
920         // no tokens or too many tokens
921         return kEmptyStr;
922     }
923 
924     string one_token;
925     vector<string>::iterator it = tokens.begin();
926     while (it != tokens.end()) {
927         one_token = *it;
928         bool found = false;
929         if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
930             day = 1;
931             found = true;
932         } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
933             day = 2;
934             found = true;
935         } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
936             day = 3;
937             found = true;
938         } else if (one_token.length() > 0
939                    && isdigit((unsigned char)one_token[0])
940                    && NStr::EndsWith(one_token, "th")) {
941             try {
942                 day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
943                 found = true;
944             } catch ( ... ) {
945                 // threw exception while converting to int
946                 return kEmptyStr;
947             }
948         } else if (isalpha((unsigned char)one_token[0])) {
949             if (!NStr::IsBlank(month)) {
950                 // already have month, error
951                 return kEmptyStr;
952             }
953             if (one_token.length() > 3) {
954                 one_token = one_token.substr(0, 3);
955             }
956             try {
957                 int month_num = CTime::MonthNameToNum(one_token);
958                 found = true;
959                 month = CTime::MonthNumToName(month_num, CTime::eAbbr);
960             } catch (CTimeException& e) {
961             }
962         } else {
963             try {
964                 int this_val = NStr::StringToInt (one_token);
965                 int min = 1;
966                 int max = 31;
967                 if (this_val < min) {
968                     return kEmptyStr;
969                 } else if (this_val > max) {
970                     if (year > 0) {
971                         // already have year, error
972                         return kEmptyStr;
973                     }
974                     year = this_val;
975                     found = true;
976                 }
977             } catch ( ... ) {
978                 // threw exception while converting to int
979                 return kEmptyStr;
980             }
981         }
982         if (found) {
983             it = tokens.erase(it);
984         } else {
985             it++;
986         }
987     }
988 
989     if (tokens.size() == 0) {
990         // good - all tokens assigned to values
991     } else if (tokens.size() > 2) {
992         // three numbers: treat last one as year
993         try {
994             year = NStr::StringToInt(tokens[2]);
995             if (year < 100) {
996                 year += 2000;
997             }
998             if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
999                 return kEmptyStr;
1000             }
1001             // mark month as ambiguous, since we are guessing about year
1002             month_ambiguous = true;
1003         } catch ( ... ) {
1004             // threw exception while converting to int
1005             return kEmptyStr;
1006         }
1007     } else if (tokens.size() == 1) {
1008         try {
1009             int val = NStr::StringToInt (tokens[0]);
1010             if (year == 0) {
1011                 year = val;
1012             } else {
1013                 if (NStr::IsBlank (month)) {
1014                     if (val > 0 && val < 13) {
1015                         month = CTime::MonthNumToName(val, CTime::eAbbr);
1016                     } else {
1017                         // month number out of range
1018                         return kEmptyStr;
1019                     }
1020                 } else {
1021                     day = val;
1022                 }
1023             }
1024         } catch ( ... ) {
1025             // threw exception while converting to int
1026             return kEmptyStr;
1027         }
1028     } else if (!NStr::IsBlank (month)) {
1029         if (tokens.size() == 2) {
1030             // we have a month and two other numbers (we hope)
1031             int val1 = 0;
1032             int val2 = 0;
1033             try {
1034                 val1 = NStr::StringToInt (tokens[0]);
1035                 val2 = NStr::StringToInt (tokens[1]);
1036             } catch (CException& /*e*/) {
1037                 // not actually numbers
1038                 return kEmptyStr;
1039             }
1040             bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
1041             bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
1042             if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
1043                 // if one token is not zero-padded and less than 10,
1044                 // the other either is zero-padded and greater than 10,
1045                 // the "small" token is the day and the second (+2000) is the year
1046                 day = val1;
1047                 year = val2 + 2000;
1048             } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
1049                 // if one token is not zero-padded and less than 10,
1050                 // the other either is zero-padded and greater than 10,
1051                 // the "small" token is the day and the second (+2000) is the year
1052                 day = val2;
1053                 year = val1 + 2000;
1054             } else {
1055                 int month_num = CTime::MonthNameToNum(month);
1056                 if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
1057                     day = val1;
1058                     year = val2 + 2000;
1059                 } else {
1060                     day = val2;
1061                     year = val1 + 2000;
1062                 }
1063             }
1064         } else {
1065             return kEmptyStr;
1066         }
1067     } else {
1068         if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1069             return kEmptyStr;
1070         }
1071     }
1072 
1073     // make sure day is valid
1074     if (day > 0 && !NStr::IsBlank(month) && year > -1) {
1075         try {
1076             int month_num = CTime::MonthNameToNum(month);
1077             if (!IsDayValueOkForMonth(day, month_num, year)) {
1078                 return kEmptyStr;
1079             }
1080         } catch (CTimeException& ex) {
1081             return kEmptyStr;
1082         }
1083     }
1084 
1085     if (year > 0 && year < 100 && num_original_tokens > 1) {
1086         // try to guess year from two-digit year provided,
1087         // only if it could not possibly be a day of the month
1088         // and if there were at least two tokens provided
1089         string year_date = NStr::NumericToString(year + 2000);
1090         bool format_bad = false;
1091         bool in_future = false;
1092         IsCorrectDateFormat(year_date, format_bad, in_future);
1093         if (in_future) {
1094             year += 1900;
1095         } else {
1096             year += 2000;
1097         }
1098     }
1099     if (year >= 1000 && year < 2100) {
1100         reformatted_date = NStr::NumericToString (year);
1101         if (!NStr::IsBlank (month)) {
1102             reformatted_date = month + "-" + reformatted_date;
1103             if (day > 0) {
1104                 string day_str = NStr::NumericToString (day);
1105                 if (day_str.length() < 2) {
1106                     day_str = "0" + day_str;
1107                 }
1108                 reformatted_date = day_str + "-" + reformatted_date;
1109             }
1110         }
1111     }
1112 
1113     return reformatted_date;
1114 }
1115 
1116 
DetectDateFormat(const string & orig_date,bool & ambiguous,bool & day_first)1117 void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
1118 {
1119     ambiguous = false;
1120     day_first = false;
1121     vector<string> tokens = x_GetDateTokens(orig_date);
1122     if (tokens.size() != 3) {
1123         // can't do detection if there are more or less than three tokens
1124         ambiguous = true;
1125         return;
1126     }
1127     vector<int> nums;
1128 
1129     // detection is only valid if all tokens are numbers and at least one is known to be the year
1130     try {
1131         ITERATE(vector<string>, it, tokens) {
1132             nums.push_back(NStr::StringToInt (*it));
1133         }
1134     } catch ( ... ) {
1135         // threw exception while converting to int
1136         ambiguous = true;
1137         return;
1138     }
1139     enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
1140     vector<int> positions;
1141     positions.push_back(0);
1142     positions.push_back(0);
1143     positions.push_back(0);
1144 
1145     int token_pos = 1;
1146     ITERATE(vector<int>, it, nums) {
1147         if (*it > 31) {
1148             if (positions[eYear] > 0) {
1149                 // already found a year
1150                 ambiguous = true;
1151                 return;
1152             }
1153             positions[eYear] = token_pos;
1154         } else if (*it > 12) {
1155             if (positions[eDay] > 0) {
1156                 // already found a day
1157                 ambiguous = true;
1158                 return;
1159             }
1160             positions[eDay] = token_pos;
1161         } else if (positions[eMonth] > 0) {
1162             // already found a month
1163             ambiguous = true;
1164             return;
1165         } else {
1166             positions[eMonth] = token_pos;
1167         }
1168         token_pos++;
1169     }
1170     if (positions[eDay] < positions[eMonth]) {
1171         day_first = true;
1172     } else {
1173         day_first = false;
1174     }
1175 }
1176 
1177 
IsCorrectLatLonFormat(string lat_lon,bool & format_correct,bool & precision_correct,bool & lat_in_range,bool & lon_in_range,double & lat_value,double & lon_value)1178 void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
1179                                      bool& lat_in_range, bool& lon_in_range,
1180                                      double& lat_value, double& lon_value)
1181 {
1182     format_correct = false;
1183     lat_in_range = false;
1184     lon_in_range = false;
1185     precision_correct = false;
1186     double ns, ew;
1187     char lon, lat;
1188     int processed;
1189 
1190     lat_value = 0.0;
1191     lon_value = 0.0;
1192 
1193     if (NStr::IsBlank(lat_lon)) {
1194         return;
1195     } else if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
1196                || size_t(processed) != lat_lon.length()) {
1197         return;
1198     } else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
1199         return;
1200     } else {
1201         // init values found
1202         if (lat == 'N') {
1203             lat_value = ns;
1204         } else {
1205             lat_value = 0.0 - ns;
1206         }
1207         if (lon == 'E') {
1208             lon_value = ew;
1209         } else {
1210             lon_value = 0.0 - ew;
1211         }
1212 
1213         // make sure format is correct
1214         vector<string> pieces;
1215         NStr::Split(lat_lon, " ", pieces);
1216         if (pieces.size() > 3) {
1217             int precision_lat = x_GetPrecision(pieces[0]);
1218             int precision_lon = x_GetPrecision(pieces[2]);
1219 
1220             char reformatted[1000];
1221             sprintf (reformatted, "%.*lf %c %.*lf %c", precision_lat, ns, lat,
1222                                                        precision_lon, ew, lon);
1223 
1224             size_t len = strlen (reformatted);
1225             if (NStr::StartsWith(lat_lon, reformatted)
1226                 && (len == lat_lon.length()
1227                   || (len < lat_lon.length()
1228                       && lat_lon[len] == ';'))) {
1229                 format_correct = true;
1230                 if (ns <= 90 && ns >= 0) {
1231                     lat_in_range = true;
1232                 }
1233                 if (ew <= 180 && ew >= 0) {
1234                     lon_in_range = true;
1235                 }
1236                 if (precision_lat < 3 && precision_lon < 3) {
1237                     precision_correct = true;
1238                 }
1239             }
1240         }
1241     }
1242 }
1243 
1244 
FixLatLonPrecision(const string & orig)1245 string CSubSource::FixLatLonPrecision(const string& orig)
1246 {
1247     bool format_correct = false;
1248     bool precision_correct = false;
1249     bool lat_in_range = false;
1250     bool lon_in_range = false;
1251     double lat_value = 0.0;
1252     double lon_value = 0.0;
1253     IsCorrectLatLonFormat(orig, format_correct, precision_correct,
1254                           lat_in_range, lon_in_range,
1255                           lat_value, lon_value);
1256     if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
1257         return orig;
1258     }
1259     vector<string> pieces;
1260     NStr::Split(orig, " ", pieces);
1261     if (pieces.size() > 3) {
1262         int precision_lat = x_GetPrecision(pieces[0]);
1263         int precision_lon = x_GetPrecision(pieces[2]);
1264         if (precision_lat > 4) {
1265             precision_lat = 4;
1266         }
1267         if (precision_lon > 4) {
1268             precision_lon = 4;
1269         }
1270 
1271         char reformatted[1000];
1272         sprintf(reformatted, "%.*lf %c %.*lf %c", precision_lat, fabs(lat_value), pieces[1].c_str()[0],
1273             precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
1274         string new_val = reformatted;
1275         return reformatted;
1276     }
1277     return kEmptyStr;
1278 }
1279 
1280 /*
1281 1. String should be converted to UTF8 string, this will get rid of \xC0 and similar substrings
1282 2. Every codepoint (note that this is not regular ascii "char") that is not a digit or a decimal point or a letter should be prepended with a space.
1283    Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
1284 3. NStr::Split is called with space as a separator and Tokenize flag - need to check if Split works with UTF8 strings properly.
1285 4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
1286 5. A pattern string is created where each number is replaced with "1" and modifiers are normalized to "lat", or "N"; the actual numerical values are kept in a separate vector
6. Based on the pattern the vector of numbers is parsed into degrees, minutes, or seconds.
7. NSEW and "latitude/longitude" are applied to degrees in the order of appearance; if none are present, other heuristics are used to determine which is latitude and which is longitude.
1289 */
1290 
s_InsertSpacesBetweenTokens(const string & old_str)1291 static string s_InsertSpacesBetweenTokens(const string &old_str)
1292 {
1293     string new_str;
1294     for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1295     {
1296         TUnicodeSymbol sym = CUtf8::Decode(i);
1297         if (sym < 0x80)
1298         {
1299             char c = static_cast<char>(sym);
1300             if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1301             {
1302                 new_str += ' ';
1303             }
1304             else if (!new_str.empty() &&
1305                  ((isalpha(new_str.back()) && !isalpha(c)) ||
1306                   (!isalpha(new_str.back()) && isalpha(c))))
1307             {
1308                 new_str += ' ';
1309             }
1310             new_str += c;
1311             if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1312             {
1313                 new_str += ' ';
1314             }
1315         }
1316         else
1317         {
1318             new_str += ' ';
1319         }
1320     }
1321     return new_str;
1322 }
1323 
s_RemoveSpacesWithinNumbers(const string & old_str)1324 static string s_RemoveSpacesWithinNumbers(const string &old_str)
1325 {
1326     string new_str;
1327     bool is_number = true;
1328     for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1329     {
1330         TUnicodeSymbol sym = CUtf8::Decode(i);
1331         if (sym < 0x80)
1332         {
1333             char c = static_cast<char>(sym);
1334                 size_t j = new_str.size();
1335                 if (j >= 4 &&  new_str[j-1] == ' ' && new_str[j-2] == '.' && new_str[j-3] == ' ' && isdigit(new_str[j-4]) && isdigit(c))
1336                 {
1337                     new_str.pop_back();
1338                     new_str.pop_back();
1339                     new_str.pop_back();
1340                     new_str += '.';
1341                 }
1342                 new_str += c;
1343                 if (!isdigit(c) && c != '+' && c != '-' && c != '.' && !isspace(c)) {
1344                     is_number = false;
1345                 }
1346             }
1347         else
1348         {
1349             new_str += ' ';
1350             is_number = false;
1351         }
1352     }
1353     if (is_number)
1354     {
1355         NStr::ReplaceInPlace(new_str, "+", " +");
1356         NStr::ReplaceInPlace(new_str, "-", " -");
1357     }
1358     return new_str;
1359 }
1360 
s_IsNumber(const string & token,double * result=NULL)1361 static bool s_IsNumber(const string &token, double *result = NULL)
1362 {
1363     double num = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);
1364     if (!num && errno)
1365     {
1366         return false;
1367     }
1368     if (result) {
1369         *result = num;
1370     }
1371     return true;
1372 }
1373 
s_NormalizeTokens(vector<string> & tokens,vector<double> & numbers,vector<string> & anum,vector<int> & precision,vector<string> & lat_long,vector<string> & nsew)1374 static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long,  vector<string> &nsew)
1375 {
1376     vector<string> pattern;
1377     for (size_t i = 0; i < tokens.size(); i++)
1378     {
1379         string &token = tokens[i];
1380 
1381         double num;
1382         if (s_IsNumber(token, &num))
1383         {
1384             numbers.push_back(num);
1385             anum.push_back(token);
1386             pattern.push_back("1");
1387             precision.push_back(0);
1388             if (NStr::Find(token, ".") != NPOS && !NStr::EndsWith(token, "."))
1389             {
1390                 precision.back() = token.length() - NStr::Find(token, ".") - 1;
1391             }
1392             continue;
1393         }
1394 
1395         {
1396             vector<string> tmp;
1397             NStr::Split(token, ".", tmp);
1398             double num0, num1, num2;
1399             if (tmp.size() == 3 && s_IsNumber(tmp[0], &num0) && s_IsNumber(tmp[1], &num1) && s_IsNumber(tmp[2], &num2))
1400             {
1401                 numbers.push_back(num0);
1402                 anum.push_back(tmp[0]);
1403                 pattern.push_back("1");
1404                 precision.push_back(0);
1405                 numbers.push_back(num1);
1406                 anum.push_back(tmp[1]);
1407                 pattern.push_back("1");
1408                 precision.push_back(0);
1409                 numbers.push_back(num2);
1410                 anum.push_back(tmp[2]);
1411                 pattern.push_back("1");
1412                 precision.push_back(0);
1413                 continue;
1414             }
1415         }
1416 
1417         if (token == "\'" && i >= 3 && s_IsNumber(tokens[i - 1]) && tokens[i - 2] == "\'" && s_IsNumber(tokens[i - 3]))
1418         {
1419             token = "\"";
1420         }
1421 
1422         if (NStr::EqualNocase(token, "degrees") || NStr::EqualNocase(token, "deg")  || NStr::EqualNocase(token, "deg.") || NStr::EqualNocase(token, "degree"))
1423         {
1424             token = "degrees";
1425             pattern.push_back("degrees");
1426         }
1427         else if ( token == "\'"  || NStr::EqualNocase(token, "min") || NStr::EqualNocase(token, "min.") || NStr::EqualNocase(token, "minute") || NStr::EqualNocase(token, "minutes"))
1428         {
1429             token  = "\'";
1430             pattern.push_back("\'");
1431         }
1432         else if (token == "\"" || NStr::EqualNocase(token, "sec") || NStr::EqualNocase(token, "sec.") || NStr::EqualNocase(token, "second") || NStr::EqualNocase(token, "seconds"))
1433         {
1434             token = "\"";
1435             pattern.push_back("\"");
1436         }
1437         else if (token == "," || token == ":" || token == "_" || token == "&" || token == "." || token == ";" || token == "#" || NStr::EqualNocase(token, "and"))
1438         {
1439         }
1440         else if (NStr::EqualNocase(token, "lattitude") || NStr::EqualNocase(token, "latitude") || NStr::EqualNocase(token, "lat") || NStr::EqualNocase(token, "lat."))
1441         {
1442             pattern.push_back("lat");
1443             lat_long.push_back("lat");
1444         }
1445         else if (NStr::EqualNocase(token, "longitude") || NStr::EqualNocase(token, "lo") || NStr::EqualNocase(token, "lon") || NStr::EqualNocase(token, "long")
1446                      || NStr::EqualNocase(token, "lo.") || NStr::EqualNocase(token, "lon.") || NStr::EqualNocase(token, "long."))
1447         {
1448             pattern.push_back("lat");
1449             lat_long.push_back("long");
1450         }
1451         else if (token == "N"  || NStr::EqualNocase(token, "north"))
1452         {
1453             pattern.push_back("N");
1454             nsew.push_back("N");
1455         }
1456         else if (token == "S"  || NStr::EqualNocase(token, "south"))
1457         {
1458             pattern.push_back("N");
1459             nsew.push_back("S");
1460         }
1461         else if (token == "E"  || NStr::EqualNocase(token, "east"))
1462         {
1463             pattern.push_back("N");
1464             nsew.push_back("E");
1465         }
1466         else if (token == "W"  || NStr::EqualNocase(token, "west") || token == "Wdeg")
1467         {
1468             pattern.push_back("N");
1469             nsew.push_back("W");
1470         }
1471         else if (token == "NW")
1472         {
1473             nsew.push_back("N");
1474             nsew.push_back("W");
1475         }
1476         else if (token == "NE")
1477         {
1478             nsew.push_back("N");
1479             nsew.push_back("E");
1480         }
1481         else if (token == "SW")
1482         {
1483             nsew.push_back("S");
1484             nsew.push_back("W");
1485         }
1486         else if (token == "SE")
1487         {
1488             nsew.push_back("S");
1489             nsew.push_back("E");
1490         }
1491         else
1492         {
1493             //cout << "Token: " << token << endl;
1494             numbers.clear();
1495             return kEmptyStr;
1496         }
1497     }
1498     //cout << "Pattern: " << NStr::Join(pattern, " ") << endl;
1499     return NStr::Join(pattern, " ");
1500 }
1501 
// Put a pair of coordinates into (latitude, longitude) order and apply the
// hemisphere signs.
//
// numbers    - in/out: exactly two values.  On success numbers[0] is the
//              signed latitude and numbers[1] the signed longitude.  On any
//              failure the vector is cleared.
// precision  - in/out: per-value precision, swapped together with numbers.
// lat_long   - latitude/longitude labels ("lat" / "long") in order of
//              appearance; must be empty or hold exactly two entries.
// nsew       - in/out: hemisphere letters in order of appearance; must be
//              empty or hold exactly two entries (one N/S and one E/W after
//              reordering).
//
// Failure cases: wrong number of values, labels or letters; hemisphere
// letters that do not form a N/S + E/W pair; |latitude| > 90 or
// |longitude| > 180.
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    if (numbers.size() != 2)
    {
        numbers.clear();
        return;
    }

    // explicit labels: "long" first means the pair is (longitude, latitude)
    if (lat_long.size() == 2)
    {
        if (lat_long.front() == "long")
        {
            swap(numbers[0], numbers[1]);
            swap(precision[0], precision[1]);
            if (nsew.size() == 2) {
                swap(nsew[0], nsew[1]);
            }
        }
    }
    else if (!lat_long.empty())
    {
        numbers.clear();
        return;
    }

    // hemisphere letters: reorder to N/S first, then apply the signs
    if (nsew.size() == 2)
    {
        if ((nsew[0] == "E" || nsew[0] == "W") &&
            (nsew[1] == "N" || nsew[1] == "S"))
        {
            swap(numbers[0], numbers[1]);
            swap(precision[0], precision[1]);
            swap(nsew[0], nsew[1]);
        }
        if (nsew[0] == "N")
        {
            numbers[0] = fabs(numbers[0]);
        }
        else if (nsew[0] == "S")
        {
            // leave an exact zero alone so it does not become -0
            if (numbers[0] != 0)
                numbers[0] = -fabs(numbers[0]);
        }
        else
        {
            numbers.clear();
            return;
        }
        if (nsew[1] == "E")
        {
            numbers[1] = fabs(numbers[1]);
        }
        else if (nsew[1] == "W")
        {
            // leave an exact zero alone so it does not become -0
            if (numbers[1] != 0)
                numbers[1] = -fabs(numbers[1]);
        }
        else
        {
            numbers.clear();
            return;
        }
    }
    else if (!nsew.empty())
    {
        numbers.clear();
        return;
    }

    // No labels and no hemisphere letters: if the first value cannot be a
    // latitude but the second can, assume the pair is (longitude, latitude).
    // The bound is inclusive because a latitude of exactly 90 passes the
    // range check below.
    if (lat_long.empty() && nsew.empty() && fabs(numbers[0]) > 90 && fabs(numbers[1]) <= 90)
    {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    }

    // final range check
    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180)
    {
        numbers.clear();
        return;
    }
}
1580 
s_GetLatLong(const string & new_str,vector<double> & numbers,vector<int> & precision)1581 static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
1582 {
1583     vector<string> tokens;
1584     NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
1585     vector<string> lat_long;
1586     vector<string> nsew;
1587     vector<string> anum;
1588     string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
1589     if (pattern.empty())
1590     {
1591         numbers.clear();
1592         return;
1593     }
1594     vector<double> degrees(2, 0);
1595     vector<int> prec(2, 0);
1596     int sign1 = 1;
1597     int sign2 = 1;
1598     if ( pattern == "1 1" ||
1599      pattern == "1 N 1 N" ||
1600          pattern == "N 1 N 1" ||
1601      pattern == "1 degrees N 1 degrees N" ||
1602      pattern == "lat 1 lat 1" ||
1603          pattern == "1 N lat 1 N lat" ||
1604          pattern == "1 degrees N lat 1 degrees N lat")
1605     {
1606         degrees[0] = numbers[0];
1607         degrees[1] = numbers[1];
1608         prec[0] = precision[0];
1609         prec[1] = precision[1];
1610     }
1611     else if ((pattern == "1 1 \" 1 1 '" ||
1612           pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
1613          && numbers[1] < 60 && numbers[3] < 60
1614              && numbers[1] >= 0 && numbers[3] >= 0)
1615     {
1616         sign1 = anum[0][0] == '-' ? -1 : 1;
1617         sign2 = anum[2][0] == '-' ? -1 : 1;
1618         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
1619         degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1620         prec[0] = max(precision[0], precision[1] + 4);
1621         prec[1] = max(precision[2], precision[3] + 2);
1622     }
1623     else if ( (pattern == "1 1 ' 1" ||
1624                pattern == "1 degrees 1 ' N 1 degrees N")
1625               && numbers[1] < 60
1626               && numbers[1] >= 0)
1627     {
1628         sign1 = anum[0][0] == '-' ? -1 : 1;
1629         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1630         degrees[1] = numbers[2];
1631         prec[0] = max(precision[0], precision[1] + 2);
1632         prec[1] = precision[2];
1633         }
1634     else if (pattern == "1 1 ' 1 \" 1"
1635          && numbers[1] < 60 && numbers[2] < 60
1636              && numbers[1] >= 0 && numbers[2] >= 0)
1637     {
1638         sign1 = anum[0][0] == '-' ? -1 : 1;
1639         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1640         degrees[1] = numbers[3];
1641         prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1642         prec[1] = precision[3];
1643     }
1644     else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
1645           pattern == "1 1 1 N 1 1 N" ||
1646           pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
1647          && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1648              && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1649     {
1650         sign1 = anum[0][0] == '-' ? -1 : 1;
1651         sign2 = anum[3][0] == '-' ? -1 : 1;
1652         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1653         degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
1654         prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1655         prec[1] = max(precision[3], precision[4] + 2);
1656     }
1657     else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
1658            pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
1659            pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
1660            pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
1661            pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
1662            pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
1663            pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
1664            pattern == "1 1 1 N 1 1 1 N")
1665              && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
1666              && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
1667     {
1668         sign1 = anum[0][0] == '-' ? -1 : 1;
1669         sign2 = anum[3][0] == '-' ? -1 : 1;
1670         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1671         degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
1672         prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1673         prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
1674     }
1675     else if (( pattern == "1 1 ' 1 1 '" ||
1676            pattern == "1 1 N 1 1 N" ||
1677                pattern == "1 1 ' N 1 1 ' N" ||
1678            pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
1679                pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
1680            pattern == "1 degrees 1 N 1 degrees 1 N" ||
1681            pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
1682                pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
1683                pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
1684                pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
1685                pattern == "N 1 degrees 1 ' N 1 1 '")
1686          && numbers[1] < 60  && numbers[3] < 60
1687              && numbers[1] >= 0  && numbers[3] >= 0)
1688     {
1689         sign1 = anum[0][0] == '-' ? -1 : 1;
1690         sign2 = anum[2][0] == '-' ? -1 : 1;
1691         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1692         degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1693         prec[0] = max(precision[0], precision[1] + 2);
1694         prec[1] = max(precision[2], precision[3] + 2);
1695     }
1696     else if ((pattern == "1 N 1 1 N" ||
1697               pattern == "1 degrees N 1 degrees 1 ' N")
1698          &&  numbers[2] < 60
1699              &&  numbers[2] >= 0)
1700     {
1701         sign2 = anum[1][0] == '-' ? -1 : 1;
1702         degrees[0] = numbers[0];
1703         degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
1704         prec[0] = precision[0];
1705         prec[1] = max(precision[1], precision[2] + 2);
1706     }
1707     else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
1708               pattern == "N 1 1 N 1 1 1")
1709          && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
1710              && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
1711     {
1712         sign1 = anum[0][0] == '-' ? -1 : 1;
1713         sign2 = anum[2][0] == '-' ? -1 : 1;
1714         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1715         degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
1716         prec[0] = max(precision[0], precision[1] + 2);
1717         prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
1718     }
1719     else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
1720          && numbers[2] < 60 && numbers[3] < 60
1721              && numbers[2] >= 0 && numbers[3] >= 0)
1722     {
1723         sign2 = anum[1][0] == '-' ? -1 : 1;
1724         degrees[0] = numbers[0];
1725         degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
1726         prec[0] = precision[0];
1727         prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
1728     }
1729     else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
1730          && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1731              && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1732     {
1733         sign1 = anum[0][0] == '-' ? -1 : 1;
1734         sign2 = anum[3][0] == '-' ? -1 : 1;
1735         degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1736         degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
1737         prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1738         prec[1] = max(precision[3], precision[4] + 4);
1739     }
1740     else
1741     {
1742         degrees.clear();
1743         prec.clear();
1744     }
1745     swap(degrees, numbers);
1746     swap(prec, precision);
1747     s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
1748 }
1749 
1750 
FixLatLonFormat(string orig_lat_lon,bool guess)1751 string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
1752 {
1753     //cout << "Before: " << orig_lat_lon << endl;
1754     NStr::ParseEscapes(orig_lat_lon);
1755     CStringUTF8 old_str = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));
1756     if (NStr::StartsWith(old_str, "\""))
1757     {
1758         NStr::TrimPrefixInPlace(old_str, "\"");
1759         NStr::TrimSuffixInPlace(old_str, "\"");
1760     }
1761     NStr::ReplaceInPlace(old_str, "\'\'", "\"");
1762     string fixed_str = s_RemoveSpacesWithinNumbers(old_str);
1763     string new_str = s_InsertSpacesBetweenTokens(fixed_str);
1764     NStr::Sanitize(new_str);
1765     vector<double> numbers;
1766     vector<int> precision;
1767     s_GetLatLong(new_str, numbers, precision);
1768     string res;
1769     if (!numbers.empty())
1770     {
1771         res = MakeLatLon(numbers[0], numbers[1], precision[0], precision[1]);
1772     }
1773     //cout << "After: " << res << endl;
1774     return res;
1775 }
1776 
1777 
MakeLatLon(double lat_value,double lon_value,int lat_precision,int lon_precision)1778 string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
1779 {
1780     char ns = 'N';
1781     if (lat_value < 0) {
1782         ns = 'S';
1783         lat_value = -lat_value;
1784     }
1785     char ew = 'E';
1786     if (lon_value < 0) {
1787         ew = 'W';
1788         lon_value = -lon_value;
1789     }
1790     string lat = NStr::DoubleToString(lat_value, lat_precision);
1791     string lon = NStr::DoubleToString(lon_value, lon_precision);
1792 
1793     NStr::TrimSuffixInPlace(lat, ".");
1794     NStr::TrimSuffixInPlace(lon, ".");
1795     string res = lat + " " + ns + " " + lon + " " + ew;
1796     return res;
1797 }
1798 
1799 
x_CalculateLatLonId(float lat_value,float lon_value,string country,string province)1800 CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
1801 {
1802     CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);
1803 
1804     bool goodmatch = false;
1805 
1806     // lookup region by coordinates, or find nearest region and calculate distance
1807     const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
1808     if (guess) {
1809         id->SetFullGuess(guess->GetCountry());
1810         id->SetGuessCountry(guess->GetLevel0());
1811         id->SetGuessProvince(guess->GetLevel1());
1812         if (NStr::EqualNocase(country, id->GetGuessCountry())
1813             && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
1814             goodmatch = true;
1815         }
1816     } else {
1817         // not inside a country, check water
1818         guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
1819         if (guess) {
1820             // found inside water
1821             id->SetGuessWater(guess->GetCountry());
1822             if (NStr::EqualNocase(country, id->GetGuessWater())) {
1823                 goodmatch = true;
1824             }
1825 
1826             // also see if close to land for coastal warning (if country is land)
1827             // or proximity message (if country is water)
1828             double landdistance = 0.0;
1829             guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1830             if (guess) {
1831                 id->SetClosestFull(guess->GetCountry());
1832                 id->SetClosestCountry(guess->GetLevel0());
1833                 id->SetClosestProvince(guess->GetLevel1());
1834                 id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1835                 if (NStr::EqualNocase(country, id->GetClosestCountry())
1836                     && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1837                     goodmatch = true;
1838                 }
1839             }
1840         } else {
1841             // may be coastal inlet, area of data insufficiency
1842             double landdistance = 0.0;
1843             guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1844             if (guess) {
1845                 id->SetClosestFull(guess->GetCountry());
1846                 id->SetClosestCountry(guess->GetLevel0());
1847                 id->SetClosestProvince(guess->GetLevel1());
1848                 id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1849                 if (NStr::EqualNocase(country, id->GetClosestCountry())
1850                      && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1851                     goodmatch = true;
1852                 }
1853             }
1854 
1855             double waterdistance = 0.0;
1856             guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
1857             if (guess) {
1858                 id->SetClosestWater(guess->GetLevel0());
1859                 id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
1860                 if (NStr::EqualNocase(country, id->GetClosestWater())) {
1861                     goodmatch = true;
1862                 }
1863             }
1864         }
1865     }
1866 
1867     // if guess is not the provided country or province, calculate distance to claimed country
1868     if (!goodmatch) {
1869         double distance = 0.0;
1870         guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1871         if (guess) {
1872             if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
1873                 // close enough
1874                 id->SetGuessCountry(country);
1875                 id->SetGuessProvince(province);
1876                 id->SetFullGuess(guess->GetCountry());
1877             } else {
1878                 id->SetClaimedFull(guess->GetCountry());
1879                 id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
1880             }
1881         } else if (NStr::IsBlank(province)) {
1882             guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1883             if (guess) {
1884                 id->SetClaimedFull(guess->GetCountry());
1885                 id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
1886             }
1887         }
1888     }
1889 
1890     return id;
1891 }
1892 
1893 
1894 
// Table mapping a named sea, bay, gulf or strait (first column) to the larger
// body of water it is treated as part of (second column).  It is searched by
// x_FindSurroundingOcean, which ValidateLatLonCountry uses so that a claimed
// "country" naming the parent water is accepted for coordinates that fall in
// one of the listed waters.
//
// NOTE(review): the entries are listed in case-insensitive alphabetical order
// of the first column; CStaticArrayMap presumably depends on that ordering
// under PNocase_CStr, so keep new entries in order -- confirm against the
// CStaticArrayMap documentation.
typedef SStaticPair<const char*, const char*>  TWaterPairElem;
static const TWaterPairElem k_water_pair_map[] = {
    {"Adriatic Sea",         "Mediterranean Sea"},
    {"Aegean Sea",           "Mediterranean Sea"},
    {"Alboran Sea",          "Mediterranean Sea"},
    {"Andaman Sea",          "Indian Ocean"},
    {"Arabian Sea",          "Indian Ocean"},
    {"Argentine Sea",        "Atlantic Ocean"},
    {"Ariake Sea",           "Pacific Ocean"},
    {"Baffin Bay",           "Atlantic Ocean"},
    {"Balearic Sea",         "Mediterranean Sea"},
    {"Baltic Sea",           "Atlantic Ocean"},
    {"Barents Sea",          "Arctic Ocean"},
    {"Bay of Bengal",        "Indian Ocean"},
    {"Beaufort Sea",         "Arctic Ocean"},
    {"Bering Sea",           "Pacific Ocean"},
    {"Bismarck Sea",         "Pacific Ocean"},
    {"Black Sea",            "Mediterranean Sea"},
    {"Bohai Sea",            "Pacific Ocean"},
    {"Caribbean Sea",        "Atlantic Ocean"},
    {"Celebes Sea",          "Pacific Ocean"},
    {"Champlain Sea",        "Atlantic Ocean"},
    {"Chilean Sea",          "Pacific Ocean"},
    {"China Seas",           "Pacific Ocean"},
    {"Chukchi Sea",          "Arctic Ocean"},
    {"Coral Sea",            "Pacific Ocean"},
    {"Davis Strait",         "Atlantic Ocean"},
    {"East China Sea",       "Pacific Ocean"},
    {"East Siberian Sea",    "Arctic Ocean"},
    {"English Channel",      "Atlantic Ocean"},
    {"Erythraean Sea",       "Indian Ocean"},
    {"Golfo de California",  "Pacific Ocean"},
    {"Greenland Sea",        "Arctic Ocean"},
    {"Gulf of Mexico",       "Atlantic Ocean"},
    {"Gulf of Thailand",     "Pacific Ocean"},
    {"Gulf of Tonkin",       "Pacific Ocean"},
    {"Hudson Bay",           "Arctic Ocean"},
    {"Ionian Sea",           "Mediterranean Sea"},
    {"Irish Sea",            "Atlantic Ocean"},
    {"Irminger Sea",         "Atlantic Ocean"},
    {"James Bay",            "Atlantic Ocean"},
    {"Java Sea",             "Indian Ocean"},
    {"Kara Sea",             "Arctic Ocean"},
    {"Koro Sea",             "Pacific Ocean"},
    {"Labrador Sea",         "Atlantic Ocean"},
    {"Laccadive Sea",        "Indian Ocean"},
    {"Laptev Sea",           "Arctic Ocean"},
    {"Ligurian Sea",         "Mediterranean Sea"},
    {"Lincoln Sea",          "Arctic Ocean"},
    {"Myrtoan Sea",          "Mediterranean Sea"},
    {"North Sea",            "Atlantic Ocean"},
    {"Norwegian Sea",        "Atlantic Ocean"},
    {"Pechora Sea",          "Arctic Ocean"},
    {"Persian Gulf",         "Indian Ocean"},
    {"Philippine Sea",       "Pacific Ocean"},
    {"Red Sea",              "Indian Ocean"},
    {"Salish Sea",           "Pacific Ocean"},
    {"Sargasso Sea",         "Atlantic Ocean"},
    {"Scotia Sea",           "Southern Ocean"},
    {"Sea of Azov",          "Black Sea"},
    {"Sea of Chiloe",        "Pacific Ocean"},
    {"Sea of Crete",         "Mediterranean Sea"},
    {"Sea of Japan",         "Pacific Ocean"},
    {"Sea of Okhotsk",       "Pacific Ocean"},
    {"Sea of the Hebrides",  "Atlantic Ocean"},
    {"Sea of Zanj",          "Indian Ocean"},
    {"Seas of Greenland",    "Atlantic Ocean"},
    {"Sethusamudram",        "Indian Ocean"},
    {"Sibutu Passage",       "Pacific Ocean"},
    {"Solomon Sea",          "Pacific Ocean"},
    {"South China Sea",      "Pacific Ocean"},
    {"Sulu Sea",             "Pacific Ocean"},
    {"Tasman Sea",           "Pacific Ocean"},
    {"Thracian Sea",         "Mediterranean Sea"},
    {"Timor Sea",            "Indian Ocean"},
    {"Tyrrhenian Sea",       "Mediterranean Sea"},
    {"Wandel Sea",           "Arctic Ocean"},
    {"White Sea",            "Arctic Ocean"},
    {"Yellow Sea",           "Pacific Ocean"}
};
// Lookup wrapper over the table above, keyed by the first column.
typedef CStaticArrayMap<const char*, const char*, PNocase_CStr> TWaterPairMap;
DEFINE_STATIC_ARRAY_MAP(TWaterPairMap, sc_WaterPairMap, k_water_pair_map);
1977 
x_FindSurroundingOcean(string & water)1978 static string x_FindSurroundingOcean (string& water)
1979 
1980 {
1981     TWaterPairMap::const_iterator new_water_pair_iter = sc_WaterPairMap.find(water.c_str());
1982     if( new_water_pair_iter != sc_WaterPairMap.end() ) {
1983         return new_water_pair_iter->second;
1984     }
1985     return kEmptyStr;
1986 }
1987 
1988 
ValidateLatLonCountry(const string & input_countryname,string & lat_lon,bool check_state,ELatLonCountryErr & errcode)1989 string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
1990 {
1991     errcode = eLatLonCountryErr_None;
1992     string countryname = input_countryname;
1993     if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
1994         return kEmptyStr;
1995     }
1996 
1997     if ( m_LatLonCountryMap.get() == 0 ) {
1998         m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
1999     }
2000     if ( m_LatLonWaterMap.get() == 0 ) {
2001         m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
2002     }
2003 
2004     // only do these checks if the latlon format is good
2005     bool format_correct, lat_in_range, lon_in_range, precision_correct;
2006     double lat_value = 0.0, lon_value = 0.0;
2007     CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2008                                lat_in_range, lon_in_range,
2009                                lat_value, lon_value);
2010     if (!format_correct) {
2011         // may have comma and then altitude, so just get lat_lon component */
2012         size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
2013         if (pos != NPOS) {
2014             lat_lon = lat_lon.substr(0, pos);
2015             CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2016                                        lat_in_range, lon_in_range,
2017                                        lat_value, lon_value);
2018         }
2019     }
2020 
2021     // reality checks
2022     if (!format_correct || !lat_in_range || !lon_in_range) {
2023         // incorrect lat_lon format should be reported elsewhere
2024         // incorrect latitude range should be reported elsewhere
2025         // incorrect longitude range should be reported elsewhere
2026         return kEmptyStr;
2027     }
2028 
2029     // get rid of comments after semicolon or comma in country name
2030     size_t pos = NStr::Find(countryname, ";");
2031     if (pos != NPOS) {
2032          countryname = countryname.substr(0, pos);
2033         }
2034     pos = NStr::Find(countryname, ",");
2035     if (pos != NPOS) {
2036          countryname = countryname.substr(0, pos);
2037     }
2038 
2039     // adjust for special cases
2040     if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
2041         countryname = "Svalbard";
2042     }
2043 
2044     string country = countryname;
2045     string province;
2046     pos = NStr::Find(country, ":");
2047     if (pos != NPOS) {
2048         // is the full string in the list?
2049         if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
2050             province = country.substr(pos + 1);
2051             NStr::TruncateSpacesInPlace(province, NStr::eTrunc_Both);
2052         }
2053         country = country.substr(0, pos);
2054         NStr::TruncateSpacesInPlace(country, NStr::eTrunc_Both);
2055     }
2056     if (NStr::IsBlank(country)) {
2057         return kEmptyStr;
2058     }
2059 
2060     // known exceptions - don't even bother calculating any further
2061     if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
2062         return kEmptyStr;
2063     }
2064 
2065     if (! NStr::IsBlank(province)) {
2066         // do not attempt quick exit
2067     } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
2068         if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2069             return kEmptyStr;
2070         }
2071     } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
2072         if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2073             return kEmptyStr;
2074         }
2075     } else if (NStr::EqualNocase (country, "State of Palestine")) {
2076     } else {
2077         // report unrecognized country
2078         return kEmptyStr;
2079     }
2080 
2081     CLatLonCountryId *id = x_CalculateLatLonId(lat_value, lon_value, country, province);
2082     CLatLonCountryId::TClassificationFlags flags = (id == NULL ? 0 : id->Classify(country, province));
2083 
2084     string wguess = id->GetGuessWater();
2085     string cguess = id->GetGuessCountry();
2086 
2087     // special case where subsection of country has been identified but is not in coordinates of country
2088     // VR-840
2089     if (province.empty() && NStr::Equal(cguess, country)) {
2090         delete id;
2091         return kEmptyStr;
2092     }
2093 
2094     if (NStr::EqualNocase (country, "State of Palestine") &&
2095         (NStr::EqualNocase (cguess, "Gaza Strip") ||
2096          NStr::EqualNocase (cguess, "West Bank"))) {
2097         delete id;
2098         return kEmptyStr;
2099     }
2100 
2101     if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
2102         string parent = x_FindSurroundingOcean (wguess);
2103         if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
2104             delete id;
2105             return kEmptyStr;
2106         }
2107     }
2108 
2109     double neardist = 0.0;
2110     CLatLonCountryMap::TLatLonAdjustFlags adjustment = CLatLonCountryMap::fNone;
2111     CLatLonCountryId::TClassificationFlags adjusted_flags = 0;
2112 
2113     if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
2114         id->SetGuessCountry (country);
2115         id->SetGuessProvince (kEmptyStr);
2116         flags = id->Classify(country, province);
2117     }
2118 
2119     if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
2120         && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
2121         /* do not flip from water */
2122         CLatLonCountryId *adjust_id = x_CalculateLatLonId(lon_value, lat_value, country, province);
2123         adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2124         if (adjusted_flags) {
2125             string awguess = adjust_id->GetGuessWater();
2126             string acguess = adjust_id->GetGuessCountry();
2127             if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2128                 delete id;
2129                 id = adjust_id;
2130                 flags = adjusted_flags;
2131                 adjustment = CLatLonCountryMap::fFlip;
2132             }
2133         } else {
2134             if (adjust_id) {
2135                 delete adjust_id;
2136             }
2137             adjust_id = x_CalculateLatLonId(-lat_value, lon_value, country, province);
2138             adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2139             if (adjusted_flags) {
2140                 string awguess = adjust_id->GetGuessWater();
2141                 string acguess = adjust_id->GetGuessCountry();
2142                 if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2143                     delete id;
2144                     id = adjust_id;
2145                     flags = adjusted_flags;
2146                     adjustment = CLatLonCountryMap::fNegateLat;
2147                 }
2148             } else {
2149                 if (adjust_id) {
2150                     delete adjust_id;
2151                 }
2152                 adjust_id = x_CalculateLatLonId(lat_value, -lon_value, country, province);
2153                 adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2154                 if (adjusted_flags) {
2155                     string awguess = adjust_id->GetGuessWater();
2156                     string acguess = adjust_id->GetGuessCountry();
2157                     if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2158                         delete id;
2159                         id = adjust_id;
2160                         flags = adjusted_flags;
2161                         adjustment = CLatLonCountryMap::fNegateLon;
2162                     }
2163                 } else {
2164                     if (adjust_id) {
2165                         delete adjust_id;
2166                     }
2167                 }
2168             }
2169         }
2170     }
2171 
2172     string error;
2173 
2174     if (adjustment != CLatLonCountryMap::fNone) {
2175         if (adjustment == CLatLonCountryMap::fFlip) {
2176             errcode = eLatLonCountryErr_Value;
2177             error = "Latitude and longitude values appear to be exchanged";
2178             lat_lon = MakeLatLon(lon_value, lat_value);
2179         } else if (adjustment == CLatLonCountryMap::fNegateLat) {
2180             errcode = eLatLonCountryErr_Value;
2181             if (lat_value < 0.0) {
2182                 error = "Latitude should be set to N (northern hemisphere)";
2183             } else {
2184                 error = "Latitude should be set to S (southern hemisphere)";
2185             }
2186             lat_lon = MakeLatLon(-lat_value, lon_value);
2187         } else if (adjustment == CLatLonCountryMap::fNegateLon) {
2188             errcode = eLatLonCountryErr_Value;
2189             if (lon_value < 0.0) {
2190                 error = "Longitude should be set to E (eastern hemisphere)";
2191             } else {
2192                 error = "Longitude should be set to W (western hemisphere)";
2193             }
2194             lat_lon = MakeLatLon(lat_value, -lon_value);
2195         }
2196     } else if ((flags & CLatLonCountryId::fCountryMatch) && (flags & CLatLonCountryId::fProvinceMatch)) {
2197         // success!  nothing to report
2198     } else if (flags & CLatLonCountryId::fWaterMatch) {
2199         // success!  nothing to report
2200     } else if (flags & CLatLonCountryId::fCountryMatch && NStr::IsBlank(province)) {
2201         if (check_state) {
2202             string full_guess = id->GetFullGuess();
2203             if (!NStr::Equal(full_guess, country)) {
2204                 errcode = eLatLonCountryErr_State;
2205                 error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
2206                     + " (more specific than " + country + ")";
2207             }
2208         }
2209     } else if (!NStr::IsBlank(id->GetGuessWater())) {
2210         if (flags & (CLatLonCountryId::fCountryClosest | CLatLonCountryId::fProvinceClosest)) {
2211             bool suppress = false;
2212             string reportregion;
2213             string nosubphrase;
2214             string desphrase = "designated subregion ";
2215             string subphrase = "another subregion ";
2216             string phrase = nosubphrase;
2217             bool show_claimed = false;
2218 
2219             if (id->GetLandDistance() < 100) {
2220                 // for now, will not report
2221                 // this is a policy decision
2222                 suppress = true;
2223             } else if (NStr::Find(countryname, "Island") != NPOS) {
2224                 suppress = true;
2225             }
2226 
2227 
2228             if (flags & CLatLonCountryId::fProvinceClosest) {
2229                 reportregion = countryname;
2230                 phrase = desphrase;
2231             } else {
2232                 // wasn't closest province, so must be closest country
2233                 if (!NStr::IsBlank(province) && check_state) {
2234                   phrase = subphrase;
2235                   reportregion = id->GetClosestFull();
2236                 } else {
2237                   reportregion = id->GetClosestCountry();
2238                 }
2239                 if (!NStr::IsBlank(id->GetClaimedFull())) {
2240                   show_claimed = true;
2241                 }
2242             }
2243             string water = id->GetGuessWater();
2244             if (NStr::EqualNocase (water, "Red Sea") &&
2245                (NStr::EqualNocase (reportregion, "Egypt") ||
2246                 NStr::EqualNocase (reportregion, "Saudi Arabia") ||
2247                 NStr::EqualNocase (reportregion, "Sudan") ||
2248                 NStr::EqualNocase (reportregion, "Eritrea") ||
2249                 NStr::EqualNocase (reportregion, "Dijibouti") ||
2250                 NStr::EqualNocase (reportregion, "Yemen") ||
2251                 NStr::EqualNocase (reportregion, "Israel") ||
2252                 NStr::EqualNocase (reportregion, "Jordan"))) {
2253             } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
2254                (NStr::EqualNocase (reportregion, "USA") ||
2255                 NStr::EqualNocase (reportregion, "Mexico"))) {
2256             } else if (!suppress) {
2257                 errcode = eLatLonCountryErr_Water;
2258                 if (show_claimed) {
2259                     error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
2260                             + NStr::IntToString(id->GetLandDistance())
2261                             + " km, but in water '" + id->GetGuessWater()
2262                             + "' - claimed region '" + id->GetClaimedFull()
2263                             + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2264                 } else {
2265                     error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
2266                             + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
2267                             + id->GetGuessWater() + "'";
2268                 }
2269             }
2270         } else if (neardist > 0.0) {
2271             errcode = eLatLonCountryErr_Water;
2272             error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
2273                         + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
2274         } else {
2275             errcode = eLatLonCountryErr_Water;
2276             error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
2277         }
2278     } else if (!NStr::IsBlank(id->GetGuessCountry())) {
2279         string full_guess = id->GetFullGuess();
2280         if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
2281             // skip
2282         } else if (NStr::IsBlank(id->GetClaimedFull())) {
2283             if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
2284                 errcode = eLatLonCountryErr_State;
2285             } else {
2286                 errcode = eLatLonCountryErr_Country;
2287             }
2288             error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2289                         + countryname + "'";
2290         } else {
2291             if (NStr::IsBlank(province)) {
2292                 errcode = eLatLonCountryErr_Country;
2293                 error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2294                             + country + "' - claimed region '" + id->GetClaimedFull()
2295                             + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2296             } else {
2297                 errcode = eLatLonCountryErr_Country;
2298                 if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
2299                     errcode = eLatLonCountryErr_State;
2300                 }
2301                 if (errcode == eLatLonCountryErr_Country || check_state) {
2302                     error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2303                                 + countryname + "' - claimed region '" + id->GetClaimedFull()
2304                                 + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2305                 } else {
2306                     errcode = eLatLonCountryErr_None;
2307                 }
2308             }
2309         }
2310     } else if (!NStr::IsBlank(id->GetClosestCountry())) {
2311         errcode = eLatLonCountryErr_Country;
2312         error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
2313                     + countryname + "'";
2314     } else if (!NStr::IsBlank(id->GetClosestWater())) {
2315         errcode = eLatLonCountryErr_Water;
2316         error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
2317                     + countryname + "'";
2318     } else {
2319         errcode = eLatLonCountryErr_Country;
2320         error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
2321     }
2322 
2323 
2324     delete id;
2325     return error;
2326 }
2327 
2328 
// Single-word values accepted for the sex qualifier.  All entries are lower
// case; CSubSource::IsValidSexQualifierValue lower-cases its input, splits it
// on space, comma and slash, and looks each resulting word up in this list
// (the word "and" is skipped rather than looked up).
const char* sm_ValidSexQualifierTokens[] = {
  "asexual",
  "bisexual",
  "diecious",
  "dioecious",
  "f",
  "female",
  "gelding",
  "hermaphrodite",
  "intersex",
  "m",
  "male",
  "mixed",
  "monecious",
  "monoecious",
  "neuter",
  "unisexual",
};
2347 
2348 
// Multi-word sex qualifier values that are accepted verbatim.
// Entries are lower case.
const char* sm_ValidSexQualifierPhrases[] = {
  "pooled males and females",
  "pooled male and female",
};


// Returns true when value is exactly equal (case-sensitive) to one of the
// entries of sm_ValidSexQualifierPhrases. The callers in this file lower-case
// the value before passing it in.
bool s_IsValidSexQualifierPhrase(const string& value)
{
    for (const char* phrase : sm_ValidSexQualifierPhrases) {
        if (value == phrase) {
            return true;
        }
    }
    return false;
}
2368 
2369 
IsValidSexQualifierValue(const string & value)2370 bool CSubSource::IsValidSexQualifierValue (const string& value)
2371 
2372 {
2373     string str = value;
2374     NStr::ToLower(str);
2375 
2376     if (s_IsValidSexQualifierPhrase(str)) {
2377         return true;
2378     }
2379 
2380     vector<string> words;
2381     NStr::Split(str, " ,/", words);
2382     if (words.size() == 0) {
2383         return false;
2384     }
2385 
2386     size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
2387 
2388     const char* *begin = sm_ValidSexQualifierTokens;
2389     const char* *end = &(sm_ValidSexQualifierTokens[max]);
2390 
2391     bool is_good = false;
2392 
2393     ITERATE(vector<string>, w, words) {
2394         if (NStr::Equal(*w, "and")) {
2395             // ok, skip it
2396         } else {
2397             if (find(begin, end, *w) != end) {
2398                 is_good = true;
2399             } else {
2400                 is_good = false;
2401                 break;
2402             }
2403         }
2404     }
2405     return is_good;
2406 }
2407 
2408 
FixSexQualifierValue(const string & value)2409 string CSubSource::FixSexQualifierValue (const string& value)
2410 {
2411     string str = value;
2412     NStr::ToLower(str);
2413 
2414     if (s_IsValidSexQualifierPhrase(str)) {
2415         return str;
2416     }
2417 
2418     vector<string> words;
2419     NStr::Split(str, " ,/", words);
2420 
2421     if (words.size() == 0) {
2422         return kEmptyStr;
2423     }
2424     size_t max = ArraySize(sm_ValidSexQualifierTokens);
2425 
2426     const char* *begin = sm_ValidSexQualifierTokens;
2427     const char* *end = &(sm_ValidSexQualifierTokens[max]);
2428 
2429     vector<string> good_values;
2430     bool pooled = false;
2431 
2432     ITERATE(vector<string>, w, words) {
2433         if (NStr::Equal(*w, "and")) {
2434             // ok, skip it
2435         } else if (NStr::EqualNocase(*w, "(pooled)") || NStr::EqualNocase(*w, "pooled")) {
2436             // set pooled flag
2437             pooled = true;
2438         } else {
2439             if (find(begin, end, *w) != end) {
2440                 if (NStr::Equal(*w, "m")) {
2441                     good_values.push_back("male");
2442                 } else if (NStr::Equal(*w, "f")) {
2443                     good_values.push_back("female");
2444                 } else {
2445                     good_values.push_back(*w);
2446                 }
2447             } else {
2448                 // if any bad values, can't autofix
2449                 return kEmptyStr;
2450             }
2451         }
2452     }
2453     if (good_values.size() == 0) {
2454         // no good tokens, can't autofix
2455         return kEmptyStr;
2456     }
2457 
2458     string fixed = good_values[0];
2459     for (size_t i = 1; i < good_values.size(); i++) {
2460         if (good_values.size() > 2) {
2461             fixed += ",";
2462         }
2463         if (i == good_values.size() - 1) {
2464             fixed += " and";
2465         }
2466         fixed += " " + good_values[i];
2467     }
2468     if (pooled) {
2469         fixed = "pooled " + fixed;
2470     }
2471     return fixed;
2472 }
2473 
2474 
s_CollectNumberAndUnits(const string & value,string & number,string & units)2475 void s_CollectNumberAndUnits(const string& value, string& number, string& units)
2476 {
2477     number.clear();
2478     units.clear();
2479 
2480     if (NStr::IsBlank(value)) {
2481         return;
2482     }
2483 
2484     string::const_iterator it = value.begin();
2485     if (*it == '+' || *it == '-') {
2486         number += *it;
2487         it++;
2488     }
2489 
2490     bool any_digit = false;
2491     bool skip_comma = true;
2492     while (it != value.end() && (isdigit(*it) || *it == ',')) {
2493         if (*it == ',') {
2494             if (skip_comma) {
2495                 // only skip the first comma
2496                 skip_comma = false;
2497             } else {
2498                 break;
2499             }
2500         } else {
2501             any_digit = true;
2502             number += *it;
2503         }
2504         it++;
2505     }
2506 
2507     if (it == value.end()) {
2508         number.clear();
2509         return;
2510     }
2511 
2512     if (*it == '.') {
2513         number += *it;
2514         it++;
2515         while (it != value.end() && isdigit(*it)) {
2516             any_digit = true;
2517             number += *it;
2518             it++;
2519         }
2520     }
2521 
2522     if (it == value.end() || *it != ' ' || !any_digit) {
2523         number.clear();
2524         return;
2525     }
2526 
2527     it++;
2528     while (it != value.end()) {
2529         units += *it;
2530         it++;
2531     }
2532 }
2533 
2534 
IsAltitudeValid(const string & value)2535 bool CSubSource::IsAltitudeValid (const string& value)
2536 {
2537     if (NStr::IsBlank(value)) {
2538         return false;
2539     }
2540 
2541     string number;
2542     string units;
2543     s_CollectNumberAndUnits(value, number, units);
2544     if (NStr::IsBlank(number) || !NStr::EqualCase(units, "m")) {
2545         return false;
2546     } else {
2547         return true;
2548     }
2549 
2550 }
2551 
2552 
x_GetPrecision(const string & num_str)2553 int CSubSource::x_GetPrecision(const string& num_str)
2554 {
2555     int precision = 0;
2556     size_t pos = NStr::Find(num_str, ".");
2557     if (pos != NPOS) {
2558         precision = int(num_str.length() - pos - 1);
2559     }
2560     return precision;
2561 }
2562 
2563 
x_FormatWithPrecision(double val,int precision)2564 string CSubSource::x_FormatWithPrecision(double val, int precision)
2565 {
2566     char reformatted[1000];
2567     sprintf(reformatted, "%.*lf", precision, val);
2568     string rval = reformatted;
2569     return rval;
2570 }
2571 
// Normalises an altitude string to the form "<number> m".
//
// Values given in feet ("ft.", "ft", "feet", "foot") are converted to
// metres (factor 0.3048) and printed with as many decimals as the input
// number had. Values already in metres ("m.", "meters", "meter", "m") keep
// their number as collected. Returns kEmptyStr when the value is blank, no
// number can be extracted, or the unit is not one of the above.
string CSubSource::FixAltitude (const string& value)
{
    if (NStr::IsBlank(value)) {
        return kEmptyStr;
    }

    string number;
    string units;
    s_CollectNumberAndUnits(value, number, units);
    if (NStr::IsBlank(number)) {
        return kEmptyStr;
    }

    const bool given_in_feet =
        NStr::Equal(units, "ft.") || NStr::Equal(units, "ft") ||
        NStr::Equal(units, "feet") || NStr::Equal(units, "foot");
    if (given_in_feet) {
        // keep the number of decimals the submitter used
        const int precision = x_GetPrecision(number);
        const double metres = NStr::StringToDouble(number) * 0.3048;
        number = x_FormatWithPrecision(metres, precision);
        units = "m";
    }

    const bool given_in_metres =
        NStr::Equal(units, "m.") || NStr::Equal(units, "meters") ||
        NStr::Equal(units, "meter") || NStr::Equal(units, "m");
    if (!given_in_metres) {
        return kEmptyStr;
    }

    return number + " " + "m";
}
2601 
2602 
2603 // From VR-793:
2604 // A.    For segment, endogenous_virus_name:
2605 //   1.  Must begin with a letter or number
2606 //   2.  Spaces and other printable characters are permitted
2607 //   3.  Must not be empty, must not be longer than 240 characters
2608 
x_GenericRepliconNameValid(const string & value)2609 bool CSubSource::x_GenericRepliconNameValid(const string& value)
2610 {
2611     if (NStr::IsBlank(value)) {
2612         return false;
2613     } else if (!isalnum(value.c_str()[0])) {
2614         return false;
2615     } else if (value.length() > 240) {
2616         return false;
2617     }
2618 
2619     for (auto it : value) {
2620         if (!isprint(it)) {
2621             return false;
2622         }
2623     }
2624 
2625     return true;
2626 }
2627 
2628 
// Validates a segment name. Segment names are subject only to the generic
// replicon-name rules (VR-793 section A), so this simply forwards to
// x_GenericRepliconNameValid.
bool CSubSource::IsSegmentValid(const string& value)
{
    return x_GenericRepliconNameValid(value);
}
2633 
2634 
// Validates an endogenous virus name. Like segment names, these are subject
// only to the generic replicon-name rules (VR-793 section A), so this simply
// forwards to x_GenericRepliconNameValid.
bool CSubSource::IsEndogenousVirusNameValid(const string& value)
{
    return x_GenericRepliconNameValid(value);
}
2639 
2640 
2641 // From VR-793:
2642 // B.    For chromosome, linkage_group and plasmid_name values:
2643 //   4.  Must begin with a letter or number
2644 //   5.  Must not be empty, must not be longer than 32 characters
2645 //   6.  Must not contain <tab>
2646 //   7.  Spaces and other printable characters are permitted
2647 //   8.  Must not contain the word "plasmid" (ignoring case)
2648 //   9.  Must not contain the word "chromosome" (ignoring case)
2649 //   10. Must not contain the phrase "linkage group" (ignoring case)
2650 //   11. Must not contain the series of letters "chr" (ignoring case)
2651 //   12. Must not contain the taxname (ignoring case)
2652 //   14. Must not contain the genus (ignoring case)
2653 //   15. Must not contain the species (ignoring case)
2654 //       except allow the species to match the value after an initial 'p' (e.g., JX416328)
2655 //   16. Must not contain the series of letters "chrm" (ignoring case)
2656 //   17. Must not contain the series of letters "chrom" (ignoring case)
2657 //   18. Must not contain the phrase "linkage-group" (ignoring case)
2658 
x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string & value,const string & taxname)2659 bool CSubSource::x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string& value, const string& taxname)
2660 {
2661     if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
2662         if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
2663             return true;
2664         }
2665     }
2666     if (!x_GenericRepliconNameValid(value)) {
2667         // checks for isalnum start, blankness and unprintable characters
2668         // B.4, B.5, B.7
2669         return false;
2670     } else if (value.length() > 32) {
2671         // B.5
2672         return false;
2673     }
2674     if (!NStr::IsBlank(taxname)) {
2675         if (NStr::FindNoCase(value, taxname) != NPOS) {
2676             // B.12
2677             return false;
2678         }
2679         size_t pos = NStr::Find(taxname, " ");
2680         if (pos != NPOS) {
2681             string genus = taxname.substr(0, pos);
2682             if (NStr::FindNoCase(value, genus) != NPOS) {
2683                 // B.14
2684                 return false;
2685             }
2686             string species = taxname.substr(pos + 1);
2687             pos = NStr::FindNoCase(value, species);
2688             if (pos != NPOS) {
2689                 if (pos != 1 || value[0] != 'p') {
2690                     // B.15
2691                     return false;
2692                 }
2693             }
2694         }
2695     }
2696     static string s_ForbiddenPhrases[] = {
2697         "\t",  // B.6.
2698         "plasmid", // B.8
2699         "chromosome", // B.9
2700         "linkage group", // B.10
2701         "chr", // B.11
2702         "linkage_group", // B.15
2703         "chrm", // B.16
2704         "chrom", // B.17
2705         "linkage-group" // B.18
2706     };
2707 
2708     for (auto it : s_ForbiddenPhrases) {
2709         if (NStr::FindNoCase(value, it) != NPOS) {
2710             return false;
2711         }
2712     }
2713     return true;
2714 }
2715 
2716 
IsChromosomeNameValid(const string & value,const string & taxname)2717 bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
2718 {
2719     if (NStr::IsBlank(value)) {
2720         return false;
2721     }
2722     if (NStr::StartsWith(value, "LG", NStr::eNocase)) {
2723         return false;
2724     } else {
2725         return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
2726     }
2727 }
2728 
2729 
IsLinkageGroupNameValid(const string & value,const string & taxname)2730 bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
2731 {
2732     if (NStr::IsBlank(value)) {
2733         return false;
2734     }
2735     return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
2736 }
2737 
2738 
2739 // VR-793
2740 // C.    For plasmid_name values:
2741 //   19. Exception- megaplasmid is legal
IsPlasmidNameValid(const string & value,const string & taxname)2742 bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
2743 {
2744     if (NStr::IsBlank(value)) {
2745         return false;
2746     }
2747     if (NStr::Equal(value, "megaplasmid")) {
2748         return true;
2749     }
2750     if (NStr::StartsWith(value, "megaplasmid ") && value.length() > 12 && NStr::Find(value.substr(12), " ") == NPOS) {
2751         return true;
2752     }
2753     if (NStr::Equal(value, "F") || NStr::Equal(value, "F factor") || NStr::Equal(value, "F plasmid")) {
2754         return true;
2755     }
2756     if (NStr::Equal(value, "Plasmid R") || NStr::Equal(value, "plasmid R") ||
2757         NStr::Equal(value, "Plasmid F") || NStr::Equal(value, "plasmid F")) {
2758         return true;
2759     }
2760     string val = value;
2761     string tax = taxname;
2762     if (NStr::StartsWith(value, "Plasmid ") || NStr::StartsWith(value, "plasmid ")) {
2763         val = value.substr(8, value.length());
2764     }
2765     if (NStr::StartsWith(taxname, "Plasmid ") || NStr::StartsWith(taxname, "plasmid ")) {
2766         tax = taxname.substr(8, taxname.length());
2767     }
2768     if (NStr::StartsWith(tax, val)) {
2769         if (NStr::Equal(tax, taxname) && NStr::Equal(val, value)) {
2770             return false;
2771         }
2772         return true;
2773     }
2774     return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(val, tax);
2775 }
2776 
2777 
// Cell line contamination lookup table.
//
// TContaminatingCellLine    - (contaminating cell line, species it comes
//                             from), as reported by CheckCellLine
// TSpeciesContaminant       - organism name -> contaminant
// TCellLineContaminationMap - upper-cased cell line name -> per-organism map
typedef pair<string, string> TContaminatingCellLine;
typedef map<string, TContaminatingCellLine> TSpeciesContaminant;
typedef map<string, TSpeciesContaminant> TCellLineContaminationMap;

// Filled once by s_InitializeCellLineContaminationMap; the flag records
// that this has happened and the mutex is held while it is being filled.
static TCellLineContaminationMap s_CellLineContaminationMap;
static bool s_CellLineContaminationMapInitialized = false;
DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);
2785 
2786 #include "cell_line.inc"
2787 
s_ProcessCellLineLine(const CTempString & line)2788 static void s_ProcessCellLineLine(const CTempString& line)
2789 {
2790     vector<string> tokens;
2791     NStr::Split(line, "\t", tokens);
2792     if (tokens.size() < 4) {
2793         ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
2794                    << "; disregarding");
2795     } else {
2796         NStr::ToUpper(tokens[0]);
2797         (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
2798     }
2799 }
2800 
2801 
s_InitializeCellLineContaminationMap(void)2802 static void s_InitializeCellLineContaminationMap(void)
2803 {
2804     CFastMutexGuard GUARD(s_CellLineContaminationMutex);
2805     if (s_CellLineContaminationMapInitialized) {
2806         return;
2807     }
2808 
2809     // read table
2810 
2811     size_t count = sizeof(kCellLine) / sizeof (*kCellLine);
2812     const char * const * start = kCellLine;
2813     while (count--) {
2814         s_ProcessCellLineLine(*start++);
2815     }
2816 
2817 
2818     s_CellLineContaminationMapInitialized = true;
2819 }
2820 
2821 
// Reports known cross-contamination for a cell line.
//
// cell_line is matched ignoring case (the table keys are upper case);
// organism is matched exactly. Returns a descriptive message when the
// table has a non-blank contaminant for the pair, otherwise an empty string.
//
// The table is consulted with find() only. Using operator[] here would
// insert an empty entry for every unknown cell line / organism, growing the
// shared static map on each miss and modifying it outside the mutex that
// protects its initialisation.
string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
{
    s_InitializeCellLineContaminationMap();

    string cell_line_search = cell_line;
    NStr::ToUpper(cell_line_search);

    TCellLineContaminationMap::const_iterator line_it =
        s_CellLineContaminationMap.find(cell_line_search);
    if (line_it == s_CellLineContaminationMap.end()) {
        return kEmptyStr;
    }

    const TSpeciesContaminant& by_organism = line_it->second;
    TSpeciesContaminant::const_iterator organism_it = by_organism.find(organism);
    if (organism_it == by_organism.end() || NStr::IsBlank(organism_it->second.first)) {
        return kEmptyStr;
    }

    const TContaminatingCellLine& contaminant = organism_it->second;
    return "The International Cell Line Authentication Committee database indicates that " +
           cell_line + " from " + organism + " is known to be contaminated by " +
           contaminant.first +
           " from " + contaminant.second +
           ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
}
2839 
2840 
2841 // =============================================================================
2842 //                                 Country Names
2843 // =============================================================================
2844 
2845 
// Legal country names. Despite the name, the list also holds oceans and
// seas (e.g. "Arctic Ocean", "Baltic Sea") and other named territories.
// Entries must be kept in alphabetical order under case-sensitive
// comparison - note that "USA" therefore sorts before "Uganda" -
// presumably because s_CountriesSet below is a static array set built
// directly over this array with the case-sensitive PCase_CStr comparator.
static const char* const s_Countries[] = {
    "Afghanistan",
    "Albania",
    "Algeria",
    "American Samoa",
    "Andorra",
    "Angola",
    "Anguilla",
    "Antarctica",
    "Antigua and Barbuda",
    "Arctic Ocean",
    "Argentina",
    "Armenia",
    "Aruba",
    "Ashmore and Cartier Islands",
    "Atlantic Ocean",
    "Australia",
    "Austria",
    "Azerbaijan",
    "Bahamas",
    "Bahrain",
    "Baker Island",
    "Baltic Sea",
    "Bangladesh",
    "Barbados",
    "Bassas da India",
    "Belarus",
    "Belgium",
    "Belize",
    "Benin",
    "Bermuda",
    "Bhutan",
    "Bolivia",
    "Borneo",
    "Bosnia and Herzegovina",
    "Botswana",
    "Bouvet Island",
    "Brazil",
    "British Virgin Islands",
    "Brunei",
    "Bulgaria",
    "Burkina Faso",
    "Burundi",
    "Cambodia",
    "Cameroon",
    "Canada",
    "Cape Verde",
    "Cayman Islands",
    "Central African Republic",
    "Chad",
    "Chile",
    "China",
    "Christmas Island",
    "Clipperton Island",
    "Cocos Islands",
    "Colombia",
    "Comoros",
    "Cook Islands",
    "Coral Sea Islands",
    "Costa Rica",
    "Cote d'Ivoire",
    "Croatia",
    "Cuba",
    "Curacao",
    "Cyprus",
    "Czech Republic",
    "Democratic Republic of the Congo",
    "Denmark",
    "Djibouti",
    "Dominica",
    "Dominican Republic",
    "Ecuador",
    "Egypt",
    "El Salvador",
    "Equatorial Guinea",
    "Eritrea",
    "Estonia",
    "Eswatini",
    "Ethiopia",
    "Europa Island",
    "Falkland Islands (Islas Malvinas)",
    "Faroe Islands",
    "Fiji",
    "Finland",
    "France",
    "French Guiana",
    "French Polynesia",
    "French Southern and Antarctic Lands",
    "Gabon",
    "Gambia",
    "Gaza Strip",
    "Georgia",
    "Germany",
    "Ghana",
    "Gibraltar",
    "Glorioso Islands",
    "Greece",
    "Greenland",
    "Grenada",
    "Guadeloupe",
    "Guam",
    "Guatemala",
    "Guernsey",
    "Guinea",
    "Guinea-Bissau",
    "Guyana",
    "Haiti",
    "Heard Island and McDonald Islands",
    "Honduras",
    "Hong Kong",
    "Howland Island",
    "Hungary",
    "Iceland",
    "India",
    "Indian Ocean",
    "Indonesia",
    "Iran",
    "Iraq",
    "Ireland",
    "Isle of Man",
    "Israel",
    "Italy",
    "Jamaica",
    "Jan Mayen",
    "Japan",
    "Jarvis Island",
    "Jersey",
    "Johnston Atoll",
    "Jordan",
    "Juan de Nova Island",
    "Kazakhstan",
    "Kenya",
    "Kerguelen Archipelago",
    "Kingman Reef",
    "Kiribati",
    "Kosovo",
    "Kuwait",
    "Kyrgyzstan",
    "Laos",
    "Latvia",
    "Lebanon",
    "Lesotho",
    "Liberia",
    "Libya",
    "Liechtenstein",
    "Line Islands",
    "Lithuania",
    "Luxembourg",
    "Macau",
    "Madagascar",
    "Malawi",
    "Malaysia",
    "Maldives",
    "Mali",
    "Malta",
    "Marshall Islands",
    "Martinique",
    "Mauritania",
    "Mauritius",
    "Mayotte",
    "Mediterranean Sea",
    "Mexico",
    "Micronesia",
    "Midway Islands",
    "Moldova",
    "Monaco",
    "Mongolia",
    "Montenegro",
    "Montserrat",
    "Morocco",
    "Mozambique",
    "Myanmar",
    "Namibia",
    "Nauru",
    "Navassa Island",
    "Nepal",
    "Netherlands",
    "New Caledonia",
    "New Zealand",
    "Nicaragua",
    "Niger",
    "Nigeria",
    "Niue",
    "Norfolk Island",
    "North Korea",
    "North Macedonia",
    "North Sea",
    "Northern Mariana Islands",
    "Norway",
    "Oman",
    "Pacific Ocean",
    "Pakistan",
    "Palau",
    "Palmyra Atoll",
    "Panama",
    "Papua New Guinea",
    "Paracel Islands",
    "Paraguay",
    "Peru",
    "Philippines",
    "Pitcairn Islands",
    "Poland",
    "Portugal",
    "Puerto Rico",
    "Qatar",
    "Republic of the Congo",
    "Reunion",
    "Romania",
    "Ross Sea",
    "Russia",
    "Rwanda",
    "Saint Barthelemy",
    "Saint Helena",
    "Saint Kitts and Nevis",
    "Saint Lucia",
    "Saint Martin",
    "Saint Pierre and Miquelon",
    "Saint Vincent and the Grenadines",
    "Samoa",
    "San Marino",
    "Sao Tome and Principe",
    "Saudi Arabia",
    "Senegal",
    "Serbia",
    "Seychelles",
    "Sierra Leone",
    "Singapore",
    "Sint Maarten",
    "Slovakia",
    "Slovenia",
    "Solomon Islands",
    "Somalia",
    "South Africa",
    "South Georgia and the South Sandwich Islands",
    "South Korea",
    "South Sudan",
    "Southern Ocean",
    "Spain",
    "Spratly Islands",
    "Sri Lanka",
    "State of Palestine",
    "Sudan",
    "Suriname",
    "Svalbard",
    "Sweden",
    "Switzerland",
    "Syria",
    "Taiwan",
    "Tajikistan",
    "Tanzania",
    "Tasman Sea",
    "Thailand",
    "Timor-Leste",
    "Togo",
    "Tokelau",
    "Tonga",
    "Trinidad and Tobago",
    "Tromelin Island",
    "Tunisia",
    "Turkey",
    "Turkmenistan",
    "Turks and Caicos Islands",
    "Tuvalu",
    "USA",
    "Uganda",
    "Ukraine",
    "United Arab Emirates",
    "United Kingdom",
    "Uruguay",
    "Uzbekistan",
    "Vanuatu",
    "Venezuela",
    "Viet Nam",
    "Virgin Islands",
    "Wake Island",
    "Wallis and Futuna",
    "West Bank",
    "Western Sahara",
    "Yemen",
    "Zambia",
    "Zimbabwe"
};
// Case-sensitive set of C strings; used for both the current and the
// former country tables.
typedef CStaticArraySet<const char*, PCase_CStr> TCStrSet;
// Lookup set over s_Countries, queried by CCountries::IsValid.
static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);
3131 
// Country names that used to be legal. CCountries::IsValid still accepts
// them, and CCountries::WasValid checks this table only.
// Entries must be kept in alphabetical order (case sensitive).
static const char* const s_Former_Countries[] = {
    "Belgian Congo",
    "British Guiana",
    "Burma",
    "Czechoslovakia",
    "East Timor",
    "Korea",
    "Macedonia",
    "Netherlands Antilles",
    "Serbia and Montenegro",
    "Siam",
    "Swaziland",
    "The former Yugoslav Republic of Macedonia",
    "USSR",
    "Yugoslavia",
    "Zaire"
};
// Lookup set over s_Former_Countries (case-sensitive comparison).
static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);
3151 
IsValid(const string & country)3152 bool CCountries::IsValid(const string& country)
3153 {
3154     string name = country;
3155     size_t pos = country.find(':');
3156 
3157     if ( pos != NPOS ) {
3158         if (pos == country.length() - 1) {
3159             return false;
3160         }
3161         name = country.substr(0, pos);
3162     }
3163 
3164     // try current countries
3165     if (s_CountriesSet.find(name.c_str()) != s_CountriesSet.end()) {
3166         return true;
3167     } else if (s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end()) {
3168         return true;
3169     } else {
3170         return false;
3171     }
3172 }
3173 
3174 
IsValid(const string & country,bool & is_miscapitalized)3175 bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
3176 {
3177     string name = country;
3178     size_t pos = country.find(':');
3179 
3180     if ( pos != NPOS ) {
3181         name = country.substr(0, pos);
3182         if (pos == country.length() - 1) {
3183             return false;
3184         }
3185     }
3186 
3187     is_miscapitalized = false;
3188     // try current countries
3189     // fast check for properly capitalized
3190     if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
3191         return true;
3192     }
3193     if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3194         return true;
3195     }
3196     // slow check for miscapitalized
3197     ITERATE ( TCStrSet, it, s_CountriesSet ) {
3198         if ( NStr::EqualNocase(name, *it) ) {
3199             is_miscapitalized = true;
3200             return true;
3201         }
3202     }
3203     ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
3204         if ( NStr::EqualNocase(name, *it) ) {
3205             is_miscapitalized = true;
3206             return true;
3207         }
3208     }
3209 
3210     return false;
3211 }
3212 
3213 
WasValid(const string & country)3214 bool CCountries::WasValid(const string& country)
3215 {
3216     string name = country;
3217     size_t pos = country.find(':');
3218 
3219     if ( pos != NPOS ) {
3220         name = country.substr(0, pos);
3221     }
3222 
3223     // try formerly-valid countries
3224     return s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
3225 }
3226 
3227 
WasValid(const string & country,bool & is_miscapitalized)3228 bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
3229 {
3230     string name = country;
3231     size_t pos = country.find(':');
3232 
3233     if ( pos != NPOS ) {
3234         name = country.substr(0, pos);
3235     }
3236 
3237     is_miscapitalized = false;
3238     // try formerly-valid countries
3239     // fast check for properly capitalized
3240     if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3241         return true;
3242     }
3243     // slow check for miscapitalized
3244     ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
3245         if ( NStr::EqualNocase(name, *it) ) {
3246             is_miscapitalized = true;
3247             return true;
3248         }
3249     }
3250     return false;
3251 }
3252 
3253 /////////////////////////////////////////////////////////////////////////////
3254 ////// Country Capitalization Fix ///////////////////////////////////////////
3255 
// Whole-value replacements: each pair maps a complete country value (left,
// written in lower case) to its corrected "Country: Region" form (right).
static const SStaticPair<const char*, const char*> s_map_whole_country_fixes[] =
{
  {"england", "United Kingdom: England"},
  {"great britain", "United Kingdom: Great Britain"},
  {"new jersey, usa", "USA: New Jersey"}
};
// Static map from C string to C string; keys are compared with PCase_CStr
// (case-sensitive).
typedef CStaticPairArrayMap<const char*, const char*, PCase_CStr> TCStringPairsMap;
// Lookup map over the table above.
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_whole_country_fixes, s_map_whole_country_fixes);
3264 
3265 static const SStaticPair<const char*, const char*> s_map_country_name_fixes[] = {
3266 {"ABW", "Aruba"},
3267 {"AFG", "Afghanistan"},
3268 {"AGO", "Angola"},
3269 {"AIA", "Anguilla"},
3270 {"ALA", "Aland Islands"},
3271 {"ALB", "Albania"},
3272 {"AND", "Andorra"},
3273 {"ARE", "United Arab Emirates"},
3274 {"ARG", "Argentina"},
3275 {"ARM", "Armenia"},
3276 {"ASM", "American Samoa"},
3277 {"ATA", "Antarctica"},
3278 {"ATF", "French Southern Territories"},
3279 {"ATG", "Antigua and Barbuda"},
3280 {"AUS", "Australia"},
3281 {"AUT", "Austria"},
3282 {"AZE", "Azerbaijan"},
3283 {"Antigua & Barbuda", "Antigua and Barbuda"},
3284 {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
3285 {"BDI", "Burundi"},
3286 {"BEL", "Belgium"},
3287 {"BEN", "Benin"},
3288 {"BES", "Bonaire, Sint Eustatius and Saba"},
3289 {"BFA", "Burkina Faso"},
3290 {"BGD", "Bangladesh"},
3291 {"BGR", "Bulgaria"},
3292 {"BHR", "Bahrain"},
3293 {"BHS", "Bahamas"},
3294 {"BIH", "Bosnia and Herzegovina"},
3295 {"BLM", "Saint Barthelemy"},
3296 {"BLR", "Belarus"},
3297 {"BLZ", "Belize"},
3298 {"BMU", "Bermuda"},
3299 {"BOL", "Bolivia"},
3300 {"BRA", "Brazil"},
3301 {"BRB", "Barbados"},
3302 {"BRN", "Brunei"},
3303 {"BTN", "Bhutan"},
3304 {"BVT", "Bouvet Island"},
3305 {"BWA", "Botswana"},
3306 {"Brasil", "Brazil"},
3307 {"CAF", "Central African Republic"},
3308 {"CAN", "Canada"},
3309 {"CCK", "Cocos Islands"},
3310 {"CHE", "Switzerland"},
3311 {"CHL", "Chile"},
3312 {"CHN", "China"},
3313 {"CIV", "Cote d'Ivoire"},
3314 {"CMR", "Cameroon"},
3315 {"COD", "Democratic Republic of the Congo"},
3316 {"COG", "Republic of the Congo"},
3317 {"COK", "Cook Islands"},
3318 {"COL", "Colombia"},
3319 {"COM", "Comoros"},
3320 {"CPV", "Cape Verde"},
3321 {"CRI", "Costa Rica"},
3322 {"CUB", "Cuba"},
3323 {"CUW", "Curacao"},
3324 {"CXR", "Christmas Island"},
3325 {"CYM", "Cayman Islands"},
3326 {"CYP", "Cyprus"},
3327 {"CZE", "Czech Republic"},
3328 {"Cape Verde Islands", "Cape Verde"},
3329 {"DEU", "Germany"},
3330 {"DJI", "Djibouti"},
3331 {"DMA", "Dominica"},
3332 {"DNK", "Denmark"},
3333 {"DOM", "Dominican Republic"},
3334 {"DZA", "Algeria"},
3335 {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
3336 {"ECU", "Ecuador"},
3337 {"EGY", "Egypt"},
3338 {"ERI", "Eritrea"},
3339 {"ESH", "Western Sahara"},
3340 {"ESP", "Spain"},
3341 {"EST", "Estonia"},
3342 {"ETH", "Ethiopia"},
3343 {"FIN", "Finland"},
3344 {"FJI", "Fiji"},
3345 {"FLK", "Falkland Islands (Islas Malvinas)"},
3346 {"FRA", "France"},
3347 {"FRO", "Faroe Islands"},
3348 {"FSM", "Micronesia"},
3349 {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
3350 {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
3351 {"GAB", "Gabon"},
3352 {"GBR", "United Kingdom"},
3353 {"GEO", "Georgia"},
3354 {"GGY", "Guernsey"},
3355 {"GHA", "Ghana"},
3356 {"GIB", "Gibraltar"},
3357 {"GIN", "Guinea"},
3358 {"GLP", "Guadeloupe"},
3359 {"GMB", "Gambia"},
3360 {"GNB", "Guinea-Bissau"},
3361 {"GNQ", "Equatorial Guinea"},
3362 {"GRC", "Greece"},
3363 {"GRD", "Grenada"},
3364 {"GRL", "Greenland"},
3365 {"GTM", "Guatemala"},
3366 {"GUF", "French Guiana"},
3367 {"GUM", "Guam"},
3368 {"GUY", "Guyana"},
3369 {"HKG", "Hong Kong"},
3370 {"HMD", "Heard Island and McDonald Islands"},
3371 {"HND", "Honduras"},
3372 {"HRV", "Croatia"},
3373 {"HTI", "Haiti"},
3374 {"HUN", "Hungary"},
3375 {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
3376 {"IDN", "Indonesia"},
3377 {"IMN", "Isle of Man"},
3378 {"IND", "India"},
3379 {"IOT", "British Indian Ocean Territory"},
3380 {"IRL", "Ireland"},
3381 {"IRN", "Iran"},
3382 {"IRQ", "Iraq"},
3383 {"ISL", "Iceland"},
3384 {"ISR", "Israel"},
3385 {"ITA", "Italy"},
3386 {"Ivory Coast", "Cote d'Ivoire"},
3387 {"JAM", "Jamaica"},
3388 {"JEY", "Jersey"},
3389 {"JOR", "Jordan"},
3390 {"JPN", "Japan"},
3391 {"KAZ", "Kazakhstan"},
3392 {"KEN", "Kenya"},
3393 {"KGZ", "Kyrgyzstan"},
3394 {"KHM", "Cambodia"},
3395 {"KIR", "Kiribati"},
3396 {"KNA", "Saint Kitts and Nevis"},
3397 {"KOR", "South Korea"},
3398 {"KWT", "Kuwait"},
3399 {"LAO", "Lao People's Democratic Republic"},
3400 {"LBN", "Lebanon"},
3401 {"LBR", "Liberia"},
3402 {"LBY", "Libyan Arab Jamahiriya"},
3403 {"LCA", "Saint Lucia"},
3404 {"LIE", "Liechtenstein"},
3405 {"LKA", "Sri Lanka"},
3406 {"LSO", "Lesotho"},
3407 {"LTU", "Lithuania"},
3408 {"LUX", "Luxembourg"},
3409 {"LVA", "Latvia"},
3410 {"La Reunion Island", "Reunion"},
3411 {"Luxemburg", "Luxembourg"},
3412 {"MAC", "Macao"},
3413 {"MAF", "Saint Martin (French part)"},
3414 {"MAR", "Morocco"},
3415 {"MCO", "Monaco"},
3416 {"MDA", "Moldova"},
3417 {"MDG", "Madagascar"},
3418 {"MDV", "Maldives"},
3419 {"MEX", "Mexico"},
3420 {"MHL", "Marshall Islands"},
3421 {"MKD", "North Macedonia"},
3422 {"MLI", "Mali"},
3423 {"MLT", "Malta"},
3424 {"MMR", "Myanmar"},
3425 {"MNE", "Montenegro"},
3426 {"MNG", "Mongolia"},
3427 {"MNP", "Northern Mariana Islands"},
3428 {"MOZ", "Mozambique"},
3429 {"MRT", "Mauritania"},
3430 {"MSR", "Montserrat"},
3431 {"MTQ", "Martinique"},
3432 {"MUS", "Mauritius"},
3433 {"MWI", "Malawi"},
3434 {"MYS", "Malaysia"},
3435 {"MYT", "Mayotte"},
3436 {"Macedonia", "North Macedonia"},
3437 {"NAM", "Namibia"},
3438 {"NCL", "New Caledonia"},
3439 {"NER", "Niger"},
3440 {"NFK", "Norfolk Island"},
3441 {"NGA", "Nigeria"},
3442 {"NIC", "Nicaragua"},
3443 {"NIU", "Niue"},
3444 {"NLD", "Netherlands"},
3445 {"NOR", "Norway"},
3446 {"NPL", "Nepal"},
3447 {"NRU", "Nauru"},
3448 {"NZL", "New Zealand"},
3449 {"Netherland", "Netherlands"},
3450 {"New Guinea", "Papua New Guinea"},
3451 {"OMN", "Oman"},
3452 {"P, R, China", "China"},
3453 {"P.R. China", "China"},
3454 {"P.R.China", "China"},
3455 {"PAK", "Pakistan"},
3456 {"PAN", "Panama"},
3457 {"PCN", "Pitcairn"},
3458 {"PER", "Peru"},
3459 {"PHL", "Philippines"},
3460 {"PLW", "Palau"},
3461 {"PNG", "Papua New Guinea"},
3462 {"POL", "Poland"},
3463 {"PRI", "Puerto Rico"},
3464 {"PRK", "North Korea"},
3465 {"PRT", "Portugal"},
3466 {"PRY", "Paraguay"},
3467 {"PSE", "Palestinian Territory"},
3468 {"PYF", "French Polynesia"},
3469 {"People's Republic of China", "China"},
3470 {"Pr China", "China"},
3471 {"Prchina", "China"},
3472 {"QAT", "Qatar"},
3473 {"REU", "Reunion"},
3474 {"ROU", "Romania"},
3475 {"RUS", "Russia"},
3476 {"RWA", "Rwanda"},
3477 {"Republic of Congo", "Republic of the Congo"},
3478 {"SAU", "Saudi Arabia"},
3479 {"SDN", "Sudan"},
3480 {"SEN", "Senegal"},
3481 {"SGP", "Singapore"},
3482 {"SGS", "South Georgia and the South Sandwich Islands"},
3483 {"SHN", "Saint Helena"},
3484 {"SJM", "Svalbard and Jan Mayen"},
3485 {"SLB", "Solomon Islands"},
3486 {"SLE", "Sierra Leone"},
3487 {"SLV", "El Salvador"},
3488 {"SMR", "San Marino"},
3489 {"SOM", "Somalia"},
3490 {"SPM", "Saint Pierre and Miquelon"},
3491 {"SRB", "Serbia"},
3492 {"SSD", "South Sudan"},
3493 {"STP", "Sao Tome and Principe"},
3494 {"SUR", "Suriname"},
3495 {"SVK", "Slovakia"},
3496 {"SVN", "Slovenia"},
3497 {"SWE", "Sweden"},
3498 {"SWZ", "Eswatini"},
3499 {"SXM", "Sint Maarten (Dutch part)"},
3500 {"SYC", "Seychelles"},
3501 {"SYR", "Syrian Arab Republic"},
3502 {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
3503 {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
3504 {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
3505 {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
3506 {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
3507 {"San Tome and Principe Island", "Sao Tome and Principe"},
3508 {"Sao Tome & Principe", "Sao Tome and Principe"},
3509 {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3510 {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3511 {"St Helena", "Saint Helena"},
3512 {"St Lucia", "Saint Lucia"},
3513 {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
3514 {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3515 {"St. Helena", "Saint Helena"},
3516 {"St. Lucia", "Saint Lucia"},
3517 {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
3518 {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3519 {"TCA", "Turks and Caicos Islands"},
3520 {"TCD", "Chad"},
3521 {"TGO", "Togo"},
3522 {"THA", "Thailand"},
3523 {"TJK", "Tajikistan"},
3524 {"TKL", "Tokelau"},
3525 {"TKM", "Turkmenistan"},
3526 {"TLS", "Timor-Leste"},
3527 {"TON", "Tonga"},
3528 {"TTO", "Trinidad and Tobago"},
3529 {"TUN", "Tunisia"},
3530 {"TUR", "Turkey"},
3531 {"TUV", "Tuvalu"},
3532 {"TWN", "Taiwan"},
3533 {"TZA", "Tanzania"},
3534 {"The Netherlands", "Netherlands"},
3535 {"Trinidad & Tobago", "Trinidad and Tobago"},
3536 {"Turks & Caicos", "Turks and Caicos Islands"},
3537 {"Turks & Caicos Islands", "Turks and Caicos Islands"},
3538 {"Turks and Caicos", "Turks and Caicos Islands"},
3539 {"U.S.A.", "USA"},
3540 {"UGA", "Uganda"},
3541 {"UK", "United Kingdom"},
3542 {"UKR", "Ukraine"},
3543 {"UMI", "United States Minor Outlying Islands"},
3544 {"URY", "Uruguay"},
3545 {"UZB", "Uzbekistan"},
3546 {"United States", "USA"},
3547 {"United States of America", "USA"},
3548 {"VAT", "Holy See (Vatican City State)"},
3549 {"VCT", "Saint Vincent and the Grenadines"},
3550 {"VEN", "Venezuela"},
3551 {"VGB", "British Virgin Islands"},
3552 {"VIR", "Virgin Islands"},
3553 {"VNM", "Viet Nam"},
3554 {"VUT", "Vanuatu"},
3555 {"Vietnam", "Viet Nam"},
3556 {"WLF", "Wallis and Futuna"},
3557 {"WSM", "Samoa"},
3558 {"YEM", "Yemen"},
3559 {"ZAF", "South Africa"},
3560 {"ZMB", "Zambia"},
3561 {"ZWE", "Zimbabwe"},
3562 {"the Netherlands", "Netherlands"}
3563 };
3564 
3565 DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_country_name_fixes, s_map_country_name_fixes);
3566 
3567 // for GP-24841
3568 static const SStaticPair<const char*, const char*> s_map_old_country_name_fixes[] = {
3569 {"Burma", "Myanmar"},
3570 {"Siam", "Thailand"}
3571 };
3572 DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_old_country_name_fixes, s_map_old_country_name_fixes);
3573 
3574 // for GB-7408
3575 static const SStaticPair<const char*, const char*> s_map_subregion_fixes[] = {
3576 {"Antigua", "Antigua and Barbuda: Antigua"},
3577 {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
3578 {"Autonomous Region of the Azores", "Portugal: Azores"},
3579 {"Azores", "Portugal: Azores"},
3580 {"Barbuda", "Antigua and Barbuda: Barbuda"},
3581 {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
3582 {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
3583 {"Canary Islands", "Spain: Canary Islands"},
3584 {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
3585 {"East Germany", "Germany: East Germany"},
3586 {"El Hierro", "Spain: El Hierro"},
3587 {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
3588 {"Fuerteventura", "Spain: Fuerteventura"},
3589 {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
3590 {"Gran Canaria", "Spain: Gran Canaria"},
3591 {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
3592 {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
3593 {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
3594 {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
3595 {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
3596 {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
3597 {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
3598 {"La Gomera", "Spain: La Gomera"},
3599 {"La Graciosa", "Spain: La Graciosa"},
3600 {"La Palma", "Spain: La Palma"},
3601 {"Lanzarote", "Spain: Lanzarote"},
3602 {"Madeira", "Portugal: Madeira"},
3603 {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
3604 {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
3605 {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
3606 {"Nevis", "Saint Kitts and Nevis: Nevis"},
3607 {"Principe", "Sao Tome and Principe: Principe"},
3608 {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3609 {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3610 {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3611 {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
3612 {"Scotland", "United Kingdom: Scotland"},
3613 {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
3614 {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3615 {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3616 {"St Thomas", "USA: Saint Thomas"},
3617 {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3618 {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3619 {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3620 {"St. Thomas", "USA: Saint Thomas"},
3621 {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3622 {"Tenerife", "Spain: Tenerife"},
3623 {"Tobago", "Trinidad and Tobago: Tobago"},
3624 {"Trinidad", "Trinidad and Tobago: Trinidad"},
3625 {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
3626 {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
3627 {"Wales", "United Kingdom: Wales"},
3628 {"West Germany", "Germany: West Germany"},
3629 
3630 };
3631 DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_subregion_fixes, s_map_subregion_fixes);
3632 
3633 
3634 static const char* s_USAStates[] = {
3635     "Alabama",
3636     "Alaska",
3637     "Arizona",
3638     "Arkansas",
3639     "California",
3640     "Colorado",
3641     "Connecticut",
3642     "Delaware",
3643     "District of Columbia",
3644     "Florida",
3645     "Georgia",
3646     "Hawaii",
3647     "Idaho",
3648     "Illinois",
3649     "Indiana",
3650     "Iowa",
3651     "Kansas",
3652     "Kentucky",
3653     "Louisiana",
3654     "Maine",
3655     "Maryland",
3656     "Massachusetts",
3657     "Michigan",
3658     "Minnesota",
3659     "Mississippi",
3660     "Missouri",
3661     "Montana",
3662     "Nebraska",
3663     "Nevada",
3664     "New Hampshire",
3665     "New Jersey",
3666     "New Mexico",
3667     "New York",
3668     "North Carolina",
3669     "North Dakota",
3670     "Ohio",
3671     "Oklahoma",
3672     "Oregon",
3673     "Pennsylvania",
3674     "Rhode Island",
3675     "South Carolina",
3676     "South Dakota",
3677     "Tennessee",
3678     "Texas",
3679     "Utah",
3680     "Vermont",
3681     "Virginia",
3682     "Washington",
3683     "West Virginia",
3684     "Wisconsin",
3685     "Wyoming"
3686 };
3687 
// Returns a copy of `phrase` in which the first character of every word is
// upper-cased when it is a letter.
//
// Words are obtained by splitting on space, tab, CR and LF and are re-joined
// with a single space, so the original separator characters are not kept.
// (How runs of separators are treated depends on NStr::Split's default
// flags -- not visible from here.)
string CCountries::CapitalizeFirstLetterOfEveryWord (const string &phrase)
{
    vector<string> words;
    NStr::Split(phrase, " \t\r\n", words);
    for (string& word : words) {
        if (word.empty()) {
            continue;
        }
        // <cctype> functions are only defined for values representable as
        // unsigned char (or EOF); a plain char may be negative for bytes
        // above 0x7F, so convert before classifying.
        const unsigned char first = static_cast<unsigned char>(word[0]);
        if (isalpha(first)) {
            word[0] = static_cast<char>(toupper(first));
        }
    }
    return NStr::Join(words, " ");
}
3697 
WholeCountryFix(string country)3698 string CCountries::WholeCountryFix(string country)
3699 {
3700     string new_country;
3701     TCStringPairsMap::const_iterator found = k_whole_country_fixes.find(NStr::ToLower(country).c_str());
3702     if (found != k_whole_country_fixes.end()) {
3703         new_country = found->second;
3704         return new_country;
3705     }
3706 
3707     const size_t num_states = sizeof(s_USAStates) / sizeof(s_USAStates[0]);
3708     for (size_t i = 0; i < num_states; ++i) {
3709         if (NStr::EqualNocase(s_USAStates[i], country)) {
3710             new_country = "USA: " + CTempString(s_USAStates[i]);
3711             break;
3712         }
3713     }
3714 
3715     return new_country;
3716 }
3717 
IsSubstringOfStringInList(const string & phrase,const string & country1,size_t pos1)3718 bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
3719 {
3720     bool r = false;
3721     ITERATE ( TCStrSet, c, s_CountriesSet )
3722     {
3723         string country2(*c);
3724         if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
3725         {
3726             SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
3727             while (pos2 != NPOS)
3728             {
3729                 if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
3730                     r = true;
3731                 pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
3732             }
3733         }
3734     }
3735     return r;
3736 }
3737 
ContainsMultipleCountryNames(const string & phrase)3738 bool CCountries::ContainsMultipleCountryNames (const string &phrase)
3739 {
3740     int num_matches = 0;
3741     ITERATE ( TCStrSet, c, s_CountriesSet )
3742     {
3743         string country(*c);
3744         size_t pos = NStr::FindNoCase(phrase,country);
3745         while (pos != NPOS)
3746         {
3747             if (!((pos+country.length()<phrase.length() && isalpha(phrase[pos+country.length()]))
3748                   || (pos > 0 && isalpha(phrase[pos-1]))
3749                   || IsSubstringOfStringInList(phrase,country,pos)))
3750                 num_matches++;
3751             pos = NStr::FindNoCase(phrase,country,pos+country.length());
3752         }
3753 
3754     }
3755     return (num_matches > 1);
3756 }
3757 
// Returns the spelling from s_CountriesSet that equals `country` when case
// is ignored, or `country` unchanged if the set has no such entry.
// The whole set is scanned; if several entries matched, the last one would
// be returned.
string CCountries::GetCorrectedCountryCapitalization(const string& country)
{
    string corrected(country);
    for (TCStrSet::const_iterator name = s_CountriesSet.begin();
         name != s_CountriesSet.end();  ++name) {
        if ( !NStr::EqualNocase(country, *name) ) {
            continue;
        }
        corrected = *name;
    }
    return corrected;
}
3768 
3769 
x_RemoveDelimitersFromEnds(string & val,bool except_paren)3770 void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
3771 {
3772     NStr::TruncateSpacesInPlace(val);
3773     bool any_found = true;
3774     while (!val.empty() && any_found) {
3775         any_found = false;
3776         if (NStr::StartsWith(val, ",")
3777             || NStr::StartsWith(val, ":")
3778             || NStr::StartsWith(val, ".")
3779             || (!except_paren && NStr::StartsWith(val, ")"))) {
3780             val = val.substr(1);
3781             any_found = true;
3782             NStr::TruncateSpacesInPlace(val);
3783         } else if (NStr::EndsWith(val, ",")
3784             || NStr::EndsWith(val, ":")
3785             || (!except_paren && NStr::EndsWith(val, "("))) {
3786             val = val.substr(0, val.length() - 1);
3787             any_found = true;
3788             NStr::TruncateSpacesInPlace(val);
3789         } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
3790             val = val.substr(0, val.length() - 4);
3791             any_found = true;
3792         } else if (NStr::EndsWith(val, ".")) {
3793             size_t len = val.length();
3794             if (len > 1 && isspace((unsigned char)val[len - 2])) {
3795                 val = val.substr(0, val.length() - 1);
3796                 any_found = true;
3797                 NStr::TruncateSpacesInPlace(val);
3798             } else if (len > 5) {
3799                 // make sure no spaces or punctuation within 4 characters before '.'
3800                 bool do_remove = true;
3801                 size_t pos = val.length() - 2;
3802                 size_t dist = 0;
3803                 while (dist < 4 && do_remove) {
3804                     if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
3805                         do_remove = false;
3806                     }
3807                     pos--;
3808                     dist++;
3809                 }
3810                 if (do_remove) {
3811                     val = val.substr(0, val.length() - 1);
3812                     any_found = true;
3813                 }
3814             }
3815         }
3816     }
3817 }
3818 
3819 
x_Tokenize(const string & val)3820 vector<string> CCountries::x_Tokenize(const string& val)
3821 {
3822     vector<string> tokens;
3823     NStr::Split(val, ",:()", tokens);
3824     // special tokenizing - if tokens contain periods but resulting token is at least four characters long
3825     vector<string>::iterator it = tokens.begin();
3826     while (it != tokens.end()) {
3827         size_t pos = NStr::Find(*it, ".");
3828         if (pos != NPOS  &&  pos > 3 && (*it).length() - pos > 4) {
3829             string first = (*it).substr(0, pos);
3830             string remainder = (*it).substr(pos + 1);
3831             size_t space_pos = NStr::Find(first, " ");
3832             size_t len_to_space = first.length();
3833             while (space_pos != NPOS) {
3834                 first = first.substr(space_pos + 1);
3835                 len_to_space = first.length();
3836                 space_pos = NStr::Find(first, " ");
3837             }
3838             if (len_to_space > 4) {
3839                 (*it) = (*it).substr(0, pos);
3840                 it = tokens.insert(it, remainder);
3841             } else {
3842                 it++;
3843             }
3844         } else {
3845             it++;
3846         }
3847     }
3848     return tokens;
3849 }
3850 
3851 
s_ContainsWholeWord(const CTempString test,const CTempString word,NStr::ECase case_sense)3852 bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
3853 {
3854     size_t start = 0;
3855     size_t tlen = test.length();
3856     size_t wlen = word.length();
3857 
3858     size_t pos = NStr::Find(test, word, case_sense);
3859     while (pos != NPOS) {
3860         size_t p = start + pos;
3861         if ( (p == 0           || !isalpha((unsigned char)test[p - 1]))  &&
3862              (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
3863             return true;
3864         }
3865         start = p + 1;
3866         pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
3867     }
3868     return false;
3869 }
3870 
3871 
s_SuppressCountryFix(const string & test)3872 bool s_SuppressCountryFix(const string& test)
3873 {
3874     if (s_ContainsWholeWord(test, "Sea", NStr::eNocase)) {
3875         return true;
3876     } else if (s_ContainsWholeWord(test, "USSR", NStr::eNocase)) {
3877         return true;
3878     }
3879     return false;
3880 }
3881 
3882 
x_FindCountryName(const TCStringPairsMap & fix_map,const vector<string> & countries,string & valid_country,string & orig_valid_country,bool & too_many_countries,bool & bad_cap)3883 void CCountries::x_FindCountryName
3884 (const TCStringPairsMap& fix_map,
3885  const vector<string>& countries,
3886  string& valid_country,
3887  string& orig_valid_country,
3888  bool& too_many_countries,
3889  bool& bad_cap)
3890 {
3891     for (auto country : countries) {
3892         if (!country.empty() && !too_many_countries)
3893         {
3894             string check = country;
3895             NStr::TruncateSpacesInPlace(check);
3896             x_RemoveDelimitersFromEnds(check);
3897 
3898             bool check_has_bad_cap = false;
3899             if (IsValid(check,check_has_bad_cap))
3900             {
3901                 if (valid_country.empty())
3902                 {
3903                     valid_country = check;
3904                     orig_valid_country = check;
3905                     bad_cap = check_has_bad_cap;
3906                 }
3907                 else
3908                 {
3909                     too_many_countries = true;
3910                 }
3911             }
3912             else // see if this is a fixable country
3913             {
3914                 TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
3915                 if (found != fix_map.end())
3916                 {
3917                     if (valid_country.empty())
3918                     {
3919                         valid_country = found->second;
3920                         orig_valid_country = check;
3921                     }
3922                     else
3923                     {
3924                         too_many_countries = true;
3925                     }
3926                 }
3927             }
3928         }
3929     }
3930 }
3931 
3932 // start of RW-1278
3933 
// Collapses every run of consecutive space characters (' ') in `val` into a
// single space, in place. Other whitespace (tabs, newlines) is left alone,
// and leading/trailing runs are reduced to one space rather than removed.
//
// Returns true if `val` was modified, false otherwise (including for an
// empty string).
//
// The compaction is done inside the string itself, so no temporary buffer
// is allocated and characters after an embedded NUL are preserved.
bool s_CompressRunsOfSpaces(string& val)
{
    size_t out = 0;             // next write position
    bool prev_space = false;    // was the last kept character a space?
    for (size_t in = 0;  in < val.size();  ++in) {
        const char ch = val[in];
        const bool is_space = (ch == ' ');
        if (is_space && prev_space) {
            continue;           // drop the repeated space
        }
        prev_space = is_space;
        val[out++] = ch;
    }

    if (out == val.size()) {
        return false;           // nothing was dropped
    }
    val.resize(out);
    return true;
}
3980 
3981 typedef SStaticPair<const char*, const char*> TParishMapEntry;
3982 static const TParishMapEntry parish_abbrev_array[] = {
3983     { "Acadia Parish",               "Acadia Parish"               },
3984     { "AcadiaParish",                "Acadia Parish"               },
3985     { "Allen Parish",                "Allen Parish"                },
3986     { "AllenParish",                 "Allen Parish"                },
3987     { "Ascension Parish",            "Ascension Parish"            },
3988     { "AscensionParish",             "Ascension Parish"            },
3989     { "Assumption Parish",           "Assumption Parish"           },
3990     { "AssumptionParish",            "Assumption Parish"           },
3991     { "Avoyelles Parish",            "Avoyelles Parish"            },
3992     { "AvoyellesParish",             "Avoyelles Parish"            },
3993     { "Beauregard Parish",           "Beauregard Parish"           },
3994     { "BeauregardParish",            "Beauregard Parish"           },
3995     { "Bienville Parish",            "Bienville Parish"            },
3996     { "BienvilleParish",             "Bienville Parish"            },
3997     { "Bossier Parish",              "Bossier Parish"              },
3998     { "BossierParish",               "Bossier Parish"              },
3999     { "Caddo Parish",                "Caddo Parish"                },
4000     { "CaddoParish",                 "Caddo Parish"                },
4001     { "Calcasieu Parish",            "Calcasieu Parish"            },
4002     { "CalcasieuParish",             "Calcasieu Parish"            },
4003     { "Caldwell Parish",             "Caldwell Parish"             },
4004     { "CaldwellParish",              "Caldwell Parish"             },
4005     { "Cameron Parish",              "Cameron Parish"              },
4006     { "CameronParish",               "Cameron Parish"              },
4007     { "Catahoula Parish",            "Catahoula Parish"            },
4008     { "CatahoulaParish",             "Catahoula Parish"            },
4009     { "Claiborne Parish",            "Claiborne Parish"            },
4010     { "ClaiborneParish",             "Claiborne Parish"            },
4011     { "Concordia Parish",            "Concordia Parish"            },
4012     { "ConcordiaParish",             "Concordia Parish"            },
4013     { "DeSoto Parish",               "DeSoto Parish"               },
4014     { "DeSotoParish",                "DeSoto Parish"               },
4015     { "East Baton Rouge Parish",     "East Baton Rouge Parish"     },
4016     { "East Carroll Parish",         "East Carroll Parish"         },
4017     { "East Feliciana Parish",       "East Feliciana Parish"       },
4018     { "EastBatonRougeParish",        "East Baton Rouge Parish"     },
4019     { "EastCarrollParish",           "East Carroll Parish"         },
4020     { "EastFelicianaParish",         "East Feliciana Parish"       },
4021     { "Evangeline Parish",           "Evangeline Parish"           },
4022     { "EvangelineParish",            "Evangeline Parish"           },
4023     { "Franklin Parish",             "Franklin Parish"             },
4024     { "FranklinParish",              "Franklin Parish"             },
4025     { "Grant Parish",                "Grant Parish"                },
4026     { "GrantParish",                 "Grant Parish"                },
4027     { "Iberia Parish",               "Iberia Parish"               },
4028     { "IberiaParish",                "Iberia Parish"               },
4029     { "Iberville Parish",            "Iberville Parish"            },
4030     { "IbervilleParish",             "Iberville Parish"            },
4031     { "Jackson Parish",              "Jackson Parish"              },
4032     { "JacksonParish",               "Jackson Parish"              },
4033     { "Jefferson Davis Parish",      "Jefferson Davis Parish"      },
4034     { "Jefferson Parish",            "Jefferson Parish"            },
4035     { "JeffersonDavisParish",        "Jefferson Davis Parish"      },
4036     { "JeffersonParish",             "Jefferson Parish"            },
4037     { "Lafayette Parish",            "Lafayette Parish"            },
4038     { "LafayetteParish",             "Lafayette Parish"            },
4039     { "Lafourche Parish",            "Lafourche Parish"            },
4040     { "LafourcheParish",             "Lafourche Parish"            },
4041     { "LaSalle Parish",              "LaSalle Parish"              },
4042     { "LaSalleParish",               "LaSalle Parish"              },
4043     { "Lincoln Parish",              "Lincoln Parish"              },
4044     { "LincolnParish",               "Lincoln Parish"              },
4045     { "Livingston Parish",           "Livingston Parish"           },
4046     { "LivingstonParish",            "Livingston Parish"           },
4047     { "Madison Parish",              "Madison Parish"              },
4048     { "MadisonParish",               "Madison Parish"              },
4049     { "Morehouse Parish",            "Morehouse Parish"            },
4050     { "MorehouseParish",             "Morehouse Parish"            },
4051     { "Natchitoches Parish",         "Natchitoches Parish"         },
4052     { "NatchitochesParish",          "Natchitoches Parish"         },
4053     { "Orleans Parish",              "Orleans Parish"              },
4054     { "OrleansParish",               "Orleans Parish"              },
4055     { "Ouachita Parish",             "Ouachita Parish"             },
4056     { "OuachitaParish",              "Ouachita Parish"             },
4057     { "Plaquemines Parish",          "Plaquemines Parish"          },
4058     { "PlaqueminesParish",           "Plaquemines Parish"          },
4059     { "Pointe Coupee Parish",        "Pointe Coupee Parish"        },
4060     { "PointeCoupeeParish",          "Pointe Coupee Parish"        },
4061     { "Rapides Parish",              "Rapides Parish"              },
4062     { "RapidesParish",               "Rapides Parish"              },
4063     { "Red River Parish",            "Red River Parish"            },
4064     { "RedRiverParish",              "Red River Parish"            },
4065     { "Richland Parish",             "Richland Parish"             },
4066     { "RichlandParish",              "Richland Parish"             },
4067     { "Sabine Parish",               "Sabine Parish"               },
4068     { "SabineParish",                "Sabine Parish"               },
4069     { "St. Bernard Parish",          "St. Bernard Parish"          },
4070     { "St. Charles Parish",          "St. Charles Parish"          },
4071     { "St. Helena Parish",           "St. Helena Parish"           },
4072     { "St. James Parish",            "St. James Parish"            },
4073     { "St. John the Baptist Parish", "St. John the Baptist Parish" },
4074     { "St. Landry Parish",           "St. Landry Parish"           },
4075     { "St. Martin Parish",           "St. Martin Parish"           },
4076     { "St. Mary Parish",             "St. Mary Parish"             },
4077     { "St. Tammany Parish",          "St. Tammany Parish"          },
4078     { "St.BernardParish",            "St. Bernard Parish"          },
4079     { "St.CharlesParish",            "St. Charles Parish"          },
4080     { "St.HelenaParish",             "St. Helena Parish"           },
4081     { "St.JamesParish",              "St. James Parish"            },
4082     { "St.JohntheBaptistParish",     "St. John the Baptist Parish" },
4083     { "St.LandryParish",             "St. Landry Parish"           },
4084     { "St.MartinParish",             "St. Martin Parish"           },
4085     { "St.MaryParish",               "St. Mary Parish"             },
4086     { "St.TammanyParish",            "St. Tammany Parish"          },
4087     { "Tangipahoa Parish",           "Tangipahoa Parish"           },
4088     { "TangipahoaParish",            "Tangipahoa Parish"           },
4089     { "Tensas Parish",               "Tensas Parish"               },
4090     { "TensasParish",                "Tensas Parish"               },
4091     { "Terrebonne Parish",           "Terrebonne Parish"           },
4092     { "TerrebonneParish",            "Terrebonne Parish"           },
4093     { "Union Parish",                "Union Parish"                },
4094     { "UnionParish",                 "Union Parish"                },
4095     { "Vermilion Parish",            "Vermilion Parish"            },
4096     { "VermilionParish",             "Vermilion Parish"            },
4097     { "Vernon Parish",               "Vernon Parish"               },
4098     { "VernonParish",                "Vernon Parish"               },
4099     { "Washington Parish",           "Washington Parish"           },
4100     { "WashingtonParish",            "Washington Parish"           },
4101     { "Webster Parish",              "Webster Parish"              },
4102     { "WebsterParish",               "Webster Parish"              },
4103     { "West Baton Rouge Parish",     "West Baton Rouge Parish"     },
4104     { "West Carroll Parish",         "West Carroll Parish"         },
4105     { "West Feliciana Parish",       "West Feliciana Parish"       },
4106     { "WestBatonRougeParish",        "West Baton Rouge Parish"     },
4107     { "WestCarrollParish",           "West Carroll Parish"         },
4108     { "WestFelicianaParish",         "West Feliciana Parish"       },
4109     { "Winn Parish",                 "Winn Parish"                 },
4110     { "WinnParish",                  "Winn Parish"                 }
4111 };
4112 
// Static lookup over parish_abbrev_array, keyed with the PNocase_CStr comparator.
// s_IsParish searches it to replace a parish spelling with the mapped full name.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TParishMap;
DEFINE_STATIC_ARRAY_MAP(TParishMap, parishAbbrevMap, parish_abbrev_array);
4115 
s_IsParish(string & parish)4116 bool s_IsParish ( string& parish ) {
4117 
4118     if ( parish.empty() ) {
4119         return false;
4120     }
4121 
4122     TParishMap::const_iterator parish_find_iter = parishAbbrevMap.find(parish.c_str());
4123     if ( parish_find_iter != parishAbbrevMap.end() ) {
4124         // replace with full parish name
4125         parish = parish_find_iter->second;
4126         return true;
4127     }
4128 
4129     return false;
4130 }
4131 
// Lookup table of US state / territory spellings: each key (two-letter code
// or a name variant) maps to the full name that s_IsState writes back.
// Keys are listed in case-insensitive alphabetical order; presumably the
// static array map built from this table relies on that ordering - confirm
// before inserting new rows.
typedef SStaticPair<const char*, const char*> TStateMapEntry;
static const TStateMapEntry state_abbrev_array[] = {
    { "AK",                    "Alaska"               },
    { "AL",                    "Alabama"              },
    { "Alabama",               "Alabama"              },
    { "Alaska",                "Alaska"               },
    { "American Samoa",        "American Samoa"       },
    { "AR",                    "Arkansas"             },
    { "Arizona",               "Arizona"              },
    { "Arkansas",              "Arkansas"             },
    { "AS",                    "American Samoa"       },
    { "AZ",                    "Arizona"              },
    { "CA",                    "California"           },
    { "California",            "California"           },
    { "CO",                    "Colorado"             },
    { "Colorado",              "Colorado"             },
    { "Connecticut",           "Connecticut"          },
    { "CT",                    "Connecticut"          },
    { "DC",                    "District of Columbia" },
    { "DE",                    "Delaware"             },
    { "Delaware",              "Delaware"             },
    { "District of Columbia",  "District of Columbia" },
    { "FL",                    "Florida"              },
    { "Florida",               "Florida"              },
    { "GA",                    "Georgia"              },
    { "Georgia",               "Georgia"              },
    { "GU",                    "Guam"                 },
    { "Guam",                  "Guam"                 },
    { "Hawaii",                "Hawaii"               },
    { "HI",                    "Hawaii"               },
    { "IA",                    "Iowa"                 },
    { "ID",                    "Idaho"                },
    { "Idaho",                 "Idaho"                },
    { "IL",                    "Illinois"             },
    { "Illinois",              "Illinois"             },
    { "IN",                    "Indiana"              },
    { "Indiana",               "Indiana"              },
    { "Iowa",                  "Iowa"                 },
    { "Kansas",                "Kansas"               },
    { "Kentucky",              "Kentucky"             },
    { "KS",                    "Kansas"               },
    { "KY",                    "Kentucky"             },
    { "LA",                    "Louisiana"            },
    { "Louisiana",             "Louisiana"            },
    { "MA",                    "Massachusetts"        },
    { "Maine",                 "Maine"                },
    { "Maryland",              "Maryland"             },
    { "Massachusetts",         "Massachusetts"        },
    { "MD",                    "Maryland"             },
    { "ME",                    "Maine"                },
    { "MI",                    "Michigan"             },
    { "Michigan",              "Michigan"             },
    { "Minnesota",             "Minnesota"            },
    { "Mississippi",           "Mississippi"          },
    { "Missouri",              "Missouri"             },
    { "MN",                    "Minnesota"            },
    { "MO",                    "Missouri"             },
    { "Montana",               "Montana"              },
    { "MS",                    "Mississippi"          },
    { "MT",                    "Montana"              },
    { "NC",                    "North Carolina"       },
    { "ND",                    "North Dakota"         },
    { "NE",                    "Nebraska"             },
    { "Nebraska",              "Nebraska"             },
    { "Nevada",                "Nevada"               },
    { "New Hampshire",         "New Hampshire"        },
    { "New Jersey",            "New Jersey"           },
    { "New Mexico",            "New Mexico"           },
    { "New York",              "New York"             },
    { "NH",                    "New Hampshire"        },
    { "NJ",                    "New Jersey"           },
    { "NM",                    "New Mexico"           },
    { "North Carolina",        "North Carolina"       },
    { "North Dakota",          "North Dakota"         },
    { "NV",                    "Nevada"               },
    { "NY",                    "New York"             },
    { "OH",                    "Ohio"                 },
    { "Ohio",                  "Ohio"                 },
    { "OK",                    "Oklahoma"             },
    { "Oklahoma",              "Oklahoma"             },
    { "OR",                    "Oregon"               },
    { "Oregon",                "Oregon"               },
    { "PA",                    "Pennsylvania"         },
    { "Pennsylvania",          "Pennsylvania"         },
    { "PR",                    "Puerto Rico"          },
    { "Puerto Rico",           "Puerto Rico"          },
    { "Rhode Island",          "Rhode Island"         },
    { "RI",                    "Rhode Island"         },
    { "SC",                    "South Carolina"       },
    { "SD",                    "South Dakota"         },
    { "South Carolina",        "South Carolina"       },
    { "South Dakota",          "South Dakota"         },
    { "Tennessee",             "Tennessee"            },
    { "Texas",                 "Texas"                },
    { "TN",                    "Tennessee"            },
    { "TX",                    "Texas"                },
    { "US Virgin Islands",     "US Virgin Islands"    },
    { "UT",                    "Utah"                 },
    { "Utah",                  "Utah"                 },
    { "VA",                    "Virginia"             },
    { "Vermont",               "Vermont"              },
    { "VI",                    "US Virgin Islands"    },
    { "Virgin Islands",        "US Virgin Islands"    },
    { "Virginia",              "Virginia"             },
    { "VT",                    "Vermont"              },
    { "WA",                    "Washington"           },
    { "Washington",            "Washington"           },
    { "West Virginia",         "West Virginia"        },
    { "WI",                    "Wisconsin"            },
    { "Wisconsin",             "Wisconsin"            },
    { "WV",                    "West Virginia"        },
    { "WY",                    "Wyoming"              },
    { "Wyoming",               "Wyoming"              }
};

// Static lookup over state_abbrev_array, keyed with the PNocase_CStr comparator.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TStateMap;
DEFINE_STATIC_ARRAY_MAP(TStateMap, stateAbbrevMap, state_abbrev_array);
4249 
s_IsState(string & state,bool & modified)4250 bool s_IsState ( string& state, bool& modified ) {
4251 
4252     if ( state.empty() ) {
4253         return false;
4254     }
4255 
4256     string original = state;
4257     string working = state;
4258 
4259     if ( NStr::StartsWith ( working, "State of ", NStr::eNocase )) {
4260           NStr::TrimPrefixInPlace ( working, "State of ", NStr::eNocase );
4261     }
4262 
4263     if ( NStr::StartsWith ( working, "Commonwealth of ", NStr::eNocase )) {
4264         NStr::TrimPrefixInPlace ( working, "Commonwealth of ", NStr::eNocase );
4265     }
4266 
4267     if ( NStr::EndsWith ( working, " State", NStr::eNocase )) {
4268         NStr::TrimSuffixInPlace ( working, " State", NStr::eNocase );
4269     }
4270 
4271     NStr::TruncateSpacesInPlace ( working );
4272 
4273     TStateMap::const_iterator state_find_iter = stateAbbrevMap.find(working.c_str());
4274     if ( state_find_iter != stateAbbrevMap.end() ) {
4275         // replace with full state name
4276         state = state_find_iter->second;
4277         // report conversion from two-letter, changed capitalization, or prefix/suffix removal
4278         if ( ! NStr::Equal ( original, state )) {
4279             modified = true;
4280         }
4281         return true;
4282     }
4283 
4284     return false;
4285 }
4286 
s_DoUSAStateCleanup(string & country)4287 CCountries::EStateCleanup s_DoUSAStateCleanup ( string& country ) {
4288 
4289     if ( country.empty() ) {
4290         return CCountries::e_NoResult;
4291     }
4292 
4293     // make working copy
4294     string original = country;
4295     string working = country;
4296 
4297     // remove flanking quotation marks - if CCountries::NewFixCountry not called
4298     if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
4299         working = working.substr ( 1, working.length() - 2 );
4300     }
4301 
4302     // remove flanking spaces
4303     NStr::TruncateSpacesInPlace ( working );
4304 
4305     // separate strings before and after colon
4306     string frst, scnd;
4307     NStr::SplitInTwo ( working, ":", frst, scnd );
4308 
4309     NStr::TruncateSpacesInPlace ( frst );
4310     NStr::TruncateSpacesInPlace ( scnd );
4311 
4312     // confirm that country is USA
4313     if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4314         // if not, first try rescuing US territory
4315         working = CCountries::NewFixCountry(working, true);
4316         NStr::SplitInTwo ( working, ":", frst, scnd );
4317         NStr::TruncateSpacesInPlace ( frst );
4318         NStr::TruncateSpacesInPlace ( scnd );
4319         if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4320             return CCountries::e_NotUSA;
4321         }
4322     }
4323 
4324     // split state/county/city clauses at commas
4325     vector<string> components;
4326     NStr::Split(scnd, ",", components);
4327 
4328     // check for only country
4329     if ( components.size() < 1 ) {
4330         country = "USA";
4331         return CCountries::e_Valid;
4332     }
4333 
4334     for ( int j = 0; j < components.size(); j++ ) {
4335         // remove flanking spaces around components
4336         NStr::TruncateSpacesInPlace ( components[j] );
4337         s_CompressRunsOfSpaces ( components[j] );
4338         // clean up runon strings like EastBatonRougeParish
4339         if ( NStr::EndsWith ( components[j], "Parish", NStr::eNocase )) {
4340             s_IsParish( components[j] );
4341         }
4342     }
4343 
4344     bool any_modified = false;
4345     int num_states = 0;
4346     int match = -1;
4347 
4348     string* first = 0;
4349     string* last = 0;
4350 
4351     // has multiple components
4352     int max = components.size() - 1;
4353     for ( int j = 0; j < components.size(); j++ ) {
4354         bool modified = false;
4355         if ( s_IsState  ( components[j], modified )) {
4356             if (modified) {
4357                 any_modified = true;
4358             }
4359             if ( match < 0 ) {
4360                 // record position of first s_IsState match
4361                 match = j;
4362             }
4363             // count successful matches
4364             num_states++;
4365             if ( j == 0 ) {
4366                 first = &(components[j]);
4367             }
4368             if ( j == max ) {
4369                 last = &(components[j]);
4370             }
4371         }
4372     }
4373 
4374     // generate result
4375     string res;
4376     res.append ("USA: ");
4377     string pfx = "";
4378 
4379     if ( match >= 0 ) {
4380         // move first state matched to first position
4381         res.append ( components[match] );
4382         pfx = ", ";
4383     }
4384 
4385     for ( int j = 0; j < components.size(); j++ ) {
4386         if ( j == match) continue;
4387         res.append ( pfx );
4388         res.append ( components[j] );
4389         pfx = ", ";
4390     }
4391 
4392     country = res;
4393 
4394     if ( match < 0 ) {
4395         return CCountries::e_Missing;
4396     } else if ( num_states > 1 ) {
4397         return CCountries::e_Ambiguous;
4398     } else if ( ! NStr::Equal ( original, res )) {
4399         return CCountries::e_Corrected;
4400     }
4401 
4402     return CCountries::e_Valid;
4403 }
4404 
// Row reader type used by CCountries::ReadUSAExceptionMap to iterate the exception file
typedef CRowReader<CRowReaderStream_NCBI_TSV> TNCBITSVStream;

// File-level exception table consulted by CCountries::USAStateCleanup.
// It is filled by CCountries::LoadUSAExceptionMap, which also sets
// exceptions_initialized; until then the table is not consulted.
static CCountries::TUsaExceptionMap exception_map;
static bool exceptions_initialized = false;
4409 
ReadUSAExceptionMap(CCountries::TUsaExceptionMap & exceptions,const string & exception_file)4410 void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {
4411 
4412     if ( ! exception_file.empty()) {
4413 
4414         TNCBITSVStream my_stream (exception_file);
4415         for ( const auto & row : my_stream ) {
4416             TFieldNo number_of_fields = row. GetNumberOfFields();
4417             if ( number_of_fields != 2 ) continue;
4418             string fr = row[0].Get<string>();
4419             string to = row[1].Get<string>();
4420             exceptions [fr] = to;
4421         }
4422     }
4423 }
4424 
LoadUSAExceptionMap(const TUsaExceptionMap & exceptions)4425 void CCountries::LoadUSAExceptionMap (const TUsaExceptionMap& exceptions) {
4426 
4427     // clear previous map
4428     exception_map.clear();
4429 
4430     // initialize internal exception map
4431     for ( const auto & itm : exceptions ) {
4432         string fr = itm.first;
4433         string to = itm.second;
4434 
4435         // ensure colon is followed by space to match initial correction
4436         string f1, f2;
4437         NStr::SplitInTwo ( fr, ":", f1, f2 );
4438         NStr::TruncateSpacesInPlace ( f1 );
4439         NStr::TruncateSpacesInPlace ( f2 );
4440         if ( ! f1.empty() && ! f2.empty()) {
4441             fr = f1 + ": " + f2;
4442         }
4443 
4444         exception_map [fr] = to;
4445     }
4446 
4447     exceptions_initialized = true;
4448 }
4449 
LoadUSAExceptionMap(const string & exception_file)4450 void CCountries::LoadUSAExceptionMap (const string& exception_file ) {
4451 
4452     if ( ! exception_file.empty()) {
4453 
4454         TUsaExceptionMap exceptions;
4455         ReadUSAExceptionMap ( exceptions, exception_file );
4456         LoadUSAExceptionMap ( exceptions );
4457     }
4458 }
4459 
// Clean up a USA country string and report what happened.
//
// country - text to clean up (not modified)
// type    - out: result classification.  It starts as the value returned by
//           s_DoUSAStateCleanup and is overridden when an exception-table
//           entry applies or when the final string does not start with "USA".
//
// Returns the exception-table replacement when one with a non-empty value
// exists for the algorithmically cleaned string; otherwise the algorithmically
// cleaned string itself.
string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {

    // call algorithmic mapping function
    string working = country;
    type = s_DoUSAStateCleanup ( working );

    // apply exceptions from preloaded data file
    if ( exceptions_initialized ) {
        // Use find() instead of operator[]: operator[] inserts an empty entry
        // for every string that is not in the table, so the file-level map
        // would grow with each distinct input and a read-only query would
        // mutate shared state.  Entries with an empty value are still treated
        // as "no exception", as before.
        const auto exc = exception_map.find ( working );
        if ( exc != exception_map.end() && ! exc->second.empty()) {
            const string& corrected = exc->second;
            // presence in map here will disambiguate otherwise ambiguous name pair,
            // thus self-entries need to be added to the ambiguous state exception list
            if ( ! NStr::StartsWith ( corrected, "USA" )) {
                type = e_NotUSA;
            } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
                type = e_Valid;
            } else {
                type = e_Corrected;
            }
            return corrected;
        }
    }

    if ( ! NStr::StartsWith ( working, "USA" )) {
        type = e_NotUSA;
    }
    return working;
}
4488 
// Convenience overload for callers that only need the cleaned-up string:
// forwards to the two-argument USAStateCleanup and discards the
// classification it reports.
string CCountries::USAStateCleanup ( const string& country ) {

    CCountries::EStateCleanup type = e_NoResult;
    return USAStateCleanup ( country, type );
}
4494 
4495 // end of RW-1278
4496 
// Try to turn a free-form country string into a corrected country value.
//
// test           - submitted text
// us_territories - when true, values starting with "Puerto Rico", "Guam",
//                  "American Samoa" or "Virgin Islands" are rewritten as
//                  "USA: ..." and passed through USAStateCleanup
//
// Returns the corrected string.  An empty string is returned when the fix is
// suppressed for an invalid value, and also when none of the steps below
// produces a result.
string CCountries::NewFixCountry (const string& test, bool us_territories)
{
    // change requested for JIRA:SQD-1410
    // when fixing is suppressed the value is either returned as is or rejected
    if (s_SuppressCountryFix(test)) {
        if (IsValid(test)) {
            return test;
        } else {
            return kEmptyStr;
        }
    }

    // work on a copy: drop flanking quotation marks and flanking spaces
    string input = test;
    if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
        input = input.substr(1, input.length() - 2);
    }
    NStr::TruncateSpacesInPlace(input);

    // drop a dangling colon at the end
    if (NStr::EndsWith(input, ":")) {
        input = input.substr(0, input.length() - 1);
        NStr::TruncateSpacesInPlace(input);
    }

    // normalize long spellings of the country part to "USA" when a
    // non-empty part follows the colon
    string usa1,usa2;
    NStr::SplitInTwo(input, ":", usa1, usa2);
    if (!usa1.empty() && !usa2.empty()) {
        NStr::TruncateSpacesInPlace(usa1);
        NStr::TruncateSpacesInPlace(usa2);
        if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
            input = "USA: " + usa2;
        }
    }

    // an entry in k_old_country_name_fixes wins over everything below
    auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
    if (old_name_fix != k_old_country_name_fixes.end()) {
        input = old_name_fix->second;
        return input;
    }

    // optionally file US territories under "USA: ..."
    if (us_territories) {
        if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
            input = "USA: " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
            input = "USA: US " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        }
    }

    // already valid: only colons after the first one are converted
    if (IsValid(input)) {
        CCountries::ChangeExtraColonsToCommas(input);
        return input;
    }
    // next, try a fix that looks at the input as a whole
    string new_country = WholeCountryFix(input);
    if (!new_country.empty())
        return new_country;

    // from here on new_country is empty; look for a country name among the
    // tokens of the input
    bool too_many_countries = false;
    bool bad_cap = false;
    vector<string> countries = x_Tokenize(input);
    string valid_country;
    string orig_valid_country;

    x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    if (valid_country.empty()) {
        x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    }

    if (!valid_country.empty() && !too_many_countries)
        too_many_countries = ContainsMultipleCountryNames (input);

    if (!valid_country.empty() && too_many_countries && valid_country == input)
    {
        // only repair a missing space after the colon; in every other case
        // of this branch new_country stays empty
        string str1,str2;
        NStr::SplitInTwo(valid_country,":",str1,str2);
        if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
            new_country = str1+": "+str2;

        CCountries::ChangeExtraColonsToCommas(new_country);
    }
    else if(!valid_country.empty() && !too_many_countries)
    {
        // find valid_country in input
        // NOTE(review): the result of Find is used without an NPOS check;
        // presumably orig_valid_country always occurs in input - confirm
        size_t pos = NStr::Find(input,orig_valid_country);
        // save preceding string without trailing spaces or delimiters ":,"
        string before = input.substr(0,pos);

        x_RemoveDelimitersFromEnds(before);
        NStr::TruncateSpacesInPlace(before);
        // save trailing string without initial spaces or delimiters
        string after = input.substr(pos+orig_valid_country.length());
        x_RemoveDelimitersFromEnds(after, true);
        NStr::TruncateSpacesInPlace(after);
        if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
        else new_country = valid_country;
        // the text surrounding the country name is appended after the country:
        // behind ": " when the country value has no colon yet, behind ", " otherwise
        if (!before.empty() || !after.empty()) {
            if (NStr::Find(valid_country, ":") == NPOS) {
                new_country += ": ";
            } else {
                new_country += ", ";
            }
        }
        if (!before.empty())
            new_country += before;
        if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
            new_country += ", ";
        if (!after.empty())
            new_country += after;
        CCountries::ChangeExtraColonsToCommas(new_country);
    }

    return new_country;
}
4613 
4614 
ChangeExtraColonsToCommas(string & country)4615 bool CCountries::ChangeExtraColonsToCommas(string& country)
4616 {
4617     // requested in SQD-4516
4618     bool rval = false;
4619     int count = 0;
4620     for (size_t i = 0; i < country.length(); i++) {
4621         if (country[i] == ':') {
4622             count++;
4623             if (count > 1) {
4624                 country[i] = ',';
4625                 rval = true;
4626             }
4627         }
4628     }
4629     return rval;
4630 }
4631 
4632 
// Run NewFixCountry on 'input' and tidy what follows the first colon.
//
// input                  - submitted country text
// capitalize_after_colon - when true, the text after the colon is passed
//                          through CapitalizeFirstLetterOfEveryWord
//
// Returns:
//   - the NewFixCountry result unchanged when it contains no colon
//   - only the part before the colon when nothing but delimiters and
//     whitespace follows it
//   - otherwise "<part before colon>: <rest>", where the leading delimiters
//     and whitespace of <rest> have been removed
string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
{
    const string country = NewFixCountry (input);

    const SIZE_TYPE colon_pos = NStr::Find(country, ":");
    if (colon_pos == NPOS) {
        return country;
    }

    // skip the colon and any further ':', ',' or whitespace behind it;
    // the terminating '\0' at country[length()] ends the scan
    SIZE_TYPE rest_pos = colon_pos;
    for (;;) {
        const unsigned char ch = country[rest_pos];
        if (ch != ','  &&  ch != ':'  &&  !isspace(ch)) {
            break;
        }
        ++rest_pos;
    }

    string after = country.substr(rest_pos);
    const string country_part = country.substr(0, colon_pos);

    if (after.empty()) {
        // nothing meaningful behind the colon: drop the colon as well
        return (rest_pos > colon_pos) ? country_part : country;
    }

    NStr::TruncateSpacesInPlace(after, NStr::eTrunc_Begin);
    if (capitalize_after_colon) {
        after = CapitalizeFirstLetterOfEveryWord (after);
    }
    return country_part + ": " + after;
}
4660 
4661 
// SubSource Qual Fixups
// Pair / map types shared by the static qualifier-fix tables below; lookups
// use the PNocase_CStr comparator and yield the spelling stored as the value.
typedef SStaticPair<const char*, const char*> TStaticQualFixPair;
typedef CStaticPairArrayMap<const char*, const char*, PNocase_CStr> TStaticQualFixMap;

// Spellings returned by CSubSource::FixDevStageCapitalization
static const TStaticQualFixPair kDevStagePairs[] = {
    { "adult", "adult" },
    { "egg", "egg" },
    { "juvenile", "juvenile" },
    { "larva", "larva" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_DevStagePairs, kDevStagePairs);
4674 
4675 
// Return the spelling stored in sc_DevStagePairs for 'value', or 'value'
// itself when the table has no entry for it.
string CSubSource::FixDevStageCapitalization(const string& value)
{
    const auto known = sc_DevStagePairs.find(value.c_str());
    if (known == sc_DevStagePairs.end()) {
        return value;
    }
    return known->second;
}
4686 
4687 
// Spellings returned by CSubSource::FixCellTypeCapitalization
static const TStaticQualFixPair kCellTypePairs[] = {
    { "hemocyte", "hemocyte" },
    { "hepatocyte", "hepatocyte" },
    { "lymphocyte", "lymphocyte" },
    { "neuroblast", "neuroblast" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_CellTypePairs, kCellTypePairs);
4696 
// Return the spelling stored in sc_CellTypePairs for 'value', or 'value'
// itself when the table has no entry for it.
string CSubSource::FixCellTypeCapitalization(const string& value)
{
    const auto entry = sc_CellTypePairs.find(value.c_str());
    const bool is_known = (entry != sc_CellTypePairs.end());
    return is_known ? string(entry->second) : value;
}
4708 
// Mutex held by s_InitializeQualMaps while it checks and fills the map below
DEFINE_STATIC_FAST_MUTEX(s_QualFixMutex);
// string -> string map ordered with the PNocase comparator
typedef map<string, string, PNocase> TQualFixMap;

// Isolation-source spellings; filled by s_InitializeQualMaps, which sets the
// flag below once loading is done
static TQualFixMap s_IsolationSourceMap;
static bool s_QualFixupMapsInitialized = false;
4714 
s_ProcessQualMapLine(const CTempString & line,TQualFixMap & qual_map)4715 static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
4716 {
4717     vector<CTempString> tokens;
4718     NStr::Split(line, "\t", tokens);
4719     if (tokens.size() > 1) {
4720         qual_map[tokens[0]] = tokens[1];
4721     }
4722 }
4723 
4724 
s_AddOneDataFile(const string & file_name,const string & data_name,const char ** built_in,size_t num_built_in,TQualFixMap & qual_map)4725 void s_AddOneDataFile(const string& file_name, const string& data_name,
4726                       const char **built_in, size_t num_built_in,
4727                       TQualFixMap& qual_map)
4728 {
4729     string file = g_FindDataFile(file_name);
4730     CRef<ILineReader> lr;
4731     if (!file.empty()) {
4732         try {
4733             lr = ILineReader::New(file);
4734         } NCBI_CATCH("s_InitializeQualMaps")
4735     }
4736 
4737     if (lr.Empty()) {
4738         if (built_in == NULL) {
4739             ERR_POST(Note << "No data for " + data_name);
4740         } else {
4741             if (getenv("NCBI_DEBUG")) {
4742                 ERR_POST(Note << "Falling back on built-in data for " + data_name);
4743             }
4744             for (size_t i = 0; i < num_built_in; i++) {
4745                 const char *p = built_in[i];
4746                 s_ProcessQualMapLine(p, qual_map);
4747             }
4748         }
4749     } else {
4750         if (getenv("NCBI_DEBUG")) {
4751             ERR_POST(Note << "Reading from " + file + " for " + data_name);
4752         }
4753         do {
4754             s_ProcessQualMapLine(*++*lr, qual_map);
4755         } while (!lr->AtEOF());
4756     }
4757 }
4758 
4759 #include "isolation_sources.inc"
4760 
s_InitializeQualMaps(void)4761 static void s_InitializeQualMaps(void)
4762 {
4763     CFastMutexGuard GUARD(s_QualFixMutex);
4764     if (s_QualFixupMapsInitialized) {
4765         return;
4766     }
4767 
4768     // tissue types
4769     s_AddOneDataFile("isolation_sources.txt", "isolation sources", (const char **)k_isolation_sources, sizeof(k_isolation_sources) / sizeof(char *), s_IsolationSourceMap);
4770     s_QualFixupMapsInitialized = true;
4771 }
4772 
4773 
4774 
4775 
4776 
// Correct the capitalization of an isolation-source value.
//
// An entry in s_IsolationSourceMap (loaded on first use) wins and is
// returned directly.  Otherwise the value is replaced by the first entry of
// sm_ValidSexQualifierTokens that equals it ignoring case, if any, and the
// result is passed through the host, dev-stage and cell-type fixers in turn.
string CSubSource::FixIsolationSourceCapitalization(const string& value)
{
    s_InitializeQualMaps();

    const auto known = s_IsolationSourceMap.find(value);
    if (known != s_IsolationSourceMap.end()) {
        return known->second;
    }

    string fix = value;
    for (const auto& token : sm_ValidSexQualifierTokens) {
        if (NStr::EqualNocase(fix, token)) {
            fix = token;
            break;
        }
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    fix = FixCellTypeCapitalization(fix);

    return fix;
}
4802 
4803 
// Correct the capitalization of a tissue-type value.
//
// The steps are the same ones applied to isolation sources: an entry in
// s_IsolationSourceMap is returned directly; otherwise a case-insensitive
// match in sm_ValidSexQualifierTokens replaces the value, and the result is
// passed through the host, dev-stage and cell-type fixers.
string CSubSource::FixTissueTypeCapitalization(const string& value)
{
    s_InitializeQualMaps();

    TQualFixMap::iterator found = s_IsolationSourceMap.find(value);
    if (found != s_IsolationSourceMap.end()) {
        return found->second;
    }

    string fix = value;

    // adopt the listed spelling of the first token that matches ignoring case
    const size_t token_count = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
    size_t idx = 0;
    while (idx < token_count  &&  !NStr::EqualNocase(fix, sm_ValidSexQualifierTokens[idx])) {
        ++idx;
    }
    if (idx < token_count) {
        fix = sm_ValidSexQualifierTokens[idx];
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    fix = FixCellTypeCapitalization(fix);

    return fix;
}
4829 
4830 
// Lab-host values get the same treatment as host values: the call is
// forwarded unchanged to COrgMod::FixHostCapitalization.
string CSubSource::FixLabHostCapitalization(const string& value)
{
    return COrgMod::FixHostCapitalization(value);
}
4835 
4836 
FixCapitalization(TSubtype subtype,const string & value)4837 string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
4838 {
4839     string new_val = value;
4840     switch (subtype) {
4841         case CSubSource::eSubtype_sex:
4842             new_val = FixSexQualifierValue(value);
4843             if (NStr::IsBlank(new_val)) {
4844                 new_val = value;
4845             }
4846             break;
4847         case CSubSource::eSubtype_isolation_source:
4848             new_val = FixIsolationSourceCapitalization(value);
4849             break;
4850         case CSubSource::eSubtype_lab_host:
4851             new_val = FixLabHostCapitalization(value);
4852             break;
4853         case CSubSource::eSubtype_tissue_type:
4854             new_val = FixTissueTypeCapitalization(value);
4855             break;
4856         case CSubSource::eSubtype_dev_stage:
4857             new_val = FixDevStageCapitalization(value);
4858             break;
4859         case CSubSource::eSubtype_cell_type:
4860             new_val = FixCellTypeCapitalization(value);
4861             break;
4862         default:
4863             new_val = value;
4864             break;
4865     }
4866     return new_val;
4867 }
4868 
4869 
FixCapitalization()4870 void CSubSource::FixCapitalization()
4871 {
4872     if (!IsSetSubtype() || !IsSetName()) {
4873         return;
4874     }
4875 
4876     TSubtype subtype = GetSubtype();
4877 
4878     if (subtype == CSubSource::eSubtype_sex) {
4879         string upr = GetName();
4880         string lwr = upr;
4881         NStr::ToLower(lwr);
4882         if (! NStr::Equal(upr, lwr)) {
4883             SetName(lwr);
4884         }
4885     }
4886 
4887     const string& name = GetName();
4888 
4889     string new_val = FixCapitalization(subtype, name);
4890 
4891     if (!NStr::IsBlank(new_val)) {
4892         SetName(new_val);
4893     }
4894 
4895 }
4896 
4897 
AutoFix(TSubtype subtype,const string & value)4898 string CSubSource::AutoFix(TSubtype subtype, const string& value)
4899 {
4900     string new_val;
4901     switch (subtype) {
4902         case CSubSource::eSubtype_country:
4903             new_val = CCountries::NewFixCountry(value);
4904             break;
4905         case CSubSource::eSubtype_collection_date:
4906             new_val = FixDateFormat(value);
4907             break;
4908         case CSubSource::eSubtype_lat_lon:
4909             new_val = FixLatLonFormat(value);
4910             break;
4911         case CSubSource::eSubtype_sex:
4912             new_val = FixSexQualifierValue(value);
4913             break;
4914         case CSubSource::eSubtype_altitude:
4915             new_val = FixAltitude(value);
4916             break;
4917         default:
4918             break;
4919     }
4920     return new_val;
4921 }
4922 
4923 
AutoFix()4924 void CSubSource::AutoFix()
4925 {
4926     if (!IsSetSubtype() || !IsSetName()) {
4927         return;
4928     }
4929 
4930     TSubtype subtype = GetSubtype();
4931     string new_val = AutoFix(subtype, GetName());
4932 
4933     if (!NStr::IsBlank(new_val)) {
4934         SetName(new_val);
4935     } else if (subtype == CSubSource::eSubtype_sex) {
4936         string upr = GetName();
4937         string lwr = upr;
4938         NStr::ToLower(lwr);
4939         if (! NStr::Equal(upr, lwr)) {
4940             SetName(lwr);
4941         }
4942     }
4943 }
4944 
4945 
4946 
// NOTE (for two arrays below): If string A is a prefix of string B, string B should be placed
// BEFORE string A. I.e. longer string should be earlier

// NULL-terminated list of wizard-generated notes.  HasCultureNotes() reports
// a value that contains any of these (case-insensitive substring search) and
// RemoveCultureNotes() deletes every occurrence, processing the entries in
// the order listed here.
static const char * s_RemovableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]",
    "[BankIt_cultured16S_wizard]",
    "[BankIt_organellerRNA_wizard]",
    "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
    "[BankIt_ITS_wizard]",
    "[uncultured (using universal primers)]",
    "[uncultured (using universal primers) bacterial source]",
    "[cultured bacterial source]",
    "[enrichment culture bacterial source]",
    "[mixed bacterial source (cultured and uncultured)]",
    "[uncultured]; [universal primers]",
    "[mixed bacterial source]",
    "[virus wizard]",
    "[cDNA derived from mRNA, purified viral particles]",
    "[cDNA derived from mRNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, purified viral particles]",
    "[universal primers]",
    "[uncultured; wizard]",
    "[uncultured; wizard; spans unknown]",
    "[cultured; wizard]",
    "[cultured; wizard; spans unknown]",
    "[intergenic wizard]",
    "[intergenic wizard; spans unknown]",
    "[Microsatellite wizard]",
    "[Microsatellite wizard; multiple repeats]",
    "[D-loop wizard]",
    "[D-loop wizard; spans unknown]",
    "[D-loop wizard; spans known]",
    NULL  // sentinel: loops over this array stop here
};
4983 
// NULL-terminated list of notes that are matched against the WHOLE value
// (case-insensitive equality, not substring search).  HasCultureNotes()
// reports such a value; RemoveCultureNotes() replaces it with
// "amplified with species-specific primers" when is_species_level is true.
static const char * s_ReplaceableCultureNotes[] = {
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]",
 "[uncultured (with species-specific primers)]",
 "[uncultured]; [amplified with species-specific primers]",
 "[uncultured (using species-specific primers) bacterial source]",
 "[amplified with species-specific primers]",
 NULL  // sentinel: loops over this array stop here
};
4994 
4995 
HasCultureNotes(const string & value)4996 bool CSubSource::HasCultureNotes(const string& value)
4997 {
4998     for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
4999         size_t pos = NStr::FindNoCase(value, s_RemovableCultureNotes[i]);
5000         if (pos != string::npos) {
5001             return true;
5002         }
5003     }
5004     for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5005         if (NStr::EqualNocase(value, s_ReplaceableCultureNotes[i])) {
5006             return true;
5007         }
5008     }
5009     return false;
5010 }
5011 
5012 
RemoveCultureNotes(string & value,bool is_species_level)5013 void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
5014 {
5015     if (NStr::IsBlank(value)) {
5016         return;
5017     }
5018 
5019     for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5020         string to_remove = s_RemovableCultureNotes[i];
5021         size_t remove_len = to_remove.length();
5022         size_t pos = NStr::FindNoCase(value, to_remove);
5023         while (pos != NPOS) {
5024             size_t extra_len = strspn (value.c_str() + pos + remove_len, " ;");
5025             value = value.substr(0, pos) + value.substr(pos + remove_len + extra_len);
5026             pos = NStr::FindNoCase(value, to_remove);
5027         }
5028     }
5029     // remove leading/trailing semicolons
5030     while (NStr::StartsWith(value, " ") || NStr::StartsWith(value, ";")) {
5031         value = value.substr(1);
5032     }
5033     while (NStr::EndsWith(value, " ") || NStr::EndsWith(value, ";")) {
5034         value = value.substr(0, value.length() - 1);
5035     }
5036 
5037     if (is_species_level) {
5038         for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5039             if (NStr::EqualNocase(value, s_ReplaceableCultureNotes[i])) {
5040                 value = "amplified with species-specific primers";
5041                 break;
5042             }
5043         }
5044     }
5045 }
5046 
5047 
RemoveCultureNotes(bool is_species_level)5048 void CSubSource::RemoveCultureNotes (bool is_species_level)
5049 {
5050     if (IsSetName()) {
5051         RemoveCultureNotes(SetName(), is_species_level);
5052         if (NStr::IsBlank(GetName())) {
5053             ResetName();
5054         }
5055     }
5056 }
5057 
5058 
5059 // CCountryLine
CCountryLine(const string & country_name,double y,double min_x,double max_x,double scale)5060 CCountryLine::CCountryLine
5061 (const string & country_name, double y, double min_x, double max_x, double scale)
5062 : m_CountryName(country_name) ,
5063   m_Scale (scale)
5064 {
5065     m_Y = x_ConvertLat(y);
5066     m_MinX = x_ConvertLon(min_x);
5067     m_MaxX = x_ConvertLon(max_x);
5068 
5069 }
5070 
5071 
~CCountryLine(void)5072 CCountryLine::~CCountryLine (void)
5073 {
5074 }
5075 
5076 
// Small bias added to the magnitude of a scaled coordinate before it is
// truncated to int (see ConvertLat / ConvertLon), so that a product that
// lands just below a whole number still truncates to that number.
#define EPSILON 0.001
5078 
ConvertLat(double y,double scale)5079 int CCountryLine::ConvertLat (double y, double scale)
5080 {
5081 
5082     int  val = 0;
5083 
5084     if (y < -90.0) {
5085         y = -90.0;
5086     }
5087     if (y > 90.0) {
5088         y = 90.0;
5089     }
5090 
5091     if (y > 0) {
5092         val = (int) (y * scale + EPSILON);
5093     } else {
5094         val = (int) (-(-y * scale + EPSILON));
5095     }
5096 
5097     return val;
5098 }
5099 
5100 
x_ConvertLat(double y)5101 int CCountryLine::x_ConvertLat (double y)
5102 {
5103     return ConvertLat(y, m_Scale);
5104 }
5105 
ConvertLon(double x,double scale)5106 int CCountryLine::ConvertLon (double x, double scale)
5107 {
5108 
5109   int  val = 0;
5110 
5111   if (x < -180.0) {
5112     x = -180.0;
5113   }
5114   if (x > 180.0) {
5115     x = 180.0;
5116   }
5117 
5118   if (x > 0) {
5119     val = (int) (x * scale + EPSILON);
5120   } else {
5121     val = (int) (-(-x * scale + EPSILON));
5122   }
5123 
5124   return val;
5125 }
5126 
5127 
x_ConvertLon(double x)5128 int CCountryLine::x_ConvertLon (double x)
5129 {
5130     return ConvertLon(x, m_Scale);
5131 }
5132 
5133 
CCountryExtreme(const string & country_name,int min_x,int min_y,int max_x,int max_y)5134 CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
5135 : m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
5136 {
5137     m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);
5138     size_t pos = NStr::Find(country_name, ":");
5139     if (pos == NPOS) {
5140         m_Level0 = country_name;
5141         m_Level1.clear();
5142     } else {
5143         m_Level0 = country_name.substr(0, pos);
5144         NStr::TruncateSpacesInPlace(m_Level0);
5145         m_Level1 = country_name.substr(pos + 1);
5146         NStr::TruncateSpacesInPlace(m_Level1);
5147     }
5148 
5149 }
5150 
5151 
~CCountryExtreme(void)5152 CCountryExtreme::~CCountryExtreme (void)
5153 {
5154 
5155 }
5156 
5157 
SetMinX(int min_x)5158 bool CCountryExtreme::SetMinX(int min_x)
5159 {
5160     if (min_x < m_MinX) {
5161         m_MinX = min_x;
5162         return true;
5163     } else {
5164         return false;
5165     }
5166 }
5167 
5168 
SetMaxX(int max_x)5169 bool CCountryExtreme::SetMaxX(int max_x)
5170 {
5171     if (max_x > m_MaxX) {
5172         m_MaxX = max_x;
5173         return true;
5174     } else {
5175         return false;
5176     }
5177 }
5178 
5179 
SetMinY(int min_y)5180 bool CCountryExtreme::SetMinY(int min_y)
5181 {
5182     if (min_y < m_MinY) {
5183         m_MinY = min_y;
5184         return true;
5185     } else {
5186         return false;
5187     }
5188 }
5189 
5190 
SetMaxY(int max_y)5191 bool CCountryExtreme::SetMaxY(int max_y)
5192 {
5193     if (max_y > m_MaxY) {
5194         m_MaxY = max_y;
5195         return true;
5196     } else {
5197         return false;
5198     }
5199 }
5200 
5201 
AddLine(const CCountryLine * line)5202 void CCountryExtreme::AddLine(const CCountryLine *line)
5203 {
5204     if (line) {
5205         SetMinX(line->GetMinX());
5206         SetMaxX(line->GetMaxX());
5207         SetMinY(line->GetY());
5208         SetMaxY(line->GetY());
5209         m_Area += 1 + line->GetMaxX() - line->GetMinX();
5210     }
5211 }
5212 
5213 
DoesOverlap(const CCountryExtreme * other_block) const5214 bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
5215 {
5216     if (!other_block) {
5217         return false;
5218     } else if (m_MaxX >= other_block->GetMinX()
5219         && m_MaxX <= other_block->GetMaxX()
5220         && m_MaxY >= other_block->GetMinY()
5221         && m_MinY <= other_block->GetMaxY()) {
5222         return true;
5223     } else if (other_block->GetMaxX() >= m_MinX
5224         && other_block->GetMaxX() <= m_MaxX
5225         && other_block->GetMaxY() >= m_MinY
5226         && other_block->GetMinY() <= m_MaxY) {
5227         return true;
5228     } else {
5229         return false;
5230     }
5231 }
5232 
5233 
PreferTo(const CCountryExtreme * other_block,const string country,const string province,const bool prefer_new) const5234 bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
5235 {
5236     if (!other_block) {
5237         return true;
5238     }
5239 
5240     // if no preferred country, these are equal
5241     if (NStr::IsBlank(country)) {
5242         return prefer_new;
5243     }
5244 
5245     // if match to preferred country
5246     if (NStr::EqualNocase(country, m_Level0)) {
5247         // if best was not preferred country, take new match
5248         if (!NStr::EqualNocase(country, other_block->GetLevel0())) {
5249             return true;
5250         }
5251         // if match to preferred province
5252         if (!NStr::IsBlank(province) && NStr::EqualNocase(province, m_Level1)) {
5253             // if best was not preferred province, take new match
5254             if (!NStr::EqualNocase(province, other_block->GetLevel1())) {
5255                 return true;
5256             }
5257         }
5258 
5259         // if both match province, or neither does, or no preferred province, take smallest
5260         return prefer_new;
5261     }
5262 
5263     // if best matches preferred country, keep
5264     if (NStr::EqualNocase(country, other_block->GetLevel0())) {
5265         return false;
5266     }
5267 
5268     // otherwise take smallest
5269     return prefer_new;
5270 }
5271 
5272 
CLatLonCountryId(float lat,float lon)5273 CLatLonCountryId::CLatLonCountryId(float lat, float lon)
5274     : m_Lat(lat),
5275       m_Lon(lon),
5276       m_LandDistance(-1),
5277       m_WaterDistance(-1),
5278       m_ClaimedDistance(-1)
5279 {}
5280 
5281 
Classify(string country,string province)5282 CLatLonCountryId::TClassificationFlags CLatLonCountryId::Classify(string country, string province)
5283 {
5284     CLatLonCountryId::TClassificationFlags rval = 0;
5285 
5286     // compare guesses or closest regions to indicated country and province
5287     if (!NStr::IsBlank(GetGuessCountry())) {
5288         // if top level countries match
5289         if (NStr::EqualNocase(country, GetGuessCountry())) {
5290             rval |= CLatLonCountryId::fCountryMatch;
5291             // if both are empty, still call it a match
5292             if (NStr::EqualNocase(province, GetGuessProvince())) {
5293                 rval |= CLatLonCountryId::fProvinceMatch;
5294             }
5295         }
5296         // if they don't match, are they closest?
5297         if (!(rval & CLatLonCountryId::fCountryMatch)) {
5298             if (NStr::EqualNocase(country, GetClosestCountry())) {
5299                 rval |= CLatLonCountryId::fCountryClosest;
5300                 if (NStr::EqualNocase(province, GetClosestProvince())) {
5301                     rval |= CLatLonCountryId::fProvinceClosest;
5302                 }
5303             }
5304         } else if (!(rval & CLatLonCountryId::fProvinceMatch) && !NStr::IsBlank(province)) {
5305             if (NStr::EqualNocase (province, GetClosestProvince())) {
5306                 rval |= CLatLonCountryId::fProvinceClosest;
5307             }
5308         }
5309     }
5310 
5311     if (!NStr::IsBlank(GetGuessWater())) {
5312         // was the non-approved body of water correctly indicated?
5313         if (NStr::EqualNocase(country, GetGuessWater())) {
5314             rval |= CLatLonCountryId::fWaterMatch;
5315         } else if (NStr::EqualNocase(country, GetClosestWater())) {
5316             rval |= CLatLonCountryId::fWaterClosest;
5317         }
5318     }
5319 
5320     if (!NStr::IsBlank(GetClosestCountry()) && NStr::EqualNocase(country, GetClosestCountry())) {
5321         if (NStr::IsBlank(GetGuessCountry()) && NStr::IsBlank(GetGuessWater())) {
5322             rval |= CLatLonCountryId::fCountryMatch;
5323             SetGuessCountry(GetClosestCountry());
5324             SetFullGuess(GetClosestCountry());
5325             if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
5326                 rval |= CLatLonCountryId::fProvinceMatch;
5327                 SetGuessProvince(GetClosestProvince());
5328                 SetFullGuess(GetClosestFull());
5329             }
5330         } else {
5331             rval |= CLatLonCountryId::fCountryClosest;
5332             if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
5333                 rval |= CLatLonCountryId::fProvinceClosest;
5334             }
5335         }
5336     }
5337     return rval;
5338 }
5339 
5340 
~CLatLonCountryId(void)5341 CLatLonCountryId::~CLatLonCountryId(void)
5342 {
5343 }
5344 
5345 
// Built-in fallback data, used by the CLatLonCountryMap constructor when no
// data file can be read.  Each .inc file is expected to define the string
// array named on the line that follows it (not visible from here).

// country outlines
#include "lat_lon_country.inc"
static const size_t k_NumLatLonCountryText = ArraySize(s_DefaultLatLonCountryText);

// bodies of water
#include "lat_lon_water.inc"
static const size_t k_NumLatLonWaterText = ArraySize(s_DefaultLatLonWaterText);
5351 
x_InitFromDefaultList(const char * const * list,int num)5352 void CLatLonCountryMap::x_InitFromDefaultList(const char * const *list, int num)
5353 {
5354     if (getenv("NCBI_DEBUG")) {
5355         ERR_POST(Note << "Falling back on built-in data for latlon / water data.");
5356     }
5357       // initialize list of country lines
5358     m_CountryLineList.clear();
5359     m_Scale = 20.0;
5360     string current_country;
5361 
5362     for (int i = 0; i < num; i++) {
5363         CTempString line = list[i];
5364         if (line[0] == '-') {
5365             // skip comment
5366         } else if (isalpha ((unsigned char)line[0])) {
5367             current_country = line;
5368         } else if (isdigit ((unsigned char)line[0])) {
5369             m_Scale = NStr::StringToDouble(line);
5370         } else {
5371             vector<string> tokens;
5372              NStr::Split(line, "\t", tokens);
5373             if (tokens.size() > 3) {
5374                 double x = NStr::StringToDouble(tokens[1]);
5375                 for (size_t j = 2; j < tokens.size() - 1; j+=2) {
5376                     m_CountryLineList.push_back(new CCountryLine(current_country, x, NStr::StringToDouble(tokens[j]), NStr::StringToDouble(tokens[j + 1]), m_Scale));
5377                 }
5378             }
5379         }
5380     }
5381 }
5382 
5383 
5384 
5385 
x_InitFromFile(const string & filename)5386 bool CLatLonCountryMap::x_InitFromFile(const string& filename)
5387 {
5388     string fname = g_FindDataFile (filename);
5389     if (NStr::IsBlank (fname)) {
5390         return false;
5391     }
5392     if (getenv("NCBI_DEBUG")) {
5393         ERR_POST(Note << "Reading from " + filename + " for latlon/water data.");
5394     }
5395     CRef<ILineReader> lr = ILineReader::New (fname);
5396     if (lr.Empty()) {
5397         return false;
5398     } else {
5399         m_Scale = 20.0;
5400         string current_country;
5401 
5402         // make sure to clear before using.  in this outer
5403         // scope in the interest of speed (avoid repeated
5404         // construction/destruction)
5405         vector<SIZE_TYPE> tab_positions;
5406 
5407         do {
5408             // const string& line = *++*lr;
5409             CTempString line = *++*lr;
5410             if (line[0] == '-') {
5411                 // skip comment
5412             } else if (isalpha ((unsigned char)line[0])) {
5413                 current_country = line;
5414             } else if (isdigit ((unsigned char)line[0])) {
5415                 m_Scale = NStr::StringToDouble(line);
5416             } else {
5417                 // NStr::Tokenize would be much simpler, but
5418                 // it's just too slow in this case, especially
5419                 // in debug mode.
5420 
5421                 // for the future, if we need even more speed,
5422                 // it should be possible to eliminate the tab_positions
5423                 // vector and collect tab positions on the fly without
5424                 // any heap-allocated memory
5425 
5426                 // find position of all tabs on this line
5427                 tab_positions.clear();
5428                 SIZE_TYPE tab_pos = line.find('\t');
5429                 while( tab_pos != NPOS ) {
5430                     tab_positions.push_back(tab_pos);
5431                     tab_pos = line.find('\t', tab_pos+1);
5432                 }
5433                 // an imaginary sentinel tab
5434                 tab_positions.push_back(line.length());
5435 
5436                 const char * line_start = line.data();
5437                 if( tab_positions.size() >= 4 ) {
5438                     CTempString y_str( line_start + tab_positions[0]+1, tab_positions[1] - tab_positions[0] - 1 );
5439                     double y = NStr::StringToDouble( y_str );
5440 
5441                     // convert into line list
5442                     for (size_t j = 1; j < tab_positions.size() - 2; j+=2) {
5443                         const SIZE_TYPE pos1 = tab_positions[j];
5444                         const SIZE_TYPE pos2 = tab_positions[j+1];
5445                         const SIZE_TYPE pos3 = tab_positions[j+2];
5446                         CTempString first_num( line_start + pos1 + 1, pos2 - pos1 - 1 );
5447                         CTempString second_num( line_start + pos2 + 1, pos3 - pos2 - 1 );
5448                         m_CountryLineList.push_back(new CCountryLine(current_country, y, NStr::StringToDouble(first_num), NStr::StringToDouble(second_num), m_Scale));
5449                     }
5450                 }
5451             }
5452         } while ( !lr->AtEOF() );
5453 
5454         return true;
5455     }
5456 }
5457 
5458 bool
s_CompareTwoLinesByLatLonOnly(const CCountryLine * line1,const CCountryLine * line2)5459 CLatLonCountryMap::s_CompareTwoLinesByLatLonOnly(
5460     const CCountryLine* line1,
5461     const CCountryLine* line2)
5462 {
5463     if (line1->GetY() < line2->GetY()) {
5464         return true;
5465     } else if (line1->GetY() > line2->GetY()) {
5466         return false;
5467     } else {
5468         if (line1->GetMinX() < line2->GetMinX()) {
5469             return true;
5470         } else {
5471             return false;
5472         }
5473     }
5474 }
5475 
5476 bool CLatLonCountryMap::
s_CompareTwoLinesByCountry(const CCountryLine * line1,const CCountryLine * line2)5477         s_CompareTwoLinesByCountry(const CCountryLine* line1,
5478                                     const CCountryLine* line2)
5479 {
5480     int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5481     if (cmp == 0) {
5482         return s_CompareTwoLinesByLatLonOnly(line1, line2);
5483     } else if (cmp < 0) {
5484         return true;
5485     } else {
5486         return false;
5487     }
5488 }
5489 
5490 
5491 bool CLatLonCountryMap::
s_CompareTwoLinesByLatLonThenCountry(const CCountryLine * line1,const CCountryLine * line2)5492         s_CompareTwoLinesByLatLonThenCountry(const CCountryLine* line1,
5493                                     const CCountryLine* line2)
5494 {
5495     if (line1->GetY() < line2->GetY()) {
5496         return true;
5497     } else if (line1->GetY() > line2->GetY()) {
5498         return false;
5499     } if (line1->GetMinX() < line2->GetMinX()) {
5500         return true;
5501     } else if (line1->GetMinX() > line2->GetMinX()) {
5502         return false;
5503     } else if (line1->GetMaxX() < line2->GetMaxX()) {
5504         return true;
5505     } else if (line1->GetMaxX() > line2->GetMaxX()) {
5506         return false;
5507     } else {
5508         int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5509         if (cmp < 0) {
5510             return true;
5511         } else {
5512             return false;
5513         }
5514     }
5515 }
5516 
5517 
CLatLonCountryMap(bool is_water)5518 CLatLonCountryMap::CLatLonCountryMap (bool is_water)
5519 {
5520     // initialize list of country lines
5521     m_CountryLineList.clear();
5522 
5523     const char* env_val = getenv("NCBI_LAT_LON_DATA_PATH");
5524     string data_path;
5525     if (env_val) {
5526         data_path = (string) env_val;
5527         if (! NStr::EndsWith(data_path, "/")) {
5528             data_path = data_path + "/";
5529         }
5530     }
5531 
5532     if (is_water) {
5533         if (!x_InitFromFile("lat_lon_water.txt")) {
5534             if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_water.txt")) {
5535                 x_InitFromDefaultList(s_DefaultLatLonWaterText, k_NumLatLonWaterText);
5536             }
5537         }
5538     } else {
5539         if (!x_InitFromFile("lat_lon_country.txt")) {
5540             if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_country.txt")) {
5541                 x_InitFromDefaultList(s_DefaultLatLonCountryText, k_NumLatLonCountryText);
5542             }
5543         }
5544     }
5545 
5546     // Instead of doing a plain sort, we take advantage of the fact that
5547     // there are few unique country names versus the number
5548     // of lines.
5549     typedef map<CTempString, TCountryLineList, PNocase> TCountryToLinesMap;
5550     // this map maps a country name (case insens) to all the lines that
5551     // belong to that country.
5552     TCountryToLinesMap countryToLinesMap;
5553     ITERATE(TCountryLineList, line_it, m_CountryLineList) {
5554         countryToLinesMap[(*line_it)->GetCountry()].push_back(*line_it);
5555     }
5556 
5557     // build new m_CountryLineList here:
5558     TCountryLineList new_country_line_list;
5559     NON_CONST_ITERATE(TCountryToLinesMap, country_lines_it, countryToLinesMap)
5560     {
5561         // sort the lines for each country by lat/lon only, since we've already
5562         // implicitly sorted by country in countryToLinesMap
5563         TCountryLineList & line_list_for_this_country =
5564             country_lines_it->second;
5565         stable_sort(
5566             BEGIN_COMMA_END(line_list_for_this_country),
5567             s_CompareTwoLinesByLatLonOnly);
5568         copy(BEGIN_COMMA_END(line_list_for_this_country),
5569              back_inserter(new_country_line_list));
5570     }
5571     // swap should be constant time
5572     m_CountryLineList.swap(new_country_line_list);
5573 
5574     // set up extremes index and copy into LatLon index
5575     m_CountryExtremes.clear();
5576     m_LatLonSortedList.clear();
5577       size_t i, ext = 0;
5578 
5579     for (i = 0; i < m_CountryLineList.size(); i++) {
5580         if (ext > 0 && NStr::Equal(m_CountryLineList[i]->GetCountry(), m_CountryExtremes[ext - 1]->GetCountry())) {
5581             m_CountryExtremes[ext - 1]->AddLine(m_CountryLineList[i]);
5582         } else {
5583             m_CountryExtremes.push_back(new CCountryExtreme(m_CountryLineList[i]->GetCountry(),
5584                                                 m_CountryLineList[i]->GetMinX(),
5585                                                 m_CountryLineList[i]->GetY(),
5586                                                 m_CountryLineList[i]->GetMaxX(),
5587                                                 m_CountryLineList[i]->GetY()));
5588             ext++;
5589         }
5590         m_LatLonSortedList.push_back(m_CountryLineList[i]);
5591         m_CountryLineList[i]->SetBlock(m_CountryExtremes[ext - 1]);
5592     }
5593     sort (m_LatLonSortedList.begin(), m_LatLonSortedList.end(), s_CompareTwoLinesByLatLonThenCountry);
5594 
5595 }
5596 
5597 
~CLatLonCountryMap(void)5598 CLatLonCountryMap::~CLatLonCountryMap (void)
5599 {
5600       size_t i;
5601 
5602     for (i = 0; i < m_CountryLineList.size(); i++) {
5603         delete (m_CountryLineList[i]);
5604     }
5605     m_CountryLineList.clear();
5606 
5607     for (i = 0; i < m_CountryExtremes.size(); i++) {
5608         delete (m_CountryExtremes[i]);
5609     }
5610     m_CountryExtremes.clear();
5611     // note - do not delete items in m_LatLonSortedList, they are pointing to the same objects as m_CountryLineList
5612     m_LatLonSortedList.clear();
5613 }
5614 
5615 
IsCountryInLatLon(const string & country,double lat,double lon)5616 bool CLatLonCountryMap::IsCountryInLatLon(const string& country, double lat,
5617                                           double lon)
5618 {
5619     int x = CCountryLine::ConvertLon(lon, m_Scale);
5620     int y = CCountryLine::ConvertLat(lat, m_Scale);
5621 
5622     size_t L, R, mid;
5623 
5624     L = 0;
5625     R = m_CountryLineList.size() - 1;
5626     mid = 0;
5627 
5628     while (L < R) {
5629         mid = (L + R) / 2;
5630         int cmp = NStr::Compare(m_CountryLineList[mid]->GetCountry(), country);
5631         if (cmp < 0) {
5632             L = mid + 1;
5633         } else if (cmp > 0) {
5634             R = mid;
5635         } else {
5636             while (mid > 0
5637                    && NStr::Compare(m_CountryLineList[mid - 1]->GetCountry(), country) == 0
5638                    && m_CountryLineList[mid - 1]->GetY() >= y) {
5639                 mid--;
5640             }
5641             L = mid;
5642             R = mid;
5643         }
5644     }
5645 
5646     while (R < m_CountryLineList.size()
5647            && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5648            && m_CountryLineList[R]->GetY() < y) {
5649         R++;
5650     }
5651 
5652     while (R < m_CountryLineList.size()
5653            && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5654            && m_CountryLineList[R]->GetY() == y
5655            && m_CountryLineList[R]->GetMaxX() < x) {
5656         R++;
5657     }
5658     if (R < m_CountryLineList.size()
5659            && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5660            && m_CountryLineList[R]->GetY() == y
5661            && m_CountryLineList[R]->GetMinX() <= x
5662            && m_CountryLineList[R]->GetMaxX() >= x) {
5663         return true;
5664     } else {
5665         return false;
5666     }
5667 }
5668 
5669 
5670 const CCountryExtreme *
x_FindCountryExtreme(const string & country)5671 CLatLonCountryMap::x_FindCountryExtreme(const string& country)
5672 {
5673     size_t L, R, mid;
5674 
5675     if (NStr::IsBlank (country)) return NULL;
5676 
5677     L = 0;
5678     R = m_CountryExtremes.size() - 1;
5679 
5680     while (L < R) {
5681         mid = (L + R) / 2;
5682         if (NStr::CompareNocase(m_CountryExtremes[mid]->GetCountry(), country) < 0) {
5683             L = mid + 1;
5684         } else {
5685             R = mid;
5686         }
5687     }
5688     if (!NStr::EqualNocase(m_CountryExtremes[R]->GetCountry(), country)) {
5689         return NULL;
5690     } else {
5691         return m_CountryExtremes[R];
5692     }
5693 }
5694 
5695 
HaveLatLonForRegion(const string & region)5696 bool CLatLonCountryMap::HaveLatLonForRegion(const string& region)
5697 {
5698     if (x_FindCountryExtreme(region) == NULL) {
5699         return false;
5700     } else {
5701         return true;
5702     }
5703 }
5704 
5705 
x_GetLatStartIndex(int y)5706 size_t CLatLonCountryMap::x_GetLatStartIndex (int y)
5707 {
5708     size_t L, R, mid;
5709 
5710     L = 0;
5711     R = m_LatLonSortedList.size() - 1;
5712     mid = 0;
5713 
5714     while (L < R) {
5715         mid = (L + R) / 2;
5716         if (m_LatLonSortedList[mid]->GetY() < y) {
5717             L = mid + 1;
5718         } else if (m_LatLonSortedList[mid]->GetY() > y) {
5719             R = mid;
5720         } else {
5721             while (mid > 0 && m_LatLonSortedList[mid - 1]->GetY() == y) {
5722                 mid--;
5723             }
5724             L = mid;
5725             R = mid;
5726         }
5727     }
5728     return R;
5729 }
5730 
5731 
5732 const CCountryExtreme *
GuessRegionForLatLon(double lat,double lon,const string & country,const string & province)5733 CLatLonCountryMap::GuessRegionForLatLon(double lat, double lon,
5734                                         const string& country,
5735                                         const string& province)
5736 {
5737     int x = CCountryLine::ConvertLon(lon, m_Scale);
5738     int y = CCountryLine::ConvertLon(lat, m_Scale);
5739 
5740     size_t R = x_GetLatStartIndex(y);
5741 
5742     const CCountryExtreme *best = NULL;
5743 
5744     while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() == y) {
5745             if (m_LatLonSortedList[R]->GetMinX() <= x
5746             && m_LatLonSortedList[R]->GetMaxX() >= x) {
5747             const CCountryExtreme *other = m_LatLonSortedList[R]->GetBlock();
5748             if (best == NULL) {
5749                 best = other;
5750             } else if (!best->PreferTo(other, country, province, (bool)(best->GetArea() <= other->GetArea()))) {
5751                 best = other;
5752             }
5753              }
5754         R++;
5755       }
5756       return best;
5757 }
5758 
5759 
//Distance on a spherical surface calculation adapted from
//http://www.linuxjournal.com/magazine/
//work-shell-calculating-distance-between-two-latitudelongitude-points

#define EARTH_RADIUS 6371.0 /* average radius of non-spherical earth in kilometers */
#define CONST_PI 3.14159265359

// Convert an angle from degrees to radians.
static double DegreesToRadians (
  double degrees
)

{
  return (degrees * (CONST_PI / 180.0));
}

// Great-circle distance in kilometers between point A (latA, lonA) and
// point B (latB, lonB), all in degrees, using the haversine formula on a
// sphere of radius EARTH_RADIUS.
static double DistanceOnGlobe (
  double latA,
  double lonA,
  double latB,
  double lonB
)

{
  const double lat1 = DegreesToRadians (latA);
  const double lon1 = DegreesToRadians (lonA);
  const double lat2 = DegreesToRadians (latB);
  const double lon2 = DegreesToRadians (lonB);

  const double dLat = lat2 - lat1;
  const double dLon = lon2 - lon1;

  double a = sin (dLat / 2) * sin (dLat / 2) +
             cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);

  // Rounding (or a latitude outside +/-90) can push the haversine term
  // marginally outside [0, 1]; the square roots below would then be NaN.
  if (a > 1.0) {
    a = 1.0;
  } else if (a < 0.0) {
    a = 0.0;
  }

  const double c = 2 * atan2 (sqrt (a), sqrt (1 - a));

  return (double) (EARTH_RADIUS * c);
}


// Distance in kilometers spanned by one grid cell at (latA, lonA): the
// great-circle distance from that point to the point 1/scale degrees
// further in both latitude and longitude.
double ErrorDistance (
  double latA,
  double lonA,
  double scale)
{
  const double step = 1.0 / scale;

  return DistanceOnGlobe (latA, lonA, latA + step, lonA + step);
}
5825 
5826 
FindClosestToLatLon(double lat,double lon,double range,double & distance)5827 const CCountryExtreme * CLatLonCountryMap::FindClosestToLatLon(double lat,
5828                                                                double lon,
5829                                                                double range,
5830                                                                double &distance)
5831 {
5832     int x = CCountryLine::ConvertLon(lon, m_Scale);
5833     int y = CCountryLine::ConvertLon(lat, m_Scale);
5834 
5835     int maxDelta = (int) (range * m_Scale + EPSILON);
5836     int min_y = y - maxDelta;
5837     int max_y = y + maxDelta;
5838     int min_x = x - maxDelta;
5839     int max_x = x + maxDelta;
5840 
5841     // binary search to lowest lat
5842     size_t R = x_GetLatStartIndex(min_y);
5843 
5844     double closest = 0.0;
5845     CCountryExtreme *rval = NULL;
5846 
5847     while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5848         if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5849             // out of range, don't bother calculating distance
5850         } else {
5851             double end;
5852             if (x < m_LatLonSortedList[R]->GetMinX()) {
5853                 end = m_LatLonSortedList[R]->GetMinLon();
5854             } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
5855                 end = m_LatLonSortedList[R]->GetMaxLon();
5856             } else {
5857                 end = lon;
5858             }
5859             double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5860             if (rval == NULL || closest > dist
5861                 || (closest == dist
5862                     && (rval->GetArea() > m_LatLonSortedList[R]->GetBlock()->GetArea()
5863                         || (rval->GetArea() == m_LatLonSortedList[R]->GetBlock()->GetArea()
5864                             && NStr::IsBlank(rval->GetLevel1())
5865                             && !NStr::IsBlank(m_LatLonSortedList[R]->GetBlock()->GetLevel1()))))) {
5866                 rval = m_LatLonSortedList[R]->GetBlock();
5867                 closest = dist;
5868             }
5869         }
5870         R++;
5871     }
5872     distance = closest;
5873     return rval;
5874 }
5875 
5876 
IsClosestToLatLon(const string & comp_country,double lat,double lon,double range,double & distance)5877 bool CLatLonCountryMap::IsClosestToLatLon(const string& comp_country,
5878                                           double lat, double lon,
5879                                           double range, double &distance)
5880 {
5881     int x = CCountryLine::ConvertLon(lon, m_Scale);
5882     int y = CCountryLine::ConvertLon(lat, m_Scale);
5883 
5884     int maxDelta = (int) (range * m_Scale + EPSILON);
5885     int min_y = y - maxDelta;
5886     int max_y = y + maxDelta;
5887     int min_x = x - maxDelta;
5888     int max_x = x + maxDelta;
5889 
5890     // binary search to lowest lat
5891     size_t R = x_GetLatStartIndex(min_y);
5892 
5893     string country;
5894     double closest = 0.0;
5895     int smallest_area = -1;
5896 
5897     while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5898         if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5899             // out of range, don't bother calculating distance
5900         } else {
5901             double end;
5902             if (x < m_LatLonSortedList[R]->GetMinX()) {
5903                 end = m_LatLonSortedList[R]->GetMinLon();
5904             } else {
5905                 end = m_LatLonSortedList[R]->GetMaxLon();
5906             }
5907             double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5908             if (NStr::IsBlank (country) || closest > dist) {
5909                 country = m_LatLonSortedList[R]->GetCountry();
5910                 closest = dist;
5911                 const CCountryExtreme * ext = x_FindCountryExtreme(country);
5912                 if (ext) {
5913                     smallest_area = ext->GetArea();
5914                 }
5915             } else if (closest == dist) {
5916                 // if the distances are the same, prefer the input country, otherwise prefer the smaller region
5917                 if (NStr::Equal(country, comp_country)) {
5918                     // keep country we're searching for
5919                 } else if (!NStr::Equal(m_LatLonSortedList[R]->GetCountry(), country)) {
5920                     const CCountryExtreme * ext = x_FindCountryExtreme(m_LatLonSortedList[R]->GetCountry());
5921                     if (ext
5922                         && (ext->GetArea() < smallest_area
5923                             || NStr::Equal(m_LatLonSortedList[R]->GetCountry(), comp_country))) {
5924                         country = m_LatLonSortedList[R]->GetCountry();
5925                         smallest_area = ext->GetArea();
5926                     }
5927                 }
5928             }
5929         }
5930         R++;
5931     }
5932     distance = closest;
5933     return NStr::Equal(country, comp_country);
5934 }
5935 
5936 
IsNearLatLon(double lat,double lon,double range,double & distance,const string & country,const string & province)5937 const CCountryExtreme * CLatLonCountryMap::IsNearLatLon(double lat, double lon,
5938                                                         double range,
5939                                                         double &distance,
5940                                                         const string& country,
5941                                                         const string& province)
5942 {
5943     int x = CCountryLine::ConvertLon(lon, m_Scale);
5944     int y = CCountryLine::ConvertLat(lat, m_Scale);
5945     double closest = -1.0;
5946     int maxDelta = (int) (range * m_Scale + EPSILON);
5947     int min_y = y - maxDelta;
5948     int max_y = y + maxDelta;
5949     int min_x = x - maxDelta;
5950     int max_x = x + maxDelta;
5951     CCountryExtreme *ext = NULL;
5952 
5953     // binary search to lowest lat
5954     size_t R = x_GetLatStartIndex(min_y);
5955 
5956     while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5957         if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5958             // out of range, don't bother calculating distance
5959         } else if (!NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel0(), country)) {
5960             // wrong country, skip
5961         } else if (!NStr::IsBlank(province) && !NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel1(), province)) {
5962             // wrong province, skip
5963         } else {
5964             double end;
5965             if (x < m_LatLonSortedList[R]->GetMinX()) {
5966                 end = m_LatLonSortedList[R]->GetMinLon();
5967             } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
5968                 end = m_LatLonSortedList[R]->GetMaxLon();
5969             } else {
5970                 end = lon;
5971             }
5972             double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5973             if (closest < 0.0 ||  closest > dist) {
5974                 closest = dist;
5975                 ext = m_LatLonSortedList[R]->GetBlock();
5976             }
5977         }
5978         R++;
5979     }
5980     distance = closest;
5981     return ext;
5982 }
5983 
5984 
5985 
5986 
5987 
DoCountryBoxesOverlap(const string & country1,const string & country2)5988 bool CLatLonCountryMap::DoCountryBoxesOverlap(const string& country1,
5989                                               const string& country2)
5990 {
5991     if (NStr::IsBlank (country1) || NStr::IsBlank(country2)) return false;
5992 
5993     const CCountryExtreme *ext1 = x_FindCountryExtreme (country1);
5994     if (!ext1) {
5995         return false;
5996     }
5997     const CCountryExtreme *ext2 = x_FindCountryExtreme (country2);
5998     if (!ext2) {
5999         return false;
6000     }
6001 
6002 
6003     return ext1->DoesOverlap(ext2);
6004 }
6005 
6006 
AdjustAndRoundDistance(double distance,double scale)6007 int CLatLonCountryMap::AdjustAndRoundDistance (double distance, double scale)
6008 
6009 {
6010   if (scale < 1.1) {
6011     distance += 111.19;
6012   } else if (scale > 19.5 && scale < 20.5) {
6013     distance += 5.56;
6014   } else if (scale > 99.5 && scale < 100.5) {
6015     distance += 1.11;
6016   }
6017 
6018   return (int) (distance + 0.5);
6019 }
6020 
6021 
AdjustAndRoundDistance(double distance)6022 int CLatLonCountryMap::AdjustAndRoundDistance (double distance)
6023 
6024 {
6025   return AdjustAndRoundDistance (distance, m_Scale);
6026 }
6027 
6028 
6029 
6030 
6031 END_objects_SCOPE // namespace ncbi::objects::
6032 
6033 END_NCBI_SCOPE
6034 
6035 /* Original file checksum: lines: 65, chars: 1891, CRC32: 7724f0c5 */
6036