1 /* $Id: SubSource.cpp 632184 2021-05-27 13:27:21Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: .......
27 *
28 * File Description:
29 * .......
30 *
31 * Remark:
32 * This code was originally generated by application DATATOOL
33 * using the following specifications:
34 * 'seqfeat.asn'.
35 */
36
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <serial/enumvalues.hpp>
40
41 // generated includes
42 #include <objects/seqfeat/SubSource.hpp>
43
44 #include <math.h>
45 #include <objects/misc/sequence_util_macros.hpp>
46 #include <corelib/ncbitime.hpp>
47
48 #include <util/row_reader_ncbi_tsv.hpp>
49
50 // generated classes
51
52 BEGIN_NCBI_SCOPE
53
54 BEGIN_objects_SCOPE // namespace ncbi::objects::
55
56 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
57 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;
58
59
60 // destructor
~CSubSource(void)61 CSubSource::~CSubSource(void)
62 {
63 }
64
GetLabel(string * str) const65 void CSubSource::GetLabel(string* str) const
66 {
67 *str += '/';
68 string type_name;
69 if (GetSubtype() == eSubtype_other) {
70 type_name = "other";
71 } else {
72 try {
73 // eVocabulary_insdc has some special cases not (historically)
74 // used here.
75 type_name = GetSubtypeName(GetSubtype());
76 replace(type_name.begin(), type_name.end(), '_', '-');
77 } catch (CSerialException&) {
78 type_name = "unknown";
79 }
80 }
81 *str += type_name;
82 *str += '=';
83 *str += GetName();
84 if (IsSetAttrib()) {
85 *str += " (";
86 *str += GetAttrib();
87 *str += ")";
88 }
89 }
90
91
GetSubtypeValue(const string & str,EVocabulary vocabulary)92 CSubSource::TSubtype CSubSource::GetSubtypeValue(const string& str,
93 EVocabulary vocabulary)
94 {
95 string name = NStr::TruncateSpaces(str);
96 NStr::ToLower(name);
97 replace(name.begin(), name.end(), '_', '-');
98 replace(name.begin(), name.end(), ' ', '-');
99
100 if ( NStr::EqualNocase(name, "note") ||
101 NStr::EqualNocase(name, "subsource-note") ||
102 NStr::EqualNocase(name, "subsrc-note") ||
103 NStr::EqualNocase(name, "note-subsource")) {
104 return eSubtype_other;
105 } else if (vocabulary == eVocabulary_insdc) {
106 // consider a table if more special cases arise.
107 if (name == "insertion-seq") {
108 return eSubtype_insertion_seq_name;
109 } else if (name == "plasmid") {
110 return eSubtype_plasmid_name;
111 } else if (name == "transposon") {
112 return eSubtype_transposon_name;
113 } else if (name == "sub-clone") {
114 return eSubtype_subclone;
115 }
116 }
117 return ENUM_METHOD_NAME(ESubtype)()->FindValue(name);
118 }
119
120
IsValidSubtypeName(const string & str,EVocabulary vocabulary)121 bool CSubSource::IsValidSubtypeName(const string& str,
122 EVocabulary vocabulary)
123 {
124
125 string name = NStr::TruncateSpaces(str);
126 NStr::ToLower(name);
127 replace(name.begin(), name.end(), '_', '-');
128 replace(name.begin(), name.end(), ' ', '-');
129
130 if ( NStr::EqualNocase(name, "note") ||
131 NStr::EqualNocase(name, "subsource-note") ||
132 NStr::EqualNocase(name, "subsrc-note") ||
133 NStr::EqualNocase(name, "note-subsource")) {
134 return true;
135 }
136 if (vocabulary == eVocabulary_insdc) {
137 // consider a table if more special cases arise.
138 if (name == "insertion-seq" ||
139 name == "plasmid" ||
140 name == "transposon" ||
141 name == "sub-clone") {
142 return true;
143 }
144 }
145 return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
146 }
147
148
GetSubtypeName(CSubSource::TSubtype stype,EVocabulary vocabulary)149 string CSubSource::GetSubtypeName(CSubSource::TSubtype stype,
150 EVocabulary vocabulary)
151 {
152 if (stype == CSubSource::eSubtype_other) {
153 return "note";
154 } else if (vocabulary == eVocabulary_insdc) {
155 switch (stype) {
156 case eSubtype_subclone: return "sub_clone";
157 case eSubtype_plasmid_name: return "plasmid";
158 case eSubtype_transposon_name: return "transposon";
159 case eSubtype_insertion_seq_name: return "insertion_seq";
160 default:
161 return NStr::Replace
162 (ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
163 "-", "_");
164 }
165 } else {
166 return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
167 }
168 }
169
170
171
IsMultipleValuesAllowed(TSubtype subtype)172 bool CSubSource::IsMultipleValuesAllowed(TSubtype subtype)
173 {
174 return subtype != eSubtype_chromosome
175 && subtype != eSubtype_sex
176 && subtype != eSubtype_germline
177 && subtype != eSubtype_rearranged
178 && subtype != eSubtype_plasmid_name
179 && subtype != eSubtype_segment
180 && subtype != eSubtype_country
181 && subtype != eSubtype_transgenic
182 && subtype != eSubtype_environmental_sample
183 && subtype != eSubtype_lat_lon
184 && subtype != eSubtype_collection_date
185 && subtype != eSubtype_collected_by
186 && subtype != eSubtype_identified_by
187 && subtype != eSubtype_fwd_primer_seq
188 && subtype != eSubtype_rev_primer_seq
189 && subtype != eSubtype_fwd_primer_name
190 && subtype != eSubtype_rev_primer_name
191 && subtype != eSubtype_metagenomic
192 && subtype != eSubtype_altitude
193 && subtype != eSubtype_clone;
194 }
195
196
NeedsNoText(const TSubtype & subtype)197 bool CSubSource::NeedsNoText(const TSubtype& subtype)
198 {
199 if (subtype == eSubtype_germline
200 || subtype == eSubtype_rearranged
201 || subtype == eSubtype_transgenic
202 || subtype == eSubtype_environmental_sample
203 || subtype == eSubtype_metagenomic) {
204 return true;
205 } else {
206 return false;
207 }
208 }
209
210
IsDiscouraged(const TSubtype subtype)211 bool CSubSource::IsDiscouraged(const TSubtype subtype)
212 {
213 if (subtype == eSubtype_frequency
214 || subtype == eSubtype_insertion_seq_name
215 || subtype == eSubtype_phenotype
216 || subtype == eSubtype_plastid_name
217 || subtype == eSubtype_transposon_name
218 || subtype == eSubtype_fwd_primer_seq
219 || subtype == eSubtype_rev_primer_seq
220 || subtype == eSubtype_fwd_primer_name
221 || subtype == eSubtype_rev_primer_name
222 || subtype == eSubtype_whole_replicon) { // metagenomic subsrc qualifier taken off this list: GB-3384
223 return true;
224 } else {
225 return false;
226 }
227 }
228
229
IsDayValueOkForMonth(int day,int month,int year)230 bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
231 {
232 if (month < 1 || month > 12 || day < 1) {
233 return false;
234 }
235 bool rval = true;
236 if (year < 100) {
237 year += 2000;
238 } else if (year > 3000) {
239 return false;
240 } else if (year < 1538) {
241 return false;
242 }
243 CTime month_o(year, month, 1);
244 if (day > month_o.DaysInMonth()) {
245 rval = false;
246 }
247 return rval;
248 }
249
250
DateFromCollectionDate(const string & test)251 CRef<CDate> CSubSource::DateFromCollectionDate (const string& test) THROWS((CException))
252 {
253 if (NStr::IsBlank(test)) {
254 NCBI_THROW (CException, eUnknown,
255 "collection-date string is blank");
256 }
257 string str = NStr::TruncateSpaces(test);
258
259 if (IsISOFormatDate(str)) {
260 return GetDateFromISODate(str);
261 }
262
263 size_t pos = NStr::Find(str, "-");
264 string year;
265 string month;
266 string day;
267
268 if (pos == NPOS) {
269 year = str;
270 } else {
271 size_t pos2 = NStr::Find(str, "-", pos + 1);
272 if (pos2 == NPOS) {
273 month = str.substr(0, pos);
274 year = str.substr(pos + 1);
275 if (NStr::IsBlank(month)) {
276 NCBI_THROW (CException, eUnknown,
277 "collection-date string is improperly formatted");
278 }
279 } else {
280 day = str.substr(0, pos);
281 month = str.substr(pos + 1, pos2 - pos - 1);
282 year = str.substr(pos2 + 1);
283 if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
284 NCBI_THROW (CException, eUnknown,
285 "collection-date string is improperly formatted");
286 }
287 }
288 }
289
290 int month_val = 0;
291 if (!NStr::IsBlank(month)) {
292 try {
293 month_val = CTime::MonthNameToNum(month);
294 } catch (CTimeException& ex) {
295 NCBI_THROW (CException, eUnknown,
296 "collection-date string has invalid month");
297 }
298 }
299
300 int day_val = 0;
301 if (!NStr::IsBlank(day)) {
302 try {
303 day_val = NStr::StringToInt (day);
304 if (day_val < 1) {
305 NCBI_THROW (CException, eUnknown,
306 "collection-date string has invalid day value");
307 }
308 } catch ( const exception& ) {
309 // threw exception while converting to int
310 NCBI_THROW (CException, eUnknown,
311 "collection-date string is improperly formatted");
312 }
313 }
314
315 if (NStr::IsBlank(year)) {
316 NCBI_THROW (CException, eUnknown,
317 "collection-date string is improperly formatted");
318 }
319
320 int year_val = 0;
321 try {
322 year_val = NStr::StringToInt (year);
323 } catch ( const exception& ) {
324 // threw exception while converting to int
325 NCBI_THROW (CException, eUnknown,
326 "collection-date string is improperly formatted");
327 }
328
329 /*
330 if (year_val < 1000 || year_val >= 2100) {
331 NCBI_THROW (CException, eUnknown,
332 "collection-date year is out of range");
333 }
334 */
335
336 if (year_val < 1000) {
337 NCBI_THROW (CException, eUnknown,
338 "collection-date year is out of range");
339 }
340
341 if (year_val >= 2100) {
342 NCBI_THROW (CException, eUnknown,
343 "collection-date year is out of range");
344 }
345
346 if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
347 NCBI_THROW (CException, eUnknown,
348 "collection-date day is greater than monthly maximum");
349 }
350
351 CRef<CDate> date(new CDate);
352
353 date->SetStd().SetYear (year_val);
354 if (month_val > 0) {
355 date->SetStd().SetMonth (month_val);
356 }
357 if (day_val > 0) {
358 date->SetStd().SetDay (day_val);
359 }
360
361 time_t t;
362
363 time(&t);
364
365 CDate now(t);
366
367 /*
368 if (IsCollectionDateAfterTime(*date, t)) {
369 NCBI_THROW (CException, eUnknown,
370 "collection-date year is out of range");
371 }
372 */
373
374 return date;
375 }
376
377
IsCollectionDateAfterTime(const string & collection_date,time_t t,bool & bad_format)378 bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
379 {
380 bad_format = false;
381 bool in_future = false;
382 vector<string> pieces;
383 NStr::Split(collection_date, "/", pieces);
384 if (pieces.size() > 2) {
385 bad_format = true;
386 } else {
387 ITERATE(vector<string>, it, pieces) {
388 CRef<CDate> coll_date = DateFromCollectionDate (*it);
389 if (!coll_date) {
390 bad_format = true;
391 } else if (IsCollectionDateAfterTime(*coll_date, t)) {
392 in_future = true;
393 }
394 }
395 }
396 return in_future;
397 }
398
399
IsCollectionDateAfterTime(const CDate & collection_date,time_t t)400 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
401 {
402 CDate now(t);
403 if (collection_date.Compare(now) == CDate::eCompare_after) {
404 return true;
405 } else {
406 return false;
407 }
408 }
409
410
IsCollectionDateAfterTime(const CDate & collection_date,CTime & ctime)411 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
412 {
413 time_t t = ctime.GetTimeT();
414 return IsCollectionDateAfterTime(collection_date, t);
415 }
416
417
IsCorrectDateFormat(const string & date_string,bool & bad_format,bool & in_future)418 void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
419 {
420 bad_format = false;
421 in_future = false;
422
423 vector<string> pieces;
424 NStr::Split(date_string, "/", pieces);
425 if (pieces.size() > 2) {
426 bad_format = true;
427 return;
428 } else if (pieces.size() == 2) {
429 bool first_bad = false;
430 bool first_future = false;
431 bool second_bad = false;
432 bool second_future = false;
433 IsCorrectDateFormat(pieces[0], first_bad, first_future);
434 IsCorrectDateFormat(pieces[1], second_bad, second_future);
435 bad_format = first_bad || second_bad;
436 if (!bad_format) {
437 in_future = first_future || second_future;
438 }
439 return;
440 }
441
442 try {
443 CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);
444
445 if (!IsISOFormatDate(date_string)) {
446 // if there are two dashes, then the first token needs to be the day, and the
447 // day has to have two numbers, a leading zero if the day is less than 10
448 size_t pos = NStr::Find(date_string, "-");
449 if (pos != NPOS) {
450 size_t pos2 = NStr::Find(date_string, "-", pos + 1);
451 if (pos2 != NPOS && pos != 2) {
452 bad_format = true;
453 }
454 }
455 }
456
457 if (!bad_format) {
458 time_t t;
459
460 time(&t);
461
462 in_future = IsCollectionDateAfterTime(*coll_date, t);
463 }
464 } catch (CException ) {
465 bad_format = true;
466 }
467 }
468
CheckDateFormat(const string & date_string)469 size_t CSubSource::CheckDateFormat(const string& date_string)
470 {
471 size_t rval = eDateFormatFlag_ok;
472 vector<string> pieces;
473 NStr::Split(date_string, "/", pieces);
474 if (pieces.size() > 2) {
475 rval |= eDateFormatFlag_bad_format;
476 } else if (pieces.size() == 2) {
477 rval |= CheckDateFormat(pieces[0]);
478 rval |= CheckDateFormat(pieces[1]);
479 if (rval == eDateFormatFlag_ok) {
480 try {
481 CRef<CDate> d1 = CSubSource::DateFromCollectionDate(pieces[0]);
482 CRef<CDate> d2 = CSubSource::DateFromCollectionDate(pieces[1]);
483 if (d2->Compare(*d1) == CDate::eCompare_before) {
484 rval |= eDateFormatFlag_out_of_order;
485 }
486 } catch (CException) {
487 rval |= eDateFormatFlag_bad_format;
488 }
489 }
490 return rval;
491 }
492
493 try {
494 CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);
495
496 if (!IsISOFormatDate(date_string)) {
497 // if there are two dashes, then the first token needs to be the day, and the
498 // day has to have two numbers, a leading zero if the day is less than 10
499 size_t pos = NStr::Find(date_string, "-");
500 if (pos != NPOS) {
501 size_t pos2 = NStr::Find(date_string, "-", pos + 1);
502 if (pos2 != NPOS && pos != 2) {
503 rval |= eDateFormatFlag_bad_format;
504 }
505 }
506 }
507
508 if (rval == eDateFormatFlag_ok) {
509 time_t t;
510
511 time(&t);
512 if (IsCollectionDateAfterTime(*coll_date, t)) {
513 rval |= eDateFormatFlag_in_future;
514 }
515 }
516 } catch (CException) {
517 rval |= eDateFormatFlag_bad_format;
518 }
519 return rval;
520 }
521
// Turns the CheckDateFormat() flags for date_string into a message.
// Only the first applicable problem is reported, in the order
// bad format, in the future, out of order; the result is empty when
// none of these flags is set.
string CSubSource::GetCollectionDateProblem (const string& date_string)
{
    const size_t flags = CheckDateFormat(date_string);

    if (flags & eDateFormatFlag_bad_format) {
        return "Collection_date format is not in DD-Mmm-YYYY format";
    }
    if (flags & eDateFormatFlag_in_future) {
        return "Collection_date is in the future";
    }
    if (flags & eDateFormatFlag_out_of_order) {
        return "Collection_dates are out of order";
    }
    return string();
}
535
536
// Tries to read orig_date as "<date><delim><date>" and returns the two
// dates, each normalised by FixDateFormat(), joined with '/'.
//
// Returns an empty string when the delimiter (matched case-insensitively)
// is absent, when it is found again after its first occurrence, or when
// either side is ambiguous or cannot be fixed.
string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
{
    const size_t delim_pos = NStr::Find(orig_date, delim, NStr::eNocase);
    if (delim_pos == NPOS) {
        return kEmptyStr;
    }

    // a further occurrence means this is not a simple two-part range
    if (NStr::Find(orig_date.substr(delim_pos + 1), delim, NStr::eNocase) != NPOS) {
        return kEmptyStr;
    }

    bool ambiguous = false;

    const string range_start = FixDateFormat(orig_date.substr(0, delim_pos), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_start)) {
        return kEmptyStr;
    }

    const string range_end = FixDateFormat(orig_date.substr(delim_pos + delim.length()), true, ambiguous);
    if (ambiguous || NStr::IsBlank(range_end)) {
        return kEmptyStr;
    }

    return range_start + "/" + range_end;
}
559
560
// Normalises a free-form date, assuming month-before-day for all-numeric
// input.  An ambiguous result is discarded (empty string returned).
// When the text cannot be fixed as a single date it is retried as a
// range, trying each delimiter below in turn and keeping the first one
// that yields a result.
string CSubSource::FixDateFormat (const string& orig_date)
{
    bool ambiguous = false;
    string fixed = FixDateFormat(orig_date, true, ambiguous);

    if (ambiguous) {
        return string();
    }
    if (!NStr::IsBlank(fixed)) {
        return fixed;
    }

    static const char* const kRangeDelimiters[] = {"/", " to ", " and ", "-", "_"};
    for (const char* delimiter : kRangeDelimiters) {
        fixed = x_ParseDateRangeWithDelimiter(orig_date, delimiter);
        if (!NStr::IsBlank(fixed)) {
            break;
        }
    }
    return fixed;
}
579
580 // ISO Format for time is one of these:
581 // HH:MM:SS
582 // HH:MM
583 // HH
584 // Followed by either Z or +hh:mm to indicate an offset from Zulu
IsISOFormatTime(const string & orig_time,int & hour,int & min,int & sec,bool require_time_zone)585 bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
586 {
587 int offset_hour = 0;
588 int offset_min = 0;
589 size_t suffix = NStr::Find(orig_time, "Z");
590 if (suffix == NPOS) {
591 suffix = NStr::Find(orig_time, "+");
592 if (suffix == NPOS) {
593 if (require_time_zone) {
594 return false;
595 } else {
596 suffix = orig_time.length();
597 }
598 } else {
599 if (orig_time.substr(suffix).length() != 6 ||
600 !isdigit((unsigned char)orig_time[suffix + 1]) ||
601 !isdigit((unsigned char)orig_time[suffix + 2]) ||
602 orig_time[suffix + 3] != ':' ||
603 !isdigit((unsigned char)orig_time[suffix + 4]) ||
604 !isdigit((unsigned char)orig_time[suffix + 5])) {
605 return false;
606 }
607 try {
608 offset_hour = NStr::StringToInt(orig_time.substr(suffix + 1, 2));
609 offset_min = NStr::StringToInt(orig_time.substr(suffix + 4, 2));
610 } catch (...) {
611 return false;
612 }
613 }
614 }
615 if (suffix != 2 && suffix != 5 && suffix != 8) {
616 return false;
617 }
618
619 if (!isdigit((unsigned char)orig_time[0]) || !isdigit((unsigned char)orig_time[1])) {
620 return false;
621 }
622 hour = 0;
623 min = 0;
624 sec = 0;
625 try {
626 hour = NStr::StringToInt(orig_time.substr(0, 2));
627 if (hour < 0 || hour > 23) {
628 return false;
629 }
630 hour -= offset_hour;
631 } catch (...) {
632 return false;
633 }
634 if (suffix > 2) {
635 if (!isdigit((unsigned char)orig_time[3]) || !isdigit((unsigned char)orig_time[4])) {
636 return false;
637 }
638 try {
639 min = NStr::StringToInt(orig_time.substr(3, 2));
640 if (min < 0 || min > 59) {
641 return false;
642 }
643 } catch (...) {
644 return false;
645 }
646 min -= offset_min;
647 }
648 if (suffix == 8) {
649 if (!isdigit((unsigned char)orig_time[6]) || !isdigit((unsigned char)orig_time[7])) {
650 return false;
651 }
652 try {
653 sec = NStr::StringToInt(orig_time.substr(6, 2));
654 if (sec < 0) {
655 // negative number bad
656 return false;
657 } else if (sec > 59) {
658 // too big
659 return false;
660 }
661 } catch (...) {
662 return false;
663 }
664 }
665
666 return true;
667 }
668
669 // ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
670 // For ten characters:
671 // First four characters must be digits, represent year.
672 // Fifth character must be dash.
673 // Sixth and seventh characters must be digits, represent month, use zero padding.
674 // Eighth character must be dash.
675 // Ninth and tenth characters must be digits, represent day, use zero padding.
676 // For 7 characters:
677 // First four characters must be digits, represent year.
678 // Fifth character must be dash.
679 // Sixth and seventh characters must be digits, represent month, use zero padding.
IsISOFormatDateOnly(const string & cpy)680 bool CSubSource::IsISOFormatDateOnly (const string& cpy)
681 {
682 if (cpy.length() != 10 && cpy.length() != 7) {
683 return false;
684 }
685 bool rval = true;
686 size_t pos = 0;
687 string::const_iterator it = cpy.begin();
688 while (it != cpy.end() && rval) {
689 if (pos == 4 || pos == 7) {
690 if (*it != '-') {
691 rval = false;
692 }
693 } else if (!isdigit(*it)) {
694 rval = false;
695 }
696 ++it;
697 ++pos;
698 }
699 if (rval) {
700 try {
701 int year = NStr::StringToInt(cpy.substr(0, 4));
702 int month = NStr::StringToInt(cpy.substr(5, 2));
703 if (month < 1 || month > 12) {
704 rval = false;
705 }
706 if (cpy.length() == 10) { // has day
707 int day = NStr::StringToInt(cpy.substr(8, 2));
708 if (!IsDayValueOkForMonth(day, month, year)) {
709 rval = false;
710 }
711 }
712 } catch (...) {
713 rval = false;
714 }
715 }
716 return rval;
717 }
718
719
x_IsFixableIsoDate(const string & orig_date)720 bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
721 {
722 string cpy = orig_date;
723 NStr::TruncateSpacesInPlace(cpy);
724 size_t time_pos = NStr::Find(cpy, "T");
725 bool rval = false;
726 if (time_pos == NPOS) {
727 rval = false;
728 } else {
729 if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
730 rval = false;
731 } else {
732 int h, m, s;
733 if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
734 // already fine, not fixable
735 rval = false;
736 } else {
737 rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
738 }
739 }
740 }
741 return rval;
742 }
743
744
// Returns orig_date with surrounding white space removed and everything
// from the first 'T' onwards cut off.  Without a 'T' only the trimming
// applies.
string CSubSource::x_RemoveIsoTime(const string& orig_date)
{
    string date_part = orig_date;
    NStr::TruncateSpacesInPlace(date_part);

    const size_t t_pos = NStr::Find(date_part, "T");
    if (t_pos != NPOS) {
        date_part.erase(t_pos);
    }
    return date_part;
}
755
756
IsISOFormatDate(const string & orig_date)757 bool CSubSource::IsISOFormatDate(const string& orig_date)
758 {
759 string cpy = orig_date;
760 NStr::TruncateSpacesInPlace(cpy);
761 size_t time_pos = NStr::Find(cpy, "T");
762 if (time_pos == NPOS) {
763 return IsISOFormatDateOnly(cpy);
764 } else {
765 int h, m, s;
766 return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
767 IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
768 }
769
770 }
771
GetDateFromISODate(const string & orig_date)772 CRef<CDate> CSubSource::GetDateFromISODate(const string& orig_date)
773 {
774 try {
775 string cpy = orig_date;
776 NStr::TruncateSpacesInPlace(cpy);
777 CRef<CDate> date(new CDate());
778 int year_val = NStr::StringToInt(cpy.substr(0, 4));
779 int month_val = NStr::StringToInt(cpy.substr(5, 2));
780 date->SetStd().SetYear (year_val);
781 date->SetStd().SetMonth (month_val);
782 if (cpy.length() > 7) {
783 int day_val = NStr::StringToInt(cpy.substr(8, 2));
784 date->SetStd().SetDay (day_val);
785 }
786 return date;
787 } catch (...) {
788 return CRef<CDate>(NULL);
789 }
790 }
791
792
x_GetDateTokens(const string & orig_date)793 vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
794 {
795 vector<string> tokens;
796 string token_delimiters = " ,-/=_.";
797
798 string cpy = orig_date;
799 NStr::TruncateSpacesInPlace (cpy);
800
801 string curr_token;
802 bool is_chars = false;
803 ITERATE(string, s, cpy) {
804 if (token_delimiters.find(*s) != NPOS) {
805 if (!NStr::IsBlank(curr_token)) {
806 tokens.push_back(curr_token);
807 }
808 curr_token.clear();
809 is_chars = false;
810 } else if (is_chars && !isalpha((unsigned char)(*s))) {
811 // previous token was all letters, do not add non-letter characters
812 if (!NStr::IsBlank(curr_token)) {
813 tokens.push_back(curr_token);
814 }
815 curr_token = *s;
816 is_chars = false;
817 } else if (!NStr::IsBlank(curr_token) && !is_chars && isalpha(*s)) {
818 // previous token had no letters
819 tokens.push_back(curr_token);
820 curr_token = *s;
821 is_chars = true;
822 } else {
823 curr_token += *s;
824 if (isalpha(*s)) {
825 is_chars = true;
826 }
827 }
828 }
829 if (!NStr::IsBlank(curr_token)) {
830 tokens.push_back(curr_token);
831 }
832
833 // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
834 if (tokens.size() > 3) {
835 vector<string>::iterator p = tokens.begin();
836 bool prev_is_number = isdigit((unsigned char)(*p)[0]);
837 vector<string>::iterator s = p;
838 ++s;
839 while (s != tokens.end()) {
840 if (prev_is_number &&
841 (NStr::EqualNocase(*s, "st") ||
842 NStr::EqualNocase(*s, "nd") ||
843 NStr::EqualNocase(*s, "rd") ||
844 NStr::EqualNocase(*s, "th"))) {
845 *p += *s;
846 s = tokens.erase(s);
847 prev_is_number = false;
848 } else {
849 ++p;
850 ++s;
851 prev_is_number = isdigit((unsigned char)(*p)[0]);
852 }
853 }
854 }
855
856 return tokens;
857 }
858
859
s_ChooseMonthAndDay(const string & token1,const string & token2,bool month_first,string & month,int & day,bool & month_ambiguous)860 bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
861 {
862 try {
863 int val1 = NStr::StringToInt (token1);
864 int val2 = NStr::StringToInt (token2);
865 if (val1 > 12 && val2 > 12) {
866 // both numbers too big for month
867 return false;
868 } else if (val1 < 13 && val2 < 13) {
869 if (val1 == val2) {
870 // no need to call this ambiguous
871 month = CTime::MonthNumToName(val1, CTime::eAbbr);
872 day = val2;
873 } else {
874 // both numbers could be month
875 month_ambiguous = true;
876 if (month_first) {
877 month = CTime::MonthNumToName(val1, CTime::eAbbr);
878 day = val2;
879 } else {
880 month = CTime::MonthNumToName(val2, CTime::eAbbr);
881 day = val1;
882 }
883 }
884 } else if (val1 < 13) {
885 month = CTime::MonthNumToName(val1, CTime::eAbbr);
886 day = val2;
887 } else {
888 month = CTime::MonthNumToName(val2, CTime::eAbbr);
889 day = val1;
890 }
891 return true;
892 } catch ( ... ) {
893 return false;
894 }
895 }
896
897
// Tries to rewrite a free-form single date as "YYYY", "Mon-YYYY" or
// "DD-Mon-YYYY".
//
// test            - the text to fix; surrounding white space is ignored.
// month_first     - for all-numeric input where both candidates could be
//                   the month, treat the first one as the month.
// month_ambiguous - always written: true when the month/day assignment
//                   (or the year, for three small numbers) had to be
//                   guessed, false otherwise, including on the ISO
//                   paths below.
//
// Returns:
//   - the trimmed input when it already is an ISO date;
//   - the input without its time part when it is an ISO date whose time
//     merely lacks a time zone;
//   - the reformatted date when the tokens can be assigned to day, month
//     and year and the year ends up in 1000..2099;
//   - an empty string otherwise.
string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
{
    // Set the out-parameter before any return so callers never read a
    // value this function did not write.
    month_ambiguous = false;

    string orig_date = test;
    NStr::TruncateSpacesInPlace(orig_date);

    if (IsISOFormatDate(orig_date)) {
        return orig_date;
    } else if (x_IsFixableIsoDate(orig_date)) {
        return x_RemoveIsoTime(orig_date);
    }

    string reformatted_date;
    string month;
    int year = 0, day = 0;

    vector<string> tokens = x_GetDateTokens(orig_date);

    const size_t num_original_tokens = tokens.size();
    if (num_original_tokens < 1 || num_original_tokens > 3) {
        // no tokens or too many tokens
        return kEmptyStr;
    }

    // First pass: claim every token whose role is unmistakable (ordinal
    // day, month name, number too large to be a day) and remove it from
    // the list.  Whatever remains is resolved below.
    string one_token;
    vector<string>::iterator it = tokens.begin();
    while (it != tokens.end()) {
        one_token = *it;
        bool found = false;
        if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
            day = 1;
            found = true;
        } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
            day = 2;
            found = true;
        } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
            day = 3;
            found = true;
        } else if (one_token.length() > 0
                   && isdigit((unsigned char)one_token[0])
                   && NStr::EndsWith(one_token, "th")) {
            try {
                day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
                found = true;
            } catch ( ... ) {
                // threw exception while converting to int
                return kEmptyStr;
            }
        } else if (isalpha((unsigned char)one_token[0])) {
            if (!NStr::IsBlank(month)) {
                // already have month, error
                return kEmptyStr;
            }
            if (one_token.length() > 3) {
                one_token = one_token.substr(0, 3);
            }
            try {
                int month_num = CTime::MonthNameToNum(one_token);
                found = true;
                month = CTime::MonthNumToName(month_num, CTime::eAbbr);
            } catch (const CTimeException&) {
                // not a month name: leave the token in the list
            }
        } else {
            try {
                int this_val = NStr::StringToInt (one_token);
                int min = 1;
                int max = 31;
                if (this_val < min) {
                    return kEmptyStr;
                } else if (this_val > max) {
                    if (year > 0) {
                        // already have year, error
                        return kEmptyStr;
                    }
                    year = this_val;
                    found = true;
                }
            } catch ( ... ) {
                // threw exception while converting to int
                return kEmptyStr;
            }
        }
        if (found) {
            it = tokens.erase(it);
        } else {
            ++it;
        }
    }

    // Second pass: resolve the tokens that were not claimed above.
    if (tokens.size() == 0) {
        // good - all tokens assigned to values
    } else if (tokens.size() > 2) {
        // three numbers: treat last one as year
        try {
            year = NStr::StringToInt(tokens[2]);
            if (year < 100) {
                year += 2000;
            }
            if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
                return kEmptyStr;
            }
            // mark month as ambiguous, since we are guessing about year
            month_ambiguous = true;
        } catch ( ... ) {
            // threw exception while converting to int
            return kEmptyStr;
        }
    } else if (tokens.size() == 1) {
        try {
            int val = NStr::StringToInt (tokens[0]);
            if (year == 0) {
                year = val;
            } else {
                if (NStr::IsBlank (month)) {
                    if (val > 0 && val < 13) {
                        month = CTime::MonthNumToName(val, CTime::eAbbr);
                    } else {
                        // month number out of range
                        return kEmptyStr;
                    }
                } else {
                    day = val;
                }
            }
        } catch ( ... ) {
            // threw exception while converting to int
            return kEmptyStr;
        }
    } else if (!NStr::IsBlank (month)) {
        if (tokens.size() == 2) {
            // we have a month and two other numbers (we hope)
            int val1 = 0;
            int val2 = 0;
            try {
                val1 = NStr::StringToInt (tokens[0]);
                val2 = NStr::StringToInt (tokens[1]);
            } catch (const CException&) {
                // not actually numbers
                return kEmptyStr;
            }
            bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
            bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
            if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
                // if one token is not zero-padded and less than 10,
                // and the other is either zero-padded or greater than 10,
                // the "small" token is the day and the other (+2000) is the year
                day = val1;
                year = val2 + 2000;
            } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
                // same rule with the roles of the two tokens swapped
                day = val2;
                year = val1 + 2000;
            } else {
                // otherwise prefer "day year" if that is a real date
                int month_num = CTime::MonthNameToNum(month);
                if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
                    day = val1;
                    year = val2 + 2000;
                } else {
                    day = val2;
                    year = val1 + 2000;
                }
            }
        } else {
            return kEmptyStr;
        }
    } else {
        if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
            return kEmptyStr;
        }
    }

    // make sure day is valid
    if (day > 0 && !NStr::IsBlank(month) && year > -1) {
        try {
            int month_num = CTime::MonthNameToNum(month);
            if (!IsDayValueOkForMonth(day, month_num, year)) {
                return kEmptyStr;
            }
        } catch (const CTimeException&) {
            return kEmptyStr;
        }
    }

    if (year > 0 && year < 100 && num_original_tokens > 1) {
        // try to guess year from two-digit year provided,
        // only if it could not possibly be a day of the month
        // and if there were at least two tokens provided
        string year_date = NStr::NumericToString(year + 2000);
        bool format_bad = false;
        bool in_future = false;
        IsCorrectDateFormat(year_date, format_bad, in_future);
        if (in_future) {
            year += 1900;
        } else {
            year += 2000;
        }
    }
    if (year >= 1000 && year < 2100) {
        reformatted_date = NStr::NumericToString (year);
        if (!NStr::IsBlank (month)) {
            reformatted_date = month + "-" + reformatted_date;
            if (day > 0) {
                string day_str = NStr::NumericToString (day);
                if (day_str.length() < 2) {
                    day_str = "0" + day_str;
                }
                reformatted_date = day_str + "-" + reformatted_date;
            }
        }
    }

    return reformatted_date;
}
1115
1116
DetectDateFormat(const string & orig_date,bool & ambiguous,bool & day_first)1117 void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
1118 {
1119 ambiguous = false;
1120 day_first = false;
1121 vector<string> tokens = x_GetDateTokens(orig_date);
1122 if (tokens.size() != 3) {
1123 // can't do detection if there are more or less than three tokens
1124 ambiguous = true;
1125 return;
1126 }
1127 vector<int> nums;
1128
1129 // detection is only valid if all tokens are numbers and at least one is known to be the year
1130 try {
1131 ITERATE(vector<string>, it, tokens) {
1132 nums.push_back(NStr::StringToInt (*it));
1133 }
1134 } catch ( ... ) {
1135 // threw exception while converting to int
1136 ambiguous = true;
1137 return;
1138 }
1139 enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
1140 vector<int> positions;
1141 positions.push_back(0);
1142 positions.push_back(0);
1143 positions.push_back(0);
1144
1145 int token_pos = 1;
1146 ITERATE(vector<int>, it, nums) {
1147 if (*it > 31) {
1148 if (positions[eYear] > 0) {
1149 // already found a year
1150 ambiguous = true;
1151 return;
1152 }
1153 positions[eYear] = token_pos;
1154 } else if (*it > 12) {
1155 if (positions[eDay] > 0) {
1156 // already found a day
1157 ambiguous = true;
1158 return;
1159 }
1160 positions[eDay] = token_pos;
1161 } else if (positions[eMonth] > 0) {
1162 // already found a month
1163 ambiguous = true;
1164 return;
1165 } else {
1166 positions[eMonth] = token_pos;
1167 }
1168 token_pos++;
1169 }
1170 if (positions[eDay] < positions[eMonth]) {
1171 day_first = true;
1172 } else {
1173 day_first = false;
1174 }
1175 }
1176
1177
IsCorrectLatLonFormat(string lat_lon,bool & format_correct,bool & precision_correct,bool & lat_in_range,bool & lon_in_range,double & lat_value,double & lon_value)1178 void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
1179 bool& lat_in_range, bool& lon_in_range,
1180 double& lat_value, double& lon_value)
1181 {
1182 format_correct = false;
1183 lat_in_range = false;
1184 lon_in_range = false;
1185 precision_correct = false;
1186 double ns, ew;
1187 char lon, lat;
1188 int processed;
1189
1190 lat_value = 0.0;
1191 lon_value = 0.0;
1192
1193 if (NStr::IsBlank(lat_lon)) {
1194 return;
1195 } else if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
1196 || size_t(processed) != lat_lon.length()) {
1197 return;
1198 } else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
1199 return;
1200 } else {
1201 // init values found
1202 if (lat == 'N') {
1203 lat_value = ns;
1204 } else {
1205 lat_value = 0.0 - ns;
1206 }
1207 if (lon == 'E') {
1208 lon_value = ew;
1209 } else {
1210 lon_value = 0.0 - ew;
1211 }
1212
1213 // make sure format is correct
1214 vector<string> pieces;
1215 NStr::Split(lat_lon, " ", pieces);
1216 if (pieces.size() > 3) {
1217 int precision_lat = x_GetPrecision(pieces[0]);
1218 int precision_lon = x_GetPrecision(pieces[2]);
1219
1220 char reformatted[1000];
1221 sprintf (reformatted, "%.*lf %c %.*lf %c", precision_lat, ns, lat,
1222 precision_lon, ew, lon);
1223
1224 size_t len = strlen (reformatted);
1225 if (NStr::StartsWith(lat_lon, reformatted)
1226 && (len == lat_lon.length()
1227 || (len < lat_lon.length()
1228 && lat_lon[len] == ';'))) {
1229 format_correct = true;
1230 if (ns <= 90 && ns >= 0) {
1231 lat_in_range = true;
1232 }
1233 if (ew <= 180 && ew >= 0) {
1234 lon_in_range = true;
1235 }
1236 if (precision_lat < 3 && precision_lon < 3) {
1237 precision_correct = true;
1238 }
1239 }
1240 }
1241 }
1242 }
1243
1244
FixLatLonPrecision(const string & orig)1245 string CSubSource::FixLatLonPrecision(const string& orig)
1246 {
1247 bool format_correct = false;
1248 bool precision_correct = false;
1249 bool lat_in_range = false;
1250 bool lon_in_range = false;
1251 double lat_value = 0.0;
1252 double lon_value = 0.0;
1253 IsCorrectLatLonFormat(orig, format_correct, precision_correct,
1254 lat_in_range, lon_in_range,
1255 lat_value, lon_value);
1256 if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
1257 return orig;
1258 }
1259 vector<string> pieces;
1260 NStr::Split(orig, " ", pieces);
1261 if (pieces.size() > 3) {
1262 int precision_lat = x_GetPrecision(pieces[0]);
1263 int precision_lon = x_GetPrecision(pieces[2]);
1264 if (precision_lat > 4) {
1265 precision_lat = 4;
1266 }
1267 if (precision_lon > 4) {
1268 precision_lon = 4;
1269 }
1270
1271 char reformatted[1000];
1272 sprintf(reformatted, "%.*lf %c %.*lf %c", precision_lat, fabs(lat_value), pieces[1].c_str()[0],
1273 precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
1274 string new_val = reformatted;
1275 return reformatted;
1276 }
1277 return kEmptyStr;
1278 }
1279
1280 /*
1281 1. String should be converted to UTF8 string, this will get rid of \xC0 and similar substrings
1282 2. Every codepoint (note that this is not regular ascii "char") that is not a digit or a decimal point or a letter should be prepended with a space.
1283 Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
1284 3. NStr::Split is called with space as a separator and Tokenize flag - need to check if Split works with UTF8 strings properly.
1285 4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
1286 5. A pattern string is created where each number is replaced with "1" and modifiers are normalized to "lat", or "N"; the actual numerical values are kept in a separate vector
1287 5. Based on the pattern the vector of numbers is parsed into degrees, minutes, or seconds,
1288 6. NSEW and "lattitude/longitude" are applied to degrees in the order of appearance, if none are present other heuristic to determine which is latitude and which is longitude
1289 */
1290
s_InsertSpacesBetweenTokens(const string & old_str)1291 static string s_InsertSpacesBetweenTokens(const string &old_str)
1292 {
1293 string new_str;
1294 for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1295 {
1296 TUnicodeSymbol sym = CUtf8::Decode(i);
1297 if (sym < 0x80)
1298 {
1299 char c = static_cast<char>(sym);
1300 if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1301 {
1302 new_str += ' ';
1303 }
1304 else if (!new_str.empty() &&
1305 ((isalpha(new_str.back()) && !isalpha(c)) ||
1306 (!isalpha(new_str.back()) && isalpha(c))))
1307 {
1308 new_str += ' ';
1309 }
1310 new_str += c;
1311 if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1312 {
1313 new_str += ' ';
1314 }
1315 }
1316 else
1317 {
1318 new_str += ' ';
1319 }
1320 }
1321 return new_str;
1322 }
1323
s_RemoveSpacesWithinNumbers(const string & old_str)1324 static string s_RemoveSpacesWithinNumbers(const string &old_str)
1325 {
1326 string new_str;
1327 bool is_number = true;
1328 for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1329 {
1330 TUnicodeSymbol sym = CUtf8::Decode(i);
1331 if (sym < 0x80)
1332 {
1333 char c = static_cast<char>(sym);
1334 size_t j = new_str.size();
1335 if (j >= 4 && new_str[j-1] == ' ' && new_str[j-2] == '.' && new_str[j-3] == ' ' && isdigit(new_str[j-4]) && isdigit(c))
1336 {
1337 new_str.pop_back();
1338 new_str.pop_back();
1339 new_str.pop_back();
1340 new_str += '.';
1341 }
1342 new_str += c;
1343 if (!isdigit(c) && c != '+' && c != '-' && c != '.' && !isspace(c)) {
1344 is_number = false;
1345 }
1346 }
1347 else
1348 {
1349 new_str += ' ';
1350 is_number = false;
1351 }
1352 }
1353 if (is_number)
1354 {
1355 NStr::ReplaceInPlace(new_str, "+", " +");
1356 NStr::ReplaceInPlace(new_str, "-", " -");
1357 }
1358 return new_str;
1359 }
1360
s_IsNumber(const string & token,double * result=NULL)1361 static bool s_IsNumber(const string &token, double *result = NULL)
1362 {
1363 double num = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);
1364 if (!num && errno)
1365 {
1366 return false;
1367 }
1368 if (result) {
1369 *result = num;
1370 }
1371 return true;
1372 }
1373
s_NormalizeTokens(vector<string> & tokens,vector<double> & numbers,vector<string> & anum,vector<int> & precision,vector<string> & lat_long,vector<string> & nsew)1374 static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long, vector<string> &nsew)
1375 {
1376 vector<string> pattern;
1377 for (size_t i = 0; i < tokens.size(); i++)
1378 {
1379 string &token = tokens[i];
1380
1381 double num;
1382 if (s_IsNumber(token, &num))
1383 {
1384 numbers.push_back(num);
1385 anum.push_back(token);
1386 pattern.push_back("1");
1387 precision.push_back(0);
1388 if (NStr::Find(token, ".") != NPOS && !NStr::EndsWith(token, "."))
1389 {
1390 precision.back() = token.length() - NStr::Find(token, ".") - 1;
1391 }
1392 continue;
1393 }
1394
1395 {
1396 vector<string> tmp;
1397 NStr::Split(token, ".", tmp);
1398 double num0, num1, num2;
1399 if (tmp.size() == 3 && s_IsNumber(tmp[0], &num0) && s_IsNumber(tmp[1], &num1) && s_IsNumber(tmp[2], &num2))
1400 {
1401 numbers.push_back(num0);
1402 anum.push_back(tmp[0]);
1403 pattern.push_back("1");
1404 precision.push_back(0);
1405 numbers.push_back(num1);
1406 anum.push_back(tmp[1]);
1407 pattern.push_back("1");
1408 precision.push_back(0);
1409 numbers.push_back(num2);
1410 anum.push_back(tmp[2]);
1411 pattern.push_back("1");
1412 precision.push_back(0);
1413 continue;
1414 }
1415 }
1416
1417 if (token == "\'" && i >= 3 && s_IsNumber(tokens[i - 1]) && tokens[i - 2] == "\'" && s_IsNumber(tokens[i - 3]))
1418 {
1419 token = "\"";
1420 }
1421
1422 if (NStr::EqualNocase(token, "degrees") || NStr::EqualNocase(token, "deg") || NStr::EqualNocase(token, "deg.") || NStr::EqualNocase(token, "degree"))
1423 {
1424 token = "degrees";
1425 pattern.push_back("degrees");
1426 }
1427 else if ( token == "\'" || NStr::EqualNocase(token, "min") || NStr::EqualNocase(token, "min.") || NStr::EqualNocase(token, "minute") || NStr::EqualNocase(token, "minutes"))
1428 {
1429 token = "\'";
1430 pattern.push_back("\'");
1431 }
1432 else if (token == "\"" || NStr::EqualNocase(token, "sec") || NStr::EqualNocase(token, "sec.") || NStr::EqualNocase(token, "second") || NStr::EqualNocase(token, "seconds"))
1433 {
1434 token = "\"";
1435 pattern.push_back("\"");
1436 }
1437 else if (token == "," || token == ":" || token == "_" || token == "&" || token == "." || token == ";" || token == "#" || NStr::EqualNocase(token, "and"))
1438 {
1439 }
1440 else if (NStr::EqualNocase(token, "lattitude") || NStr::EqualNocase(token, "latitude") || NStr::EqualNocase(token, "lat") || NStr::EqualNocase(token, "lat."))
1441 {
1442 pattern.push_back("lat");
1443 lat_long.push_back("lat");
1444 }
1445 else if (NStr::EqualNocase(token, "longitude") || NStr::EqualNocase(token, "lo") || NStr::EqualNocase(token, "lon") || NStr::EqualNocase(token, "long")
1446 || NStr::EqualNocase(token, "lo.") || NStr::EqualNocase(token, "lon.") || NStr::EqualNocase(token, "long."))
1447 {
1448 pattern.push_back("lat");
1449 lat_long.push_back("long");
1450 }
1451 else if (token == "N" || NStr::EqualNocase(token, "north"))
1452 {
1453 pattern.push_back("N");
1454 nsew.push_back("N");
1455 }
1456 else if (token == "S" || NStr::EqualNocase(token, "south"))
1457 {
1458 pattern.push_back("N");
1459 nsew.push_back("S");
1460 }
1461 else if (token == "E" || NStr::EqualNocase(token, "east"))
1462 {
1463 pattern.push_back("N");
1464 nsew.push_back("E");
1465 }
1466 else if (token == "W" || NStr::EqualNocase(token, "west") || token == "Wdeg")
1467 {
1468 pattern.push_back("N");
1469 nsew.push_back("W");
1470 }
1471 else if (token == "NW")
1472 {
1473 nsew.push_back("N");
1474 nsew.push_back("W");
1475 }
1476 else if (token == "NE")
1477 {
1478 nsew.push_back("N");
1479 nsew.push_back("E");
1480 }
1481 else if (token == "SW")
1482 {
1483 nsew.push_back("S");
1484 nsew.push_back("W");
1485 }
1486 else if (token == "SE")
1487 {
1488 nsew.push_back("S");
1489 nsew.push_back("E");
1490 }
1491 else
1492 {
1493 //cout << "Token: " << token << endl;
1494 numbers.clear();
1495 return kEmptyStr;
1496 }
1497 }
1498 //cout << "Pattern: " << NStr::Join(pattern, " ") << endl;
1499 return NStr::Join(pattern, " ");
1500 }
1501
/// Put a pair of coordinates into (latitude, longitude) order and apply
/// the hemisphere signs.
///
/// Explicit labels ("lat"/"long") and hemisphere letters are honoured
/// first; when neither is present the values are swapped only if the first
/// one cannot be a latitude while the second one can.  On any
/// inconsistency, or when the final values are out of range, @a numbers is
/// cleared to signal failure.
///
/// @param numbers    exactly two values; reordered and signed in place
/// @param precision  precision of each value, reordered along with it
/// @param lat_long   "lat"/"long" labels in order of appearance (0 or 2)
/// @param nsew       hemisphere letters in order of appearance (0 or 2);
///                   reordered along with the values
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    if (numbers.size() != 2)
    {
        numbers.clear();
        return;
    }

    // labels: either none or one per value
    if (!lat_long.empty() && lat_long.size() != 2)
    {
        numbers.clear();
        return;
    }
    if (lat_long.size() == 2 && lat_long.front() == "long")
    {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
        if (nsew.size() == 2)
        {
            swap(nsew[0], nsew[1]);
        }
    }

    // hemispheres: either none or one per value
    if (!nsew.empty() && nsew.size() != 2)
    {
        numbers.clear();
        return;
    }
    if (nsew.size() == 2)
    {
        const bool lon_first = (nsew[0] == "E" || nsew[0] == "W")
                            && (nsew[1] == "N" || nsew[1] == "S");
        if (lon_first)
        {
            swap(numbers[0], numbers[1]);
            swap(precision[0], precision[1]);
            swap(nsew[0], nsew[1]);
        }

        const bool north = nsew[0] == "N";
        const bool east = nsew[1] == "E";
        if ((!north && nsew[0] != "S") || (!east && nsew[1] != "W"))
        {
            numbers.clear();
            return;
        }

        // zero is left untouched for S/W so that no negative zero is produced
        if (north)
        {
            numbers[0] = fabs(numbers[0]);
        }
        else if (numbers[0] != 0)
        {
            numbers[0] = -fabs(numbers[0]);
        }
        if (east)
        {
            numbers[1] = fabs(numbers[1]);
        }
        else if (numbers[1] != 0)
        {
            numbers[1] = -fabs(numbers[1]);
        }
    }

    // no hints at all: swap only when that is the sole plausible reading
    const bool no_hints = lat_long.empty() && nsew.empty();
    if (no_hints && fabs(numbers[0]) > 90 && fabs(numbers[1]) < 90)
    {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    }

    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180)
    {
        numbers.clear();
    }
}
1580
s_GetLatLong(const string & new_str,vector<double> & numbers,vector<int> & precision)1581 static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
1582 {
1583 vector<string> tokens;
1584 NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
1585 vector<string> lat_long;
1586 vector<string> nsew;
1587 vector<string> anum;
1588 string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
1589 if (pattern.empty())
1590 {
1591 numbers.clear();
1592 return;
1593 }
1594 vector<double> degrees(2, 0);
1595 vector<int> prec(2, 0);
1596 int sign1 = 1;
1597 int sign2 = 1;
1598 if ( pattern == "1 1" ||
1599 pattern == "1 N 1 N" ||
1600 pattern == "N 1 N 1" ||
1601 pattern == "1 degrees N 1 degrees N" ||
1602 pattern == "lat 1 lat 1" ||
1603 pattern == "1 N lat 1 N lat" ||
1604 pattern == "1 degrees N lat 1 degrees N lat")
1605 {
1606 degrees[0] = numbers[0];
1607 degrees[1] = numbers[1];
1608 prec[0] = precision[0];
1609 prec[1] = precision[1];
1610 }
1611 else if ((pattern == "1 1 \" 1 1 '" ||
1612 pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
1613 && numbers[1] < 60 && numbers[3] < 60
1614 && numbers[1] >= 0 && numbers[3] >= 0)
1615 {
1616 sign1 = anum[0][0] == '-' ? -1 : 1;
1617 sign2 = anum[2][0] == '-' ? -1 : 1;
1618 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
1619 degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1620 prec[0] = max(precision[0], precision[1] + 4);
1621 prec[1] = max(precision[2], precision[3] + 2);
1622 }
1623 else if ( (pattern == "1 1 ' 1" ||
1624 pattern == "1 degrees 1 ' N 1 degrees N")
1625 && numbers[1] < 60
1626 && numbers[1] >= 0)
1627 {
1628 sign1 = anum[0][0] == '-' ? -1 : 1;
1629 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1630 degrees[1] = numbers[2];
1631 prec[0] = max(precision[0], precision[1] + 2);
1632 prec[1] = precision[2];
1633 }
1634 else if (pattern == "1 1 ' 1 \" 1"
1635 && numbers[1] < 60 && numbers[2] < 60
1636 && numbers[1] >= 0 && numbers[2] >= 0)
1637 {
1638 sign1 = anum[0][0] == '-' ? -1 : 1;
1639 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1640 degrees[1] = numbers[3];
1641 prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1642 prec[1] = precision[3];
1643 }
1644 else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
1645 pattern == "1 1 1 N 1 1 N" ||
1646 pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
1647 && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1648 && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1649 {
1650 sign1 = anum[0][0] == '-' ? -1 : 1;
1651 sign2 = anum[3][0] == '-' ? -1 : 1;
1652 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1653 degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
1654 prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1655 prec[1] = max(precision[3], precision[4] + 2);
1656 }
1657 else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
1658 pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
1659 pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
1660 pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
1661 pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
1662 pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
1663 pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
1664 pattern == "1 1 1 N 1 1 1 N")
1665 && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
1666 && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
1667 {
1668 sign1 = anum[0][0] == '-' ? -1 : 1;
1669 sign2 = anum[3][0] == '-' ? -1 : 1;
1670 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1671 degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
1672 prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1673 prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
1674 }
1675 else if (( pattern == "1 1 ' 1 1 '" ||
1676 pattern == "1 1 N 1 1 N" ||
1677 pattern == "1 1 ' N 1 1 ' N" ||
1678 pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
1679 pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
1680 pattern == "1 degrees 1 N 1 degrees 1 N" ||
1681 pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
1682 pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
1683 pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
1684 pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
1685 pattern == "N 1 degrees 1 ' N 1 1 '")
1686 && numbers[1] < 60 && numbers[3] < 60
1687 && numbers[1] >= 0 && numbers[3] >= 0)
1688 {
1689 sign1 = anum[0][0] == '-' ? -1 : 1;
1690 sign2 = anum[2][0] == '-' ? -1 : 1;
1691 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1692 degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1693 prec[0] = max(precision[0], precision[1] + 2);
1694 prec[1] = max(precision[2], precision[3] + 2);
1695 }
1696 else if ((pattern == "1 N 1 1 N" ||
1697 pattern == "1 degrees N 1 degrees 1 ' N")
1698 && numbers[2] < 60
1699 && numbers[2] >= 0)
1700 {
1701 sign2 = anum[1][0] == '-' ? -1 : 1;
1702 degrees[0] = numbers[0];
1703 degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
1704 prec[0] = precision[0];
1705 prec[1] = max(precision[1], precision[2] + 2);
1706 }
1707 else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
1708 pattern == "N 1 1 N 1 1 1")
1709 && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
1710 && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
1711 {
1712 sign1 = anum[0][0] == '-' ? -1 : 1;
1713 sign2 = anum[2][0] == '-' ? -1 : 1;
1714 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1715 degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
1716 prec[0] = max(precision[0], precision[1] + 2);
1717 prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
1718 }
1719 else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
1720 && numbers[2] < 60 && numbers[3] < 60
1721 && numbers[2] >= 0 && numbers[3] >= 0)
1722 {
1723 sign2 = anum[1][0] == '-' ? -1 : 1;
1724 degrees[0] = numbers[0];
1725 degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
1726 prec[0] = precision[0];
1727 prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
1728 }
1729 else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
1730 && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1731 && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1732 {
1733 sign1 = anum[0][0] == '-' ? -1 : 1;
1734 sign2 = anum[3][0] == '-' ? -1 : 1;
1735 degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1736 degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
1737 prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1738 prec[1] = max(precision[3], precision[4] + 4);
1739 }
1740 else
1741 {
1742 degrees.clear();
1743 prec.clear();
1744 }
1745 swap(degrees, numbers);
1746 swap(prec, precision);
1747 s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
1748 }
1749
1750
FixLatLonFormat(string orig_lat_lon,bool guess)1751 string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
1752 {
1753 //cout << "Before: " << orig_lat_lon << endl;
1754 NStr::ParseEscapes(orig_lat_lon);
1755 CStringUTF8 old_str = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));
1756 if (NStr::StartsWith(old_str, "\""))
1757 {
1758 NStr::TrimPrefixInPlace(old_str, "\"");
1759 NStr::TrimSuffixInPlace(old_str, "\"");
1760 }
1761 NStr::ReplaceInPlace(old_str, "\'\'", "\"");
1762 string fixed_str = s_RemoveSpacesWithinNumbers(old_str);
1763 string new_str = s_InsertSpacesBetweenTokens(fixed_str);
1764 NStr::Sanitize(new_str);
1765 vector<double> numbers;
1766 vector<int> precision;
1767 s_GetLatLong(new_str, numbers, precision);
1768 string res;
1769 if (!numbers.empty())
1770 {
1771 res = MakeLatLon(numbers[0], numbers[1], precision[0], precision[1]);
1772 }
1773 //cout << "After: " << res << endl;
1774 return res;
1775 }
1776
1777
MakeLatLon(double lat_value,double lon_value,int lat_precision,int lon_precision)1778 string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
1779 {
1780 char ns = 'N';
1781 if (lat_value < 0) {
1782 ns = 'S';
1783 lat_value = -lat_value;
1784 }
1785 char ew = 'E';
1786 if (lon_value < 0) {
1787 ew = 'W';
1788 lon_value = -lon_value;
1789 }
1790 string lat = NStr::DoubleToString(lat_value, lat_precision);
1791 string lon = NStr::DoubleToString(lon_value, lon_precision);
1792
1793 NStr::TrimSuffixInPlace(lat, ".");
1794 NStr::TrimSuffixInPlace(lon, ".");
1795 string res = lat + " " + ns + " " + lon + " " + ew;
1796 return res;
1797 }
1798
1799
x_CalculateLatLonId(float lat_value,float lon_value,string country,string province)1800 CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
1801 {
1802 CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);
1803
1804 bool goodmatch = false;
1805
1806 // lookup region by coordinates, or find nearest region and calculate distance
1807 const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
1808 if (guess) {
1809 id->SetFullGuess(guess->GetCountry());
1810 id->SetGuessCountry(guess->GetLevel0());
1811 id->SetGuessProvince(guess->GetLevel1());
1812 if (NStr::EqualNocase(country, id->GetGuessCountry())
1813 && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
1814 goodmatch = true;
1815 }
1816 } else {
1817 // not inside a country, check water
1818 guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
1819 if (guess) {
1820 // found inside water
1821 id->SetGuessWater(guess->GetCountry());
1822 if (NStr::EqualNocase(country, id->GetGuessWater())) {
1823 goodmatch = true;
1824 }
1825
1826 // also see if close to land for coastal warning (if country is land)
1827 // or proximity message (if country is water)
1828 double landdistance = 0.0;
1829 guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1830 if (guess) {
1831 id->SetClosestFull(guess->GetCountry());
1832 id->SetClosestCountry(guess->GetLevel0());
1833 id->SetClosestProvince(guess->GetLevel1());
1834 id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1835 if (NStr::EqualNocase(country, id->GetClosestCountry())
1836 && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1837 goodmatch = true;
1838 }
1839 }
1840 } else {
1841 // may be coastal inlet, area of data insufficiency
1842 double landdistance = 0.0;
1843 guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1844 if (guess) {
1845 id->SetClosestFull(guess->GetCountry());
1846 id->SetClosestCountry(guess->GetLevel0());
1847 id->SetClosestProvince(guess->GetLevel1());
1848 id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1849 if (NStr::EqualNocase(country, id->GetClosestCountry())
1850 && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1851 goodmatch = true;
1852 }
1853 }
1854
1855 double waterdistance = 0.0;
1856 guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
1857 if (guess) {
1858 id->SetClosestWater(guess->GetLevel0());
1859 id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
1860 if (NStr::EqualNocase(country, id->GetClosestWater())) {
1861 goodmatch = true;
1862 }
1863 }
1864 }
1865 }
1866
1867 // if guess is not the provided country or province, calculate distance to claimed country
1868 if (!goodmatch) {
1869 double distance = 0.0;
1870 guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1871 if (guess) {
1872 if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
1873 // close enough
1874 id->SetGuessCountry(country);
1875 id->SetGuessProvince(province);
1876 id->SetFullGuess(guess->GetCountry());
1877 } else {
1878 id->SetClaimedFull(guess->GetCountry());
1879 id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
1880 }
1881 } else if (NStr::IsBlank(province)) {
1882 guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1883 if (guess) {
1884 id->SetClaimedFull(guess->GetCountry());
1885 id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
1886 }
1887 }
1888 }
1889
1890 return id;
1891 }
1892
1893
1894
1895 typedef SStaticPair<const char*, const char*> TWaterPairElem;
1896 static const TWaterPairElem k_water_pair_map[] = {
1897 {"Adriatic Sea", "Mediterranean Sea"},
1898 {"Aegean Sea", "Mediterranean Sea"},
1899 {"Alboran Sea", "Mediterranean Sea"},
1900 {"Andaman Sea", "Indian Ocean"},
1901 {"Arabian Sea", "Indian Ocean"},
1902 {"Argentine Sea", "Atlantic Ocean"},
1903 {"Ariake Sea", "Pacific Ocean"},
1904 {"Baffin Bay", "Atlantic Ocean"},
1905 {"Balearic Sea", "Mediterranean Sea"},
1906 {"Baltic Sea", "Atlantic Ocean"},
1907 {"Barents Sea", "Arctic Ocean"},
1908 {"Bay of Bengal", "Indian Ocean"},
1909 {"Beaufort Sea", "Arctic Ocean"},
1910 {"Bering Sea", "Pacific Ocean"},
1911 {"Bismarck Sea", "Pacific Ocean"},
1912 {"Black Sea", "Mediterranean Sea"},
1913 {"Bohai Sea", "Pacific Ocean"},
1914 {"Caribbean Sea", "Atlantic Ocean"},
1915 {"Celebes Sea", "Pacific Ocean"},
1916 {"Champlain Sea", "Atlantic Ocean"},
1917 {"Chilean Sea", "Pacific Ocean"},
1918 {"China Seas", "Pacific Ocean"},
1919 {"Chukchi Sea", "Arctic Ocean"},
1920 {"Coral Sea", "Pacific Ocean"},
1921 {"Davis Strait", "Atlantic Ocean"},
1922 {"East China Sea", "Pacific Ocean"},
1923 {"East Siberian Sea", "Arctic Ocean"},
1924 {"English Channel", "Atlantic Ocean"},
1925 {"Erythraean Sea", "Indian Ocean"},
1926 {"Golfo de California", "Pacific Ocean"},
1927 {"Greenland Sea", "Arctic Ocean"},
1928 {"Gulf of Mexico", "Atlantic Ocean"},
1929 {"Gulf of Thailand", "Pacific Ocean"},
1930 {"Gulf of Tonkin", "Pacific Ocean"},
1931 {"Hudson Bay", "Arctic Ocean"},
1932 {"Ionian Sea", "Mediterranean Sea"},
1933 {"Irish Sea", "Atlantic Ocean"},
1934 {"Irminger Sea", "Atlantic Ocean"},
1935 {"James Bay", "Atlantic Ocean"},
1936 {"Java Sea", "Indian Ocean"},
1937 {"Kara Sea", "Arctic Ocean"},
1938 {"Koro Sea", "Pacific Ocean"},
1939 {"Labrador Sea", "Atlantic Ocean"},
1940 {"Laccadive Sea", "Indian Ocean"},
1941 {"Laptev Sea", "Arctic Ocean"},
1942 {"Ligurian Sea", "Mediterranean Sea"},
1943 {"Lincoln Sea", "Arctic Ocean"},
1944 {"Myrtoan Sea", "Mediterranean Sea"},
1945 {"North Sea", "Atlantic Ocean"},
1946 {"Norwegian Sea", "Atlantic Ocean"},
1947 {"Pechora Sea", "Arctic Ocean"},
1948 {"Persian Gulf", "Indian Ocean"},
1949 {"Philippine Sea", "Pacific Ocean"},
1950 {"Red Sea", "Indian Ocean"},
1951 {"Salish Sea", "Pacific Ocean"},
1952 {"Sargasso Sea", "Atlantic Ocean"},
1953 {"Scotia Sea", "Southern Ocean"},
1954 {"Sea of Azov", "Black Sea"},
1955 {"Sea of Chiloe", "Pacific Ocean"},
1956 {"Sea of Crete", "Mediterranean Sea"},
1957 {"Sea of Japan", "Pacific Ocean"},
1958 {"Sea of Okhotsk", "Pacific Ocean"},
1959 {"Sea of the Hebrides", "Atlantic Ocean"},
1960 {"Sea of Zanj", "Indian Ocean"},
1961 {"Seas of Greenland", "Atlantic Ocean"},
1962 {"Sethusamudram", "Indian Ocean"},
1963 {"Sibutu Passage", "Pacific Ocean"},
1964 {"Solomon Sea", "Pacific Ocean"},
1965 {"South China Sea", "Pacific Ocean"},
1966 {"Sulu Sea", "Pacific Ocean"},
1967 {"Tasman Sea", "Pacific Ocean"},
1968 {"Thracian Sea", "Mediterranean Sea"},
1969 {"Timor Sea", "Indian Ocean"},
1970 {"Tyrrhenian Sea", "Mediterranean Sea"},
1971 {"Wandel Sea", "Arctic Ocean"},
1972 {"White Sea", "Arctic Ocean"},
1973 {"Yellow Sea", "Pacific Ocean"}
1974 };
1975 typedef CStaticArrayMap<const char*, const char*, PNocase_CStr> TWaterPairMap;
1976 DEFINE_STATIC_ARRAY_MAP(TWaterPairMap, sc_WaterPairMap, k_water_pair_map);
1977
x_FindSurroundingOcean(string & water)1978 static string x_FindSurroundingOcean (string& water)
1979
1980 {
1981 TWaterPairMap::const_iterator new_water_pair_iter = sc_WaterPairMap.find(water.c_str());
1982 if( new_water_pair_iter != sc_WaterPairMap.end() ) {
1983 return new_water_pair_iter->second;
1984 }
1985 return kEmptyStr;
1986 }
1987
1988
ValidateLatLonCountry(const string & input_countryname,string & lat_lon,bool check_state,ELatLonCountryErr & errcode)1989 string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
1990 {
1991 errcode = eLatLonCountryErr_None;
1992 string countryname = input_countryname;
1993 if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
1994 return kEmptyStr;
1995 }
1996
1997 if ( m_LatLonCountryMap.get() == 0 ) {
1998 m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
1999 }
2000 if ( m_LatLonWaterMap.get() == 0 ) {
2001 m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
2002 }
2003
2004 // only do these checks if the latlon format is good
2005 bool format_correct, lat_in_range, lon_in_range, precision_correct;
2006 double lat_value = 0.0, lon_value = 0.0;
2007 CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2008 lat_in_range, lon_in_range,
2009 lat_value, lon_value);
2010 if (!format_correct) {
2011 // may have comma and then altitude, so just get lat_lon component */
2012 size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
2013 if (pos != NPOS) {
2014 lat_lon = lat_lon.substr(0, pos);
2015 CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2016 lat_in_range, lon_in_range,
2017 lat_value, lon_value);
2018 }
2019 }
2020
2021 // reality checks
2022 if (!format_correct || !lat_in_range || !lon_in_range) {
2023 // incorrect lat_lon format should be reported elsewhere
2024 // incorrect latitude range should be reported elsewhere
2025 // incorrect longitude range should be reported elsewhere
2026 return kEmptyStr;
2027 }
2028
2029 // get rid of comments after semicolon or comma in country name
2030 size_t pos = NStr::Find(countryname, ";");
2031 if (pos != NPOS) {
2032 countryname = countryname.substr(0, pos);
2033 }
2034 pos = NStr::Find(countryname, ",");
2035 if (pos != NPOS) {
2036 countryname = countryname.substr(0, pos);
2037 }
2038
2039 // adjust for special cases
2040 if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
2041 countryname = "Svalbard";
2042 }
2043
2044 string country = countryname;
2045 string province;
2046 pos = NStr::Find(country, ":");
2047 if (pos != NPOS) {
2048 // is the full string in the list?
2049 if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
2050 province = country.substr(pos + 1);
2051 NStr::TruncateSpacesInPlace(province, NStr::eTrunc_Both);
2052 }
2053 country = country.substr(0, pos);
2054 NStr::TruncateSpacesInPlace(country, NStr::eTrunc_Both);
2055 }
2056 if (NStr::IsBlank(country)) {
2057 return kEmptyStr;
2058 }
2059
2060 // known exceptions - don't even bother calculating any further
2061 if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
2062 return kEmptyStr;
2063 }
2064
2065 if (! NStr::IsBlank(province)) {
2066 // do not attempt quick exit
2067 } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
2068 if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2069 return kEmptyStr;
2070 }
2071 } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
2072 if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2073 return kEmptyStr;
2074 }
2075 } else if (NStr::EqualNocase (country, "State of Palestine")) {
2076 } else {
2077 // report unrecognized country
2078 return kEmptyStr;
2079 }
2080
2081 CLatLonCountryId *id = x_CalculateLatLonId(lat_value, lon_value, country, province);
2082 CLatLonCountryId::TClassificationFlags flags = (id == NULL ? 0 : id->Classify(country, province));
2083
2084 string wguess = id->GetGuessWater();
2085 string cguess = id->GetGuessCountry();
2086
2087 // special case where subsection of country has been identified but is not in coordinates of country
2088 // VR-840
2089 if (province.empty() && NStr::Equal(cguess, country)) {
2090 delete id;
2091 return kEmptyStr;
2092 }
2093
2094 if (NStr::EqualNocase (country, "State of Palestine") &&
2095 (NStr::EqualNocase (cguess, "Gaza Strip") ||
2096 NStr::EqualNocase (cguess, "West Bank"))) {
2097 delete id;
2098 return kEmptyStr;
2099 }
2100
2101 if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
2102 string parent = x_FindSurroundingOcean (wguess);
2103 if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
2104 delete id;
2105 return kEmptyStr;
2106 }
2107 }
2108
2109 double neardist = 0.0;
2110 CLatLonCountryMap::TLatLonAdjustFlags adjustment = CLatLonCountryMap::fNone;
2111 CLatLonCountryId::TClassificationFlags adjusted_flags = 0;
2112
2113 if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
2114 id->SetGuessCountry (country);
2115 id->SetGuessProvince (kEmptyStr);
2116 flags = id->Classify(country, province);
2117 }
2118
2119 if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
2120 && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
2121 /* do not flip from water */
2122 CLatLonCountryId *adjust_id = x_CalculateLatLonId(lon_value, lat_value, country, province);
2123 adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2124 if (adjusted_flags) {
2125 string awguess = adjust_id->GetGuessWater();
2126 string acguess = adjust_id->GetGuessCountry();
2127 if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2128 delete id;
2129 id = adjust_id;
2130 flags = adjusted_flags;
2131 adjustment = CLatLonCountryMap::fFlip;
2132 }
2133 } else {
2134 if (adjust_id) {
2135 delete adjust_id;
2136 }
2137 adjust_id = x_CalculateLatLonId(-lat_value, lon_value, country, province);
2138 adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2139 if (adjusted_flags) {
2140 string awguess = adjust_id->GetGuessWater();
2141 string acguess = adjust_id->GetGuessCountry();
2142 if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2143 delete id;
2144 id = adjust_id;
2145 flags = adjusted_flags;
2146 adjustment = CLatLonCountryMap::fNegateLat;
2147 }
2148 } else {
2149 if (adjust_id) {
2150 delete adjust_id;
2151 }
2152 adjust_id = x_CalculateLatLonId(lat_value, -lon_value, country, province);
2153 adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2154 if (adjusted_flags) {
2155 string awguess = adjust_id->GetGuessWater();
2156 string acguess = adjust_id->GetGuessCountry();
2157 if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2158 delete id;
2159 id = adjust_id;
2160 flags = adjusted_flags;
2161 adjustment = CLatLonCountryMap::fNegateLon;
2162 }
2163 } else {
2164 if (adjust_id) {
2165 delete adjust_id;
2166 }
2167 }
2168 }
2169 }
2170 }
2171
2172 string error;
2173
2174 if (adjustment != CLatLonCountryMap::fNone) {
2175 if (adjustment == CLatLonCountryMap::fFlip) {
2176 errcode = eLatLonCountryErr_Value;
2177 error = "Latitude and longitude values appear to be exchanged";
2178 lat_lon = MakeLatLon(lon_value, lat_value);
2179 } else if (adjustment == CLatLonCountryMap::fNegateLat) {
2180 errcode = eLatLonCountryErr_Value;
2181 if (lat_value < 0.0) {
2182 error = "Latitude should be set to N (northern hemisphere)";
2183 } else {
2184 error = "Latitude should be set to S (southern hemisphere)";
2185 }
2186 lat_lon = MakeLatLon(-lat_value, lon_value);
2187 } else if (adjustment == CLatLonCountryMap::fNegateLon) {
2188 errcode = eLatLonCountryErr_Value;
2189 if (lon_value < 0.0) {
2190 error = "Longitude should be set to E (eastern hemisphere)";
2191 } else {
2192 error = "Longitude should be set to W (western hemisphere)";
2193 }
2194 lat_lon = MakeLatLon(lat_value, -lon_value);
2195 }
2196 } else if ((flags & CLatLonCountryId::fCountryMatch) && (flags & CLatLonCountryId::fProvinceMatch)) {
2197 // success! nothing to report
2198 } else if (flags & CLatLonCountryId::fWaterMatch) {
2199 // success! nothing to report
2200 } else if (flags & CLatLonCountryId::fCountryMatch && NStr::IsBlank(province)) {
2201 if (check_state) {
2202 string full_guess = id->GetFullGuess();
2203 if (!NStr::Equal(full_guess, country)) {
2204 errcode = eLatLonCountryErr_State;
2205 error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
2206 + " (more specific than " + country + ")";
2207 }
2208 }
2209 } else if (!NStr::IsBlank(id->GetGuessWater())) {
2210 if (flags & (CLatLonCountryId::fCountryClosest | CLatLonCountryId::fProvinceClosest)) {
2211 bool suppress = false;
2212 string reportregion;
2213 string nosubphrase;
2214 string desphrase = "designated subregion ";
2215 string subphrase = "another subregion ";
2216 string phrase = nosubphrase;
2217 bool show_claimed = false;
2218
2219 if (id->GetLandDistance() < 100) {
2220 // for now, will not report
2221 // this is a policy decision
2222 suppress = true;
2223 } else if (NStr::Find(countryname, "Island") != NPOS) {
2224 suppress = true;
2225 }
2226
2227
2228 if (flags & CLatLonCountryId::fProvinceClosest) {
2229 reportregion = countryname;
2230 phrase = desphrase;
2231 } else {
2232 // wasn't closest province, so must be closest country
2233 if (!NStr::IsBlank(province) && check_state) {
2234 phrase = subphrase;
2235 reportregion = id->GetClosestFull();
2236 } else {
2237 reportregion = id->GetClosestCountry();
2238 }
2239 if (!NStr::IsBlank(id->GetClaimedFull())) {
2240 show_claimed = true;
2241 }
2242 }
2243 string water = id->GetGuessWater();
2244 if (NStr::EqualNocase (water, "Red Sea") &&
2245 (NStr::EqualNocase (reportregion, "Egypt") ||
2246 NStr::EqualNocase (reportregion, "Saudi Arabia") ||
2247 NStr::EqualNocase (reportregion, "Sudan") ||
2248 NStr::EqualNocase (reportregion, "Eritrea") ||
2249 NStr::EqualNocase (reportregion, "Dijibouti") ||
2250 NStr::EqualNocase (reportregion, "Yemen") ||
2251 NStr::EqualNocase (reportregion, "Israel") ||
2252 NStr::EqualNocase (reportregion, "Jordan"))) {
2253 } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
2254 (NStr::EqualNocase (reportregion, "USA") ||
2255 NStr::EqualNocase (reportregion, "Mexico"))) {
2256 } else if (!suppress) {
2257 errcode = eLatLonCountryErr_Water;
2258 if (show_claimed) {
2259 error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
2260 + NStr::IntToString(id->GetLandDistance())
2261 + " km, but in water '" + id->GetGuessWater()
2262 + "' - claimed region '" + id->GetClaimedFull()
2263 + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2264 } else {
2265 error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
2266 + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
2267 + id->GetGuessWater() + "'";
2268 }
2269 }
2270 } else if (neardist > 0.0) {
2271 errcode = eLatLonCountryErr_Water;
2272 error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
2273 + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
2274 } else {
2275 errcode = eLatLonCountryErr_Water;
2276 error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
2277 }
2278 } else if (!NStr::IsBlank(id->GetGuessCountry())) {
2279 string full_guess = id->GetFullGuess();
2280 if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
2281 // skip
2282 } else if (NStr::IsBlank(id->GetClaimedFull())) {
2283 if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
2284 errcode = eLatLonCountryErr_State;
2285 } else {
2286 errcode = eLatLonCountryErr_Country;
2287 }
2288 error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2289 + countryname + "'";
2290 } else {
2291 if (NStr::IsBlank(province)) {
2292 errcode = eLatLonCountryErr_Country;
2293 error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2294 + country + "' - claimed region '" + id->GetClaimedFull()
2295 + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2296 } else {
2297 errcode = eLatLonCountryErr_Country;
2298 if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
2299 errcode = eLatLonCountryErr_State;
2300 }
2301 if (errcode == eLatLonCountryErr_Country || check_state) {
2302 error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2303 + countryname + "' - claimed region '" + id->GetClaimedFull()
2304 + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2305 } else {
2306 errcode = eLatLonCountryErr_None;
2307 }
2308 }
2309 }
2310 } else if (!NStr::IsBlank(id->GetClosestCountry())) {
2311 errcode = eLatLonCountryErr_Country;
2312 error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
2313 + countryname + "'";
2314 } else if (!NStr::IsBlank(id->GetClosestWater())) {
2315 errcode = eLatLonCountryErr_Water;
2316 error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
2317 + countryname + "'";
2318 } else {
2319 errcode = eLatLonCountryErr_Country;
2320 error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
2321 }
2322
2323
2324 delete id;
2325 return error;
2326 }
2327
2328
// Single-word tokens accepted in a sex qualifier value.
// IsValidSexQualifierValue and FixSexQualifierValue lower-case their input
// and then search this table with an exact string comparison, so every entry
// here is lower case.  FixSexQualifierValue expands "m" and "f" to "male" and
// "female".
2329 const char* sm_ValidSexQualifierTokens[] = {
2330 "asexual",
2331 "bisexual",
2332 "diecious",
2333 "dioecious",
2334 "f",
2335 "female",
2336 "gelding",
2337 "hermaphrodite",
2338 "intersex",
2339 "m",
2340 "male",
2341 "mixed",
2342 "monecious",
2343 "monoecious",
2344 "neuter",
2345 "unisexual",
2346 };
2347
2348
// Whole phrases accepted as a sex qualifier value without being split into
// tokens; searched by s_IsValidSexQualifierPhrase with an exact comparison.
2349 const char* sm_ValidSexQualifierPhrases[] = {
2350 "pooled males and females",
2351 "pooled male and female",
2352 };
2353
2354
s_IsValidSexQualifierPhrase(const string & value)2355 bool s_IsValidSexQualifierPhrase(const string& value)
2356 {
2357 size_t max = sizeof(sm_ValidSexQualifierPhrases) / sizeof(const char*);
2358
2359 const char* *begin = sm_ValidSexQualifierPhrases;
2360 const char* *end = &(sm_ValidSexQualifierPhrases[max]);
2361
2362 if (find(begin, end, value) != end) {
2363 return true;
2364 } else {
2365 return false;
2366 }
2367 }
2368
2369
IsValidSexQualifierValue(const string & value)2370 bool CSubSource::IsValidSexQualifierValue (const string& value)
2371
2372 {
2373 string str = value;
2374 NStr::ToLower(str);
2375
2376 if (s_IsValidSexQualifierPhrase(str)) {
2377 return true;
2378 }
2379
2380 vector<string> words;
2381 NStr::Split(str, " ,/", words);
2382 if (words.size() == 0) {
2383 return false;
2384 }
2385
2386 size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
2387
2388 const char* *begin = sm_ValidSexQualifierTokens;
2389 const char* *end = &(sm_ValidSexQualifierTokens[max]);
2390
2391 bool is_good = false;
2392
2393 ITERATE(vector<string>, w, words) {
2394 if (NStr::Equal(*w, "and")) {
2395 // ok, skip it
2396 } else {
2397 if (find(begin, end, *w) != end) {
2398 is_good = true;
2399 } else {
2400 is_good = false;
2401 break;
2402 }
2403 }
2404 }
2405 return is_good;
2406 }
2407
2408
FixSexQualifierValue(const string & value)2409 string CSubSource::FixSexQualifierValue (const string& value)
2410 {
2411 string str = value;
2412 NStr::ToLower(str);
2413
2414 if (s_IsValidSexQualifierPhrase(str)) {
2415 return str;
2416 }
2417
2418 vector<string> words;
2419 NStr::Split(str, " ,/", words);
2420
2421 if (words.size() == 0) {
2422 return kEmptyStr;
2423 }
2424 size_t max = ArraySize(sm_ValidSexQualifierTokens);
2425
2426 const char* *begin = sm_ValidSexQualifierTokens;
2427 const char* *end = &(sm_ValidSexQualifierTokens[max]);
2428
2429 vector<string> good_values;
2430 bool pooled = false;
2431
2432 ITERATE(vector<string>, w, words) {
2433 if (NStr::Equal(*w, "and")) {
2434 // ok, skip it
2435 } else if (NStr::EqualNocase(*w, "(pooled)") || NStr::EqualNocase(*w, "pooled")) {
2436 // set pooled flag
2437 pooled = true;
2438 } else {
2439 if (find(begin, end, *w) != end) {
2440 if (NStr::Equal(*w, "m")) {
2441 good_values.push_back("male");
2442 } else if (NStr::Equal(*w, "f")) {
2443 good_values.push_back("female");
2444 } else {
2445 good_values.push_back(*w);
2446 }
2447 } else {
2448 // if any bad values, can't autofix
2449 return kEmptyStr;
2450 }
2451 }
2452 }
2453 if (good_values.size() == 0) {
2454 // no good tokens, can't autofix
2455 return kEmptyStr;
2456 }
2457
2458 string fixed = good_values[0];
2459 for (size_t i = 1; i < good_values.size(); i++) {
2460 if (good_values.size() > 2) {
2461 fixed += ",";
2462 }
2463 if (i == good_values.size() - 1) {
2464 fixed += " and";
2465 }
2466 fixed += " " + good_values[i];
2467 }
2468 if (pooled) {
2469 fixed = "pooled " + fixed;
2470 }
2471 return fixed;
2472 }
2473
2474
// Split a value of the form "<number> <units>" into its two parts.
//
// value  - text to parse.
// number - out.  Optional leading '+' or '-', digits (the first comma among
//          the digits is dropped, a second comma ends the digits), and an
//          optional '.' followed by digits.  Left empty when the value does
//          not have this shape: no digit at all, nothing after the number,
//          or the character after the number is not a single space.
// units  - out.  Everything after the space that follows the number; empty
//          whenever number is empty.
void s_CollectNumberAndUnits(const string& value, string& number, string& units)
{
    number.clear();
    units.clear();

    // Only the empty string needs an explicit guard: a value made purely of
    // whitespace contains no digit and is rejected by the checks below.
    if (value.empty()) {
        return;
    }

    // <cctype> classification functions require a value representable as
    // unsigned char; a plain (possibly negative) char is undefined behaviour.
    auto is_digit = [](char ch) {
        return isdigit(static_cast<unsigned char>(ch)) != 0;
    };

    string::const_iterator it = value.begin();
    const string::const_iterator end = value.end();

    if (*it == '+' || *it == '-') {
        number += *it;
        ++it;
    }

    bool any_digit = false;
    bool comma_allowed = true;
    for (; it != end; ++it) {
        if (*it == ',') {
            if (!comma_allowed) {
                break;
            }
            // only skip the first comma
            comma_allowed = false;
        } else if (is_digit(*it)) {
            any_digit = true;
            number += *it;
        } else {
            break;
        }
    }

    if (it == end) {
        // number with nothing after it: no units, so not usable
        number.clear();
        return;
    }

    if (*it == '.') {
        number += *it;
        ++it;
        while (it != end && is_digit(*it)) {
            any_digit = true;
            number += *it;
            ++it;
        }
    }

    if (it == end || *it != ' ' || !any_digit) {
        number.clear();
        return;
    }

    // skip the separating space; the rest is the units text
    ++it;
    units.assign(it, end);
}
2533
2534
IsAltitudeValid(const string & value)2535 bool CSubSource::IsAltitudeValid (const string& value)
2536 {
2537 if (NStr::IsBlank(value)) {
2538 return false;
2539 }
2540
2541 string number;
2542 string units;
2543 s_CollectNumberAndUnits(value, number, units);
2544 if (NStr::IsBlank(number) || !NStr::EqualCase(units, "m")) {
2545 return false;
2546 } else {
2547 return true;
2548 }
2549
2550 }
2551
2552
x_GetPrecision(const string & num_str)2553 int CSubSource::x_GetPrecision(const string& num_str)
2554 {
2555 int precision = 0;
2556 size_t pos = NStr::Find(num_str, ".");
2557 if (pos != NPOS) {
2558 precision = int(num_str.length() - pos - 1);
2559 }
2560 return precision;
2561 }
2562
2563
x_FormatWithPrecision(double val,int precision)2564 string CSubSource::x_FormatWithPrecision(double val, int precision)
2565 {
2566 char reformatted[1000];
2567 sprintf(reformatted, "%.*lf", precision, val);
2568 string rval = reformatted;
2569 return rval;
2570 }
2571
FixAltitude(const string & value)2572 string CSubSource::FixAltitude (const string& value)
2573 {
2574 if (NStr::IsBlank(value)) {
2575 return kEmptyStr;
2576 }
2577
2578 string number;
2579 string units;
2580 s_CollectNumberAndUnits(value, number, units);
2581 if (NStr::IsBlank(number)) {
2582 return kEmptyStr;
2583 } else if (NStr::Equal(units, "ft.") || NStr::Equal(units, "ft") || NStr::Equal(units, "feet") || NStr::Equal(units, "foot")) {
2584 int precision = x_GetPrecision(number);
2585 double val = NStr::StringToDouble(number);
2586 val *= 0.3048;
2587 number = x_FormatWithPrecision(val, precision);
2588 units = "m";
2589 }
2590
2591 string rval = kEmptyStr;
2592 if (NStr::Equal(units, "m.")
2593 || NStr::Equal(units, "meters")
2594 || NStr::Equal(units, "meter")
2595 || NStr::Equal(units, "m")) {
2596
2597 rval = number + " " + "m";
2598 }
2599 return rval;
2600 }
2601
2602
2603 // From VR-793:
2604 // A. For segment, endogenous_virus_name:
2605 // 1. Must begin with a letter or number
2606 // 2. Spaces and other printable characters are permitted
2607 // 3. Must not be empty, must not be longer than 240 characters
2608
x_GenericRepliconNameValid(const string & value)2609 bool CSubSource::x_GenericRepliconNameValid(const string& value)
2610 {
2611 if (NStr::IsBlank(value)) {
2612 return false;
2613 } else if (!isalnum(value.c_str()[0])) {
2614 return false;
2615 } else if (value.length() > 240) {
2616 return false;
2617 }
2618
2619 for (auto it : value) {
2620 if (!isprint(it)) {
2621 return false;
2622 }
2623 }
2624
2625 return true;
2626 }
2627
2628
IsSegmentValid(const string & value)2629 bool CSubSource::IsSegmentValid(const string& value)
2630 {
2631 return x_GenericRepliconNameValid(value);
2632 }
2633
2634
IsEndogenousVirusNameValid(const string & value)2635 bool CSubSource::IsEndogenousVirusNameValid(const string& value)
2636 {
2637 return x_GenericRepliconNameValid(value);
2638 }
2639
2640
2641 // From VR-793:
2642 // B. For chromosome, linkage_group and plasmid_name values:
2643 // 4. Must begin with a letter or number
2644 // 5. Must not be empty, must not be longer than 32 characters
2645 // 6. Must not contain <tab>
2646 // 7. Spaces and other printable characters are permitted
2647 // 8. Must not contain the word "plasmid" (ignoring case)
2648 // 9. Must not contain the word "chromosome" (ignoring case)
2649 // 10. Must not contain the phrase "linkage group" (ignoring case)
2650 // 11. Must not contain the series of letters "chr" (ignoring case)
2651 // 12. Must not contain the taxname (ignoring case)
2652 // 14. Must not contain the genus (ignoring case)
2653 // 15. Must not contain the species (ignoring case)
2654 // except allow the species to match the value after an initial 'p' (e.g., JX416328)
2655 // 16. Must not contain the series of letters "chrm" (ignoring case)
2656 // 17. Must not contain the series of letters "chrom" (ignoring case)
2657 // 18. Must not contain the phrase "linkage-group" (ignoring case)
2658
x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string & value,const string & taxname)2659 bool CSubSource::x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string& value, const string& taxname)
2660 {
2661 if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
2662 if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
2663 return true;
2664 }
2665 }
2666 if (!x_GenericRepliconNameValid(value)) {
2667 // checks for isalnum start, blankness and unprintable characters
2668 // B.4, B.5, B.7
2669 return false;
2670 } else if (value.length() > 32) {
2671 // B.5
2672 return false;
2673 }
2674 if (!NStr::IsBlank(taxname)) {
2675 if (NStr::FindNoCase(value, taxname) != NPOS) {
2676 // B.12
2677 return false;
2678 }
2679 size_t pos = NStr::Find(taxname, " ");
2680 if (pos != NPOS) {
2681 string genus = taxname.substr(0, pos);
2682 if (NStr::FindNoCase(value, genus) != NPOS) {
2683 // B.14
2684 return false;
2685 }
2686 string species = taxname.substr(pos + 1);
2687 pos = NStr::FindNoCase(value, species);
2688 if (pos != NPOS) {
2689 if (pos != 1 || value[0] != 'p') {
2690 // B.15
2691 return false;
2692 }
2693 }
2694 }
2695 }
2696 static string s_ForbiddenPhrases[] = {
2697 "\t", // B.6.
2698 "plasmid", // B.8
2699 "chromosome", // B.9
2700 "linkage group", // B.10
2701 "chr", // B.11
2702 "linkage_group", // B.15
2703 "chrm", // B.16
2704 "chrom", // B.17
2705 "linkage-group" // B.18
2706 };
2707
2708 for (auto it : s_ForbiddenPhrases) {
2709 if (NStr::FindNoCase(value, it) != NPOS) {
2710 return false;
2711 }
2712 }
2713 return true;
2714 }
2715
2716
IsChromosomeNameValid(const string & value,const string & taxname)2717 bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
2718 {
2719 if (NStr::IsBlank(value)) {
2720 return false;
2721 }
2722 if (NStr::StartsWith(value, "LG", NStr::eNocase)) {
2723 return false;
2724 } else {
2725 return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
2726 }
2727 }
2728
2729
IsLinkageGroupNameValid(const string & value,const string & taxname)2730 bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
2731 {
2732 if (NStr::IsBlank(value)) {
2733 return false;
2734 }
2735 return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(value, taxname);
2736 }
2737
2738
2739 // VR-793
2740 // C. For plasmid_name values:
2741 // 19. Exception- megaplasmid is legal
IsPlasmidNameValid(const string & value,const string & taxname)2742 bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
2743 {
2744 if (NStr::IsBlank(value)) {
2745 return false;
2746 }
2747 if (NStr::Equal(value, "megaplasmid")) {
2748 return true;
2749 }
2750 if (NStr::StartsWith(value, "megaplasmid ") && value.length() > 12 && NStr::Find(value.substr(12), " ") == NPOS) {
2751 return true;
2752 }
2753 if (NStr::Equal(value, "F") || NStr::Equal(value, "F factor") || NStr::Equal(value, "F plasmid")) {
2754 return true;
2755 }
2756 if (NStr::Equal(value, "Plasmid R") || NStr::Equal(value, "plasmid R") ||
2757 NStr::Equal(value, "Plasmid F") || NStr::Equal(value, "plasmid F")) {
2758 return true;
2759 }
2760 string val = value;
2761 string tax = taxname;
2762 if (NStr::StartsWith(value, "Plasmid ") || NStr::StartsWith(value, "plasmid ")) {
2763 val = value.substr(8, value.length());
2764 }
2765 if (NStr::StartsWith(taxname, "Plasmid ") || NStr::StartsWith(taxname, "plasmid ")) {
2766 tax = taxname.substr(8, taxname.length());
2767 }
2768 if (NStr::StartsWith(tax, val)) {
2769 if (NStr::Equal(tax, taxname) && NStr::Equal(val, value)) {
2770 return false;
2771 }
2772 return true;
2773 }
2774 return x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(val, tax);
2775 }
2776
2777
// Cell-line contamination lookup data.
//   TContaminatingCellLine    - (contaminating cell line, its source), filled
//                               from the 3rd and 4th columns of a table entry
//                               and reported by CSubSource::CheckCellLine as
//                               "contaminated by <first> from <second>".
//   TSpeciesContaminant       - keyed by the 2nd column (organism).
//   TCellLineContaminationMap - keyed by the upper-cased 1st column
//                               (cell line name).
2778 typedef pair<string, string> TContaminatingCellLine;
2779 typedef map<string, TContaminatingCellLine> TSpeciesContaminant;
2780 typedef map<string, TSpeciesContaminant> TCellLineContaminationMap;
2781
// The map is filled once by s_InitializeCellLineContaminationMap, which holds
// s_CellLineContaminationMutex while it checks and sets the initialized flag.
2782 static TCellLineContaminationMap s_CellLineContaminationMap;
2783 static bool s_CellLineContaminationMapInitialized = false;
2784 DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);
2785
2786 #include "cell_line.inc"
2787
s_ProcessCellLineLine(const CTempString & line)2788 static void s_ProcessCellLineLine(const CTempString& line)
2789 {
2790 vector<string> tokens;
2791 NStr::Split(line, "\t", tokens);
2792 if (tokens.size() < 4) {
2793 ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
2794 << "; disregarding");
2795 } else {
2796 NStr::ToUpper(tokens[0]);
2797 (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
2798 }
2799 }
2800
2801
s_InitializeCellLineContaminationMap(void)2802 static void s_InitializeCellLineContaminationMap(void)
2803 {
2804 CFastMutexGuard GUARD(s_CellLineContaminationMutex);
2805 if (s_CellLineContaminationMapInitialized) {
2806 return;
2807 }
2808
2809 // read table
2810
2811 size_t count = sizeof(kCellLine) / sizeof (*kCellLine);
2812 const char * const * start = kCellLine;
2813 while (count--) {
2814 s_ProcessCellLineLine(*start++);
2815 }
2816
2817
2818 s_CellLineContaminationMapInitialized = true;
2819 }
2820
2821
CheckCellLine(const string & cell_line,const string & organism)2822 string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
2823 {
2824 string rval;
2825
2826 s_InitializeCellLineContaminationMap();
2827 string cell_line_search = cell_line;
2828 NStr::ToUpper(cell_line_search);
2829
2830 if (!NStr::IsBlank(((s_CellLineContaminationMap[cell_line_search])[organism]).first)) {
2831 rval = "The International Cell Line Authentication Committee database indicates that " +
2832 cell_line + " from " + organism + " is known to be contaminated by " +
2833 ((s_CellLineContaminationMap[cell_line_search])[organism]).first +
2834 " from " + ((s_CellLineContaminationMap[cell_line_search])[organism]).second +
2835 ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
2836 }
2837 return rval;
2838 }
2839
2840
2841 // =============================================================================
2842 // Country Names
2843 // =============================================================================
2844
2845
2846 // legal country names, must be in alphabetical order (case sensitive)
// Backing array for s_CountriesSet, which looks names up with the
// case-sensitive PCase_CStr comparator.  The required order is therefore
// plain character order, in which upper-case letters sort before lower-case
// ones (e.g. "USA" precedes "Uganda").  Besides countries, the list contains
// oceans and seas (e.g. "Atlantic Ocean", "North Sea").
2847 static const char* const s_Countries[] = {
2848 "Afghanistan",
2849 "Albania",
2850 "Algeria",
2851 "American Samoa",
2852 "Andorra",
2853 "Angola",
2854 "Anguilla",
2855 "Antarctica",
2856 "Antigua and Barbuda",
2857 "Arctic Ocean",
2858 "Argentina",
2859 "Armenia",
2860 "Aruba",
2861 "Ashmore and Cartier Islands",
2862 "Atlantic Ocean",
2863 "Australia",
2864 "Austria",
2865 "Azerbaijan",
2866 "Bahamas",
2867 "Bahrain",
2868 "Baker Island",
2869 "Baltic Sea",
2870 "Bangladesh",
2871 "Barbados",
2872 "Bassas da India",
2873 "Belarus",
2874 "Belgium",
2875 "Belize",
2876 "Benin",
2877 "Bermuda",
2878 "Bhutan",
2879 "Bolivia",
2880 "Borneo",
2881 "Bosnia and Herzegovina",
2882 "Botswana",
2883 "Bouvet Island",
2884 "Brazil",
2885 "British Virgin Islands",
2886 "Brunei",
2887 "Bulgaria",
2888 "Burkina Faso",
2889 "Burundi",
2890 "Cambodia",
2891 "Cameroon",
2892 "Canada",
2893 "Cape Verde",
2894 "Cayman Islands",
2895 "Central African Republic",
2896 "Chad",
2897 "Chile",
2898 "China",
2899 "Christmas Island",
2900 "Clipperton Island",
2901 "Cocos Islands",
2902 "Colombia",
2903 "Comoros",
2904 "Cook Islands",
2905 "Coral Sea Islands",
2906 "Costa Rica",
2907 "Cote d'Ivoire",
2908 "Croatia",
2909 "Cuba",
2910 "Curacao",
2911 "Cyprus",
2912 "Czech Republic",
2913 "Democratic Republic of the Congo",
2914 "Denmark",
2915 "Djibouti",
2916 "Dominica",
2917 "Dominican Republic",
2918 "Ecuador",
2919 "Egypt",
2920 "El Salvador",
2921 "Equatorial Guinea",
2922 "Eritrea",
2923 "Estonia",
2924 "Eswatini",
2925 "Ethiopia",
2926 "Europa Island",
2927 "Falkland Islands (Islas Malvinas)",
2928 "Faroe Islands",
2929 "Fiji",
2930 "Finland",
2931 "France",
2932 "French Guiana",
2933 "French Polynesia",
2934 "French Southern and Antarctic Lands",
2935 "Gabon",
2936 "Gambia",
2937 "Gaza Strip",
2938 "Georgia",
2939 "Germany",
2940 "Ghana",
2941 "Gibraltar",
2942 "Glorioso Islands",
2943 "Greece",
2944 "Greenland",
2945 "Grenada",
2946 "Guadeloupe",
2947 "Guam",
2948 "Guatemala",
2949 "Guernsey",
2950 "Guinea",
2951 "Guinea-Bissau",
2952 "Guyana",
2953 "Haiti",
2954 "Heard Island and McDonald Islands",
2955 "Honduras",
2956 "Hong Kong",
2957 "Howland Island",
2958 "Hungary",
2959 "Iceland",
2960 "India",
2961 "Indian Ocean",
2962 "Indonesia",
2963 "Iran",
2964 "Iraq",
2965 "Ireland",
2966 "Isle of Man",
2967 "Israel",
2968 "Italy",
2969 "Jamaica",
2970 "Jan Mayen",
2971 "Japan",
2972 "Jarvis Island",
2973 "Jersey",
2974 "Johnston Atoll",
2975 "Jordan",
2976 "Juan de Nova Island",
2977 "Kazakhstan",
2978 "Kenya",
2979 "Kerguelen Archipelago",
2980 "Kingman Reef",
2981 "Kiribati",
2982 "Kosovo",
2983 "Kuwait",
2984 "Kyrgyzstan",
2985 "Laos",
2986 "Latvia",
2987 "Lebanon",
2988 "Lesotho",
2989 "Liberia",
2990 "Libya",
2991 "Liechtenstein",
2992 "Line Islands",
2993 "Lithuania",
2994 "Luxembourg",
2995 "Macau",
2996 "Madagascar",
2997 "Malawi",
2998 "Malaysia",
2999 "Maldives",
3000 "Mali",
3001 "Malta",
3002 "Marshall Islands",
3003 "Martinique",
3004 "Mauritania",
3005 "Mauritius",
3006 "Mayotte",
3007 "Mediterranean Sea",
3008 "Mexico",
3009 "Micronesia",
3010 "Midway Islands",
3011 "Moldova",
3012 "Monaco",
3013 "Mongolia",
3014 "Montenegro",
3015 "Montserrat",
3016 "Morocco",
3017 "Mozambique",
3018 "Myanmar",
3019 "Namibia",
3020 "Nauru",
3021 "Navassa Island",
3022 "Nepal",
3023 "Netherlands",
3024 "New Caledonia",
3025 "New Zealand",
3026 "Nicaragua",
3027 "Niger",
3028 "Nigeria",
3029 "Niue",
3030 "Norfolk Island",
3031 "North Korea",
3032 "North Macedonia",
3033 "North Sea",
3034 "Northern Mariana Islands",
3035 "Norway",
3036 "Oman",
3037 "Pacific Ocean",
3038 "Pakistan",
3039 "Palau",
3040 "Palmyra Atoll",
3041 "Panama",
3042 "Papua New Guinea",
3043 "Paracel Islands",
3044 "Paraguay",
3045 "Peru",
3046 "Philippines",
3047 "Pitcairn Islands",
3048 "Poland",
3049 "Portugal",
3050 "Puerto Rico",
3051 "Qatar",
3052 "Republic of the Congo",
3053 "Reunion",
3054 "Romania",
3055 "Ross Sea",
3056 "Russia",
3057 "Rwanda",
3058 "Saint Barthelemy",
3059 "Saint Helena",
3060 "Saint Kitts and Nevis",
3061 "Saint Lucia",
3062 "Saint Martin",
3063 "Saint Pierre and Miquelon",
3064 "Saint Vincent and the Grenadines",
3065 "Samoa",
3066 "San Marino",
3067 "Sao Tome and Principe",
3068 "Saudi Arabia",
3069 "Senegal",
3070 "Serbia",
3071 "Seychelles",
3072 "Sierra Leone",
3073 "Singapore",
3074 "Sint Maarten",
3075 "Slovakia",
3076 "Slovenia",
3077 "Solomon Islands",
3078 "Somalia",
3079 "South Africa",
3080 "South Georgia and the South Sandwich Islands",
3081 "South Korea",
3082 "South Sudan",
3083 "Southern Ocean",
3084 "Spain",
3085 "Spratly Islands",
3086 "Sri Lanka",
3087 "State of Palestine",
3088 "Sudan",
3089 "Suriname",
3090 "Svalbard",
3091 "Sweden",
3092 "Switzerland",
3093 "Syria",
3094 "Taiwan",
3095 "Tajikistan",
3096 "Tanzania",
3097 "Tasman Sea",
3098 "Thailand",
3099 "Timor-Leste",
3100 "Togo",
3101 "Tokelau",
3102 "Tonga",
3103 "Trinidad and Tobago",
3104 "Tromelin Island",
3105 "Tunisia",
3106 "Turkey",
3107 "Turkmenistan",
3108 "Turks and Caicos Islands",
3109 "Tuvalu",
3110 "USA",
3111 "Uganda",
3112 "Ukraine",
3113 "United Arab Emirates",
3114 "United Kingdom",
3115 "Uruguay",
3116 "Uzbekistan",
3117 "Vanuatu",
3118 "Venezuela",
3119 "Viet Nam",
3120 "Virgin Islands",
3121 "Wake Island",
3122 "Wallis and Futuna",
3123 "West Bank",
3124 "Western Sahara",
3125 "Yemen",
3126 "Zambia",
3127 "Zimbabwe"
3128 };
// Set of C strings built directly over a static array and compared with
// PCase_CStr.
typedef CStaticArraySet<const char*, PCase_CStr> TCStrSet;
// Lookup set over s_Countries; __FILE__/__LINE__ are passed through to the
// CStaticArraySet constructor.
static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);
3131
// former legal country names, must be in alphabetical order (case sensitive)
// These names are accepted by IsValid() and are the only names accepted by
// WasValid().
static const char* const s_Former_Countries[] = {
    "Belgian Congo",
    "British Guiana",
    "Burma",
    "Czechoslovakia",
    "East Timor",
    "Korea",
    "Macedonia",
    "Netherlands Antilles",
    "Serbia and Montenegro",
    "Siam",
    "Swaziland",
    "The former Yugoslav Republic of Macedonia",
    "USSR",
    "Yugoslavia",
    "Zaire"
};
// Lookup set over s_Former_Countries.
static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);
3151
IsValid(const string & country)3152 bool CCountries::IsValid(const string& country)
3153 {
3154 string name = country;
3155 size_t pos = country.find(':');
3156
3157 if ( pos != NPOS ) {
3158 if (pos == country.length() - 1) {
3159 return false;
3160 }
3161 name = country.substr(0, pos);
3162 }
3163
3164 // try current countries
3165 if (s_CountriesSet.find(name.c_str()) != s_CountriesSet.end()) {
3166 return true;
3167 } else if (s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end()) {
3168 return true;
3169 } else {
3170 return false;
3171 }
3172 }
3173
3174
IsValid(const string & country,bool & is_miscapitalized)3175 bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
3176 {
3177 string name = country;
3178 size_t pos = country.find(':');
3179
3180 if ( pos != NPOS ) {
3181 name = country.substr(0, pos);
3182 if (pos == country.length() - 1) {
3183 return false;
3184 }
3185 }
3186
3187 is_miscapitalized = false;
3188 // try current countries
3189 // fast check for properly capitalized
3190 if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
3191 return true;
3192 }
3193 if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3194 return true;
3195 }
3196 // slow check for miscapitalized
3197 ITERATE ( TCStrSet, it, s_CountriesSet ) {
3198 if ( NStr::EqualNocase(name, *it) ) {
3199 is_miscapitalized = true;
3200 return true;
3201 }
3202 }
3203 ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
3204 if ( NStr::EqualNocase(name, *it) ) {
3205 is_miscapitalized = true;
3206 return true;
3207 }
3208 }
3209
3210 return false;
3211 }
3212
3213
WasValid(const string & country)3214 bool CCountries::WasValid(const string& country)
3215 {
3216 string name = country;
3217 size_t pos = country.find(':');
3218
3219 if ( pos != NPOS ) {
3220 name = country.substr(0, pos);
3221 }
3222
3223 // try formerly-valid countries
3224 return s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
3225 }
3226
3227
WasValid(const string & country,bool & is_miscapitalized)3228 bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
3229 {
3230 string name = country;
3231 size_t pos = country.find(':');
3232
3233 if ( pos != NPOS ) {
3234 name = country.substr(0, pos);
3235 }
3236
3237 is_miscapitalized = false;
3238 // try formerly-valid countries
3239 // fast check for properly capitalized
3240 if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3241 return true;
3242 }
3243 // slow check for miscapitalized
3244 ITERATE ( TCStrSet, it, s_Former_CountriesSet ) {
3245 if ( NStr::EqualNocase(name, *it) ) {
3246 is_miscapitalized = true;
3247 return true;
3248 }
3249 }
3250 return false;
3251 }
3252
/////////////////////////////////////////////////////////////////////////////
////// Country Capitalization Fix ///////////////////////////////////////////

// Whole-value replacements used by WholeCountryFix(): the input is
// lower-cased before the lookup, so the keys are written in lower case.
// The mapped value is the full "Country: region" replacement.
static const SStaticPair<const char*, const char*> s_map_whole_country_fixes[] =
{
    {"england", "United Kingdom: England"},
    {"great britain", "United Kingdom: Great Britain"},
    {"new jersey, usa", "USA: New Jersey"}
};
// Map from C-string key to C-string value, keys compared with PCase_CStr.
typedef CStaticPairArrayMap<const char*, const char*, PCase_CStr> TCStringPairsMap;
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_whole_country_fixes, s_map_whole_country_fixes);
3264
// Replacement table for country names: each key is a three-letter code
// (these look like ISO 3166-1 alpha-3 codes -- TODO confirm) or a common
// alternate spelling, and the value is the name it is replaced with.
// Keys are looked up with PCase_CStr (see TCStringPairsMap); the entries are
// listed in case-sensitive order, upper-case letters before lower-case.
//
// NOTE(review): some replacement values do not appear in the portion of
// s_Countries visible to this review nor in s_Former_Countries, e.g.
// "Lao People's Democratic Republic" (list has "Laos"),
// "Libyan Arab Jamahiriya" ("Libya"), "Macao" ("Macau"),
// "Saint Martin (French part)" ("Saint Martin"), "Pitcairn"
// ("Pitcairn Islands"), "Palestinian Territory" ("State of Palestine"),
// "Svalbard and Jan Mayen" ("Svalbard"), "Sint Maarten (Dutch part)"
// ("Sint Maarten"), "Syrian Arab Republic" ("Syria") and
// "United States Minor Outlying Islands" -- confirm whether these targets
// are intended.
static const SStaticPair<const char*, const char*> s_map_country_name_fixes[] = {
    {"ABW", "Aruba"},
    {"AFG", "Afghanistan"},
    {"AGO", "Angola"},
    {"AIA", "Anguilla"},
    {"ALA", "Aland Islands"},
    {"ALB", "Albania"},
    {"AND", "Andorra"},
    {"ARE", "United Arab Emirates"},
    {"ARG", "Argentina"},
    {"ARM", "Armenia"},
    {"ASM", "American Samoa"},
    {"ATA", "Antarctica"},
    {"ATF", "French Southern Territories"},
    {"ATG", "Antigua and Barbuda"},
    {"AUS", "Australia"},
    {"AUT", "Austria"},
    {"AZE", "Azerbaijan"},
    {"Antigua & Barbuda", "Antigua and Barbuda"},
    {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
    {"BDI", "Burundi"},
    {"BEL", "Belgium"},
    {"BEN", "Benin"},
    {"BES", "Bonaire, Sint Eustatius and Saba"},
    {"BFA", "Burkina Faso"},
    {"BGD", "Bangladesh"},
    {"BGR", "Bulgaria"},
    {"BHR", "Bahrain"},
    {"BHS", "Bahamas"},
    {"BIH", "Bosnia and Herzegovina"},
    {"BLM", "Saint Barthelemy"},
    {"BLR", "Belarus"},
    {"BLZ", "Belize"},
    {"BMU", "Bermuda"},
    {"BOL", "Bolivia"},
    {"BRA", "Brazil"},
    {"BRB", "Barbados"},
    {"BRN", "Brunei"},
    {"BTN", "Bhutan"},
    {"BVT", "Bouvet Island"},
    {"BWA", "Botswana"},
    {"Brasil", "Brazil"},
    {"CAF", "Central African Republic"},
    {"CAN", "Canada"},
    {"CCK", "Cocos Islands"},
    {"CHE", "Switzerland"},
    {"CHL", "Chile"},
    {"CHN", "China"},
    {"CIV", "Cote d'Ivoire"},
    {"CMR", "Cameroon"},
    {"COD", "Democratic Republic of the Congo"},
    {"COG", "Republic of the Congo"},
    {"COK", "Cook Islands"},
    {"COL", "Colombia"},
    {"COM", "Comoros"},
    {"CPV", "Cape Verde"},
    {"CRI", "Costa Rica"},
    {"CUB", "Cuba"},
    {"CUW", "Curacao"},
    {"CXR", "Christmas Island"},
    {"CYM", "Cayman Islands"},
    {"CYP", "Cyprus"},
    {"CZE", "Czech Republic"},
    {"Cape Verde Islands", "Cape Verde"},
    {"DEU", "Germany"},
    {"DJI", "Djibouti"},
    {"DMA", "Dominica"},
    {"DNK", "Denmark"},
    {"DOM", "Dominican Republic"},
    {"DZA", "Algeria"},
    {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
    {"ECU", "Ecuador"},
    {"EGY", "Egypt"},
    {"ERI", "Eritrea"},
    {"ESH", "Western Sahara"},
    {"ESP", "Spain"},
    {"EST", "Estonia"},
    {"ETH", "Ethiopia"},
    {"FIN", "Finland"},
    {"FJI", "Fiji"},
    {"FLK", "Falkland Islands (Islas Malvinas)"},
    {"FRA", "France"},
    {"FRO", "Faroe Islands"},
    {"FSM", "Micronesia"},
    {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
    {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
    {"GAB", "Gabon"},
    {"GBR", "United Kingdom"},
    {"GEO", "Georgia"},
    {"GGY", "Guernsey"},
    {"GHA", "Ghana"},
    {"GIB", "Gibraltar"},
    {"GIN", "Guinea"},
    {"GLP", "Guadeloupe"},
    {"GMB", "Gambia"},
    {"GNB", "Guinea-Bissau"},
    {"GNQ", "Equatorial Guinea"},
    {"GRC", "Greece"},
    {"GRD", "Grenada"},
    {"GRL", "Greenland"},
    {"GTM", "Guatemala"},
    {"GUF", "French Guiana"},
    {"GUM", "Guam"},
    {"GUY", "Guyana"},
    {"HKG", "Hong Kong"},
    {"HMD", "Heard Island and McDonald Islands"},
    {"HND", "Honduras"},
    {"HRV", "Croatia"},
    {"HTI", "Haiti"},
    {"HUN", "Hungary"},
    {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
    {"IDN", "Indonesia"},
    {"IMN", "Isle of Man"},
    {"IND", "India"},
    {"IOT", "British Indian Ocean Territory"},
    {"IRL", "Ireland"},
    {"IRN", "Iran"},
    {"IRQ", "Iraq"},
    {"ISL", "Iceland"},
    {"ISR", "Israel"},
    {"ITA", "Italy"},
    {"Ivory Coast", "Cote d'Ivoire"},
    {"JAM", "Jamaica"},
    {"JEY", "Jersey"},
    {"JOR", "Jordan"},
    {"JPN", "Japan"},
    {"KAZ", "Kazakhstan"},
    {"KEN", "Kenya"},
    {"KGZ", "Kyrgyzstan"},
    {"KHM", "Cambodia"},
    {"KIR", "Kiribati"},
    {"KNA", "Saint Kitts and Nevis"},
    {"KOR", "South Korea"},
    {"KWT", "Kuwait"},
    {"LAO", "Lao People's Democratic Republic"},
    {"LBN", "Lebanon"},
    {"LBR", "Liberia"},
    {"LBY", "Libyan Arab Jamahiriya"},
    {"LCA", "Saint Lucia"},
    {"LIE", "Liechtenstein"},
    {"LKA", "Sri Lanka"},
    {"LSO", "Lesotho"},
    {"LTU", "Lithuania"},
    {"LUX", "Luxembourg"},
    {"LVA", "Latvia"},
    {"La Reunion Island", "Reunion"},
    {"Luxemburg", "Luxembourg"},
    {"MAC", "Macao"},
    {"MAF", "Saint Martin (French part)"},
    {"MAR", "Morocco"},
    {"MCO", "Monaco"},
    {"MDA", "Moldova"},
    {"MDG", "Madagascar"},
    {"MDV", "Maldives"},
    {"MEX", "Mexico"},
    {"MHL", "Marshall Islands"},
    {"MKD", "North Macedonia"},
    {"MLI", "Mali"},
    {"MLT", "Malta"},
    {"MMR", "Myanmar"},
    {"MNE", "Montenegro"},
    {"MNG", "Mongolia"},
    {"MNP", "Northern Mariana Islands"},
    {"MOZ", "Mozambique"},
    {"MRT", "Mauritania"},
    {"MSR", "Montserrat"},
    {"MTQ", "Martinique"},
    {"MUS", "Mauritius"},
    {"MWI", "Malawi"},
    {"MYS", "Malaysia"},
    {"MYT", "Mayotte"},
    {"Macedonia", "North Macedonia"},
    {"NAM", "Namibia"},
    {"NCL", "New Caledonia"},
    {"NER", "Niger"},
    {"NFK", "Norfolk Island"},
    {"NGA", "Nigeria"},
    {"NIC", "Nicaragua"},
    {"NIU", "Niue"},
    {"NLD", "Netherlands"},
    {"NOR", "Norway"},
    {"NPL", "Nepal"},
    {"NRU", "Nauru"},
    {"NZL", "New Zealand"},
    {"Netherland", "Netherlands"},
    {"New Guinea", "Papua New Guinea"},
    {"OMN", "Oman"},
    {"P, R, China", "China"},
    {"P.R. China", "China"},
    {"P.R.China", "China"},
    {"PAK", "Pakistan"},
    {"PAN", "Panama"},
    {"PCN", "Pitcairn"},
    {"PER", "Peru"},
    {"PHL", "Philippines"},
    {"PLW", "Palau"},
    {"PNG", "Papua New Guinea"},
    {"POL", "Poland"},
    {"PRI", "Puerto Rico"},
    {"PRK", "North Korea"},
    {"PRT", "Portugal"},
    {"PRY", "Paraguay"},
    {"PSE", "Palestinian Territory"},
    {"PYF", "French Polynesia"},
    {"People's Republic of China", "China"},
    {"Pr China", "China"},
    {"Prchina", "China"},
    {"QAT", "Qatar"},
    {"REU", "Reunion"},
    {"ROU", "Romania"},
    {"RUS", "Russia"},
    {"RWA", "Rwanda"},
    {"Republic of Congo", "Republic of the Congo"},
    {"SAU", "Saudi Arabia"},
    {"SDN", "Sudan"},
    {"SEN", "Senegal"},
    {"SGP", "Singapore"},
    {"SGS", "South Georgia and the South Sandwich Islands"},
    {"SHN", "Saint Helena"},
    {"SJM", "Svalbard and Jan Mayen"},
    {"SLB", "Solomon Islands"},
    {"SLE", "Sierra Leone"},
    {"SLV", "El Salvador"},
    {"SMR", "San Marino"},
    {"SOM", "Somalia"},
    {"SPM", "Saint Pierre and Miquelon"},
    {"SRB", "Serbia"},
    {"SSD", "South Sudan"},
    {"STP", "Sao Tome and Principe"},
    {"SUR", "Suriname"},
    {"SVK", "Slovakia"},
    {"SVN", "Slovenia"},
    {"SWE", "Sweden"},
    {"SWZ", "Eswatini"},
    {"SXM", "Sint Maarten (Dutch part)"},
    {"SYC", "Seychelles"},
    {"SYR", "Syrian Arab Republic"},
    {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
    {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
    {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
    {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
    {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
    {"San Tome and Principe Island", "Sao Tome and Principe"},
    {"Sao Tome & Principe", "Sao Tome and Principe"},
    {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
    {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
    {"St Helena", "Saint Helena"},
    {"St Lucia", "Saint Lucia"},
    {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
    {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
    {"St. Helena", "Saint Helena"},
    {"St. Lucia", "Saint Lucia"},
    {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
    {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
    {"TCA", "Turks and Caicos Islands"},
    {"TCD", "Chad"},
    {"TGO", "Togo"},
    {"THA", "Thailand"},
    {"TJK", "Tajikistan"},
    {"TKL", "Tokelau"},
    {"TKM", "Turkmenistan"},
    {"TLS", "Timor-Leste"},
    {"TON", "Tonga"},
    {"TTO", "Trinidad and Tobago"},
    {"TUN", "Tunisia"},
    {"TUR", "Turkey"},
    {"TUV", "Tuvalu"},
    {"TWN", "Taiwan"},
    {"TZA", "Tanzania"},
    {"The Netherlands", "Netherlands"},
    {"Trinidad & Tobago", "Trinidad and Tobago"},
    {"Turks & Caicos", "Turks and Caicos Islands"},
    {"Turks & Caicos Islands", "Turks and Caicos Islands"},
    {"Turks and Caicos", "Turks and Caicos Islands"},
    {"U.S.A.", "USA"},
    {"UGA", "Uganda"},
    {"UK", "United Kingdom"},
    {"UKR", "Ukraine"},
    {"UMI", "United States Minor Outlying Islands"},
    {"URY", "Uruguay"},
    {"UZB", "Uzbekistan"},
    {"United States", "USA"},
    {"United States of America", "USA"},
    {"VAT", "Holy See (Vatican City State)"},
    {"VCT", "Saint Vincent and the Grenadines"},
    {"VEN", "Venezuela"},
    {"VGB", "British Virgin Islands"},
    {"VIR", "Virgin Islands"},
    {"VNM", "Viet Nam"},
    {"VUT", "Vanuatu"},
    {"Vietnam", "Viet Nam"},
    {"WLF", "Wallis and Futuna"},
    {"WSM", "Samoa"},
    {"YEM", "Yemen"},
    {"ZAF", "South Africa"},
    {"ZMB", "Zambia"},
    {"ZWE", "Zimbabwe"},
    {"the Netherlands", "Netherlands"}
};

// Static map view over the table above.
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_country_name_fixes, s_map_country_name_fixes);
3566
// for GP-24841
// Maps two former country names to a present-day name. Both keys also
// appear in s_Former_Countries.
static const SStaticPair<const char*, const char*> s_map_old_country_name_fixes[] = {
    {"Burma", "Myanmar"},
    {"Siam", "Thailand"}
};
// Static map view over the table above.
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_old_country_name_fixes, s_map_old_country_name_fixes);
3573
// for GB-7408
// Maps a name given on its own (an island, region or similar) to a full
// "Country: region" value. Keys are compared with PCase_CStr (see
// TCStringPairsMap) and are listed in case-sensitive order.
static const SStaticPair<const char*, const char*> s_map_subregion_fixes[] = {
    {"Antigua", "Antigua and Barbuda: Antigua"},
    {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
    {"Autonomous Region of the Azores", "Portugal: Azores"},
    {"Azores", "Portugal: Azores"},
    {"Barbuda", "Antigua and Barbuda: Barbuda"},
    {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
    {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
    {"Canary Islands", "Spain: Canary Islands"},
    {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
    {"East Germany", "Germany: East Germany"},
    {"El Hierro", "Spain: El Hierro"},
    {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
    {"Fuerteventura", "Spain: Fuerteventura"},
    {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
    {"Gran Canaria", "Spain: Gran Canaria"},
    {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
    {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
    {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
    {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
    {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
    {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
    {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
    {"La Gomera", "Spain: La Gomera"},
    {"La Graciosa", "Spain: La Graciosa"},
    {"La Palma", "Spain: La Palma"},
    {"Lanzarote", "Spain: Lanzarote"},
    {"Madeira", "Portugal: Madeira"},
    {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
    {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
    {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
    {"Nevis", "Saint Kitts and Nevis: Nevis"},
    {"Principe", "Sao Tome and Principe: Principe"},
    {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
    {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
    {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
    {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
    {"Scotland", "United Kingdom: Scotland"},
    {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
    {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
    {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
    {"St Thomas", "USA: Saint Thomas"},
    {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
    {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
    {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
    {"St. Thomas", "USA: Saint Thomas"},
    {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
    {"Tenerife", "Spain: Tenerife"},
    {"Tobago", "Trinidad and Tobago: Tobago"},
    {"Trinidad", "Trinidad and Tobago: Trinidad"},
    {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
    {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
    {"Wales", "United Kingdom: Wales"},
    {"West Germany", "Germany: West Germany"},

};
// Static map view over the table above.
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap,k_subregion_fixes, s_map_subregion_fixes);
3632
3633
// Names of US states plus "District of Columbia". WholeCountryFix() scans
// this array linearly with a case-insensitive comparison and, on a match,
// produces "USA: <name>" using the spelling stored here.
static const char* s_USAStates[] = {
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming"
};
3687
/// Upper-case the first character of every whitespace-separated word.
///
/// The phrase is split on space, tab, CR and LF, each non-empty piece whose
/// first character is alphabetic gets that character upper-cased, and the
/// pieces are re-joined with single spaces.
///
/// @param phrase  Text to process.
/// @return The re-joined text.
string CCountries::CapitalizeFirstLetterOfEveryWord (const string &phrase)
{
    vector<string> words;
    NStr::Split(phrase, " \t\r\n", words);
    for (string& word : words) {
        if (word.empty()) {
            continue;
        }
        // <cctype> functions require a value representable as unsigned char;
        // passing a negative plain char (bytes >= 0x80) is undefined.
        const unsigned char first = (unsigned char)word[0];
        if (isalpha(first)) {
            word[0] = (char)toupper(first);
        }
    }
    return NStr::Join(words," ");
}
3697
WholeCountryFix(string country)3698 string CCountries::WholeCountryFix(string country)
3699 {
3700 string new_country;
3701 TCStringPairsMap::const_iterator found = k_whole_country_fixes.find(NStr::ToLower(country).c_str());
3702 if (found != k_whole_country_fixes.end()) {
3703 new_country = found->second;
3704 return new_country;
3705 }
3706
3707 const size_t num_states = sizeof(s_USAStates) / sizeof(s_USAStates[0]);
3708 for (size_t i = 0; i < num_states; ++i) {
3709 if (NStr::EqualNocase(s_USAStates[i], country)) {
3710 new_country = "USA: " + CTempString(s_USAStates[i]);
3711 break;
3712 }
3713 }
3714
3715 return new_country;
3716 }
3717
IsSubstringOfStringInList(const string & phrase,const string & country1,size_t pos1)3718 bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
3719 {
3720 bool r = false;
3721 ITERATE ( TCStrSet, c, s_CountriesSet )
3722 {
3723 string country2(*c);
3724 if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
3725 {
3726 SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
3727 while (pos2 != NPOS)
3728 {
3729 if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
3730 r = true;
3731 pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
3732 }
3733 }
3734 }
3735 return r;
3736 }
3737
ContainsMultipleCountryNames(const string & phrase)3738 bool CCountries::ContainsMultipleCountryNames (const string &phrase)
3739 {
3740 int num_matches = 0;
3741 ITERATE ( TCStrSet, c, s_CountriesSet )
3742 {
3743 string country(*c);
3744 size_t pos = NStr::FindNoCase(phrase,country);
3745 while (pos != NPOS)
3746 {
3747 if (!((pos+country.length()<phrase.length() && isalpha(phrase[pos+country.length()]))
3748 || (pos > 0 && isalpha(phrase[pos-1]))
3749 || IsSubstringOfStringInList(phrase,country,pos)))
3750 num_matches++;
3751 pos = NStr::FindNoCase(phrase,country,pos+country.length());
3752 }
3753
3754 }
3755 return (num_matches > 1);
3756 }
3757
/// Return the listed spelling of a country name that differs from a
/// s_CountriesSet entry only by letter case.
///
/// @param country  Name to look up (case-insensitive comparison).
/// @return The matching entry of s_CountriesSet (the last one, if several
///         match), or country unchanged when nothing matches.
string CCountries::GetCorrectedCountryCapitalization(const string& country)
{
    const char* listed = nullptr;
    for (TCStrSet::const_iterator entry = s_CountriesSet.begin();
         entry != s_CountriesSet.end();  ++entry) {
        if (NStr::EqualNocase(country, *entry)) {
            listed = *entry;    // keep scanning: a later match overrides
        }
    }
    return listed ? string(listed) : country;
}
3768
3769
x_RemoveDelimitersFromEnds(string & val,bool except_paren)3770 void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
3771 {
3772 NStr::TruncateSpacesInPlace(val);
3773 bool any_found = true;
3774 while (!val.empty() && any_found) {
3775 any_found = false;
3776 if (NStr::StartsWith(val, ",")
3777 || NStr::StartsWith(val, ":")
3778 || NStr::StartsWith(val, ".")
3779 || (!except_paren && NStr::StartsWith(val, ")"))) {
3780 val = val.substr(1);
3781 any_found = true;
3782 NStr::TruncateSpacesInPlace(val);
3783 } else if (NStr::EndsWith(val, ",")
3784 || NStr::EndsWith(val, ":")
3785 || (!except_paren && NStr::EndsWith(val, "("))) {
3786 val = val.substr(0, val.length() - 1);
3787 any_found = true;
3788 NStr::TruncateSpacesInPlace(val);
3789 } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
3790 val = val.substr(0, val.length() - 4);
3791 any_found = true;
3792 } else if (NStr::EndsWith(val, ".")) {
3793 size_t len = val.length();
3794 if (len > 1 && isspace((unsigned char)val[len - 2])) {
3795 val = val.substr(0, val.length() - 1);
3796 any_found = true;
3797 NStr::TruncateSpacesInPlace(val);
3798 } else if (len > 5) {
3799 // make sure no spaces or punctuation within 4 characters before '.'
3800 bool do_remove = true;
3801 size_t pos = val.length() - 2;
3802 size_t dist = 0;
3803 while (dist < 4 && do_remove) {
3804 if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
3805 do_remove = false;
3806 }
3807 pos--;
3808 dist++;
3809 }
3810 if (do_remove) {
3811 val = val.substr(0, val.length() - 1);
3812 any_found = true;
3813 }
3814 }
3815 }
3816 }
3817 }
3818
3819
x_Tokenize(const string & val)3820 vector<string> CCountries::x_Tokenize(const string& val)
3821 {
3822 vector<string> tokens;
3823 NStr::Split(val, ",:()", tokens);
3824 // special tokenizing - if tokens contain periods but resulting token is at least four characters long
3825 vector<string>::iterator it = tokens.begin();
3826 while (it != tokens.end()) {
3827 size_t pos = NStr::Find(*it, ".");
3828 if (pos != NPOS && pos > 3 && (*it).length() - pos > 4) {
3829 string first = (*it).substr(0, pos);
3830 string remainder = (*it).substr(pos + 1);
3831 size_t space_pos = NStr::Find(first, " ");
3832 size_t len_to_space = first.length();
3833 while (space_pos != NPOS) {
3834 first = first.substr(space_pos + 1);
3835 len_to_space = first.length();
3836 space_pos = NStr::Find(first, " ");
3837 }
3838 if (len_to_space > 4) {
3839 (*it) = (*it).substr(0, pos);
3840 it = tokens.insert(it, remainder);
3841 } else {
3842 it++;
3843 }
3844 } else {
3845 it++;
3846 }
3847 }
3848 return tokens;
3849 }
3850
3851
s_ContainsWholeWord(const CTempString test,const CTempString word,NStr::ECase case_sense)3852 bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
3853 {
3854 size_t start = 0;
3855 size_t tlen = test.length();
3856 size_t wlen = word.length();
3857
3858 size_t pos = NStr::Find(test, word, case_sense);
3859 while (pos != NPOS) {
3860 size_t p = start + pos;
3861 if ( (p == 0 || !isalpha((unsigned char)test[p - 1])) &&
3862 (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
3863 return true;
3864 }
3865 start = p + 1;
3866 pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
3867 }
3868 return false;
3869 }
3870
3871
s_SuppressCountryFix(const string & test)3872 bool s_SuppressCountryFix(const string& test)
3873 {
3874 if (s_ContainsWholeWord(test, "Sea", NStr::eNocase)) {
3875 return true;
3876 } else if (s_ContainsWholeWord(test, "USSR", NStr::eNocase)) {
3877 return true;
3878 }
3879 return false;
3880 }
3881
3882
x_FindCountryName(const TCStringPairsMap & fix_map,const vector<string> & countries,string & valid_country,string & orig_valid_country,bool & too_many_countries,bool & bad_cap)3883 void CCountries::x_FindCountryName
3884 (const TCStringPairsMap& fix_map,
3885 const vector<string>& countries,
3886 string& valid_country,
3887 string& orig_valid_country,
3888 bool& too_many_countries,
3889 bool& bad_cap)
3890 {
3891 for (auto country : countries) {
3892 if (!country.empty() && !too_many_countries)
3893 {
3894 string check = country;
3895 NStr::TruncateSpacesInPlace(check);
3896 x_RemoveDelimitersFromEnds(check);
3897
3898 bool check_has_bad_cap = false;
3899 if (IsValid(check,check_has_bad_cap))
3900 {
3901 if (valid_country.empty())
3902 {
3903 valid_country = check;
3904 orig_valid_country = check;
3905 bad_cap = check_has_bad_cap;
3906 }
3907 else
3908 {
3909 too_many_countries = true;
3910 }
3911 }
3912 else // see if this is a fixable country
3913 {
3914 TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
3915 if (found != fix_map.end())
3916 {
3917 if (valid_country.empty())
3918 {
3919 valid_country = found->second;
3920 orig_valid_country = check;
3921 }
3922 else
3923 {
3924 too_many_countries = true;
3925 }
3926 }
3927 }
3928 }
3929 }
3930 }
3931
3932 // start of RW-1278
3933
/// Collapse every run of consecutive space characters (' ') into one space.
///
/// Only the space character is affected; tabs and other whitespace are left
/// alone. The string is rewritten in place, so no temporary buffer is
/// allocated and the whole string is processed regardless of its content.
///
/// @param val  String to compress; modified in place.
/// @return true if val was changed, false if it contained no run of two or
///         more spaces (or was empty).
bool s_CompressRunsOfSpaces(string& val)
{
    const size_t len = val.length();
    size_t out = 0;                 // next write position
    bool   prev_was_space = false;

    for (size_t in = 0;  in < len;  ++in) {
        const char ch = val[in];
        const bool is_space = (ch == ' ');
        if (is_space  &&  prev_was_space) {
            continue;               // drop the 2nd, 3rd, ... space of a run
        }
        val[out++] = ch;
        prev_was_space = is_space;
    }

    if (out == len) {
        return false;               // nothing was dropped
    }
    val.resize(out);
    return true;
}
3980
3981 typedef SStaticPair<const char*, const char*> TParishMapEntry;
3982 static const TParishMapEntry parish_abbrev_array[] = {
3983 { "Acadia Parish", "Acadia Parish" },
3984 { "AcadiaParish", "Acadia Parish" },
3985 { "Allen Parish", "Allen Parish" },
3986 { "AllenParish", "Allen Parish" },
3987 { "Ascension Parish", "Ascension Parish" },
3988 { "AscensionParish", "Ascension Parish" },
3989 { "Assumption Parish", "Assumption Parish" },
3990 { "AssumptionParish", "Assumption Parish" },
3991 { "Avoyelles Parish", "Avoyelles Parish" },
3992 { "AvoyellesParish", "Avoyelles Parish" },
3993 { "Beauregard Parish", "Beauregard Parish" },
3994 { "BeauregardParish", "Beauregard Parish" },
3995 { "Bienville Parish", "Bienville Parish" },
3996 { "BienvilleParish", "Bienville Parish" },
3997 { "Bossier Parish", "Bossier Parish" },
3998 { "BossierParish", "Bossier Parish" },
3999 { "Caddo Parish", "Caddo Parish" },
4000 { "CaddoParish", "Caddo Parish" },
4001 { "Calcasieu Parish", "Calcasieu Parish" },
4002 { "CalcasieuParish", "Calcasieu Parish" },
4003 { "Caldwell Parish", "Caldwell Parish" },
4004 { "CaldwellParish", "Caldwell Parish" },
4005 { "Cameron Parish", "Cameron Parish" },
4006 { "CameronParish", "Cameron Parish" },
4007 { "Catahoula Parish", "Catahoula Parish" },
4008 { "CatahoulaParish", "Catahoula Parish" },
4009 { "Claiborne Parish", "Claiborne Parish" },
4010 { "ClaiborneParish", "Claiborne Parish" },
4011 { "Concordia Parish", "Concordia Parish" },
4012 { "ConcordiaParish", "Concordia Parish" },
4013 { "DeSoto Parish", "DeSoto Parish" },
4014 { "DeSotoParish", "DeSoto Parish" },
4015 { "East Baton Rouge Parish", "East Baton Rouge Parish" },
4016 { "East Carroll Parish", "East Carroll Parish" },
4017 { "East Feliciana Parish", "East Feliciana Parish" },
4018 { "EastBatonRougeParish", "East Baton Rouge Parish" },
4019 { "EastCarrollParish", "East Carroll Parish" },
4020 { "EastFelicianaParish", "East Feliciana Parish" },
4021 { "Evangeline Parish", "Evangeline Parish" },
4022 { "EvangelineParish", "Evangeline Parish" },
4023 { "Franklin Parish", "Franklin Parish" },
4024 { "FranklinParish", "Franklin Parish" },
4025 { "Grant Parish", "Grant Parish" },
4026 { "GrantParish", "Grant Parish" },
4027 { "Iberia Parish", "Iberia Parish" },
4028 { "IberiaParish", "Iberia Parish" },
4029 { "Iberville Parish", "Iberville Parish" },
4030 { "IbervilleParish", "Iberville Parish" },
4031 { "Jackson Parish", "Jackson Parish" },
4032 { "JacksonParish", "Jackson Parish" },
4033 { "Jefferson Davis Parish", "Jefferson Davis Parish" },
4034 { "Jefferson Parish", "Jefferson Parish" },
4035 { "JeffersonDavisParish", "Jefferson Davis Parish" },
4036 { "JeffersonParish", "Jefferson Parish" },
4037 { "Lafayette Parish", "Lafayette Parish" },
4038 { "LafayetteParish", "Lafayette Parish" },
4039 { "Lafourche Parish", "Lafourche Parish" },
4040 { "LafourcheParish", "Lafourche Parish" },
4041 { "LaSalle Parish", "LaSalle Parish" },
4042 { "LaSalleParish", "LaSalle Parish" },
4043 { "Lincoln Parish", "Lincoln Parish" },
4044 { "LincolnParish", "Lincoln Parish" },
4045 { "Livingston Parish", "Livingston Parish" },
4046 { "LivingstonParish", "Livingston Parish" },
4047 { "Madison Parish", "Madison Parish" },
4048 { "MadisonParish", "Madison Parish" },
4049 { "Morehouse Parish", "Morehouse Parish" },
4050 { "MorehouseParish", "Morehouse Parish" },
4051 { "Natchitoches Parish", "Natchitoches Parish" },
4052 { "NatchitochesParish", "Natchitoches Parish" },
4053 { "Orleans Parish", "Orleans Parish" },
4054 { "OrleansParish", "Orleans Parish" },
4055 { "Ouachita Parish", "Ouachita Parish" },
4056 { "OuachitaParish", "Ouachita Parish" },
4057 { "Plaquemines Parish", "Plaquemines Parish" },
4058 { "PlaqueminesParish", "Plaquemines Parish" },
4059 { "Pointe Coupee Parish", "Pointe Coupee Parish" },
4060 { "PointeCoupeeParish", "Pointe Coupee Parish" },
4061 { "Rapides Parish", "Rapides Parish" },
4062 { "RapidesParish", "Rapides Parish" },
4063 { "Red River Parish", "Red River Parish" },
4064 { "RedRiverParish", "Red River Parish" },
4065 { "Richland Parish", "Richland Parish" },
4066 { "RichlandParish", "Richland Parish" },
4067 { "Sabine Parish", "Sabine Parish" },
4068 { "SabineParish", "Sabine Parish" },
4069 { "St. Bernard Parish", "St. Bernard Parish" },
4070 { "St. Charles Parish", "St. Charles Parish" },
4071 { "St. Helena Parish", "St. Helena Parish" },
4072 { "St. James Parish", "St. James Parish" },
4073 { "St. John the Baptist Parish", "St. John the Baptist Parish" },
4074 { "St. Landry Parish", "St. Landry Parish" },
4075 { "St. Martin Parish", "St. Martin Parish" },
4076 { "St. Mary Parish", "St. Mary Parish" },
4077 { "St. Tammany Parish", "St. Tammany Parish" },
4078 { "St.BernardParish", "St. Bernard Parish" },
4079 { "St.CharlesParish", "St. Charles Parish" },
4080 { "St.HelenaParish", "St. Helena Parish" },
4081 { "St.JamesParish", "St. James Parish" },
4082 { "St.JohntheBaptistParish", "St. John the Baptist Parish" },
4083 { "St.LandryParish", "St. Landry Parish" },
4084 { "St.MartinParish", "St. Martin Parish" },
4085 { "St.MaryParish", "St. Mary Parish" },
4086 { "St.TammanyParish", "St. Tammany Parish" },
4087 { "Tangipahoa Parish", "Tangipahoa Parish" },
4088 { "TangipahoaParish", "Tangipahoa Parish" },
4089 { "Tensas Parish", "Tensas Parish" },
4090 { "TensasParish", "Tensas Parish" },
4091 { "Terrebonne Parish", "Terrebonne Parish" },
4092 { "TerrebonneParish", "Terrebonne Parish" },
4093 { "Union Parish", "Union Parish" },
4094 { "UnionParish", "Union Parish" },
4095 { "Vermilion Parish", "Vermilion Parish" },
4096 { "VermilionParish", "Vermilion Parish" },
4097 { "Vernon Parish", "Vernon Parish" },
4098 { "VernonParish", "Vernon Parish" },
4099 { "Washington Parish", "Washington Parish" },
4100 { "WashingtonParish", "Washington Parish" },
4101 { "Webster Parish", "Webster Parish" },
4102 { "WebsterParish", "Webster Parish" },
4103 { "West Baton Rouge Parish", "West Baton Rouge Parish" },
4104 { "West Carroll Parish", "West Carroll Parish" },
4105 { "West Feliciana Parish", "West Feliciana Parish" },
4106 { "WestBatonRougeParish", "West Baton Rouge Parish" },
4107 { "WestCarrollParish", "West Carroll Parish" },
4108 { "WestFelicianaParish", "West Feliciana Parish" },
4109 { "Winn Parish", "Winn Parish" },
4110 { "WinnParish", "Winn Parish" }
4111 };
4112
// Static lookup map built over parish_abbrev_array and keyed with the
// PNocase_CStr comparator.  Each key is an accepted spelling of a parish
// (with or without embedded spaces); the value is the spelling that
// s_IsParish writes back to its argument.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TParishMap;
DEFINE_STATIC_ARRAY_MAP(TParishMap, parishAbbrevMap, parish_abbrev_array);
4115
s_IsParish(string & parish)4116 bool s_IsParish ( string& parish ) {
4117
4118 if ( parish.empty() ) {
4119 return false;
4120 }
4121
4122 TParishMap::const_iterator parish_find_iter = parishAbbrevMap.find(parish.c_str());
4123 if ( parish_find_iter != parishAbbrevMap.end() ) {
4124 // replace with full parish name
4125 parish = parish_find_iter->second;
4126 return true;
4127 }
4128
4129 return false;
4130 }
4131
// Table of accepted spellings for US states, the District of Columbia and
// the territories listed below.  The first member of each pair is the
// spelling to recognize (two-letter code or name); the second is the name
// that s_IsState writes back.
// NOTE(review): the entries appear to be kept in case-insensitive key order,
// which the static array map built below presumably requires - confirm
// before inserting new rows.
typedef SStaticPair<const char*, const char*> TStateMapEntry;
static const TStateMapEntry state_abbrev_array[] = {
    { "AK", "Alaska" },
    { "AL", "Alabama" },
    { "Alabama", "Alabama" },
    { "Alaska", "Alaska" },
    { "American Samoa", "American Samoa" },
    { "AR", "Arkansas" },
    { "Arizona", "Arizona" },
    { "Arkansas", "Arkansas" },
    { "AS", "American Samoa" },
    { "AZ", "Arizona" },
    { "CA", "California" },
    { "California", "California" },
    { "CO", "Colorado" },
    { "Colorado", "Colorado" },
    { "Connecticut", "Connecticut" },
    { "CT", "Connecticut" },
    { "DC", "District of Columbia" },
    { "DE", "Delaware" },
    { "Delaware", "Delaware" },
    { "District of Columbia", "District of Columbia" },
    { "FL", "Florida" },
    { "Florida", "Florida" },
    { "GA", "Georgia" },
    { "Georgia", "Georgia" },
    { "GU", "Guam" },
    { "Guam", "Guam" },
    { "Hawaii", "Hawaii" },
    { "HI", "Hawaii" },
    { "IA", "Iowa" },
    { "ID", "Idaho" },
    { "Idaho", "Idaho" },
    { "IL", "Illinois" },
    { "Illinois", "Illinois" },
    { "IN", "Indiana" },
    { "Indiana", "Indiana" },
    { "Iowa", "Iowa" },
    { "Kansas", "Kansas" },
    { "Kentucky", "Kentucky" },
    { "KS", "Kansas" },
    { "KY", "Kentucky" },
    { "LA", "Louisiana" },
    { "Louisiana", "Louisiana" },
    { "MA", "Massachusetts" },
    { "Maine", "Maine" },
    { "Maryland", "Maryland" },
    { "Massachusetts", "Massachusetts" },
    { "MD", "Maryland" },
    { "ME", "Maine" },
    { "MI", "Michigan" },
    { "Michigan", "Michigan" },
    { "Minnesota", "Minnesota" },
    { "Mississippi", "Mississippi" },
    { "Missouri", "Missouri" },
    { "MN", "Minnesota" },
    { "MO", "Missouri" },
    { "Montana", "Montana" },
    { "MS", "Mississippi" },
    { "MT", "Montana" },
    { "NC", "North Carolina" },
    { "ND", "North Dakota" },
    { "NE", "Nebraska" },
    { "Nebraska", "Nebraska" },
    { "Nevada", "Nevada" },
    { "New Hampshire", "New Hampshire" },
    { "New Jersey", "New Jersey" },
    { "New Mexico", "New Mexico" },
    { "New York", "New York" },
    { "NH", "New Hampshire" },
    { "NJ", "New Jersey" },
    { "NM", "New Mexico" },
    { "North Carolina", "North Carolina" },
    { "North Dakota", "North Dakota" },
    { "NV", "Nevada" },
    { "NY", "New York" },
    { "OH", "Ohio" },
    { "Ohio", "Ohio" },
    { "OK", "Oklahoma" },
    { "Oklahoma", "Oklahoma" },
    { "OR", "Oregon" },
    { "Oregon", "Oregon" },
    { "PA", "Pennsylvania" },
    { "Pennsylvania", "Pennsylvania" },
    { "PR", "Puerto Rico" },
    { "Puerto Rico", "Puerto Rico" },
    { "Rhode Island", "Rhode Island" },
    { "RI", "Rhode Island" },
    { "SC", "South Carolina" },
    { "SD", "South Dakota" },
    { "South Carolina", "South Carolina" },
    { "South Dakota", "South Dakota" },
    { "Tennessee", "Tennessee" },
    { "Texas", "Texas" },
    { "TN", "Tennessee" },
    { "TX", "Texas" },
    { "US Virgin Islands", "US Virgin Islands" },
    { "UT", "Utah" },
    { "Utah", "Utah" },
    { "VA", "Virginia" },
    { "Vermont", "Vermont" },
    { "VI", "US Virgin Islands" },
    { "Virgin Islands", "US Virgin Islands" },
    { "Virginia", "Virginia" },
    { "VT", "Vermont" },
    { "WA", "Washington" },
    { "Washington", "Washington" },
    { "West Virginia", "West Virginia" },
    { "WI", "Wisconsin" },
    { "Wisconsin", "Wisconsin" },
    { "WV", "West Virginia" },
    { "WY", "Wyoming" },
    { "Wyoming", "Wyoming" }
};

// Static lookup map over state_abbrev_array, keyed with the PNocase_CStr
// comparator; queried by s_IsState.
typedef CStaticPairArrayMap<const char *, const char *, PNocase_CStr> TStateMap;
DEFINE_STATIC_ARRAY_MAP(TStateMap, stateAbbrevMap, state_abbrev_array);
4249
s_IsState(string & state,bool & modified)4250 bool s_IsState ( string& state, bool& modified ) {
4251
4252 if ( state.empty() ) {
4253 return false;
4254 }
4255
4256 string original = state;
4257 string working = state;
4258
4259 if ( NStr::StartsWith ( working, "State of ", NStr::eNocase )) {
4260 NStr::TrimPrefixInPlace ( working, "State of ", NStr::eNocase );
4261 }
4262
4263 if ( NStr::StartsWith ( working, "Commonwealth of ", NStr::eNocase )) {
4264 NStr::TrimPrefixInPlace ( working, "Commonwealth of ", NStr::eNocase );
4265 }
4266
4267 if ( NStr::EndsWith ( working, " State", NStr::eNocase )) {
4268 NStr::TrimSuffixInPlace ( working, " State", NStr::eNocase );
4269 }
4270
4271 NStr::TruncateSpacesInPlace ( working );
4272
4273 TStateMap::const_iterator state_find_iter = stateAbbrevMap.find(working.c_str());
4274 if ( state_find_iter != stateAbbrevMap.end() ) {
4275 // replace with full state name
4276 state = state_find_iter->second;
4277 // report conversion from two-letter, changed capitalization, or prefix/suffix removal
4278 if ( ! NStr::Equal ( original, state )) {
4279 modified = true;
4280 }
4281 return true;
4282 }
4283
4284 return false;
4285 }
4286
s_DoUSAStateCleanup(string & country)4287 CCountries::EStateCleanup s_DoUSAStateCleanup ( string& country ) {
4288
4289 if ( country.empty() ) {
4290 return CCountries::e_NoResult;
4291 }
4292
4293 // make working copy
4294 string original = country;
4295 string working = country;
4296
4297 // remove flanking quotation marks - if CCountries::NewFixCountry not called
4298 if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
4299 working = working.substr ( 1, working.length() - 2 );
4300 }
4301
4302 // remove flanking spaces
4303 NStr::TruncateSpacesInPlace ( working );
4304
4305 // separate strings before and after colon
4306 string frst, scnd;
4307 NStr::SplitInTwo ( working, ":", frst, scnd );
4308
4309 NStr::TruncateSpacesInPlace ( frst );
4310 NStr::TruncateSpacesInPlace ( scnd );
4311
4312 // confirm that country is USA
4313 if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4314 // if not, first try rescuing US territory
4315 working = CCountries::NewFixCountry(working, true);
4316 NStr::SplitInTwo ( working, ":", frst, scnd );
4317 NStr::TruncateSpacesInPlace ( frst );
4318 NStr::TruncateSpacesInPlace ( scnd );
4319 if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4320 return CCountries::e_NotUSA;
4321 }
4322 }
4323
4324 // split state/county/city clauses at commas
4325 vector<string> components;
4326 NStr::Split(scnd, ",", components);
4327
4328 // check for only country
4329 if ( components.size() < 1 ) {
4330 country = "USA";
4331 return CCountries::e_Valid;
4332 }
4333
4334 for ( int j = 0; j < components.size(); j++ ) {
4335 // remove flanking spaces around components
4336 NStr::TruncateSpacesInPlace ( components[j] );
4337 s_CompressRunsOfSpaces ( components[j] );
4338 // clean up runon strings like EastBatonRougeParish
4339 if ( NStr::EndsWith ( components[j], "Parish", NStr::eNocase )) {
4340 s_IsParish( components[j] );
4341 }
4342 }
4343
4344 bool any_modified = false;
4345 int num_states = 0;
4346 int match = -1;
4347
4348 string* first = 0;
4349 string* last = 0;
4350
4351 // has multiple components
4352 int max = components.size() - 1;
4353 for ( int j = 0; j < components.size(); j++ ) {
4354 bool modified = false;
4355 if ( s_IsState ( components[j], modified )) {
4356 if (modified) {
4357 any_modified = true;
4358 }
4359 if ( match < 0 ) {
4360 // record position of first s_IsState match
4361 match = j;
4362 }
4363 // count successful matches
4364 num_states++;
4365 if ( j == 0 ) {
4366 first = &(components[j]);
4367 }
4368 if ( j == max ) {
4369 last = &(components[j]);
4370 }
4371 }
4372 }
4373
4374 // generate result
4375 string res;
4376 res.append ("USA: ");
4377 string pfx = "";
4378
4379 if ( match >= 0 ) {
4380 // move first state matched to first position
4381 res.append ( components[match] );
4382 pfx = ", ";
4383 }
4384
4385 for ( int j = 0; j < components.size(); j++ ) {
4386 if ( j == match) continue;
4387 res.append ( pfx );
4388 res.append ( components[j] );
4389 pfx = ", ";
4390 }
4391
4392 country = res;
4393
4394 if ( match < 0 ) {
4395 return CCountries::e_Missing;
4396 } else if ( num_states > 1 ) {
4397 return CCountries::e_Ambiguous;
4398 } else if ( ! NStr::Equal ( original, res )) {
4399 return CCountries::e_Corrected;
4400 }
4401
4402 return CCountries::e_Valid;
4403 }
4404
// Row reader type used by CCountries::ReadUSAExceptionMap to iterate over
// the rows of the exception file.
typedef CRowReader<CRowReaderStream_NCBI_TSV> TNCBITSVStream;

// File-scope exception table: filled by CCountries::LoadUSAExceptionMap and
// consulted by CCountries::USAStateCleanup after the algorithmic cleanup.
static CCountries::TUsaExceptionMap exception_map;
// Becomes true once LoadUSAExceptionMap has populated exception_map; while
// false, USAStateCleanup skips the exception lookup entirely.
static bool exceptions_initialized = false;
4409
ReadUSAExceptionMap(CCountries::TUsaExceptionMap & exceptions,const string & exception_file)4410 void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {
4411
4412 if ( ! exception_file.empty()) {
4413
4414 TNCBITSVStream my_stream (exception_file);
4415 for ( const auto & row : my_stream ) {
4416 TFieldNo number_of_fields = row. GetNumberOfFields();
4417 if ( number_of_fields != 2 ) continue;
4418 string fr = row[0].Get<string>();
4419 string to = row[1].Get<string>();
4420 exceptions [fr] = to;
4421 }
4422 }
4423 }
4424
LoadUSAExceptionMap(const TUsaExceptionMap & exceptions)4425 void CCountries::LoadUSAExceptionMap (const TUsaExceptionMap& exceptions) {
4426
4427 // clear previous map
4428 exception_map.clear();
4429
4430 // initialize internal exception map
4431 for ( const auto & itm : exceptions ) {
4432 string fr = itm.first;
4433 string to = itm.second;
4434
4435 // ensure colon is followed by space to match initial correction
4436 string f1, f2;
4437 NStr::SplitInTwo ( fr, ":", f1, f2 );
4438 NStr::TruncateSpacesInPlace ( f1 );
4439 NStr::TruncateSpacesInPlace ( f2 );
4440 if ( ! f1.empty() && ! f2.empty()) {
4441 fr = f1 + ": " + f2;
4442 }
4443
4444 exception_map [fr] = to;
4445 }
4446
4447 exceptions_initialized = true;
4448 }
4449
LoadUSAExceptionMap(const string & exception_file)4450 void CCountries::LoadUSAExceptionMap (const string& exception_file ) {
4451
4452 if ( ! exception_file.empty()) {
4453
4454 TUsaExceptionMap exceptions;
4455 ReadUSAExceptionMap ( exceptions, exception_file );
4456 LoadUSAExceptionMap ( exceptions );
4457 }
4458 }
4459
// Cleans up a USA country string and reports what happened through 'type'.
//
// The input is first run through s_DoUSAStateCleanup.  If an exception table
// has been loaded and holds a non-empty replacement for the cleaned string,
// that replacement is returned and 'type' is recomputed:
//   e_NotUSA    - replacement does not start with "USA"
//   e_Valid     - replacement equals both the cleaned string and the input
//   e_Corrected - anything else
// Otherwise the cleaned string is returned with the type reported by
// s_DoUSAStateCleanup, overridden to e_NotUSA when the result does not start
// with "USA".
string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {

    // call algorithmic mapping function
    string working = country;
    type = s_DoUSAStateCleanup ( working );

    // apply exceptions from preloaded data file
    if ( exceptions_initialized ) {
        // use find() rather than operator[]: operator[] would insert an empty
        // entry into the shared table for every string that has no exception
        const auto exception_iter = exception_map.find ( working );
        if ( exception_iter != exception_map.end() && ! exception_iter->second.empty()) {
            string corrected = exception_iter->second;
            // presence in map here will disambiguate otherwise ambiguous name pair,
            // thus self-entries need to be added to the ambiguous state exception list
            if ( ! NStr::StartsWith ( corrected, "USA" )) {
                type = e_NotUSA;
            } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
                type = e_Valid;
            } else {
                type = e_Corrected;
            }
            return corrected;
        }
    }

    if ( ! NStr::StartsWith ( working, "USA" )) {
        type = e_NotUSA;
    }
    return working;
}
4488
// Overload for callers that only want the cleaned-up string; the
// classification computed by the two-argument version is discarded.
string CCountries::USAStateCleanup ( const string& country )
{
    EStateCleanup ignored_type = e_NoResult;
    return USAStateCleanup ( country, ignored_type );
}
4494
4495 // end of RW-1278
4496
// Attempts to turn 'test' into a well-formed country string.
//
// Steps, in order (each may return early):
//   1. If s_SuppressCountryFix(test) is true, return 'test' unchanged when
//      IsValid accepts it, otherwise an empty string.
//   2. Strip flanking double quotes, flanking spaces and one trailing colon.
//   3. Rewrite "U.S.A.", "United States" or "United States of America" in
//      front of a colon as "USA: ".
//   4. Return the replacement from k_old_country_name_fixes if the whole
//      string is listed there.
//   5. When 'us_territories' is set, prefix Puerto Rico / Guam /
//      American Samoa / Virgin Islands with "USA: " and return the result of
//      USAStateCleanup.
//   6. Return the string (extra colons turned into commas) if IsValid
//      accepts it, or the result of WholeCountryFix if that is non-empty.
//   7. Otherwise look for a known country or subregion name among the tokens
//      of the input and rebuild the string around it.
//
// Returns an empty string when none of the steps produces a result.
string CCountries::NewFixCountry (const string& test, bool us_territories)
{
    // change requested for JIRA:SQD-1410
    if (s_SuppressCountryFix(test)) {
        if (IsValid(test)) {
            return test;
        } else {
            return kEmptyStr;
        }
    }

    // work on a copy with flanking quotes and spaces removed
    string input = test;
    if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
        input = input.substr(1, input.length() - 2);
    }
    NStr::TruncateSpacesInPlace(input);

    // drop a single trailing colon
    if (NStr::EndsWith(input, ":")) {
        input = input.substr(0, input.length() - 1);
        NStr::TruncateSpacesInPlace(input);
    }

    // normalize long-hand spellings of the USA in front of the colon
    string usa1,usa2;
    NStr::SplitInTwo(input, ":", usa1, usa2);
    if (!usa1.empty() && !usa2.empty()) {
        NStr::TruncateSpacesInPlace(usa1);
        NStr::TruncateSpacesInPlace(usa2);
        if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
            input = "USA: " + usa2;
        }
    }

    // whole-string replacements listed in k_old_country_name_fixes win outright
    auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
    if (old_name_fix != k_old_country_name_fixes.end()) {
        input = old_name_fix->second;
        return input;
    }

    // optionally fold US territories under "USA: "
    if (us_territories) {
        if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
            input = "USA: " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
            input = "USA: US " + input;
            CCountries::ChangeExtraColonsToCommas(input);
            input = CCountries::USAStateCleanup(input);
            return input;
        }
    }

    // already valid: only the extra colons need attention
    if (IsValid(input)) {
        CCountries::ChangeExtraColonsToCommas(input);
        return input;
    }
    string new_country = WholeCountryFix(input);
    if (!new_country.empty())
        return new_country;

    // token-based search for a country name, then for a subregion name
    bool too_many_countries = false;
    bool bad_cap = false;
    vector<string> countries = x_Tokenize(input);
    string valid_country;
    string orig_valid_country;

    x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    if (valid_country.empty()) {
        x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
    }

    if (!valid_country.empty() && !too_many_countries)
        too_many_countries = ContainsMultipleCountryNames (input);

    if (!valid_country.empty() && too_many_countries && valid_country == input)
    {
        // the whole input is the matched name: only make sure the colon is
        // followed by a space; new_country stays empty when it already is
        string str1,str2;
        NStr::SplitInTwo(valid_country,":",str1,str2);
        if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
            new_country = str1+": "+str2;

        CCountries::ChangeExtraColonsToCommas(new_country);
    }
    else if(!valid_country.empty() && !too_many_countries)
    {
        // find valid_country in input
        size_t pos = NStr::Find(input,orig_valid_country);
        // save preceeding string without trailing spaces or delimiters ":,"
        string before = input.substr(0,pos);

        x_RemoveDelimitersFromEnds(before);
        NStr::TruncateSpacesInPlace(before);
        // save trailing string without initial spaces or delimiters
        string after = input.substr(pos+orig_valid_country.length());
        x_RemoveDelimitersFromEnds(after, true);
        NStr::TruncateSpacesInPlace(after);
        if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
        else new_country = valid_country;
        // the country name goes first; the surrounding text follows after
        // ": " (or ", " when the matched name already contains a colon)
        if (!before.empty() || !after.empty()) {
            if (NStr::Find(valid_country, ":") == NPOS) {
                new_country += ": ";
            } else {
                new_country += ", ";
            }
        }
        if (!before.empty())
            new_country += before;
        if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
            new_country += ", ";
        if (!after.empty())
            new_country += after;
        CCountries::ChangeExtraColonsToCommas(new_country);
    }

    return new_country;
}
4613
4614
ChangeExtraColonsToCommas(string & country)4615 bool CCountries::ChangeExtraColonsToCommas(string& country)
4616 {
4617 // requested in SQD-4516
4618 bool rval = false;
4619 int count = 0;
4620 for (size_t i = 0; i < country.length(); i++) {
4621 if (country[i] == ':') {
4622 count++;
4623 if (count > 1) {
4624 country[i] = ',';
4625 rval = true;
4626 }
4627 }
4628 }
4629 return rval;
4630 }
4631
4632
// Runs 'input' through NewFixCountry and tidies the text after the first
// colon of the result:
//   - no colon: the fixed string is returned as is;
//   - only commas, colons or whitespace after the colon: everything from the
//     colon onwards is dropped;
//   - otherwise the result is "<country>: <rest>", where <rest> is passed
//     through CapitalizeFirstLetterOfEveryWord when 'capitalize_after_colon'
//     is set.
string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
{
    const string country = NewFixCountry (input);
    const SIZE_TYPE colon_pos = NStr::Find(country, ":");
    if (colon_pos == NPOS) {
        return country;
    }

    // skip the colon and any delimiters or whitespace that follow it; the
    // scan stops at the terminating '\0' at the latest
    SIZE_TYPE rest_pos = colon_pos;
    for (;;) {
        const char ch = country[rest_pos];
        if (ch != ',' && ch != ':' && !isspace((unsigned char)ch)) {
            break;
        }
        ++rest_pos;
    }

    string after = country.substr(rest_pos);
    if (after.empty()) {
        if (rest_pos > colon_pos) {
            return country.substr(0, colon_pos);
        }
        return country;
    }

    NStr::TruncateSpacesInPlace(after, NStr::eTrunc_Begin);
    if (capitalize_after_colon) {
        after = CapitalizeFirstLetterOfEveryWord (after);
    }
    string rebuilt = country.substr(0, colon_pos);
    rebuilt += ": " + after;
    return rebuilt;
}
4660
4661
// SubSource Qual Fixups
// Pair and map types shared by the capitalization tables below; the maps
// are keyed with the PNocase_CStr comparator.
typedef SStaticPair<const char*, const char*> TStaticQualFixPair;
typedef CStaticPairArrayMap<const char*, const char*, PNocase_CStr> TStaticQualFixMap;

// Development-stage values and the spelling that
// CSubSource::FixDevStageCapitalization substitutes for a matching value.
static const TStaticQualFixPair kDevStagePairs[] = {
    { "adult", "adult" },
    { "egg", "egg" },
    { "juvenile", "juvenile" },
    { "larva", "larva" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_DevStagePairs, kDevStagePairs);
4674
4675
// Returns the spelling stored in sc_DevStagePairs for 'value', or 'value'
// itself when the table has no entry for it.
string CSubSource::FixDevStageCapitalization(const string& value)
{
    const auto known = sc_DevStagePairs.find(value.c_str());
    if (known == sc_DevStagePairs.end()) {
        return value;
    }
    return known->second;
}
4686
4687
// Cell-type values and the spelling that
// CSubSource::FixCellTypeCapitalization substitutes for a matching value.
static const TStaticQualFixPair kCellTypePairs[] = {
    { "hemocyte", "hemocyte" },
    { "hepatocyte", "hepatocyte" },
    { "lymphocyte", "lymphocyte" },
    { "neuroblast", "neuroblast" }
};

DEFINE_STATIC_ARRAY_MAP(TStaticQualFixMap, sc_CellTypePairs, kCellTypePairs);
4696
// Returns the spelling stored in sc_CellTypePairs for 'value', or 'value'
// itself when the table has no entry for it.
string CSubSource::FixCellTypeCapitalization(const string& value)
{
    const auto known = sc_CellTypePairs.find(value.c_str());
    if (known == sc_CellTypePairs.end()) {
        return value;
    }
    return known->second;
}
4708
// Mutex held by s_InitializeQualMaps while it fills the map below.
DEFINE_STATIC_FAST_MUTEX(s_QualFixMutex);
// Map from a value to its replacement, ordered with the PNocase comparator.
typedef map<string, string, PNocase> TQualFixMap;

// Isolation-source replacements, loaded by s_InitializeQualMaps.
static TQualFixMap s_IsolationSourceMap;
// Set to true by s_InitializeQualMaps once s_IsolationSourceMap is loaded.
static bool s_QualFixupMapsInitialized = false;
4714
s_ProcessQualMapLine(const CTempString & line,TQualFixMap & qual_map)4715 static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
4716 {
4717 vector<CTempString> tokens;
4718 NStr::Split(line, "\t", tokens);
4719 if (tokens.size() > 1) {
4720 qual_map[tokens[0]] = tokens[1];
4721 }
4722 }
4723
4724
s_AddOneDataFile(const string & file_name,const string & data_name,const char ** built_in,size_t num_built_in,TQualFixMap & qual_map)4725 void s_AddOneDataFile(const string& file_name, const string& data_name,
4726 const char **built_in, size_t num_built_in,
4727 TQualFixMap& qual_map)
4728 {
4729 string file = g_FindDataFile(file_name);
4730 CRef<ILineReader> lr;
4731 if (!file.empty()) {
4732 try {
4733 lr = ILineReader::New(file);
4734 } NCBI_CATCH("s_InitializeQualMaps")
4735 }
4736
4737 if (lr.Empty()) {
4738 if (built_in == NULL) {
4739 ERR_POST(Note << "No data for " + data_name);
4740 } else {
4741 if (getenv("NCBI_DEBUG")) {
4742 ERR_POST(Note << "Falling back on built-in data for " + data_name);
4743 }
4744 for (size_t i = 0; i < num_built_in; i++) {
4745 const char *p = built_in[i];
4746 s_ProcessQualMapLine(p, qual_map);
4747 }
4748 }
4749 } else {
4750 if (getenv("NCBI_DEBUG")) {
4751 ERR_POST(Note << "Reading from " + file + " for " + data_name);
4752 }
4753 do {
4754 s_ProcessQualMapLine(*++*lr, qual_map);
4755 } while (!lr->AtEOF());
4756 }
4757 }
4758
4759 #include "isolation_sources.inc"
4760
s_InitializeQualMaps(void)4761 static void s_InitializeQualMaps(void)
4762 {
4763 CFastMutexGuard GUARD(s_QualFixMutex);
4764 if (s_QualFixupMapsInitialized) {
4765 return;
4766 }
4767
4768 // tissue types
4769 s_AddOneDataFile("isolation_sources.txt", "isolation sources", (const char **)k_isolation_sources, sizeof(k_isolation_sources) / sizeof(char *), s_IsolationSourceMap);
4770 s_QualFixupMapsInitialized = true;
4771 }
4772
4773
4774
4775
4776
// Returns 'value' with its capitalization normalized for an
// isolation-source qualifier.
// A hit in s_IsolationSourceMap is returned directly.  Otherwise the value
// is replaced by the sex-qualifier token it matches case-insensitively (if
// any) and then passed through the host, dev-stage and cell-type fixers, in
// that order.
string CSubSource::FixIsolationSourceCapitalization(const string& value)
{
    s_InitializeQualMaps();

    const auto known = s_IsolationSourceMap.find(value);
    if (known != s_IsolationSourceMap.end()) {
        return known->second;
    }

    string fix = value;
    const size_t num_tokens = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
    for (size_t idx = 0; idx != num_tokens; ++idx) {
        if (NStr::EqualNocase(fix, sm_ValidSexQualifierTokens[idx])) {
            fix = sm_ValidSexQualifierTokens[idx];
            break;
        }
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    return FixCellTypeCapitalization(fix);
}
4802
4803
// Returns 'value' with its capitalization normalized for a tissue-type
// qualifier.  The steps are the same as for isolation sources: a hit in
// s_IsolationSourceMap is returned directly; otherwise the value is replaced
// by the sex-qualifier token it matches case-insensitively (if any) and then
// passed through the host, dev-stage and cell-type fixers, in that order.
string CSubSource::FixTissueTypeCapitalization(const string& value)
{
    s_InitializeQualMaps();

    const auto known = s_IsolationSourceMap.find(value);
    if (known != s_IsolationSourceMap.end()) {
        return known->second;
    }

    string fix = value;
    const size_t num_tokens = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
    for (size_t idx = 0; idx != num_tokens; ++idx) {
        if (NStr::EqualNocase(fix, sm_ValidSexQualifierTokens[idx])) {
            fix = sm_ValidSexQualifierTokens[idx];
            break;
        }
    }

    fix = COrgMod::FixHostCapitalization(fix);
    fix = FixDevStageCapitalization(fix);
    return FixCellTypeCapitalization(fix);
}
4829
4830
// Lab-host values are fixed exactly like host values: this forwards to
// COrgMod::FixHostCapitalization and returns its result.
string CSubSource::FixLabHostCapitalization(const string& value)
{
    return COrgMod::FixHostCapitalization(value);
}
4835
4836
FixCapitalization(TSubtype subtype,const string & value)4837 string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
4838 {
4839 string new_val = value;
4840 switch (subtype) {
4841 case CSubSource::eSubtype_sex:
4842 new_val = FixSexQualifierValue(value);
4843 if (NStr::IsBlank(new_val)) {
4844 new_val = value;
4845 }
4846 break;
4847 case CSubSource::eSubtype_isolation_source:
4848 new_val = FixIsolationSourceCapitalization(value);
4849 break;
4850 case CSubSource::eSubtype_lab_host:
4851 new_val = FixLabHostCapitalization(value);
4852 break;
4853 case CSubSource::eSubtype_tissue_type:
4854 new_val = FixTissueTypeCapitalization(value);
4855 break;
4856 case CSubSource::eSubtype_dev_stage:
4857 new_val = FixDevStageCapitalization(value);
4858 break;
4859 case CSubSource::eSubtype_cell_type:
4860 new_val = FixCellTypeCapitalization(value);
4861 break;
4862 default:
4863 new_val = value;
4864 break;
4865 }
4866 return new_val;
4867 }
4868
4869
FixCapitalization()4870 void CSubSource::FixCapitalization()
4871 {
4872 if (!IsSetSubtype() || !IsSetName()) {
4873 return;
4874 }
4875
4876 TSubtype subtype = GetSubtype();
4877
4878 if (subtype == CSubSource::eSubtype_sex) {
4879 string upr = GetName();
4880 string lwr = upr;
4881 NStr::ToLower(lwr);
4882 if (! NStr::Equal(upr, lwr)) {
4883 SetName(lwr);
4884 }
4885 }
4886
4887 const string& name = GetName();
4888
4889 string new_val = FixCapitalization(subtype, name);
4890
4891 if (!NStr::IsBlank(new_val)) {
4892 SetName(new_val);
4893 }
4894
4895 }
4896
4897
AutoFix(TSubtype subtype,const string & value)4898 string CSubSource::AutoFix(TSubtype subtype, const string& value)
4899 {
4900 string new_val;
4901 switch (subtype) {
4902 case CSubSource::eSubtype_country:
4903 new_val = CCountries::NewFixCountry(value);
4904 break;
4905 case CSubSource::eSubtype_collection_date:
4906 new_val = FixDateFormat(value);
4907 break;
4908 case CSubSource::eSubtype_lat_lon:
4909 new_val = FixLatLonFormat(value);
4910 break;
4911 case CSubSource::eSubtype_sex:
4912 new_val = FixSexQualifierValue(value);
4913 break;
4914 case CSubSource::eSubtype_altitude:
4915 new_val = FixAltitude(value);
4916 break;
4917 default:
4918 break;
4919 }
4920 return new_val;
4921 }
4922
4923
AutoFix()4924 void CSubSource::AutoFix()
4925 {
4926 if (!IsSetSubtype() || !IsSetName()) {
4927 return;
4928 }
4929
4930 TSubtype subtype = GetSubtype();
4931 string new_val = AutoFix(subtype, GetName());
4932
4933 if (!NStr::IsBlank(new_val)) {
4934 SetName(new_val);
4935 } else if (subtype == CSubSource::eSubtype_sex) {
4936 string upr = GetName();
4937 string lwr = upr;
4938 NStr::ToLower(lwr);
4939 if (! NStr::Equal(upr, lwr)) {
4940 SetName(lwr);
4941 }
4942 }
4943 }
4944
4945
4946
// NOTE (for two arrays below): If string A is a prefix of string B, string B should be placed
// BEFORE string A. I.e. longer string should be earlier
//
// NULL-terminated list of culture notes that CSubSource::RemoveCultureNotes
// cuts out of a value wherever they occur.  Matching there is a
// case-insensitive substring search performed in array order, so an entry
// that contains another entry has to come first.
static const char * s_RemovableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]",
    "[BankIt_cultured16S_wizard]",
    "[BankIt_organellerRNA_wizard]",
    "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
    "[BankIt_ITS_wizard]",
    "[uncultured (using universal primers)]",
    "[uncultured (using universal primers) bacterial source]",
    "[cultured bacterial source]",
    "[enrichment culture bacterial source]",
    "[mixed bacterial source (cultured and uncultured)]",
    "[uncultured]; [universal primers]",
    "[mixed bacterial source]",
    "[virus wizard]",
    "[cDNA derived from mRNA, purified viral particles]",
    "[cDNA derived from mRNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, purified viral particles]",
    "[universal primers]",
    "[uncultured; wizard]",
    "[uncultured; wizard; spans unknown]",
    "[cultured; wizard]",
    "[cultured; wizard; spans unknown]",
    "[intergenic wizard]",
    "[intergenic wizard; spans unknown]",
    "[Microsatellite wizard]",
    "[Microsatellite wizard; multiple repeats]",
    "[D-loop wizard]",
    "[D-loop wizard; spans unknown]",
    "[D-loop wizard; spans known]",
    NULL
};
4983
// NULL-terminated list of culture notes that are compared against the whole
// value (case-insensitively).  CSubSource::RemoveCultureNotes replaces a
// matching value with "amplified with species-specific primers" when called
// at species level; CSubSource::HasCultureNotes reports a match as a
// culture note.
static const char * s_ReplaceableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]",
    "[uncultured (with species-specific primers)]",
    "[uncultured]; [amplified with species-specific primers]",
    "[uncultured (using species-specific primers) bacterial source]",
    "[amplified with species-specific primers]",
    NULL
};
4994
4995
HasCultureNotes(const string & value)4996 bool CSubSource::HasCultureNotes(const string& value)
4997 {
4998 for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
4999 size_t pos = NStr::FindNoCase(value, s_RemovableCultureNotes[i]);
5000 if (pos != string::npos) {
5001 return true;
5002 }
5003 }
5004 for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5005 if (NStr::EqualNocase(value, s_ReplaceableCultureNotes[i])) {
5006 return true;
5007 }
5008 }
5009 return false;
5010 }
5011
5012
RemoveCultureNotes(string & value,bool is_species_level)5013 void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
5014 {
5015 if (NStr::IsBlank(value)) {
5016 return;
5017 }
5018
5019 for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5020 string to_remove = s_RemovableCultureNotes[i];
5021 size_t remove_len = to_remove.length();
5022 size_t pos = NStr::FindNoCase(value, to_remove);
5023 while (pos != NPOS) {
5024 size_t extra_len = strspn (value.c_str() + pos + remove_len, " ;");
5025 value = value.substr(0, pos) + value.substr(pos + remove_len + extra_len);
5026 pos = NStr::FindNoCase(value, to_remove);
5027 }
5028 }
5029 // remove leading/trailing semicolons
5030 while (NStr::StartsWith(value, " ") || NStr::StartsWith(value, ";")) {
5031 value = value.substr(1);
5032 }
5033 while (NStr::EndsWith(value, " ") || NStr::EndsWith(value, ";")) {
5034 value = value.substr(0, value.length() - 1);
5035 }
5036
5037 if (is_species_level) {
5038 for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5039 if (NStr::EqualNocase(value, s_ReplaceableCultureNotes[i])) {
5040 value = "amplified with species-specific primers";
5041 break;
5042 }
5043 }
5044 }
5045 }
5046
5047
RemoveCultureNotes(bool is_species_level)5048 void CSubSource::RemoveCultureNotes (bool is_species_level)
5049 {
5050 if (IsSetName()) {
5051 RemoveCultureNotes(SetName(), is_species_level);
5052 if (NStr::IsBlank(GetName())) {
5053 ResetName();
5054 }
5055 }
5056 }
5057
5058
5059 // CCountryLine
CCountryLine(const string & country_name,double y,double min_x,double max_x,double scale)5060 CCountryLine::CCountryLine
5061 (const string & country_name, double y, double min_x, double max_x, double scale)
5062 : m_CountryName(country_name) ,
5063 m_Scale (scale)
5064 {
5065 m_Y = x_ConvertLat(y);
5066 m_MinX = x_ConvertLon(min_x);
5067 m_MaxX = x_ConvertLon(max_x);
5068
5069 }
5070
5071
~CCountryLine(void)5072 CCountryLine::~CCountryLine (void)
5073 {
5074 }
5075
5076
// Small positive bias added to a scaled coordinate (or scaled range) right
// before it is truncated to int by the conversions in this file.
// NOTE(review): presumably guards against values that land just below a
// whole number because of floating-point representation -- confirm.
#define EPSILON 0.001
5078
ConvertLat(double y,double scale)5079 int CCountryLine::ConvertLat (double y, double scale)
5080 {
5081
5082 int val = 0;
5083
5084 if (y < -90.0) {
5085 y = -90.0;
5086 }
5087 if (y > 90.0) {
5088 y = 90.0;
5089 }
5090
5091 if (y > 0) {
5092 val = (int) (y * scale + EPSILON);
5093 } else {
5094 val = (int) (-(-y * scale + EPSILON));
5095 }
5096
5097 return val;
5098 }
5099
5100
x_ConvertLat(double y)5101 int CCountryLine::x_ConvertLat (double y)
5102 {
5103 return ConvertLat(y, m_Scale);
5104 }
5105
ConvertLon(double x,double scale)5106 int CCountryLine::ConvertLon (double x, double scale)
5107 {
5108
5109 int val = 0;
5110
5111 if (x < -180.0) {
5112 x = -180.0;
5113 }
5114 if (x > 180.0) {
5115 x = 180.0;
5116 }
5117
5118 if (x > 0) {
5119 val = (int) (x * scale + EPSILON);
5120 } else {
5121 val = (int) (-(-x * scale + EPSILON));
5122 }
5123
5124 return val;
5125 }
5126
5127
x_ConvertLon(double x)5128 int CCountryLine::x_ConvertLon (double x)
5129 {
5130 return ConvertLon(x, m_Scale);
5131 }
5132
5133
CCountryExtreme(const string & country_name,int min_x,int min_y,int max_x,int max_y)5134 CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
5135 : m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
5136 {
5137 m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);
5138 size_t pos = NStr::Find(country_name, ":");
5139 if (pos == NPOS) {
5140 m_Level0 = country_name;
5141 m_Level1.clear();
5142 } else {
5143 m_Level0 = country_name.substr(0, pos);
5144 NStr::TruncateSpacesInPlace(m_Level0);
5145 m_Level1 = country_name.substr(pos + 1);
5146 NStr::TruncateSpacesInPlace(m_Level1);
5147 }
5148
5149 }
5150
5151
~CCountryExtreme(void)5152 CCountryExtreme::~CCountryExtreme (void)
5153 {
5154
5155 }
5156
5157
SetMinX(int min_x)5158 bool CCountryExtreme::SetMinX(int min_x)
5159 {
5160 if (min_x < m_MinX) {
5161 m_MinX = min_x;
5162 return true;
5163 } else {
5164 return false;
5165 }
5166 }
5167
5168
SetMaxX(int max_x)5169 bool CCountryExtreme::SetMaxX(int max_x)
5170 {
5171 if (max_x > m_MaxX) {
5172 m_MaxX = max_x;
5173 return true;
5174 } else {
5175 return false;
5176 }
5177 }
5178
5179
SetMinY(int min_y)5180 bool CCountryExtreme::SetMinY(int min_y)
5181 {
5182 if (min_y < m_MinY) {
5183 m_MinY = min_y;
5184 return true;
5185 } else {
5186 return false;
5187 }
5188 }
5189
5190
SetMaxY(int max_y)5191 bool CCountryExtreme::SetMaxY(int max_y)
5192 {
5193 if (max_y > m_MaxY) {
5194 m_MaxY = max_y;
5195 return true;
5196 } else {
5197 return false;
5198 }
5199 }
5200
5201
AddLine(const CCountryLine * line)5202 void CCountryExtreme::AddLine(const CCountryLine *line)
5203 {
5204 if (line) {
5205 SetMinX(line->GetMinX());
5206 SetMaxX(line->GetMaxX());
5207 SetMinY(line->GetY());
5208 SetMaxY(line->GetY());
5209 m_Area += 1 + line->GetMaxX() - line->GetMinX();
5210 }
5211 }
5212
5213
DoesOverlap(const CCountryExtreme * other_block) const5214 bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
5215 {
5216 if (!other_block) {
5217 return false;
5218 } else if (m_MaxX >= other_block->GetMinX()
5219 && m_MaxX <= other_block->GetMaxX()
5220 && m_MaxY >= other_block->GetMinY()
5221 && m_MinY <= other_block->GetMaxY()) {
5222 return true;
5223 } else if (other_block->GetMaxX() >= m_MinX
5224 && other_block->GetMaxX() <= m_MaxX
5225 && other_block->GetMaxY() >= m_MinY
5226 && other_block->GetMinY() <= m_MaxY) {
5227 return true;
5228 } else {
5229 return false;
5230 }
5231 }
5232
5233
PreferTo(const CCountryExtreme * other_block,const string country,const string province,const bool prefer_new) const5234 bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
5235 {
5236 if (!other_block) {
5237 return true;
5238 }
5239
5240 // if no preferred country, these are equal
5241 if (NStr::IsBlank(country)) {
5242 return prefer_new;
5243 }
5244
5245 // if match to preferred country
5246 if (NStr::EqualNocase(country, m_Level0)) {
5247 // if best was not preferred country, take new match
5248 if (!NStr::EqualNocase(country, other_block->GetLevel0())) {
5249 return true;
5250 }
5251 // if match to preferred province
5252 if (!NStr::IsBlank(province) && NStr::EqualNocase(province, m_Level1)) {
5253 // if best was not preferred province, take new match
5254 if (!NStr::EqualNocase(province, other_block->GetLevel1())) {
5255 return true;
5256 }
5257 }
5258
5259 // if both match province, or neither does, or no preferred province, take smallest
5260 return prefer_new;
5261 }
5262
5263 // if best matches preferred country, keep
5264 if (NStr::EqualNocase(country, other_block->GetLevel0())) {
5265 return false;
5266 }
5267
5268 // otherwise take smallest
5269 return prefer_new;
5270 }
5271
5272
/// Create an identification record for one latitude/longitude pair.
///
/// The land, water and claimed distances all start at -1.
/// NOTE(review): -1 presumably means "not computed yet" -- confirm against
/// the code that fills these fields.
///
/// @param lat  Latitude stored in m_Lat.
/// @param lon  Longitude stored in m_Lon.
CLatLonCountryId::CLatLonCountryId(float lat, float lon)
: m_Lat(lat),
m_Lon(lon),
m_LandDistance(-1),
m_WaterDistance(-1),
m_ClaimedDistance(-1)
{}
5280
5281
/// Compare a claimed country/province with this record's guessed and
/// closest regions and report how they relate.
///
/// All comparisons are case-insensitive. As a side effect, when the claimed
/// country equals the closest country and there is neither a guessed country
/// nor a guessed water body, the closest country (and, if it also matches,
/// the closest province) is copied into the guess fields.
///
/// @param country   Claimed country (or body of water) name.
/// @param province  Claimed province name; may be empty.
/// @return Bitwise OR of the f*Match / f*Closest flags that apply; 0 when
///         nothing matches.
CLatLonCountryId::TClassificationFlags CLatLonCountryId::Classify(string country, string province)
{
    CLatLonCountryId::TClassificationFlags rval = 0;

    // compare guesses or closest regions to indicated country and province
    if (!NStr::IsBlank(GetGuessCountry())) {
        // if top level countries match
        if (NStr::EqualNocase(country, GetGuessCountry())) {
            rval |= CLatLonCountryId::fCountryMatch;
            // if both are empty, still call it a match
            if (NStr::EqualNocase(province, GetGuessProvince())) {
                rval |= CLatLonCountryId::fProvinceMatch;
            }
        }
        // if they don't match, are they closest?
        if (!(rval & CLatLonCountryId::fCountryMatch)) {
            if (NStr::EqualNocase(country, GetClosestCountry())) {
                rval |= CLatLonCountryId::fCountryClosest;
                if (NStr::EqualNocase(province, GetClosestProvince())) {
                    rval |= CLatLonCountryId::fProvinceClosest;
                }
            }
        } else if (!(rval & CLatLonCountryId::fProvinceMatch) && !NStr::IsBlank(province)) {
            // country matched the guess but the province did not:
            // check the claimed province against the closest province
            if (NStr::EqualNocase (province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceClosest;
            }
        }
    }

    if (!NStr::IsBlank(GetGuessWater())) {
        // was the non-approved body of water correctly indicated?
        if (NStr::EqualNocase(country, GetGuessWater())) {
            rval |= CLatLonCountryId::fWaterMatch;
        } else if (NStr::EqualNocase(country, GetClosestWater())) {
            rval |= CLatLonCountryId::fWaterClosest;
        }
    }

    // claimed country equals the closest country
    if (!NStr::IsBlank(GetClosestCountry()) && NStr::EqualNocase(country, GetClosestCountry())) {
        if (NStr::IsBlank(GetGuessCountry()) && NStr::IsBlank(GetGuessWater())) {
            // no guess at all: promote the closest region to be the guess
            rval |= CLatLonCountryId::fCountryMatch;
            SetGuessCountry(GetClosestCountry());
            SetFullGuess(GetClosestCountry());
            if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceMatch;
                SetGuessProvince(GetClosestProvince());
                SetFullGuess(GetClosestFull());
            }
        } else {
            // a guess exists, so this only counts as "closest"
            rval |= CLatLonCountryId::fCountryClosest;
            if (!NStr::IsBlank(GetClosestProvince()) && NStr::EqualNocase(province, GetClosestProvince())) {
                rval |= CLatLonCountryId::fProvinceClosest;
            }
        }
    }
    return rval;
}
5339
5340
~CLatLonCountryId(void)5341 CLatLonCountryId::~CLatLonCountryId(void)
5342 {
5343 }
5344
5345
// Built-in fallback data, used by the CLatLonCountryMap constructor when no
// data file can be read.
// NOTE(review): s_DefaultLatLonCountryText / s_DefaultLatLonWaterText are
// presumably defined by the included .inc files -- confirm.
#include "lat_lon_country.inc"
// Number of entries in the built-in country table.
static const size_t k_NumLatLonCountryText = ArraySize(s_DefaultLatLonCountryText);

#include "lat_lon_water.inc"
// Number of entries in the built-in water table.
static const size_t k_NumLatLonWaterText = ArraySize(s_DefaultLatLonWaterText);
5351
x_InitFromDefaultList(const char * const * list,int num)5352 void CLatLonCountryMap::x_InitFromDefaultList(const char * const *list, int num)
5353 {
5354 if (getenv("NCBI_DEBUG")) {
5355 ERR_POST(Note << "Falling back on built-in data for latlon / water data.");
5356 }
5357 // initialize list of country lines
5358 m_CountryLineList.clear();
5359 m_Scale = 20.0;
5360 string current_country;
5361
5362 for (int i = 0; i < num; i++) {
5363 CTempString line = list[i];
5364 if (line[0] == '-') {
5365 // skip comment
5366 } else if (isalpha ((unsigned char)line[0])) {
5367 current_country = line;
5368 } else if (isdigit ((unsigned char)line[0])) {
5369 m_Scale = NStr::StringToDouble(line);
5370 } else {
5371 vector<string> tokens;
5372 NStr::Split(line, "\t", tokens);
5373 if (tokens.size() > 3) {
5374 double x = NStr::StringToDouble(tokens[1]);
5375 for (size_t j = 2; j < tokens.size() - 1; j+=2) {
5376 m_CountryLineList.push_back(new CCountryLine(current_country, x, NStr::StringToDouble(tokens[j]), NStr::StringToDouble(tokens[j + 1]), m_Scale));
5377 }
5378 }
5379 }
5380 }
5381 }
5382
5383
5384
5385
x_InitFromFile(const string & filename)5386 bool CLatLonCountryMap::x_InitFromFile(const string& filename)
5387 {
5388 string fname = g_FindDataFile (filename);
5389 if (NStr::IsBlank (fname)) {
5390 return false;
5391 }
5392 if (getenv("NCBI_DEBUG")) {
5393 ERR_POST(Note << "Reading from " + filename + " for latlon/water data.");
5394 }
5395 CRef<ILineReader> lr = ILineReader::New (fname);
5396 if (lr.Empty()) {
5397 return false;
5398 } else {
5399 m_Scale = 20.0;
5400 string current_country;
5401
5402 // make sure to clear before using. in this outer
5403 // scope in the interest of speed (avoid repeated
5404 // construction/destruction)
5405 vector<SIZE_TYPE> tab_positions;
5406
5407 do {
5408 // const string& line = *++*lr;
5409 CTempString line = *++*lr;
5410 if (line[0] == '-') {
5411 // skip comment
5412 } else if (isalpha ((unsigned char)line[0])) {
5413 current_country = line;
5414 } else if (isdigit ((unsigned char)line[0])) {
5415 m_Scale = NStr::StringToDouble(line);
5416 } else {
5417 // NStr::Tokenize would be much simpler, but
5418 // it's just too slow in this case, especially
5419 // in debug mode.
5420
5421 // for the future, if we need even more speed,
5422 // it should be possible to eliminate the tab_positions
5423 // vector and collect tab positions on the fly without
5424 // any heap-allocated memory
5425
5426 // find position of all tabs on this line
5427 tab_positions.clear();
5428 SIZE_TYPE tab_pos = line.find('\t');
5429 while( tab_pos != NPOS ) {
5430 tab_positions.push_back(tab_pos);
5431 tab_pos = line.find('\t', tab_pos+1);
5432 }
5433 // an imaginary sentinel tab
5434 tab_positions.push_back(line.length());
5435
5436 const char * line_start = line.data();
5437 if( tab_positions.size() >= 4 ) {
5438 CTempString y_str( line_start + tab_positions[0]+1, tab_positions[1] - tab_positions[0] - 1 );
5439 double y = NStr::StringToDouble( y_str );
5440
5441 // convert into line list
5442 for (size_t j = 1; j < tab_positions.size() - 2; j+=2) {
5443 const SIZE_TYPE pos1 = tab_positions[j];
5444 const SIZE_TYPE pos2 = tab_positions[j+1];
5445 const SIZE_TYPE pos3 = tab_positions[j+2];
5446 CTempString first_num( line_start + pos1 + 1, pos2 - pos1 - 1 );
5447 CTempString second_num( line_start + pos2 + 1, pos3 - pos2 - 1 );
5448 m_CountryLineList.push_back(new CCountryLine(current_country, y, NStr::StringToDouble(first_num), NStr::StringToDouble(second_num), m_Scale));
5449 }
5450 }
5451 }
5452 } while ( !lr->AtEOF() );
5453
5454 return true;
5455 }
5456 }
5457
5458 bool
s_CompareTwoLinesByLatLonOnly(const CCountryLine * line1,const CCountryLine * line2)5459 CLatLonCountryMap::s_CompareTwoLinesByLatLonOnly(
5460 const CCountryLine* line1,
5461 const CCountryLine* line2)
5462 {
5463 if (line1->GetY() < line2->GetY()) {
5464 return true;
5465 } else if (line1->GetY() > line2->GetY()) {
5466 return false;
5467 } else {
5468 if (line1->GetMinX() < line2->GetMinX()) {
5469 return true;
5470 } else {
5471 return false;
5472 }
5473 }
5474 }
5475
5476 bool CLatLonCountryMap::
s_CompareTwoLinesByCountry(const CCountryLine * line1,const CCountryLine * line2)5477 s_CompareTwoLinesByCountry(const CCountryLine* line1,
5478 const CCountryLine* line2)
5479 {
5480 int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5481 if (cmp == 0) {
5482 return s_CompareTwoLinesByLatLonOnly(line1, line2);
5483 } else if (cmp < 0) {
5484 return true;
5485 } else {
5486 return false;
5487 }
5488 }
5489
5490
5491 bool CLatLonCountryMap::
s_CompareTwoLinesByLatLonThenCountry(const CCountryLine * line1,const CCountryLine * line2)5492 s_CompareTwoLinesByLatLonThenCountry(const CCountryLine* line1,
5493 const CCountryLine* line2)
5494 {
5495 if (line1->GetY() < line2->GetY()) {
5496 return true;
5497 } else if (line1->GetY() > line2->GetY()) {
5498 return false;
5499 } if (line1->GetMinX() < line2->GetMinX()) {
5500 return true;
5501 } else if (line1->GetMinX() > line2->GetMinX()) {
5502 return false;
5503 } else if (line1->GetMaxX() < line2->GetMaxX()) {
5504 return true;
5505 } else if (line1->GetMaxX() > line2->GetMaxX()) {
5506 return false;
5507 } else {
5508 int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5509 if (cmp < 0) {
5510 return true;
5511 } else {
5512 return false;
5513 }
5514 }
5515 }
5516
5517
CLatLonCountryMap(bool is_water)5518 CLatLonCountryMap::CLatLonCountryMap (bool is_water)
5519 {
5520 // initialize list of country lines
5521 m_CountryLineList.clear();
5522
5523 const char* env_val = getenv("NCBI_LAT_LON_DATA_PATH");
5524 string data_path;
5525 if (env_val) {
5526 data_path = (string) env_val;
5527 if (! NStr::EndsWith(data_path, "/")) {
5528 data_path = data_path + "/";
5529 }
5530 }
5531
5532 if (is_water) {
5533 if (!x_InitFromFile("lat_lon_water.txt")) {
5534 if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_water.txt")) {
5535 x_InitFromDefaultList(s_DefaultLatLonWaterText, k_NumLatLonWaterText);
5536 }
5537 }
5538 } else {
5539 if (!x_InitFromFile("lat_lon_country.txt")) {
5540 if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_country.txt")) {
5541 x_InitFromDefaultList(s_DefaultLatLonCountryText, k_NumLatLonCountryText);
5542 }
5543 }
5544 }
5545
5546 // Instead of doing a plain sort, we take advantage of the fact that
5547 // there are few unique country names versus the number
5548 // of lines.
5549 typedef map<CTempString, TCountryLineList, PNocase> TCountryToLinesMap;
5550 // this map maps a country name (case insens) to all the lines that
5551 // belong to that country.
5552 TCountryToLinesMap countryToLinesMap;
5553 ITERATE(TCountryLineList, line_it, m_CountryLineList) {
5554 countryToLinesMap[(*line_it)->GetCountry()].push_back(*line_it);
5555 }
5556
5557 // build new m_CountryLineList here:
5558 TCountryLineList new_country_line_list;
5559 NON_CONST_ITERATE(TCountryToLinesMap, country_lines_it, countryToLinesMap)
5560 {
5561 // sort the lines for each country by lat/lon only, since we've already
5562 // implicitly sorted by country in countryToLinesMap
5563 TCountryLineList & line_list_for_this_country =
5564 country_lines_it->second;
5565 stable_sort(
5566 BEGIN_COMMA_END(line_list_for_this_country),
5567 s_CompareTwoLinesByLatLonOnly);
5568 copy(BEGIN_COMMA_END(line_list_for_this_country),
5569 back_inserter(new_country_line_list));
5570 }
5571 // swap should be constant time
5572 m_CountryLineList.swap(new_country_line_list);
5573
5574 // set up extremes index and copy into LatLon index
5575 m_CountryExtremes.clear();
5576 m_LatLonSortedList.clear();
5577 size_t i, ext = 0;
5578
5579 for (i = 0; i < m_CountryLineList.size(); i++) {
5580 if (ext > 0 && NStr::Equal(m_CountryLineList[i]->GetCountry(), m_CountryExtremes[ext - 1]->GetCountry())) {
5581 m_CountryExtremes[ext - 1]->AddLine(m_CountryLineList[i]);
5582 } else {
5583 m_CountryExtremes.push_back(new CCountryExtreme(m_CountryLineList[i]->GetCountry(),
5584 m_CountryLineList[i]->GetMinX(),
5585 m_CountryLineList[i]->GetY(),
5586 m_CountryLineList[i]->GetMaxX(),
5587 m_CountryLineList[i]->GetY()));
5588 ext++;
5589 }
5590 m_LatLonSortedList.push_back(m_CountryLineList[i]);
5591 m_CountryLineList[i]->SetBlock(m_CountryExtremes[ext - 1]);
5592 }
5593 sort (m_LatLonSortedList.begin(), m_LatLonSortedList.end(), s_CompareTwoLinesByLatLonThenCountry);
5594
5595 }
5596
5597
~CLatLonCountryMap(void)5598 CLatLonCountryMap::~CLatLonCountryMap (void)
5599 {
5600 size_t i;
5601
5602 for (i = 0; i < m_CountryLineList.size(); i++) {
5603 delete (m_CountryLineList[i]);
5604 }
5605 m_CountryLineList.clear();
5606
5607 for (i = 0; i < m_CountryExtremes.size(); i++) {
5608 delete (m_CountryExtremes[i]);
5609 }
5610 m_CountryExtremes.clear();
5611 // note - do not delete items in m_LatLonSortedList, they are pointing to the same objects as m_CountryLineList
5612 m_LatLonSortedList.clear();
5613 }
5614
5615
IsCountryInLatLon(const string & country,double lat,double lon)5616 bool CLatLonCountryMap::IsCountryInLatLon(const string& country, double lat,
5617 double lon)
5618 {
5619 int x = CCountryLine::ConvertLon(lon, m_Scale);
5620 int y = CCountryLine::ConvertLat(lat, m_Scale);
5621
5622 size_t L, R, mid;
5623
5624 L = 0;
5625 R = m_CountryLineList.size() - 1;
5626 mid = 0;
5627
5628 while (L < R) {
5629 mid = (L + R) / 2;
5630 int cmp = NStr::Compare(m_CountryLineList[mid]->GetCountry(), country);
5631 if (cmp < 0) {
5632 L = mid + 1;
5633 } else if (cmp > 0) {
5634 R = mid;
5635 } else {
5636 while (mid > 0
5637 && NStr::Compare(m_CountryLineList[mid - 1]->GetCountry(), country) == 0
5638 && m_CountryLineList[mid - 1]->GetY() >= y) {
5639 mid--;
5640 }
5641 L = mid;
5642 R = mid;
5643 }
5644 }
5645
5646 while (R < m_CountryLineList.size()
5647 && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5648 && m_CountryLineList[R]->GetY() < y) {
5649 R++;
5650 }
5651
5652 while (R < m_CountryLineList.size()
5653 && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5654 && m_CountryLineList[R]->GetY() == y
5655 && m_CountryLineList[R]->GetMaxX() < x) {
5656 R++;
5657 }
5658 if (R < m_CountryLineList.size()
5659 && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5660 && m_CountryLineList[R]->GetY() == y
5661 && m_CountryLineList[R]->GetMinX() <= x
5662 && m_CountryLineList[R]->GetMaxX() >= x) {
5663 return true;
5664 } else {
5665 return false;
5666 }
5667 }
5668
5669
5670 const CCountryExtreme *
x_FindCountryExtreme(const string & country)5671 CLatLonCountryMap::x_FindCountryExtreme(const string& country)
5672 {
5673 size_t L, R, mid;
5674
5675 if (NStr::IsBlank (country)) return NULL;
5676
5677 L = 0;
5678 R = m_CountryExtremes.size() - 1;
5679
5680 while (L < R) {
5681 mid = (L + R) / 2;
5682 if (NStr::CompareNocase(m_CountryExtremes[mid]->GetCountry(), country) < 0) {
5683 L = mid + 1;
5684 } else {
5685 R = mid;
5686 }
5687 }
5688 if (!NStr::EqualNocase(m_CountryExtremes[R]->GetCountry(), country)) {
5689 return NULL;
5690 } else {
5691 return m_CountryExtremes[R];
5692 }
5693 }
5694
5695
HaveLatLonForRegion(const string & region)5696 bool CLatLonCountryMap::HaveLatLonForRegion(const string& region)
5697 {
5698 if (x_FindCountryExtreme(region) == NULL) {
5699 return false;
5700 } else {
5701 return true;
5702 }
5703 }
5704
5705
x_GetLatStartIndex(int y)5706 size_t CLatLonCountryMap::x_GetLatStartIndex (int y)
5707 {
5708 size_t L, R, mid;
5709
5710 L = 0;
5711 R = m_LatLonSortedList.size() - 1;
5712 mid = 0;
5713
5714 while (L < R) {
5715 mid = (L + R) / 2;
5716 if (m_LatLonSortedList[mid]->GetY() < y) {
5717 L = mid + 1;
5718 } else if (m_LatLonSortedList[mid]->GetY() > y) {
5719 R = mid;
5720 } else {
5721 while (mid > 0 && m_LatLonSortedList[mid - 1]->GetY() == y) {
5722 mid--;
5723 }
5724 L = mid;
5725 R = mid;
5726 }
5727 }
5728 return R;
5729 }
5730
5731
5732 const CCountryExtreme *
GuessRegionForLatLon(double lat,double lon,const string & country,const string & province)5733 CLatLonCountryMap::GuessRegionForLatLon(double lat, double lon,
5734 const string& country,
5735 const string& province)
5736 {
5737 int x = CCountryLine::ConvertLon(lon, m_Scale);
5738 int y = CCountryLine::ConvertLon(lat, m_Scale);
5739
5740 size_t R = x_GetLatStartIndex(y);
5741
5742 const CCountryExtreme *best = NULL;
5743
5744 while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() == y) {
5745 if (m_LatLonSortedList[R]->GetMinX() <= x
5746 && m_LatLonSortedList[R]->GetMaxX() >= x) {
5747 const CCountryExtreme *other = m_LatLonSortedList[R]->GetBlock();
5748 if (best == NULL) {
5749 best = other;
5750 } else if (!best->PreferTo(other, country, province, (bool)(best->GetArea() <= other->GetArea()))) {
5751 best = other;
5752 }
5753 }
5754 R++;
5755 }
5756 return best;
5757 }
5758
5759
5760 //Distance on a spherical surface calculation adapted from
5761 //http://www.linuxjournal.com/magazine/
5762 //work-shell-calculating-distance-between-two-latitudelongitude-points
5763
#define EARTH_RADIUS 6371.0 /* average radius of non-spherical earth in kilometers */
#define CONST_PI 3.14159265359

/// Convert an angle from degrees to radians.
///
/// @param degrees  Angle in degrees.
/// @return The same angle in radians.
static double DegreesToRadians (double degrees)
{
    const double radians_per_degree = CONST_PI / 180.0;
    return degrees * radians_per_degree;
}
5774
DistanceOnGlobe(double latA,double lonA,double latB,double lonB)5775 static double DistanceOnGlobe (
5776 double latA,
5777 double lonA,
5778 double latB,
5779 double lonB
5780 )
5781
5782 {
5783 double lat1, lon1, lat2, lon2;
5784 double dLat, dLon, a, c;
5785
5786 lat1 = DegreesToRadians (latA);
5787 lon1 = DegreesToRadians (lonA);
5788 lat2 = DegreesToRadians (latB);
5789 lon2 = DegreesToRadians (lonB);
5790
5791 dLat = lat2 - lat1;
5792 dLon = lon2 - lon1;
5793
5794 a = sin (dLat / 2) * sin (dLat / 2) +
5795 cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);
5796 c = 2 * atan2 (sqrt (a), sqrt (1 - a));
5797
5798 return (double) (EARTH_RADIUS * c);
5799 }
5800
5801
ErrorDistance(double latA,double lonA,double scale)5802 double ErrorDistance (
5803 double latA,
5804 double lonA,
5805 double scale)
5806 {
5807 double lat1, lon1, lat2, lon2;
5808 double dLat, dLon, a, c;
5809
5810 lat1 = DegreesToRadians (latA);
5811 lon1 = DegreesToRadians (lonA);
5812 lat2 = DegreesToRadians (latA + (1.0 / scale));
5813 lon2 = DegreesToRadians (lonA + (1.0 / scale));
5814
5815 dLat = lat2 - lat1;
5816 dLon = lon2 - lon1;
5817
5818 a = sin (dLat / 2) * sin (dLat / 2) +
5819 cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);
5820 c = 2 * atan2 (sqrt (a), sqrt (1 - a));
5821
5822 return (double) (EARTH_RADIUS * c);
5823
5824 }
5825
5826
FindClosestToLatLon(double lat,double lon,double range,double & distance)5827 const CCountryExtreme * CLatLonCountryMap::FindClosestToLatLon(double lat,
5828 double lon,
5829 double range,
5830 double &distance)
5831 {
5832 int x = CCountryLine::ConvertLon(lon, m_Scale);
5833 int y = CCountryLine::ConvertLon(lat, m_Scale);
5834
5835 int maxDelta = (int) (range * m_Scale + EPSILON);
5836 int min_y = y - maxDelta;
5837 int max_y = y + maxDelta;
5838 int min_x = x - maxDelta;
5839 int max_x = x + maxDelta;
5840
5841 // binary search to lowest lat
5842 size_t R = x_GetLatStartIndex(min_y);
5843
5844 double closest = 0.0;
5845 CCountryExtreme *rval = NULL;
5846
5847 while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5848 if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5849 // out of range, don't bother calculating distance
5850 } else {
5851 double end;
5852 if (x < m_LatLonSortedList[R]->GetMinX()) {
5853 end = m_LatLonSortedList[R]->GetMinLon();
5854 } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
5855 end = m_LatLonSortedList[R]->GetMaxLon();
5856 } else {
5857 end = lon;
5858 }
5859 double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5860 if (rval == NULL || closest > dist
5861 || (closest == dist
5862 && (rval->GetArea() > m_LatLonSortedList[R]->GetBlock()->GetArea()
5863 || (rval->GetArea() == m_LatLonSortedList[R]->GetBlock()->GetArea()
5864 && NStr::IsBlank(rval->GetLevel1())
5865 && !NStr::IsBlank(m_LatLonSortedList[R]->GetBlock()->GetLevel1()))))) {
5866 rval = m_LatLonSortedList[R]->GetBlock();
5867 closest = dist;
5868 }
5869 }
5870 R++;
5871 }
5872 distance = closest;
5873 return rval;
5874 }
5875
5876
IsClosestToLatLon(const string & comp_country,double lat,double lon,double range,double & distance)5877 bool CLatLonCountryMap::IsClosestToLatLon(const string& comp_country,
5878 double lat, double lon,
5879 double range, double &distance)
5880 {
5881 int x = CCountryLine::ConvertLon(lon, m_Scale);
5882 int y = CCountryLine::ConvertLon(lat, m_Scale);
5883
5884 int maxDelta = (int) (range * m_Scale + EPSILON);
5885 int min_y = y - maxDelta;
5886 int max_y = y + maxDelta;
5887 int min_x = x - maxDelta;
5888 int max_x = x + maxDelta;
5889
5890 // binary search to lowest lat
5891 size_t R = x_GetLatStartIndex(min_y);
5892
5893 string country;
5894 double closest = 0.0;
5895 int smallest_area = -1;
5896
5897 while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5898 if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5899 // out of range, don't bother calculating distance
5900 } else {
5901 double end;
5902 if (x < m_LatLonSortedList[R]->GetMinX()) {
5903 end = m_LatLonSortedList[R]->GetMinLon();
5904 } else {
5905 end = m_LatLonSortedList[R]->GetMaxLon();
5906 }
5907 double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5908 if (NStr::IsBlank (country) || closest > dist) {
5909 country = m_LatLonSortedList[R]->GetCountry();
5910 closest = dist;
5911 const CCountryExtreme * ext = x_FindCountryExtreme(country);
5912 if (ext) {
5913 smallest_area = ext->GetArea();
5914 }
5915 } else if (closest == dist) {
5916 // if the distances are the same, prefer the input country, otherwise prefer the smaller region
5917 if (NStr::Equal(country, comp_country)) {
5918 // keep country we're searching for
5919 } else if (!NStr::Equal(m_LatLonSortedList[R]->GetCountry(), country)) {
5920 const CCountryExtreme * ext = x_FindCountryExtreme(m_LatLonSortedList[R]->GetCountry());
5921 if (ext
5922 && (ext->GetArea() < smallest_area
5923 || NStr::Equal(m_LatLonSortedList[R]->GetCountry(), comp_country))) {
5924 country = m_LatLonSortedList[R]->GetCountry();
5925 smallest_area = ext->GetArea();
5926 }
5927 }
5928 }
5929 }
5930 R++;
5931 }
5932 distance = closest;
5933 return NStr::Equal(country, comp_country);
5934 }
5935
5936
IsNearLatLon(double lat,double lon,double range,double & distance,const string & country,const string & province)5937 const CCountryExtreme * CLatLonCountryMap::IsNearLatLon(double lat, double lon,
5938 double range,
5939 double &distance,
5940 const string& country,
5941 const string& province)
5942 {
5943 int x = CCountryLine::ConvertLon(lon, m_Scale);
5944 int y = CCountryLine::ConvertLat(lat, m_Scale);
5945 double closest = -1.0;
5946 int maxDelta = (int) (range * m_Scale + EPSILON);
5947 int min_y = y - maxDelta;
5948 int max_y = y + maxDelta;
5949 int min_x = x - maxDelta;
5950 int max_x = x + maxDelta;
5951 CCountryExtreme *ext = NULL;
5952
5953 // binary search to lowest lat
5954 size_t R = x_GetLatStartIndex(min_y);
5955
5956 while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
5957 if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
5958 // out of range, don't bother calculating distance
5959 } else if (!NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel0(), country)) {
5960 // wrong country, skip
5961 } else if (!NStr::IsBlank(province) && !NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel1(), province)) {
5962 // wrong province, skip
5963 } else {
5964 double end;
5965 if (x < m_LatLonSortedList[R]->GetMinX()) {
5966 end = m_LatLonSortedList[R]->GetMinLon();
5967 } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
5968 end = m_LatLonSortedList[R]->GetMaxLon();
5969 } else {
5970 end = lon;
5971 }
5972 double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
5973 if (closest < 0.0 || closest > dist) {
5974 closest = dist;
5975 ext = m_LatLonSortedList[R]->GetBlock();
5976 }
5977 }
5978 R++;
5979 }
5980 distance = closest;
5981 return ext;
5982 }
5983
5984
5985
5986
5987
DoCountryBoxesOverlap(const string & country1,const string & country2)5988 bool CLatLonCountryMap::DoCountryBoxesOverlap(const string& country1,
5989 const string& country2)
5990 {
5991 if (NStr::IsBlank (country1) || NStr::IsBlank(country2)) return false;
5992
5993 const CCountryExtreme *ext1 = x_FindCountryExtreme (country1);
5994 if (!ext1) {
5995 return false;
5996 }
5997 const CCountryExtreme *ext2 = x_FindCountryExtreme (country2);
5998 if (!ext2) {
5999 return false;
6000 }
6001
6002
6003 return ext1->DoesOverlap(ext2);
6004 }
6005
6006
AdjustAndRoundDistance(double distance,double scale)6007 int CLatLonCountryMap::AdjustAndRoundDistance (double distance, double scale)
6008
6009 {
6010 if (scale < 1.1) {
6011 distance += 111.19;
6012 } else if (scale > 19.5 && scale < 20.5) {
6013 distance += 5.56;
6014 } else if (scale > 99.5 && scale < 100.5) {
6015 distance += 1.11;
6016 }
6017
6018 return (int) (distance + 0.5);
6019 }
6020
6021
AdjustAndRoundDistance(double distance)6022 int CLatLonCountryMap::AdjustAndRoundDistance (double distance)
6023
6024 {
6025 return AdjustAndRoundDistance (distance, m_Scale);
6026 }
6027
6028
6029
6030
6031 END_objects_SCOPE // namespace ncbi::objects::
6032
6033 END_NCBI_SCOPE
6034
6035 /* Original file checksum: lines: 65, chars: 1891, CRC32: 7724f0c5 */
6036