1 /* $Id: SubSource.hpp 629611 2021-04-19 15:57:28Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 */
27
/// @file SubSource.hpp
29 /// User-defined methods of the data storage class.
30 ///
31 /// This file was originally generated by application DATATOOL
32 /// using the following specifications:
33 /// 'seqfeat.asn'.
34 ///
35 /// New methods or data members can be added to it if needed.
36 /// See also: SubSource_.hpp
37
38
39 #ifndef OBJECTS_SEQFEAT_SUBSOURCE_HPP
40 #define OBJECTS_SEQFEAT_SUBSOURCE_HPP
41
42
43 // generated includes
44 #include <objects/seqfeat/SubSource_.hpp>
45
46 // generated classes
47
48
// other includes
#include <objects/general/Date.hpp>
#include <objects/general/Date_std.hpp>
#include <corelib/ncbitime.hpp>
#include <util/static_map.hpp>

// standard library
#include <utility>
54
55 BEGIN_NCBI_SCOPE
56
57 BEGIN_objects_SCOPE // namespace ncbi::objects::
// Forward declarations for types named by the declarations further down in
// this header (e.g. CRef<CDate>, CLatLonCountryId*, and
// unique_ptr<CLatLonCountryMap>).
class CDate;
class CDate_std;
class CLatLonCountryId;
class CLatLonCountryMap;
62
63 /////////////////////////////////////////////////////////////////////////////
64 class NCBI_SEQFEAT_EXPORT CSubSource : public CSubSource_Base
65 {
66 typedef CSubSource_Base Tparent;
67 public:
68 // constructor
69 CSubSource(void);
70 CSubSource(TSubtype subtype, const TName& name);
71 CSubSource(const string& subtype, const TName& name);
72
73 // destructor
74 ~CSubSource(void);
75
76 void GetLabel(string* str) const;
77
78 enum EVocabulary {
79 eVocabulary_raw, // per ASN.1, except eSubtype_other <-> "note"
80 eVocabulary_insdc // per GB/DDBJ/EMBL qualifier names
81 };
82
83 // convert subtype from string to enum.
84 static TSubtype GetSubtypeValue(const string& str,
85 EVocabulary vocabulary = eVocabulary_raw);
86
87 // get name for subsource
88 static string GetSubtypeName(CSubSource::TSubtype stype,
89 EVocabulary vocabulary = eVocabulary_raw);
90
91 // tests whether GetSubtypeName is expected to throw an exception
92 static bool IsValidSubtypeName(const string& str,
93 EVocabulary vocabulary = eVocabulary_raw);
94
95 static bool IsMultipleValuesAllowed(TSubtype);
96
97 // identify whether subsource value should be blank
98 static bool NeedsNoText (const TSubtype& subtype);
99
100 // some subsources are discouraged and should not be offered to the user as an option
101 static bool IsDiscouraged (const TSubtype subtype);
102
103 // read collection date from string
104 static CRef<CDate> DateFromCollectionDate (const string& str) THROWS((CException));
105
106 static void IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future);
107 typedef enum {
108 eDateFormatFlag_ok = 0,
109 eDateFormatFlag_bad_format = 1,
110 eDateFormatFlag_in_future = 2,
111 eDateFormatFlag_out_of_order = 4
112 } EDateFormatFlag;
113 static size_t CheckDateFormat(const string& date_string);
114
115 static string GetCollectionDateProblem (const string& date_string);
116 static bool IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format);
117 static bool IsCollectionDateAfterTime(const CDate& collection_date, time_t t);
118 static bool IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime);
119
120 static bool IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone = true);
121 static bool IsISOFormatDateOnly(const string& date);
122 static bool IsISOFormatDate (const string& orig_date);
123 static CRef<CDate> GetDateFromISODate(const string& orig_date);
124
125 /// Determine whether day number could occur in month.
126 /// @param day The number of the day, 1-based [in]
127 /// @param month The number of the month, 1-based [in]
128 /// @param year The number of the year, 1-based [in]
129 static bool IsDayValueOkForMonth(int day, int month, int year);
130
131 /// Attempt to fix the format of the date
132 /// Returns a blank if the format of the date cannot be determined
133 /// @param orig_date The original date [in]
134 /// Rules:
135 /// First, check to see if the string matches the ISO format (YYYY-MM-DD);
136 /// if so then just return original string as this is valid.
137 /// Second, try to split the string into tokens using the following delimiters:
138 /// * space
139 /// * comma (,)
140 /// * dash (-)
141 /// * slash (/)
142 /// * underscore (_)
143 /// * equals (=)
144 /// * period (.)
145 /// If more than three tokens are generated, fail and return empty string.
146 /// Of the tokens that are generated, look for a token that contains letters.
147 /// If there is such a token, this token is assumed to be the month, and will
148 /// be checked to see if it begins with any of the three-letter abbreviations
149 /// for months (Jan, Feb, Mar, etc.). If so, the month is known. If none of
150 /// the abbreviations produce a match, fail and return an empty string. If
151 /// more than one token that contains letters is found, return an empty string.
152 /// If there are no tokens that contain letters, try to determine which token is
153 /// the month by eliminating tokens that would be year or day.
154 /// Any token that is a number and has a value greater than 31 will be assumed
155 /// to be the year. If there is more than one such token, return an empty string.
156 /// After making this initial pass, try to guess the identities of the remaining tokens.
157 /// Numbers between 1 and 12 could be considered months, if no month token containing
158 /// letters was already identified. If two or more such tokens are found, the date is
159 /// ambiguous: return an empty string, unless one of these two conditions are met:
160 /// a) the numbers are equal, in which case ambiguity about placement is irrelevant
161 /// b) if one token is NOT zero-padded and less than 10, and the other is
162 /// either 10 or more or IS zero-padded, then the token that is not padded and
163 /// less than 10 is the day, and the other is the year, to which we should add 2000
164 /// If a number is between 1 and 31, it could be considered the day. If two such tokens
165 /// are found, the date is ambiguous: return an empty string.
166 /// If there is a number that cannot be the month or the day, assume that this is the
167 /// year. If the year is less than 100, this may be a two-digit representation. If
168 /// 2000 + the value is not in the future, use this as the year, otherwise use
169 /// 1900 + the value for the year.
170 /// If all tokens can be identified, arrange them in the output string in one of the
171 /// following formats:
172 /// YYYY
173 /// Mmm-YYYY
174 /// DD-Mmm-YYYY
175
176 static string FixDateFormat(const string& orig_date);
177 static string FixDateFormat(const string& orig_date, bool month_first, bool& month_ambiguous);
178 static void DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first);
179 static void IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
180 bool& lat_in_range, bool& lon_in_range,
181 double& lat_value, double& lon_value);
182 static string FixLatLonFormat (string orig_lat_lon, bool guess = false);
183 static string MakeLatLon(double lat_value, double lon_value, int lat_precision = 2, int lon_precision = 2);
184 static string FixLatLonPrecision(const string& orig);
185
186 enum ELatLonCountryErr {
187 eLatLonCountryErr_None = 0,
188 eLatLonCountryErr_Country,
189 eLatLonCountryErr_State,
190 eLatLonCountryErr_Water,
191 eLatLonCountryErr_Value
192 };
193
194 static string ValidateLatLonCountry (const string& countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode);
195
196 static bool IsValidSexQualifierValue (const string& value);
197 static string FixSexQualifierValue (const string& value);
198
199 static bool IsAltitudeValid (const string& value);
200 static string FixAltitude (const string& value);
201
202 static bool IsPlasmidNameValid(const string& value, const string& taxname);
203 static bool IsChromosomeNameValid(const string& value, const string& taxname);
204 static bool IsLinkageGroupNameValid(const string& value, const string& taxname);
205 static bool IsSegmentValid(const string& value);
206 static bool IsEndogenousVirusNameValid(const string& value);
207
208
209 static string FixDevStageCapitalization(const string& value);
210 static string FixCellTypeCapitalization(const string& value);
211 static string FixIsolationSourceCapitalization(const string& value);
212 static string FixTissueTypeCapitalization(const string& value);
213 static string FixLabHostCapitalization(const string& value);
214 static string FixCapitalization(TSubtype subtype, const string& value);
215 void FixCapitalization();
216
217 static string AutoFix(TSubtype subtype, const string& value);
218 void AutoFix();
219
220 static bool HasCultureNotes(const string& value);
221 static void RemoveCultureNotes(string& value, bool is_species_level = true);
222 void RemoveCultureNotes(bool is_species_level = true);
223
224 static string CheckCellLine(const string& cell_line, const string& organism);
225
226 private:
227 // Prohibit copy constructor and assignment operator
228 CSubSource(const CSubSource& value);
229 CSubSource& operator=(const CSubSource& value);
230
231 static string x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim);
232 static vector<string> x_GetDateTokens(const string& orig_date);
233 static CLatLonCountryId * x_CalculateLatLonId(float lat_value, float lon_value, string country, string province);
234 static bool x_IsFixableIsoDate(const string& orig_date);
235 static string x_RemoveIsoTime(const string& orig_date);
236
237 static int x_GetPrecision(const string& num_str);
238 static string x_FormatWithPrecision(double val, int precision);
239
240 static bool x_GenericRepliconNameValid(const string& value);
241 static bool x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string& value, const string& taxname);
242
243 // validation data read from external files
244 static unique_ptr<CLatLonCountryMap> m_LatLonCountryMap;
245 static unique_ptr<CLatLonCountryMap> m_LatLonWaterMap;
246
247 };
248
249 /////////////////// CSubSource inline methods
250
251 // constructor
252 inline
CSubSource(void)253 CSubSource::CSubSource(void)
254 {
255 }
256
257 inline
CSubSource(TSubtype subtype,const TName & name)258 CSubSource::CSubSource(TSubtype subtype, const TName& name)
259 {
260 SetSubtype(subtype);
261 SetName(name);
262 }
263
264 inline
CSubSource(const string & subtype,const TName & name)265 CSubSource::CSubSource(const string& subtype, const TName& name)
266 {
267 SetSubtype(GetSubtypeValue(subtype));
268 SetName(name);
269 }
270
271
272 /////////////////// end of CSubSource inline methods
273
274
275 // =============================================================================
276 // Country Names (legal values found in country subtype)
277 // =============================================================================
278
279 class NCBI_SEQFEAT_EXPORT CCountries
280 {
281 public:
282 // USAStateCleanup return types and examples:
283 enum EStateCleanup {
284 e_NoResult = 0, // ""
285 e_Valid = 1, // "USA: Colorado"
286 e_Corrected = 2, // "USA: Hamilton, MT" -> "USA: Montana, Hamilton"
287 e_Ambiguous = 3, // "USA: Montana, Maine"
288 e_Missing = 4, // "USA: Springfield"
289 e_NotUSA = 5 // "France: Paris"
290 };
291
292 static bool IsValid(const string& country);
293 static bool IsValid(const string& country, bool& is_miscapitalized);
294 static bool WasValid(const string& country);
295 static bool WasValid(const string& country, bool& is_miscapitalized);
296 static string CapitalizeFirstLetterOfEveryWord (const string &phrase);
297 static string WholeCountryFix(string country);
298 static bool IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1);
299 static bool ContainsMultipleCountryNames (const string &phrase);
300 static string GetCorrectedCountryCapitalization(const string& country);
301 static string NewFixCountry (const string& input, bool us_territories = false);
302 static bool ChangeExtraColonsToCommas(string& country);
303 static string CountryFixupItem(const string &input, bool capitalize_after_colon);
304 typedef CStaticPairArrayMap<const char*, const char*, PCase_CStr> TCStringPairsMap;
305
306 typedef map<string, string, PNocase> TUsaExceptionMap;
307 static void ReadUSAExceptionMap (TUsaExceptionMap& exceptions, const string& filepath);
308 static void LoadUSAExceptionMap (const TUsaExceptionMap& exceptions);
309 static void LoadUSAExceptionMap (const string& exception_file );
310
311 static string USAStateCleanup (const string& country );
312 static string USAStateCleanup (const string& country, EStateCleanup& type );
313 private:
314 static const string sm_Countries[];
315 static const string sm_Former_Countries[];
316 static void x_RemoveDelimitersFromEnds(string& val, bool except_paren = false);
317 static vector<string> x_Tokenize(const string& val);
318 static void x_FindCountryName(const TCStringPairsMap& fix_map, const vector<string>& countries, string& valid_country, string& orig_valid_country, bool& too_many_countries, bool& bad_cap);
319 };
320
321
322 // ==================== for validating lat-lon versus country ================
323
// Forward declaration: CCountryExtreme::AddLine() takes a CCountryLine
// pointer before CCountryLine itself is defined.
class CCountryLine;
325
326 class NCBI_SEQFEAT_EXPORT CCountryExtreme
327 {
328 public:
329 CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y);
330 ~CCountryExtreme (void);
331
GetCountry(void) const332 string GetCountry(void) const { return m_CountryName; }
GetLevel0(void) const333 string GetLevel0(void) const { return m_Level0; }
GetLevel1(void) const334 string GetLevel1(void) const { return m_Level1; }
GetMinX(void) const335 int GetMinX(void) const { return m_MinX; }
GetMinY(void) const336 int GetMinY(void) const { return m_MinY; }
GetMaxX(void) const337 int GetMaxX(void) const { return m_MaxX; }
GetMaxY(void) const338 int GetMaxY(void) const { return m_MaxY; }
GetArea(void) const339 int GetArea(void) const { return m_Area; }
340 void AddLine(const CCountryLine* line);
341 bool SetMinX(int min_x);
342 bool SetMinY(int min_y);
343 bool SetMaxX(int max_x);
344 bool SetMaxY(int max_y);
345 bool DoesOverlap(const CCountryExtreme* other_block) const;
346 bool PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const;
347
348 private:
349 string m_CountryName;
350 string m_Level0;
351 string m_Level1;
352 int m_MinX;
353 int m_MinY;
354 int m_MaxX;
355 int m_MaxY;
356 int m_Area;
357 };
358
359
360 class CCountryLine
361 {
362 public:
363 CCountryLine (const string & country_name, double y, double min_x, double max_x, double scale);
364 ~CCountryLine (void);
365
GetCountry(void) const366 const string & GetCountry(void) const { return m_CountryName; }
GetLat(void) const367 double GetLat(void) const { return m_Y / m_Scale; }
GetMinLon(void) const368 double GetMinLon(void) const { return m_MinX / m_Scale; }
GetMaxLon(void) const369 double GetMaxLon(void) const { return m_MaxX / m_Scale; }
GetY(void) const370 int GetY(void) const { return m_Y; }
GetMinX(void) const371 int GetMinX(void) const { return m_MinX; }
GetMaxX(void) const372 int GetMaxX(void) const { return m_MaxX; }
373
374 static int ConvertLat(double y, double scale);
375 static int ConvertLon(double x, double scale);
376
SetBlock(CCountryExtreme * block)377 void SetBlock (CCountryExtreme *block) { m_Block = block; }
GetBlock(void) const378 CCountryExtreme * GetBlock(void) const {return m_Block; }
379
380 private:
381 int x_ConvertLat(double y);
382 int x_ConvertLon(double x);
383
384 CCountryExtreme *m_Block;
385 string m_CountryName;
386 int m_Y;
387 int m_MinX;
388 int m_MaxX;
389 double m_Scale;
390 };
391
392
393 class NCBI_SEQFEAT_EXPORT CLatLonCountryId
394 {
395 public:
396 CLatLonCountryId(float lat, float lon);
397 ~CLatLonCountryId(void);
398
GetLat(void) const399 float GetLat(void) const { return m_Lat; }
SetLat(float lat)400 void SetLat(float lat) { m_Lat = lat; }
GetLon(void) const401 float GetLon(void) const { return m_Lon; }
SetLon(float lon)402 void SetLon(float lon) { m_Lon = lon; }
GetFullGuess(void) const403 string GetFullGuess(void) const { return m_FullGuess; }
SetFullGuess(string guess)404 void SetFullGuess(string guess) { m_FullGuess = guess; }
GetGuessCountry(void) const405 string GetGuessCountry(void) const { return m_GuessCountry; }
SetGuessCountry(string guess)406 void SetGuessCountry(string guess) { m_GuessCountry = guess; }
GetGuessProvince(void) const407 string GetGuessProvince(void) const { return m_GuessProvince; }
SetGuessProvince(string guess)408 void SetGuessProvince(string guess) { m_GuessProvince = guess; }
GetGuessWater(void) const409 string GetGuessWater(void) const { return m_GuessWater; }
SetGuessWater(string guess)410 void SetGuessWater(string guess) { m_GuessWater = guess; }
GetClosestFull(void) const411 string GetClosestFull(void) const { return m_ClosestFull; }
SetClosestFull(string closest)412 void SetClosestFull(string closest) { m_ClosestFull = closest; }
GetClosestCountry(void) const413 string GetClosestCountry(void) const { return m_ClosestCountry; }
SetClosestCountry(string closest)414 void SetClosestCountry(string closest) { m_ClosestCountry = closest; }
GetClosestProvince(void) const415 string GetClosestProvince(void) const { return m_ClosestProvince; }
SetClosestProvince(string closest)416 void SetClosestProvince(string closest) { m_ClosestProvince = closest; }
GetClosestWater(void) const417 string GetClosestWater(void) const { return m_ClosestWater; }
SetClosestWater(string closest)418 void SetClosestWater(string closest) { m_ClosestWater = closest; }
GetClaimedFull(void) const419 string GetClaimedFull(void) const { return m_ClaimedFull; }
SetClaimedFull(string claimed)420 void SetClaimedFull(string claimed) { m_ClaimedFull = claimed; }
421
GetLandDistance(void) const422 int GetLandDistance(void) const { return m_LandDistance; }
SetLandDistance(int dist)423 void SetLandDistance (int dist) { m_LandDistance = dist; }
GetWaterDistance(void) const424 int GetWaterDistance(void) const { return m_WaterDistance; }
SetWaterDistance(int dist)425 void SetWaterDistance (int dist) { m_WaterDistance = dist; }
GetClaimedDistance(void) const426 int GetClaimedDistance(void) const { return m_ClaimedDistance; }
SetClaimedDistance(int dist)427 void SetClaimedDistance (int dist) { m_ClaimedDistance = dist; }
428
429
430 enum EClassificationFlags {
431 fCountryMatch = (1),
432 fProvinceMatch = (1 << 1),
433 fWaterMatch = (1 << 2),
434 fOverlap = (1 << 3),
435 fCountryClosest = (1 << 4),
436 fProvinceClosest = (1 << 5),
437 fWaterClosest = (1 << 6)
438 };
439 typedef int TClassificationFlags; ///< Bitwise OR of "EClassificationFlags"
440
441 CLatLonCountryId::TClassificationFlags Classify(string country, string province);
442
443
444 private:
445 float m_Lat;
446 float m_Lon;
447 string m_FullGuess;
448 string m_GuessCountry;
449 string m_GuessProvince;
450 string m_GuessWater;
451 string m_ClosestFull;
452 string m_ClosestCountry;
453 string m_ClosestProvince;
454 string m_ClosestWater;
455 string m_ClaimedFull;
456 int m_LandDistance;
457 int m_WaterDistance;
458 int m_ClaimedDistance;
459 };
460
461 class NCBI_SEQFEAT_EXPORT CLatLonCountryMap
462 {
463 public:
464 CLatLonCountryMap(bool is_water);
465 ~CLatLonCountryMap(void);
466 bool IsCountryInLatLon(const string& country, double lat, double lon);
467 const CCountryExtreme * GuessRegionForLatLon(double lat, double lon,
468 const string& country = kEmptyStr,
469 const string& province = kEmptyStr);
470 const CCountryExtreme * FindClosestToLatLon(double lat, double lon,
471 double range, double& distance);
472 bool IsClosestToLatLon(const string& country, double lat, double lon,
473 double range, double& distance);
474 bool HaveLatLonForRegion(const string& country);
475 bool DoCountryBoxesOverlap(const string& country1, const string& country2);
476 const CCountryExtreme * IsNearLatLon(double lat, double lon, double range,
477 double& distance,
478 const string& country,
479 const string& province = kEmptyStr);
GetScale(void)480 double GetScale (void) { return m_Scale; }
481 static int AdjustAndRoundDistance (double distance, double scale);
482 int AdjustAndRoundDistance (double distance);
483
484 enum ELatLonAdjustFlags {
485 fNone = 0 ,
486 fFlip = 1 ,
487 fNegateLat = (1 << 1),
488 fNegateLon = (1 << 2),
489 };
490 typedef int TLatLonAdjustFlags; ///< Bitwise OR of "ELatLonAdjustFlags"
491
492
493 private:
494 void x_InitFromDefaultList(const char * const *list, int num);
495 bool x_InitFromFile(const string& filename);
496 static bool s_CompareTwoLinesByLatLonOnly(const CCountryLine* line1,
497 const CCountryLine* line2);
498 static bool s_CompareTwoLinesByCountry(const CCountryLine* line1,
499 const CCountryLine* line2);
500 static bool s_CompareTwoLinesByLatLonThenCountry(const CCountryLine* line1,
501 const CCountryLine* line2);
502
503 size_t x_GetLatStartIndex (int y);
504 const CCountryExtreme * x_FindCountryExtreme (const string& country);
505
506
507 typedef vector <CCountryLine *> TCountryLineList;
508 typedef TCountryLineList::const_iterator TCountryLineList_iter;
509
510 TCountryLineList m_CountryLineList;
511 TCountryLineList m_LatLonSortedList;
512 double m_Scale;
513
514 typedef vector <CCountryExtreme *> TCountryExtremeList;
515 typedef TCountryExtremeList::const_iterator TCountryExtremeList_iter;
516 TCountryExtremeList m_CountryExtremes;
517
518
519 static const string sm_BodiesOfWater[];
520
521
522
523 };
524
525 NCBI_SEQFEAT_EXPORT double ErrorDistance (
526 double latA,
527 double lonA,
528 double scale);
529
530
531
532 END_objects_SCOPE // namespace ncbi::objects::
533
534 END_NCBI_SCOPE
535
536 #endif // OBJECTS_SEQFEAT_SUBSOURCE_HPP
537 /* Original file checksum: lines: 94, chars: 2578, CRC32: 1c534244 */
538