1 /* $Id: SubSource.hpp 629611 2021-04-19 15:57:28Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  */
27 
/// @file SubSource.hpp
29 /// User-defined methods of the data storage class.
30 ///
31 /// This file was originally generated by application DATATOOL
32 /// using the following specifications:
33 /// 'seqfeat.asn'.
34 ///
35 /// New methods or data members can be added to it if needed.
36 /// See also: SubSource_.hpp
37 
38 
39 #ifndef OBJECTS_SEQFEAT_SUBSOURCE_HPP
40 #define OBJECTS_SEQFEAT_SUBSOURCE_HPP
41 
42 
// generated includes
#include <objects/seqfeat/SubSource_.hpp>

// generated classes


// other includes
#include <objects/general/Date.hpp>
#include <objects/general/Date_std.hpp>
#include <corelib/ncbitime.hpp>
#include <util/static_map.hpp>

// standard library
#include <utility>
54 
55 BEGIN_NCBI_SCOPE
56 
57 BEGIN_objects_SCOPE // namespace ncbi::objects::
// Forward declarations.
class CLatLonCountryMap;
class CLatLonCountryId;
class CDate_std;
class CDate;
62 
63 /////////////////////////////////////////////////////////////////////////////
64 class NCBI_SEQFEAT_EXPORT CSubSource : public CSubSource_Base
65 {
66     typedef CSubSource_Base Tparent;
67 public:
68     // constructor
69     CSubSource(void);
70     CSubSource(TSubtype subtype, const TName& name);
71     CSubSource(const string& subtype, const TName& name);
72 
73     // destructor
74     ~CSubSource(void);
75 
76     void GetLabel(string* str) const;
77 
78     enum EVocabulary {
79         eVocabulary_raw, // per ASN.1, except eSubtype_other <-> "note"
80         eVocabulary_insdc // per GB/DDBJ/EMBL qualifier names
81     };
82 
83     // convert subtype from string to enum.
84     static TSubtype GetSubtypeValue(const string& str,
85                                     EVocabulary vocabulary = eVocabulary_raw);
86 
87 	// get name for subsource
88     static string GetSubtypeName(CSubSource::TSubtype stype,
89                                  EVocabulary vocabulary = eVocabulary_raw);
90 
91     // tests whether GetSubtypeName is expected to throw an exception
92     static bool IsValidSubtypeName(const string& str,
93                                    EVocabulary vocabulary = eVocabulary_raw);
94 
95     static bool IsMultipleValuesAllowed(TSubtype);
96 
97     // identify whether subsource value should be blank
98     static bool NeedsNoText (const TSubtype& subtype);
99 
100     // some subsources are discouraged and should not be offered to the user as an option
101     static bool IsDiscouraged (const TSubtype subtype);
102 
103 	// read collection date from string
104     static CRef<CDate> DateFromCollectionDate (const string& str) THROWS((CException));
105 
106     static void IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future);
107     typedef enum {
108         eDateFormatFlag_ok = 0,
109         eDateFormatFlag_bad_format = 1,
110         eDateFormatFlag_in_future = 2,
111         eDateFormatFlag_out_of_order = 4
112     } EDateFormatFlag;
113     static size_t CheckDateFormat(const string& date_string);
114 
115     static string GetCollectionDateProblem (const string& date_string);
116     static bool IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format);
117     static bool IsCollectionDateAfterTime(const CDate& collection_date, time_t t);
118     static bool IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime);
119 
120     static bool IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone = true);
121     static bool IsISOFormatDateOnly(const string& date);
122     static bool IsISOFormatDate (const string& orig_date);
123     static CRef<CDate> GetDateFromISODate(const string& orig_date);
124 
125     /// Determine whether day number could occur in month.
126     /// @param day   The number of the day, 1-based [in]
127     /// @param month The number of the month, 1-based [in]
128     /// @param year  The number of the year, 1-based [in]
129     static bool IsDayValueOkForMonth(int day, int month, int year);
130 
131     /// Attempt to fix the format of the date
132     /// Returns a blank if the format of the date cannot be determined
133     /// @param orig_date The original date [in]
134     /// Rules:
135     /// First, check to see if the string matches the ISO format (YYYY-MM-DD);
136     /// if so then just return original string as this is valid.
137     /// Second, try to split the string into tokens using the following delimiters:
138     ///      * space
139     ///      * comma (,)
140     ///      * dash (-)
141     ///      * slash (/)
142     ///      * underscore (_)
143     ///      * equals (=)
144     ///      * period (.)
145     /// If more than three tokens are generated, fail and return empty string.
146     /// Of the tokens that are generated, look for a token that contains letters.
147     /// If there is such a token, this token is assumed to be the month, and will
148     /// be checked to see if it begins with any of the three-letter abbreviations
149     /// for months (Jan, Feb, Mar, etc.). If so, the month is known. If none of
150     /// the abbreviations produce a match, fail and return an empty string. If
151     /// more than one token that contains letters is found, return an empty string.
152     /// If there are no tokens that contain letters, try to determine which token is
153     /// the month by eliminating tokens that would be year or day.
154     /// Any token that is a number and has a value greater than 31 will be assumed
155     /// to be the year. If there is more than one such token, return an empty string.
156     /// After making this initial pass, try to guess the identities of the remaining tokens.
157     /// Numbers between 1 and 12 could be considered months, if no month token containing
158     /// letters was already identified. If two or more such tokens are found, the date is
159     /// ambiguous: return an empty string, unless one of these two conditions are met:
160     ///   a) the numbers are equal, in which case ambiguity about placement is irrelevant
161     ///   b) if one token is NOT zero-padded and less than 10, and the other is
162     ///      either 10 or more or IS zero-padded, then the token that is not padded and
163     ///      less than 10 is the day, and the other is the year, to which we should add 2000
164     /// If a number is between 1 and 31, it could be considered the day. If two such tokens
165     /// are found, the date is ambiguous: return an empty string.
166     /// If there is a number that cannot be the month or the day, assume that this is the
167     /// year. If the year is less than 100, this may be a two-digit representation. If
168     /// 2000 + the value is not in the future, use this as the year, otherwise use
169     /// 1900 + the value for the year.
170     /// If all tokens can be identified, arrange them in the output string in one of the
171     /// following formats:
172     /// YYYY
173     /// Mmm-YYYY
174     /// DD-Mmm-YYYY
175 
176     static string FixDateFormat(const string& orig_date);
177     static string FixDateFormat(const string& orig_date, bool month_first, bool& month_ambiguous);
178     static void DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first);
179     static void IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
180                                      bool& lat_in_range, bool& lon_in_range,
181                                      double& lat_value, double& lon_value);
182     static string FixLatLonFormat (string orig_lat_lon, bool guess = false);
183     static string MakeLatLon(double lat_value, double lon_value, int lat_precision = 2, int lon_precision = 2);
184     static string FixLatLonPrecision(const string& orig);
185 
186     enum ELatLonCountryErr {
187         eLatLonCountryErr_None = 0,
188         eLatLonCountryErr_Country,
189         eLatLonCountryErr_State,
190         eLatLonCountryErr_Water,
191         eLatLonCountryErr_Value
192     };
193 
194     static string ValidateLatLonCountry (const string& countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode);
195 
196     static bool IsValidSexQualifierValue (const string& value);
197     static string FixSexQualifierValue (const string& value);
198 
199     static bool IsAltitudeValid (const string& value);
200     static string FixAltitude (const string& value);
201 
202     static bool IsPlasmidNameValid(const string& value, const string& taxname);
203     static bool IsChromosomeNameValid(const string& value, const string& taxname);
204     static bool IsLinkageGroupNameValid(const string& value, const string& taxname);
205     static bool IsSegmentValid(const string& value);
206     static bool IsEndogenousVirusNameValid(const string& value);
207 
208 
209     static string FixDevStageCapitalization(const string& value);
210     static string FixCellTypeCapitalization(const string& value);
211     static string FixIsolationSourceCapitalization(const string& value);
212     static string FixTissueTypeCapitalization(const string& value);
213     static string FixLabHostCapitalization(const string& value);
214     static string FixCapitalization(TSubtype subtype, const string& value);
215     void FixCapitalization();
216 
217     static string AutoFix(TSubtype subtype, const string& value);
218     void AutoFix();
219 
220     static bool HasCultureNotes(const string& value);
221     static void RemoveCultureNotes(string& value, bool is_species_level = true);
222     void RemoveCultureNotes(bool is_species_level = true);
223 
224     static string CheckCellLine(const string& cell_line, const string& organism);
225 
226 private:
227     // Prohibit copy constructor and assignment operator
228     CSubSource(const CSubSource& value);
229     CSubSource& operator=(const CSubSource& value);
230 
231     static string x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim);
232     static vector<string> x_GetDateTokens(const string& orig_date);
233     static CLatLonCountryId * x_CalculateLatLonId(float lat_value, float lon_value, string country, string province);
234     static bool x_IsFixableIsoDate(const string& orig_date);
235     static string x_RemoveIsoTime(const string& orig_date);
236 
237     static int x_GetPrecision(const string& num_str);
238     static string x_FormatWithPrecision(double val, int precision);
239 
240     static bool x_GenericRepliconNameValid(const string& value);
241     static bool x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string& value, const string& taxname);
242 
243     // validation data read from external files
244     static unique_ptr<CLatLonCountryMap> m_LatLonCountryMap;
245     static unique_ptr<CLatLonCountryMap> m_LatLonWaterMap;
246 
247 };
248 
249 /////////////////// CSubSource inline methods
250 
251 // constructor
252 inline
CSubSource(void)253 CSubSource::CSubSource(void)
254 {
255 }
256 
257 inline
CSubSource(TSubtype subtype,const TName & name)258 CSubSource::CSubSource(TSubtype subtype, const TName& name)
259 {
260     SetSubtype(subtype);
261     SetName(name);
262 }
263 
264 inline
CSubSource(const string & subtype,const TName & name)265 CSubSource::CSubSource(const string& subtype, const TName& name)
266 {
267     SetSubtype(GetSubtypeValue(subtype));
268     SetName(name);
269 }
270 
271 
272 /////////////////// end of CSubSource inline methods
273 
274 
275 // =============================================================================
276 //                 Country Names (legal values found in country subtype)
277 // =============================================================================
278 
279 class NCBI_SEQFEAT_EXPORT CCountries
280 {
281 public:
282     // USAStateCleanup return types and examples:
283     enum EStateCleanup {
284         e_NoResult  = 0, //  ""
285         e_Valid     = 1, //  "USA: Colorado"
286         e_Corrected = 2, //  "USA: Hamilton, MT" -> "USA: Montana, Hamilton"
287         e_Ambiguous = 3, //  "USA: Montana, Maine"
288         e_Missing   = 4, //  "USA: Springfield"
289         e_NotUSA    = 5  //  "France: Paris"
290     };
291 
292     static bool IsValid(const string& country);
293     static bool IsValid(const string& country, bool& is_miscapitalized);
294     static bool WasValid(const string& country);
295     static bool WasValid(const string& country, bool& is_miscapitalized);
296     static string CapitalizeFirstLetterOfEveryWord (const string &phrase);
297     static string WholeCountryFix(string country);
298     static bool IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1);
299     static bool ContainsMultipleCountryNames (const string &phrase);
300     static string GetCorrectedCountryCapitalization(const string& country);
301     static string NewFixCountry (const string& input, bool us_territories = false);
302     static bool ChangeExtraColonsToCommas(string& country);
303     static string CountryFixupItem(const string &input, bool capitalize_after_colon);
304     typedef CStaticPairArrayMap<const char*, const char*, PCase_CStr> TCStringPairsMap;
305 
306     typedef map<string, string, PNocase> TUsaExceptionMap;
307     static void ReadUSAExceptionMap (TUsaExceptionMap& exceptions, const string& filepath);
308     static void LoadUSAExceptionMap (const TUsaExceptionMap& exceptions);
309     static void LoadUSAExceptionMap (const string& exception_file );
310 
311     static string USAStateCleanup (const string& country );
312     static string USAStateCleanup (const string& country, EStateCleanup& type );
313 private:
314     static const string sm_Countries[];
315     static const string sm_Former_Countries[];
316     static void x_RemoveDelimitersFromEnds(string& val, bool except_paren = false);
317     static vector<string> x_Tokenize(const string& val);
318     static void x_FindCountryName(const TCStringPairsMap& fix_map, const vector<string>& countries, string& valid_country, string& orig_valid_country, bool& too_many_countries, bool& bad_cap);
319 };
320 
321 
322 // ==================== for validating lat-lon versus country ================
323 
324 class CCountryLine;
325 
326 class NCBI_SEQFEAT_EXPORT CCountryExtreme
327 {
328 public:
329     CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y);
330     ~CCountryExtreme (void);
331 
GetCountry(void) const332     string GetCountry(void)         const { return m_CountryName; }
GetLevel0(void) const333     string GetLevel0(void)         const { return m_Level0; }
GetLevel1(void) const334     string GetLevel1(void)         const { return m_Level1; }
GetMinX(void) const335     int GetMinX(void)               const { return m_MinX; }
GetMinY(void) const336     int GetMinY(void)               const { return m_MinY; }
GetMaxX(void) const337     int GetMaxX(void)               const { return m_MaxX; }
GetMaxY(void) const338     int GetMaxY(void)               const { return m_MaxY; }
GetArea(void) const339     int GetArea(void)               const { return m_Area; }
340     void AddLine(const CCountryLine* line);
341     bool SetMinX(int min_x);
342     bool SetMinY(int min_y);
343     bool SetMaxX(int max_x);
344     bool SetMaxY(int max_y);
345     bool DoesOverlap(const CCountryExtreme* other_block) const;
346     bool PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const;
347 
348 private:
349     string m_CountryName;
350     string m_Level0;
351     string m_Level1;
352     int m_MinX;
353     int m_MinY;
354     int m_MaxX;
355     int m_MaxY;
356     int m_Area;
357 };
358 
359 
360 class CCountryLine
361 {
362 public:
363     CCountryLine (const string & country_name, double y, double min_x, double max_x, double scale);
364     ~CCountryLine (void);
365 
GetCountry(void) const366     const string & GetCountry(void)            const { return m_CountryName; }
GetLat(void) const367   double GetLat(void)                const { return m_Y / m_Scale; }
GetMinLon(void) const368   double GetMinLon(void)             const { return m_MinX / m_Scale; }
GetMaxLon(void) const369   double GetMaxLon(void)             const { return m_MaxX / m_Scale; }
GetY(void) const370   int GetY(void)                  const { return m_Y; }
GetMinX(void) const371   int GetMinX(void)               const { return m_MinX; }
GetMaxX(void) const372   int GetMaxX(void)               const { return m_MaxX; }
373 
374   static int ConvertLat(double y, double scale);
375   static int ConvertLon(double x, double scale);
376 
SetBlock(CCountryExtreme * block)377   void SetBlock (CCountryExtreme *block) { m_Block = block; }
GetBlock(void) const378   CCountryExtreme * GetBlock(void) const {return m_Block; }
379 
380 private:
381   int x_ConvertLat(double y);
382   int x_ConvertLon(double x);
383 
384   CCountryExtreme *m_Block;
385     string m_CountryName;
386     int m_Y;
387     int m_MinX;
388     int m_MaxX;
389   double m_Scale;
390 };
391 
392 
393 class NCBI_SEQFEAT_EXPORT CLatLonCountryId
394 {
395 public:
396     CLatLonCountryId(float lat, float lon);
397     ~CLatLonCountryId(void);
398 
GetLat(void) const399     float GetLat(void) const { return m_Lat; }
SetLat(float lat)400     void  SetLat(float lat) { m_Lat = lat; }
GetLon(void) const401     float GetLon(void) const { return m_Lon; }
SetLon(float lon)402     void  SetLon(float lon) { m_Lon = lon; }
GetFullGuess(void) const403     string GetFullGuess(void) const { return m_FullGuess; }
SetFullGuess(string guess)404     void  SetFullGuess(string guess) { m_FullGuess = guess; }
GetGuessCountry(void) const405     string GetGuessCountry(void) const { return m_GuessCountry; }
SetGuessCountry(string guess)406     void  SetGuessCountry(string guess) { m_GuessCountry = guess; }
GetGuessProvince(void) const407     string GetGuessProvince(void) const { return m_GuessProvince; }
SetGuessProvince(string guess)408     void  SetGuessProvince(string guess) { m_GuessProvince = guess; }
GetGuessWater(void) const409     string GetGuessWater(void) const { return m_GuessWater; }
SetGuessWater(string guess)410     void  SetGuessWater(string guess) { m_GuessWater = guess; }
GetClosestFull(void) const411     string GetClosestFull(void) const { return m_ClosestFull; }
SetClosestFull(string closest)412     void  SetClosestFull(string closest) { m_ClosestFull = closest; }
GetClosestCountry(void) const413     string GetClosestCountry(void) const { return m_ClosestCountry; }
SetClosestCountry(string closest)414     void  SetClosestCountry(string closest) { m_ClosestCountry = closest; }
GetClosestProvince(void) const415     string GetClosestProvince(void) const { return m_ClosestProvince; }
SetClosestProvince(string closest)416     void  SetClosestProvince(string closest) { m_ClosestProvince = closest; }
GetClosestWater(void) const417     string GetClosestWater(void) const { return m_ClosestWater; }
SetClosestWater(string closest)418     void  SetClosestWater(string closest) { m_ClosestWater = closest; }
GetClaimedFull(void) const419     string GetClaimedFull(void) const { return m_ClaimedFull; }
SetClaimedFull(string claimed)420     void  SetClaimedFull(string claimed) { m_ClaimedFull = claimed; }
421 
GetLandDistance(void) const422     int GetLandDistance(void) const { return m_LandDistance; }
SetLandDistance(int dist)423     void SetLandDistance (int dist) { m_LandDistance = dist; }
GetWaterDistance(void) const424     int GetWaterDistance(void) const { return m_WaterDistance; }
SetWaterDistance(int dist)425     void SetWaterDistance (int dist) { m_WaterDistance = dist; }
GetClaimedDistance(void) const426     int GetClaimedDistance(void) const { return m_ClaimedDistance; }
SetClaimedDistance(int dist)427     void SetClaimedDistance (int dist) { m_ClaimedDistance = dist; }
428 
429 
430     enum EClassificationFlags {
431         fCountryMatch    = (1),
432         fProvinceMatch   = (1 << 1),
433         fWaterMatch      = (1 << 2),
434         fOverlap         = (1 << 3),
435         fCountryClosest  = (1 << 4),
436         fProvinceClosest = (1 << 5),
437         fWaterClosest    = (1 << 6)
438     };
439     typedef int TClassificationFlags;    ///< Bitwise OR of "EClassificationFlags"
440 
441     CLatLonCountryId::TClassificationFlags Classify(string country, string province);
442 
443 
444 private:
445   float  m_Lat;
446   float  m_Lon;
447   string m_FullGuess;
448   string m_GuessCountry;
449   string m_GuessProvince;
450   string m_GuessWater;
451   string m_ClosestFull;
452   string m_ClosestCountry;
453   string m_ClosestProvince;
454   string m_ClosestWater;
455   string m_ClaimedFull;
456   int    m_LandDistance;
457   int    m_WaterDistance;
458   int    m_ClaimedDistance;
459 };
460 
461 class NCBI_SEQFEAT_EXPORT CLatLonCountryMap
462 {
463 public:
464     CLatLonCountryMap(bool is_water);
465     ~CLatLonCountryMap(void);
466     bool IsCountryInLatLon(const string& country, double lat, double lon);
467     const CCountryExtreme * GuessRegionForLatLon(double lat, double lon,
468                                             const string& country = kEmptyStr,
469                                             const string& province = kEmptyStr);
470     const CCountryExtreme * FindClosestToLatLon(double lat, double lon,
471                                                 double range, double& distance);
472     bool IsClosestToLatLon(const string& country, double lat, double lon,
473                            double range, double& distance);
474     bool HaveLatLonForRegion(const string& country);
475     bool DoCountryBoxesOverlap(const string& country1, const string& country2);
476     const CCountryExtreme * IsNearLatLon(double lat, double lon, double range,
477                                          double& distance,
478                                          const string& country,
479                                          const string& province = kEmptyStr);
GetScale(void)480     double GetScale (void) { return m_Scale; }
481     static int AdjustAndRoundDistance (double distance, double scale);
482     int AdjustAndRoundDistance (double distance);
483 
484     enum ELatLonAdjustFlags {
485       fNone      = 0 ,
486       fFlip      = 1 ,
487       fNegateLat = (1 << 1),
488       fNegateLon = (1 << 2),
489     };
490     typedef int TLatLonAdjustFlags;    ///< Bitwise OR of "ELatLonAdjustFlags"
491 
492 
493 private:
494     void x_InitFromDefaultList(const char * const *list, int num);
495     bool x_InitFromFile(const string& filename);
496     static bool s_CompareTwoLinesByLatLonOnly(const CCountryLine* line1,
497                                     const CCountryLine* line2);
498     static bool s_CompareTwoLinesByCountry(const CCountryLine* line1,
499                                     const CCountryLine* line2);
500     static bool s_CompareTwoLinesByLatLonThenCountry(const CCountryLine* line1,
501                                     const CCountryLine* line2);
502 
503     size_t x_GetLatStartIndex (int y);
504     const CCountryExtreme * x_FindCountryExtreme (const string& country);
505 
506 
507     typedef vector <CCountryLine *> TCountryLineList;
508     typedef TCountryLineList::const_iterator TCountryLineList_iter;
509 
510     TCountryLineList m_CountryLineList;
511     TCountryLineList m_LatLonSortedList;
512     double m_Scale;
513 
514     typedef vector <CCountryExtreme *> TCountryExtremeList;
515     typedef TCountryExtremeList::const_iterator TCountryExtremeList_iter;
516     TCountryExtremeList m_CountryExtremes;
517 
518 
519       static const string sm_BodiesOfWater[];
520 
521 
522 
523 };
524 
525 NCBI_SEQFEAT_EXPORT double ErrorDistance (
526   double latA,
527   double lonA,
528   double scale);
529 
530 
531 
532 END_objects_SCOPE // namespace ncbi::objects::
533 
534 END_NCBI_SCOPE
535 
536 #endif // OBJECTS_SEQFEAT_SUBSOURCE_HPP
537 /* Original file checksum: lines: 94, chars: 2578, CRC32: 1c534244 */
538