/*
 *  "GEDKeeper", the personal genealogical database editor.
 *  Copyright (C) 2009-2020 by Sergey V. Zhdanovskih.
 *
 *  This file is part of "GEDKeeper".
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

using System;

namespace GDModel.Providers.GEDCOM
{
    /// <summary>
    /// Kinds of tokens produced by <see cref="GEDCOMParser.Next"/>.
    /// </summary>
    public enum GEDCOMToken
    {
        /// <summary>Initial state: no token has been read since the last reset.</summary>
        Unknown,
        /// <summary>A run of spaces and/or tabs (only reported when whitespace is not ignored).</summary>
        Whitespace,
        /// <summary>Any single character that does not start another kind of token.</summary>
        Symbol,
        /// <summary>An ASCII letter or '_' followed by ASCII letters, digits and '_'.</summary>
        Word,
        /// <summary>A run of ASCII decimal digits.</summary>
        Number,
        /// <summary>Text enclosed between two '@' characters (the delimiters are not part of the token).</summary>
        XRef,
        /// <summary>End of data.</summary>
        EOL
    }

    /// <summary>
    /// GEDCOMParser splits a character buffer into tokens.
    /// </summary>
    /// <remarks>
    /// This class has been heavily refactored under profiling. Any alterations must take into account the factor
    /// of performance degradation when changing the approach, even in small things.
    /// </remarks>
    public sealed class GEDCOMParser
    {
        // Sentinel returned in place of a character once the read position reaches the end
        // of data. A NUL character inside the data is therefore indistinguishable from the
        // end of data at token boundaries.
        private const char EOL = (char)0;

        private GEDCOMToken fCurrentToken;
        private char[] fData;
        private readonly bool fIgnoreWhitespace;
        private int fLength;    // exclusive end index of scanning within fData
        private int fPos;       // index of the next character to read
        private int fSavePos;   // index of the first character of the current token
        private int fTokenEnd;  // index just past the last character of the current token

        private int fIntValue;
        private string fStrValue;
        private bool fValueReset; // true when fStrValue must be rebuilt from the token span


        /// <summary>
        /// Kind of the token most recently returned by <see cref="Next"/>.
        /// </summary>
        public GEDCOMToken CurrentToken
        {
            get { return fCurrentToken; }
        }

        /// <summary>
        /// The buffer being tokenized (the same array instance that was passed in, not a copy).
        /// </summary>
        public char[] Data
        {
            get { return fData; }
        }

        /// <summary>
        /// The end-of-data index as passed to <see cref="Reset"/>.
        /// </summary>
        public int Length
        {
            get { return fLength; }
        }

        /// <summary>
        /// Index of the next character that will be read.
        /// </summary>
        public int Position
        {
            get { return fPos; }
        }

        /// <summary>
        /// Creates a parser without data; call <see cref="Reset"/> before tokenizing.
        /// </summary>
        /// <param name="ignoreWhitespace">If true, spaces and tabs are skipped instead of being reported as tokens.</param>
        public GEDCOMParser(bool ignoreWhitespace)
        {
            fIgnoreWhitespace = ignoreWhitespace;
        }

        /// <summary>
        /// Creates a parser over the characters of a string.
        /// </summary>
        /// <param name="data">Text to tokenize.</param>
        /// <param name="ignoreWhitespace">If true, spaces and tabs are skipped instead of being reported as tokens.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public GEDCOMParser(string data, bool ignoreWhitespace)
        {
            if (data == null)
                throw new ArgumentNullException("data");

            fIgnoreWhitespace = ignoreWhitespace;

            Reset(data.ToCharArray(), 0, data.Length);
        }

        /// <summary>
        /// Creates a parser over an existing character buffer.
        /// </summary>
        /// <param name="data">Buffer to tokenize; it is used directly, not copied.</param>
        /// <param name="startIndex">Index of the first character to read.</param>
        /// <param name="length">End-of-data index; see <see cref="Reset"/>.</param>
        /// <param name="ignoreWhitespace">If true, spaces and tabs are skipped instead of being reported as tokens.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public GEDCOMParser(char[] data, int startIndex, int length, bool ignoreWhitespace)
        {
            if (data == null)
                throw new ArgumentNullException("data");

            fIgnoreWhitespace = ignoreWhitespace;

            Reset(data, startIndex, length);
        }

        /// <summary>
        /// Points the parser at a new buffer and rewinds the token state.
        /// </summary>
        /// <param name="data">Buffer to tokenize; it is used directly, not copied.</param>
        /// <param name="startIndex">Index of the first character to read.</param>
        /// <param name="length">
        /// Stored as-is and used as the exclusive end index of scanning. When
        /// <paramref name="startIndex"/> is non-zero this value therefore acts as an end
        /// position in <paramref name="data"/>, not as a count relative to the start.
        /// </param>
        /// <remarks>
        /// No argument validation is performed here. The cached token text and number are
        /// not cleared; they are only meaningful after the next call to <see cref="Next"/>.
        /// </remarks>
        public void Reset(char[] data, int startIndex, int length)
        {
            fData = data;
            fLength = length;

            fCurrentToken = GEDCOMToken.Unknown;
            fPos = startIndex;
            fValueReset = false;
        }

        /// <summary>
        /// Reads the next token and makes it the current one.
        /// </summary>
        /// <returns>The kind of the token that was read; <see cref="GEDCOMToken.EOL"/> at end of data.</returns>
        /// <remarks>
        /// An '@' that has no closing '@' before the end of data is reported as a
        /// <see cref="GEDCOMToken.Symbol"/>, and tokenizing continues with the character after it.
        /// Number tokens are accumulated in a 32-bit integer without overflow checking.
        /// </remarks>
        public GEDCOMToken Next()
        {
            while (true) {
                char ch = (fPos >= fLength) ? EOL : fData[fPos];
                // OR-ing with ' ' (0x20) folds ASCII upper-case letters onto lower-case ones,
                // so a single range test covers both cases.
                char ltr = (char)(ch | ' ');

                if ((ltr >= 'a' && ltr <= 'z') || ch == '_') {
                    fSavePos = fPos;
                    fPos++;
                    while (true) {
                        ch = (fPos >= fLength) ? EOL : fData[fPos];
                        ltr = (char)(ch | ' ');
                        if ((ltr >= 'a' && ltr <= 'z') || (ch >= '0' && ch <= '9') || ch == '_') {
                            fPos++;
                        } else {
                            break;
                        }
                    }

                    fTokenEnd = fPos;
                    fValueReset = true;
                    fCurrentToken = GEDCOMToken.Word;
                    return fCurrentToken;
                }

                if (ch >= '0' && ch <= '9') {
                    fSavePos = fPos;
                    fPos++;
                    fIntValue = ((int)ch - 48); // 48 == '0'
                    while (true) {
                        ch = (fPos >= fLength) ? EOL : fData[fPos];
                        if (ch >= '0' && ch <= '9') {
                            fPos++;
                            fIntValue = (fIntValue * 10 + ((int)ch - 48));
                        } else {
                            break;
                        }
                    }

                    fTokenEnd = fPos;
                    fValueReset = true;
                    fCurrentToken = GEDCOMToken.Number;
                    return fCurrentToken;
                }

                if (ch == ' ' || ch == '\t') {
                    if (fIgnoreWhitespace) {
                        fPos++;
                        continue;
                    }

                    fSavePos = fPos;
                    fPos++;
                    while (true) {
                        ch = (fPos >= fLength) ? EOL : fData[fPos];
                        if (ch == ' ' || ch == '\t') {
                            fPos++;
                        } else {
                            break;
                        }
                    }

                    fTokenEnd = fPos;
                    fValueReset = true;
                    fCurrentToken = GEDCOMToken.Whitespace;
                    return fCurrentToken;
                }

                if (ch == '@') {
                    // Look for the closing '@' without running past the end of data.
                    // The scan is bounded by fLength rather than by the EOL sentinel so
                    // that the search always terminates on an unterminated reference.
                    int xrefStart = fPos + 1;
                    int xrefEnd = xrefStart;
                    while (xrefEnd < fLength && fData[xrefEnd] != '@') {
                        xrefEnd++;
                    }

                    if (xrefEnd < fLength) {
                        fSavePos = xrefStart;
                        fTokenEnd = xrefEnd;
                        fPos = xrefEnd + 1; // step over the closing '@'

                        fValueReset = true;
                        fCurrentToken = GEDCOMToken.XRef;
                        return fCurrentToken;
                    }

                    // No closing '@': fall through so the lone '@' is returned as a Symbol.
                }

                if (ch == EOL) {
                    // The position is not advanced, so repeated calls keep returning EOL.
                    fValueReset = true;
                    fCurrentToken = GEDCOMToken.EOL;
                    return fCurrentToken;
                } else {
                    fSavePos = fPos;
                    fPos++;

                    fTokenEnd = fPos;
                    fValueReset = true;
                    fCurrentToken = GEDCOMToken.Symbol;
                    return fCurrentToken;
                }
            }
        }

        /// <summary>
        /// Advances past whitespace tokens, reading the first token if none has been read yet.
        /// On return the current token is not <see cref="GEDCOMToken.Whitespace"/>.
        /// </summary>
        public void SkipWhitespaces()
        {
            if (fCurrentToken == GEDCOMToken.Unknown) {
                Next();
            }

            while (fCurrentToken == GEDCOMToken.Whitespace) {
                Next();
            }
        }

        /// <summary>
        /// Returns the text of the current token. The string is built on first request
        /// after each <see cref="Next"/> and cached for subsequent calls.
        /// </summary>
        public string GetWord()
        {
            if (fValueReset) {
                fStrValue = new string(fData, fSavePos, fTokenEnd - fSavePos);
                fValueReset = false;
            }
            return fStrValue;
        }

        /// <summary>
        /// Returns the value of the most recently read <see cref="GEDCOMToken.Number"/> token.
        /// </summary>
        public int GetNumber()
        {
            return fIntValue;
        }

        /// <summary>
        /// Returns the first character of the current token.
        /// </summary>
        public char GetSymbol()
        {
            return fData[fSavePos];
        }

        /// <summary>
        /// Returns the unread remainder of the data (from the current position to the end),
        /// or an empty string if nothing is left.
        /// </summary>
        public string GetRest()
        {
            return (fPos >= fLength) ? string.Empty : new string(fData, fPos, fLength - fPos);
        }

        /// <summary>
        /// Returns the buffer contents from index 0 up to <see cref="Length"/> as a string.
        /// </summary>
        public string GetFullStr()
        {
            return new string(fData, 0, fLength);
        }

        /// <summary>
        /// Checks whether the current token is of the given kind.
        /// </summary>
        public bool RequireToken(GEDCOMToken tokenKind)
        {
            return (fCurrentToken == tokenKind);
        }

        /// <summary>
        /// Checks whether the current token is a word equal to <paramref name="token"/> (case-sensitive).
        /// </summary>
        public bool RequireWord(string token)
        {
            return (fCurrentToken == GEDCOMToken.Word && GetWord() == token);
        }

        /// <summary>
        /// Checks whether the current token is the given symbol.
        /// </summary>
        public bool RequireSymbol(char symbol)
        {
            return (fCurrentToken == GEDCOMToken.Symbol && GetSymbol() == symbol);
        }

        /// <summary>
        /// Verifies that the current token is the given symbol.
        /// </summary>
        /// <exception cref="GEDCOMParserException">The current token is not that symbol.</exception>
        public void RequestSymbol(char symbol)
        {
            if (fCurrentToken != GEDCOMToken.Symbol || GetSymbol() != symbol) {
                throw new GEDCOMParserException("Required symbol not found");
            }
        }

        /// <summary>
        /// Reads the next token and verifies that it is the given symbol.
        /// </summary>
        /// <exception cref="GEDCOMParserException">The next token is not that symbol.</exception>
        public void RequestNextSymbol(char symbol)
        {
            var token = Next();
            if (token != GEDCOMToken.Symbol || GetSymbol() != symbol) {
                throw new GEDCOMParserException("Required symbol not found");
            }
        }

        /// <summary>
        /// Returns the value of the current token, which must be a number.
        /// </summary>
        /// <exception cref="GEDCOMParserException">The current token is not a number.</exception>
        public int RequestInt()
        {
            if (fCurrentToken != GEDCOMToken.Number) {
                throw new GEDCOMParserException("Required integer not found");
            }
            return GetNumber();
        }

        /// <summary>
        /// Reads the next token, which must be a number, and returns its value.
        /// </summary>
        /// <exception cref="GEDCOMParserException">The next token is not a number.</exception>
        public int RequestNextInt()
        {
            var token = Next();
            if (token != GEDCOMToken.Number) {
                throw new GEDCOMParserException("Required integer not found");
            }
            return GetNumber();
        }

        /// <summary>
        /// Returns the number of characters in the current token.
        /// </summary>
        public int TokenLength()
        {
            return fTokenEnd - fSavePos;
        }
    }
}