1 /*
2  *  "GEDKeeper", the personal genealogical database editor.
3  *  Copyright (C) 2009-2020 by Sergey V. Zhdanovskih.
4  *
5  *  This file is part of "GEDKeeper".
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 using System;
22 
23 namespace GDModel.Providers.GEDCOM
24 {
25     public enum GEDCOMToken
26     {
27         Unknown,
28         Whitespace,
29         Symbol,
30         Word,
31         Number,
32         XRef,
33         EOL
34     }
35 
36     /// <summary>
37     /// GEDCOMParser tokenized string into tokens.
38     /// </summary>
39     /// <remarks>
40     /// This class has been heavily refactored under profiling. Any alterations must take into account the factor
41     /// of performance degradation when changing the approach, even in small things.
42     /// </remarks>
43     public sealed class GEDCOMParser
44     {
45         private const char EOL = (char)0;
46 
47         private GEDCOMToken fCurrentToken;
48         private char[] fData;
49         private bool fIgnoreWhitespace;
50         private int fLength;
51         private int fPos;
52         private int fSavePos;
53         private int fTokenEnd;
54 
55         private int fIntValue;
56         private string fStrValue;
57         private bool fValueReset;
58 
59 
60         public GEDCOMToken CurrentToken
61         {
62             get { return fCurrentToken; }
63         }
64 
65         public char[] Data
66         {
67             get { return fData; }
68         }
69 
70         public int Length
71         {
72             get { return fLength; }
73         }
74 
75         public int Position
76         {
77             get { return fPos; }
78         }
79 
80 
GEDCOMParser(bool ignoreWhitespace)81         public GEDCOMParser(bool ignoreWhitespace)
82         {
83             fIgnoreWhitespace = ignoreWhitespace;
84         }
85 
GEDCOMParser(string data, bool ignoreWhitespace)86         public GEDCOMParser(string data, bool ignoreWhitespace)
87         {
88             if (data == null)
89                 throw new ArgumentNullException("data");
90 
91             fIgnoreWhitespace = ignoreWhitespace;
92 
93             Reset(data.ToCharArray(), 0, data.Length);
94         }
95 
GEDCOMParser(char[] data, int startIndex, int length, bool ignoreWhitespace)96         public GEDCOMParser(char[] data, int startIndex, int length, bool ignoreWhitespace)
97         {
98             if (data == null)
99                 throw new ArgumentNullException("data");
100 
101             fIgnoreWhitespace = ignoreWhitespace;
102 
103             Reset(data, startIndex, length);
104         }
105 
Reset(char[] data, int startIndex, int length)106         public void Reset(char[] data, int startIndex, int length)
107         {
108             fData = data;
109             fLength = length;
110 
111             fCurrentToken = GEDCOMToken.Unknown;
112             fPos = startIndex;
113             fValueReset = false;
114         }
115 
Next()116         public GEDCOMToken Next()
117         {
118             while (true) {
119                 char ch = (fPos >= fLength) ? EOL : fData[fPos];
120                 char ltr = (char)(ch | ' ');
121 
122                 if ((ltr >= 'a' && ltr <= 'z') || ch == '_') {
123                     fSavePos = fPos;
124                     fPos++;
125                     while (true) {
126                         ch = (fPos >= fLength) ? EOL : fData[fPos];
127                         ltr = (char)(ch | ' ');
128                         if ((ltr >= 'a' && ltr <= 'z') || (ch >= '0' && ch <= '9') || ch == '_') {
129                             fPos++;
130                         } else
131                             break;
132                     }
133 
134                     fTokenEnd = fPos;
135                     fValueReset = true;
136                     fCurrentToken = GEDCOMToken.Word;
137                     return fCurrentToken;
138                 }
139 
140                 if (ch >= '0' && ch <= '9') {
141                     fSavePos = fPos;
142                     fPos++;
143                     fIntValue = ((int)ch - 48);
144                     while (true) {
145                         ch = (fPos >= fLength) ? EOL : fData[fPos];
146                         if (ch >= '0' && ch <= '9') {
147                             fPos++;
148                             fIntValue = (fIntValue * 10 + ((int)ch - 48));
149                         } else
150                             break;
151                     }
152 
153                     fTokenEnd = fPos;
154                     fValueReset = true;
155                     fCurrentToken = GEDCOMToken.Number;
156                     return fCurrentToken;
157                 }
158 
159                 if (ch == ' ' || ch == '\t') {
160                     if (fIgnoreWhitespace) {
161                         fPos++;
162                         continue;
163                     }
164 
165                     fSavePos = fPos;
166                     fPos++;
167                     while (true) {
168                         ch = (fPos >= fLength) ? EOL : fData[fPos];
169                         if (ch == ' ' || ch == '\t')
170                             fPos++;
171                         else
172                             break;
173                     }
174 
175                     fTokenEnd = fPos;
176                     fValueReset = true;
177                     fCurrentToken = GEDCOMToken.Whitespace;
178                     return fCurrentToken;
179                 }
180 
181                 if (ch == '@') {
182                     fSavePos = ++fPos;
183                     while (true) {
184                         ch = (fPos >= fLength) ? EOL : fData[fPos];
185                         fPos++;
186                         if (ch == '@') {
187                             fTokenEnd = fPos - 1;
188                             break;
189                         }
190                     }
191 
192                     fValueReset = true;
193                     fCurrentToken = GEDCOMToken.XRef;
194                     return fCurrentToken;
195                 }
196 
197                 if (ch == EOL) {
198                     fValueReset = true;
199                     fCurrentToken = GEDCOMToken.EOL;
200                     return fCurrentToken;
201                 } else {
202                     fSavePos = fPos;
203                     fPos++;
204 
205                     fTokenEnd = fPos;
206                     fValueReset = true;
207                     fCurrentToken = GEDCOMToken.Symbol;
208                     return fCurrentToken;
209                 }
210             }
211         }
212 
SkipWhitespaces()213         public void SkipWhitespaces()
214         {
215             if (fCurrentToken == GEDCOMToken.Unknown) {
216                 Next();
217             }
218 
219             while (fCurrentToken == GEDCOMToken.Whitespace) {
220                 Next();
221             }
222         }
223 
GetWord()224         public string GetWord()
225         {
226             if (fValueReset) {
227                 fStrValue = new string(fData, fSavePos, fTokenEnd - fSavePos);
228                 fValueReset = false;
229             }
230             return fStrValue;
231         }
232 
GetNumber()233         public int GetNumber()
234         {
235             return fIntValue;
236         }
237 
GetSymbol()238         public char GetSymbol()
239         {
240             return fData[fSavePos];
241         }
242 
GetRest()243         public string GetRest()
244         {
245             return (fPos >= fLength) ? string.Empty : new string(fData, fPos, fLength - fPos);
246         }
247 
GetFullStr()248         public string GetFullStr()
249         {
250             return new string(fData, 0, fLength);
251         }
252 
RequireToken(GEDCOMToken tokenKind)253         public bool RequireToken(GEDCOMToken tokenKind)
254         {
255             return (fCurrentToken == tokenKind);
256         }
257 
RequireWord(string token)258         public bool RequireWord(string token)
259         {
260             return (fCurrentToken == GEDCOMToken.Word && GetWord() == token);
261         }
262 
RequireSymbol(char symbol)263         public bool RequireSymbol(char symbol)
264         {
265             return (fCurrentToken == GEDCOMToken.Symbol && GetSymbol() == symbol);
266         }
267 
RequestSymbol(char symbol)268         public void RequestSymbol(char symbol)
269         {
270             if (fCurrentToken != GEDCOMToken.Symbol || GetSymbol() != symbol) {
271                 throw new GEDCOMParserException("Required symbol not found");
272             }
273         }
274 
RequestNextSymbol(char symbol)275         public void RequestNextSymbol(char symbol)
276         {
277             var token = Next();
278             if (token != GEDCOMToken.Symbol || GetSymbol() != symbol) {
279                 throw new GEDCOMParserException("Required symbol not found");
280             }
281         }
282 
RequestInt()283         public int RequestInt()
284         {
285             if (fCurrentToken != GEDCOMToken.Number) {
286                 throw new GEDCOMParserException("Required integer not found");
287             }
288             return GetNumber();
289         }
290 
RequestNextInt()291         public int RequestNextInt()
292         {
293             var token = Next();
294             if (token != GEDCOMToken.Number) {
295                 throw new GEDCOMParserException("Required integer not found");
296             }
297             return GetNumber();
298         }
299 
TokenLength()300         public int TokenLength()
301         {
302             return fTokenEnd - fSavePos;
303         }
304     }
305 }
306