1 /*
2 * CifParser.cpp
3 *
4 * Copyright Notice: see copyright.txt
5 *
6 * Date: 3/8/2012 (D/M/Y)
7 * Author: Dmitrij Lioubartsev
8 * Email: dmitrijl42@gmail.com
9 *
10 * For class info, see CifParser.h
11 */
12
13 #include "CifParser.h"
14 #include <cstdlib>
15 #include <cstring>
16 #include <fstream>
17 #include <vector>
18
CifParser(CifCrystal * structure)19 CifParser::CifParser(CifCrystal* structure) {
20 crystal = structure;
21 line = 1;
22 }
23
~CifParser()24 CifParser::~CifParser() {
25 // do nothing
26 }
27
28 /*
29 * Tries to open the file with the specified filename.
30 * If file doesn't exist, returns false.
31 */
opencif(const char * fileName)32 bool CifParser::opencif(const char * fileName) {
33 cifFile.open(fileName);
34 if(cifFile)
35 return true;
36 else
37 return false;
38 }
39
40 /*
41 * Reads the file opened by opencif, and puts all
42 * information into the CifCrystal pointed by the
43 * specified variable crystal.
44 */
readcif()45 void CifParser::readcif() {
46
47 bool saveFrame = false; //to indicate if in a save frame
48 //only used for some warning messages, since this program doesn't
49 // change the contains of the file.
50
51 bool dataBlock = false; //to indicate if inside a data block
52 //if data found outside of data block, it is still read, but
53 //a warning message is printed.
54
55 bool fulhax = false; //this is to fix a weird bug with not
56 //recognizing eof for some reason.
57
58 while(!cifFile.eof()) { //main parsing loop
59
60 if(fulhax)
61 break;
62
63 //skip all whitespaces
64 //Whitespace characters are ' ', '\t', '\n' and comments (#)
65 if(skipWhiteSpace())
66 break; //found eof, stop
67
68 string token("");
69
70 if(getWord(token)) //get next token (word)
71 break;
72
73 /*
74 * Now, token can be one of these things:
75 * 1. data block header, in that case it starts with data_
76 * -> set dataBlock = true
77 * 2. Save Frame header, begins with "save_"
78 * -> do nothing (except warning messages)
79 * 3. Save frame end, is "save_"
80 * -> do nothing (except warning messages)
81 * 4. Data tag, begins with '_'
82 * -> fetch value
83 * 5. Loop, begins with "loop_"
84 * -> fetch loop data values
85 * 6. Something else, in which case there is an error
86 */
87
88 if(beginsWith(token.c_str(), "data_")) { //case 1, datablock
89 dataBlock = true;
90 } else if(beginsWith(token.c_str(),"save_")) { //cases 2-3, save frame
91
92 if(!dataBlock) {
93 cout << "(line " << line << ") WARNING: A Save Frame "
94 << "token is found outside a datablock. It is "
95 << "still read, but its advised that the file "
96 << "gets repaired." << endl;
97 }
98
99 //For the program, the following code isn't important, but
100 //it will give the user a notification if incorrect usage
101 //of save frames is found. It does not have any impact on
102 //the functioning of this program.
103
104 if(!saveFrame && token.length() > 5) { //save frame header
105 saveFrame = true;
106 } else if(!saveFrame && token.length() == 5) { //save frame end without it being initialized
107 cout << "(line " << line << ") WARNING: Save Frame header has no name."
108 " Ignoring." << endl;
109 saveFrame = true;
110 } else if(saveFrame && token.length() > 5) { //found save frame inside a save frame
111 cout << "(line " << line << ") WARNING: Save Frames inside Save Frames"
112 " are not allowed. Ignoring." << endl;
113 } else if(saveFrame && token.length() == 5 ) { //save frame end.
114 saveFrame = false;
115 }
116
117 } else if((token[0] == '_') && token.length() > 1) { // case 4, data item
118
119 if(!dataBlock) {
120 cout << "(line " << line << ") WARNING: A Data Item "
121 << "token is found outside a datablock. It is "
122 << "still read, but its advised that the file "
123 << "gets repaired." << endl;
124 }
125
126
127 if(skipWhiteSpace()) { //skip the whitespace after the tag
128 cout << "(line " << line << ") WARNING: Data item { "
129 << token << " } does not have an assigned data "
130 << "value. Ignoring item." << endl;
131 break;
132 }
133
134 char ch;
135 ch = cifFile.peek();
136 if(ch == '_' || ch == '$' || ch == '|') {
137 cout << "(line " << line << ") WARNING: Data item { "
138 << token << " } does not have an assigned data "
139 << "value, or the value starts with a forbidden "
140 << "starting character ('_','$','|'). Ignoring "
141 << "data tag." << endl;
142
143 } else {
144 string value("");
145 getValue(value); //get value
146 ifReservedReset(value); //check if its a reserved word
147 setDataItem(token, value); //set data item
148 }
149
150 } else if(beginsWith(token.c_str(),"loop_")) { //case 5, loop
151 if(!dataBlock) {
152 cout << "(line " << line << ") WARNING: A Loop "
153 << "token is found outside a datablock. It is "
154 << "still read, but its advised that the file "
155 << "gets repaired." << endl;
156 }
157
158 if(skipWhiteSpace()) {
159 cout << "(line " << line << ") WARNING: A Loop "
160 << "token is found but nothing after it."<< endl;
161 break;
162 }
163
164 if(!(cifFile.peek() == '_')) {
165 cout << "(line " << line << ") WARNING: A Loop "
166 << "token is found but no data tags found "<< endl;
167 continue;
168 }
169
170 //parse loop data tags
171
172 vector<string> tokens(0); //vector that holds data tags.
173 while(cifFile.peek() == '_') { //while there are tags
174 string token = "";
175
176 if(getWord(token)) //get tag
177 break;
178
179 tokens.push_back(token); //add to tag vector
180
181 if(skipWhiteSpace()) //skip whitespaces.
182 break;
183 }
184 if(cifFile.eof()) {
185 cout << "(line " << line << ") WARNING: A loop is "
186 "initialized but end of file is reached before "
187 "any values were found. Ignoring everything in "
188 "the loop body." << endl;
189 break;
190 }
191
192 //Loop header parsed, all data tags are in vector "tokens"
193 unsigned long int vals = 0; //to keep track of which tag to add the value to
194 string value("");
195
196 while(!cifFile.eof()) {
197 char c = cifFile.peek();
198 if((c == '_') || (c == '$') || c == '|') {
199 break; //loop over
200 }
201
202 getValue(value);
203 if(ifReservedReset(value)) { //if value is a reserved word
204 break;
205 } else {
206 setDataItem(tokens[vals % tokens.size()], value); //set data item
207 vals++; //increment vals
208 if(skipWhiteSpace()) {//and skip whitespaces to next value
209 fulhax = true;
210 break;
211 }
212 }
213
214 }
215
216 if((vals % tokens.size()) != 0 ) {
217 cout << "(line " << line << ") WARNING: a loop was "
218 "terminated but amount of values does not "
219 "go even, and thus not enough values was "
220 "read, which may or may not cause problems." << endl;
221 }
222
223 } else if(beginsWith(token.c_str(),"global_")){ // part of case 6, since global_ is forbidden
224 cout << "(line " << line << ") WARNING: usage of the \"global_\" keyword "
225 << "is forbidden by .cif standards. Ignoring." << endl;
226 } else { //case 6, file syntax error
227 cout << "(line " << line << ") WARNING: found unknown token { "
228 << token << " }. Ignoring." << endl;
229 }
230 }
231 return;
232 }
233
234 /*
235 * Checks if c is a whitespace char ('\t', ' '),
236 * but not end of line.
237 */
isWhiteSpace(char c)238 bool CifParser::isWhiteSpace(char c) {
239 return (c == ' ') || (c == '\t'); //11 is ASCII for tab '\t', 12 is new line '\n'
240 }
241
242 /*
243 * Checks if c is a whitespace char ('\t', ' ', \n).
244 */
isWhiteSpaceNL(char c)245 bool CifParser::isWhiteSpaceNL(char c) {
246 return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); //11 is ASCII for tab '\t', 12 is new line '\n'
247 }
248
249 /*
250 * Skips through all whitespace characters and comments, and leaves
251 * the file stream at the next token. White space characters are
252 * ' ', '\t', '\n' and comments start with '#' and continue until
253 * a '\n' character is found.
254 * The integer "line" is incremented each time a '\n' is found.
255 * If eof found, returns true.
256 */
skipWhiteSpace()257 bool CifParser::skipWhiteSpace() {
258 char c = ' ';
259
260 bool eof = false;
261 while((isWhiteSpaceNL(c) || c == '#') && !cifFile.eof()) {
262 cifFile.get(c); //get next char
263 if(cifFile.eof()) {
264 eof = true;
265 break; //reached end of file, stop
266 }
267 //some special cases
268 if(c == '\n') { //new line
269 line++;
270 } else if(c == '#') { //comment
271 cifFile.ignore(CIF_MAX_LINE_LENGTH, '\n'); // ignore rest of line.
272 line++; //increment line
273 if(cifFile.eof())
274 eof = true;
275 }
276
277 }
278
279 cifFile.unget(); //put the char from get() back.
280
281 return eof;
282 }
283
284 /*
285 * Similar to the skipWhiteSpace, it extracts all characters found
286 * and puts them in word. It stops when a whitespace character is found.
287 * If eof found, returns true.
288 */
getWord(string & word)289 bool CifParser::getWord(string& word) {
290 bool eof = false;
291 char c;
292 word = "";
293 do {
294 cifFile.get(c);
295 if(cifFile.eof()) {
296 //reached end of file, stop
297 eof = true;
298 break;
299 }
300 word.append(1, c);
301 }
302 while(!isWhiteSpaceNL(c));
303
304 if(!eof) { //reset last char
305 cifFile.unget();
306 word.erase(word.length()-1,1);
307 }
308
309 return eof;
310 }
311
312 /*
313 * beginsWith() checks if "what" begins with "with".
314 * "with" should be lowercase, because the test
315 * is case-insensitive (uses toUpper() ).
316 * Example use: beginsWith(myString, "data_")
317 */
beginsWith(const char * what,const char * with)318 bool CifParser::beginsWith(const char * what, const char* with) {
319
320 if(strlen(with) > strlen(what))
321 return false;
322
323 bool correct = true; //the return variable
324 int i = 0; //byte counter
325
326 while(correct && with[i]) { // only continue tests while still equal, and
327 // end of comparing string isn't reached
328 if(!((what[i] == with[i]) || (what[i] == toupper(with[i]))))
329 correct = false;
330
331 i++;
332 }
333
334 return correct;
335 }
336
337 /*
338 * Extracts a value, as defined by cif standards,
339 * and puts it into "value".
340 * If eof found, returns true.
341 */
getValue(string & value)342 bool CifParser::getValue(string& value) {
343 bool eof = false;
344 value = "";
345
346 //try to identify the type of the value
347 char c = (char) cifFile.peek();
348
349 if(c == '\'') { //single-quote string
350 if(getQuoteValue(value, '\''))
351 eof = true;
352
353 } else if(c == '\"') { //double-quote string
354 if(getQuoteValue(value, '\"'))
355 eof = true;
356 } else if(c == ';') { //text field OR unquoted string
357 //check last char, if its '\n' then its a text field
358 cifFile.unget();
359 cifFile.get(c);
360 if(c == '\n') { //we have ourselves a text field
361 cifFile.ignore(2); //ignore the '\n'
362 //and ignore the ;
363 if(cifFile.eof()) { //if eof is after the ;
364 cout << "(line " << line << ") WARNING: End of file reached, "
365 "immediatly after a text field was initialized. Value "
366 "of last token is therefore set to \"?\" (unknown)." << endl;
367 value = "?";
368 eof = true;
369 return eof;
370 }
371
372 bool finished = false;
373 while(!finished) { //main loop of parsing text field data
374 cifFile.get(c);
375
376 if(cifFile.eof()) { //if suddenly eof
377 cout << "(line " << line << ") WARNING: End of file reached "
378 "before the text field data value was properly terminated."
379 " Value is still read until that point, but "
380 "it is advised to repair the file." << endl;
381 eof = true;
382 break;
383 }
384
385 if(c == '\n') { //if read char is end of line
386 line++; //increment line
387 char ch = (char) cifFile.peek();
388 if(ch == ';') { //and terminate if a ';' is found at start of line
389 finished = true;
390 cifFile.ignore(); //ignores the ';'
391 } else {
392 value.append(1,c); //otherwise appends the '\n'
393 }
394 } else {
395 value.append(1,c); //if not a '\n', just append
396 }
397 }
398
399 } else { //a single unquoted string
400 // this assumes that ; is included in the value
401 if(getWord(value))
402 eof = true;
403 }
404
405 } else { //unquoted string, a word continues until whitespace
406 if(getWord(value))
407 eof = true;
408
409 }
410
411 return eof;
412 }
413
414 /*
415 * This is solely a helper method used by getValue, to avoid duplicating
416 * very similar code. It extracts a single quote value or a double quote value,
417 * depending on what is specified.
418 * If eof found, returns true.
419 */
getQuoteValue(string & value,const char quote)420 bool CifParser::getQuoteValue(string& value, const char quote) {
421 cifFile.ignore(); //ignore first quote.
422 bool finished = false;
423 bool eof = false;
424 char c;
425
426 //main loop
427 while(!finished) {
428 cifFile.get(c); //get char
429 if(cifFile.eof()) { //in case eof was reached
430 if(!(c == quote)) {
431 if(!(c == '\n'))
432 value.append(1, c);
433 cout << "(line " << line << ") WARNING: Quote data value did not "
434 "terminate before end of file was reached. Ignoring." << endl;
435 }
436 eof = true;
437 break;
438 }
439
440 if(c == '\n') { //if line break is found.
441 //print an error message and increment line.
442
443 cout << "(line " << line << ") WARNING: Single and double quote "
444 "data values are not permitted to extend over "
445 "multiple lines. Value is read until the end of this "
446 "line, which might lead to bad results. "
447 "Suggesting usage of a text field instead." << endl;
448 line++;
449 break;
450 }
451
452 //normal procedure, all above is basically error handling
453 if(c == quote && isWhiteSpaceNL(cifFile.peek())) {
454 finished = true; //end of value found
455 } else {
456 value.append(1, c); //else append
457 }
458 }
459
460 return eof;
461
462 }
463
464 /*
465 * Assigns the data item with the given header and
466 * value. If the value is "unknown" ('.','?'), nothing happens.
467 */
setDataItem(string & token,string & value)468 void CifParser::setDataItem(string& token, string& value) {
469
470 if(value != "." && value != "?") {
471
472 if(token == "_cell_length_a") {
473 if(!((*crystal).seta(atof(value.c_str())))) {
474 printErrorAS("_cell_length_a");
475 }
476 /*
477 * To explain this above which looks very complicated:
478 * 1. Follow the CifCrystalPointer, to set its 'a' field.
479 * 2. The value to set it to is calculated by std::atof().
480 * 3. atof() does only take a c string, so need to convert.
481 * 4. seta() returns false if the value has already been
482 * assigned, meaning that the tag "_cell_length_a" is
483 * appearing multiple times in the file. If so, print
484 * a warning message.
485 */
486
487 } else if(token == "_cell_length_b") {
488 if(!((*crystal).setb(atof(value.c_str())))) {
489 printErrorAS("_cell_length_b");
490 }
491 } else if(token == "_cell_length_c") {
492 if(!((*crystal).setc(atof(value.c_str())))) {
493 printErrorAS("_cell_length_c");
494 }
495 } else if(token == "_cell_angle_alpha") {
496 if(!((*crystal).setalpha(atof(value.c_str())))) {
497 printErrorAS("_cell_angle_alpha");
498 }
499 } else if(token == "_cell_angle_beta") {
500 if(!((*crystal).setbeta(atof(value.c_str())))) {
501 printErrorAS("_cell_angle_beta");
502 }
503 } else if(token == "_cell_angle_gamma") {
504 if(!((*crystal).setgamma(atof(value.c_str())))) {
505 printErrorAS("_cell_angle_gamma");
506 }
507 } else if(token == "_symmetry_equiv_pos_as_xyz"
508 || token == "_space_group_symop_operation_xyz") {
509 (*crystal).addSymmetry(value); //simply add the symmetry
510
511 } else if(token == "_atom_site_type_symbol") {
512 (*crystal).setAElement(value);
513 } else if(token == "_atom_site_fract_x") {
514 (*crystal).setAx(atof(value.c_str()));
515 } else if(token == "_atom_site_fract_y") {
516 (*crystal).setAy(atof(value.c_str()));
517 } else if(token == "_atom_site_fract_z") {
518 (*crystal).setAz(atof(value.c_str()));
519 } else {
520 //do nothing
521 }
522
523 }
524
525 return;
526 }
527
528 /*
529 * This takes the string value, and checks if it
530 * is a reserved word (data_,loop_ etc.). If it is, it
531 * sets back the cifFile pointer to the beginning of
532 * the word, and sets the value to "?" (unknown).
533 */
ifReservedReset(string & value)534 bool CifParser::ifReservedReset(string& value) {
535 if(beginsWith(value.c_str(), "data_") || //not allowed to be a value.
536 beginsWith(value.c_str(), "save_") ||
537 beginsWith(value.c_str(), "loop_") ||
538 beginsWith(value.c_str(), "global_")) {
539
540 for(unsigned int i = 0; i < value.length(); i++) {
541 cifFile.unget(); //unget the amount of characters got.
542 }
543 /*
544 * This below is another way of fixig the above.
545 while(!isWhiteSpaceNL(cifFile.peek())) {
546 cifFile.unget();
547 }
548 cifFile.ignore();*/
549
550 value = "?"; //set value to unknown
551 return true;
552 } else {
553 return false;
554 }
555 }
556
557 /*
558 * This is a small helper method to indicate that the
559 * data value has already been assigned (multiple
560 * instances of a data tag found). "value" should be
561 * something like "_cell_length_a"
562 */
printErrorAS(const char * value)563 void CifParser::printErrorAS(const char * value) {
564 cout << "(line " << line << ") WARNING: data item "
565 " {" << value << " } has been found "
566 "multiple times in the file. Using the "
567 "first encountered value. " << endl;
568 }
569
570