1 /*
2  * CifParser.cpp
3  *
4  * Copyright Notice: see copyright.txt
5  *
6  * Date: 3/8/2012 (D/M/Y)
7  * Author: Dmitrij Lioubartsev
8  * Email: dmitrijl42@gmail.com
9  *
10  * For class info, see CifParser.h
11  */
12 
13 #include "CifParser.h"
14 #include <cstdlib>
15 #include <cstring>
16 #include <fstream>
17 #include <vector>
18 
CifParser(CifCrystal * structure)19 CifParser::CifParser(CifCrystal* structure) {
20 	crystal = structure;
21 	line = 1;
22 }
23 
~CifParser()24 CifParser::~CifParser() {
25 	// do nothing
26 }
27 
28 /*
29  * Tries to open the file with the specified filename.
30  * If file doesn't exist, returns false.
31  */
opencif(const char * fileName)32 bool CifParser::opencif(const char * fileName) {
33 	cifFile.open(fileName);
34 	if(cifFile)
35 		return true;
36 	else
37 		return false;
38 }
39 
40 /*
41  * Reads the file opened by opencif, and puts all
42  * information into the CifCrystal pointed by the
43  * specified variable crystal.
44  */
readcif()45 void CifParser::readcif() {
46 
47 	bool saveFrame = false; //to indicate if in a save frame
48 	//only used for some warning messages, since this program doesn't
49 	// change the contains of the file.
50 
51 	bool dataBlock = false; //to indicate if inside a data block
52 	//if data found outside of data block, it is still read, but
53 	//a warning message is printed.
54 
55 	bool fulhax = false; //this is to fix a weird bug with not
56 						//recognizing eof for some reason.
57 
58 	while(!cifFile.eof()) { //main parsing loop
59 
60 		if(fulhax)
61 			break;
62 
63 		//skip all whitespaces
64 		//Whitespace characters are ' ', '\t', '\n' and comments (#)
65 		if(skipWhiteSpace())
66 			break; //found eof, stop
67 
68 		string token("");
69 
70 		if(getWord(token)) //get next token (word)
71 			break;
72 
73 		/*
74 		 * Now, token can be one of these things:
75 		 * 1. data block header, in that case it starts with data_
76 		 * 			-> set dataBlock = true
77 		 * 2. Save Frame header, begins with "save_"
78 		 * 			-> do nothing (except warning messages)
79 		 * 3. Save frame end, is "save_"
80 		 * 			-> do nothing (except warning messages)
81 		 * 4. Data tag, begins with '_'
82 		 * 			-> fetch value
83 		 * 5. Loop, begins with "loop_"
84 		 * 			-> fetch loop data values
85 		 * 6. Something else, in which case there is an error
86 		 */
87 
88 		if(beginsWith(token.c_str(), "data_")) { //case 1, datablock
89 			dataBlock = true;
90 		} else if(beginsWith(token.c_str(),"save_")) { //cases 2-3, save frame
91 
92 			if(!dataBlock) {
93 				cout << "(line " << line << ") WARNING: A Save Frame "
94 						<< "token is found outside a datablock. It is "
95 						<< "still read, but its advised that the file "
96 						<< "gets repaired." << endl;
97 			}
98 
99 			//For the program, the following code isn't important, but
100 			//it will give the user a notification if incorrect usage
101 			//of save frames is found. It does not have any impact on
102 			//the functioning of this program.
103 
104 			if(!saveFrame && token.length() > 5) { //save frame header
105 				saveFrame = true;
106 			} else if(!saveFrame && token.length() == 5) { //save frame end without it being initialized
107 				cout << "(line " << line << ") WARNING: Save Frame header has no name."
108 						" Ignoring." << endl;
109 				saveFrame = true;
110 			} else if(saveFrame && token.length() > 5) { //found save frame inside a save frame
111 				cout << "(line " << line << ") WARNING: Save Frames inside Save Frames"
112 						" are not allowed. Ignoring." << endl;
113 			} else if(saveFrame && token.length() == 5 ) { //save frame end.
114 				saveFrame = false;
115 			}
116 
117 		} else if((token[0] == '_') && token.length() > 1) { // case 4, data item
118 
119 			if(!dataBlock) {
120 				cout << "(line " << line << ") WARNING: A Data Item "
121 						<< "token is found outside a datablock. It is "
122 						<< "still read, but its advised that the file "
123 						<< "gets repaired." << endl;
124 			}
125 
126 
127 			if(skipWhiteSpace()) { //skip the whitespace after the tag
128 				cout << "(line " << line << ") WARNING: Data item { "
129 						<< token << " } does not have an assigned data "
130 						<< "value. Ignoring item." << endl;
131 				break;
132 			}
133 
134 			char ch;
135 			ch = cifFile.peek();
136 			if(ch == '_' || ch == '$' || ch == '|') {
137 				cout << "(line " << line << ") WARNING: Data item { "
138 						<< token << " } does not have an assigned data "
139 						<< "value, or the value starts with a forbidden "
140 						<< "starting character ('_','$','|'). Ignoring "
141 						<< "data tag." << endl;
142 
143 			} else {
144 				string value("");
145 				getValue(value); //get value
146 				ifReservedReset(value); //check if its a reserved word
147 				setDataItem(token, value); //set data item
148 			}
149 
150 		} else if(beginsWith(token.c_str(),"loop_")) { //case 5, loop
151 			if(!dataBlock) {
152 				cout << "(line " << line << ") WARNING: A Loop "
153 						<< "token is found outside a datablock. It is "
154 						<< "still read, but its advised that the file "
155 						<< "gets repaired." << endl;
156 			}
157 
158 			if(skipWhiteSpace()) {
159 				cout << "(line " << line << ") WARNING: A Loop "
160 					<< "token is found but nothing after it."<< endl;
161 				break;
162 			}
163 
164 			if(!(cifFile.peek() == '_')) {
165 				cout << "(line " << line << ") WARNING: A Loop "
166 					<< "token is found but no data tags found "<< endl;
167 				continue;
168 			}
169 
170 			//parse loop data tags
171 
172 			vector<string> tokens(0); //vector that holds data tags.
173 			while(cifFile.peek() == '_') { //while there are tags
174 				string token = "";
175 
176 				if(getWord(token)) //get tag
177 					break;
178 
179 				tokens.push_back(token); //add to tag vector
180 
181 				if(skipWhiteSpace()) //skip whitespaces.
182 					break;
183 			}
184 			if(cifFile.eof()) {
185 				cout << "(line " << line << ") WARNING: A loop is "
186 					"initialized but end of file is reached before "
187 					"any values were found. Ignoring everything in "
188 					"the loop body." << endl;
189 				break;
190 			}
191 
192 			//Loop header parsed, all data tags are in vector "tokens"
193 			unsigned long int vals = 0; //to keep track of which tag to add the value to
194 			string value("");
195 
196 			while(!cifFile.eof()) {
197 				char c = cifFile.peek();
198 				if((c == '_') || (c == '$') || c == '|') {
199 					break; //loop over
200 				}
201 
202 				getValue(value);
203 				if(ifReservedReset(value)) { //if value is a reserved word
204 					break;
205 				} else {
206 					setDataItem(tokens[vals % tokens.size()], value); //set data item
207 					vals++; //increment vals
208 					if(skipWhiteSpace()) {//and skip whitespaces to next value
209 						fulhax = true;
210 						break;
211 					}
212 				}
213 
214 			}
215 
216 			if((vals % tokens.size()) != 0 ) {
217 				cout << "(line " << line << ") WARNING: a loop was "
218 						"terminated but amount of values does not "
219 						"go even, and thus not enough values was "
220 						"read, which may or may not cause problems." << endl;
221 			}
222 
223 		} else if(beginsWith(token.c_str(),"global_")){ // part of case 6, since global_ is forbidden
224 			cout << "(line " << line << ") WARNING: usage of the \"global_\" keyword "
225 					<< "is forbidden by .cif standards. Ignoring." << endl;
226 		} else { //case 6, file syntax error
227 			cout << "(line " << line << ") WARNING: found unknown token { "
228 					<< token << " }. Ignoring." << endl;
229 		}
230 	}
231 	return;
232 }
233 
234 /*
235  * Checks if c is a whitespace char ('\t', ' '),
236  * but not end of line.
237  */
isWhiteSpace(char c)238 bool CifParser::isWhiteSpace(char c) {
239 	return (c == ' ') || (c == '\t'); //11 is ASCII for tab '\t', 12 is new line '\n'
240 }
241 
242 /*
243  * Checks if c is a whitespace char ('\t', ' ', \n).
244  */
isWhiteSpaceNL(char c)245 bool CifParser::isWhiteSpaceNL(char c) {
246 	return (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r'); //11 is ASCII for tab '\t', 12 is new line '\n'
247 }
248 
249 /*
250  * Skips through all whitespace characters and comments, and leaves
251  * the file stream at the next token. White space characters are
252  * ' ', '\t', '\n' and comments start with '#' and continue until
253  * a '\n' character is found.
254  * The integer "line" is incremented each time a '\n' is found.
255  * If eof found, returns true.
256  */
skipWhiteSpace()257 bool CifParser::skipWhiteSpace() {
258 	char c = ' ';
259 
260 	bool eof = false;
261 	while((isWhiteSpaceNL(c) || c == '#') && !cifFile.eof()) {
262 		cifFile.get(c); //get next char
263 		if(cifFile.eof()) {
264 			eof = true;
265 			break; //reached end of file, stop
266 		}
267 		//some special cases
268 		if(c == '\n') { //new line
269 			line++;
270 		} else if(c == '#') { //comment
271 			cifFile.ignore(CIF_MAX_LINE_LENGTH, '\n'); // ignore rest of line.
272 			line++; //increment line
273 			if(cifFile.eof())
274 				eof = true;
275 		}
276 
277 	}
278 
279 	cifFile.unget(); //put the char from get() back.
280 
281 	return eof;
282 }
283 
284 /*
285  * Similar to the skipWhiteSpace, it extracts all characters found
286  * and puts them in word. It stops when a whitespace character is found.
287  * If eof found, returns true.
288  */
getWord(string & word)289 bool CifParser::getWord(string& word) {
290 	bool eof = false;
291 	char c;
292 	word = "";
293 	do {
294 		cifFile.get(c);
295 		if(cifFile.eof()) {
296 			//reached end of file, stop
297 			eof = true;
298 			break;
299 		}
300 		word.append(1, c);
301 	}
302 	while(!isWhiteSpaceNL(c));
303 
304 	if(!eof) { //reset last char
305 		cifFile.unget();
306 		word.erase(word.length()-1,1);
307 	}
308 
309 	return eof;
310 }
311 
/*
 * beginsWith() checks if "what" begins with "with".
 * Each character of "what" is accepted if it equals the
 * corresponding character of "with" or its toupper() form,
 * so pass "with" in lowercase to get a case-insensitive test.
 * Example use: beginsWith(myString, "data_")
 */
beginsWith(const char * what,const char * with)318 bool CifParser::beginsWith(const char * what, const char* with) {
319 
320 	if(strlen(with) > strlen(what))
321 		return false;
322 
323 	bool correct = true; //the return variable
324 	int i = 0; //byte counter
325 
326 	while(correct && with[i]) { // only continue tests while still equal, and
327 								// end of comparing string isn't reached
328 		if(!((what[i] == with[i]) || (what[i] == toupper(with[i]))))
329 			correct = false;
330 
331 		i++;
332 	}
333 
334 	return correct;
335 }
336 
337 /*
338  * Extracts a value, as defined by cif standards,
339  * and puts it into "value".
340  * If eof found, returns true.
341  */
getValue(string & value)342 bool CifParser::getValue(string& value) {
343 	bool eof = false;
344 	value = "";
345 
346 	//try to identify the type of the value
347 	char c = (char) cifFile.peek();
348 
349 	if(c == '\'') { //single-quote string
350 		if(getQuoteValue(value, '\''))
351 			eof = true;
352 
353 	} else if(c == '\"') { //double-quote string
354 		if(getQuoteValue(value, '\"'))
355 			eof = true;
356 	} else if(c == ';') { //text field OR unquoted string
357 		//check last char, if its '\n' then its a text field
358 		cifFile.unget();
359 		cifFile.get(c);
360 		if(c == '\n') { //we have ourselves a text field
361 			cifFile.ignore(2); //ignore the '\n'
362 								//and ignore the ;
363 			if(cifFile.eof()) { //if eof is after the ;
364 				cout << "(line " << line << ") WARNING: End of file reached, "
365 						"immediatly after a text field was initialized. Value "
366 						"of last token is therefore set to \"?\" (unknown)." << endl;
367 				value = "?";
368 				eof = true;
369 				return eof;
370 			}
371 
372 			bool finished = false;
373 			while(!finished) { //main loop of parsing text field data
374 				cifFile.get(c);
375 
376 				if(cifFile.eof()) { //if suddenly eof
377 					cout << "(line " << line << ") WARNING: End of file reached "
378 							"before the text field data value was properly terminated."
379 							" Value is still read until that point, but "
380 							"it is advised to repair the file." << endl;
381 					eof = true;
382 					break;
383 				}
384 
385 				if(c == '\n') { //if read char is end of line
386 					line++;     //increment line
387 					char ch = (char) cifFile.peek();
388 					if(ch == ';') { //and terminate if a ';' is found at start of line
389 						finished = true;
390 						cifFile.ignore(); //ignores the ';'
391 					} else {
392 						value.append(1,c); //otherwise appends the '\n'
393 					}
394 				} else {
395 					value.append(1,c); //if not a '\n', just append
396 				}
397 			}
398 
399 		} else { //a single unquoted string
400 			// this assumes that ; is included in the value
401 			if(getWord(value))
402 				eof = true;
403 		}
404 
405 	} else { //unquoted string, a word continues until whitespace
406 		if(getWord(value))
407 			eof = true;
408 
409 	}
410 
411 	return eof;
412 }
413 
414 /*
415  * This is solely a helper method used by getValue, to avoid duplicating
416  * very similar code. It extracts a single quote value or a double quote value,
417  * depending on what is specified.
418  * If eof found, returns true.
419  */
getQuoteValue(string & value,const char quote)420 bool CifParser::getQuoteValue(string& value, const char quote) {
421 	cifFile.ignore(); //ignore first quote.
422 	bool finished = false;
423 	bool eof = false;
424 	char c;
425 
426 	//main loop
427 	while(!finished) {
428 		cifFile.get(c); //get char
429 		if(cifFile.eof()) { //in case eof was reached
430 			if(!(c == quote)) {
431 				if(!(c == '\n'))
432 					value.append(1, c);
433 				cout << "(line " << line << ") WARNING: Quote data value did not "
434 						"terminate before end of file was reached. Ignoring." << endl;
435 			}
436 			eof = true;
437 			break;
438 		}
439 
440 		if(c == '\n') { //if line break is found.
441 			//print an error message and increment line.
442 
443 			cout << "(line " << line << ") WARNING: Single and double quote "
444 					"data values are not permitted to extend over "
445 					"multiple lines. Value is read until the end of this "
446 					"line, which might lead to bad results. "
447 					"Suggesting usage of a text field instead." << endl;
448 			line++;
449 			break;
450 		}
451 
452 		//normal procedure, all above is basically error handling
453 		if(c == quote && isWhiteSpaceNL(cifFile.peek())) {
454 			finished = true; //end of value found
455 		} else {
456 			value.append(1, c); //else append
457 		}
458 	}
459 
460 	return eof;
461 
462 }
463 
464 /*
465  * Assigns the data item with the given header and
466  * value. If the value is "unknown" ('.','?'), nothing happens.
467  */
setDataItem(string & token,string & value)468 void CifParser::setDataItem(string& token, string& value) {
469 
470 	if(value != "." && value != "?") {
471 
472 	if(token == "_cell_length_a") {
473 		if(!((*crystal).seta(atof(value.c_str())))) {
474 			printErrorAS("_cell_length_a");
475 		}
476 		/*
477 		 * To explain this above which looks very complicated:
478 		 * 1. Follow the CifCrystalPointer, to set its 'a' field.
479 		 * 2. The value to set it to is calculated by std::atof().
480 		 * 3. atof() does only take a c string, so need to convert.
481 		 * 4. seta() returns false if the value has already been
482 		 * 		assigned, meaning that the tag "_cell_length_a" is
483 		 * 		appearing multiple times in the file. If so, print
484 		 * 		a warning message.
485 		 */
486 
487 	} else if(token == "_cell_length_b") {
488 		if(!((*crystal).setb(atof(value.c_str())))) {
489 			printErrorAS("_cell_length_b");
490 		}
491 	} else if(token == "_cell_length_c") {
492 		if(!((*crystal).setc(atof(value.c_str())))) {
493 			printErrorAS("_cell_length_c");
494 		}
495 	} else if(token == "_cell_angle_alpha") {
496 		if(!((*crystal).setalpha(atof(value.c_str())))) {
497 			printErrorAS("_cell_angle_alpha");
498 		}
499 	} else if(token == "_cell_angle_beta") {
500 		if(!((*crystal).setbeta(atof(value.c_str())))) {
501 			printErrorAS("_cell_angle_beta");
502 		}
503 	} else if(token == "_cell_angle_gamma") {
504 		if(!((*crystal).setgamma(atof(value.c_str())))) {
505 			printErrorAS("_cell_angle_gamma");
506 		}
507 	} else if(token == "_symmetry_equiv_pos_as_xyz"
508 			|| token == "_space_group_symop_operation_xyz") {
509 		(*crystal).addSymmetry(value); //simply add the symmetry
510 
511 	} else if(token == "_atom_site_type_symbol") {
512 		(*crystal).setAElement(value);
513 	} else if(token == "_atom_site_fract_x") {
514 		(*crystal).setAx(atof(value.c_str()));
515 	} else if(token == "_atom_site_fract_y") {
516 		(*crystal).setAy(atof(value.c_str()));
517 	} else if(token == "_atom_site_fract_z") {
518 		(*crystal).setAz(atof(value.c_str()));
519 	} else {
520 		//do nothing
521 	}
522 
523 	}
524 
525 	return;
526 }
527 
528 /*
529  * This takes the string value, and checks if it
530  * is a reserved word (data_,loop_ etc.). If it is, it
531  * sets back the cifFile pointer to the beginning of
532  * the word, and sets the value to "?" (unknown).
533  */
ifReservedReset(string & value)534 bool CifParser::ifReservedReset(string& value) {
535 	if(beginsWith(value.c_str(), "data_") ||	//not allowed to be a value.
536 		beginsWith(value.c_str(), "save_") ||
537 		beginsWith(value.c_str(), "loop_") ||
538 		beginsWith(value.c_str(), "global_")) {
539 
540 		for(unsigned int i = 0; i < value.length(); i++) {
541 			cifFile.unget(); //unget the amount of characters got.
542 		}
543 		/*
544 		 * This below is another way of fixig the above.
545 		while(!isWhiteSpaceNL(cifFile.peek())) {
546 			cifFile.unget();
547 		}
548 		cifFile.ignore();*/
549 
550 		value = "?"; //set value to unknown
551 		return true;
552 	} else {
553 		return false;
554 	}
555 }
556 
557 /*
558  * This is a small helper method to indicate that the
559  * data value has already been assigned (multiple
560  * instances of a data tag found). "value" should be
561  * something like "_cell_length_a"
562  */
printErrorAS(const char * value)563 void CifParser::printErrorAS(const char * value) {
564 	cout << "(line " << line << ") WARNING: data item "
565 			" {" << value << " } has been found "
566 			"multiple times in the file. Using the "
567 			"first encountered value. " << endl;
568 }
569 
570