1/* valamarkupreader.vala
2 *
3 * Copyright (C) 2008-2009  Jürg Billeter
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Lesser General Public License for more details.
14
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
18 *
19 * Author:
20 * 	Jürg Billeter <j@bitron.ch>
21 */
22
23using GLib;
24
/**
 * Simple reader for a subset of XML.
 *
 * The reader is a pull tokenizer: each call to {@link read_token} consumes
 * the next token and updates {@link name}, {@link content} and the attribute
 * map. Comments are skipped. Processing instructions, doctype declarations
 * and CDATA sections are not understood: they surface as a NONE token and
 * their remaining characters are then read as text. Malformed markup is
 * tolerated silently; only invalid UTF-8 is reported through Report.error ().
 */
public class Vala.MarkupReader {
	// Entity references understood by text () and the character each one stands for.
	// Both arrays are indexed in parallel.
	const string[] ENTITY_REFERENCES = { "&amp;", "&quot;", "&apos;", "&lt;", "&gt;", "&percnt;" };
	const char[] ENTITY_VALUES = { '&', '"', '\'', '<', '>', '%' };

	/**
	 * File name passed to the constructor.
	 */
	public string filename { get; private set; }

	/**
	 * Element name of the most recent START_ELEMENT or END_ELEMENT token.
	 */
	public string name { get; private set; }

	/**
	 * Decoded text of the most recent TEXT token, null for other tokens.
	 */
	public string content { get; private set; }

	// Keeps the file mapping alive while begin/current/end point into it.
	MappedFile mapped_file;
	// Private copy of the text given to from_string (); begin/current/end point into it.
	string source_buffer;

	char* begin;
	char* current;
	char* end;

	// 1-based position of `current`; columns count characters inside names
	// and text, and bytes everywhere else.
	int line;
	int column;

	Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);
	// Set after `<foo/>` so that the next read_token () call reports the matching end.
	bool empty_element;

	/**
	 * Creates a reader over the contents of a file.
	 *
	 * If the file cannot be mapped, the error is reported through
	 * Report.error () and the reader behaves like one over empty input.
	 *
	 * @param filename path of the file to read
	 */
	public MarkupReader (string filename) {
		this.filename = filename;

		// initialise the position up front so it is sane even when mapping fails
		line = 1;
		column = 1;

		try {
			mapped_file = new MappedFile (filename, false);
			begin = mapped_file.get_contents ();
			end = begin + mapped_file.get_length ();

			current = begin;
		} catch (FileError e) {
			Report.error (null, "Unable to map file `%s': %s".printf (filename, e.message));
		}
	}

	/**
	 * Creates a reader over markup held in memory.
	 *
	 * @param filename name stored in {@link filename}
	 * @param content markup to read; the reader keeps its own copy
	 */
	public MarkupReader.from_string (string filename, string content) {
		this.filename = filename;

		// Own a copy: the raw scan pointers must not outlive the text they
		// point into, and the caller only lends us `content`.
		source_buffer = content;
		begin = source_buffer;
		end = begin + source_buffer.length;

		current = begin;

		line = 1;
		column = 1;
	}

	/**
	 * Returns the value of an attribute of the current element.
	 *
	 * @param attr attribute name
	 * @return the attribute value as stored in the attribute map
	 */
	public string? get_attribute (string attr) {
		return attributes[attr];
	}

	/**
	 * Returns a copy of the current attributes.
	 *
	 * @return map of current attributes
	 */
	public Map<string,string> get_attributes () {
		var result = new HashMap<string,string> (str_hash, str_equal);
		foreach (var key in attributes.get_keys ()) {
			result.set (key, attributes.get (key));
		}
		return result;
	}

	// Copies the bytes in [from, to) into a new string.
	static string slice (char* from, char* to) {
		return ((string) from).substring (0, (int) (to - from));
	}

	// Tells whether the unread input starts with `literal`, never looking past `end`.
	bool looking_at (string literal) {
		char* expected = literal;
		int length = literal.length;
		if ((long) (end - current) < length) {
			return false;
		}
		for (int i = 0; i < length; i++) {
			if (current[i] != expected[i]) {
				return false;
			}
		}
		return true;
	}

	// Consumes one byte, keeping line and column up to date; does nothing at end of input.
	void skip_byte () {
		if (current >= end) {
			return;
		}
		if (current[0] == '\n') {
			line++;
			column = 0;
		}
		current++;
		column++;
	}

	// Consumes `expected` if it is the next byte and tells whether it did.
	bool accept (char expected) {
		if (current < end && current[0] == expected) {
			skip_byte ();
			return true;
		}
		return false;
	}

	// Returns the index of the entity reference starting at `current`, or -1 if there is none.
	int match_entity () {
		for (int i = 0; i < ENTITY_REFERENCES.length; i++) {
			if (looking_at (ENTITY_REFERENCES[i])) {
				return i;
			}
		}
		return -1;
	}

	// Reads an element or attribute name: everything up to whitespace, `>`, `/`, `=`
	// or the end of input. The result is empty when no name is present.
	string read_name () {
		char* name_begin = current;
		while (current < end) {
			if (current[0].isspace () || current[0] == '>' || current[0] == '/' || current[0] == '=') {
				break;
			}
			unichar u = ((string) current).get_char_validated ((long) (end - current));
			if (u == (unichar) (-1) || u == (unichar) (-2)) {
				Report.error (null, "invalid UTF-8 character");
				// step over the offending byte so the scan always makes progress
				current++;
			} else {
				current += u.to_utf8 (null);
			}
			column++;
		}
		return slice (name_begin, current);
	}

	// Reads `name="value"` pairs into the attribute map until the end of the tag.
	// Stops early, without consuming anything further, at an attribute that has
	// no `=` or whose value is not quoted.
	void read_attributes () {
		while (current < end && current[0] != '>' && current[0] != '/') {
			string attr_name = read_name ();
			space ();
			if (!accept ('=')) {
				break;
			}
			space ();
			if (current >= end || (current[0] != '"' && current[0] != '\'')) {
				break;
			}
			char quote = current[0];
			skip_byte ();

			string attr_value = text (quote, false);

			// closing quote; absent only when the input ends inside the value
			accept (quote);
			attributes.set (attr_name, attr_value);
			space ();
		}
	}

	/**
	 * Reads the next token from the input.
	 *
	 * Clears the attribute map, then fills in {@link name} and the attributes
	 * for element tokens or {@link content} for text tokens. An empty element
	 * such as `<foo/>` yields START_ELEMENT followed, on the next call, by
	 * END_ELEMENT. Comments are skipped.
	 *
	 * @param token_begin location of the first character of the token
	 * @param token_end location of the last character of the token
	 * @return the type of the token, EOF once the input is exhausted
	 */
	public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
		attributes.clear ();

		if (empty_element) {
			// second half of `<foo/>`: nothing is consumed and `name` still
			// holds the element name
			empty_element = false;
			token_begin = SourceLocation (current, line, column);
			token_end = SourceLocation (current, line, column);
			return MarkupTokenType.END_ELEMENT;
		}

		content = null;
		name = null;

		space ();

		MarkupTokenType type = MarkupTokenType.NONE;
		token_begin = SourceLocation (current, line, column);

		if (current >= end) {
			type = MarkupTokenType.EOF;
		} else if (current[0] == '<') {
			skip_byte ();
			if (current >= end) {
				// lone `<` at the very end of the input: reported as NONE
			} else if (current[0] == '?') {
				// processing instruction: not parsed, reported as NONE; its
				// body is picked up as text by the next call
			} else if (current[0] == '!') {
				// comment or doctype
				skip_byte ();
				if (looking_at ("--")) {
					// comment
					skip_byte ();
					skip_byte ();
					// an unterminated comment runs to the end of the input
					while (current < end) {
						if (looking_at ("-->")) {
							// end of comment
							skip_byte ();
							skip_byte ();
							skip_byte ();
							break;
						}
						skip_byte ();
					}

					// ignore comment, read next token
					return read_token (out token_begin, out token_end);
				}
				// anything else after `<!` is not parsed and reported as NONE
			} else if (current[0] == '/') {
				type = MarkupTokenType.END_ELEMENT;
				skip_byte ();
				name = read_name ();
				space ();
				accept ('>');
			} else {
				type = MarkupTokenType.START_ELEMENT;
				name = read_name ();
				space ();
				read_attributes ();
				empty_element = accept ('/');
				if (empty_element) {
					space ();
				}
				accept ('>');
			}
		} else {
			// leading whitespace was skipped above, so this is real text
			content = text ('<', true);
			type = MarkupTokenType.TEXT;
		}

		// `column` is the column of the next unread character
		token_end = SourceLocation (current, line, column - 1);

		return type;
	}

	// Reads text up to, but not including, the next `end_char` or the end of input
	// and returns it with the known entity references decoded. Unknown references
	// and invalid bytes are kept verbatim. With `rm_trailing_whitespace`, trailing
	// whitespace is stripped from the result.
	string text (char end_char, bool rm_trailing_whitespace) {
		var builder = new StringBuilder ();
		// start of the raw input span that has not been copied into builder yet
		char* pending = current;

		while (current < end && current[0] != end_char) {
			if (current[0] == '&') {
				int entity = match_entity ();
				if (entity < 0) {
					skip_byte ();
					continue;
				}
				builder.append (slice (pending, current));
				builder.append_c (ENTITY_VALUES[entity]);
				int length = ENTITY_REFERENCES[entity].length;
				current += length;
				column += length;
				pending = current;
				continue;
			}

			unichar u = ((string) current).get_char_validated ((long) (end - current));
			if (u == (unichar) (-1) || u == (unichar) (-2)) {
				Report.error (null, "invalid UTF-8 character");
				// step over the offending byte so the scan always makes progress
				current++;
				column++;
				continue;
			}

			if (u == '\n') {
				line++;
				column = 0;
			}
			current += u.to_utf8 (null);
			column++;
		}

		builder.append (slice (pending, current));

		if (rm_trailing_whitespace) {
			char* data = (char*) builder.str;
			ssize_t keep = builder.len;
			while (keep > 0 && data[keep - 1].isspace ()) {
				keep--;
			}
			builder.erase (keep, -1);
		}

		return builder.str;
	}

	// Skips whitespace.
	void space () {
		while (current < end && current[0].isspace ()) {
			skip_byte ();
		}
	}
}
310
/**
 * Kinds of token reported by the markup reader.
 */
public enum Vala.MarkupTokenType {
	NONE,
	START_ELEMENT,
	END_ELEMENT,
	TEXT,
	EOF;

	/**
	 * Returns a short human-readable description of the token type.
	 *
	 * @return a static string; "unknown token type" for NONE
	 */
	public unowned string to_string () {
		unowned string description;

		switch (this) {
		case START_ELEMENT:
			description = "start element";
			break;
		case END_ELEMENT:
			description = "end element";
			break;
		case TEXT:
			description = "text";
			break;
		case EOF:
			description = "end of file";
			break;
		default:
			description = "unknown token type";
			break;
		}

		return description;
	}
}
328
329