/* valamarkupreader.vala
 *
 * Copyright (C) 2008-2009 Jürg Billeter
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.

 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.

 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Author:
 * 	Jürg Billeter <j@bitron.ch>
 */

using GLib;

/**
 * Simple reader for a subset of XML.
 *
 * The reader is a pull tokenizer: every call to {@link read_token} consumes
 * one token from the input and updates {@link name}, {@link content} and the
 * attribute map accordingly. Malformed markup is tolerated on a best-effort
 * basis; only invalid UTF-8 is reported through Report.error.
 */
public class Vala.MarkupReader {
	/**
	 * Name of the file being read; only used for identification.
	 */
	public string filename { get; private set; }

	/**
	 * Name of the element of the most recent START_ELEMENT or END_ELEMENT
	 * token, null for other tokens.
	 */
	public string name { get; private set; }

	/**
	 * Decoded text of the most recent TEXT token, null for other tokens.
	 */
	public string content { get; private set; }

	MappedFile mapped_file;

	// [begin, end) is the input buffer, current is the read position.
	char* begin;
	char* current;
	char* end;

	int line;
	int column;

	Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);

	// Set after `<foo/>` so that the next read_token call reports the
	// matching END_ELEMENT without consuming any input.
	bool empty_element;

	/**
	 * Creates a reader for the file //filename//.
	 *
	 * If the file cannot be mapped an error is reported and the reader is
	 * left without input.
	 *
	 * @param filename path of the file to read
	 */
	public MarkupReader (string filename) {
		this.filename = filename;

		try {
			mapped_file = new MappedFile (filename, false);
			begin = mapped_file.get_contents ();
			end = begin + mapped_file.get_length ();

			current = begin;

			line = 1;
			column = 1;
		} catch (FileError e) {
			Report.error (null, "Unable to map file `%s': %s".printf (filename, e.message));
		}
	}

	/**
	 * Creates a reader for markup that is already in memory.
	 *
	 * The reader does not copy //content//: it keeps raw pointers into it,
	 * so the caller has to keep the string alive and unmodified for as long
	 * as the reader is in use.
	 *
	 * @param filename name used to identify the input
	 * @param content the markup to read
	 */
	public MarkupReader.from_string (string filename, string content) {
		this.filename = filename;

		begin = content;
		end = begin + content.length;

		current = begin;

		line = 1;
		column = 1;
	}

	/**
	 * Looks up an attribute of the current start element.
	 *
	 * @param attr attribute name
	 * @return the value stored for //attr// in the current attribute map
	 */
	public string? get_attribute (string attr) {
		return attributes[attr];
	}

	/**
	 * Returns a copy of the current attributes.
	 *
	 * @return map of current attributes
	 */
	public Map<string,string> get_attributes () {
		var result = new HashMap<string,string> (str_hash, str_equal);
		foreach (var key in attributes.get_keys ()) {
			result.set (key, attributes.get (key));
		}
		return result;
	}

	/*
	 * get_char_validated signals an invalid sequence with -1 and a sequence
	 * cut off by the length limit with -2; neither is a usable character.
	 */
	static bool is_invalid_char (unichar u) {
		return u == (unichar) (-1) || u == (unichar) (-2);
	}

	/*
	 * Advances over one byte unless the end of the input has been reached,
	 * so that the read position never moves past `end`.
	 */
	void skip_char () {
		if (current < end) {
			current++;
		}
	}

	/*
	 * Checks whether the input at `pos` starts with `prefix` without ever
	 * reading at or beyond `end` (the buffer need not be NUL-terminated).
	 */
	bool starts_with (char* pos, string prefix) {
		int prefix_len = (int) prefix.length;
		if ((long) (end - pos) < prefix_len) {
			return false;
		}

		char* expected = prefix;
		for (int i = 0; i < prefix_len; i++) {
			if (pos[i] != expected[i]) {
				return false;
			}
		}
		return true;
	}

	/*
	 * Recognizes one of the supported character entities at `pos`.
	 *
	 * Returns the byte length of the entity and stores the character it
	 * stands for in `replacement`; returns 0 if no known entity starts here.
	 */
	int match_entity (char* pos, out char replacement) {
		replacement = '\0';

		if (starts_with (pos, "&amp;")) {
			replacement = '&';
			return 5;
		} else if (starts_with (pos, "&quot;")) {
			replacement = '"';
			return 6;
		} else if (starts_with (pos, "&apos;")) {
			replacement = '\'';
			return 6;
		} else if (starts_with (pos, "&lt;")) {
			replacement = '<';
			return 4;
		} else if (starts_with (pos, "&gt;")) {
			replacement = '>';
			return 4;
		} else if (starts_with (pos, "&percnt;")) {
			replacement = '%';
			return 8;
		}

		return 0;
	}

	/*
	 * Reads an element or attribute name starting at the current position.
	 *
	 * The name ends at a space, tab, newline, '>', '/' or '=' or at the end
	 * of the input. An empty name is returned as-is; no error is raised.
	 */
	string read_name () {
		char* name_start = current;
		while (current < end) {
			if (current[0] == ' ' || current[0] == '\t' || current[0] == '>'
			    || current[0] == '/' || current[0] == '=' || current[0] == '\n') {
				break;
			}
			unichar u = ((string) current).get_char_validated ((long) (end - current));
			if (is_invalid_char (u)) {
				Report.error (null, "invalid UTF-8 character");
				// step over the offending byte, otherwise this loop never ends
				current++;
			} else {
				current += u.to_utf8 (null);
			}
		}
		return ((string) name_start).substring (0, (int) (current - name_start));
	}

	/**
	 * Reads the next token from the input.
	 *
	 * Leading whitespace is skipped and comments are ignored. The attribute
	 * map is reset on every call. For an empty element such as `<a/>` a
	 * START_ELEMENT is returned first and the following call returns the
	 * matching END_ELEMENT with {@link name} left untouched.
	 *
	 * NONE is returned for constructs that are not interpreted: a lone `<`
	 * at the end of the input, a processing instruction and a `<!`
	 * declaration that is not a comment. Only the leading `<` (and `!`) is
	 * consumed in these cases; the remainder is left in the input for the
	 * following calls.
	 *
	 * @param token_begin location where the token starts
	 * @param token_end location where the token ends
	 * @return the type of the token that was read
	 */
	public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
		attributes.clear ();

		if (empty_element) {
			empty_element = false;
			// report the current position rather than the start of the buffer
			token_begin = SourceLocation (current, line, column);
			token_end = SourceLocation (current, line, column);
			return MarkupTokenType.END_ELEMENT;
		}

		content = null;
		name = null;

		space ();

		MarkupTokenType type = MarkupTokenType.NONE;
		token_begin = SourceLocation (current, line, column);

		if (current >= end) {
			type = MarkupTokenType.EOF;
		} else if (current[0] == '<') {
			current++;
			if (current >= end) {
				// truncated input: lone '<', reported as NONE
			} else if (current[0] == '?') {
				// processing instruction: not interpreted, reported as NONE
			} else if (current[0] == '!') {
				// comment or doctype
				current++;
				if (current < end - 1 && current[0] == '-' && current[1] == '-') {
					// comment
					current += 2;
					while (current < end - 2) {
						if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
							// end of comment
							current += 3;
							break;
						} else if (current[0] == '\n') {
							line++;
							column = 0;
						}
						current++;
					}

					// ignore comment, read next token
					return read_token (out token_begin, out token_end);
				}
			} else if (current[0] == '/') {
				type = MarkupTokenType.END_ELEMENT;
				current++;
				name = read_name ();
				// a '>' is expected here; anything else is skipped silently
				skip_char ();
			} else {
				type = MarkupTokenType.START_ELEMENT;
				name = read_name ();
				space ();
				while (current < end && current[0] != '>' && current[0] != '/') {
					string attr_name = read_name ();
					space ();
					// a '=' is expected here; anything else is skipped silently
					skip_char ();
					space ();
					if (current >= end) {
						// input ends before the attribute value
						break;
					}
					if (current[0] != '"' && current[0] != '\'') {
						// not a quote character: tolerated, the character is
						// still used as the delimiter of the value
					}
					char quote = current[0];
					current++;

					string attr_value = text (quote, false);

					// closing quote, if the input did not end before it
					skip_char ();
					attributes.set (attr_name, attr_value);
					space ();
				}
				if (current < end && current[0] == '/') {
					empty_element = true;
					current++;
					space ();
				} else {
					empty_element = false;
				}
				// a '>' is expected here; anything else is skipped silently
				skip_char ();
			}
		} else {
			// space () above stopped on a character that is neither
			// whitespace nor '<', so there is always text to read here
			content = text ('<', true);
			type = MarkupTokenType.TEXT;
		}

		token_end = SourceLocation (current, line, column - 1);

		return type;
	}

	/*
	 * Reads text up to, but not including, `end_char` or the end of the
	 * input and returns it with the supported entities (&amp; &quot; &apos;
	 * &lt; &gt; &percnt;) replaced by the characters they stand for.
	 * Unknown entities are copied verbatim. Bytes that do not form valid
	 * UTF-8 are reported and left out of the result.
	 *
	 * If `rm_trailing_whitespace` is set, whitespace at the end of the
	 * result is removed.
	 */
	string text (char end_char, bool rm_trailing_whitespace) {
		var builder = new StringBuilder ();
		// start of the input that has been scanned but not yet copied
		char* text_begin = current;
		char* last_linebreak = current;

		while (current < end && current[0] != end_char) {
			unichar u = ((string) current).get_char_validated ((long) (end - current));
			if (is_invalid_char (u)) {
				Report.error (null, "invalid UTF-8 character");
				// copy what precedes the bad byte, then step over it so that
				// the loop makes progress and the result stays valid UTF-8
				builder.append_len ((string) text_begin, (ssize_t) (current - text_begin));
				current++;
				text_begin = current;
			} else if (u == '&') {
				char replacement;
				int entity_len = match_entity (current, out replacement);
				if (entity_len > 0) {
					builder.append_len ((string) text_begin, (ssize_t) (current - text_begin));
					builder.append_c (replacement);
					current += entity_len;
					text_begin = current;
				} else {
					// unknown entity: keep the '&' as ordinary text
					current += u.to_utf8 (null);
				}
			} else {
				if (u == '\n') {
					line++;
					column = 0;
					last_linebreak = current;
				}

				current += u.to_utf8 (null);
				column++;
			}
		}

		if (text_begin != current) {
			builder.append_len ((string) text_begin, (ssize_t) (current - text_begin));
		}

		// NOTE(review): ordinary characters were already counted one by one
		// in the loop above, so this looks like it counts them twice; kept
		// unchanged because reported columns depend on it - confirm.
		column += (int) (current - last_linebreak);

		if (rm_trailing_whitespace) {
			char* buffer_start = (char*) builder.str;
			char* keep_end = buffer_start + builder.len;
			while (keep_end > buffer_start && (keep_end - 1)[0].isspace ()) {
				keep_end--;
			}
			builder.erase ((ssize_t) (keep_end - buffer_start), -1);
		}

		return builder.str;
	}

	/*
	 * Skips whitespace at the current position, keeping the line and column
	 * counters up to date.
	 */
	void space () {
		while (current < end && current[0].isspace ()) {
			if (current[0] == '\n') {
				line++;
				column = 0;
			}
			current++;
			column++;
		}
	}
}

/**
 * Kinds of tokens produced by {@link Vala.MarkupReader.read_token}.
 */
public enum Vala.MarkupTokenType {
	NONE,
	START_ELEMENT,
	END_ELEMENT,
	TEXT,
	EOF;

	/**
	 * Returns a human-readable description of the token type.
	 *
	 * @return a static string describing this token type
	 */
	public unowned string to_string () {
		switch (this) {
		case START_ELEMENT: return "start element";
		case END_ELEMENT: return "end element";
		case TEXT: return "text";
		case EOF: return "end of file";
		default: return "unknown token type";
		}
	}
}