/* HtmlExtractor.m
 *  
 * Copyright (C) 2006 Free Software Foundation, Inc.
 *
 * Author: Enrico Sersale <enrico@dtedu.net>
 * Date: May 2006
 *
 * This file is part of the GNUstep GWorkspace application
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include <AppKit/AppKit.h>
#include "HtmlExtractor.h"

#define MAXFSIZE 600000
#define DLENGTH 256
#define WORD_MAX 40

/* Longest "&...;" token escapeChar() will consider, terminating ';' included. */
#define TOKEN_MAX 10

void strip(const char *inbuf, NSMutableString *outstr, NSMutableDictionary *metadict);
int escapeChar(char *buf, NSMutableString *str);

static BOOL hasTagPrefix(const char *buf, const char *tag);
static int appendCharacter(const char *buf, NSMutableString *str);
static void appendSeparator(NSMutableString *outstr, BOOL *spaceAdded);
static int readTitle(const char *inbuf, int pos, int len, NSMutableDictionary *metadict);


/*
 * Metadata extractor for HTML files. It strips the markup from the file,
 * splits the remaining text into words and hands the word statistics
 * (plus the page title, when one is found) to the object received in
 * -initForExtractor:.
 */
@implementation HtmlExtractor

- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);

  [super dealloc];
}

/*
 * Builds the set of characters that separate words and records the path
 * extensions handled by this class. "extr" is stored without being
 * retained.
 */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    NSArray *separators = [NSArray arrayWithObjects:
                  [NSCharacterSet controlCharacterSet],
                  [NSCharacterSet illegalCharacterSet],
                  [NSCharacterSet punctuationCharacterSet],
                  [NSCharacterSet symbolCharacterSet],
                  [NSCharacterSet whitespaceAndNewlineCharacterSet],
                  [NSCharacterSet decimalDigitCharacterSet],
                  [NSCharacterSet characterSetWithCharactersInString:
                                          @"+-=<>&@$*%#\"\'^`|~_/\\"],
                  nil];
    unsigned count = [separators count];
    unsigned i;

    skipSet = [NSMutableCharacterSet new];

    for (i = 0; i < count; i++) {
      [skipSet formUnionWithCharacterSet: [separators objectAtIndex: i]];
    }

    ASSIGN (extensions, ([NSArray arrayWithObjects: @"html", @"htm", nil]));
    extractor = extr;
  }

  return self;
}

/* The path extensions handled by this extractor ("html" and "htm"). */
- (NSArray *)pathExtensions
{
  return extensions;
}

/*
 * Returns YES when "ext" is one of the handled extensions, the file size
 * found in "attributes" is below MAXFSIZE and "testdata" contains no
 * zero byte. Returns NO when "testdata" is nil.
 */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  const char *bytes;
  unsigned length;
  unsigned i;

  if ((testdata == nil) || ([attributes fileSize] >= MAXFSIZE)) {
    return NO;
  }

  bytes = (const char *)[testdata bytes];
  length = [testdata length];

  /* a zero byte means that the sample is not text */
  for (i = 0; i < length; i++) {
    if (bytes[i] == 0x00) {
      return NO;
    }
  }

  return [extensions containsObject: ext];
}

/*
 * Reads the file at "path", strips the markup and sends the result to
 * the extractor as a dictionary with two entries:
 *
 *   "words"      - a dictionary holding "wset", the counted set of the
 *                  words longer than 3 and shorter than WORD_MAX
 *                  characters, and "wcount", the total number of words;
 *   "attributes" - the attributes collected by strip().
 *
 * An empty dictionary is sent when the file cannot be read or contains
 * no text. The return value is the one of -setMetadata:forPath:withID:.
 */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  CREATE_AUTORELEASE_POOL(arp);
  NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
  NSString *contents = [NSString stringWithContentsOfFile: path];
  const char *inbuf = [contents UTF8String];
  BOOL success = NO;

  if ((inbuf != NULL) && [contents length]) {
    NSMutableString *stripped = [NSMutableString stringWithCapacity: [contents length]];
    NSMutableDictionary *attrsdict = [NSMutableDictionary dictionary];

    strip(inbuf, stripped, attrsdict);

    if ([stripped length]) {
      NSScanner *scanner = [NSScanner scannerWithString: stripped];
      NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
      NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
      unsigned long wcount = 0;

      [scanner setCharactersToBeSkipped: skipSet];

      while ([scanner isAtEnd] == NO) {
        NSString *word = nil;
        unsigned wl;

        /* a failed scan leaves the scanner where it was:
           stop instead of counting a stale word or looping forever */
        if ([scanner scanUpToCharactersFromSet: skipSet
                                    intoString: &word] == NO) {
          break;
        }

        wl = [word length];

        if ((wl > 3) && (wl < WORD_MAX)) {
          [wordset addObject: word];
        }

        wcount++;
      }

      [wordsDict setObject: wordset forKey: @"wset"];
      [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                    forKey: @"wcount"];

      [mddict setObject: wordsDict forKey: @"words"];
      [mddict setObject: attrsdict forKey: @"attributes"];

      RELEASE (wordset);
    }
  }

  success = [extractor setMetadata: mddict forPath: path withID: path_id];

  RELEASE (arp);

  return success;
}

@end


/*
 * Returns YES if the bytes at "buf" start with "tag", ignoring the case
 * of "buf". "tag" must be written in lower case. The comparison stops at
 * the first difference, so it never reads beyond the terminating null
 * byte of "buf".
 */
static BOOL hasTagPrefix(const char *buf, const char *tag)
{
  while (*tag != '\0') {
    if (tolower((unsigned char)*buf) != (unsigned char)*tag) {
      return NO;
    }

    buf++;
    tag++;
  }

  return YES;
}

/*
 * Appends to "str" the character starting at "buf" and returns the number
 * of bytes it occupies (always at least 1). "buf" is expected to hold
 * UTF-8 text: a multi-byte sequence is decoded as a whole instead of
 * being appended one byte at a time. A truncated or malformed sequence
 * is dropped one byte at a time.
 */
static int appendCharacter(const char *buf, NSMutableString *str)
{
  unsigned char lead = (unsigned char)buf[0];
  NSString *decoded;
  int count = 1;
  int k;

  if (lead < 0x80) {
    [str appendFormat: @"%c", buf[0]];
    return 1;
  }

  if ((lead & 0xE0) == 0xC0) {
    count = 2;
  } else if ((lead & 0xF0) == 0xE0) {
    count = 3;
  } else if ((lead & 0xF8) == 0xF0) {
    count = 4;
  } else {
    return 1;   /* stray continuation byte or invalid lead byte */
  }

  /* the null terminator is not a continuation byte,
     so this check cannot run beyond the end of the buffer */
  for (k = 1; k < count; k++) {
    if (((unsigned char)buf[k] & 0xC0) != 0x80) {
      return 1;
    }
  }

  decoded = [[NSString alloc] initWithBytes: buf
                                     length: count
                                   encoding: NSUTF8StringEncoding];
  if (decoded != nil) {
    [str appendString: decoded];
  }
  RELEASE (decoded);

  return count;
}

/*
 * Appends a single space to "outstr" unless the last thing appended was
 * already a separator; "*spaceAdded" keeps track of this.
 */
static void appendSeparator(NSMutableString *outstr, BOOL *spaceAdded)
{
  if (*spaceAdded == NO) {
    [outstr appendString: @" "];
    *spaceAdded = YES;
  }
}

/*
 * Reads the text of a <title> element. "pos" is the index of the first
 * byte following the opening tag. When the closing tag is found the text
 * is stored in "metadict" for the key "GSMDItemTitle" and the index of
 * the last byte of the closing tag is returned. If the buffer ends first
 * nothing is stored and -1 is returned.
 */
static int readTitle(const char *inbuf, int pos, int len, NSMutableDictionary *metadict)
{
  NSMutableString *title = [NSMutableString stringWithCapacity: 128];

  while ((pos < len) && (inbuf[pos] == ' ')) {
    pos++;
  }

  while (pos < len) {
    if (hasTagPrefix(inbuf + pos, "</title>")) {
      [metadict setObject: [title makeImmutableCopyOnFail: NO]
                   forKey: @"GSMDItemTitle"];
      return pos + 7;
    }

    if (inbuf[pos] == '&') {
      pos += escapeChar((char *)(inbuf + pos), title);
    } else {
      pos += appendCharacter(inbuf + pos, title);
    }
  }

  return -1;
}

/*
 * Removes the markup from the null terminated buffer "inbuf" and appends
 * the remaining text to "outstr".
 *
 * - The contents of <script> elements are discarded.
 * - The text of a <title> element is not appended to "outstr" but stored
 *   in "metadict" for the key "GSMDItemTitle".
 * - <meta> tags are skipped.
 * - Every tag is replaced by a single space so that the words on its two
 *   sides stay separated.
 * - Tabs and newlines are replaced by spaces, character entities are
 *   decoded by escapeChar() and a '>' found outside a tag is dropped.
 *
 * Tag names are matched without regard to case.
 */
void strip(const char *inbuf, NSMutableString *outstr, NSMutableDictionary *metadict)
{
  BOOL isScript = NO;
  BOOL isMarkup = NO;
  BOOL spaceAdded = NO;
  int len;
  int i;

  if (inbuf == NULL) {
    return;
  }

  len = (int)strlen(inbuf);

  for (i = 0; i < len; i++) {
    char ch = inbuf[i];

    /* the end of the buffer is a possible point of failure:
       a markup or a token that could be cut there is not parsed. */
    if ((i > len - 9) && ((ch == '<') || (ch == '&'))) {
      break;
    }

    /* inside a script everything is ignored up to the closing tag */
    if (isScript) {
      if (hasTagPrefix(inbuf + i, "</script>")) {
        isScript = NO;
        i += 8;   /* the loop increment steps over the final '>' */
      }

      continue;
    }

    /* inside a tag everything is ignored up to the closing '>' */
    if (isMarkup) {
      if (ch == '>') {
        appendSeparator(outstr, &spaceAdded);
        isMarkup = NO;
      }

      continue;
    }

    /* detecting beginning of markup */
    if (ch == '<') {
      if (hasTagPrefix(inbuf + i, "<script")) {
        isScript = YES;
        appendSeparator(outstr, &spaceAdded);
        i += 6;

      } else if (hasTagPrefix(inbuf + i, "<title>")) {
        int last = readTitle(inbuf, i + 7, len, metadict);

        if (last < 0) {
          return;   /* unterminated title: nothing else to parse */
        }

        appendSeparator(outstr, &spaceAdded);
        i = last;

      } else if (hasTagPrefix(inbuf + i, "<meta")) {
        /* TODO - extract metadata from <meta> */
        i += 5;

        while ((i < len) && (inbuf[i] != '>')) {
          i++;
        }

        appendSeparator(outstr, &spaceAdded);

      } else {
        isMarkup = YES;
      }

      continue;
    }

    /* handling text */
    if (ch == '>') {
      continue;
    }

    if ((ch == '\n') || (ch == '\t')) {
      /* replace tabs and eol by spaces */
      [outstr appendString: @" "];
      continue;
    }

    /* both functions report the bytes they used; the loop
       increment accounts for one of them */
    if (ch == '&') {
      i += escapeChar((char *)(inbuf + i), outstr) - 1;
    } else {
      i += appendCharacter(inbuf + i, outstr) - 1;
    }

    spaceAdded = NO;
  }
}

/*
 * Decodes the character entity starting at "buf", which must point to a
 * '&', appends the result to "str" and returns the number of bytes of
 * "buf" that have been used (always at least 1).
 *
 * - The named entities listed in the table below and the numeric
 *   references ("&#233;", "&#xE9;") are replaced by their character.
 * - A well formed entity that is not known is replaced by a space.
 * - If the '&' is not followed by letters, digits or '#' ended by a ';'
 *   within TOKEN_MAX bytes, it is not an entity: a plain '&' is appended
 *   and 1 is returned.
 */
int escapeChar(char *buf, NSMutableString *str)
{
  static const struct {
    const char *name;
    unichar code;
  } entities[] = {
    { "&amp;",    0x26 },
    { "&lt;",     0x3C },
    { "&gt;",     0x3E },
    { "&quot;",   0x22 },
    { "&eacute;", 0xE9 },
    { "&Eacute;", 0xC9 },
    { "&egrave;", 0xE8 },
    { "&Egrave;", 0xC8 },
    { "&ecirc;",  0xEA },
    { "&agrave;", 0xE0 },
    { "&iuml;",   0xEF },
    { "&ccedil;", 0xE7 },
    { "&ntilde;", 0xF1 },
    { "&copy;",   0xA9 },
    { "&reg;",    0xAE },
    { "&deg;",    0xB0 },
    { "&ordm;",   0xBA },
    { "&laquo;",  0xAB },
    { "&raquo;",  0xBB },
    { "&micro;",  0xB5 },
    { "&para;",   0xB6 },
    { "&frac14;", 0xBC },
    { "&frac12;", 0xBD },
    { "&frac34;", 0xBE }
  };
  const unsigned entityCount = sizeof(entities) / sizeof(entities[0]);
  char token[TOKEN_MAX + 1];
  unichar c = 0x20;
  unsigned k;
  int i = 0;

  /* copying token into local buffer; the null terminator is neither
     alphanumeric nor '#', so the copy cannot leave the input buffer */
  while ((i < TOKEN_MAX - 1) && (buf[i] != ';')) {
    BOOL valid;

    if (i == 0) {
      valid = (buf[i] == '&');
    } else {
      valid = (isalnum((unsigned char)buf[i]) || (buf[i] == '#'));
    }

    if (valid == NO) {
      break;
    }

    token[i] = buf[i];
    i++;
  }

  /* if it does not seem to be a token, result is '&' */
  if ((i < 2) || (buf[i] != ';')) {
    [str appendFormat: @"%C", (unichar)0x26];
    return 1;
  }

  token[i] = ';';
  token[i + 1] = '\0';

  /* numeric character reference */
  if (token[1] == '#') {
    const char *digits = token + 2;
    char *end = NULL;
    int base = 10;
    long value;

    if ((*digits == 'x') || (*digits == 'X')) {
      base = 16;
      digits++;
    }

    value = strtol(digits, &end, base);

    if ((end == digits) || (*end != ';')) {
      value = 0;   /* no digits, or something else before the ';' */
    }

    if ((value > 0xFFFF) && (value <= 0x10FFFF)) {
      /* outside the basic plane: a surrogate pair is needed */
      unichar pair[2];

      value -= 0x10000;
      pair[0] = (unichar)(0xD800 + (value >> 10));
      pair[1] = (unichar)(0xDC00 + (value & 0x3FF));
      [str appendString: [NSString stringWithCharacters: pair length: 2]];

    } else if ((value > 0) && (value <= 0xFFFF)
                          && ((value < 0xD800) || (value > 0xDFFF))) {
      [str appendFormat: @"%C", (unichar)value];

    } else {
      [str appendFormat: @"%C", (unichar)0x20];
    }

    return i + 1;
  }

  /* identifying token; an unknown one becomes a space */
  for (k = 0; k < entityCount; k++) {
    if (strcmp(token, entities[k].name) == 0) {
      c = entities[k].code;
      break;
    }
  }

  [str appendFormat: @"%C", c];

  return i + 1;
}