1/* HtmlExtractor.m
2 *
3 * Copyright (C) 2006 Free Software Foundation, Inc.
4 *
5 * Author: Enrico Sersale <enrico@dtedu.net>
6 * Date: May 2006
7 *
8 * This file is part of the GNUstep GWorkspace application
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <AppKit/AppKit.h>
#include "HtmlExtractor.h"
27
/* -canExtractFromFileType:... only accepts a file whose reported size
 * is strictly below this limit. */
#define MAXFSIZE 600000
/* Not referenced anywhere in this file. */
#define DLENGTH 256
/* Exclusive upper bound on the length of a word that is added to the
 * counted word set in -extractMetadataAtPath:... */
#define WORD_MAX 40

/* Walks the NUL-terminated buffer inbuf, appends the text found outside
 * markup and scripts to outstr and stores the document title in metadict
 * under the key "GSMDItemTitle". */
void strip(const char *inbuf, NSMutableString *outstr, NSMutableDictionary *metadict);
/* Handles the '&' sequence starting at buf: appends its replacement to
 * str and returns the number of bytes of buf the caller has to skip. */
int escapeChar(char *buf, NSMutableString *str);
34
35
@implementation HtmlExtractor

/* Releases the objects owned by the receiver.
 * extractor is not released here because -initForExtractor: stores it
 * without retaining it. */
- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);

  [super dealloc];
}

/* Initializes the receiver for the extractor extr.
 *
 * Builds skipSet, the set of characters that separate words when the
 * stripped text is scanned (control, illegal, punctuation, symbol,
 * whitespace/newline and decimal digit characters plus a few explicit
 * ASCII signs), and the list of handled path extensions.
 * extr is stored as a plain, non retained reference. */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    skipSet = [NSMutableCharacterSet new];

    [skipSet formUnionWithCharacterSet: [NSCharacterSet controlCharacterSet]];
    [skipSet formUnionWithCharacterSet: [NSCharacterSet illegalCharacterSet]];
    [skipSet formUnionWithCharacterSet: [NSCharacterSet punctuationCharacterSet]];
    [skipSet formUnionWithCharacterSet: [NSCharacterSet symbolCharacterSet]];
    [skipSet formUnionWithCharacterSet: [NSCharacterSet whitespaceAndNewlineCharacterSet]];
    [skipSet formUnionWithCharacterSet: [NSCharacterSet decimalDigitCharacterSet]];
    [skipSet formUnionWithCharacterSet:
        [NSCharacterSet characterSetWithCharactersInString: @"+-=<>&@$*%#\"\'^`|~_/\\"]];

    ASSIGN (extensions, ([NSArray arrayWithObjects: @"html", @"htm", nil]));
    extractor = extr;
  }

  return self;
}

/* Returns the path extensions handled by this extractor ("html", "htm"). */
- (NSArray *)pathExtensions
{
  return extensions;
}

/* Tells whether this extractor accepts a file.
 *
 * Returns YES only if testdata is not nil, the size reported by
 * attributes is below MAXFSIZE, testdata contains no NUL byte (which
 * would mean that the file is not plain text) and ext is one of the
 * handled extensions. type is not consulted. */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  if (testdata && ([attributes fileSize] < MAXFSIZE)) {
    size_t length = [testdata length];

    /* memchr() must not be handed the bytes of an empty data object */
    if (length && (memchr([testdata bytes], 0x00, length) != NULL)) {
      return NO;
    }

    return ([extensions containsObject: ext]);
  }

  return NO;
}

/* Extracts the metadata of the file at path and hands it to the
 * extractor together with path and path_id.
 *
 * When the file has usable contents the metadata dictionary contains:
 *   "words"      -> { "wset": NSCountedSet of the words longer than 3 and
 *                     shorter than WORD_MAX characters,
 *                     "wcount": number of all the scanned words }
 *   "attributes" -> the dictionary filled in by strip()
 * Otherwise an empty dictionary is passed on.
 *
 * Returns the result of -setMetadata:forPath:withID: sent to the
 * extractor. attributes is not consulted. */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  CREATE_AUTORELEASE_POOL(arp);
  NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
  NSString *contents = [NSString stringWithContentsOfFile: path];
  /* strip() calls strlen() on its input, so it must never receive NULL */
  const char *inbuf = ([contents length] ? [contents UTF8String] : NULL);
  BOOL success = NO;

  if (inbuf != NULL) {
    NSMutableString *stripped = [NSMutableString stringWithCapacity: [contents length]];
    NSMutableDictionary *attrsdict = [NSMutableDictionary dictionary];

    strip(inbuf, stripped, attrsdict);

    if ([stripped length]) {
      /* The scan method is called through a cached pointer because it
       * runs once per word; the pointer is given the real signature of
       * the method so that its BOOL result is read correctly. */
      typedef BOOL (*ScanWordIMP)(id, SEL, NSCharacterSet *, NSString **);
      NSScanner *scanner = [NSScanner scannerWithString: stripped];
      SEL scanSel = @selector(scanUpToCharactersFromSet:intoString:);
      ScanWordIMP scanImp = (ScanWordIMP)[scanner methodForSelector: scanSel];
      NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
      NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
      unsigned long wcount = 0;

      [scanner setCharactersToBeSkipped: skipSet];

      while ([scanner isAtEnd] == NO) {
        /* reset for every scan: the method leaves it untouched on failure */
        NSString *word = nil;

        if ((*scanImp)(scanner, scanSel, skipSet, &word) == NO) {
          /* nothing was consumed; going on would never terminate */
          break;
        }

        if (word) {
          unsigned wl = [word length];

          if ((wl > 3) && (wl < WORD_MAX)) {
            [wordset addObject: word];
          }

          wcount++;
        }
      }

      [wordsDict setObject: wordset forKey: @"wset"];
      [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                    forKey: @"wcount"];

      [mddict setObject: wordsDict forKey: @"words"];
      [mddict setObject: attrsdict forKey: @"attributes"];

      RELEASE (wordset);
    }
  }

  success = [extractor setMetadata: mddict forPath: path withID: path_id];

  RELEASE (arp);

  return success;
}

@end
170
171
/* Appends to str the UTF-8 encoded character that starts at p and returns
 * the number of bytes it occupies (always >= 1).
 * Plain ASCII bytes are appended as they are. A truncated or otherwise
 * malformed sequence is dropped and only its valid prefix is consumed. */
static int appendUTF8Char(const char *p, NSMutableString *str)
{
  unsigned char lead = (unsigned char)p[0];
  char seq[5];
  int want = 1;
  int n;

  if (lead < 0x80) {
    [str appendFormat: @"%c", p[0]];
    return 1;
  }

  if ((lead & 0xE0) == 0xC0) {
    want = 2;
  } else if ((lead & 0xF0) == 0xE0) {
    want = 3;
  } else if ((lead & 0xF8) == 0xF0) {
    want = 4;
  }

  /* collect the continuation bytes; a NUL is not one, so this never
   * runs past the end of the buffer */
  seq[0] = p[0];
  for (n = 1; (n < want) && ((((unsigned char)p[n]) & 0xC0) == 0x80); n++) {
    seq[n] = p[n];
  }
  seq[n] = '\0';

  if ((want > 1) && (n == want)) {
    NSString *s = [NSString stringWithUTF8String: seq];

    if (s) {
      [str appendString: s];
    }
  }

  return n;
}

/* Extracts the readable text of an HTML document.
 *
 * inbuf    - NUL-terminated UTF-8 buffer holding the document.
 * outstr   - receives the text found outside markup and scripts; tags,
 *            tabs and newlines are replaced by spaces and every '&'
 *            sequence is handed to escapeChar().
 * metadict - receives the text of the <title> element under the key
 *            "GSMDItemTitle". The contents of <meta> tags are skipped.
 *
 * Tag names are matched without regard to case. The last bytes of the
 * buffer are handled conservatively: a '<' or '&' found in the last 8
 * bytes ends the parsing and the very last byte is never emitted. */
void strip(const char *inbuf, NSMutableString *outstr, NSMutableDictionary *metadict)
{
  int len = (int)strlen(inbuf);
  BOOL isScript = NO;
  BOOL isMarkup = NO;
  BOOL isMeta = NO;
  BOOL isTitle = NO;
  BOOL spaceAdded = NO;
  int offset;
  int i;

/* leaves the function as soon as the position reaches the last byte */
#define CHK_POS(x, l) \
do { \
  if (x >= (l - 1)) return; \
} while (0)

  for (i = 0; i < len; i++) {
    /* end of buffer are possible points of failure
      if a markup or a token is cut, it will not be parsed. */
    if ((i > len - 9) && ((inbuf[i] == '<') || (inbuf[i] == '&'))) {
      break;
    }

    /* detecting end of script; the opening tag is accepted in any case,
       so the closing one must be too */
    if (isScript && (strncasecmp(inbuf + i, "</script>", 9) == 0)) {
      isScript = NO;
      i += 9;
    }

    /* detecting new paragraph */
    if ((isScript == NO) && (strncmp(inbuf + i, "<p", 2) == 0)) {
      i += 2;

      while (inbuf[i] != '>') {
        i++;
        CHK_POS (i, len);
      }

      /* a paragraph boundary separates words like any other tag */
      if (spaceAdded == NO) {
        [outstr appendString: @" "];
        spaceAdded = YES;
      }
    }

    /* detecting beginning of markup */
    if ((isScript == NO) && (isMarkup == NO) && (inbuf[i] == '<')) {
      if (strncasecmp(inbuf + i, "<script", 7) == 0) {
        /* detecting begining of script */
        isScript = YES;
        i += 7;

      } else if (strncasecmp(inbuf + i, "<title>", 7) == 0) {
        isMeta = YES;
        isTitle = YES;
        i += 7;

      } else if (strncasecmp(inbuf + i, "<meta", 5) == 0) {
        isMeta = YES;
        i += 5;

      } else {
        isMarkup = YES;
      }
    }

    CHK_POS (i, len);

    /* get metadata value */
    if ((isScript == NO) && isMeta) {
      NSMutableString *mdbuff = [NSMutableString stringWithCapacity: 128];
      /* a <meta> tag ends at its '>' whether or not it is written in
         the self closing "/>" form */
      const char *endstr = (isTitle ? "</title>" : ">");
      int endlen = (int)strlen(endstr);

      while (inbuf[i] == ' ') {
        i++;
        CHK_POS (i, len);
      }

      while (strncasecmp(inbuf + i, endstr, endlen) != 0) {
        if (inbuf[i] == '&') {
          offset = escapeChar((char *)(inbuf + i), mdbuff);
          /* always make progress, whatever the helper returns */
          i += ((offset < 1) ? 1 : offset);
        } else {
          i += appendUTF8Char(inbuf + i, mdbuff);
        }

        CHK_POS (i, len);
      }

      if (isTitle) {
        /* store an immutable snapshot of the collected title */
        [metadict setObject: [NSString stringWithString: mdbuff]
                     forKey: @"GSMDItemTitle"];
      }
      /* TODO - extract metadata from <meta> */

      /* stop on the last byte of the terminator: the i++ of the loop
         then moves to the first byte that follows it, so that this
         byte is not lost */
      i += (endlen - 1);

      isTitle = NO;
      isMeta = NO;
      continue;
    }

    /* detecting end of markup */
    if ((isScript == NO) && isMarkup && (inbuf[i] == '>')) {
      if (spaceAdded == NO) {
        [outstr appendString: @" "];
        spaceAdded = YES;
      }

      isMarkup = NO;
    }

    CHK_POS (i, len);

    /* handling text */
    if ((isScript == NO) && (isMarkup == NO) && (inbuf[i] != '>')) {
      if ((inbuf[i] != '\n') && (inbuf[i] != '\t')) {
        if (inbuf[i] == '&') {
          offset = escapeChar((char *)(inbuf + i), outstr);

          if (offset < 1) {
            offset = 1;
          }

          /* the i++ of the loop accounts for the remaining byte */
          i += (offset - 1);
          CHK_POS (i, len);

        } else {
          /* a multi-byte character is appended as a whole */
          i += (appendUTF8Char(inbuf + i, outstr) - 1);
        }

        spaceAdded = NO;

      } else {
        /* replace tabs and eol by spaces */
        [outstr appendString: @" "];
      }
    }
  }

#undef CHK_POS
}
324
/* Decodes the HTML character reference that starts at buf.
 *
 * buf - points at the '&' that introduces the reference; the buffer must
 *       be NUL-terminated.
 * str - receives the decoded text.
 *
 * A reference is '&', then a name made of letters and digits or '#'
 * followed by a decimal or 'x'-prefixed hexadecimal number, then ';',
 * with the ';' found at most 9 bytes after the '&'.
 *   - known named reference   -> the matching character
 *   - valid numeric reference -> the character with that code point
 *   - any other reference     -> a single space
 *   - not a reference at all  -> a literal '&'
 *
 * Returns the number of bytes of buf that have been consumed: the full
 * length of the reference, or 1 when only the '&' has been taken. */
int escapeChar(char *buf, NSMutableString *str)
{
  static const struct {
    const char *name;
    unichar code;
  } entities[] = {
    { "&amp;",    0x26 }, { "&lt;",     0x3C }, { "&gt;",     0x3E },
    { "&quot;",   0x22 }, { "&eacute;", 0xE9 }, { "&Eacute;", 0xC9 },
    { "&egrave;", 0xE8 }, { "&Egrave;", 0xC8 }, { "&ecirc;",  0xEA },
    { "&agrave;", 0xE0 }, { "&iuml;",   0xEF }, { "&ccedil;", 0xE7 },
    { "&ntilde;", 0xF1 }, { "&copy;",   0xA9 }, { "&reg;",    0xAE },
    { "&deg;",    0xB0 }, { "&ordm;",   0xBA }, { "&laquo;",  0xAB },
    { "&raquo;",  0xBB }, { "&micro;",  0xB5 }, { "&para;",   0xB6 },
    { "&frac14;", 0xBC }, { "&frac12;", 0xBD }, { "&frac34;", 0xBE }
  };
  char token[11];   /* up to 9 bytes, the ';' and the terminating NUL */
  unichar c = 0x20; /* replacement used for an unknown reference */
  unsigned k;
  int i = 0;

  /* copying token into local buffer; only the bytes that can belong to
     a reference are accepted, so that a '&' used in running text
     ("AT&T rocks; ...") is not mistaken for one */
  while (i <= 8) {
    unsigned char ch = (unsigned char)buf[i];

    if ((ch == ';') || (ch == '\0')) {
      break;
    }

    if ((i > 0) && (isalnum(ch) == 0) && ((i != 1) || (ch != '#'))) {
      break;
    }

    token[i] = buf[i];
    i++;
  }

  /* if it does not seem to be a token, result is '&' */
  if ((i < 2) || (buf[i] != ';')) {
    [str appendFormat: @"%C", (unichar)0x26];
    return 1;
  }

  token[i] = ';';
  token[i + 1] = '\0';

  if (token[1] == '#') {
    /* numeric reference */
    const char *digits = token + 2;
    char *end = NULL;
    int base = 10;
    long code;

    if ((*digits == 'x') || (*digits == 'X')) {
      base = 16;
      digits++;
    }

    code = strtol(digits, &end, base);

    if ((end != digits) && (*end == ';') && (code > 0) && (code <= 0x10FFFF)
                                    && ((code < 0xD800) || (code > 0xDFFF))) {
      unichar chars[2];
      unsigned count = 1;

      if (code > 0xFFFF) {
        /* outside the BMP: encode as a UTF-16 surrogate pair */
        code -= 0x10000;
        chars[0] = (unichar)(0xD800 + (code >> 10));
        chars[1] = (unichar)(0xDC00 + (code & 0x3FF));
        count = 2;
      } else {
        chars[0] = (unichar)code;
      }

      [str appendString: [NSString stringWithCharacters: chars length: count]];

      return (i + 1);
    }

  } else {
    /* identifying token */
    for (k = 0; k < (sizeof(entities) / sizeof(entities[0])); k++) {
      if (strcmp(token, entities[k].name) == 0) {
        c = entities[k].code;
        break;
      }
    }
  }

  [str appendFormat: @"%C", c];

  return (i + 1);
}
433
434
435
436
437
438
439