1/* TextExtractor.m
2 *
3 * Copyright (C) 2006 Free Software Foundation, Inc.
4 *
5 * Author: Enrico Sersale <enrico@dtedu.net>
6 * Date: February 2006
7 *
8 * This file is part of the GNUstep GWorkspace application
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 */
24
25#include <AppKit/AppKit.h>
26#include "TextExtractor.h"
27
28#define MAXFSIZE 600000
29#define DLENGTH 256
30#define WORD_MAX 40
31
/*
 * TextExtractor
 *
 * Metadata extractor for plain text files. It splits the contents of a
 * file into words and passes the collected words, together with the total
 * word count, to the object received in -initForExtractor:.
 */
@implementation TextExtractor

/*
 * Releases the objects owned by this instance. "extractor" is not released
 * because -initForExtractor: stores it without retaining it.
 */
- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);
  [super dealloc];
}

/*
 * Initializes the receiver.
 *
 * Builds "skipSet", the set of characters treated as word separators
 * (control, illegal, punctuation, symbol, whitespace/newline and decimal
 * digit characters plus a few explicitly listed ASCII signs), and sets
 * "txt" as the only handled path extension.
 *
 * extr is the object that will later receive
 * -setMetadata:forPath:withID:. It is assigned, not retained.
 */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    NSCharacterSet *set;

    skipSet = [NSMutableCharacterSet new];

    set = [NSCharacterSet controlCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet illegalCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet punctuationCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet symbolCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet decimalDigitCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet characterSetWithCharactersInString: @"+-=<>&@$*%#\"\'^`|~_/\\"];
    [skipSet formUnionWithCharacterSet: set];

    ASSIGN (extensions, [NSArray arrayWithObject: @"txt"]);

    /* not retained; see -dealloc */
    extractor = extr;
  }

  return self;
}

/*
 * Returns the path extensions set up in -initForExtractor: (only "txt").
 */
- (NSArray *)pathExtensions
{
  return extensions;
}

/*
 * Tells whether the file can be handled by this extractor.
 *
 * Returns YES when "testdata" is not nil, the file size found in
 * "attributes" is smaller than MAXFSIZE and "testdata" contains no
 * 0x00 byte; NO otherwise. "type" and "ext" are not examined.
 */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  if (testdata && ([attributes fileSize] < MAXFSIZE)) {
    const char *bytes = (const char *)[testdata bytes];
    /* read the length once and keep the index unsigned like the length */
    NSUInteger length = [testdata length];
    NSUInteger i;

    for (i = 0; i < length; i++) {
      /* a NUL byte is taken as the sign of a non-text file */
      if (bytes[i] == 0x00) {
        return NO;
      }
    }

    return YES;
  }

  return NO;
}

/*
 * Extracts the words of the file at "path" and passes them to the
 * extractor.
 *
 * The metadata dictionary has a single key, "words", whose value is a
 * dictionary with:
 *   "wset"   - an NSCountedSet with the words longer than 3 and shorter
 *              than WORD_MAX characters;
 *   "wcount" - the number of all the scanned words, regardless of their
 *              length.
 *
 * Returns the result of -setMetadata:forPath:withID: or YES when the file
 * could not be read as a string or is empty (nothing is sent to the
 * extractor in that case). "attributes" is not used.
 */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  /* the real signature of -scanUpToCharactersFromSet:intoString: */
  typedef BOOL (*ScanFunction)(id, SEL, NSCharacterSet *, NSString **);
  CREATE_AUTORELEASE_POOL(arp);
  NSString *contents = [NSString stringWithContentsOfFile: path];
  BOOL success = YES;

  if (contents && [contents length]) {
    NSScanner *scanner = [NSScanner scannerWithString: contents];
    SEL scanSel = @selector(scanUpToCharactersFromSet:intoString:);
    /*
     * The method implementation is cached because it is called once per
     * word. It must be called through a pointer with the exact prototype
     * of the method, not through the generic IMP type.
     */
    ScanFunction scanImp = (ScanFunction)[scanner methodForSelector: scanSel];
    NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
    NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
    NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
    unsigned long wcount = 0;
    NSString *word;

    /* the separators are both skipped and used as scan terminators */
    [scanner setCharactersToBeSkipped: skipSet];

    while ([scanner isAtEnd] == NO) {
      /* never look at the word left by a previous iteration */
      word = nil;

      if (!(*scanImp)(scanner, scanSel, skipSet, &word)) {
        /*
         * Nothing was scanned, so the scanner did not advance;
         * stop here instead of looping forever.
         */
        break;
      }

      if (word) {
        unsigned wl = [word length];

        if ((wl > 3) && (wl < WORD_MAX)) {
          [wordset addObject: word];
        }

        wcount++;
      }
    }

    [wordsDict setObject: wordset forKey: @"wset"];
    [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                  forKey: @"wcount"];

    [mddict setObject: wordsDict forKey: @"words"];

    success = [extractor setMetadata: mddict forPath: path withID: path_id];

    RELEASE (wordset);
  }

  RELEASE (arp);

  return success;
}

@end
157
158
159
160
161
162
163
164
165