/* TextExtractor.m
 *
 * Copyright (C) 2006 Free Software Foundation, Inc.
 *
 * Author: Enrico Sersale <enrico@dtedu.net>
 * Date: February 2006
 *
 * This file is part of the GNUstep GWorkspace application
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include <AppKit/AppKit.h>
#include "TextExtractor.h"

#define MAXFSIZE 600000
#define DLENGTH 256
#define WORD_MAX 40

/*
 * Exact C signature of -[NSScanner scanUpToCharactersFromSet:intoString:].
 * The cached implementation is invoked through this type so that the
 * arguments and the BOOL return value are passed with the right ABI
 * (a bare IMP is declared as returning id).
 */
typedef BOOL (*TEScanUpToImp)(id, SEL, NSCharacterSet *, NSString **);

/*
 * Metadata extractor for plain text files: splits the file contents into
 * words and hands a counted word set plus the total word count to the
 * owning extractor object.
 */
@implementation TextExtractor

- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);
  [super dealloc];
}

/*
 * Initializes the receiver for the extractor object `extr`.
 *
 * Builds `skipSet`, the set of characters treated as word separators
 * (control, illegal, punctuation, symbol, whitespace/newline and decimal
 * digit characters plus a few explicit ASCII signs), and registers "txt"
 * as the only handled path extension. `extr` is stored without being
 * retained.
 */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    NSCharacterSet *set;

    skipSet = [NSMutableCharacterSet new];

    set = [NSCharacterSet controlCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet illegalCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet punctuationCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet symbolCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet decimalDigitCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet characterSetWithCharactersInString: @"+-=<>&@$*%#\"\'^`|~_/\\"];
    [skipSet formUnionWithCharacterSet: set];

    ASSIGN (extensions, [NSArray arrayWithObject: @"txt"]);

    extractor = extr;
  }

  return self;
}

/*
 * Returns the path extensions handled by this extractor.
 */
- (NSArray *)pathExtensions
{
  return extensions;
}

/*
 * Tells whether the file looks like text this extractor can handle.
 *
 * Returns YES only when `testdata` is not nil, the file size found in
 * `attributes` is below MAXFSIZE and `testdata` contains no zero byte.
 * `type` and `ext` are not consulted.
 */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  const char *bytes;
  NSUInteger length;
  NSUInteger i;

  if ((testdata == nil) || ([attributes fileSize] >= MAXFSIZE)) {
    return NO;
  }

  bytes = (const char *)[testdata bytes];
  length = [testdata length];

  for (i = 0; i < length; i++) {
    /* a zero byte means the sample is binary data, not text */
    if (bytes[i] == 0x00) {
      return NO;
    }
  }

  return YES;
}

/*
 * Reads the file at `path`, splits it into words and passes the result to
 * the extractor with -setMetadata:forPath:withID:.
 *
 * The metadata dictionary has the form
 *   { "words" = { "wset" = <NSCountedSet>, "wcount" = <NSNumber> } }
 * where "wset" holds the words longer than 3 and shorter than WORD_MAX
 * characters and "wcount" is the number of all the words scanned.
 *
 * Returns the value returned by the extractor, or YES when the file could
 * not be read or is empty (in that case nothing is sent to the extractor).
 * `attributes` is not consulted.
 */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  CREATE_AUTORELEASE_POOL(arp);
  NSString *contents = [NSString stringWithContentsOfFile: path];
  BOOL success = YES;

  if (contents && [contents length]) {
    NSScanner *scanner = [NSScanner scannerWithString: contents];
    SEL scanSel = @selector(scanUpToCharactersFromSet:intoString:);
    /* cache the implementation: it is called once per word */
    TEScanUpToImp scanImp = (TEScanUpToImp)[scanner methodForSelector: scanSel];
    NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
    NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
    NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
    unsigned long wcount = 0;

    /* the separators are both skipped and used as the stop set */
    [scanner setCharactersToBeSkipped: skipSet];

    while ([scanner isAtEnd] == NO) {
      /* reset on every pass so a previous word is never seen again */
      NSString *word = nil;

      if ((*scanImp)(scanner, scanSel, skipSet, &word) == NO) {
        /* nothing was consumed: stop instead of spinning on the same spot */
        break;
      }

      if (word) {
        unsigned wl = [word length];

        if ((wl > 3) && (wl < WORD_MAX)) {
          [wordset addObject: word];
        }

        wcount++;
      }
    }

    [wordsDict setObject: wordset forKey: @"wset"];
    [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                  forKey: @"wcount"];

    [mddict setObject: wordsDict forKey: @"words"];

    success = [extractor setMetadata: mddict forPath: path withID: path_id];

    RELEASE (wordset);
  }

  RELEASE (arp);

  return success;
}

@end