/* PdfExtractor.m
 *
 * Copyright (C) 2006 Free Software Foundation, Inc.
 *
 * Author: Enrico Sersale <enrico@dtedu.net>
 * Date: June 2006
 *
 * This file is part of the GNUstep GWorkspace application
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include <AppKit/AppKit.h>
#include <PDFKit/PDFDocument.h>
#include "PdfExtractor.h"

/* Files whose size (as reported by -fileSize) is not below this value
 * are refused by -canExtractFromFileType:withExtension:attributes:testData: */
#define MAXFSIZE 600000
/* Not referenced in this file; kept for compatibility. */
#define DLENGTH 256
/* Words of this length or longer are not added to the word set. */
#define WORD_MAX 40

/* Real prototype of -[NSScanner scanUpToCharactersFromSet:intoString:].
 * The cached method implementation is called through this type instead of
 * the generic IMP type, so that the call uses the method's actual argument
 * and return types. */
typedef BOOL (*ScanUpToIMP)(id, SEL, NSCharacterSet *, NSString **);

@implementation PdfExtractor

/* Releases the objects owned by this instance. */
- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);

  [super dealloc];
}

/*
 * Initializes the receiver for the extractor object `extr'.
 *
 * Builds `skipSet' as the union of the control, illegal, punctuation,
 * symbol, whitespace/newline and decimal digit character sets plus a few
 * explicitly listed characters, and sets the handled path extensions
 * to the single entry "pdf".
 *
 * `extr' is stored with a plain assignment; it is not retained here.
 */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    NSCharacterSet *set;

    skipSet = [NSMutableCharacterSet new];

    set = [NSCharacterSet controlCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet illegalCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet punctuationCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet symbolCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet decimalDigitCharacterSet];
    [skipSet formUnionWithCharacterSet: set];

    set = [NSCharacterSet characterSetWithCharactersInString: @"+-=<>&@$*%#\"\'^`|~_/\\"];
    [skipSet formUnionWithCharacterSet: set];

    ASSIGN (extensions, ([NSArray arrayWithObject: @"pdf"]));
    extractor = extr;
  }

  return self;
}

/* Returns the array of path extensions handled by this extractor. */
- (NSArray *)pathExtensions
{
  return extensions;
}

/*
 * Returns YES when `testdata' is not nil, the file size found in
 * `attributes' is below MAXFSIZE and `ext' is one of the handled
 * extensions; returns NO otherwise. `type' is not examined.
 */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  if (testdata && ([attributes fileSize] < MAXFSIZE)) {
    return ([extensions containsObject: ext]);
  }

  return NO;
}

/*
 * Opens the PDF document at `path', collects its words and its document
 * info entries into a metadata dictionary and hands that dictionary to
 * the extractor together with `path' and `path_id'.
 *
 * The metadata dictionary may contain:
 *   "words"      - a dictionary with "wset" (an NSCountedSet of the words
 *                  longer than 3 and shorter than WORD_MAX characters) and
 *                  "wcount" (the number of all the scanned words);
 *   "attributes" - a dictionary filled from the "Title", "Keywords",
 *                  "Author", "Creator" and "Producer" info entries.
 *
 * If the document cannot be opened the extractor still receives the
 * (empty) dictionary. `attributes' is not used.
 *
 * Returns the value returned by the extractor's
 * -setMetadata:forPath:withID:.
 */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  CREATE_AUTORELEASE_POOL(arp);
  NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
  PDFDocument *doc = [PDFDocument documentFromFile: path];
  BOOL success = NO;

  if (doc && [doc isOk] && ([doc errorCode] == 0)) {
    NSString *contents = [doc getAllText];
    NSDictionary *info = [doc getDocumentInfo];

    if (contents && [contents length]) {
      NSScanner *scanner = [NSScanner scannerWithString: contents];
      SEL scanSel = @selector(scanUpToCharactersFromSet:intoString:);
      /* The implementation is looked up once, outside the loop, and
       * called through a pointer that has the method's real signature. */
      ScanUpToIMP scanImp = (ScanUpToIMP)[scanner methodForSelector: scanSel];
      NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
      NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
      unsigned long wcount = 0;

      /* The same set is used both to skip separators and to stop a scan,
       * so each successful scan yields one run of non-separator
       * characters. */
      [scanner setCharactersToBeSkipped: skipSet];

      while ([scanner isAtEnd] == NO) {
        /* Reset on every pass: a failed scan must never leave a stale
         * or uninitialized pointer behind. */
        NSString *word = nil;

        if ((*scanImp)(scanner, scanSel, skipSet, &word) == NO) {
          /* Nothing was scanned, so the scanner may not have advanced;
           * stop here instead of risking an endless loop. */
          break;
        }

        if (word) {
          NSUInteger wl = [word length];

          if ((wl > 3) && (wl < WORD_MAX)) {
            [wordset addObject: word];
          }

          /* Every scanned word is counted, also the ones that are
           * not stored in the set. */
          wcount++;
        }
      }

      [wordsDict setObject: wordset forKey: @"wset"];
      RELEASE (wordset);
      [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                    forKey: @"wcount"];

      [mddict setObject: wordsDict forKey: @"words"];
    }

    if (info) {
      NSMutableDictionary *attrsdict = [NSMutableDictionary dictionary];
      id entry;

      entry = [info objectForKey: @"Title"];
      if (entry) {
        [attrsdict setObject: entry forKey: @"GSMDItemTitle"];
      }

  //    entry = [info objectForKey: @"Subject"];
  //    if (entry) {
  //      [attrsdict setObject: entry forKey: @"GSMDItemTitle"];
  //    }

      entry = [info objectForKey: @"Keywords"];
      if (entry) {
        NSArray *words = [entry componentsSeparatedByString: @", "];

        /* Multi-valued attributes are stored as the -description string
         * of an array. */
        [attrsdict setObject: [words description]
                      forKey: @"GSMDItemKeywords"];
      }

      entry = [info objectForKey: @"Author"];
      if (entry) {
        [attrsdict setObject: [[NSArray arrayWithObject: entry] description]
                      forKey: @"GSMDItemAuthors"];
      }

      entry = [info objectForKey: @"Creator"];
      if (entry) {
        [attrsdict setObject: entry forKey: @"GSMDItemCreator"];
      }

      entry = [info objectForKey: @"Producer"];
      if (entry) {
        [attrsdict setObject: [[NSArray arrayWithObject: entry] description]
                      forKey: @"GSMDItemEncodingApplications"];
      }

      [mddict setObject: attrsdict forKey: @"attributes"];
    }
  }

  /* Must happen before the pool is released: mddict and its contents
   * are autoreleased objects. */
  success = [extractor setMetadata: mddict forPath: path withID: path_id];

  RELEASE (arp);

  return success;
}

@end