/* PdfExtractor.m
 *
 * Copyright (C) 2006 Free Software Foundation, Inc.
 *
 * Author: Enrico Sersale <enrico@dtedu.net>
 * Date: June 2006
 *
 * This file is part of the GNUstep GWorkspace application
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
24
#include <Foundation/Foundation.h>
#include <AppKit/AppKit.h>
#include <PDFKit/PDFDocument.h>
#include "PdfExtractor.h"
28
/* Upper bound (exclusive) on the -fileSize of a file that
 * -canExtractFromFileType:withExtension:attributes:testData: accepts. */
#define MAXFSIZE 600000
/* NOTE(review): not referenced anywhere in this file - confirm whether it
 * is still needed. */
#define DLENGTH 256
/* Upper bound (exclusive) on the length of a word that is added to the
 * word set built by -extractMetadataAtPath:withID:attributes:. */
#define WORD_MAX 40
32
@implementation PdfExtractor

/* Releases the objects owned by the receiver.  `extractor' is not released
 * here because -initForExtractor: stores it without retaining it. */
- (void)dealloc
{
  RELEASE (extensions);
  RELEASE (skipSet);

  [super dealloc];
}

/* Initializes the receiver for the extractor object `extr'.
 *
 * Builds `skipSet', the union of every character class that is treated as a
 * word separator when tokenizing PDF text (control, illegal, punctuation,
 * symbol, whitespace/newline and decimal-digit characters plus an explicit
 * list of ASCII symbols), and registers "pdf" as the only handled path
 * extension.  `extr' is kept as a non-retained reference.
 */
- (id)initForExtractor:(id)extr
{
  self = [super init];

  if (self) {
    NSArray *separatorSets = [NSArray arrayWithObjects:
      [NSCharacterSet controlCharacterSet],
      [NSCharacterSet illegalCharacterSet],
      [NSCharacterSet punctuationCharacterSet],
      [NSCharacterSet symbolCharacterSet],
      [NSCharacterSet whitespaceAndNewlineCharacterSet],
      [NSCharacterSet decimalDigitCharacterSet],
      [NSCharacterSet characterSetWithCharactersInString: @"+-=<>&@$*%#\"\'^`|~_/\\"],
      nil];
    NSEnumerator *enumerator = [separatorSets objectEnumerator];
    NSCharacterSet *set;

    skipSet = [NSMutableCharacterSet new];

    while ((set = [enumerator nextObject]) != nil) {
      [skipSet formUnionWithCharacterSet: set];
    }

    ASSIGN (extensions, ([NSArray arrayWithObject: @"pdf"]));
    extractor = extr;
  }

  return self;
}

/* Returns the path extensions handled by this extractor. */
- (NSArray *)pathExtensions
{
  return extensions;
}

/* Returns YES when `testdata' is non-nil, the file size found in
 * `attributes' is below MAXFSIZE and `ext' is one of the handled path
 * extensions; NO otherwise.  `type' is not consulted.
 */
- (BOOL)canExtractFromFileType:(NSString *)type
                 withExtension:(NSString *)ext
                    attributes:(NSDictionary *)attributes
                      testData:(NSData *)testdata
{
  if (testdata && ([attributes fileSize] < MAXFSIZE)) {
    return ([extensions containsObject: ext]);
  }

  return NO;
}

/* Splits `contents' into words, using `skipSet' as the separator set, and
 * returns a dictionary with two entries:
 *   @"wset"   - an NSCountedSet of the words whose length is greater than 3
 *               and less than WORD_MAX;
 *   @"wcount" - an NSNumber with the total number of words scanned,
 *               whatever their length.
 */
- (NSMutableDictionary *)wordsDictionaryForText:(NSString *)contents
{
  /* Real signature of -scanUpToCharactersFromSet:intoString:.  The cached
   * IMP must be called through a pointer of this exact type; calling it
   * through the generic IMP type passes arguments and reads the BOOL
   * result with the wrong prototype. */
  typedef BOOL (*ScanUpToFunction)(id, SEL, NSCharacterSet *, NSString **);

  NSScanner *scanner = [NSScanner scannerWithString: contents];
  SEL scanSel = @selector(scanUpToCharactersFromSet:intoString:);
  ScanUpToFunction scanImp
    = (ScanUpToFunction)[scanner methodForSelector: scanSel];
  NSMutableDictionary *wordsDict = [NSMutableDictionary dictionary];
  NSCountedSet *wordset = [[NSCountedSet alloc] initWithCapacity: 1];
  unsigned long wcount = 0;

  [scanner setCharactersToBeSkipped: skipSet];

  while ([scanner isAtEnd] == NO) {
    NSString *word = nil;
    NSUInteger wl;

    /* A failed scan leaves `word' untouched and does not advance the
     * scanner, so stop instead of looping forever on the same location. */
    if (!scanImp(scanner, scanSel, skipSet, &word) || (word == nil)) {
      break;
    }

    wl = [word length];

    if ((wl > 3) && (wl < WORD_MAX)) {
      [wordset addObject: word];
    }

    wcount++;
  }

  [wordsDict setObject: wordset forKey: @"wset"];
  RELEASE (wordset);
  [wordsDict setObject: [NSNumber numberWithUnsignedLong: wcount]
                forKey: @"wcount"];

  return wordsDict;
}

/* Maps the entries of the PDF document info dictionary `info' to metadata
 * attributes and returns them in a new dictionary:
 *   Title    -> GSMDItemTitle
 *   Keywords -> GSMDItemKeywords (description of the array obtained by
 *               splitting the entry on ", ")
 *   Author   -> GSMDItemAuthors (description of a one-element array)
 *   Creator  -> GSMDItemCreator
 *   Producer -> GSMDItemEncodingApplications (description of a one-element
 *               array)
 * Entries missing from `info' are skipped.
 */
- (NSMutableDictionary *)attributesDictionaryForInfo:(NSDictionary *)info
{
  NSMutableDictionary *attrsdict = [NSMutableDictionary dictionary];
  id entry;

  entry = [info objectForKey: @"Title"];
  if (entry) {
    [attrsdict setObject: entry forKey: @"GSMDItemTitle"];
  }

  /* The "Subject" entry is currently not mapped; the disabled code below
   * would have stored it under the same key as the title. */
  //    entry = [info objectForKey: @"Subject"];
  //    if (entry) {
  //      [attrsdict setObject: entry forKey: @"GSMDItemTitle"];
  //    }

  entry = [info objectForKey: @"Keywords"];
  /* `entry' is an untyped dictionary value: make sure it can be split
   * before sending it a string method. */
  if (entry && [entry respondsToSelector: @selector(componentsSeparatedByString:)]) {
    NSArray *words = [entry componentsSeparatedByString: @", "];

    [attrsdict setObject: [words description]
                  forKey: @"GSMDItemKeywords"];
  }

  entry = [info objectForKey: @"Author"];
  if (entry) {
    [attrsdict setObject: [[NSArray arrayWithObject: entry] description]
                  forKey: @"GSMDItemAuthors"];
  }

  entry = [info objectForKey: @"Creator"];
  if (entry) {
    [attrsdict setObject: entry forKey: @"GSMDItemCreator"];
  }

  entry = [info objectForKey: @"Producer"];
  if (entry) {
    [attrsdict setObject: [[NSArray arrayWithObject: entry] description]
                  forKey: @"GSMDItemEncodingApplications"];
  }

  return attrsdict;
}

/* Extracts the metadata of the PDF file at `path' and hands it to the
 * extractor through -setMetadata:forPath:withID:, whose result is returned.
 *
 * When the document loads without error, the metadata dictionary gets a
 * @"words" entry (word statistics of the document text, only if the text
 * is not empty) and an @"attributes" entry (mapped document info, only if
 * the document provides an info dictionary).  When the document cannot be
 * loaded, an empty metadata dictionary is still passed to the extractor.
 * `attributes' is not consulted.
 */
- (BOOL)extractMetadataAtPath:(NSString *)path
                       withID:(int)path_id
                   attributes:(NSDictionary *)attributes
{
  CREATE_AUTORELEASE_POOL(arp);
  NSMutableDictionary *mddict = [NSMutableDictionary dictionary];
  PDFDocument *doc = [PDFDocument documentFromFile: path];
  BOOL success = NO;

  if (doc && [doc isOk] && ([doc errorCode] == 0)) {
    NSString *contents = [doc getAllText];
    NSDictionary *info = [doc getDocumentInfo];

    if (contents && [contents length]) {
      [mddict setObject: [self wordsDictionaryForText: contents]
                 forKey: @"words"];
    }

    if (info) {
      [mddict setObject: [self attributesDictionaryForInfo: info]
                 forKey: @"attributes"];
    }
  }

  /* Everything created above lives in `arp', so the metadata must be
   * handed over before the pool is released. */
  success = [extractor setMetadata: mddict forPath: path withID: path_id];

  RELEASE (arp);

  return success;
}

@end
194
195
196