1 //
2 // Parsable.cc
3 //
4 // Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...)
5 //
6 // Part of the ht://Dig package <http://www.htdig.org/>
7 // Copyright (c) 1995-2004 The ht://Dig Group
8 // For copyright details, see the file COPYING in your distribution
9 // or the GNU Library General Public License (LGPL) version 2 or later
10 // <http://www.gnu.org/copyleft/lgpl.html>
11 //
12 // $Id: Parsable.cc,v 1.9 2004/05/28 13:15:15 lha Exp $
13 //
14
15 #ifdef HAVE_CONFIG_H
16 #include "htconfig.h"
17 #endif /* HAVE_CONFIG_H */
18
19 #include "Parsable.h"
20 #include "htdig.h"
21 #include "defaults.h"
22
23
24 //*****************************************************************************
25 // Parsable::Parsable()
26 //
Parsable()27 Parsable::Parsable()
28 {
29 HtConfiguration* config= HtConfiguration::config();
30 contents = 0;
31 max_head_length = config->Value("max_head_length", 0);
32 max_description_length = config->Value("max_description_length", 50);
33 max_meta_description_length = config->Value("max_meta_description_length", 0);
34
35 max_keywords = config->Value("max_keywords", -1);
36 if (max_keywords < 0)
37 max_keywords = (int) ((unsigned int) ~1 >> 1);
38 minimum_word_length = config->Value("minimum_word_length", 3);
39 }
40
41
42 //*****************************************************************************
43 // Parsable::~Parsable()
44 //
~Parsable()45 Parsable::~Parsable()
46 {
47 delete contents;
48 }
49
50
51 //*****************************************************************************
52 // void Parsable::setContents(char *data, int length)
53 // This will set the contents of the parsable object.
54 //
55 void
setContents(char * data,int length)56 Parsable::setContents(char *data, int length)
57 {
58 delete contents;
59 contents = new String(data, length);
60 }
61
62 //*****************************************************************************
63 // void Parsable::addString(char *s, int& wordindex, int slot)
64 // Add all words in string s in "heading level" slot, incrementing wordindex
65 // along the way. String s is corrupted.
66 //
67 void
addString(Retriever & retriever,char * s,int & wordindex,int slot)68 Parsable::addString(Retriever& retriever, char *s, int& wordindex, int slot)
69 {
70 char *w = HtWordToken(s);
71 while (w)
72 {
73 if (strlen(w) >= minimum_word_length)
74 retriever.got_word(w, wordindex++, slot); // slot for img_alt
75 w = HtWordToken(0);
76 }
77 w = '\0';
78 }
79
80 //*****************************************************************************
81 // void Parsable::addKeywordString(char *s, int& wordindex)
82 // Add all words in string s as keywords, incrementing wordindex
83 // along the way. String s is corrupted.
84 //
85 void
addKeywordString(Retriever & retriever,char * s,int & wordindex)86 Parsable::addKeywordString(Retriever& retriever, char *s, int& wordindex)
87 {
88 char *w = HtWordToken(s);
89 while (w)
90 {
91 if (strlen(w) >= minimum_word_length && ++keywordsCount <= max_keywords)
92 retriever.got_word(w, wordindex++, 9);
93 w = HtWordToken(0);
94 }
95 w = '\0';
96 }
97