1 //
2 // Parsable.cc
3 //
4 // Parsable: Base class for file parsers (HTML, PDF, ExternalParser ...)
5 //
6 // Part of the ht://Dig package   <http://www.htdig.org/>
7 // Copyright (c) 1995-2004 The ht://Dig Group
8 // For copyright details, see the file COPYING in your distribution
9 // or the GNU Library General Public License (LGPL) version 2 or later
10 // <http://www.gnu.org/copyleft/lgpl.html>
11 //
12 // $Id: Parsable.cc,v 1.9 2004/05/28 13:15:15 lha Exp $
13 //
14 
15 #ifdef HAVE_CONFIG_H
16 #include "htconfig.h"
17 #endif /* HAVE_CONFIG_H */
18 
19 #include "Parsable.h"
20 #include "htdig.h"
21 #include "defaults.h"
22 
23 
24 //*****************************************************************************
25 // Parsable::Parsable()
26 //
Parsable()27 Parsable::Parsable()
28 {
29 	HtConfiguration* config= HtConfiguration::config();
30     contents = 0;
31     max_head_length = config->Value("max_head_length", 0);
32     max_description_length = config->Value("max_description_length", 50);
33     max_meta_description_length = config->Value("max_meta_description_length", 0);
34 
35     max_keywords = config->Value("max_keywords", -1);
36     if (max_keywords < 0)
37 	max_keywords = (int) ((unsigned int) ~1 >> 1);
38     minimum_word_length = config->Value("minimum_word_length", 3);
39 }
40 
41 
42 //*****************************************************************************
43 // Parsable::~Parsable()
44 //
~Parsable()45 Parsable::~Parsable()
46 {
47     delete contents;
48 }
49 
50 
51 //*****************************************************************************
52 // void Parsable::setContents(char *data, int length)
53 //   This will set the contents of the parsable object.
54 //
55 void
setContents(char * data,int length)56 Parsable::setContents(char *data, int length)
57 {
58     delete contents;
59     contents = new String(data, length);
60 }
61 
62 //*****************************************************************************
63 // void Parsable::addString(char *s, int& wordindex, int slot)
64 //   Add all words in string s in "heading level" slot, incrementing  wordindex
65 //   along the way.  String  s  is corrupted.
66 //
67 void
addString(Retriever & retriever,char * s,int & wordindex,int slot)68 Parsable::addString(Retriever& retriever, char *s, int& wordindex, int slot)
69 {
70     char *w = HtWordToken(s);
71     while (w)
72     {
73 	if (strlen(w) >= minimum_word_length)
74 	    retriever.got_word(w, wordindex++, slot); // slot for img_alt
75 	w = HtWordToken(0);
76     }
77     w = '\0';
78 }
79 
80 //*****************************************************************************
81 // void Parsable::addKeywordString(char *s, int& wordindex)
82 //   Add all words in string  s  as keywords, incrementing  wordindex
83 //   along the way.  String  s  is corrupted.
84 //
85 void
addKeywordString(Retriever & retriever,char * s,int & wordindex)86 Parsable::addKeywordString(Retriever& retriever, char *s, int& wordindex)
87 {
88     char	*w = HtWordToken(s);
89     while (w)
90     {
91 	if (strlen(w) >= minimum_word_length && ++keywordsCount <= max_keywords)
92 	    retriever.got_word(w, wordindex++, 9);
93 	w = HtWordToken(0);
94     }
95     w = '\0';
96 }
97