1 { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
2 
3   This library is free software; you can redistribute it and/or modify it
4   under the terms of the GNU Library General Public License as published by
5   the Free Software Foundation; either version 2 of the License, or (at your
6   option) any later version.
7 
8   This program is distributed in the hope that it will be useful, but WITHOUT
9   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10   FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
11   for more details.
12 
13   You should have received a copy of the GNU Library General Public License
14   along with this library; if not, write to the Free Software Foundation,
15   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 }
17 {
18   See the file COPYING.FPC, included in this distribution,
19   for details about the copyright.
20 }
21 unit HTMLIndexer;
22 {$MODE OBJFPC}{$H+}
23 interface
24 uses Classes, SysUtils, FastHTMLParser,{$ifdef userb}fos_redblacktree_gen{$else}avl_tree{$endif};
25 
26 Type
27 
28   { TIndexDocument }
29   TIndexDocument = class(TObject)
30   private
31     FDocumentIndex: Integer;
32     FLastEntry : Integer;
33     WordIndex: array of Integer;
getindexentriesnull34     function getindexentries:integer;
35   public
GetWordIndexnull36     function GetWordIndex(i:integer):integer; inline;
37     procedure AddWordIndex(AIndex: Integer);
38     constructor Create(ADocumentIndex: Integer);
39     property DocumentIndex: Integer read FDocumentIndex;
40     property IndexEntry[i:integer] : Integer read GetWordIndex;
41     property NumberofIndexEntries : integer read getindexentries;
42   end;
43 
44   { TIndexedWord }
45   TIndexedWord = class(TObject)
46   private
47     FIsTitle: Boolean;
48     FTheWord: string;
49     FCachedTopic: TIndexDocument;
50     FDocuments: Array of TIndexDocument;
GetDocumentnull51     function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
GetDocumentCountnull52     function GetDocumentCount: Integer;
53   public
54     constructor Create(AWord: String; AIsTitle: Boolean);
55     destructor Destroy; override;
GetLogicalDocumentnull56     function GetLogicalDocument(AIndex: Integer): TIndexDocument;
57     property TheWord: string read FTheWord write ftheword; // Always lowercase
58     property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
59     property DocumentCount: Integer read GetDocumentCount;
60     property IsTitle: Boolean read FIsTitle write fistitle;
61   end;
62 
63   { TIndexedWordList }
64 
65   {$ifdef userb}
66   TRBIndexTree = specialize TGFOS_RBTree<String,TIndexedWord>;
67   {$endif}
68 
69   TForEachMethod = procedure (AWord:TIndexedWord) of object;
70   TForEachProcedure = Procedure (AWord:TIndexedWord;state:pointer);
71   TIndexedWordList = class(TObject)
72   private
73     FIndexTitlesOnly: Boolean;
74     FIndexedFileCount: DWord;
75     //vars while processing page
76     FInTitle,
77     FInBody: Boolean;
78     FWordCount: Integer; // only words in body
79     FDocTitle: String;
80     FTopicIndex: Integer;
81     //end vars
82     FTotalDifferentWordLength: DWord;
83     FTotalDIfferentWords: DWord;
84     FTotalWordCount: DWord;
85     FTotalWordLength: DWord;
86     FLongestWord: DWord;
87     FParser: THTMLParser;
88     {$ifdef userb}
89     FAVLTree : TRBIndexTree;
90     {$else}
91     FAVLTree : TAVLTree;
92     Spare :TIndexedWord;
93     {$endif}
94 
AddGetWordnull95     function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
96     // callbacks
97     procedure CBFoundTag(NoCaseTag, ActualTag: string);
98     procedure CBFountText(Text: string);
99 
100     procedure EatWords(Words: String; IsTitle: Boolean);
101   public
102     constructor Create;
103     destructor  Destroy; override;
IndexFilenull104     function  IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
105     procedure Clear;
106     procedure AddWord(const AWord: TIndexedWord);
107     procedure ForEach(Proc:TForEachMethod);
108     procedure ForEach(Proc:TForEachProcedure;state:pointer);
109     property IndexedFileCount: DWord read FIndexedFileCount;
110     property LongestWord: DWord read FLongestWord;
111     property TotalWordCount: DWord read FTotalWordCount;
112     property TotalDIfferentWords: DWord read FTotalDIfferentWords;
113     property TotalWordLength: DWord read FTotalWordLength;
114     property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
115     property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
116   end;
117 
118 implementation
119 
120 Const GrowSpeed = 10;
121 
Maxnull122 function Max(ANumber, BNumber: DWord): DWord;
123 begin
124   if ANumber > BNumber then
125     Result := ANumber
126   else
127     Result := BNumber;
128 end;
129 
130 const titlexlat : array [boolean] of char = ('0','1');
131 
makekeynull132 function  makekey( n : string;istitle:boolean):string; inline;
133 
134 begin
135    result:=n+'___'+titlexlat[istitle];
136 end;
137 
CompareProcObjnull138 Function CompareProcObj(Node1, Node2: Pointer): integer;
139 var n1,n2 : TIndexedWord;
140 begin
141   n1:=TIndexedWord(Node1); n2:=TIndexedWord(Node2);
142   Result := CompareText(n1.theword, n2.theword);
143   if Result = 0 then
144   begin
145     Result := ord(n2.IsTitle)-ord(n1.IsTitle);
146   end;
147   if Result < 0 then Result := -1
148   else if Result > 0 then Result := 1;
149 end;
150 
151 { TIndexedWordList }
TIndexedWordList.AddGetWordnull152 function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
153 var
154 {$ifdef userb}
155    key : string;
156 {$else}
157    n : TAVLTreeNode;
158 {$endif}
159 begin
160   Result := nil;
161   AWord := LowerCase(AWord);
162  {$ifdef userb}
163    key:=makekey(aword,istitle);
164    if not favltree.Find(key,result) then result:=nil;;
165   {$else}
166   if not assigned(spare) then
167     spare:=TIndexedWord.Create(AWord,IsTitle)
168   else
169     begin
170       spare.TheWord:=aword;
171       spare.IsTitle:=IsTitle;
172     end;
173 
174   n:=favltree.FindKey(Spare,@CompareProcObj);
175   if assigned(n) then
176    result:=TIndexedWord(n.Data);
177   {$endif}
178 
179   if Result = nil then
180   begin
181     Inc(FTotalDifferentWordLength, Length(AWord));
182     Inc(FTotalDIfferentWords);
183     {$ifdef  userb}
184       result:=TIndexedWord.Create(AWord,IsTitle);
185       favltree.add(key,result);
186     {$else}
187     Result := spare; // TIndexedWord.Create(AWord,IsTitle);
188     spare:=nil;
189     AddWord(Result);
190     {$endif}
191 
192     //  if IsTitle then
193     //WriteLn('Creating word: ', AWord);
194     FLongestWord := Max(FLongestWord, Length(AWord));
195   end;
196   Inc(FTotalWordLength, Length(AWord));
197   Inc(FTotalWordCount);
198 end;
199 
200 procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
201 begin
202   if FInBody then begin
203     if NoCaseTag = '</BODY>' then FInBody := False;
204   end
205   else begin
206     //WriteLn('"',NoCaseTag,'"');
207     if NoCaseTag      = '<TITLE>' then FInTitle := True
208     else if NoCaseTag = '</TITLE>' then FInTitle := False
209     else if NoCaseTag = '<BODY>' then FInBody := True
210     else
211   end;
212   if FInBody and FIndexTitlesOnly then FParser.Done := True;
213 end;
214 
215 procedure TIndexedWordList.CBFountText(Text: string);
216 begin
217   if Length(Text) < 1 then
218     Exit;
219 
220   if (not FInTitle) and (not FInBody) then
221     Exit;
222 
223   EatWords(Text, FInTitle and not FInBody);
224 end;
225 
226 procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
227 var
228   WordPtr: PChar;
229   WordStart: PChar;
230   InWord: Boolean;
231   IsNumberWord: Boolean;
IsEndOfWordnull232   function IsEndOfWord: Boolean;
233   begin
234     Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
235     if  Result and IsNumberWord then
236       Result :=  Result and (WordPtr[0] <> '.');
237     if Result and InWord then
238       Result := Result and (WordPtr[0] <> '''');
239   ;
240   end;
241   var
242     WordIndex: TIndexedWord;
243     WordName: String;
244     FPos: Integer;
245 begin
246   if IsTitle then
247     FDocTitle := Words;
248   Words := LowerCase(Words);
249   WordStart := PChar(Words);
250   WordPtr := WordStart;
251   IsNumberWord := False;
252   InWord := False;
253   repeat
254     if InWord and IsEndOfWord then
255     begin
256       WordName := Copy(WordStart, 0, (WordPtr-WordStart));
257       FPos := Pos('''', WordName);
258       while FPos > 0 do
259       begin
260         Delete(WordName, FPos, 1);
261         FPos := Pos('''', WordName);
262       end;
263       WordIndex := addgetword(wordname,istitle);
264       InWord := False;
265       IsNumberWord := False;
266       WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
267       //if not IsTitle then
268         Inc(FWordCount);
269 
270     end
271     else if not InWord and not IsEndOfWord then
272     begin
273       InWord := True;
274       WordStart := WordPtr;
275       IsNumberWord := WordPtr^ in ['0'..'9'];
276     end;
277     Inc(WordPtr);
278   until WordPtr^ = #0;
279 
280   if InWord then
281   begin
282     WordName := Copy(WordStart, 0, (WordPtr-WordStart));
283     try
284     WordIndex := addgetword(wordname,istitle); // Self.Words[WordName, IsTitle];
285     except on e:exception do writeln('Error: ', wordname); end;
286     WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
287     InWord := False;
288     //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
289     IsNumberWord := False;
290     //WriteLn(FWordCount, ' "', WordName,'"');
291     if not IsTitle then
292       Inc(FWordCount);
293   end;
294 end;
295 
defaultindexedwordnull296 function defaultindexedword : TIndexedWord;
297 
298 begin
299   result:=Tindexedword.create('',false);
300 end;
301 
302 constructor TIndexedWordList.Create;
303 begin
304   inherited;
305   {$ifdef userb}
306   FAVLTree :=TRBIndexTree.create(@default_rb_string_compare,
307                                  @defaultindexedword,
308                                  @default_rb_string_undef );
309   {$else}
310   favltree:=TAVLTree.Create(@CompareProcObj);
311   spare:=nil;
312   {$endif}
313 end;
314 
315 procedure FreeObject(const Obj:TIndexedWord);
316 begin
317  obj.free;
318 end;
319 
320 
321 destructor TIndexedWordList.Destroy;
322 begin
323   clear;
324   {$ifndef userb}
325   if assigned(spare) then spare.free;
326   {$endif}
327   favltree.free;
328   inherited Destroy;
329 end;
330 
TIndexedWordList.IndexFilenull331 function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
332 var
333   TheFile: String;
334 begin
335   FInBody := False;
336   FInTitle:= False;
337   FIndexTitlesOnly := AIndexOnlyTitles;
338   FWordCount := 0;
339   FTopicIndex := ATOPICIndex;
340   FIndexedFileCount := FIndexedFileCount +1;
341 
342   SetLength(TheFile, AStream.Size+1);
343   AStream.Position := 0;
344   AStream.Read(TheFile[1], AStream.Size);
345   TheFile[Length(TheFile)] := #0;
346 
347   FParser := THTMLParser.Create(@TheFile[1]);
348   FParser.OnFoundTag := @CBFoundTag;
349   FParser.OnFoundText := @CBFountText;
350   FParser.Exec;
351   FParser.Free;
352 
353   Result := FDocTitle;
354   FDocTitle := '';
355   FInBody := False;
356   FInTitle:= False;
357   FWordCount := 0;
358   FTopicIndex := -1;
359 
360   AStream.Position := 0;
361 end;
362 
363 procedure TIndexedWordList.Clear;
364 begin
365   {$ifdef userb}
366    fAvlTree.ClearN(@FreeObject);
367   {$else}
368   fAvlTree.FreeAndClear;
369   {$endif}
370 end;
371 
372 procedure TIndexedWordList.AddWord(const AWord: TIndexedWord);
373 begin
374  {$ifdef userb}
375   favltree.add(makekey(aword.theword,aword.istitle),AWord);
376  {$else}
377   favltree.add(aword);
378  {$endif}
379 end;
380 
381 procedure TIndexedWordList.ForEach(Proc:TForEachMethod);
382 {$ifdef userb}
383 var key : string;
384     val:TIndexedWord;
385 {$else}
386 var
387     AVLNode   : TAVLTreeNode;
388 {$endif}
389 begin
390  {$ifdef userb}
391     if favltree.FirstNode(key,val) then
392       begin  // Scan it forward
393         repeat
394           proc(val);
395         until not favltree.FindNext(key,val);
396       end;
397  {$else}
398    AVLNode:=fAVLTree.FindLowest;
399    while (AVLNode<>nil) do
400       begin
401         Proc(TIndexedWord(AVLNode.Data));
402         AVLNode:=FAVLTree.FindSuccessor(AVLNode)
403       end;
404  {$endif}
405 end;
406 
407 procedure TIndexedWordList.ForEach(Proc:TForEachProcedure;state:pointer);
408 
409 {$ifdef userb}
410 var key : string;
411     val:TIndexedWord;
412 {$else}
413 var
414     AVLNode   : TAVLTreeNode;
415 {$endif}
416 begin
417  {$ifdef userb}
418     if favltree.FirstNode(key,val) then
419       begin  // Scan it forward
420         repeat
421           proc(val,state);
422         until not favltree.FindNext(key,val);
423       end;
424  {$else}
425    AVLNode:=fAVLTree.FindLowest;
426    while (AVLNode<>nil) do
427       begin
428         Proc(TIndexedWord(AVLNode.Data),State);
429         AVLNode:=FAVLTree.FindSuccessor(AVLNode)
430       end;
431   {$endif}
432 end;
433 
434 { TIndexedWord }
GetDocumentnull435 function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
436 var
437   i: Integer;
438 begin
439   Result := nil;
440   if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
441     Exit(FCachedTopic);
442 
443   for i := 0 to High(FDocuments) do
444     if FDocuments[i].FDocumentIndex = TopicIndexNum then
445       Exit(FDocuments[i]);
446   if Result = nil then
447   begin
448     Result := TIndexDocument.Create(TopicIndexNum);
449     SetLength(FDocuments, Length(FDocuments)+1);
450     FDocuments[High(FDocuments)] := Result;
451   end;
452   FCachedTopic := Result;
453 end;
454 
TIndexedWord.GetDocumentCountnull455 function TIndexedWord.GetDocumentCount: Integer;
456 begin
457   Result := Length(FDocuments);
458 end;
459 
460 constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
461 begin
462   FTheWord := AWord;
463   FIsTitle := AIsTitle;
464 end;
465 
466 destructor TIndexedWord.Destroy;
467 var
468   i: Integer;
469 begin
470   // here the word removed itself from the linked list. But it can't
471   // touch the AVL tree here.
472   for i := 0 to High(FDocuments) do
473     FreeAndNil(FDocuments[i]);
474   inherited Destroy;
475 end;
476 
GetLogicalDocumentnull477 function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
478 begin
479   Result := FDocuments[AIndex];;
480 end;
481 
482 { TIndexDocument }
483 procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
484 begin
485   if FLastEntry>=Length(WordIndex) Then
486   SetLength(WordIndex, Length(WordIndex)+GrowSpeed);
487   WordIndex[FLastEntry] := AIndex;
488   Inc(FLastEntry);
489 end;
490 
491 constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
492 begin
493   FDocumentIndex := ADocumentIndex;
494   flastentry:=0;
495 end;
496 
GetWordIndexnull497 function TIndexDocument.GetWordIndex(i:integer):integer;
498 begin
499   result:=WordIndex[i];
500 end;
501 
getindexentriesnull502 function TIndexDocument.getindexentries:integer;
503 begin
504  result:=flastentry;
505 end;
506 
507 end.
508