1 { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
2
3 This library is free software; you can redistribute it and/or modify it
4 under the terms of the GNU Library General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or (at your
6 option) any later version.
7
8 This program is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
11 for more details.
12
13 You should have received a copy of the GNU Library General Public License
14 along with this library; if not, write to the Free Software Foundation,
15 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 }
17 {
18 See the file COPYING.FPC, included in this distribution,
19 for details about the copyright.
20 }
21 unit HTMLIndexer;
22 {$MODE OBJFPC}{$H+}
23 interface
24 uses Classes, SysUtils, FastHTMLParser,{$ifdef userb}fos_redblacktree_gen{$else}avl_tree{$endif};
25
26 Type
27
28 { TIndexDocument }
29 TIndexDocument = class(TObject)
30 private
31 FDocumentIndex: Integer;
32 FLastEntry : Integer;
33 WordIndex: array of Integer;
getindexentriesnull34 function getindexentries:integer;
35 public
GetWordIndexnull36 function GetWordIndex(i:integer):integer; inline;
37 procedure AddWordIndex(AIndex: Integer);
38 constructor Create(ADocumentIndex: Integer);
39 property DocumentIndex: Integer read FDocumentIndex;
40 property IndexEntry[i:integer] : Integer read GetWordIndex;
41 property NumberofIndexEntries : integer read getindexentries;
42 end;
43
44 { TIndexedWord }
45 TIndexedWord = class(TObject)
46 private
47 FIsTitle: Boolean;
48 FTheWord: string;
49 FCachedTopic: TIndexDocument;
50 FDocuments: Array of TIndexDocument;
GetDocumentnull51 function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
GetDocumentCountnull52 function GetDocumentCount: Integer;
53 public
54 constructor Create(AWord: String; AIsTitle: Boolean);
55 destructor Destroy; override;
GetLogicalDocumentnull56 function GetLogicalDocument(AIndex: Integer): TIndexDocument;
57 property TheWord: string read FTheWord write ftheword; // Always lowercase
58 property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
59 property DocumentCount: Integer read GetDocumentCount;
60 property IsTitle: Boolean read FIsTitle write fistitle;
61 end;
62
63 { TIndexedWordList }
64
65 {$ifdef userb}
66 TRBIndexTree = specialize TGFOS_RBTree<String,TIndexedWord>;
67 {$endif}
68
69 TForEachMethod = procedure (AWord:TIndexedWord) of object;
70 TForEachProcedure = Procedure (AWord:TIndexedWord;state:pointer);
71 TIndexedWordList = class(TObject)
72 private
73 FIndexTitlesOnly: Boolean;
74 FIndexedFileCount: DWord;
75 //vars while processing page
76 FInTitle,
77 FInBody: Boolean;
78 FWordCount: Integer; // only words in body
79 FDocTitle: String;
80 FTopicIndex: Integer;
81 //end vars
82 FTotalDifferentWordLength: DWord;
83 FTotalDIfferentWords: DWord;
84 FTotalWordCount: DWord;
85 FTotalWordLength: DWord;
86 FLongestWord: DWord;
87 FParser: THTMLParser;
88 {$ifdef userb}
89 FAVLTree : TRBIndexTree;
90 {$else}
91 FAVLTree : TAVLTree;
92 Spare :TIndexedWord;
93 {$endif}
94
AddGetWordnull95 function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
96 // callbacks
97 procedure CBFoundTag(NoCaseTag, ActualTag: string);
98 procedure CBFountText(Text: string);
99
100 procedure EatWords(Words: String; IsTitle: Boolean);
101 public
102 constructor Create;
103 destructor Destroy; override;
IndexFilenull104 function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
105 procedure Clear;
106 procedure AddWord(const AWord: TIndexedWord);
107 procedure ForEach(Proc:TForEachMethod);
108 procedure ForEach(Proc:TForEachProcedure;state:pointer);
109 property IndexedFileCount: DWord read FIndexedFileCount;
110 property LongestWord: DWord read FLongestWord;
111 property TotalWordCount: DWord read FTotalWordCount;
112 property TotalDIfferentWords: DWord read FTotalDIfferentWords;
113 property TotalWordLength: DWord read FTotalWordLength;
114 property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
115 property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
116 end;
117
118 implementation
119
120 Const GrowSpeed = 10;
121
Maxnull122 function Max(ANumber, BNumber: DWord): DWord;
123 begin
124 if ANumber > BNumber then
125 Result := ANumber
126 else
127 Result := BNumber;
128 end;
129
130 const titlexlat : array [boolean] of char = ('0','1');
131
makekeynull132 function makekey( n : string;istitle:boolean):string; inline;
133
134 begin
135 result:=n+'___'+titlexlat[istitle];
136 end;
137
CompareProcObjnull138 Function CompareProcObj(Node1, Node2: Pointer): integer;
139 var n1,n2 : TIndexedWord;
140 begin
141 n1:=TIndexedWord(Node1); n2:=TIndexedWord(Node2);
142 Result := CompareText(n1.theword, n2.theword);
143 if Result = 0 then
144 begin
145 Result := ord(n2.IsTitle)-ord(n1.IsTitle);
146 end;
147 if Result < 0 then Result := -1
148 else if Result > 0 then Result := 1;
149 end;
150
151 { TIndexedWordList }
TIndexedWordList.AddGetWordnull152 function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
153 var
154 {$ifdef userb}
155 key : string;
156 {$else}
157 n : TAVLTreeNode;
158 {$endif}
159 begin
160 Result := nil;
161 AWord := LowerCase(AWord);
162 {$ifdef userb}
163 key:=makekey(aword,istitle);
164 if not favltree.Find(key,result) then result:=nil;;
165 {$else}
166 if not assigned(spare) then
167 spare:=TIndexedWord.Create(AWord,IsTitle)
168 else
169 begin
170 spare.TheWord:=aword;
171 spare.IsTitle:=IsTitle;
172 end;
173
174 n:=favltree.FindKey(Spare,@CompareProcObj);
175 if assigned(n) then
176 result:=TIndexedWord(n.Data);
177 {$endif}
178
179 if Result = nil then
180 begin
181 Inc(FTotalDifferentWordLength, Length(AWord));
182 Inc(FTotalDIfferentWords);
183 {$ifdef userb}
184 result:=TIndexedWord.Create(AWord,IsTitle);
185 favltree.add(key,result);
186 {$else}
187 Result := spare; // TIndexedWord.Create(AWord,IsTitle);
188 spare:=nil;
189 AddWord(Result);
190 {$endif}
191
192 // if IsTitle then
193 //WriteLn('Creating word: ', AWord);
194 FLongestWord := Max(FLongestWord, Length(AWord));
195 end;
196 Inc(FTotalWordLength, Length(AWord));
197 Inc(FTotalWordCount);
198 end;
199
200 procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
201 begin
202 if FInBody then begin
203 if NoCaseTag = '</BODY>' then FInBody := False;
204 end
205 else begin
206 //WriteLn('"',NoCaseTag,'"');
207 if NoCaseTag = '<TITLE>' then FInTitle := True
208 else if NoCaseTag = '</TITLE>' then FInTitle := False
209 else if NoCaseTag = '<BODY>' then FInBody := True
210 else
211 end;
212 if FInBody and FIndexTitlesOnly then FParser.Done := True;
213 end;
214
215 procedure TIndexedWordList.CBFountText(Text: string);
216 begin
217 if Length(Text) < 1 then
218 Exit;
219
220 if (not FInTitle) and (not FInBody) then
221 Exit;
222
223 EatWords(Text, FInTitle and not FInBody);
224 end;
225
226 procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
227 var
228 WordPtr: PChar;
229 WordStart: PChar;
230 InWord: Boolean;
231 IsNumberWord: Boolean;
IsEndOfWordnull232 function IsEndOfWord: Boolean;
233 begin
234 Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
235 if Result and IsNumberWord then
236 Result := Result and (WordPtr[0] <> '.');
237 if Result and InWord then
238 Result := Result and (WordPtr[0] <> '''');
239 ;
240 end;
241 var
242 WordIndex: TIndexedWord;
243 WordName: String;
244 FPos: Integer;
245 begin
246 if IsTitle then
247 FDocTitle := Words;
248 Words := LowerCase(Words);
249 WordStart := PChar(Words);
250 WordPtr := WordStart;
251 IsNumberWord := False;
252 InWord := False;
253 repeat
254 if InWord and IsEndOfWord then
255 begin
256 WordName := Copy(WordStart, 0, (WordPtr-WordStart));
257 FPos := Pos('''', WordName);
258 while FPos > 0 do
259 begin
260 Delete(WordName, FPos, 1);
261 FPos := Pos('''', WordName);
262 end;
263 WordIndex := addgetword(wordname,istitle);
264 InWord := False;
265 IsNumberWord := False;
266 WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
267 //if not IsTitle then
268 Inc(FWordCount);
269
270 end
271 else if not InWord and not IsEndOfWord then
272 begin
273 InWord := True;
274 WordStart := WordPtr;
275 IsNumberWord := WordPtr^ in ['0'..'9'];
276 end;
277 Inc(WordPtr);
278 until WordPtr^ = #0;
279
280 if InWord then
281 begin
282 WordName := Copy(WordStart, 0, (WordPtr-WordStart));
283 try
284 WordIndex := addgetword(wordname,istitle); // Self.Words[WordName, IsTitle];
285 except on e:exception do writeln('Error: ', wordname); end;
286 WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
287 InWord := False;
288 //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
289 IsNumberWord := False;
290 //WriteLn(FWordCount, ' "', WordName,'"');
291 if not IsTitle then
292 Inc(FWordCount);
293 end;
294 end;
295
defaultindexedwordnull296 function defaultindexedword : TIndexedWord;
297
298 begin
299 result:=Tindexedword.create('',false);
300 end;
301
302 constructor TIndexedWordList.Create;
303 begin
304 inherited;
305 {$ifdef userb}
306 FAVLTree :=TRBIndexTree.create(@default_rb_string_compare,
307 @defaultindexedword,
308 @default_rb_string_undef );
309 {$else}
310 favltree:=TAVLTree.Create(@CompareProcObj);
311 spare:=nil;
312 {$endif}
313 end;
314
315 procedure FreeObject(const Obj:TIndexedWord);
316 begin
317 obj.free;
318 end;
319
320
321 destructor TIndexedWordList.Destroy;
322 begin
323 clear;
324 {$ifndef userb}
325 if assigned(spare) then spare.free;
326 {$endif}
327 favltree.free;
328 inherited Destroy;
329 end;
330
TIndexedWordList.IndexFilenull331 function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
332 var
333 TheFile: String;
334 begin
335 FInBody := False;
336 FInTitle:= False;
337 FIndexTitlesOnly := AIndexOnlyTitles;
338 FWordCount := 0;
339 FTopicIndex := ATOPICIndex;
340 FIndexedFileCount := FIndexedFileCount +1;
341
342 SetLength(TheFile, AStream.Size+1);
343 AStream.Position := 0;
344 AStream.Read(TheFile[1], AStream.Size);
345 TheFile[Length(TheFile)] := #0;
346
347 FParser := THTMLParser.Create(@TheFile[1]);
348 FParser.OnFoundTag := @CBFoundTag;
349 FParser.OnFoundText := @CBFountText;
350 FParser.Exec;
351 FParser.Free;
352
353 Result := FDocTitle;
354 FDocTitle := '';
355 FInBody := False;
356 FInTitle:= False;
357 FWordCount := 0;
358 FTopicIndex := -1;
359
360 AStream.Position := 0;
361 end;
362
363 procedure TIndexedWordList.Clear;
364 begin
365 {$ifdef userb}
366 fAvlTree.ClearN(@FreeObject);
367 {$else}
368 fAvlTree.FreeAndClear;
369 {$endif}
370 end;
371
372 procedure TIndexedWordList.AddWord(const AWord: TIndexedWord);
373 begin
374 {$ifdef userb}
375 favltree.add(makekey(aword.theword,aword.istitle),AWord);
376 {$else}
377 favltree.add(aword);
378 {$endif}
379 end;
380
381 procedure TIndexedWordList.ForEach(Proc:TForEachMethod);
382 {$ifdef userb}
383 var key : string;
384 val:TIndexedWord;
385 {$else}
386 var
387 AVLNode : TAVLTreeNode;
388 {$endif}
389 begin
390 {$ifdef userb}
391 if favltree.FirstNode(key,val) then
392 begin // Scan it forward
393 repeat
394 proc(val);
395 until not favltree.FindNext(key,val);
396 end;
397 {$else}
398 AVLNode:=fAVLTree.FindLowest;
399 while (AVLNode<>nil) do
400 begin
401 Proc(TIndexedWord(AVLNode.Data));
402 AVLNode:=FAVLTree.FindSuccessor(AVLNode)
403 end;
404 {$endif}
405 end;
406
407 procedure TIndexedWordList.ForEach(Proc:TForEachProcedure;state:pointer);
408
409 {$ifdef userb}
410 var key : string;
411 val:TIndexedWord;
412 {$else}
413 var
414 AVLNode : TAVLTreeNode;
415 {$endif}
416 begin
417 {$ifdef userb}
418 if favltree.FirstNode(key,val) then
419 begin // Scan it forward
420 repeat
421 proc(val,state);
422 until not favltree.FindNext(key,val);
423 end;
424 {$else}
425 AVLNode:=fAVLTree.FindLowest;
426 while (AVLNode<>nil) do
427 begin
428 Proc(TIndexedWord(AVLNode.Data),State);
429 AVLNode:=FAVLTree.FindSuccessor(AVLNode)
430 end;
431 {$endif}
432 end;
433
434 { TIndexedWord }
GetDocumentnull435 function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
436 var
437 i: Integer;
438 begin
439 Result := nil;
440 if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
441 Exit(FCachedTopic);
442
443 for i := 0 to High(FDocuments) do
444 if FDocuments[i].FDocumentIndex = TopicIndexNum then
445 Exit(FDocuments[i]);
446 if Result = nil then
447 begin
448 Result := TIndexDocument.Create(TopicIndexNum);
449 SetLength(FDocuments, Length(FDocuments)+1);
450 FDocuments[High(FDocuments)] := Result;
451 end;
452 FCachedTopic := Result;
453 end;
454
TIndexedWord.GetDocumentCountnull455 function TIndexedWord.GetDocumentCount: Integer;
456 begin
457 Result := Length(FDocuments);
458 end;
459
460 constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
461 begin
462 FTheWord := AWord;
463 FIsTitle := AIsTitle;
464 end;
465
466 destructor TIndexedWord.Destroy;
467 var
468 i: Integer;
469 begin
470 // here the word removed itself from the linked list. But it can't
471 // touch the AVL tree here.
472 for i := 0 to High(FDocuments) do
473 FreeAndNil(FDocuments[i]);
474 inherited Destroy;
475 end;
476
GetLogicalDocumentnull477 function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
478 begin
479 Result := FDocuments[AIndex];;
480 end;
481
482 { TIndexDocument }
483 procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
484 begin
485 if FLastEntry>=Length(WordIndex) Then
486 SetLength(WordIndex, Length(WordIndex)+GrowSpeed);
487 WordIndex[FLastEntry] := AIndex;
488 Inc(FLastEntry);
489 end;
490
491 constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
492 begin
493 FDocumentIndex := ADocumentIndex;
494 flastentry:=0;
495 end;
496
GetWordIndexnull497 function TIndexDocument.GetWordIndex(i:integer):integer;
498 begin
499 result:=WordIndex[i];
500 end;
501
getindexentriesnull502 function TIndexDocument.getindexentries:integer;
503 begin
504 result:=flastentry;
505 end;
506
507 end.
508