1 {
2 Reads a HTML Document
3
4 License: The same modified LGPL as the Free Pascal RTL
5 See the file COPYING.modifiedLGPL for more details
6
7 AUTHORS: Felipe Monteiro de Carvalho
8 }
9 unit htmlvectorialreader;
10
11 {$mode objfpc}{$H+}
12
13 interface
14
15 uses
16 Classes, SysUtils, math, contnrs,
17 fpimage, fpcanvas, laz2_xmlread, laz2_dom, fgl, lazfileutils,
18 // image data formats
19 fpreadpng,
20 // HTML can contain SVG
21 svgvectorialreader,
22 fpvectorial, fpvutils, lazutf8, TypInfo;
23
24 type
25 { TvHTMLVectorialReader }
26
27 TvHTMLVectorialReader = class(TvCustomVectorialReader)
28 private
29 FPointSeparator, FCommaSeparator: TFormatSettings;
30 //
GetTextContentFromNodenull31 function GetTextContentFromNode(ANode: TDOMNode): string;
32 //
ReadEntityFromNodenull33 function ReadEntityFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
ReadHeaderFromNodenull34 function ReadHeaderFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
35 procedure ReadParagraphFromNode(ADest: TvParagraph; ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument);
ReadSVGFromNodenull36 function ReadSVGFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
ReadSVGFromNode_WithEmbDocnull37 function ReadSVGFromNode_WithEmbDoc(ANode: TDOMNode; AEmbDoc: TvEmbeddedVectorialDoc; ADoc: TvVectorialDocument): TvEntity;
ReadMathFromNodenull38 function ReadMathFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
ReadTableFromNodenull39 function ReadTableFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
ReadTableRowNodenull40 function ReadTableRowNode(ATable: TvTable; ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
ReadUListFromNodenull41 function ReadUListFromNode(ANode: TDOMNode; AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
42 public
43 { General reading methods }
44 constructor Create; override;
45 Destructor Destroy; override;
46 procedure ReadFromStrings(AStrings: TStrings; AData: TvVectorialDocument); override;
47 procedure ReadFromXML(Doc: TXMLDocument; AData: TvVectorialDocument);
IsSupportedRasterImagenull48 class function IsSupportedRasterImage(AFileName: string): Boolean;
49 end;
50
51 implementation
52
53 const
54 // SVG requires hardcoding a DPI value
55
56 // The Opera Browser and Inkscape use 90 DPI, so we follow that
57
58 // 1 Inch = 25.4 milimiters
59 // 90 inches per pixel = (1 / 90) * 25.4 = 0.2822
60 // FLOAT_MILIMETERS_PER_PIXEL = 0.3528; // DPI 72 = 1 / 72 inches per pixel
61
62 FLOAT_MILIMETERS_PER_PIXEL = 5*0.2822; // DPI 90 = 1 / 90 inches per pixel => Actually I changed the value by this factor! Because otherwise it looks ugly!
63 FLOAT_PIXELS_PER_MILIMETER = 1 / FLOAT_MILIMETERS_PER_PIXEL; // DPI 90 = 1 / 90 inches per pixel
64
65 { TvHTMLVectorialReader }
66
TvHTMLVectorialReader.GetTextContentFromNodenull67 function TvHTMLVectorialReader.GetTextContentFromNode(ANode: TDOMNode): string;
68 var
69 i: Integer;
70 begin
71 Result := '';
72 for i := 0 to ANode.ChildNodes.Count-1 do
73 begin
74 if ANode.ChildNodes.Item[i] is TDOMText then
75 Result := ANode.ChildNodes.Item[i].NodeValue;
76 end;
77 end;
78
79 // if something is returned, it will be added to the base document
80 // if nothing is returned, either nothing was written, or it was already added
ReadEntityFromNodenull81 function TvHTMLVectorialReader.ReadEntityFromNode(ANode: TDOMNode;
82 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
83 var
84 lEntityName, lTextValue: DOMString;
85 lPara: TvParagraph;
86 begin
87 Result := nil;
88 lEntityName := LowerCase(ANode.NodeName);
89 case lEntityName of
90 'h1', 'h2', 'h3', 'h4', 'h5', 'h6': Result := ReadHeaderFromNode(ANode, AData, ADoc);
91 'p':
92 begin
93 lPara := AData.AddParagraph();
94 ReadParagraphFromNode(lPara, ANode, AData, ADoc);
95 Result := nil;
96 end;
97 'svg': Result := ReadSVGFromNode(ANode, AData, ADoc);
98 'math': Result := ReadMathFromNode(ANode, AData, ADoc);
99 'table': Result := ReadTableFromNode(ANode, AData, ADoc);
100 'br':
101 begin
102 AData.AddParagraph().AddText(LineEnding);
103 Result := nil;
104 end;
105 'ul': Result := ReadUListFromNode(ANode, AData, ADoc);
106 end;
107 // Raw text
108 if (ANode is TDOMText) and (lEntityName = '#text') then
109 begin
110 lTextValue := RemoveLineEndingsAndTrim(ANode.NodeValue);
111 if (lTextValue <> '') then
112 begin
113 AData.AddParagraph().AddText(lTextValue);
114 Result := nil;
115 end;
116 end;
117 end;
118
ReadHeaderFromNodenull119 function TvHTMLVectorialReader.ReadHeaderFromNode(ANode: TDOMNode;
120 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
121 var
122 CurParagraph: TvParagraph;
123 lText: TvText;
124 lTextStr: string;
125 lHeaderType: DOMString;
126 begin
127 Result := nil;
128 CurParagraph := AData.AddParagraph();
129 CurParagraph.Style := ADoc.StyleTextBody;
130 lTextStr := ANode.FirstChild.NodeValue;
131 lText := CurParagraph.AddText(lTextStr);
132 lHeaderType := LowerCase(ANode.NodeName);
133 case lHeaderType of
134 'h1': lText.Style := ADoc.StyleHeading1;
135 'h2': lText.Style := ADoc.StyleHeading2;
136 'h3': lText.Style := ADoc.StyleHeading3;
137 'h4': lText.Style := ADoc.StyleHeading4;
138 'h5': lText.Style := ADoc.StyleHeading5;
139 'h6': lText.Style := ADoc.StyleHeading6;
140 end;
141 end;
142
143 procedure TvHTMLVectorialReader.ReadParagraphFromNode(ADest: TvParagraph; ANode: TDOMNode;
144 AData: TvTextPageSequence; ADoc: TvVectorialDocument);
145 var
146 lText: TvText = nil;
147 lTextStr: string = '';
148 lCurNode: TDOMNode;
149 lNodeName, lNodeValue, lAttrName, lAttrValue: DOMString;
150 lCurAttr: TDOMNode;
151 lRasterImage: TvRasterImage;
152 lEmbVecImg: TvEmbeddedVectorialDoc = nil;
153 i: Integer;
154 lWidth, lHeight: Double;
155 lAltText: string;
156 // xlink:href
157 lx, ly, lw, lh: Double;
158 lImageDataParts: TStringList;
159 lImageDataBase64: string;
160 lImageData: array of Byte;
161 lImageDataStream: TMemoryStream;
162 lImageReader: TFPCustomImageReader;
163
164 procedure TextMerging();
165 begin
166 if lTextStr <> '' then
167 begin
168 if lText = nil then
169 lText := ADest.AddText(lTextStr)
170 else
171 lText.Value.Add(lTextStr);
172 lTextStr := '';
173 end;
174 end;
175
176 begin
177 ADest.Style := ADoc.StyleTextBody;
178
179 lCurNode := ANode.FirstChild;
180 while Assigned(lCurNode) do
181 begin
182 lNodeName := LowerCase(lCurNode.NodeName);
183 lNodeValue := lCurNode.NodeValue;
184
185 if (lCurNode is TDOMText) then
186 begin
187 lTextStr += RemoveLineEndingsAndTrim(lNodeValue);
188 lCurNode := lCurNode.NextSibling;
189 Continue;
190 end;
191
192 // text merging
193 TextMerging();
194 // reset text merging
195 if lNodeName <> 'br' then
196 lText := nil;
197
198 case lNodeName of
199 // <image width="100" height="100" xlink:href="data:image/png;base64,UgAAA....QSK5CYII="/>
200 // <img src="images/noimage.gif" width="100" height="100" alt="No image" />
201 'img', 'image':
202 begin
203 lRasterImage := nil;
204 lEmbVecImg := nil;
205 lWidth := -1;
206 lHeight := -1;
207 lAltText := '';
208
209 for i := 0 to lCurNode.Attributes.Length - 1 do
210 begin
211 lCurAttr := lCurNode.Attributes.Item[i];
212 lAttrName := lCurAttr.NodeName;
213 lAttrValue := lCurAttr.NodeValue;
214
215 case lAttrName of
216 'alt':
217 begin
218 lAltText := lAttrValue;
219 end;
220 'src':
221 begin
222 lAttrValue := lCurAttr.NodeValue;
223 lAttrValue := ExtractFilePath(FFilename) + lAttrValue;
224
225 if TvHTMLVectorialReader.IsSupportedRasterImage(lAttrValue) then
226 begin
227 if not FileExists(lAttrValue) then Continue;
228
229 lRasterImage := ADest.AddRasterImage();
230 lRasterImage.CreateImageFromFile(lAttrValue);
231 end
232 else if TvVectorialDocument.GetFormatFromExtension(lAttrValue, False) <> vfUnknown then
233 begin
234 lEmbVecImg := ADest.AddEmbeddedVectorialDoc();
235 if FileExistsUTF8(lAttrValue) then
236 lEmbVecImg.Document.ReadFromFile(lAttrValue);
237 end;
238 end;
239 'xlink:href':
240 begin
241 lRasterImage := ADest.AddRasterImage();
242 lImageDataParts := TvSVGVectorialReader.ReadSpaceSeparatedStrings(lNodeValue, ':;,');
243 try
244 if (lImageDataParts.Strings[0] = 'data') and
245 (lImageDataParts.Strings[1] = 'image/png') and
246 (lImageDataParts.Strings[2] = 'base64') then
247 begin
248 lImageReader := TFPReaderPNG.Create;
249 lImageDataStream := TMemoryStream.Create;
250 try
251 lImageDataBase64 := lImageDataParts.Strings[3];
252 DecodeBase64(lImageDataBase64, lImageDataStream);
253 lImageDataStream.Position := 0;
254 lRasterImage.CreateRGB888Image(10, 10);
255 lRasterImage.RasterImage.LoadFromStream(lImageDataStream, lImageReader);
256 finally
257 lImageDataStream.Free;
258 lImageReader.Free;
259 end;
260 end
261 else
262 raise Exception.Create('[TvSVGVectorialReader.ReadImageFromNode] Unimplemented image format');
263 finally
264 lImageDataParts.Free;
265 end;
266 end;
267 'width':
268 begin
269 lWidth := StrToInt(lAttrValue);
270 end;
271 'height':
272 begin
273 lHeight := StrToInt(lAttrValue);
274 end;
275 end;
276 end;
277
278 if lRasterImage <> nil then
279 begin
280 if lWidth <= 0 then
281 lWidth := lRasterImage.RasterImage.Width;
282 if lHeight <= 0 then
283 lHeight := lRasterImage.RasterImage.Height;
284
285 lRasterImage.Width := lWidth;
286 lRasterImage.Height := lHeight;
287 lRasterImage.AltText := lAltText;
288 end
289 else if (lEmbVecImg <> nil) and (lWidth > 0) and (lHeight > 0) then
290 begin
291 lEmbVecImg.SetWidth(lWidth);
292 lEmbVecImg.SetHeight(lHeight);
293 end;
294 end;
295 'svg':
296 begin
297 lEmbVecImg := ADest.AddEmbeddedVectorialDoc();
298 ReadSVGFromNode_WithEmbDoc(lCurNode, lEmbVecImg, ADoc);
299 end;
300 end;
301
302 lCurNode := lCurNode.NextSibling;
303 end;
304
305 TextMerging();
306 end;
307
ReadSVGFromNodenull308 function TvHTMLVectorialReader.ReadSVGFromNode(ANode: TDOMNode;
309 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
310 var
311 CurSVG: TvEmbeddedVectorialDoc;
312 lText: TvText;
313 lDoc: TXMLDocument;
314 lImportedNode: TDOMNode;
315 begin
316 Result := nil;
317 CurSVG := AData.AddEmbeddedVectorialDoc();
318 lDoc := TXMLDocument.Create;
319 try
320 lImportedNode := lDoc.ImportNode(ANode, True);
321 lDoc.AppendChild(lImportedNode);
322 CurSVG.Document.ReadFromXML(lDoc, vfSVG);
323 finally
324 lDoc.Free;
325 end;
326 end;
327
ReadSVGFromNode_WithEmbDocnull328 function TvHTMLVectorialReader.ReadSVGFromNode_WithEmbDoc(ANode: TDOMNode;
329 AEmbDoc: TvEmbeddedVectorialDoc; ADoc: TvVectorialDocument): TvEntity;
330 var
331 CurSVG: TvEmbeddedVectorialDoc;
332 lText: TvText;
333 lDoc: TXMLDocument;
334 lImportedNode: TDOMNode;
335 begin
336 Result := nil;
337 CurSVG := AEmbDoc;
338 lDoc := TXMLDocument.Create;
339 try
340 lImportedNode := lDoc.ImportNode(ANode, True);
341 lDoc.AppendChild(lImportedNode);
342 CurSVG.Document.ReadFromXML(lDoc, vfSVG);
343 finally
344 lDoc.Free;
345 end;
346 end;
347
ReadMathFromNodenull348 function TvHTMLVectorialReader.ReadMathFromNode(ANode: TDOMNode;
349 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
350 var
351 CurSVG: TvEmbeddedVectorialDoc;
352 lText: TvText;
353 lDoc: TXMLDocument;
354 lImportedNode: TDOMNode;
355 begin
356 Result := nil;
357 CurSVG := AData.AddEmbeddedVectorialDoc();
358 lDoc := TXMLDocument.Create;
359 try
360 lImportedNode := lDoc.ImportNode(ANode, True);
361 lDoc.AppendChild(lImportedNode);
362 CurSVG.Document.ReadFromXML(lDoc, vfMathML);
363 finally
364 lDoc.Free;
365 end;
366 end;
367
TvHTMLVectorialReader.ReadTableFromNodenull368 function TvHTMLVectorialReader.ReadTableFromNode(ANode: TDOMNode;
369 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
370 var
371 CurTable: TvTable;
372 lCurNode, lCurSubnode: TDOMNode;
373 lNodeName, lNodeValue: DOMString;
374 CurRow: TvTableRow;
375 Caption_Cell: TvTableCell = nil;
376 CurCellPara: TvParagraph;
377 // attributes
378 i, lBorderNr: Integer;
379 lAttrName, lAttrValue: DOMString;
380
381 procedure SetBorderLineType(AType: TvTableBorderType);
382 begin
383 CurTable.Borders.Left.LineType := AType;
384 CurTable.Borders.Right.LineType := AType;
385 CurTable.Borders.Top.LineType := AType;
386 CurTable.Borders.Bottom.LineType := AType;
387 CurTable.Borders.InsideHoriz.LineType := AType;
388 CurTable.Borders.InsideVert.LineType := AType;
389 end;
390
391 begin
392 Result := nil;
393 CurTable := AData.AddTable();
394 CurTable.CellSpacingLeft := 3;
395 CurTable.CellSpacingTop := 2;
396
397 // Default to no border without "border" attribute
398 SetBorderLineType(tbtNone);
399
400 // table attributes
401 for i := 0 to ANode.Attributes.Length - 1 do
402 begin
403 lAttrName := ANode.Attributes.Item[i].NodeName;
404 lAttrValue := ANode.Attributes.Item[i].NodeValue;
405
406 case lAttrName of
407 'border':
408 begin
409 lBorderNr := StrToInt(lAttrValue);
410
411 SetBorderLineType(tbtSingle);
412 CurTable.Borders.Left.Width := lBorderNr;
413 CurTable.Borders.Right.Width := lBorderNr;
414 CurTable.Borders.Top.Width := lBorderNr;
415 CurTable.Borders.Bottom.Width := lBorderNr;
416 CurTable.Borders.InsideHoriz.Width := lBorderNr;
417 CurTable.Borders.InsideVert.Width := lBorderNr;
418
419 if lBorderNr = 0 then
420 SetBorderLineType(tbtNone);
421 end;
422 end;
423 end;
424
425 // table child nodes
426 lCurNode := ANode.FirstChild;
427 while Assigned(lCurNode) do
428 begin
429 lNodeName := lCurNode.NodeName;
430 lNodeValue := lCurNode.NodeValue;
431 case lNodeName of
432 'caption':
433 begin
434 CurRow := CurTable.AddRow();
435 Caption_Cell := CurRow.AddCell();
436 {Caption_Cell.Borders.Left.LineType := tbtNone;
437 Caption_Cell.Borders.Top.LineType := tbtNone;
438 Caption_Cell.Borders.Right.LineType := tbtNone;
439 Caption_Cell.Borders.Bottom.LineType := tbtNone;}
440 CurCellPara := Caption_Cell.AddParagraph();
441 CurCellPara.Style := ADoc.StyleTextBodyCentralized;
442 CurCellPara.AddText(GetTextContentFromNode(lCurNode));
443 end;
444 'tbody':
445 begin
446 lCurSubnode := lCurNode.FirstChild;
447 while Assigned(lCurSubnode) do
448 begin
449 ReadTableRowNode(CurTable, lCurSubnode, AData, ADoc);
450
451 lCurSubnode := lCurSubnode.NextSibling;
452 end;
453 end;
454 end;
455
456 lCurNode := lCurNode.NextSibling;
457 end;
458
459 // the caption spans all columns
460 if Caption_Cell <> nil then
461 Caption_Cell.SpannedCols := CurTable.GetColCount();
462 end;
463
TvHTMLVectorialReader.ReadTableRowNodenull464 function TvHTMLVectorialReader.ReadTableRowNode(ATable: TvTable; ANode: TDOMNode;
465 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
466 var
467 lCurNode: TDOMNode;
468 lNodeName, lNodeValue: DOMString;
469 CurRow: TvTableRow;
470 CurCell: TvTableCell;
471 CurCellPara: TvParagraph;
472 begin
473 Result := nil;
474 CurRow := ATable.AddRow();
475
476 lCurNode := ANode.FirstChild;
477 while Assigned(lCurNode) do
478 begin
479 lNodeName := lCurNode.NodeName;
480 lNodeValue := lCurNode.NodeValue;
481 case lNodeName of
482 'th':
483 begin
484 CurCell := CurRow.AddCell();
485 CurCellPara := CurCell.AddParagraph();
486 CurCellPara.Style := ADoc.StyleTextBodyBold;
487 CurCellPara.AddText(GetTextContentFromNode(lCurNode));
488 end;
489 'td':
490 begin
491 CurCell := CurRow.AddCell();
492
493 CurCellPara := CurCell.AddParagraph();
494 Self.ReadParagraphFromNode(CurCellPara, lCurNode, AData, ADoc);
495 end;
496 end;
497
498 lCurNode := lCurNode.NextSibling;
499 end;
500 end;
501
ReadUListFromNodenull502 function TvHTMLVectorialReader.ReadUListFromNode(ANode: TDOMNode;
503 AData: TvTextPageSequence; ADoc: TvVectorialDocument): TvEntity;
504 var
505 lCurNode: TDOMNode;
506 lNodeName, lNodeValue: DOMString;
507 lNodeText: string;
508 //
509 lList: TvList;
510 lCurPara: TvParagraph;
511 begin
512 Result := nil;
513 lList := AData.AddList();
514
515 lCurNode := ANode.FirstChild;
516 while Assigned(lCurNode) do
517 begin
518 lNodeName := lCurNode.NodeName;
519 lNodeValue := lCurNode.NodeValue;
520 case lNodeName of
521 'li':
522 begin
523 lNodeText := GetTextContentFromNode(lCurNode);
524 lCurPara := lList.AddParagraph(lNodeText);
525 end;
526 end;
527
528 lCurNode := lCurNode.NextSibling;
529 end;
530 end;
531
532 constructor TvHTMLVectorialReader.Create;
533 begin
534 inherited Create;
535
536 FPointSeparator := DefaultFormatSettings;
537 FPointSeparator.DecimalSeparator := '.';
538 FPointSeparator.ThousandSeparator := '#';// disable the thousand separator
539 end;
540
541 destructor TvHTMLVectorialReader.Destroy;
542 begin
543 inherited Destroy;
544 end;
545
546 procedure TvHTMLVectorialReader.ReadFromStrings(AStrings: TStrings;
547 AData: TvVectorialDocument);
548 var
549 Doc: TXMLDocument = nil;
550 lStream: TMemoryStream;
551 lTmp: String;
552 begin
553 lStream := TMemoryStream.Create();
554 try
555 // Remove the <!DOCTYPE line
556 if Pos('<!DOCTYPE', AStrings.Strings[0]) <> 0 then
557 AStrings.Delete(0);
558 // Create a header
559 AStrings.Insert(0, '<?xml version="1.0"?>');
560 lTmp := AStrings.Text;
561 // Flush it back to a stream
562 AStrings.SaveToStream(lStream);
563 lStream.Position := 0;
564 // HTML is not XML, but might be compatible enough... a dedicated reader will be complex, but eventually necessary
565 ReadXMLFile(Doc, lStream);
566 lStream.Free; // Release as soon as unnecessary
567 lStream := nil;
568 //
569 ReadFromXML(Doc, AData);
570 finally
571 Doc.Free;
572 lStream.Free;
573 end;
574 end;
575
576 procedure TvHTMLVectorialReader.ReadFromXML(Doc: TXMLDocument;
577 AData: TvVectorialDocument);
578 var
579 lCurNode, lCurSubnode: TDOMNode;
580 lPage: TvTextPageSequence;
581 lNodeName, lNodeValue: DOMString;
582 ANode: TDOMElement;
583 i: Integer;
584 lCurEntity: TvEntity;
585 begin
586 {ANode := Doc.DocumentElement;
587 for i := 0 to ANode.Attributes.Length - 1 do
588 begin
589 lNodeName := ANode.Attributes.Item[i].NodeName;
590 lNodeValue := ANode.Attributes.Item[i].NodeValue;
591 end;}
592
593 AData.AddStandardTextDocumentStyles(vfHTML);
594
595 // ----------------
596 // Now process the elements
597 // ----------------
598 lCurNode := Doc.DocumentElement.FirstChild;
599 lPage := AData.AddTextPageSequence();
600 //lPage.Width := AData.Width;
601 //lPage.Height := AData.Height;
602 while Assigned(lCurNode) do
603 begin
604 lNodeName := lCurNode.NodeName;
605 if lNodeName = 'body' then
606 begin
607 lCurSubnode := lCurNode.FirstChild;
608 while Assigned(lCurSubnode) do
609 begin
610 lCurEntity := ReadEntityFromNode(lCurSubnode, lPage, AData);
611 if lCurEntity <> nil then
612 lPage.AddEntity(lCurEntity);
613
614 lCurSubnode := lCurSubnode.NextSibling;
615 end;
616 end;
617
618 lCurNode := lCurNode.NextSibling;
619 end;
620 end;
621
TvHTMLVectorialReader.IsSupportedRasterImagenull622 class function TvHTMLVectorialReader.IsSupportedRasterImage(AFileName: string): Boolean;
623 begin
624 Result := FilenameExtIn(AFileName,['.png','.jpg','.jpeg','.bmp','.xpm','.gif']);
625 end;
626
627 initialization
628
629 RegisterVectorialReader(TvHTMLVectorialReader, vfHTML);
630
631 end.
632
633